Shaltiel committed
Commit 81c680b
1 Parent(s): b20f15c

add start,end to entities (#1)


- add start,end to entities (26aaacaf5bae033452c5a36cd9d5ee60b836a925)

Files changed (2)
  1. BertForJointParsing.py +23 -14
  2. BertForPrefixMarking.py +1 -2
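
With this change, each NER entity dict returned by the joint parser carries character offsets (`start`, `end`) into the original sentence, alongside the existing `phrase` and `label` fields. A minimal sketch of the new output shape — the sentence and entity labels below are invented for illustration; only the field names match the dict built in `aggregate_ner_tokens`:

```python
# Hypothetical output after this commit; values are made up for illustration.
sentence = 'George Washington lived in Virginia'
ner_entities = [
    {'phrase': 'George Washington', 'label': 'PER', 'start': 0, 'end': 17},
    {'phrase': 'Virginia', 'label': 'LOC', 'start': 27, 'end': 35},
]
# The offsets index directly into the original string (end is exclusive):
assert sentence[0:17] == 'George Washington'
assert sentence[27:35] == 'Virginia'
```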
BertForJointParsing.py CHANGED

@@ -200,8 +200,9 @@ class BertForJointParsing(BertPreTrainedModel):
         if self.prefix is not None:
             inputs = encode_sentences_for_bert_for_prefix_marking(tokenizer, sentences, padding)
         else:
-            inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_tensors='pt')
-
+            inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
+
+        offset_mapping = inputs.pop('offset_mapping')
         # Copy the tensors to the right device, and parse!
         inputs = {k:v.to(self.device) for k,v in inputs.items()}
         output = self.forward(**inputs, return_dict=True, compute_syntax_mst=compute_syntax_mst)
@@ -230,7 +231,7 @@ class BertForJointParsing(BertPreTrainedModel):
 
         # NER logits each sentence gets a list(tuple(word, ner))
         if output.ner_logits is not None:
-            for sent_idx,parsed in enumerate(ner_parse_logits(inputs, sentences, tokenizer, output.ner_logits, self.config.id2label)):
+            for sent_idx,parsed in enumerate(ner_parse_logits(inputs, sentences, tokenizer, output.ner_logits, self.config.id2label, offset_mapping)):
                 if per_token_ner:
                     merge_token_list(final_output[sent_idx]['tokens'], map(itemgetter(1), parsed), 'ner')
                 final_output[sent_idx]['ner_entities'] = aggregate_ner_tokens(parsed)
@@ -247,17 +248,18 @@ class BertForJointParsing(BertPreTrainedModel):
 def aggregate_ner_tokens(predictions):
     entities = []
     prev = None
-    for word,pred in predictions:
+    for word, pred, start, end in predictions:
         # O does nothing
         if pred == 'O': prev = None
         # B- || I-entity != prev (different entity or none)
         elif pred.startswith('B-') or pred[2:] != prev:
             prev = pred[2:]
-            entities.append(([word], prev))
-        else: entities[-1][0].append(word)
-
-    return [dict(phrase=' '.join(words), label=label) for words,label in entities]
-
+            entities.append([[word], prev, start, end])
+        else:
+            entities[-1][0].append(word)
+            entities[-1][3] = end
+
+    return [dict(phrase=' '.join(words), label=label, start=start, end=end) for words, label, start, end in entities]
 
 def merge_token_list(src, update, key):
     for token_src, token_update in zip(src, update):
@@ -272,9 +274,9 @@ def combine_token_wordpieces(input_ids: torch.Tensor, tokenizer: BertTokenizerFa
         else: ret.append(token)
     return ret
 
-def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor, id2label: Dict[int, str]):
-    input_ids = inputs['input_ids']
-
+def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor, id2label: Dict[int, str], offset_mapping):
+    input_ids = inputs['input_ids']
+
     predictions = torch.argmax(logits, dim=-1)
     batch_ret = []
     for batch_idx in range(len(sentences)):
@@ -286,11 +288,18 @@ def ner_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], toke
             if token_id in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]: continue
 
             token = tokenizer._convert_id_to_token(token_id)
+
+            # get the offsets for this token
+            start_pos, end_pos = offset_mapping[batch_idx, tok_idx]
             # wordpieces should just be appended to the previous word
+            # we modify the last token in ret
+            # by discarding the original end position and replacing it with the new token's end position
             if token.startswith('##'):
-                ret[-1] = (ret[-1][0] + token[2:], ret[-1][1])
+                ret[-1] = (ret[-1][0] + token[2:], ret[-1][1], ret[-1][2], end_pos.item())
                 continue
-            ret.append((token, id2label[predictions[batch_idx, tok_idx].item()]))
+            # for each token, we append a tuple containing: token, label, start position, end position
+            ret.append((token, id2label[predictions[batch_idx, tok_idx].item()], start_pos.item(), end_pos.item()))
+
     return batch_ret
 
 def lex_parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenizer: BertTokenizerFast, logits: torch.Tensor):
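
The diff above leans on the fast tokenizer's `return_offsets_mapping=True`, which returns a `(batch, seq_len, 2)` tensor of character spans, with special tokens mapped to the degenerate span `(0, 0)`. A small self-contained sketch of the mechanics, including the `##` wordpiece merge that keeps the first piece's start offset and adopts the last piece's end offset, as `ner_parse_logits` does. Here `bert-base-uncased` is only a stand-in checkpoint, and the exact wordpieces depend on the vocabulary:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # any fast tokenizer works
enc = tokenizer('unaffable', return_offsets_mapping=True, return_tensors='pt')
offsets = enc.pop('offset_mapping')  # shape: (batch, seq_len, 2)

words = []
for tok_idx, token_id in enumerate(enc['input_ids'][0]):
    # skip special tokens, mirroring the check in ner_parse_logits
    if token_id in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id):
        continue
    token = tokenizer._convert_id_to_token(token_id.item())
    start, end = offsets[0, tok_idx].tolist()
    if token.startswith('##'):
        # merge the wordpiece into the previous word: keep its start
        # offset, replace its end offset with this piece's end offset
        prev_word, prev_start, _ = words[-1]
        words[-1] = (prev_word + token[2:], prev_start, end)
    else:
        words.append((token, start, end))

print(words)  # e.g. [('unaffable', 0, 9)] once the '##' pieces are merged
```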
BertForPrefixMarking.py CHANGED

@@ -184,8 +184,7 @@ def parse_logits(inputs: Dict[str, torch.Tensor], sentences: List[str], tokenize
     return ret
 
 def encode_sentences_for_bert_for_prefix_marking(tokenizer: BertTokenizerFast, sentences: List[str], padding='longest', truncation=True):
-    inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_tensors='pt')
-
+    inputs = tokenizer(sentences, padding=padding, truncation=truncation, return_offsets_mapping=True, return_tensors='pt')
     # create our prefix_id_options array which will be like the input ids shape but with an addtional
     # dimension containing for each prefix whether it can be for that word
     prefix_id_options = torch.full(inputs['input_ids'].shape + (TOTAL_POSSIBLE_PREFIX_CLASSES,), TOTAL_POSSIBLE_PREFIX_CLASSES, dtype=torch.long)
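
One detail worth noting across both files: `return_offsets_mapping=True` adds an `offset_mapping` key to the returned `BatchEncoding`, and both encoding paths now produce it, which is why the caller in `BertForJointParsing` pops it before splatting the encoding into `forward()`. A minimal sketch of the failure mode being avoided, again with `bert-base-uncased` as a stand-in checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # stand-in checkpoint
inputs = tokenizer(['a sentence'], padding='longest', truncation=True,
                   return_offsets_mapping=True, return_tensors='pt')
print(sorted(inputs.keys()))
# ['attention_mask', 'input_ids', 'offset_mapping', 'token_type_ids']

# BatchEncoding is dict-like; pop the extra key before model(**inputs),
# since BERT's forward() does not accept an 'offset_mapping' argument
# and would raise a TypeError on the unexpected keyword.
offset_mapping = inputs.pop('offset_mapping')
print(sorted(inputs.keys()))
# ['attention_mask', 'input_ids', 'token_type_ids']
```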