Update generic_ner.py
generic_ner.py  CHANGED  (+28 -14)
@@ -127,25 +127,40 @@ def get_entities(tokens, tags, confidences, text):

    return entities

-
def realign(
-
+    text_sentences, out_label_preds, softmax_scores, tokenizer, reverted_label_map
):
+    """
+    Realign predictions across multiple text chunks.
+
+    text_sentences: List of text chunks (the original text split into chunks)
+    out_label_preds: Predictions for each chunk
+    softmax_scores: Confidence scores for each chunk
+    tokenizer: The tokenizer used for encoding/decoding
+    reverted_label_map: Mapping from predicted labels to readable labels
+    """
    preds_list, words_list, confidence_list = [], [], []
-
-
-
-
-
-
-
-
-
-
+
+    # Process each chunk individually
+    for chunk_idx, text_sentence in enumerate(text_sentences):
+        word_ids = tokenizer(text_sentence, is_split_into_words=True).word_ids()
+
+        for idx, word in enumerate(text_sentence):
+            try:
+                # Align based on word indices within the current chunk
+                beginning_index = word_ids.index(idx)
+                preds_list.append(reverted_label_map[out_label_preds[chunk_idx][beginning_index]])
+                confidence_list.append(max(softmax_scores[chunk_idx][beginning_index]))
+            except Exception as ex:  # Handle any misalignment issues
+                preds_list.append("O")
+                confidence_list.append(0.0)
+
+            words_list.append(word)

    return words_list, preds_list, confidence_list


+
def segment_and_trim_sentences(article, language, max_length):

    try:
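The new realign() body leans on the word_ids() mapping that HuggingFace fast tokenizers attach to an encoding: it keeps the prediction at the first subword position of each original word. As a minimal sketch of that mapping (the checkpoint name and example words are illustrative, not from this repo):

from transformers import AutoTokenizer

# Illustrative checkpoint; any fast tokenizer with word_ids() support behaves the same.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

words = ["Transformers", "tokenize", "subwords"]
encoding = tokenizer(words, is_split_into_words=True)

# word_ids() maps each subword position back to the index of the source word,
# with None for special tokens such as [CLS] and [SEP].
word_ids = encoding.word_ids()
print(encoding.tokens())
print(word_ids)  # e.g. [None, 0, 0, 1, 1, 2, 2, None]

# realign() keeps the prediction at the first subword position of each word:
for idx, word in enumerate(words):
    print(word, "->", word_ids.index(idx))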
@@ -248,14 +263,12 @@ class MultitaskTokenClassificationPipeline(Pipeline):

        for task, logits in chunk_result.logits.items():
            predictions[task].extend(torch.argmax(logits, dim=-1).tolist())
            confidence_scores[task].extend(F.softmax(logits, dim=-1).tolist())
-        print(predictions)
        # Decode and process the predictions
        decoded_predictions = {}
        for task, preds in predictions.items():
            decoded_predictions[task] = [
                [self.id2label[task][label] for label in seq] for seq in preds
            ]
-        print(decoded_predictions)
        # Extract entities from the combined predictions
        entities = {}
        for task, preds in predictions.items():
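For reference, the decoding step kept above pairs an argmax over the logits (label ids) with a softmax over the same logits (per-label probabilities), then maps ids to tag names through id2label. A self-contained sketch with made-up shapes and an illustrative label set, not the repo's actual one:

import torch
import torch.nn.functional as F

id2label = {0: "O", 1: "B-PER", 2: "I-PER"}       # illustrative mapping
logits = torch.randn(1, 4, len(id2label))          # (batch, seq_len, num_labels)

pred_ids = torch.argmax(logits, dim=-1).tolist()   # label ids per token position
confidences = F.softmax(logits, dim=-1).tolist()   # per-label probabilities

decoded = [[id2label[i] for i in seq] for seq in pred_ids]
print(decoded)                 # e.g. [['O', 'B-PER', 'I-PER', 'O']]
print(max(confidences[0][0]))  # confidence of the top label for the first position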
@@ -266,6 +279,7 @@ class MultitaskTokenClassificationPipeline(Pipeline):
                self.tokenizer,
                self.id2label[task],
            )
+            print(words_list, preds_list, confidence_list)
            entities[task] = get_entities(words_list, preds_list, confidence_list, text)

        return entities
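To see how the new realign() signature is meant to be driven before its output is passed to get_entities(), here is a hedged usage sketch with dummy chunk data. It assumes generic_ner.py is importable from the working directory; the checkpoint, label map, and scores are made up for illustration:

from transformers import AutoTokenizer
from generic_ner import realign  # the function added in this commit

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

text_sentences = [["Berlin", "is", "nice"]]   # a single pre-tokenized chunk
reverted_label_map = {0: "O", 1: "B-LOC"}     # illustrative id -> label map

encoding = tokenizer(text_sentences[0], is_split_into_words=True)
word_ids = encoding.word_ids()
n_positions = len(word_ids)

out_label_preds = [[0] * n_positions]
out_label_preds[0][word_ids.index(0)] = 1      # pretend the model tagged "Berlin" as B-LOC
softmax_scores = [[[0.1, 0.9]] * n_positions]  # dummy per-position probabilities

words, preds, confs = realign(
    text_sentences, out_label_preds, softmax_scores, tokenizer, reverted_label_map
)
print(list(zip(words, preds, confs)))
# e.g. [('Berlin', 'B-LOC', 0.9), ('is', 'O', 0.9), ('nice', 'O', 0.9)]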
|