Emanuela Boros committed on
Commit 7d0539f · 1 Parent(s): 25e0e5e

added confidence

Files changed (1)
  1. generic_nel.py +13 -5
generic_nel.py CHANGED
@@ -1,6 +1,7 @@
 from transformers import Pipeline
 import nltk
 import requests
+import torch
 
 nltk.download("averaged_perceptron_tagger")
 nltk.download("averaged_perceptron_tagger_eng")
@@ -110,9 +111,16 @@ class NelPipeline(Pipeline):
             num_beams=1,
             num_return_sequences=1,
             max_new_tokens=30,
+            return_dict_in_generate=True,
+            output_scores=True,
         )
 
-        print(outputs)
+        token_ids, scores = outputs.sequences, outputs.sequences_scores
+
+        # Process scores and normalize
+        scores_tensor = scores.clone().detach()
+        probabilities = torch.exp(scores_tensor)
+        percentages = (probabilities * 100.0).cpu().numpy().tolist()
 
         # Decode the predictions into readable text
         wikipedia_predictions = self.tokenizer.batch_decode(
@@ -120,7 +128,7 @@ class NelPipeline(Pipeline):
         )
 
         # Return the predictions along with the extracted entity, lOffset, and rOffset
-        return wikipedia_predictions, enclosed_entity, lOffset, rOffset
+        return wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages
 
     def _forward(self, inputs):
         return inputs
@@ -145,7 +153,7 @@ class NelPipeline(Pipeline):
         # ], # This can be improved with a real API call to get the QID
         # "confidence_nel": np.round(percentages[i], 2),
         # }
-        wikipedia_predictions, enclosed_entity, lOffset, rOffset = outputs
+        wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages = outputs
         results = []
         for idx, wikipedia_name in enumerate(wikipedia_predictions):
             # Get QID
@@ -156,13 +164,13 @@ class NelPipeline(Pipeline):
             wkpedia_pagename, url = get_wikipedia_title(qid)
             results.append(
                 {
-                    "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
+                    # "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
                     "surface": enclosed_entity,
                     "wkpedia_pagename": wkpedia_pagename,
                     "wkd_id": qid,
                     "url": url,
                     "type": "UNK",
-                    "confidence_nel": 0.0,
+                    "confidence_nel": percentages[idx],
                     "lOffset": lOffset,
                     "rOffset": rOffset,
                 }
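
For reference, the confidence values this commit introduces come from sequences_scores in the generate() output, which is exposed once return_dict_in_generate=True and output_scores=True are set: each score is a log-probability, so torch.exp() maps it back into [0, 1] and multiplying by 100 gives the percentage stored under confidence_nel. The snippet below is a minimal standalone sketch of that computation, under two assumptions that are not taken from this repository: google/flan-t5-small is only a stand-in for the actual NEL checkpoint, and beam search (num_beams=2) is used because sequences_scores is the beam score of each returned sequence.

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    # Hypothetical stand-in checkpoint; the real pipeline loads its own NEL model.
    model_name = "google/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    inputs = tokenizer("Paris is the capital of France.", return_tensors="pt")

    outputs = model.generate(
        **inputs,
        num_beams=2,  # beam search is what populates sequences_scores
        num_return_sequences=1,
        max_new_tokens=30,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # sequences_scores holds a length-penalised log-probability per returned sequence;
    # exp() turns it into a probability and *100 into a percentage, as in the diff above.
    token_ids, scores = outputs.sequences, outputs.sequences_scores
    probabilities = torch.exp(scores.clone().detach())
    percentages = (probabilities * 100.0).cpu().numpy().tolist()

    predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    for text, confidence in zip(predictions, percentages):
        print(f"{text!r} -> confidence_nel {confidence:.2f}%")

Exponentiating a length-penalised beam score gives a convenient 0-100 confidence signal, but it is not a calibrated probability, so the values are best read as relative rather than absolute confidence.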