Emanuela Boros committed · Commit 7d0539f · Parent(s): 25e0e5e

added confidence

generic_nel.py CHANGED (+13 / -5)
@@ -1,6 +1,7 @@
 from transformers import Pipeline
 import nltk
 import requests
+import torch

 nltk.download("averaged_perceptron_tagger")
 nltk.download("averaged_perceptron_tagger_eng")
@@ -110,9 +111,16 @@ class NelPipeline(Pipeline):
             num_beams=1,
             num_return_sequences=1,
             max_new_tokens=30,
+            return_dict_in_generate=True,
+            output_scores=True,
         )

-
+        token_ids, scores = outputs.sequences, outputs.sequences_scores
+
+        # Process scores and normalize
+        scores_tensor = scores.clone().detach()
+        probabilities = torch.exp(scores_tensor)
+        percentages = (probabilities * 100.0).cpu().numpy().tolist()

         # Decode the predictions into readable text
         wikipedia_predictions = self.tokenizer.batch_decode(
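For reference, a hedged, self-contained sketch of what the two new `generate()` flags do (the checkpoint and input text below are placeholders, not the repository's NEL model): `return_dict_in_generate=True` makes `generate()` return a structured output object instead of a bare tensor of token ids, and `output_scores=True` lets beam search also fill `sequences_scores`, one length-normalized log-probability per returned sequence, which `torch.exp` maps into (0, 1] before the commit scales it to a percentage.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Placeholder checkpoint for illustration; the actual NEL model is configured elsewhere.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tokenizer("Paris is the capital of [START] France [END].", return_tensors="pt")
outputs = model.generate(
    **inputs,
    num_beams=2,                   # sequences_scores is only populated by beam search
    num_return_sequences=1,
    max_new_tokens=30,
    return_dict_in_generate=True,  # structured output instead of a plain tensor
    output_scores=True,            # keep the scores that back sequences_scores
)

token_ids, scores = outputs.sequences, outputs.sequences_scores
probabilities = torch.exp(scores)               # log-probability -> probability
percentages = (probabilities * 100.0).tolist()  # one confidence value per sequence

print(tokenizer.batch_decode(token_ids, skip_special_tokens=True), percentages)
```

One caveat worth noting: with `num_beams=1` (greedy decoding, as in the hunk above) `generate()` does not populate `sequences_scores`; a comparable confidence would then have to be derived from `outputs.scores`, for example via `model.compute_transition_scores(...)`.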
@@ -120,7 +128,7 @@ class NelPipeline(Pipeline):
         )

         # Return the predictions along with the extracted entity, lOffset, and rOffset
-        return wikipedia_predictions, enclosed_entity, lOffset, rOffset
+        return wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages

     def _forward(self, inputs):
         return inputs
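Because `_forward` just returns its input unchanged, the new `percentages` value only has to be appended to the tuple returned by `preprocess` in order to reach `postprocess`. A schematic sketch of that hand-off through a custom `transformers.Pipeline` (the method bodies here are simplified stand-ins, not the repository's actual code):

```python
from transformers import Pipeline


class SketchNelPipeline(Pipeline):
    """Illustration of the preprocess -> _forward -> postprocess hand-off only."""

    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def preprocess(self, text, **kwargs):
        # In the real pipeline this runs generate() and computes the percentages.
        wikipedia_predictions = ["France"]
        enclosed_entity, lOffset, rOffset = "France", 24, 30
        percentages = [98.6]
        return wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages

    def _forward(self, inputs):
        # Generation already happened in preprocess, so pass everything through.
        return inputs

    def postprocess(self, outputs, **kwargs):
        wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages = outputs
        return [
            {"surface": enclosed_entity, "confidence_nel": percentages[idx]}
            for idx, _ in enumerate(wikipedia_predictions)
        ]
```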
@@ -145,7 +153,7 @@ class NelPipeline(Pipeline):
         # ], # This can be improved with a real API call to get the QID
         # "confidence_nel": np.round(percentages[i], 2),
         # }
-        wikipedia_predictions, enclosed_entity, lOffset, rOffset = outputs
+        wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages = outputs
         results = []
         for idx, wikipedia_name in enumerate(wikipedia_predictions):
             # Get QID
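The commented-out note above mentions that the QID lookup "can be improved with a real API call". Purely as an illustration of what such a call could look like (this helper is hypothetical and is not the repository's `get_wikipedia_title`, which resolves in the opposite direction, from QID to page name and URL), the Wikipedia `pageprops` endpoint exposes the Wikidata id of a page:

```python
from typing import Optional

import requests


def get_qid_for_title(title: str) -> Optional[str]:
    """Hypothetical helper: look up the Wikidata QID of an English Wikipedia page."""
    resp = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "titles": title,
            "format": "json",
        },
        timeout=10,
    )
    pages = resp.json().get("query", {}).get("pages", {})
    for page in pages.values():
        return page.get("pageprops", {}).get("wikibase_item")
    return None


# Example: get_qid_for_title("France") would typically return "Q142".
```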
@@ -156,13 +164,13 @@ class NelPipeline(Pipeline):
             wkpedia_pagename, url = get_wikipedia_title(qid)
             results.append(
                 {
-                    "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
+                    # "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
                     "surface": enclosed_entity,
                     "wkpedia_pagename": wkpedia_pagename,
                     "wkd_id": qid,
                     "url": url,
                     "type": "UNK",
-                    "confidence_nel":
+                    "confidence_nel": percentages[idx],
                     "lOffset": lOffset,
                     "rOffset": rOffset,
                 }
|