Emanuela Boros committed on
Commit 7d0539f · 1 Parent(s): 25e0e5e

added confidence

Files changed (1)
  1. generic_nel.py +13 -5
generic_nel.py CHANGED
@@ -1,6 +1,7 @@
 from transformers import Pipeline
 import nltk
 import requests
+import torch
 
 nltk.download("averaged_perceptron_tagger")
 nltk.download("averaged_perceptron_tagger_eng")
@@ -110,9 +111,16 @@ class NelPipeline(Pipeline):
             num_beams=1,
             num_return_sequences=1,
             max_new_tokens=30,
+            return_dict_in_generate=True,
+            output_scores=True,
         )
 
-        print(outputs)
+        token_ids, scores = outputs.sequences, outputs.sequences_scores
+
+        # Process scores and normalize
+        scores_tensor = scores.clone().detach()
+        probabilities = torch.exp(scores_tensor)
+        percentages = (probabilities * 100.0).cpu().numpy().tolist()
 
         # Decode the predictions into readable text
         wikipedia_predictions = self.tokenizer.batch_decode(
@@ -120,7 +128,7 @@ class NelPipeline(Pipeline):
         )
 
         # Return the predictions along with the extracted entity, lOffset, and rOffset
-        return wikipedia_predictions, enclosed_entity, lOffset, rOffset
+        return wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages
 
     def _forward(self, inputs):
         return inputs
@@ -145,7 +153,7 @@ class NelPipeline(Pipeline):
         # ], # This can be improved with a real API call to get the QID
         # "confidence_nel": np.round(percentages[i], 2),
         # }
-        wikipedia_predictions, enclosed_entity, lOffset, rOffset = outputs
+        wikipedia_predictions, enclosed_entity, lOffset, rOffset, percentages = outputs
         results = []
         for idx, wikipedia_name in enumerate(wikipedia_predictions):
             # Get QID
@@ -156,13 +164,13 @@ class NelPipeline(Pipeline):
             wkpedia_pagename, url = get_wikipedia_title(qid)
             results.append(
                 {
-                    "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
+                    # "id": f"{lOffset}:{rOffset}:{enclosed_entity}:{NEL_MODEL}",
                     "surface": enclosed_entity,
                     "wkpedia_pagename": wkpedia_pagename,
                     "wkd_id": qid,
                     "url": url,
                     "type": "UNK",
-                    "confidence_nel": 0.0,
+                    "confidence_nel": percentages[idx],
                     "lOffset": lOffset,
                     "rOffset": rOffset,
                 }
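
For reference, the confidence values this commit introduces come from sequences_scores in the generate() output, which is exposed once return_dict_in_generate=True and output_scores=True are set: each score is a log-probability, so torch.exp() maps it back into [0, 1] and multiplying by 100 gives the percentage stored under confidence_nel. The snippet below is a minimal standalone sketch of that computation, under two assumptions that are not taken from this repository: google/flan-t5-small is only a stand-in for the actual NEL checkpoint, and beam search (num_beams=2) is used because sequences_scores is the beam score of each returned sequence.

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    # Hypothetical stand-in checkpoint; the real pipeline loads its own NEL model.
    model_name = "google/flan-t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    inputs = tokenizer("Paris is the capital of France.", return_tensors="pt")

    outputs = model.generate(
        **inputs,
        num_beams=2,  # beam search is what populates sequences_scores
        num_return_sequences=1,
        max_new_tokens=30,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # sequences_scores holds a length-penalised log-probability per returned sequence;
    # exp() turns it into a probability and *100 into a percentage, as in the diff above.
    token_ids, scores = outputs.sequences, outputs.sequences_scores
    probabilities = torch.exp(scores.clone().detach())
    percentages = (probabilities * 100.0).cpu().numpy().tolist()

    predictions = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    for text, confidence in zip(predictions, percentages):
        print(f"{text!r} -> confidence_nel {confidence:.2f}%")

Exponentiating a length-penalised beam score gives a convenient 0-100 confidence signal, but it is not a calibrated probability, so the values are best read as relative rather than absolute confidence.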