|
""" |
|
Data structure classes required and used for multiple levels of granularity in spans. |
|
""" |
|
from data_loader import dl_sa |
|
from mosestokenizer import MosesDetokenizer |
|
# Module-level Moses detokenizer, built once at import time with the 'en' language code.
# PhraseAnnotation.word_string calls it to join a phrase's word strings back into text.
detokenize = MosesDetokenizer('en')
|
|
|
|
|
class PhraseAnnotation:
    """
    A phrase-level span made of one or more word annotations.

    The phrase is opened with a single word and extended through ``add``. Its resolved annotation is
    copied from the first word and is only replaced via ``set_alternative_as_resolved_annotation``.
    """

    def __init__(self, initial_word):
        """Open a phrase holding only ``initial_word`` and take over its annotation and NED candidates."""
        self.words = [initial_word]
        self._resolved_annotation = initial_word.resolved_annotation
        self.ppr_for_ned_candidates = initial_word.ppr_for_ned_candidates

    @property
    def has_valid_bioes_labels(self):
        """True when every word flags its BIOES labels as valid and its ``bioes_labels`` is not None."""
        return not any(
            not member.has_valid_bioes_labels or member.bioes_labels is None
            for member in self.words
        )

    def add(self, word):
        """
        Append ``word`` to the phrase.

        When the resolved annotation is positive and the word's NED candidates differ from the phrase's,
        the phrase keeps only the candidates present in both (as a list; order is not preserved).
        """
        self.words.append(word)
        if not self._resolved_annotation > 0:
            return
        if self.ppr_for_ned_candidates != word.ppr_for_ned_candidates:
            shared = set(self.ppr_for_ned_candidates) & set(word.ppr_for_ned_candidates)
            self.ppr_for_ned_candidates = list(shared)

    def all_possible_annotations(self):
        """
        Rank the candidate ids shared by every word of the phrase.

        :return: list of ``(candidate_id, confidence)`` sorted by descending confidence, where confidence is
            the average over words of each word's mean probability for that candidate.
        """
        id_sets = [{candidate[0] for candidate in member.candidates} for member in self.words]
        shared_ids = set.intersection(*id_sets)
        ranked = []
        for candidate_id in shared_ids:
            # One mean per matching candidate entry, visiting the words in phrase order.
            per_word_means = [
                sum(candidate[1]) / len(candidate[1])
                for member in self.words
                for candidate in member.candidates
                if candidate[0] == candidate_id
            ]
            ranked.append((candidate_id, sum(per_word_means) / len(per_word_means)))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

    def set_alternative_as_resolved_annotation(self, alternative):
        """Replace the phrase's resolved annotation with ``alternative``."""
        self._resolved_annotation = alternative

    @property
    def resolved_annotation(self):
        """The annotation id currently assigned to the phrase."""
        return self._resolved_annotation

    @property
    def subword_annotations(self):
        """All subword annotations of all words, flattened in phrase order."""
        flattened = []
        for member in self.words:
            flattened.extend(member.annotations)
        return flattened

    @property
    def word_string(self):
        """The phrase text: each word string after the substitutions below, joined by ``detokenize``."""
        # NOTE(review): as they appear here, the "£", 'ü' and 'é' pairs replace a string with itself;
        # the search strings may have lost a character somewhere upstream -- verify against the tokenizer.
        substitutions = (
            ("\n", "\u010a"),
            ("£", "£"),
            ("âĦ¢", '™'),
            ('ü', 'ü'),
            ('é', 'é'),
            ('ÃŃ', 'í'),
        )
        cleaned_words = []
        for member in self.words:
            text = member.word_string
            # Applied in order, one after another, exactly like a chain of str.replace calls.
            for needle, replacement in substitutions:
                text = text.replace(needle, replacement)
            cleaned_words.append(text)
        return detokenize(cleaned_words)

    @property
    def begin_character(self):
        """First element of the offsets stored with the first token of the first word."""
        first_token = self.words[0].token_offsets[0]
        return first_token[1][0]

    @property
    def end_character(self):
        """Last element of the offsets stored with the last token of the last word."""
        last_token = self.words[-1].token_offsets[-1]
        return last_token[1][-1]

    @property
    def average_annotation_confidence(self):
        """Mean of the words' ``resolved_annotation_confidence`` values."""
        confidences = [member.resolved_annotation_confidence for member in self.words]
        return sum(confidences) / len(confidences)

    def __str__(self):
        text = self.word_string
        start = self.begin_character
        end = self.end_character
        # The id-to-tag mapping is read from the first subword annotation of the first word.
        tag = self.words[0].annotations[0].idx2tag[self.resolved_annotation]
        return f"{text} ({start}, {end}) | annotation: {tag}"
|
|
|
|
|
class WordAnnotation:
    """
    Word-level annotation aggregated from the annotations of the word's subword tokens.

    Attributes set by the constructor:
      - ``annotations``: the subword annotations passed in (an empty list when none were given).
      - ``token_offsets``: the per-token entries passed in; element 0 of each entry is the token text.
      - ``ppr_for_ned_candidates``: the list passed in, or a fresh empty list.
      - ``is_valid_annotation``: whether any subword annotation was supplied.
      - ``word_string``: the token texts concatenated with every ``'\\u0120'`` removed.
      - ``candidates``: ``(id, probabilities)`` pairs sorted by descending mean probability, with one
        probability per subword (0.0 where the id is missing from that subword's top-k list).
      - ``resolved_annotation`` / ``resolved_annotation_confidence``: the chosen id and the mean
        probability the subwords assign to it (0.0 when there are no subwords).
      - ``has_valid_bioes_labels`` / ``bioes_labels``: BIOES validity flag and the per-subword labels
        (``None`` when any subword lacks a valid label).
    """

    def __init__(self, subword_annotations, token_offsets, ppr_for_ned_candidates=None):
        """
        :param subword_annotations: subword annotation objects of this word; may be empty or None.
        :param token_offsets: sequence of entries whose first element is the token string.
        :param ppr_for_ned_candidates: optional candidate list; defaults to a new empty list.
        """
        if ppr_for_ned_candidates is None:
            ppr_for_ned_candidates = []
        # None is treated like "no subwords" so that the loops below never iterate over None.
        self.annotations = [] if subword_annotations is None else subword_annotations
        self.token_offsets = token_offsets
        self.ppr_for_ned_candidates = ppr_for_ned_candidates
        self.is_valid_annotation = bool(subword_annotations)
        self.word_string = ''.join(x[0].replace('\u0120', '') for x in token_offsets)

        self.candidates = self._rank_shared_candidates()
        self.resolved_annotation = self._resolve_annotation()
        rc = self._get_assigned_probabilities(self.resolved_annotation)
        # With no subwords rc is empty; a plain sum/len here used to raise ZeroDivisionError.
        self.resolved_annotation_confidence = self._mean(rc)
        if not self.candidates:
            # No id is shared by all subwords: the resolved id becomes the only candidate.
            self.candidates = [(self.resolved_annotation, rc)]
        assert self.resolved_annotation in [x[0] for x in self.candidates]
        self.has_valid_bioes_labels = all(x.has_valid_bioes_label for x in self.annotations)
        self.bioes_labels = [x.bioes_label for x in self.annotations] if self.has_valid_bioes_labels else None

    @staticmethod
    def _mean(values):
        """Arithmetic mean of ``values``; 0.0 for an empty sequence."""
        return sum(values) / len(values) if values else 0.0

    def _rank_shared_candidates(self):
        """Return ``(id, probabilities)`` for ids found in every subword's top-k list, best mean first."""
        if not self.is_valid_annotation:
            return []
        shared_ids = set.intersection(*[set(y.top_k_i_list) for y in self.annotations])
        scored = [(cid, self._get_assigned_probabilities(cid)) for cid in shared_ids]
        return sorted(scored, key=lambda x: self._mean(x[1]), reverse=True)

    def _resolve_annotation(self):
        """
        Choose the annotation id of the word.

        Returns 0 when there are no subwords. Otherwise, in order of precedence: the common top
        prediction when all subwords agree; the best shared candidate when one exists; the most
        frequent top prediction, or the first subword's prediction when every prediction is unique.
        """
        if not self.is_valid_annotation:
            return 0
        r = [x.item() for x in self.annotations]
        if r.count(r[0]) == len(r):
            return r[0]
        if self.candidates:
            return self.candidates[0][0]
        most_frequent = max(set(r), key=r.count)
        # A "majority" of one is no majority: fall back to the first subword's prediction.
        return r[0] if r.count(most_frequent) == 1 else most_frequent

    def _get_assigned_probabilities(self, cid):
        """
        Collect the probability each subword assigns to ``cid``.

        :return: list with one entry per subword: the probability paired with the first occurrence of
            ``cid`` in that subword's top-k ids, or 0.0 when ``cid`` is not among them.
        """
        return [
            next((p for i, p in zip(a.top_k_i_list, a.top_k_p_list) if i == cid), 0.0)
            for a in self.annotations
        ]

    def _tag_for(self, idx):
        """Translate ``idx`` through the first subword's ``idx2tag``; return the raw id without subwords."""
        if not self.annotations:
            return idx
        return self.annotations[0].idx2tag[idx]

    def __str__(self):
        ann = self._tag_for(self.resolved_annotation)
        cdns = ','.join(f'({self._tag_for(x[0])}: {self._mean(x[1])})' for x in self.candidates)
        return f"{self.word_string} | annotation: {ann} | candidates: [{cdns}]"
|
|
|
|
|
class SubwordAnnotation:
    """
    The value of this class will be equal to the value of its "self.top_k_i_list[0]", the rest of the information will
    be carried over for future decision-making and evaluation.

    Because ``__eq__`` is defined without ``__hash__``, instances are not hashable.
    """

    def __init__(self, top_k_p_list, top_k_i_list, subword_string):
        """
        :param top_k_p_list: probabilities paired position-by-position with ``top_k_i_list``.
        :param top_k_i_list: predicted ids; element 0 is the value of the annotation.
        :param subword_string: subword text; a falsy value is stored as ``"UNDEF_STR"``. Every
            ``'\\u0120'`` character is removed.
        """
        self.top_k_p_list = top_k_p_list
        self.top_k_i_list = top_k_i_list
        self.subword_string = (subword_string or "UNDEF_STR").replace('\u0120', '')
        # Placeholder label until set_bioes_label is called (presumably the "outside" tag -- TODO confirm).
        self.bioes_label = 2
        self.has_valid_bioes_label = False
        self.bioes_probabilities = None

    def __eq__(self, other):
        """
        Compare by top prediction, against an ``int`` or another ``SubwordAnnotation``.

        :raises ValueError: when ``other`` is of any other type.
        """
        if isinstance(other, int):
            return self.top_k_i_list[0] == other
        if isinstance(other, SubwordAnnotation):
            return self.top_k_i_list[0] == other.top_k_i_list[0]
        raise ValueError

    def __str__(self):
        ranked = '>> <<'.join(
            f'{dl_sa.mentions_itos[i]}: {p:.3f}' for i, p in zip(self.top_k_i_list, self.top_k_p_list))
        return f"({self.subword_string}, <<{ranked}>>)"

    def item(self):
        """Return the top predicted id."""
        return self.top_k_i_list[0]

    def item_probability(self):
        """Return the probability stored alongside the top predicted id."""
        return self.top_k_p_list[0]

    def set_bioes_label(self, label: int, probs: list):
        """
        Store a BIOES label together with its probability distribution and mark the label as valid.

        :param label: index into ``probs``; must satisfy ``0 <= label < 5``.
        :param probs: exactly five probabilities.
        :raises AssertionError: when either constraint is violated; the stored state is left untouched.
        """
        assert len(probs) == 5, "expected exactly 5 BIOES probabilities"
        # The upper bound used to be inclusive (label <= 5), admitting a label with no matching probability.
        assert 0 <= label < len(probs), "BIOES label must index into the probabilities"
        self.has_valid_bioes_label = True
        self.bioes_label = label
        self.bioes_probabilities = probs
|
|