File size: 5,487 Bytes
c337225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
This file contains the implementation of the candidate manager in charge of loading the candidate sets,
 and modifying the phrase annotations using the loaded candidates.
"""
import json
from span_annotation import PhraseAnnotation
from configuration import get_resources_dir


class CandidateManager:
    """
    Loads one predefined candidate set and uses it to constrain phrase annotations.

    Exactly one source is selected at construction time:

    * ``is_kb_yago``: a single JSON dictionary keyed by the lower-cased mention text; each candidate is a
      sequence whose first element is the candidate name and whose second element is its prior probability.
    * ``is_ppr_for_ned``: the PPRforNED dictionaries of the two AIDA test splits, merged into one. They are
      keyed either by mention text (context agnostic) or by sentence and then by span / mention text
      (context aware).

    Attributes set by the constructor: ``mentions_vocab``, ``candidates``, ``is_kb_yago``, ``is_ppr_for_ned``,
    ``is_context_agnostic`` and ``is_indexed_for_spans``.
    """

    def __init__(self, mentions_vocab, is_kb_yago=False, is_ppr_for_ned=False, is_context_agnostic=False,
                 is_indexed_for_spans=False):
        """
        :param mentions_vocab: container supporting ``in`` and ``[]`` that maps a candidate name to the value
          compared against the first element of each possible annotation (presumably an integer id - confirm
          against the caller).
        :param is_kb_yago: load the KB+YAGO candidates; this forces the context agnostic, non span-indexed mode.
        :param is_ppr_for_ned: load the PPRforNED candidates (only consulted when ``is_kb_yago`` is False).
        :param is_context_agnostic: for PPRforNED, look candidates up by mention text only.
        :param is_indexed_for_spans: for context aware PPRforNED, look candidates up by character span.
        :raises ValueError: if neither ``is_kb_yago`` nor ``is_ppr_for_ned`` is True.
        """
        self.mentions_vocab = mentions_vocab
        self.candidates = None
        if is_kb_yago:
            print(" * Loading the candidates stored for KB+YAGO ...")
            # The KB+YAGO dictionary is keyed by mention text only, so the other two flags are overridden.
            is_context_agnostic = True
            is_indexed_for_spans = False
            self.load_kb_plus_yago()
        elif is_ppr_for_ned:
            mode = 'context agnostic' if is_context_agnostic else 'context aware'
            print(f" * Loading the {mode} PPRforNED candidate set ...")
            self.load_ppr_for_ned_candidates(is_context_agnostic, is_indexed_for_spans)
        else:
            raise ValueError("Either 'is_kb_yago' or 'is_ppr_for_ned' flags must be True!")
        self.is_context_agnostic = is_context_agnostic
        self.is_indexed_for_spans = is_indexed_for_spans
        self.is_kb_yago = is_kb_yago
        self.is_ppr_for_ned = is_ppr_for_ned

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at ``path``, closing the file handle as soon as parsing is done."""
        with open(path, "r", encoding="utf-8") as json_file:
            return json.load(json_file)

    @staticmethod
    def _merge_candidate_sets(candidates_a, candidates_b, is_context_agnostic):
        """
        Merge ``candidates_b`` into ``candidates_a`` in place and return ``candidates_a``.

        In the context agnostic case the candidate lists of a shared key are unioned (order preserved, elements
        already present are skipped); otherwise the entries of ``candidates_b`` overwrite those of
        ``candidates_a`` for shared keys.
        """
        if not is_context_agnostic:
            candidates_a.update(candidates_b)
            return candidates_a
        for key, new_elements in candidates_b.items():
            if key not in candidates_a:
                candidates_a[key] = new_elements
                continue
            known_elements = candidates_a[key]
            for element in new_elements:
                if element not in known_elements:
                    known_elements.append(element)
        return candidates_a

    def load_ppr_for_ned_candidates(self, is_context_agnostic, is_indexed_for_spans):
        """
        Load the PPRforNED candidates of both AIDA test splits and store their merge in ``self.candidates``.

        ``is_context_agnostic`` takes precedence over ``is_indexed_for_spans`` when choosing the file to read.
        """
        if is_context_agnostic:
            file_address = "context_agnostic_mentions.json"
        elif is_indexed_for_spans:
            file_address = "context_aware_spans.json"
        else:
            file_address = "context_aware_mentions.json"
        candidates_dir = get_resources_dir() / "data" / "candidates"
        candidates_a = self._load_json(candidates_dir / "aida_testa_pprforned" / file_address)
        candidates_b = self._load_json(candidates_dir / "aida_testb_pprforned" / file_address)
        self.candidates = self._merge_candidate_sets(candidates_a, candidates_b, is_context_agnostic)

    def load_kb_plus_yago(self):
        """Load the KB+YAGO candidate dictionary into ``self.candidates``."""
        self.candidates = self._load_json(
            get_resources_dir() / "data" / "candidates" / "kb_plus_yago_candidates.json")

    def _fetch_candidates(self, phrase_annotation, sentence=None):
        """
        Return the stored candidates for ``phrase_annotation``, or an empty list when there are none.

        :param phrase_annotation: object exposing ``word_string``, ``begin_character`` and ``end_character``.
        :param sentence: the sentence containing the phrase; only used for the context aware PPRforNED lookup.
        """
        if self.is_kb_yago:
            return self.candidates.get(phrase_annotation.word_string.lower(), [])
        if not self.is_ppr_for_ned:
            return []
        # TODO lower-cased check mention surface forms
        if self.is_context_agnostic:
            return self.candidates.get(phrase_annotation.word_string, [])
        sentence_candidates = self.candidates.get(sentence)
        if sentence_candidates is None:
            return []
        if self.is_indexed_for_spans:
            # Span keys are stored as the textual form of a (begin, end) tuple.
            lookup_key = f"({phrase_annotation.begin_character}, {phrase_annotation.end_character})"
        else:
            lookup_key = phrase_annotation.word_string
        return sentence_candidates.get(lookup_key, [])

    def modify_phrase_annotation_using_candidates(self, phrase_annotation: "PhraseAnnotation", sentence: str = None):
        """
        The method post processes the :param phrase_annotation: found in a :param sentence: to make sure it is bound to
          the predefined {self.candidates} set.
        It is not possible to perform candidate look up for spans in context agnostic scenario
          so {self.is_indexed_for_spans} will only be considered where {self.is_context_agnostic} is False.

        Outcome, in order of precedence:
          - no candidate set loaded, or the current resolved annotation equals 0: nothing is changed;
          - no candidates stored for the phrase: the resolved annotation is set to 0;
          - none of the stored candidates is in {self.mentions_vocab}: nothing is changed
            (NOTE(review): kept from the original implementation - confirm this is intended);
          - otherwise the possible annotation with the highest score among the candidates is selected
            (the first one wins on ties), or 0 if no possible annotation is a candidate.
        """
        if self.candidates is None or phrase_annotation.resolved_annotation == 0:
            return
        candidates = self._fetch_candidates(phrase_annotation, sentence)
        if not candidates:
            phrase_annotation.set_alternative_as_resolved_annotation(0)
            return
        # KB+YAGO entries are (name, prior probability) pairs; PPRforNED entries are plain names.
        # TODO use the prior probabilities (second element of the KB+YAGO entries) to adjust the probabilities
        candidate_names = [entry[0] for entry in candidates] if self.is_kb_yago else candidates
        allowed_annotations = [
            self.mentions_vocab[name] for name in candidate_names if name in self.mentions_vocab]
        if not allowed_annotations:
            return
        best_prediction = max(
            (prediction for prediction in phrase_annotation.all_possible_annotations()
             if prediction[0] in allowed_annotations),
            key=lambda prediction: prediction[1], default=None)
        phrase_annotation.set_alternative_as_resolved_annotation(
            0 if best_prediction is None else best_prediction[0])