samuelam committed
Commit: 3891395
1 Parent(s): c1ec713

Upload 6 files

evaluation/evaluate_utils/evaluate_dicts.py ADDED
@@ -0,0 +1,68 @@
+ from typing import Dict, List
+ import numpy as np
+
+ from evaluation.evaluate_utils.utils import _align_bags
+
+
+ def calculate_f1_score(precision, recall):
+     if precision + recall == 0:
+         return 0  # avoid division by zero when both precision and recall are 0
+     return 2 * (precision * recall) / (precision + recall)
+
+
+ def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
+     # Imported here to avoid a circular import with evaluate_factory.
+     from evaluation.evaluate_utils.evaluate_factory import get_evaluator_from_gold_answer
+
+     recall = []
+     for gold_key, gold_value in gold.items():
+         pred_value = pred.get(gold_key)
+         gold_value = fix_number(gold_value)
+         pred_value = fix_number(pred_value)
+         if gold_key not in pred:
+             recall.append(0)
+         else:
+             evaluator = (
+                 get_evaluator_from_gold_answer(type(gold_value))
+                 if use_gold_for_eval
+                 else get_evaluator_from_gold_answer(type(pred_value))
+             )
+             if type(pred_value) != type(gold_value):
+                 recall.append(0)
+                 continue
+             recall.append(evaluator(pred_value, gold_value))
+     avg_recall = np.average(recall)
+     return avg_recall
+
+
+ def fix_number(number):
+     # Strip currency/unit markers ($, %, sqft) and coerce to float when possible.
+     if type(number) == str:
+         copy_ans = number
+         copy_ans = ' '.join(' '.join(' '.join(copy_ans.split('$')).split('%')).split('sqft')).strip()
+         copy_ans = copy_ans.replace(',', '.')
+         try:
+             return float(copy_ans)
+         except ValueError:
+             return number
+     elif type(number) == int:
+         return float(number)
+     else:
+         return number
+
+
+ def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
+     # Recall is computed against the gold keys; precision by swapping the roles.
+     recall = calc_recall(pred, gold, True)
+     precision = calc_recall(gold, pred, False)
+     f1 = calculate_f1_score(precision, recall)
+     return f1
+
+
+ def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
+     if not (
+         type(pred) == dict
+         or len(pred) == 0
+         or (type(pred) == list and type(pred[0]) == dict)
+     ):
+         return 0
+     max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
+     return np.average(max_alignment_scores)
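
For orientation, here is a minimal sketch of how the dict evaluators above could be exercised on their own; the example dictionaries are invented and the snippet assumes the evaluation package is importable:

from evaluation.evaluate_utils.evaluate_dicts import evaluate_dicts, evaluate_pair_of_dicts

gold = [{"name": "Acme Corp", "founded": 1999}]
pred = [{"name": "Acme Corporation", "founded": "1999"}]

# Per-key scoring: string values use token F1, numeric values the log-ratio distance.
print(evaluate_pair_of_dicts(pred[0], gold[0]))  # ~0.75 here
# Lists of dicts are first aligned 1-1 (Hungarian matching), then scores are averaged.
print(evaluate_dicts(pred, gold))                # ~0.75 here
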
evaluation/evaluate_utils/evaluate_factory.py ADDED
@@ -0,0 +1,28 @@
+ from typing import Union
+
+ from evaluation.evaluate_utils.evaluate_dicts import evaluate_dicts
+ from evaluation.evaluate_utils.evaluate_numbers import evaluate_numbers
+ from evaluation.evaluate_utils.evaluate_strings import evaluate_strings
+
+ # Maps the answer-type label produced by parse_answer to a scoring function.
+ EvaluatorFactory = {
+     "string": evaluate_strings,
+     "number": evaluate_numbers,
+     "json": evaluate_dicts,
+     "string list": evaluate_strings,
+ }
+
+ # Maps a Python type to a scoring function; used when scoring individual dict values.
+ EvaluatorFactoryFromType = {
+     str: evaluate_strings,
+     int: evaluate_numbers,
+     float: evaluate_numbers,
+     bool: evaluate_strings,
+     list: evaluate_strings,
+ }
+
+
+ def get_evaluator(evaluator: str):
+     return EvaluatorFactory[evaluator]
+
+
+ def get_evaluator_from_gold_answer(gold_answer: Union[str, int, float]):
+     # Callers pass a type object (e.g. type(gold_value)), not the value itself.
+     return EvaluatorFactoryFromType[gold_answer]
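
A quick, illustrative look at what the two lookups return (the values are chosen arbitrarily):

from evaluation.evaluate_utils.evaluate_factory import (
    get_evaluator,
    get_evaluator_from_gold_answer,
)

number_scorer = get_evaluator("number")                    # evaluate_numbers
value_scorer = get_evaluator_from_gold_answer(type(3.14))  # float -> evaluate_numbers
print(number_scorer("120", 100.0))  # ~0.82, penalized by the log-ratio distance
print(value_scorer(3.0, 3.14))      # ~0.95
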
evaluation/evaluate_utils/evaluate_numbers.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Union
+ import numpy as np
+
+
+ # Renamed calc_z function to distance_function_log
+ def distance_function_log(pred: float, gold: float):
+     # Maps the log-ratio between prediction and gold to a score in [0, 1]:
+     # 1.0 for an exact match, decaying to 0 once the ratio exceeds e.
+     if pred == gold == 0:
+         return 1
+     if pred == 0:
+         pred = 1e-4
+     if gold == 0:
+         gold = 1e-4
+     if pred > gold:
+         return max(0, 1 - np.log(pred / gold))
+     else:
+         return max(0, 1 - np.log(gold / pred))
+
+
+ def evaluate_numbers(pred: Union[float, str], gold: float):
+     res = None
+     if type(pred) != float and type(pred) != int:
+         try:
+             pred = float(pred)
+         except (ValueError, TypeError):  # TypeError covers non-numeric inputs such as None
+             res = 0
+     if type(gold) != float and type(gold) != int:
+         try:
+             gold = float(gold)
+         except (ValueError, TypeError):
+             res = 0
+     if res is None:
+         res = distance_function_log(pred, gold)
+     return res
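
To make the scale of this metric concrete, a small illustrative check (the numbers are arbitrary):

from evaluation.evaluate_utils.evaluate_numbers import evaluate_numbers

print(evaluate_numbers(100.0, 100.0))  # 1.0  - exact match
print(evaluate_numbers("110", 100.0))  # ~0.9 - off by 10%
print(evaluate_numbers(300.0, 100.0))  # 0.0  - ratio exceeds e, score floors at 0
print(evaluate_numbers("n/a", 100.0))  # 0    - unparseable prediction
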
evaluation/evaluate_utils/evaluate_strings.py ADDED
@@ -0,0 +1,179 @@
+ """
+ Evaluation for two strings or list of strings.
+
+ Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
+ """
+
+ from collections import defaultdict
+ from typing import List, Set, Tuple, Union
+ import string
+ import re
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+
+ # From here through _normalize_answer was originally copied from:
+ # https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
+ # Then cleaned up and modified a bit.
+ def _remove_articles(text: str) -> str:
+     regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+     return re.sub(regex, " ", text)
+
+
+ def _white_space_fix(text: str) -> str:
+     return " ".join(text.split())
+
+
+ EXCLUDE = set(string.punctuation)
+
+
+ def _remove_punc(text: str) -> str:
+     if not _is_number(text):
+         return "".join(ch for ch in text if ch not in EXCLUDE)
+     else:
+         return text
+
+
+ def _lower(text: str) -> str:
+     return text.lower()
+
+
+ def _tokenize(text: str) -> List[str]:
+     return re.split(" |-", text)
+
+
+ def _normalize_answer(text: str) -> str:
+     """Lower text and remove punctuation, articles and extra whitespace."""
+
+     parts = [
+         _white_space_fix(
+             _remove_articles(_normalize_number(_remove_punc(_lower(token))))
+         )
+         for token in _tokenize(text)
+     ]
+     parts = [part for part in parts if part.strip()]
+     normalized = " ".join(parts).strip()
+     return normalized
+
+
+ def _is_number(text: str) -> bool:
+     try:
+         float(text)
+         return True
+     except ValueError:
+         return False
+
+
+ def _normalize_number(text: str) -> str:
+     if _is_number(text):
+         return str(float(text))
+     else:
+         return text
+
+
+ def _answer_to_bags(
+     answer: Union[str, List[str], Tuple[str, ...]]
+ ) -> Tuple[List[str], List[Set[str]]]:
+     if isinstance(answer, (list, tuple)):
+         raw_spans = answer
+     else:
+         raw_spans = [answer]
+     normalized_spans: List[str] = []
+     token_bags = []
+     for raw_span in raw_spans:
+         normalized_span = _normalize_answer(raw_span)
+         normalized_spans.append(normalized_span)
+         token_bags.append(set(normalized_span.split()))
+     return normalized_spans, token_bags
+
+
+ def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
+     """
+     Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
+     between them and gets maximum metric values over all the answers.
+     """
+     scores = np.zeros([len(gold), len(predicted)])
+     for gold_index, gold_item in enumerate(gold):
+         for pred_index, pred_item in enumerate(predicted):
+             if _match_numbers_if_present(gold_item, pred_item):
+                 scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
+     row_ind, col_ind = linear_sum_assignment(-scores)
+
+     max_scores = np.zeros([max(len(gold), len(predicted))])
+     for row, column in zip(row_ind, col_ind):
+         max_scores[row] = max(max_scores[row], scores[row, column])
+     return max_scores
+
+
+ def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
+     intersection = len(gold_bag.intersection(predicted_bag))
+     if not predicted_bag:
+         precision = 1.0
+     else:
+         precision = intersection / float(len(predicted_bag))
+     if not gold_bag:
+         recall = 1.0
+     else:
+         recall = intersection / float(len(gold_bag))
+     f1 = (
+         (2 * precision * recall) / (precision + recall)
+         if not (precision == 0.0 and recall == 0.0)
+         else 0.0
+     )
+     return f1
+
+
+ def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
+     gold_numbers = set()
+     predicted_numbers = set()
+     for word in gold_bag:
+         if _is_number(word):
+             gold_numbers.add(word)
+     for word in predicted_bag:
+         if _is_number(word):
+             predicted_numbers.add(word)
+     if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
+         return True
+     return False
+
+
+ def get_metrics(
+     predicted: Union[str, List[str], Tuple[str, ...]],
+     gold: Union[str, List[str], Tuple[str, ...]],
+ ) -> Tuple[float, float]:
+     """
+     Takes a predicted answer and a gold answer (that are both either a string or a list of
+     strings), and returns exact match and the DROP F1 metric for the prediction. If you are
+     writing a script for evaluating objects in memory (say, the output of predictions during
+     validation, or while training), this is the function you want to call, after using
+     :func:`answer_json_to_strings` when reading the gold answer from the released data file.
+     """
+     predicted_bags = _answer_to_bags(predicted)
+     gold_bags = _answer_to_bags(gold)
+
+     if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
+         gold_bags[0]
+     ):
+         exact_match = 1.0
+     else:
+         exact_match = 0.0
+
+     f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+     f1 = np.mean(f1_per_bag)
+     f1 = round(f1, 2)
+     return exact_match, f1
+
+
+ def evaluate_strings(prediction, gold):
+     if type(prediction) != list and type(prediction) != str:
+         prediction = str(prediction)
+     if type(gold) != list and type(gold) != str:
+         gold = str(gold)
+     try:
+         predicted_bags = _answer_to_bags(prediction)
+         gold_bags = _answer_to_bags(gold)
+         f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
+         f1 = np.mean(f1_per_bag)
+     except Exception:
+         f1 = 0.0
+     return f1
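
A brief sketch of the bag-of-tokens scoring this module performs (all strings are made up for illustration):

from evaluation.evaluate_utils.evaluate_strings import evaluate_strings, get_metrics

# Articles, punctuation and case are normalized away before comparing token bags.
print(evaluate_strings("The Eiffel Tower", "Eiffel Tower"))        # 1.0
# Multiple spans are aligned 1-1 and their F1 scores averaged.
print(evaluate_strings(["Paris", "London"], ["Paris", "Berlin"]))  # 0.5

exact_match, f1 = get_metrics("The answer is 42", "42")
print(exact_match, f1)  # 0.0 0.5 - partial token overlap, no exact match
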
evaluation/evaluate_utils/utils.py ADDED
@@ -0,0 +1,24 @@
+ from typing import List, Set, Tuple, Union, Callable
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+
+ def _align_bags(
+     predicted: List[Set[str]],
+     gold: List[Set[str]],
+     method: Callable[[object, object], float],
+ ) -> List[float]:
+     """
+     Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
+     between them and gets maximum metric values over all the answers.
+     """
+     scores = np.zeros([len(gold), len(predicted)])
+     for gold_index, gold_item in enumerate(gold):
+         for pred_index, pred_item in enumerate(predicted):
+             scores[gold_index, pred_index] = method(pred_item, gold_item)
+     row_ind, col_ind = linear_sum_assignment(-scores)
+
+     max_scores = np.zeros([max(len(gold), len(predicted))])
+     for row, column in zip(row_ind, col_ind):
+         max_scores[row] = max(max_scores[row], scores[row, column])
+     return max_scores
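
Illustrative only: _align_bags searches for the pairing of predicted and gold items that maximizes the supplied metric, as the sketch below shows with a trivial equality metric:

from evaluation.evaluate_utils.utils import _align_bags

gold = [{"a": 1}, {"b": 2}]
pred = [{"b": 2}, {"a": 1}]

# Hungarian matching on the score matrix pairs each gold dict with its equal prediction.
print(_align_bags(pred, gold, lambda p, g: float(p == g)))  # [1. 1.]
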
evaluation/evaluator.py ADDED
@@ -0,0 +1,89 @@
+ import json
+ from evaluation.evaluate_utils.evaluate_factory import get_evaluator
+
+
+ def fix_ans(answer):
+     # Best-effort conversion of a Python-style dict string (single quotes) to valid JSON.
+     try:
+         answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
+         answer = answer.replace("': ", '": ')
+         return answer
+     except:
+         return answer
+
+
+ def parse_answer(answer):
+     # Classifies the gold answer as a number, JSON dict(s), string, or string list.
+     if len(answer) == 1:
+         if answer[0].isnumeric():
+             ans, is_num = fix_number(answer[0])
+             if is_num:
+                 return ans, 'number'
+         try:
+             ans = json.loads(fix_ans(answer[0]))
+             return [ans], 'json'
+         except:
+             ans, is_num = fix_number(answer[0])
+             if is_num:
+                 return ans, 'number'
+             else:
+                 return answer[0], 'string'
+     else:
+         try:
+             ans = [json.loads(fix_ans(ex)) for ex in answer]
+             return ans, 'json'
+         except:
+             return answer, "string list"
+
+
+ def fix_number(number):
+     # Strip currency/unit markers ($, %, sqft, "square kilometers") and coerce to float.
+     # Returns a (value, is_number) pair.
+     if type(number) == str:
+         copy_ans = number
+         copy_ans = ' '.join(' '.join(' '.join(copy_ans.split('$')).split('%')).split('sqft')).strip()
+         copy_ans = copy_ans.replace(',', '.').replace(' square kilometers', '')
+         try:
+             return float(copy_ans), True
+         except ValueError:
+             return number, False
+     elif type(number) == int:
+         return float(number), True
+     else:
+         return number, True
+
+
+ def fix_prediction(prediction, gold_answer, evaluator):
+     # Normalize the prediction so it matches the shape the chosen evaluator expects.
+     if type(prediction) == list and len(prediction) == 1 and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric())):
+         prediction, _ = fix_number(prediction[0])  # fix_number returns (value, is_number)
+
+     if type(prediction) != list:
+         prediction, is_num = fix_number(prediction)
+         if evaluator == 'json':
+             try:
+                 prediction = [json.loads(pred) for pred in prediction.split('\n')]
+             except:
+                 prediction = [prediction]
+
+     if (hasattr(type(prediction), '__len__')) and (len(prediction) == 0):
+         return prediction, False
+
+     if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
+         return prediction, False
+
+     return prediction, True
+
+
+ def question_scorer(prediction, gold_answer):
+     # Entry point: parse the gold answer, normalize the prediction, and dispatch to the
+     # matching evaluator (string, number, json, or string list).
+     answer_list = [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer
+     gold_answer, evaluator = parse_answer(answer_list)
+     prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)
+
+     if not run_eval:
+         return 0.0
+
+     metric_eval = get_evaluator(evaluator)
+     accuracy = metric_eval(prediction, gold_answer)
+     return accuracy
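
End to end, question_scorer is the entry point; a hedged usage sketch follows (the predictions and gold answers below are invented, and the evaluation package must be on the path):

from evaluation.evaluator import question_scorer

# Numeric gold: unit/currency markers are stripped, then the log-ratio distance is applied.
print(question_scorer("$2100", "2100"))                # 1.0

# Multi-line gold is treated as a string list and scored with DROP-style token F1.
print(question_scorer(["red", "blue"], "red\ngreen"))  # 0.5
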