wadood committed on
Commit
19d19d4
·
1 Parent(s): 1c461bc

added exact span eval metric

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +11 -8
  3. evaluation_metrics.py +84 -42
  4. predefined_example.py +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
app.py CHANGED
@@ -4,7 +4,7 @@ from annotated_text import annotated_text
4
  from annotated_text.util import get_annotated_html
5
  from streamlit_annotation_tools import text_labeler
6
 
7
- from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
8
  from predefined_example import EXAMPLES
9
  from span_dataclass_converters import (
10
  get_highlight_spans_from_ner_spans,
@@ -20,12 +20,13 @@ def get_examples_attributes(selected_example):
20
  selected_example.gt_labels,
21
  selected_example.gt_spans,
22
  selected_example.predictions,
 
23
  )
24
 
25
 
26
  if __name__ == "__main__":
27
  st.set_page_config(layout="wide")
28
- st.title("NER Evaluation Metrics Comparison")
29
 
30
  st.write(
31
  "Evaluation for the NER task requires a ground truth and a prediction that will be evaluated. The ground truth is shown below, add predictions in the next section to compare the evaluation metrics."
@@ -40,7 +41,9 @@ if __name__ == "__main__":
40
  format_func=lambda ex: ex.text,
41
  )
42
 
43
- text, gt_labels, gt_spans, predictions = get_examples_attributes(selected_example)
 
 
44
 
45
  annotated_text(
46
  get_highlight_spans_from_ner_spans(
@@ -116,17 +119,17 @@ Add predictions to the list of predictions on which the evaluation metric will b
116
  st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)
117
 
118
  if st.button("Get Metrics!"):
119
- for evaluation_metric_type in EVALUATION_METRICS:
120
- predictions_df[evaluation_metric_type] = predictions_df.ner_spans.apply(
121
- lambda ner_spans: get_evaluation_metric(
122
- metric_type=evaluation_metric_type,
123
  gt_ner_span=gt_spans,
124
  pred_ner_span=ner_spans,
125
  text=text,
 
126
  )
127
  )
128
 
129
  metrics_df = predictions_df.drop(["ner_spans"], axis=1)
130
 
131
  st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
132
- print("compared")
 
4
  from annotated_text.util import get_annotated_html
5
  from streamlit_annotation_tools import text_labeler
6
 
7
+ from evaluation_metrics import EVALUATION_METRICS
8
  from predefined_example import EXAMPLES
9
  from span_dataclass_converters import (
10
  get_highlight_spans_from_ner_spans,
 
20
  selected_example.gt_labels,
21
  selected_example.gt_spans,
22
  selected_example.predictions,
23
+ selected_example.tags,
24
  )
25
 
26
 
27
  if __name__ == "__main__":
28
  st.set_page_config(layout="wide")
29
+ st.title("NER Metrics Comparison")
30
 
31
  st.write(
32
  "Evaluation for the NER task requires a ground truth and a prediction that will be evaluated. The ground truth is shown below, add predictions in the next section to compare the evaluation metrics."
 
41
  format_func=lambda ex: ex.text,
42
  )
43
 
44
+ text, gt_labels, gt_spans, predictions, tags = get_examples_attributes(
45
+ selected_example
46
+ )
47
 
48
  annotated_text(
49
  get_highlight_spans_from_ner_spans(
 
119
  st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)
120
 
121
  if st.button("Get Metrics!"):
122
+ for evaluation_metric in EVALUATION_METRICS:
123
+ predictions_df[evaluation_metric.name] = predictions_df.ner_spans.apply(
124
+ lambda ner_spans: evaluation_metric.get_evaluation_metric(
125
+ # metric_type=evaluation_metric_type,
126
  gt_ner_span=gt_spans,
127
  pred_ner_span=ner_spans,
128
  text=text,
129
+ tags=tags,
130
  )
131
  )
132
 
133
  metrics_df = predictions_df.drop(["ner_spans"], axis=1)
134
 
135
  st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
 
evaluation_metrics.py CHANGED
@@ -1,49 +1,91 @@
 
 
1
  from nervaluate import Evaluator
2
  from sklearn.metrics import classification_report
3
 
4
  from token_level_output import get_token_output_labels
5
 
6
- EVALUATION_METRICS = [
7
- "Span Based Evaluation with Partial Overlap",
8
- "Token Based Evaluation with Micro Avg",
9
- "Token Based Evaluation with Macro Avg",
10
- ]
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- def get_span_eval(gt_ner_span, pred_ner_span, text):
14
- evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=["Disease", "Drug"])
15
- return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)
16
-
17
-
18
- def get_token_micro_eval(gt_ner_span, pred_ner_span, text):
19
- return round(
20
- classification_report(
21
- get_token_output_labels(gt_ner_span, text),
22
- get_token_output_labels(pred_ner_span, text),
23
- labels=["Disease", "Drug"],
24
- output_dict=True,
25
- )["micro avg"]["f1-score"],
26
- 2,
27
- )
28
-
29
-
30
- def get_token_macro_eval(gt_ner_span, pred_ner_span, text):
31
- return round(
32
- classification_report(
33
- get_token_output_labels(gt_ner_span, text),
34
- get_token_output_labels(pred_ner_span, text),
35
- labels=["Disease", "Drug"],
36
- output_dict=True,
37
- )["macro avg"]["f1-score"],
38
- 2,
39
- )
40
-
41
-
42
- def get_evaluation_metric(metric_type, gt_ner_span, pred_ner_span, text):
43
- match metric_type:
44
- case "Span Based Evaluation with Partial Overlap":
45
- return get_span_eval(gt_ner_span, pred_ner_span, text)
46
- case "Token Based Evaluation with Micro Avg":
47
- return get_token_micro_eval(gt_ner_span, pred_ner_span, text)
48
- case "Token Based Evaluation with Macro Avg":
49
- return get_token_macro_eval(gt_ner_span, pred_ner_span, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
  from nervaluate import Evaluator
4
  from sklearn.metrics import classification_report
5
 
6
  from token_level_output import get_token_output_labels
7
 
 
 
 
 
 
8
 
9
class EvaluationMetric(ABC):
    """Base class defining the attributes & methods of an evaluation metric.

    Attributes:
        name: Label of the metric; concrete metrics assign it in ``__init__``.
        description: Free-text description of the metric; concrete metrics
            assign it in ``__init__``.
    """

    name: str
    description: str

    # ``@staticmethod`` goes outermost (``@abstractmethod`` must be the
    # innermost decorator) so the abstract declaration has the same call
    # shape as the concrete overrides, which are static methods and take
    # no ``self``.
    @staticmethod
    @abstractmethod
    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
        """Score predicted NER spans against the ground-truth spans.

        Args:
            gt_ner_span: Ground-truth spans for ``text``.
            pred_ner_span: Predicted spans for ``text``.
            text: The text the spans refer to.
            tags: Entity labels the metric should take into account.

        Returns:
            The metric value as a float.
        """
18
+
19
+
20
class PartialSpanOverlapMetric(EvaluationMetric):
    """Span-level F1 read from the ``ent_type`` entry of nervaluate's output."""

    def __init__(self) -> None:
        super().__init__()

        self.description = ""
        self.name = "Span Based Evaluation with Partial Overlap"

    @staticmethod
    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
        """Return the ``ent_type`` F1 score rounded to two decimals.

        ``text`` is not used by this metric; it is accepted so that all
        metrics share one signature.
        """
        # The spans are wrapped in single-element lists before being handed
        # to the evaluator (presumably one entry per document - TODO confirm).
        span_evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=tags)
        evaluation_output = span_evaluator.evaluate()
        # NOTE(review): element 0 is assumed to hold the aggregated results;
        # verify against the installed nervaluate version.
        f1_score = evaluation_output[0]["ent_type"]["f1"]
        return round(f1_score, 2)
31
+
32
+
33
class ExactSpanOverlapMetric(EvaluationMetric):
    """Span-level F1 read from the ``strict`` entry of nervaluate's output."""

    def __init__(self) -> None:
        super().__init__()

        self.description = ""
        self.name = "Span Based Evaluation with Exact Overlap"

    @staticmethod
    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
        """Return the ``strict`` F1 score rounded to two decimals.

        ``text`` is not used by this metric; it is accepted so that all
        metrics share one signature.
        """
        # The spans are wrapped in single-element lists before being handed
        # to the evaluator (presumably one entry per document - TODO confirm).
        span_evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=tags)
        evaluation_output = span_evaluator.evaluate()
        # NOTE(review): element 0 is assumed to hold the aggregated results;
        # verify against the installed nervaluate version.
        f1_score = evaluation_output[0]["strict"]["f1"]
        return round(f1_score, 2)
44
+
45
+
46
class TokenMicroMetric(EvaluationMetric):
    """Token-level F1 taken from the ``micro avg`` row of the classification report."""

    def __init__(self) -> None:
        super().__init__()

        # The score is computed on token labels (see get_token_output_labels
        # below), so the label says "Token Based", matching the macro-average
        # metric's naming.
        self.name = "Token Based Evaluation with Micro Average"
        self.description = ""

    @staticmethod
    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
        """Return the micro-averaged token-level F1 rounded to two decimals.

        Args:
            gt_ner_span: Ground-truth spans for ``text``.
            pred_ner_span: Predicted spans for ``text``.
            text: The text both span lists refer to; used to derive the
                token labels.
            tags: Labels passed to the report as ``labels``.

        Returns:
            The ``f1-score`` of the report's ``micro avg`` entry, rounded to
            two decimals.
        """
        return round(
            classification_report(
                get_token_output_labels(gt_ner_span, text),
                get_token_output_labels(pred_ner_span, text),
                labels=tags,
                output_dict=True,
            )["micro avg"]["f1-score"],
            2,
        )
64
+
65
+
66
class TokenMacroMetric(EvaluationMetric):
    """Token-level F1 taken from the ``macro avg`` row of the classification report."""

    def __init__(self) -> None:
        super().__init__()

        self.description = ""
        self.name = "Token Based Evaluation with Macro Average"

    @staticmethod
    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
        """Return the macro-averaged token-level F1 rounded to two decimals."""
        # Both span lists are converted to token labels over the same text
        # before being compared.
        gt_token_labels = get_token_output_labels(gt_ner_span, text)
        pred_token_labels = get_token_output_labels(pred_ner_span, text)
        report = classification_report(
            gt_token_labels,
            pred_token_labels,
            labels=tags,
            output_dict=True,
        )
        macro_f1 = report["macro avg"]["f1-score"]
        return round(macro_f1, 2)
84
+
85
+
86
# One instance of every available metric, in the order they are listed here.
EVALUATION_METRICS = [
    metric_class()
    for metric_class in (
        PartialSpanOverlapMetric,
        ExactSpanOverlapMetric,
        TokenMicroMetric,
        TokenMacroMetric,
    )
]
predefined_example.py CHANGED
@@ -21,6 +21,10 @@ class PredefinedExample:
21
  def predictions(self):
22
  return [self.gt_spans]
23
 
 
 
 
 
24
 
25
  small_example = PredefinedExample(
26
  text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",
 
21
  def predictions(self):
22
  return [self.gt_spans]
23
 
24
@property
def tags(self):
    """Return the keys of ``gt_labels`` as a list."""
    return [*self.gt_labels.keys()]
27
+
28
 
29
  small_example = PredefinedExample(
30
  text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",