init working commit
- README.md +0 -12
- app.py +132 -0
- evaluation_metrics.py +49 -0
- predefined_example.py +58 -0
- requirements.txt +2 -0
- span_dataclass_converters.py +30 -0
- token_level_output.py +77 -0
README.md
CHANGED
@@ -1,12 +0,0 @@
---
title: Ner Evaluation Metrics
emoji: 👁
colorFrom: purple
colorTo: green
sdk: streamlit
sdk_version: 1.36.0
app_file: app.py
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from annotated_text.util import get_annotated_html
from streamlit_annotation_tools import text_labeler

from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
from predefined_example import EXAMPLES
from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)


@st.cache_resource
def get_examples_attributes(selected_example):
    "Return example attributes so that they are not refreshed on every interaction"
    return (
        selected_example.text,
        selected_example.gt_labels,
        selected_example.gt_spans,
        selected_example.predictions,
    )


if __name__ == "__main__":
    st.set_page_config(layout="wide")
    st.title("NER Evaluation Metrics Comparison")

    st.write(
        "Evaluation for the NER task requires a ground truth and a prediction that will be evaluated. The ground truth is shown below; add predictions in the next section to compare the evaluation metrics."
    )

    # with st.container():
    st.subheader("Ground Truth")  # , divider='rainbow')

    selected_example = st.selectbox(
        "Select an example text from the drop down below",
        [example for example in EXAMPLES],
        format_func=lambda ex: ex.text,
    )

    text, gt_labels, gt_spans, predictions = get_examples_attributes(selected_example)

    annotated_text(
        get_highlight_spans_from_ner_spans(
            get_ner_spans_from_annotations(gt_labels), text
        )
    )

    annotated_predictions = [
        get_annotated_html(get_highlight_spans_from_ner_spans(ner_span, text))
        for ner_span in predictions
    ]
    predictions_df = pd.DataFrame(
        {
            # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
            "Prediction": annotated_predictions,
            "ner_spans": predictions,
        },
        index=[f"Prediction_{index}" for index in range(len(predictions))],
    )

    st.subheader("Predictions")  # , divider='rainbow')

    with st.expander("Click to Add Predictions"):
        st.subheader("Adding predictions")
        st.markdown(
            """
            Add predictions to the list of predictions on which the evaluation metrics will be calculated.
            - Select the entity type/label name and then highlight the span in the text below.
            - To remove a span, double click on the highlighted text.
            - Once you have your desired prediction, click on the 'Add!' button. (The prediction created is shown as JSON below.)
            """
        )
        st.write(
            "Note: Only the spans of the selected label name are shown at a given instance.",
        )
        labels = text_labeler(text, gt_labels)
        st.json(labels, expanded=False)

        # if st.button("Add Prediction"):
        #     labels = text_labeler(text)
        if st.button("Add!"):
            spans = get_ner_spans_from_annotations(labels)
            spans = sorted(spans, key=lambda span: span["start"])
            predictions.append(spans)
            annotated_predictions.append(
                get_annotated_html(get_highlight_spans_from_ner_spans(spans, text))
            )
            predictions_df = pd.DataFrame(
                {
                    # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
                    "Prediction": annotated_predictions,
                    "ner_spans": predictions,
                },
                index=[f"Prediction_{index}" for index in range(len(predictions))],
            )
            print("added")

    highlighted_predictions_df = predictions_df[["Prediction"]]
    st.write(highlighted_predictions_df.to_html(escape=False), unsafe_allow_html=True)
    st.divider()

    ### EVALUATION METRICS COMPARISON ###

    st.subheader("Evaluation Metrics Comparison")  # , divider='rainbow')
    st.markdown("""
        The different evaluation metrics we have for the NER task are
        - Span Based Evaluation with Partial Overlap
        - Token Based Evaluation with Micro Avg
        - Token Based Evaluation with Macro Avg
    """)

    with st.expander("View Predictions Details"):
        st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)

    if st.button("Get Metrics!"):
        for evaluation_metric_type in EVALUATION_METRICS:
            predictions_df[evaluation_metric_type] = predictions_df.ner_spans.apply(
                lambda ner_spans: get_evaluation_metric(
                    metric_type=evaluation_metric_type,
                    gt_ner_span=gt_spans,
                    pred_ner_span=ner_spans,
                    text=text,
                )
            )

        metrics_df = predictions_df.drop(["ner_spans"], axis=1)

        st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
        print("compared")
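As a quick illustration of the data shapes the app passes around (a sketch for this write-up, not part of the committed app.py; span offsets are taken from the small predefined example):

import pandas as pd

# Each element of `predictions` is one candidate annotation: a list of span dicts
# sorted by start offset, exactly what get_ner_spans_from_annotations produces
# from the annotation widget's labels.
predictions = [
    [
        {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
        {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"},
    ],
]

# The app keeps the raw spans (alongside their annotated-HTML rendering) in one
# DataFrame; "Get Metrics!" later adds one column per evaluation metric to it.
predictions_df = pd.DataFrame(
    {"ner_spans": predictions},
    index=[f"Prediction_{index}" for index in range(len(predictions))],
)
print(predictions_df)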
evaluation_metrics.py
ADDED
@@ -0,0 +1,49 @@
from nervaluate import Evaluator
from sklearn.metrics import classification_report

from token_level_output import get_token_output_labels

EVALUATION_METRICS = [
    "Span Based Evaluation with Partial Overlap",
    "Token Based Evaluation with Micro Avg",
    "Token Based Evaluation with Macro Avg",
]


def get_span_eval(gt_ner_span, pred_ner_span, text):
    evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=["Disease", "Drug"])
    return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)


def get_token_micro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["micro avg"]["f1-score"],
        2,
    )


def get_token_macro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["macro avg"]["f1-score"],
        2,
    )


def get_evaluation_metric(metric_type, gt_ner_span, pred_ner_span, text):
    match metric_type:
        case "Span Based Evaluation with Partial Overlap":
            return get_span_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Micro Avg":
            return get_token_micro_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Macro Avg":
            return get_token_macro_eval(gt_ner_span, pred_ner_span, text)
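A minimal usage sketch (illustrative, not part of the committed file): the ground truth comes from the small predefined example and the prediction deliberately misses the Drug span. Once a prediction is imperfect, the three metrics generally diverge, since the span-based metric scores whole (partially overlapping) entities while the token-based metrics score per token, micro or macro averaged over the Disease/Drug labels.

from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
gt_spans = [
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
    {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"},
]
# Imperfect prediction: the Disease span is found, the Drug span is missed.
pred_spans = [
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
]

for metric in EVALUATION_METRICS:
    score = get_evaluation_metric(
        metric_type=metric, gt_ner_span=gt_spans, pred_ner_span=pred_spans, text=text
    )
    print(f"{metric}: {score}")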
predefined_example.py
ADDED
@@ -0,0 +1,58 @@
from dataclasses import dataclass

from span_dataclass_converters import get_ner_spans_from_annotations


@dataclass
class PredefinedExample:
    text: str
    gt_labels: dict
    # gt_spans: list
    # predictions: list

    @property
    def gt_spans(self):
        return sorted(
            get_ner_spans_from_annotations(self.gt_labels),
            key=lambda span: span["start"],
        )

    @property
    def predictions(self):
        return [self.gt_spans]


small_example = PredefinedExample(
    text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",
    gt_labels={
        "Disease": [
            {"start": 31, "end": 41, "label": "bronchitis"},
        ],
        "Drug": [
            {"start": 63, "end": 72, "label": "mucolytic"},
        ],
    },
)

big_example = PredefinedExample(
    text=(
        "The patient was experiencing stomach pain and flu like symptoms for 3 days. "
        "Upon investigation, the chest xray revealed acute bronchitis disease. "
        "The patient was asked to take rest for a week and was prescribed a mucolytic along with paracetamol for body pains."
    ),
    gt_labels={
        "Disease": [
            {"start": 120, "end": 144, "label": "acute bronchitis disease"},
        ],
        "Drug": [
            {"start": 213, "end": 222, "label": "mucolytic"},
            {"start": 234, "end": 245, "label": "paracetamol"},
        ],
        "Symptoms": [
            {"start": 29, "end": 41, "label": "stomach pain"},
            {"start": 46, "end": 63, "label": "flu like symptoms"},
        ],
    },
)

EXAMPLES = [small_example, big_example]
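For reference (derived from the converter above, not an addition to the file): the properties resolve the label dict into flat span dicts, and each example seeds the app with exactly one prediction identical to its ground truth, so the initial metrics table should show perfect scores.

from predefined_example import small_example

# Flattened, start-sorted ground-truth spans produced by the gt_spans property:
assert small_example.gt_spans == [
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
    {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"},
]

# The predictions property wraps that same list, i.e. one ground-truth-equal prediction.
assert small_example.predictions == [small_example.gt_spans]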
requirements.txt
ADDED
@@ -0,0 +1,2 @@
streamlit_annotation_tools
annotated_text
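Worth flagging: evaluation_metrics.py imports nervaluate and sklearn, and app.py imports pandas. If those are not already provided by the Space's Streamlit image, the requirements would presumably need to grow along these lines (an assumption, not part of this commit):

streamlit_annotation_tools
annotated_text
nervaluate
scikit-learn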
span_dataclass_converters.py
ADDED
@@ -0,0 +1,30 @@
def get_ner_spans_from_annotations(annotated_labels):
    spans = []
    for entity_type, spans_list in annotated_labels.items():
        for spans_dict in spans_list:
            ner_span_dict = {
                **spans_dict,
                "label": entity_type,
                "span_text": spans_dict["label"],
            }
            spans.append(ner_span_dict)
    return spans


def get_highlight_spans_from_ner_spans(ner_spans, parent_text):
    if not ner_spans:
        return [parent_text]

    output_list = []
    prev_span_end = 0
    # output_list = [parent_text[ner_spans[0]["start"]]]
    for span in ner_spans:
        output_list.append(parent_text[prev_span_end : span["start"]])
        tup = (span["span_text"], span["label"])
        output_list.append(tup)
        prev_span_end = span["end"]

    if prev_span_end != len(parent_text):
        output_list.append(parent_text[prev_span_end:])

    return output_list
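A small round-trip sketch (illustrative, not part of the committed file) using the small predefined example: annotation-tool labels are flattened into span dicts, which are then turned into the segment list that annotated_text / get_annotated_html expect (plain strings for unlabeled text, (span_text, label) tuples for entities).

from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
gt_labels = {
    "Disease": [{"start": 31, "end": 41, "label": "bronchitis"}],
    "Drug": [{"start": 63, "end": 72, "label": "mucolytic"}],
}

ner_spans = get_ner_spans_from_annotations(gt_labels)
# -> [{"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
#     {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"}]

print(get_highlight_spans_from_ner_spans(ner_spans, text))
# -> ['The patient was diagnosed with ', ('bronchitis', 'Disease'),
#     ' and was prescribed a ', ('mucolytic', 'Drug')]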
token_level_output.py
ADDED
@@ -0,0 +1,77 @@
import re


class WhitespaceTokenSplitter:
    def __init__(self):
        self.whitespace_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")

    def __call__(self, text):
        for match in self.whitespace_pattern.finditer(text):
            yield match.group(), match.start(), match.end()


tokenizer = WhitespaceTokenSplitter()


def get_char_label_map(ner_spans: list):
    """return a dict with char indices (int) as keys and the label they belong to as values
    example -- {1: 'label1', 2: 'label1', 5: 'label2', 6: 'label2'}
    note: the char indices that do not belong to a span do not exist in the map
    """
    char_label_map = {}
    for span in ner_spans:
        char_label_map = {
            **char_label_map,
            **{
                char_index: span["label"]
                for char_index in range(span["start"], span["end"])
            },
        }
    return char_label_map


def get_tokens(text: str) -> list[str]:
    tokens_with_offsets = list(tokenizer(text))
    return [token for token, start, end in tokens_with_offsets]


def get_token_offsets(text: str) -> list[tuple[int, int]]:
    tokens_with_offsets = list(tokenizer(text))
    return [(start, end) for token, start, end in tokens_with_offsets]


def get_list_of_token_label_tuples(
    tokens: list[str],
    token_spans: list[tuple[int, int]],
    char_label_map: dict[int, str],
) -> list[tuple[str, str]]:
    """
    returns a list of tuples with the first element as the token and the second element as the label
    example - [('a', 'O'), ('cat', 'ANIMAL'), ('sits', 'O')]
    note: the label of a token is decided based on the max chars in the token belonging to a span
    """
    token_labels = []
    for token, offsets in zip(tokens, token_spans):
        if offsets[0] == offsets[1]:
            token_labels.append((token, "O"))
            continue
        char_labels = [
            char_label_map.get(char_index, "O") for char_index in range(*offsets)
        ]
        token_label = max(set(char_labels), key=char_labels.count)
        token_labels.append((token, token_label))
    return token_labels


def get_token_outputs(ner_spans, parent_text):
    char_label_map = get_char_label_map(ner_spans)

    token_offsets = get_token_offsets(parent_text)
    tokens = get_tokens(parent_text)

    return get_list_of_token_label_tuples(tokens, token_offsets, char_label_map)


def get_token_output_labels(ner_spans, parent_text):
    token_output = get_token_outputs(ner_spans, parent_text)
    return [label for token, label in token_output]