WadoodAbdul committed
Commit 300388f · 1 Parent(s): 4e35351

update m2 to clinical type name

Files changed (5)
  1. app.py +21 -21
  2. eval_metrics_app.py +75 -0
  3. src/about.py +14 -14
  4. src/display/utils.py +7 -7
  5. src/leaderboard/read_evals.py +12 -12
app.py CHANGED
@@ -19,7 +19,7 @@ from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     TYPES_BENCHMARK_COLS,
     DATASET_COLS,
-    M2_TYPES_COLS,
+    Clinical_TYPES_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -39,26 +39,26 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)


-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()


 raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
 datasets_leaderboard_df = datasets_original_df.copy()

-raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, M2_TYPES_COLS, TYPES_BENCHMARK_COLS, "m2_types")
+raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "clinical_types")
 types_leaderboard_df = types_original_df.copy()

 (
@@ -167,11 +167,11 @@ with demo:
            )
            with gr.Row():
                shown_columns = gr.CheckboxGroup(
-                   choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.m2_type_col],
+                   choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
                    value=[
                        c.name
                        for c in fields(AutoEvalColumn)
-                       if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.m2_type_col
+                       if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
                    ],
                    label="Select columns to show",
                    elem_id="column-select",
@@ -249,7 +249,7 @@ with demo:
            queue=True,
        )

-   with gr.TabItem("🏅 M2 Types", elem_id="llm-benchmark-tab-table", id=4):
+   with gr.TabItem("🏅 Clinical Types", elem_id="llm-benchmark-tab-table", id=4):
        with gr.Row():
            with gr.Column():
                with gr.Row():
@@ -309,8 +309,8 @@ with demo:

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-               value=types_original_df[M2_TYPES_COLS],
-               headers=M2_TYPES_COLS,
+               value=types_original_df[Clinical_TYPES_COLS],
+               headers=Clinical_TYPES_COLS,
                datatype=TYPES,
                visible=False,
            )
eval_metrics_app.py ADDED
@@ -0,0 +1,75 @@
+import gradio as gr
+
+# Function to compute evaluation metrics (dummy implementation)
+def compute_metrics(gt_spans, pred_spans):
+    # Dummy implementation of a metric computation
+    # Replace this with actual metric computation logic
+    tp = len(set(gt_spans) & set(pred_spans))
+    fp = len(set(pred_spans) - set(gt_spans))
+    fn = len(set(gt_spans) - set(pred_spans))
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+    return {"precision": precision, "recall": recall, "f1_score": f1_score}
+
+def create_app():
+    with gr.Blocks() as demo:
+        # Input components
+        text_input = gr.Textbox(label="Input Text")
+        highlight_input = gr.Textbox(label="Highlight Text and Press Add")
+
+        gt_spans_state = gr.State([])
+        pred_spans_state = gr.State([])
+
+        # Buttons for ground truth and prediction
+        add_gt_button = gr.Button("Add to Ground Truth")
+        add_pred_button = gr.Button("Add to Predictions")
+
+        # Outputs for highlighted spans
+        gt_output = gr.HighlightedText(label="Ground Truth Spans")
+        pred_output = gr.HighlightedText(label="Predicted Spans")
+
+        # Compute metrics button and its output
+        compute_button = gr.Button("Compute Metrics")
+        metrics_output = gr.JSON(label="Metrics")
+
+        # Function to update spans
+        def update_spans(text, span, gt_spans, pred_spans, is_gt):
+            start_idx = text.find(span)
+            end_idx = start_idx + len(span)
+            new_span = (start_idx, end_idx)
+            if is_gt:
+                gt_spans.append(new_span)
+                gt_spans = list(set(gt_spans))
+            else:
+                pred_spans.append(new_span)
+                pred_spans = list(set(pred_spans))
+            return gt_spans, pred_spans, highlight_spans(text, gt_spans), highlight_spans(text, pred_spans)
+
+        # Function to highlight spans
+        def highlight_spans(text, spans):
+            span_dict = {}
+            for span in spans:
+                span_dict[(span[0], span[1])] = "highlight"
+            return span_dict
+
+        # Event handlers for buttons
+        add_gt_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(True)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+        add_pred_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(False)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+
+        # Function to compute metrics
+        def on_compute_metrics(gt_spans, pred_spans):
+            metrics = compute_metrics(gt_spans, pred_spans)
+            return metrics
+
+        compute_button.click(fn=on_compute_metrics, inputs=[gt_spans_state, pred_spans_state], outputs=metrics_output)
+
+        # Layout arrangement
+        text_input.change(fn=lambda x: x, inputs=text_input, outputs=[gt_output, pred_output])
+
+    return demo
+
+# Run the app
+demo = create_app()
+demo.launch()
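
A quick sanity check of the `compute_metrics` helper added above, with made-up spans (copying the function into a REPL, since importing the module as written would also call `demo.launch()`); the expected values follow directly from its set-based logic:

```python
# compute_metrics copied verbatim from eval_metrics_app.py; the spans below are
# hypothetical and only illustrate the expected output.
def compute_metrics(gt_spans, pred_spans):
    tp = len(set(gt_spans) & set(pred_spans))
    fp = len(set(pred_spans) - set(gt_spans))
    fn = len(set(gt_spans) - set(pred_spans))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {"precision": precision, "recall": recall, "f1_score": f1_score}

# One exact match, one spurious prediction, one missed span.
print(compute_metrics([(0, 5), (10, 18)], [(0, 5), (20, 25)]))
# -> {'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}
```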
src/about.py CHANGED
@@ -23,19 +23,19 @@ class Tasks(Enum):
     # task6 = Task("", "f1", "")

 @dataclass
-class M2Type:
+class ClinicalType:
     benchmark: str
     metric: str
     col_name: str

-class M2Types(Enum):
+class ClinicalTypes(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    type0 = M2Type("condition", "f1", "CONDITION")
-    type1 = M2Type("measurement", "f1", "MEASUREMENT")
-    type2 = M2Type("drug", "f1", "DRUG")
-    type3 = M2Type("procedure", "f1", "PROCEDURE")
-    type4 = M2Type("gene", "f1", "GENE")
-    type5 = M2Type("gene variant", "f1", "GENE VARIANT")
+    type0 = ClinicalType("condition", "f1", "CONDITION")
+    type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
+    type2 = ClinicalType("drug", "f1", "DRUG")
+    type3 = ClinicalType("procedure", "f1", "PROCEDURE")
+    type4 = ClinicalType("gene", "f1", "GENE")
+    type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")


 NUM_FEWSHOT = 0 # Change with your few shot
@@ -43,8 +43,8 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
-LOGO = """<img src="file/assets/image.png" alt="M2 X HF" width="500" height="333">"""
+TITLE = """<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
+LOGO = """<img src="file/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
@@ -66,10 +66,10 @@ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the fiel

 ### Datasets
 📈 We evaluate the models on 4 datasets, encompassing 6 entity types
-- [NCBI](https://huggingface.co/datasets/m42-health/m2_ncbi)
-- [CHIA](https://huggingface.co/datasets/m42-health/m2_chia)
-- [BIORED](https://huggingface.co/datasets/m42-health/m2_biored)
-- [BC5CD](https://huggingface.co/datasets/m42-health/m2_bc5cdr)
+- [NCBI](https://huggingface.co/datasets/m42-health/clinical_ncbi)
+- [CHIA](https://huggingface.co/datasets/m42-health/clinical_chia)
+- [BIORED](https://huggingface.co/datasets/m42-health/clinical_biored)
+- [BC5CD](https://huggingface.co/datasets/m42-health/clinical_bc5cdr)

 ### Evaluation Metrics
 We perceive NER objects as span(with character offsets) instead of token level artifacts. This enables us to expand to nested NER scenarios easily.
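
The "Evaluation Metrics" text above treats entities as character-offset spans rather than token-level tags. As a rough illustration (not part of this commit, with made-up text, offsets, and types), nested or overlapping mentions simply coexist as separate tuples, which a single flat token-level tag sequence cannot represent:

```python
# Hypothetical example of the span-based view described above.
text = "breast cancer gene mutation"

# Nested mentions stored side by side as (start, end, type) tuples.
gt_spans = {
    (0, 13, "CONDITION"),  # "breast cancer"
    (0, 18, "GENE"),       # "breast cancer gene" (nested/overlapping mention)
}
pred_spans = {(0, 13, "CONDITION")}

true_positives = gt_spans & pred_spans  # exact-offset, exact-type matches
missed = gt_spans - pred_spans          # the nested GENE mention was not predicted
print(true_positives, missed)
```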
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
 import pandas as pd

 from src.about import Tasks
-from src.about import M2Types
+from src.about import ClinicalTypes


 def fields(raw_class):
@@ -22,7 +22,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dataset_task_col: bool = False
-    m2_type_col: bool = False
+    clinical_type_col: bool = False


 ## Leaderboard columns
@@ -34,8 +34,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
-for task in M2Types:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, m2_type_col=True)])
+for task in ClinicalTypes:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, clinical_type_col=True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -133,8 +133,8 @@ class Precision(Enum):


 # Column selection
-DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.m2_type_col]
-M2_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
+Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -143,7 +143,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-TYPES_BENCHMARK_COLS = [t.value.col_name for t in M2Types]
+TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]

 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, M2Types
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, ClinicalTypes
 from src.submission.check_validity import is_model_on_hub


@@ -22,7 +22,7 @@ class EvalResult:
     model: str
     revision: str  # commit hash, "" if main
     dataset_results: dict
-    m2_type_results:dict
+    clinical_type_results:dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -84,16 +84,16 @@ class EvalResult:
             dataset_results[task.benchmark] = mean_acc

         types_results = {}
-        for m2_type in M2Types:
-            m2_type = m2_type.value
+        for clinical_type in ClinicalTypes:
+            clinical_type = clinical_type.value

             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(m2_type.metric, None) for k, v in data["m2_type_results"].items() if m2_type.benchmark == k])
+            accs = np.array([v.get(clinical_type.metric, None) for k, v in data["clinical_type_results"].items() if clinical_type.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

             mean_acc = np.mean(accs)  # * 100.0
-            types_results[m2_type.benchmark] = mean_acc
+            types_results[clinical_type.benchmark] = mean_acc

         return self(
             eval_name=result_key,
@@ -101,7 +101,7 @@ class EvalResult:
             org=org,
             model=model,
             dataset_results=dataset_results,
-            m2_type_results=types_results,
+            clinical_type_results=types_results,
             precision=precision,
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
@@ -156,8 +156,8 @@ class EvalResult:

             return data_dict

-        if subset == "m2_types":
-            average = sum([v for v in self.m2_type_results.values() if v is not None]) / len(M2Types)
+        if subset == "clinical_types":
+            average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
             data_dict = {
                 "eval_name": self.eval_name,  # not a column, just a save name,
                 AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,8 +174,8 @@ class EvalResult:
                 AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             }

-            for m2_type in M2Types:
-                data_dict[m2_type.value.col_name] = self.m2_type_results[m2_type.value.benchmark]
+            for clinical_type in ClinicalTypes:
+                data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]

             return data_dict

@@ -232,7 +232,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         eval_results[eval_name] = eval_result

     results = []
-    # m2_type_results = []
+    # clinical_type_results = []
     for v in eval_results.values():
         try:
             v.to_dict(subset="dataset")  # we test if the dict version is complete