WadoodAbdul committed
Commit 300388f · 1 Parent(s): 4e35351

update m2 to clinical type name

Files changed (5)
  1. app.py +21 -21
  2. eval_metrics_app.py +75 -0
  3. src/about.py +14 -14
  4. src/display/utils.py +7 -7
  5. src/leaderboard/read_evals.py +12 -12
app.py CHANGED
@@ -19,7 +19,7 @@ from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     TYPES_BENCHMARK_COLS,
     DATASET_COLS,
-    M2_TYPES_COLS,
+    Clinical_TYPES_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -39,26 +39,26 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)


-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()


 raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
 datasets_leaderboard_df = datasets_original_df.copy()

-raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, M2_TYPES_COLS, TYPES_BENCHMARK_COLS, "m2_types")
+raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "clinical_types")
 types_leaderboard_df = types_original_df.copy()

 (
@@ -167,11 +167,11 @@ with demo:
            )
            with gr.Row():
                shown_columns = gr.CheckboxGroup(
-                   choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.m2_type_col],
+                   choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
                    value=[
                        c.name
                        for c in fields(AutoEvalColumn)
-                       if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.m2_type_col
+                       if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
                    ],
                    label="Select columns to show",
                    elem_id="column-select",
@@ -249,7 +249,7 @@ with demo:
            queue=True,
        )

-   with gr.TabItem("🏅 M2 Types", elem_id="llm-benchmark-tab-table", id=4):
+   with gr.TabItem("🏅 Clinical Types", elem_id="llm-benchmark-tab-table", id=4):
        with gr.Row():
            with gr.Column():
                with gr.Row():
@@ -309,8 +309,8 @@ with demo:

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-               value=types_original_df[M2_TYPES_COLS],
-               headers=M2_TYPES_COLS,
+               value=types_original_df[Clinical_TYPES_COLS],
+               headers=Clinical_TYPES_COLS,
                datatype=TYPES,
                visible=False,
            )
eval_metrics_app.py ADDED
@@ -0,0 +1,75 @@
+import gradio as gr
+
+# Function to compute evaluation metrics (dummy implementation)
+def compute_metrics(gt_spans, pred_spans):
+    # Dummy implementation of a metric computation
+    # Replace this with actual metric computation logic
+    tp = len(set(gt_spans) & set(pred_spans))
+    fp = len(set(pred_spans) - set(gt_spans))
+    fn = len(set(gt_spans) - set(pred_spans))
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+    return {"precision": precision, "recall": recall, "f1_score": f1_score}
+
+def create_app():
+    with gr.Blocks() as demo:
+        # Input components
+        text_input = gr.Textbox(label="Input Text")
+        highlight_input = gr.Textbox(label="Highlight Text and Press Add")
+
+        gt_spans_state = gr.State([])
+        pred_spans_state = gr.State([])
+
+        # Buttons for ground truth and prediction
+        add_gt_button = gr.Button("Add to Ground Truth")
+        add_pred_button = gr.Button("Add to Predictions")
+
+        # Outputs for highlighted spans
+        gt_output = gr.HighlightedText(label="Ground Truth Spans")
+        pred_output = gr.HighlightedText(label="Predicted Spans")
+
+        # Compute metrics button and its output
+        compute_button = gr.Button("Compute Metrics")
+        metrics_output = gr.JSON(label="Metrics")
+
+        # Function to update spans
+        def update_spans(text, span, gt_spans, pred_spans, is_gt):
+            start_idx = text.find(span)
+            end_idx = start_idx + len(span)
+            new_span = (start_idx, end_idx)
+            if is_gt:
+                gt_spans.append(new_span)
+                gt_spans = list(set(gt_spans))
+            else:
+                pred_spans.append(new_span)
+                pred_spans = list(set(pred_spans))
+            return gt_spans, pred_spans, highlight_spans(text, gt_spans), highlight_spans(text, pred_spans)
+
+        # Function to highlight spans
+        def highlight_spans(text, spans):
+            span_dict = {}
+            for span in spans:
+                span_dict[(span[0], span[1])] = "highlight"
+            return span_dict
+
+        # Event handlers for buttons
+        add_gt_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(True)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+        add_pred_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(False)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+
+        # Function to compute metrics
+        def on_compute_metrics(gt_spans, pred_spans):
+            metrics = compute_metrics(gt_spans, pred_spans)
+            return metrics
+
+        compute_button.click(fn=on_compute_metrics, inputs=[gt_spans_state, pred_spans_state], outputs=metrics_output)
+
+        # Layout arrangement
+        text_input.change(fn=lambda x: x, inputs=text_input, outputs=[gt_output, pred_output])
+
+    return demo
+
+# Run the app
+demo = create_app()
+demo.launch()
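
A quick sanity check of the `compute_metrics` helper added above, with made-up spans (copying the function into a REPL, since importing the module as written would also call `demo.launch()`); the expected values follow directly from its set-based logic:

```python
# compute_metrics copied verbatim from eval_metrics_app.py; the spans below are
# hypothetical and only illustrate the expected output.
def compute_metrics(gt_spans, pred_spans):
    tp = len(set(gt_spans) & set(pred_spans))
    fp = len(set(pred_spans) - set(gt_spans))
    fn = len(set(gt_spans) - set(pred_spans))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return {"precision": precision, "recall": recall, "f1_score": f1_score}

# One exact match, one spurious prediction, one missed span.
print(compute_metrics([(0, 5), (10, 18)], [(0, 5), (20, 25)]))
# -> {'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}
```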
src/about.py CHANGED
@@ -23,19 +23,19 @@ class Tasks(Enum):
     # task6 = Task("", "f1", "")

 @dataclass
-class M2Type:
+class ClinicalType:
     benchmark: str
     metric: str
     col_name: str

-class M2Types(Enum):
+class ClinicalTypes(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    type0 = M2Type("condition", "f1", "CONDITION")
-    type1 = M2Type("measurement", "f1", "MEASUREMENT")
-    type2 = M2Type("drug", "f1", "DRUG")
-    type3 = M2Type("procedure", "f1", "PROCEDURE")
-    type4 = M2Type("gene", "f1", "GENE")
-    type5 = M2Type("gene variant", "f1", "GENE VARIANT")
+    type0 = ClinicalType("condition", "f1", "CONDITION")
+    type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
+    type2 = ClinicalType("drug", "f1", "DRUG")
+    type3 = ClinicalType("procedure", "f1", "PROCEDURE")
+    type4 = ClinicalType("gene", "f1", "GENE")
+    type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")


 NUM_FEWSHOT = 0 # Change with your few shot
@@ -43,8 +43,8 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
-LOGO = """<img src="file/assets/image.png" alt="M2 X HF" width="500" height="333">"""
+TITLE = """<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
+LOGO = """<img src="file/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
@@ -66,10 +66,10 @@ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the fiel

 ### Datasets
 📈 We evaluate the models on 4 datasets, encompassing 6 entity types
-- [NCBI](https://huggingface.co/datasets/m42-health/m2_ncbi)
-- [CHIA](https://huggingface.co/datasets/m42-health/m2_chia)
-- [BIORED](https://huggingface.co/datasets/m42-health/m2_biored)
-- [BC5CD](https://huggingface.co/datasets/m42-health/m2_bc5cdr)
+- [NCBI](https://huggingface.co/datasets/m42-health/clinical_ncbi)
+- [CHIA](https://huggingface.co/datasets/m42-health/clinical_chia)
+- [BIORED](https://huggingface.co/datasets/m42-health/clinical_biored)
+- [BC5CD](https://huggingface.co/datasets/m42-health/clinical_bc5cdr)

 ### Evaluation Metrics
 We perceive NER objects as span(with character offsets) instead of token level artifacts. This enables us to expand to nested NER scenarios easily.
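
The "Evaluation Metrics" text above treats entities as character-offset spans rather than token-level tags. As a rough illustration (not part of this commit, with made-up text, offsets, and types), nested or overlapping mentions simply coexist as separate tuples, which a single flat token-level tag sequence cannot represent:

```python
# Hypothetical example of the span-based view described above.
text = "breast cancer gene mutation"

# Nested mentions stored side by side as (start, end, type) tuples.
gt_spans = {
    (0, 13, "CONDITION"),  # "breast cancer"
    (0, 18, "GENE"),       # "breast cancer gene" (nested/overlapping mention)
}
pred_spans = {(0, 13, "CONDITION")}

true_positives = gt_spans & pred_spans  # exact-offset, exact-type matches
missed = gt_spans - pred_spans          # the nested GENE mention was not predicted
print(true_positives, missed)
```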
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
 import pandas as pd

 from src.about import Tasks
-from src.about import M2Types
+from src.about import ClinicalTypes


 def fields(raw_class):
@@ -22,7 +22,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dataset_task_col: bool = False
-    m2_type_col: bool = False
+    clinical_type_col: bool = False


 ## Leaderboard columns
@@ -34,8 +34,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
-for task in M2Types:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, m2_type_col=True)])
+for task in ClinicalTypes:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, clinical_type_col=True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -133,8 +133,8 @@ class Precision(Enum):


 # Column selection
-DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.m2_type_col]
-M2_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
+Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -143,7 +143,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-TYPES_BENCHMARK_COLS = [t.value.col_name for t in M2Types]
+TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]

 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, M2Types
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, ClinicalTypes
 from src.submission.check_validity import is_model_on_hub


@@ -22,7 +22,7 @@ class EvalResult:
     model: str
     revision: str  # commit hash, "" if main
     dataset_results: dict
-    m2_type_results:dict
+    clinical_type_results:dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -84,16 +84,16 @@ class EvalResult:
             dataset_results[task.benchmark] = mean_acc

         types_results = {}
-        for m2_type in M2Types:
-            m2_type = m2_type.value
+        for clinical_type in ClinicalTypes:
+            clinical_type = clinical_type.value

             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(m2_type.metric, None) for k, v in data["m2_type_results"].items() if m2_type.benchmark == k])
+            accs = np.array([v.get(clinical_type.metric, None) for k, v in data["clinical_type_results"].items() if clinical_type.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

             mean_acc = np.mean(accs)  # * 100.0
-            types_results[m2_type.benchmark] = mean_acc
+            types_results[clinical_type.benchmark] = mean_acc

         return self(
             eval_name=result_key,
@@ -101,7 +101,7 @@ class EvalResult:
             org=org,
             model=model,
             dataset_results=dataset_results,
-            m2_type_results=types_results,
+            clinical_type_results=types_results,
             precision=precision,
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
@@ -156,8 +156,8 @@ class EvalResult:

             return data_dict

-        if subset == "m2_types":
-            average = sum([v for v in self.m2_type_results.values() if v is not None]) / len(M2Types)
+        if subset == "clinical_types":
+            average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
             data_dict = {
                 "eval_name": self.eval_name,  # not a column, just a save name,
                 AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,8 +174,8 @@ class EvalResult:
                 AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             }

-            for m2_type in M2Types:
-                data_dict[m2_type.value.col_name] = self.m2_type_results[m2_type.value.benchmark]
+            for clinical_type in ClinicalTypes:
+                data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]

             return data_dict

@@ -232,7 +232,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         eval_results[eval_name] = eval_result

     results = []
-    # m2_type_results = []
+    # clinical_type_results = []
     for v in eval_results.values():
         try:
             v.to_dict(subset="dataset")  # we test if the dict version is complete