Commit 300388f by WadoodAbdul (parent: 4e35351)

Commit message: update m2 to clinical type name

Files changed:
- app.py (+21, −21)
- eval_metrics_app.py (+75, −0)
- src/about.py (+14, −14)
- src/display/utils.py (+7, −7)
- src/leaderboard/read_evals.py (+12, −12)
app.py CHANGED

@@ -19,7 +19,7 @@ from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     TYPES_BENCHMARK_COLS,
     DATASET_COLS,
-
+    Clinical_TYPES_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -39,26 +39,26 @@ def restart_space():
 API.restart_space(repo_id=REPO_ID)


-try:
-
-
-
-
-except Exception:
-
-try:
-
-
-
-
-except Exception:
-
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()


 raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
 datasets_leaderboard_df = datasets_original_df.copy()

-raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH,
+raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "clinical_types")
 types_leaderboard_df = types_original_df.copy()

 (
@@ -167,11 +167,11 @@ with demo:
                     )
                     with gr.Row():
                         shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.clinical_type_col],
                             value=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.clinical_type_col
                             ],
                             label="Select columns to show",
                             elem_id="column-select",
@@ -249,7 +249,7 @@ with demo:
                 queue=True,
             )

-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 Clinical Types", elem_id="llm-benchmark-tab-table", id=4):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -309,8 +309,8 @@ with demo:

             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=types_original_df[
-                headers=
+                value=types_original_df[Clinical_TYPES_COLS],
+                headers=Clinical_TYPES_COLS,
                 datatype=TYPES,
                 visible=False,
             )
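For reference, the block disabled above follows the usual Hub-sync step of the leaderboard template. Below is a minimal, hypothetical sketch of what re-enabling it would look like; it assumes `QUEUE_REPO`, `RESULTS_REPO`, `EVAL_REQUESTS_PATH`, `EVAL_RESULTS_PATH`, and `TOKEN` are defined in the Space's configuration module (as the commented-out code implies), and the helper name `sync_dataset_repo` is illustrative only:

```python
from huggingface_hub import snapshot_download

def sync_dataset_repo(repo_id: str, local_dir: str, token: str) -> None:
    # Mirror a Hub dataset repo (eval requests or results) into a local directory,
    # using the same arguments as the commented-out block above.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=token,
    )

# Same failure handling as the disabled block: restart the Space if syncing fails.
for repo_id, local_dir in [(QUEUE_REPO, EVAL_REQUESTS_PATH), (RESULTS_REPO, EVAL_RESULTS_PATH)]:
    try:
        sync_dataset_repo(repo_id, local_dir, TOKEN)
    except Exception:
        restart_space()
```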
eval_metrics_app.py ADDED

@@ -0,0 +1,75 @@
+import gradio as gr
+
+# Function to compute evaluation metrics (dummy implementation)
+def compute_metrics(gt_spans, pred_spans):
+    # Dummy implementation of a metric computation
+    # Replace this with actual metric computation logic
+    tp = len(set(gt_spans) & set(pred_spans))
+    fp = len(set(pred_spans) - set(gt_spans))
+    fn = len(set(gt_spans) - set(pred_spans))
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+    return {"precision": precision, "recall": recall, "f1_score": f1_score}
+
+def create_app():
+    with gr.Blocks() as demo:
+        # Input components
+        text_input = gr.Textbox(label="Input Text")
+        highlight_input = gr.Textbox(label="Highlight Text and Press Add")
+
+        gt_spans_state = gr.State([])
+        pred_spans_state = gr.State([])
+
+        # Buttons for ground truth and prediction
+        add_gt_button = gr.Button("Add to Ground Truth")
+        add_pred_button = gr.Button("Add to Predictions")
+
+        # Outputs for highlighted spans
+        gt_output = gr.HighlightedText(label="Ground Truth Spans")
+        pred_output = gr.HighlightedText(label="Predicted Spans")
+
+        # Compute metrics button and its output
+        compute_button = gr.Button("Compute Metrics")
+        metrics_output = gr.JSON(label="Metrics")
+
+        # Function to update spans
+        def update_spans(text, span, gt_spans, pred_spans, is_gt):
+            start_idx = text.find(span)
+            end_idx = start_idx + len(span)
+            new_span = (start_idx, end_idx)
+            if is_gt:
+                gt_spans.append(new_span)
+                gt_spans = list(set(gt_spans))
+            else:
+                pred_spans.append(new_span)
+                pred_spans = list(set(pred_spans))
+            return gt_spans, pred_spans, highlight_spans(text, gt_spans), highlight_spans(text, pred_spans)
+
+        # Function to highlight spans
+        def highlight_spans(text, spans):
+            span_dict = {}
+            for span in spans:
+                span_dict[(span[0], span[1])] = "highlight"
+            return span_dict
+
+        # Event handlers for buttons
+        add_gt_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(True)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+        add_pred_button.click(fn=update_spans, inputs=[text_input, highlight_input, gt_spans_state, pred_spans_state, gr.State(False)], outputs=[gt_spans_state, pred_spans_state, gt_output, pred_output])
+
+        # Function to compute metrics
+        def on_compute_metrics(gt_spans, pred_spans):
+            metrics = compute_metrics(gt_spans, pred_spans)
+            return metrics
+
+        compute_button.click(fn=on_compute_metrics, inputs=[gt_spans_state, pred_spans_state], outputs=metrics_output)
+
+        # Layout arrangement
+        text_input.change(fn=lambda x: x, inputs=text_input, outputs=[gt_output, pred_output])
+
+    return demo
+
+# Run the app
+demo = create_app()
+demo.launch()
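As a quick sanity check of the span-level metric in the new file, here is a hypothetical usage of `compute_metrics` outside the Gradio UI. The character offsets are made up, and it assumes `compute_metrics` from eval_metrics_app.py is in scope:

```python
# Two gold entity spans vs. two predicted spans: one exact match, one spurious.
gt_spans = [(0, 9), (24, 33)]
pred_spans = [(0, 9), (40, 47)]

metrics = compute_metrics(gt_spans, pred_spans)
# tp = 1 (the shared span), fp = 1, fn = 1
print(metrics)  # {'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}
```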
src/about.py CHANGED

@@ -23,19 +23,19 @@ class Tasks(Enum):
     # task6 = Task("", "f1", "")

 @dataclass
-class
+class ClinicalType:
     benchmark: str
     metric: str
     col_name: str

-class
+class ClinicalTypes(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    type0 =
-    type1 =
-    type2 =
-    type3 =
-    type4 =
-    type5 =
+    type0 = ClinicalType("condition", "f1", "CONDITION")
+    type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
+    type2 = ClinicalType("drug", "f1", "DRUG")
+    type3 = ClinicalType("procedure", "f1", "PROCEDURE")
+    type4 = ClinicalType("gene", "f1", "GENE")
+    type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")


 NUM_FEWSHOT = 0 # Change with your few shot
@@ -43,8 +43,8 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
-LOGO = """<img src="file/assets/image.png" alt="
+TITLE = """<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
+LOGO = """<img src="file/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
@@ -66,10 +66,10 @@ The Named Clinical Entity Recognition Leaderboard is aimed at advancing the fiel

 ### Datasets
 📈 We evaluate the models on 4 datasets, encompassing 6 entity types
-- [NCBI](https://huggingface.co/datasets/m42-health/
-- [CHIA](https://huggingface.co/datasets/m42-health/
-- [BIORED](https://huggingface.co/datasets/m42-health/
-- [BC5CD](https://huggingface.co/datasets/m42-health/
+- [NCBI](https://huggingface.co/datasets/m42-health/clinical_ncbi)
+- [CHIA](https://huggingface.co/datasets/m42-health/clinical_chia)
+- [BIORED](https://huggingface.co/datasets/m42-health/clinical_biored)
+- [BC5CD](https://huggingface.co/datasets/m42-health/clinical_bc5cdr)

 ### Evaluation Metrics
 We perceive NER objects as span(with character offsets) instead of token level artifacts. This enables us to expand to nested NER scenarios easily.
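The new `ClinicalTypes` enum mirrors the existing `Tasks` enum: each member carries the results-file key, the metric, and the leaderboard column header. A self-contained sketch of how it is consumed downstream (the definitions are copied from the diff above; the final comprehension is the same one used in src/display/utils.py):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class ClinicalType:
    benchmark: str  # key used in the result json files
    metric: str     # metric key, always "f1" here
    col_name: str   # column header shown in the leaderboard

class ClinicalTypes(Enum):
    type0 = ClinicalType("condition", "f1", "CONDITION")
    type1 = ClinicalType("measurement", "f1", "MEASUREMENT")
    type2 = ClinicalType("drug", "f1", "DRUG")
    type3 = ClinicalType("procedure", "f1", "PROCEDURE")
    type4 = ClinicalType("gene", "f1", "GENE")
    type5 = ClinicalType("gene variant", "f1", "GENE VARIANT")

TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]
print(TYPES_BENCHMARK_COLS)
# ['CONDITION', 'MEASUREMENT', 'DRUG', 'PROCEDURE', 'GENE', 'GENE VARIANT']
```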
src/display/utils.py CHANGED

@@ -4,7 +4,7 @@ from enum import Enum
 import pandas as pd

 from src.about import Tasks
-from src.about import
+from src.about import ClinicalTypes


 def fields(raw_class):
@@ -22,7 +22,7 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
     dataset_task_col: bool = False
-
+    clinical_type_col: bool = False


 ## Leaderboard columns
@@ -34,8 +34,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
-for task in
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False,
+for task in ClinicalTypes:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, clinical_type_col=True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -133,8 +133,8 @@ class Precision(Enum):


 # Column selection
-DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.
-
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
+Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -143,7 +143,7 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-TYPES_BENCHMARK_COLS = [t.value.col_name for t in
+TYPES_BENCHMARK_COLS = [t.value.col_name for t in ClinicalTypes]

 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
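The two column lists added here rely on the `dataset_task_col` / `clinical_type_col` flags to carve one `AutoEvalColumn` namespace into two views (datasets tab vs. clinical-types tab). The real `AutoEvalColumn` is assembled dynamically from `auto_eval_column_dict`, so the dataclass below is a simplified, hypothetical stand-in used only to illustrate the filtering:

```python
from dataclasses import dataclass

@dataclass
class Col:
    # Stand-in for ColumnContent with just the flags the filters read.
    name: str
    hidden: bool = False
    dataset_task_col: bool = False
    clinical_type_col: bool = False

columns = [
    Col("Model"),
    Col("Average"),
    Col("NCBI", dataset_task_col=True),        # per-dataset score column
    Col("CONDITION", clinical_type_col=True),  # per-clinical-type score column
]

# Same shape as the comprehensions above: each view hides the other view's score columns.
DATASET_COLS = [c.name for c in columns if not c.hidden and not c.clinical_type_col]
Clinical_TYPES_COLS = [c.name for c in columns if not c.hidden and not c.dataset_task_col]

print(DATASET_COLS)         # ['Model', 'Average', 'NCBI']
print(Clinical_TYPES_COLS)  # ['Model', 'Average', 'CONDITION']
```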
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType,
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, ClinicalTypes
 from src.submission.check_validity import is_model_on_hub


@@ -22,7 +22,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     dataset_results: dict
-
+    clinical_type_results:dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -84,16 +84,16 @@ class EvalResult:
             dataset_results[task.benchmark] = mean_acc

         types_results = {}
-        for
-
+        for clinical_type in ClinicalTypes:
+            clinical_type = clinical_type.value

             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(
+            accs = np.array([v.get(clinical_type.metric, None) for k, v in data["clinical_type_results"].items() if clinical_type.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

             mean_acc = np.mean(accs) # * 100.0
-            types_results[
+            types_results[clinical_type.benchmark] = mean_acc

         return self(
             eval_name=result_key,
@@ -101,7 +101,7 @@ class EvalResult:
             org=org,
             model=model,
             dataset_results=dataset_results,
-
+            clinical_type_results=types_results,
             precision=precision,
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
@@ -156,8 +156,8 @@ class EvalResult:

             return data_dict

-        if subset == "
-            average = sum([v for v in self.
+        if subset == "clinical_types":
+            average = sum([v for v in self.clinical_type_results.values() if v is not None]) / len(ClinicalTypes)
             data_dict = {
                 "eval_name": self.eval_name, # not a column, just a save name,
                 AutoEvalColumn.precision.name: self.precision.value.name,
@@ -174,8 +174,8 @@ class EvalResult:
                 AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             }

-            for
-                data_dict[
+            for clinical_type in ClinicalTypes:
+                data_dict[clinical_type.value.col_name] = self.clinical_type_results[clinical_type.value.benchmark]

             return data_dict

@@ -232,7 +232,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         eval_results[eval_name] = eval_result

     results = []
-    #
+    # clinical_type_results = []
     for v in eval_results.values():
         try:
             v.to_dict(subset="dataset") # we test if the dict version is complete