Commit
·
ee5875c
1
Parent(s):
3445f6a
flatten results for dataset
Browse files
- evaluation_logic.py +14 -18
evaluation_logic.py
CHANGED
@@ -58,33 +58,29 @@ def save_evaluation(inference_api, model_name, prompt_format, metrics):
|
|
58 |
evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
|
59 |
evaluation_folder.mkdir(parents=True, exist_ok=True)
|
60 |
|
61 |
-
# Extract
|
62 |
categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
64 |
|
|
|
65 |
for category in categories:
|
66 |
if category in metrics['exec']:
|
67 |
category_metrics = metrics['exec'][category]
|
68 |
-
|
69 |
-
|
70 |
-
'execution_accuracy': category_metrics['exec']
|
71 |
-
}
|
72 |
else:
|
73 |
-
|
74 |
-
|
75 |
-
'execution_accuracy': 0.0
|
76 |
-
}
|
77 |
|
78 |
with evaluation_scheduler.lock:
|
79 |
with evaluation_file.open("a") as f:
|
80 |
-
json.dump(
|
81 |
-
|
82 |
-
"model_name": model_name,
|
83 |
-
"prompt_format": prompt_format,
|
84 |
-
"category_metrics": simplified_metrics,
|
85 |
-
"timestamp": datetime.now().isoformat()
|
86 |
-
}, f)
|
87 |
-
f.write('\n')
|
88 |
|
89 |
def run_prediction(inference_api, model_name, prompt_format, output_file):
|
90 |
dataset_path = str(eval_dir / "data/dev.json")
|
|
|
58 |
evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
|
59 |
evaluation_folder.mkdir(parents=True, exist_ok=True)
|
60 |
|
61 |
+
# Extract and flatten the category-specific execution metrics
|
62 |
categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
|
63 |
+
flattened_metrics = {
|
64 |
+
"inference_api": inference_api,
|
65 |
+
"model_name": model_name,
|
66 |
+
"prompt_format": prompt_format,
|
67 |
+
"timestamp": datetime.now().isoformat()
|
68 |
+
}
|
69 |
|
70 |
+
# Flatten each category's metrics into separate columns
|
71 |
for category in categories:
|
72 |
if category in metrics['exec']:
|
73 |
category_metrics = metrics['exec'][category]
|
74 |
+
flattened_metrics[f"{category}_count"] = category_metrics['count']
|
75 |
+
flattened_metrics[f"{category}_execution_accuracy"] = category_metrics['exec']
|
|
|
|
|
76 |
else:
|
77 |
+
flattened_metrics[f"{category}_count"] = 0
|
78 |
+
flattened_metrics[f"{category}_execution_accuracy"] = 0.0
|
|
|
|
|
79 |
|
80 |
with evaluation_scheduler.lock:
|
81 |
with evaluation_file.open("a") as f:
|
82 |
+
json.dump(flattened_metrics, f)
|
83 |
+
f.write('\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
def run_prediction(inference_api, model_name, prompt_format, output_file):
|
86 |
dataset_path = str(eval_dir / "data/dev.json")
|