cfahlgren1 HF staff committed on
Commit
ee5875c
·
1 Parent(s): 3445f6a

flatten results for dataset

Browse files
Files changed (1) hide show
  1. evaluation_logic.py +14 -18
evaluation_logic.py CHANGED
@@ -58,33 +58,29 @@ def save_evaluation(inference_api, model_name, prompt_format, metrics):
58
  evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
59
  evaluation_folder.mkdir(parents=True, exist_ok=True)
60
 
61
- # Extract only the category-specific execution metrics
62
  categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
63
- simplified_metrics = {}
 
 
 
 
 
64
 
 
65
  for category in categories:
66
  if category in metrics['exec']:
67
  category_metrics = metrics['exec'][category]
68
- simplified_metrics[category] = {
69
- 'count': category_metrics['count'],
70
- 'execution_accuracy': category_metrics['exec']
71
- }
72
  else:
73
- simplified_metrics[category] = {
74
- 'count': 0,
75
- 'execution_accuracy': 0.0
76
- }
77
 
78
  with evaluation_scheduler.lock:
79
  with evaluation_file.open("a") as f:
80
- json.dump({
81
- "inference_api": inference_api,
82
- "model_name": model_name,
83
- "prompt_format": prompt_format,
84
- "category_metrics": simplified_metrics,
85
- "timestamp": datetime.now().isoformat()
86
- }, f)
87
- f.write('\n')
88
 
89
  def run_prediction(inference_api, model_name, prompt_format, output_file):
90
  dataset_path = str(eval_dir / "data/dev.json")
 
58
  evaluation_file = evaluation_folder / f"evaluation_{file_uuid}.json"
59
  evaluation_folder.mkdir(parents=True, exist_ok=True)
60
 
61
+ # Extract and flatten the category-specific execution metrics
62
  categories = ['easy', 'medium', 'hard', 'duckdb', 'ddl', 'all']
63
+ flattened_metrics = {
64
+ "inference_api": inference_api,
65
+ "model_name": model_name,
66
+ "prompt_format": prompt_format,
67
+ "timestamp": datetime.now().isoformat()
68
+ }
69
 
70
+ # Flatten each category's metrics into separate columns
71
  for category in categories:
72
  if category in metrics['exec']:
73
  category_metrics = metrics['exec'][category]
74
+ flattened_metrics[f"{category}_count"] = category_metrics['count']
75
+ flattened_metrics[f"{category}_execution_accuracy"] = category_metrics['exec']
 
 
76
  else:
77
+ flattened_metrics[f"{category}_count"] = 0
78
+ flattened_metrics[f"{category}_execution_accuracy"] = 0.0
 
 
79
 
80
  with evaluation_scheduler.lock:
81
  with evaluation_file.open("a") as f:
82
+ json.dump(flattened_metrics, f)
83
+ f.write('\n')
 
 
 
 
 
 
84
 
85
  def run_prediction(inference_api, model_name, prompt_format, output_file):
86
  dataset_path = str(eval_dir / "data/dev.json")