Yeyito committed on
Commit 5e2e1fb
1 Parent(s): d74f5ae

ref model and tuple error fix

Files changed (1)
  1. app.py +11 -7
app.py CHANGED
@@ -34,7 +34,6 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default an
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 # CONFIGURATION:
-ref_model = "mistralai/Mistral-7B-v0.1"
 test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
 modelQueue = (pd.read_csv('data/queue.csv')).values.tolist()
 print(modelQueue)
@@ -48,11 +47,11 @@ def formatr(result):
     result = result.replace(" ","")
     return result
 
-def save_to_txt(model, results, model_type):
+def save_to_txt(model, results, model_type,ref_model):
     file_path = "data/code_eval_board.csv"
 
     with open(file_path, "a") as f:
-        f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])))
+        f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])) + f",{ref_model}")
         f.close()
 
 def run_test(model,ref_model,data):
@@ -67,8 +66,7 @@ def run_test(model,ref_model,data):
         ratio_gen=0.4
     ) # Call the main function in detect-pretrain-code-contamination/src/run.py
 
-def evaluate(model,model_type):
-    global ref_model
+def evaluate(model,model_type,ref_model):
     print(f"|| EVALUATING {model} ||")
     results = {
         "arc": run_test(model, ref_model, test_datasets[2]),
@@ -81,14 +79,14 @@ def evaluate(model,model_type):
     }
 
     # Save to .txt file in /Evaluations/{model}
-    save_to_txt(model, results, model_type)
+    save_to_txt(model, results, model_type,ref_model)
     return "\n".join([f"{k}:{results[k]}" for k in results])
 
 def worker_thread():
     global modelQueue, server
     while True:
         for submission in modelQueue:
-            #evaluate(submission[1],submission[0].split(" ")[0])
+            #evaluate(submission[1],submission[0].split(" ")[0],submission[2])
             #modelQueue.pop(modelQueue.index(submission))
 
             # Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
@@ -110,6 +108,12 @@ def queue(model,model_type,ref_model):
         f.write(f"\n{model_type},{model},{ref_model}")
         f.close()
     print(f"QUEUE:\n{modelQueue}")
+
+    eval_entry = {
+        "model": model,
+        "model_type": model_type,
+        "ref_model": ref_model,
+    }
 
 
 ### bigcode/bigcode-models-leaderboard
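
For context, a minimal sketch of how the updated pieces fit together after this commit: ref_model is no longer a module-level constant but is carried with each queue entry and threaded through evaluate() into run_test() and save_to_txt(), which now records it as the last CSV column. This is an illustration, not the Space's full app.py; run_test is stubbed, the example queue row stands in for the pd.read_csv('data/queue.csv') load, the row is printed instead of appended to data/code_eval_board.csv, and the test_datasets indices other than "arc" are inferred from the dataset names.

# Sketch (assumptions noted above) of the post-commit call chain.
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]

def run_test(model, ref_model, data):
    # Stub for illustration; the real function calls into
    # detect-pretrain-code-contamination/src/run.py.
    return f"{data}-score"

def formatr(result):
    return str(result).replace(" ", "")

def save_to_txt(model, results, model_type, ref_model):
    # Builds the CSV row the Space appends to data/code_eval_board.csv;
    # ref_model is the new last column.
    row = f"\n{model_type},{model}," + ",".join(
        formatr(results[k]) for k in ["arc","hellaswag","mmlu","truthfulQA","winogrande","gsm8k"]
    ) + f",{ref_model}"
    print(row)  # printed here instead of written to the CSV

def evaluate(model, model_type, ref_model):
    results = {
        "arc": run_test(model, ref_model, test_datasets[2]),
        "mmlu": run_test(model, ref_model, test_datasets[1]),
        "truthfulQA": run_test(model, ref_model, test_datasets[0]),
        "hellaswag": run_test(model, ref_model, test_datasets[4]),
        "winogrande": run_test(model, ref_model, test_datasets[5]),
        "gsm8k": run_test(model, ref_model, test_datasets[3]),
    }
    save_to_txt(model, results, model_type, ref_model)
    return "\n".join(f"{k}:{results[k]}" for k in results)

# Each submission is expected to be a [model_type, model, ref_model] row from data/queue.csv.
modelQueue = [["pretrained", "mistralai/Mistral-7B-v0.1", "mistralai/Mistral-7B-v0.1"]]
for submission in modelQueue:
    evaluate(submission[1], submission[0].split(" ")[0], submission[2])

Passing ref_model along with each submission lets different queue entries be compared against different reference models instead of the single hard-coded one that this commit removes.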