ref model and tuple error fix
app.py CHANGED

@@ -34,7 +34,6 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default an
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 # CONFIGURATION:
-ref_model = "mistralai/Mistral-7B-v0.1"
 test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
 modelQueue = (pd.read_csv('data/queue.csv')).values.tolist()
 print(modelQueue)
@@ -48,11 +47,11 @@ def formatr(result):
     result = result.replace(" ","")
     return result
 
-def save_to_txt(model, results, model_type):
+def save_to_txt(model, results, model_type,ref_model):
     file_path = "data/code_eval_board.csv"
 
     with open(file_path, "a") as f:
-        f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])))
+        f.write(f"\n{model_type},{model}," + str(formatr(results["arc"])) + "," + str(formatr(results["hellaswag"])) + "," + str(formatr(results["mmlu"])) + "," + str(formatr(results["truthfulQA"])) + "," + str(formatr(results["winogrande"])) + "," + str(formatr(results["gsm8k"])) + f",{ref_model}")
         f.close()
 
 def run_test(model,ref_model,data):
@@ -67,8 +66,7 @@ def run_test(model,ref_model,data):
         ratio_gen=0.4
     ) # Call the main function in detect-pretrain-code-contamination/src/run.py
 
-def evaluate(model,model_type):
-    global ref_model
+def evaluate(model,model_type,ref_model):
     print(f"|| EVALUATING {model} ||")
     results = {
         "arc": run_test(model, ref_model, test_datasets[2]),
@@ -81,14 +79,14 @@ def evaluate(model,model_type):
     }
 
     # Save to .txt file in /Evaluations/{model}
-    save_to_txt(model, results, model_type)
+    save_to_txt(model, results, model_type,ref_model)
     return "\n".join([f"{k}:{results[k]}" for k in results])
 
 def worker_thread():
     global modelQueue, server
     while True:
         for submission in modelQueue:
-            #evaluate(submission[1],submission[0].split(" ")[0])
+            #evaluate(submission[1],submission[0].split(" ")[0],submission[2])
             #modelQueue.pop(modelQueue.index(submission))
 
             # Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
@@ -110,6 +108,12 @@ def queue(model,model_type,ref_model):
         f.write(f"\n{model_type},{model},{ref_model}")
         f.close()
     print(f"QUEUE:\n{modelQueue}")
+
+    eval_entry = {
+        "model": model,
+        "model_type": model_type,
+        "ref_model": ref_model,
+    }
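In short, this commit stops treating `ref_model` as a module-level constant and instead threads it through the call chain (queue row → `evaluate` → `save_to_txt`), so each queued submission can carry its own reference model. The sketch below is only an illustration of that flow, not the space's actual code: `run_test` is stubbed out (the real function calls into detect-pretrain-code-contamination/src/run.py), and the queue row and dataset names are made up for the example.

```python
# Hedged sketch of the post-commit flow; run_test is a stand-in for the real
# contamination check in detect-pretrain-code-contamination/src/run.py.
def run_test(model, ref_model, dataset):
    # Placeholder result instead of a real contamination score.
    return f"score({model} vs {ref_model} on {dataset})"

def evaluate(model, model_type, ref_model):
    # ref_model is now an explicit parameter rather than a global.
    print(f"|| EVALUATING {model} ||")
    return {d: run_test(model, ref_model, d) for d in ("ai2_arc", "gsm8k")}

# Hypothetical queue rows mirroring data/queue.csv: [model_type, model, ref_model].
modelQueue = [["fine-tuned", "org/some-model", "mistralai/Mistral-7B-v0.1"]]

for submission in modelQueue:
    # Same indexing as the commented-out call in worker_thread().
    results = evaluate(submission[1], submission[0].split(" ")[0], submission[2])
    print(results)
```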
### bigcode/bigcode-models-leaderboard