Terry Zhuo
committed on
Commit
•
3204d18
1
Parent(s):
aca9a0c
update
Browse files- app.py +90 -158
- src/display/about.py +0 -148
- src/display/css_html_js.py +0 -115
- src/display/formatting.py +0 -37
- src/display/utils.py +0 -142
- src/envs.py +0 -42
- src/execute.py +0 -194
- src/populate.py +0 -50
- src/tools/plots.py +0 -72
- src/voting/vote_system.py +0 -150
app.py
CHANGED
@@ -122,11 +122,6 @@ def evaluate(
|
|
122 |
samples = "__dummy__.jsonl"
|
123 |
|
124 |
extra = subset + "_" if subset != "full" else ""
|
125 |
-
if os.path.isdir(samples):
|
126 |
-
result_path = os.path.join(samples, f"{extra}eval_results.json")
|
127 |
-
else:
|
128 |
-
assert samples.endswith(".jsonl")
|
129 |
-
result_path = samples.replace(".jsonl", f"_{extra}eval_results.json")
|
130 |
|
131 |
problems = get_bigcodebench(subset=subset)
|
132 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
@@ -139,98 +134,93 @@ def evaluate(
|
|
139 |
gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
|
140 |
failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]
|
141 |
|
142 |
-
if
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
if
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
results = {
|
157 |
-
"date": datetime.now().strftime("%Y-%m-%d %H:%M"),
|
158 |
-
"eval": {},
|
159 |
-
}
|
160 |
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
)
|
176 |
-
continue
|
177 |
-
solution = (
|
178 |
-
sample["solution"]
|
179 |
-
if "solution" in sample
|
180 |
-
else problems[task_id]["complete_prompt"] + sample["completion"]
|
181 |
-
)
|
182 |
-
if "sanitized-calibrated" in samples:
|
183 |
-
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
184 |
-
remainings.add(sample["_identifier"])
|
185 |
-
args = (
|
186 |
-
completion_id[task_id],
|
187 |
-
problems[task_id],
|
188 |
-
solution,
|
189 |
-
max_as_limit,
|
190 |
-
max_data_limit,
|
191 |
-
max_stack_limit,
|
192 |
-
sample["_identifier"],
|
193 |
-
min_time_limit,
|
194 |
-
expected_time[task_id] if expected_time[task_id] else 20
|
195 |
)
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
warn(f"{len(remainings)} samples to be tested: {remainings}")
|
211 |
|
212 |
-
|
|
|
|
|
|
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
eval_results[result["task_id"]].append(result)
|
218 |
|
|
|
219 |
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
|
235 |
# Calculate pass@k.
|
236 |
total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
|
@@ -245,79 +235,18 @@ def evaluate(
|
|
245 |
base_correct = np.array(base_correct)
|
246 |
|
247 |
pass_at_k = {
|
248 |
-
f"pass@{k}":
|
249 |
for k in pass_k
|
250 |
if total.min() >= k
|
251 |
}
|
252 |
-
pass_at_k["
|
|
|
|
|
|
|
|
|
253 |
pass_at_k["failed_tasks"] = failed_tasks
|
254 |
-
return pass_at_k
|
255 |
-
|
256 |
-
# mode = "-calibrated" if "sanitized-calibrated" in samples else ""
|
257 |
-
# extra = subset.capitalize()
|
258 |
-
# split = split.capitalize()
|
259 |
-
# cprint(f"BigCodeBench-{split}{mode} ({extra})", "green")
|
260 |
-
|
261 |
-
# if no_gt:
|
262 |
-
# cprint(f"Groundtruth is not checked", "yellow")
|
263 |
-
# else:
|
264 |
-
# if gt_pass_rate > 0.99:
|
265 |
-
# cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
|
266 |
-
# else:
|
267 |
-
# cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
|
268 |
-
|
269 |
-
# if len(failed_tasks) > 0:
|
270 |
-
# cprint(f"Failed tasks: {failed_tasks}", "red")
|
271 |
-
|
272 |
-
# for k, v in pass_at_k.items():
|
273 |
-
# cprint(f"{k}:\t{v:.3f}", "green")
|
274 |
-
|
275 |
-
# # save results
|
276 |
-
# if os.path.isfile(result_path):
|
277 |
-
# decision = ""
|
278 |
-
# while decision.lower() not in ["y", "n"]:
|
279 |
-
# print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...")
|
280 |
-
# decision = input()
|
281 |
-
|
282 |
-
# if decision.lower() == "y":
|
283 |
-
# # mv the file to a backup
|
284 |
-
# new_path = result_path + ".bak"
|
285 |
-
# while os.path.isfile(new_path):
|
286 |
-
# new_path += ".bak"
|
287 |
-
# os.rename(result_path, new_path)
|
288 |
-
# print(f"Backup {result_path} to {new_path}")
|
289 |
|
290 |
-
# if not os.path.isfile(result_path):
|
291 |
-
# with open(result_path, "w") as f:
|
292 |
-
# json.dump(results, f, indent=2)
|
293 |
-
|
294 |
-
# if save_pass_rate:
|
295 |
-
# pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
|
296 |
-
# pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
297 |
-
# pass_at_k["calibrated"] = "sanitized-calibrated" in samples
|
298 |
-
# pass_at_k["subset"] = subset
|
299 |
-
|
300 |
-
# def save_pass_at_k():
|
301 |
-
# with open(pass_at_k_path, "w") as f:
|
302 |
-
# json.dump(pass_at_k, f, indent=2)
|
303 |
-
|
304 |
-
# if os.path.isfile(pass_at_k_path):
|
305 |
-
# saved_pass_at_k = json.load(open(pass_at_k_path, "r"))
|
306 |
-
# # compare saved_pass_at_k with pass_at_k
|
307 |
-
# for k in saved_pass_at_k.keys():
|
308 |
-
# if pass_at_k[k] != saved_pass_at_k[k]:
|
309 |
-
# cprint(f"Warning: {k} is different from the saved one", "yellow")
|
310 |
-
|
311 |
-
# # ask user whether to save the pass@k
|
312 |
-
# decision = ""
|
313 |
-
# while decision.lower() not in ["y", "n"]:
|
314 |
-
# print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
|
315 |
-
# decision = input()
|
316 |
-
# if decision.lower() == "y":
|
317 |
-
# save_pass_at_k()
|
318 |
-
|
319 |
-
# else:
|
320 |
-
# save_pass_at_k()
|
321 |
|
322 |
def run_gradio():
|
323 |
interface = gr.Interface(
|
@@ -335,7 +264,10 @@ def run_gradio():
|
|
335 |
gr.Checkbox(label="Check GT Only"),
|
336 |
gr.Checkbox(label="No GT"),
|
337 |
],
|
338 |
-
outputs=
|
|
|
|
|
|
|
339 |
# concurrency_limit=None
|
340 |
)
|
341 |
interface.queue(default_concurrency_limit=None)
|
|
|
122 |
samples = "__dummy__.jsonl"
|
123 |
|
124 |
extra = subset + "_" if subset != "full" else ""
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
problems = get_bigcodebench(subset=subset)
|
127 |
dataset_hash = get_bigcodebench_hash(subset=subset)
|
|
|
134 |
gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
|
135 |
failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]
|
136 |
|
137 |
+
if check_gt_only:
|
138 |
+
if gt_pass_rate > 0.99:
|
139 |
+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
|
140 |
+
else:
|
141 |
+
cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
|
142 |
+
if len(failed_tasks) > 0:
|
143 |
+
cprint(f"Failed tasks: {failed_tasks}", "red")
|
144 |
+
return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
|
145 |
+
|
146 |
+
results = {
|
147 |
+
"date": datetime.now().strftime("%Y-%m-%d %H:%M"),
|
148 |
+
"eval": {},
|
149 |
+
}
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
+
with ProcessPoolExecutor(max_workers=n_workers) as executor:
|
152 |
+
futures = []
|
153 |
+
completion_id = Counter()
|
154 |
+
n_samples = 0
|
155 |
+
eval_results = defaultdict(list) # task_id ->
|
156 |
+
remainings = set()
|
157 |
|
158 |
+
print("Reading samples...")
|
159 |
+
for sample in tqdm(load_solutions(samples)):
|
160 |
+
task_id = sample["task_id"]
|
161 |
+
|
162 |
+
if task_id not in problems:
|
163 |
+
warn(
|
164 |
+
f"Task {task_id} is found in the samples but not found in the dataset"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
)
|
166 |
+
continue
|
167 |
+
solution = (
|
168 |
+
sample["solution"]
|
169 |
+
if "solution" in sample
|
170 |
+
else problems[task_id]["complete_prompt"] + sample["completion"]
|
171 |
+
)
|
172 |
+
if "sanitized-calibrated" in samples:
|
173 |
+
solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution
|
174 |
+
remainings.add(sample["_identifier"])
|
175 |
+
args = (
|
176 |
+
completion_id[task_id],
|
177 |
+
problems[task_id],
|
178 |
+
solution,
|
179 |
+
max_as_limit,
|
180 |
+
max_data_limit,
|
181 |
+
max_stack_limit,
|
182 |
+
sample["_identifier"],
|
183 |
+
min_time_limit,
|
184 |
+
expected_time[task_id] if expected_time[task_id] else 20
|
185 |
+
)
|
186 |
+
futures.append(executor.submit(check_correctness, *args))
|
187 |
+
completion_id[task_id] += 1
|
188 |
+
n_samples += 1
|
189 |
|
190 |
+
assert n_samples == len(remainings), "Missing problems in unfinished"
|
191 |
+
assert len(completion_id) == len(problems), "Missing problems in samples"
|
|
|
192 |
|
193 |
+
def stucking_checker():
|
194 |
+
not_done = futures
|
195 |
+
while len(not_done) > 0:
|
196 |
+
done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
|
197 |
|
198 |
+
if len(done) == 0:
|
199 |
+
warn("No samples have finished testing in the last 240s")
|
200 |
+
warn(f"{len(remainings)} samples to be tested: {remainings}")
|
|
|
201 |
|
202 |
+
threading.Thread(target=stucking_checker).start()
|
203 |
|
204 |
+
for future in tqdm(as_completed(futures), total=n_samples):
|
205 |
+
result = future.result()
|
206 |
+
remainings.remove(result["_identifier"])
|
207 |
+
eval_results[result["task_id"]].append(result)
|
208 |
+
|
209 |
+
|
210 |
+
# sort the results for each problem by completion_id
|
211 |
+
for task_id, task_results in eval_results.items():
|
212 |
+
task_results.sort(key=lambda x: x["completion_id"])
|
213 |
+
results["eval"][task_id] = []
|
214 |
+
for res in task_results:
|
215 |
+
stat, details = res["base"]
|
216 |
+
results["eval"][task_id].append(
|
217 |
+
{
|
218 |
+
"task_id": task_id,
|
219 |
+
"solution": res["solution"],
|
220 |
+
"status": stat,
|
221 |
+
"details": details,
|
222 |
+
}
|
223 |
+
)
|
224 |
|
225 |
# Calculate pass@k.
|
226 |
total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
|
|
|
235 |
base_correct = np.array(base_correct)
|
236 |
|
237 |
pass_at_k = {
|
238 |
+
f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
|
239 |
for k in pass_k
|
240 |
if total.min() >= k
|
241 |
}
|
242 |
+
pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
|
243 |
+
pass_at_k["split"] = split
|
244 |
+
pass_at_k["subset"] = subset
|
245 |
+
pass_at_k["calibrated"] = "sanitized-calibrated" in samples
|
246 |
+
pass_at_k["gt_pass_rate"] = gt_pass_rate
|
247 |
pass_at_k["failed_tasks"] = failed_tasks
|
248 |
+
return results, pass_at_k
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
|
251 |
def run_gradio():
|
252 |
interface = gr.Interface(
|
|
|
264 |
gr.Checkbox(label="Check GT Only"),
|
265 |
gr.Checkbox(label="No GT"),
|
266 |
],
|
267 |
+
outputs=[
|
268 |
+
gr.JSON(label="Results"),
|
269 |
+
gr.JSON(label="Eval Results"),
|
270 |
+
],
|
271 |
# concurrency_limit=None
|
272 |
)
|
273 |
interface.queue(default_concurrency_limit=None)
|
src/display/about.py
DELETED
@@ -1,148 +0,0 @@
|
|
1 |
-
TITLE = """<div style="text-align: center;"><h1> 🌸<span style='color: #C867B5;'>BigCodeBench</span> Leaderboard</h1></div>\
|
2 |
-
<br>\
|
3 |
-
<p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare performance of LLMs on <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
|
4 |
-
<p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.
|
5 |
-
<br>\
|
6 |
-
For more details, please check our <a href="https://huggingface.co/blog/terryyz/bigcodebench-hard">blog on the Hard Set</a>, <a href="https://huggingface.co/blog/leaderboard-bigcodebench">blog on the Full Set</a> and <a href="https://arxiv.org/abs/2406.15877">paper</a>.</p>
|
7 |
-
"""
|
8 |
-
|
9 |
-
ABOUT_TEXT = """# Context
|
10 |
-
We believe that there are three main expectations of a good execution-based programming benchmark:
|
11 |
-
1. The benchmark should be easy to use and efficient in evaluating the fundamental capabilities of LLMs. Repo-level and agent-centric benchmarks (e.g., SWE-bench) are not suitable for this purpose.
|
12 |
-
2. The benchmark should be practical, covering various programming scenarios. Algo-specific benchmarks (e.g., HumanEval and MBPP) are unsuitable. Domain-specific benchmarks (e.g., DS-1000) are also unsuitable for this purpose.
|
13 |
-
3. The benchmark should be challenging, where the tasks require LLMs' strong compositional reasoning capabilities and instruction-following capabilities. The benchmarks with simple tasks (e.g., ODEX) are unsuitable.
|
14 |
-
|
15 |
-
BigCodeBench is the first benchmark that meets all three expectations. It is an <u>*__easy-to-use__*</u> benchmark that evaluates LLMs with <u>*__practical__*</u> and <u>*__challenging__*</u> programming tasks, accompanied by an end-to-end evaluation framework [`bigcodebench`](https://github.com/bigcode-project/bigcodebench). We aim to assess how well LLMs can solve programming tasks in an open-ended setting, with the following two focuses:
|
16 |
-
|
17 |
-
- Diverse Function Calls: This design requires LLMs to utilize diverse function calls.
|
18 |
-
- Complex Instructions: This design requires LLMs to follow complex instructions.
|
19 |
-
|
20 |
-
|
21 |
-
### Benchmarks & Prompts
|
22 |
-
The dataset has 2 variants:
|
23 |
-
1. `BigCodeBench-Complete`: _Code Completion based on the structured long-context docstrings_.
|
24 |
-
1. `BigCodeBench-Instruct`: _Code Generation based on the NL-oriented instructions_.
|
25 |
-
|
26 |
-
The figure below shows an example of the `Complete` vs `Instruct` prompts. For `Instruct`, we only focus on instruction-tuned LLMs.
|
27 |
-
|
28 |
-
<img src="https://github.com/bigcode-bench/bigcode-bench.github.io/blob/main/asset/bigcodebench_prompt.svg?raw=true" alt="OctoCoder vs Base HumanEval prompt" width="800px">
|
29 |
-
|
30 |
-
The specific prompt template can be found [here](https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/model.py).
|
31 |
-
|
32 |
-
There are some edge cases:
|
33 |
-
- Due to the training flaws in StarCoder2 and Granite-Code, we additionally strip the trailing newlines for model inference.
|
34 |
-
- We have not included the `Instruct` results of Granite-Code-Instruct 8B & 3B as they constantly have empty outputs.
|
35 |
-
|
36 |
-
### Evaluation Parameters
|
37 |
-
- All models were evaluated with the [bigcodebench](https://github.com/bigcode-project/bigcodebench). You can install the [PyPI package](https://pypi.org/project/bigcodebench/).
|
38 |
-
To get started, please first set up the environment:
|
39 |
-
|
40 |
-
```bash
|
41 |
-
# Install to use bigcodebench.evaluate
|
42 |
-
pip install bigcodebench --upgrade
|
43 |
-
# If you want to use the evaluate locally, you need to install the requirements
|
44 |
-
pip install -I -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/main/Requirements/requirements-eval.txt
|
45 |
-
|
46 |
-
# Install to use bigcodebench.generate
|
47 |
-
# You are strongly recommended to install the generate dependencies in a separate environment
|
48 |
-
pip install bigcodebench[generate] --upgrade
|
49 |
-
```
|
50 |
-
|
51 |
-
### Scoring and Rankings
|
52 |
-
- Models are ranked according to Pass@1 using greedy decoding. Setup details can be found <a href="https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py">here</a>.
|
53 |
-
- The code to compute Elo rating is [here](https://github.com/bigcode-project/bigcodebench/blob/main/analysis/get_results.py), which is based on [Chatbot Arena Notebook](https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR#scrollTo=JdiJbB6pZB1B&line=2&uniqifier=1). We only compute the Elo rating for the `BigCodeBench-Complete` variant.
|
54 |
-
|
55 |
-
### Contact
|
56 |
-
If you have any questions, feel free to reach out to us at [terry.zhuo@monash.edu](mailto:terry.zhuo@monash.edu) or [contact@bigcode-project.org](mailto:contact@bigcode-project.org)
|
57 |
-
|
58 |
-
### Citation Information
|
59 |
-
|
60 |
-
```bibtex
|
61 |
-
@article{zhuo2024bigcodebench,
|
62 |
-
title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
|
63 |
-
author={Terry Yue Zhuo and Minh Chien Vu and Jenny Chim and Han Hu and Wenhao Yu and Ratnadira Widyasari and Imam Nur Bani Yusuf and Haolan Zhan and Junda He and Indraneil Paul and Simon Brunner and Chen Gong and Thong Hoang and Armel Randy Zebaze and Xiaoheng Hong and Wen-Ding Li and Jean Kaddour and Ming Xu and Zhihan Zhang and Prateek Yadav and Naman Jain and Alex Gu and Zhoujun Cheng and Jiawei Liu and Qian Liu and Zijian Wang and David Lo and Binyuan Hui and Niklas Muennighoff and Daniel Fried and Xiaoning Du and Harm de Vries and Leandro Von Werra},
|
64 |
-
journal={arXiv preprint arXiv:2406.15877},
|
65 |
-
year={2024}
|
66 |
-
}
|
67 |
-
```
|
68 |
-
"""
|
69 |
-
|
70 |
-
SUBMISSION_TEXT = """
|
71 |
-
<h1 align="center">
|
72 |
-
How to submit models/results to the leaderboard?
|
73 |
-
</h1>
|
74 |
-
We welcome the community to submit evaluation results of new models. We also provide an experimental feature for submitting models that our team will evaluate on the 🤗 cluster.
|
75 |
-
|
76 |
-
## Submitting Models (experimental feature)
|
77 |
-
Inspired by the Open LLM Leaderboard, we welcome code model submissions from the community, which will be automatically evaluated. Please note that this is still an experimental feature.
|
78 |
-
Below are some guidelines to follow before submitting your model:
|
79 |
-
|
80 |
-
#### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
81 |
-
```python
|
82 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
83 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
84 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
85 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
86 |
-
```
|
87 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
88 |
-
Note: make sure your model is public!
|
89 |
-
Note: if your model needs `trust_remote_code=True`, we do not support this option yet.
|
90 |
-
#### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
91 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
92 |
-
#### 3) Make sure your model has an open license!
|
93 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
94 |
-
#### 4) Fill out your model card
|
95 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
|
96 |
-
"""
|
97 |
-
|
98 |
-
SUBMISSION_TEXT_2 = """
|
99 |
-
## Submitting Results
|
100 |
-
You also have the option of running the evaluation yourself and submitting the results. These results will be added as non-verified; the authors are, however, required to upload their generations in case other members want to check them.
|
101 |
-
|
102 |
-
### 1 - Running Evaluation
|
103 |
-
|
104 |
-
We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a JSON file summarizing the results, in addition to the raw generations and metric files.
|
105 |
-
|
106 |
-
### 2 - Submitting Results 🚀
|
107 |
-
|
108 |
-
To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/bigcodebench-code-evals/tree/main/community_results) `community_results` in this repository:
|
109 |
-
- Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_my_model_terry`
|
110 |
-
- Put your JSON file with the grouped scores from the guide in it, in addition to the generations folder and the metrics folder.
|
111 |
-
|
112 |
-
The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
|
113 |
-
"""
|
114 |
-
|
115 |
-
SUBMISSION_TEXT_3 = """
|
116 |
-
<h1 align="center">
|
117 |
-
How to submit models/results to the leaderboard?
|
118 |
-
</h1>
|
119 |
-
We welcome the community to submit evaluation results of new models. These results will be added as non-verified; the authors are, however, required to upload their generations in case other members want to check them.
|
120 |
-
|
121 |
-
### 1 - Running Evaluation
|
122 |
-
|
123 |
-
We wrote a detailed guide for running the evaluation on your model. You can find it in [bigcode-evaluation-harness/leaderboard](https://github.com/bigcode-project/bigcode-evaluation-harness/tree/main/leaderboard). This will generate a JSON file summarizing the results, in addition to the raw generations and metric files.
|
124 |
-
|
125 |
-
### 2 - Submitting Results 🚀
|
126 |
-
|
127 |
-
To submit your results create a **Pull Request** in the community tab to add them under the [folder](https://huggingface.co/spaces/bigcode/multilingual-code-evals/tree/main/community_results) `community_results` in this repository:
|
128 |
-
- Create a folder called `ORG_MODELNAME_USERNAME` for example `bigcode_starcoder_loubnabnl`
|
129 |
-
- Put your JSON file with the grouped scores from the guide in it, in addition to the generations folder and the metrics folder.
|
130 |
-
|
131 |
-
The title of the PR should be `[Community Submission] Model: org/model, Username: your_username`, replace org and model with those corresponding to the model you evaluated.
|
132 |
-
"""
|
133 |
-
|
134 |
-
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
135 |
-
|
136 |
-
CITATION_BUTTON_TEXT = r"""
|
137 |
-
@article{zhuo2024bigcodebench,
|
138 |
-
title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
|
139 |
-
author={Zhuo, Terry Yue and Vu, Minh Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and others},
|
140 |
-
journal={arXiv preprint arXiv:2406.15877},
|
141 |
-
year={2024}
|
142 |
-
}
|
143 |
-
"""
|
144 |
-
|
145 |
-
SUBMISSION_TEXT_3="""
|
146 |
-
## We welcome the community to request new models to be added to the leaderboard.
|
147 |
-
## Please [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard or [start a discussion](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard/discussions/new) in the community🤗
|
148 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/css_html_js.py
DELETED
@@ -1,115 +0,0 @@
|
|
1 |
-
custom_css = """
|
2 |
-
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
3 |
-
table td:first-child,
|
4 |
-
table th:first-child {
|
5 |
-
max-width: 400px;
|
6 |
-
overflow: auto;
|
7 |
-
white-space: nowrap;
|
8 |
-
}
|
9 |
-
|
10 |
-
/* Full width space */
|
11 |
-
.gradio-container {
|
12 |
-
max-width: 95% !important;
|
13 |
-
}
|
14 |
-
|
15 |
-
/* Text style and margins */
|
16 |
-
.markdown-text {
|
17 |
-
font-size: 16px !important;
|
18 |
-
}
|
19 |
-
|
20 |
-
#models-to-add-text {
|
21 |
-
font-size: 18px !important;
|
22 |
-
}
|
23 |
-
|
24 |
-
#citation-button span {
|
25 |
-
font-size: 16px !important;
|
26 |
-
}
|
27 |
-
|
28 |
-
#citation-button textarea {
|
29 |
-
font-size: 16px !important;
|
30 |
-
}
|
31 |
-
|
32 |
-
#citation-button > label > button {
|
33 |
-
margin: 6px;
|
34 |
-
transform: scale(1.3);
|
35 |
-
}
|
36 |
-
|
37 |
-
#search-bar-table-box > div:first-child {
|
38 |
-
background: none;
|
39 |
-
border: none;
|
40 |
-
}
|
41 |
-
|
42 |
-
#search-bar {
|
43 |
-
padding: 0px;
|
44 |
-
}
|
45 |
-
|
46 |
-
.tab-buttons button {
|
47 |
-
font-size: 20px;
|
48 |
-
}
|
49 |
-
|
50 |
-
/* Filters style */
|
51 |
-
#filter_type {
|
52 |
-
border: 0;
|
53 |
-
padding-left: 0;
|
54 |
-
padding-top: 0;
|
55 |
-
}
|
56 |
-
#filter_type label {
|
57 |
-
display: flex;
|
58 |
-
}
|
59 |
-
#filter_type label > span {
|
60 |
-
margin-top: var(--spacing-lg);
|
61 |
-
margin-right: 0.5em;
|
62 |
-
}
|
63 |
-
#filter_type label > .wrap {
|
64 |
-
width: 103px;
|
65 |
-
}
|
66 |
-
#filter_type label > .wrap .wrap-inner {
|
67 |
-
padding: 2px;
|
68 |
-
}
|
69 |
-
#filter_type label > .wrap .wrap-inner input {
|
70 |
-
width: 1px;
|
71 |
-
}
|
72 |
-
#filter-columns-type {
|
73 |
-
border: 0;
|
74 |
-
padding: 0.5;
|
75 |
-
}
|
76 |
-
#filter-columns-size {
|
77 |
-
border: 0;
|
78 |
-
padding: 0.5;
|
79 |
-
}
|
80 |
-
#box-filter > .form {
|
81 |
-
border: 0;
|
82 |
-
}
|
83 |
-
|
84 |
-
/* Header styles */
|
85 |
-
#header-title {
|
86 |
-
text-align: left;
|
87 |
-
display: inline-block;
|
88 |
-
}
|
89 |
-
|
90 |
-
#header-row {
|
91 |
-
display: flex;
|
92 |
-
justify-content: space-between;
|
93 |
-
align-items: center;
|
94 |
-
}
|
95 |
-
|
96 |
-
#header-row .gradio-html {
|
97 |
-
flex-grow: 1;
|
98 |
-
}
|
99 |
-
|
100 |
-
#oauth-button {
|
101 |
-
height: auto;
|
102 |
-
min-width: max-content;
|
103 |
-
white-space: nowrap;
|
104 |
-
padding: 10px 20px;
|
105 |
-
border-radius: 4px;
|
106 |
-
}
|
107 |
-
"""
|
108 |
-
|
109 |
-
get_window_url_params = """
|
110 |
-
function(url_params) {
|
111 |
-
const params = new URLSearchParams(window.location.search);
|
112 |
-
url_params = Object.fromEntries(params);
|
113 |
-
return url_params;
|
114 |
-
}
|
115 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/formatting.py
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
from huggingface_hub import HfApi
|
2 |
-
|
3 |
-
API = HfApi()
|
4 |
-
|
5 |
-
|
6 |
-
def model_hyperlink(link, model_name):
|
7 |
-
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
8 |
-
|
9 |
-
|
10 |
-
def make_clickable_model(df, model_col, link_col):
|
11 |
-
df[model_col] = df.apply(
|
12 |
-
lambda row: model_hyperlink(row[link_col], row[model_col]), axis=1
|
13 |
-
)
|
14 |
-
df["Openness"] = df.apply(
|
15 |
-
lambda row: "Open" if "huggingface.co" in row[link_col] else "Closed", axis=1
|
16 |
-
)
|
17 |
-
return df
|
18 |
-
|
19 |
-
|
20 |
-
def styled_error(error):
|
21 |
-
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
22 |
-
|
23 |
-
|
24 |
-
def styled_warning(warn):
|
25 |
-
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
|
26 |
-
|
27 |
-
|
28 |
-
def styled_message(message):
|
29 |
-
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
|
30 |
-
|
31 |
-
|
32 |
-
def has_no_nan_values(df, columns):
|
33 |
-
return df[columns].notna().all(axis=1)
|
34 |
-
|
35 |
-
|
36 |
-
def has_nan_values(df, columns):
|
37 |
-
return df[columns].isna().any(axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/utils.py
DELETED
@@ -1,142 +0,0 @@
|
|
1 |
-
from dataclasses import dataclass, make_dataclass
|
2 |
-
from enum import Enum
|
3 |
-
import json
|
4 |
-
import logging
|
5 |
-
from datetime import datetime
|
6 |
-
import pandas as pd
|
7 |
-
|
8 |
-
|
9 |
-
# Configure logging
|
10 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
11 |
-
|
12 |
-
# Convert ISO 8601 dates to datetime objects for comparison
|
13 |
-
def parse_iso8601_datetime(date_str):
|
14 |
-
if date_str.endswith('Z'):
|
15 |
-
date_str = date_str[:-1] + '+00:00'
|
16 |
-
return datetime.fromisoformat(date_str)
|
17 |
-
|
18 |
-
def parse_datetime(datetime_str):
|
19 |
-
formats = [
|
20 |
-
"%Y-%m-%dT%H-%M-%S.%f", # Format with dashes
|
21 |
-
"%Y-%m-%dT%H:%M:%S.%f", # Standard format with colons
|
22 |
-
"%Y-%m-%dT%H %M %S.%f", # Spaces as separator
|
23 |
-
]
|
24 |
-
|
25 |
-
for fmt in formats:
|
26 |
-
try:
|
27 |
-
return datetime.strptime(datetime_str, fmt)
|
28 |
-
except ValueError:
|
29 |
-
continue
|
30 |
-
# in rare cases set unix start time for files with incorrect time (legacy files)
|
31 |
-
logging.error(f"No valid date format found for: {datetime_str}")
|
32 |
-
return datetime(1970, 1, 1)
|
33 |
-
|
34 |
-
|
35 |
-
def load_json_data(file_path):
|
36 |
-
"""Safely load JSON data from a file."""
|
37 |
-
try:
|
38 |
-
with open(file_path, "r") as file:
|
39 |
-
return json.load(file)
|
40 |
-
except json.JSONDecodeError:
|
41 |
-
print(f"Error reading JSON from {file_path}")
|
42 |
-
return None # Or raise an exception
|
43 |
-
|
44 |
-
|
45 |
-
def fields(raw_class):
|
46 |
-
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
47 |
-
|
48 |
-
|
49 |
-
column_map = {
|
50 |
-
"T": "T",
|
51 |
-
"model": "Model",
|
52 |
-
"type": "Model Type",
|
53 |
-
"size_range": "Size Range",
|
54 |
-
"complete": "Complete",
|
55 |
-
"instruct": "Instruct",
|
56 |
-
"average": "Average",
|
57 |
-
"elo_mle": "Elo Rating",
|
58 |
-
"link": "Link",
|
59 |
-
"act_param": "#Act Params (B)",
|
60 |
-
"size": "#Params (B)",
|
61 |
-
"moe": "MoE",
|
62 |
-
# "lazy": "Lazy",
|
63 |
-
"openness": "Openness",
|
64 |
-
# "direct_complete": "Direct Completion",
|
65 |
-
}
|
66 |
-
|
67 |
-
type_map = {
|
68 |
-
"🔶": "🔶 Chat Models (RLHF, DPO, IFT, ...)",
|
69 |
-
"🟢": "🟢 Base Models"
|
70 |
-
}
|
71 |
-
|
72 |
-
moe_map = {
|
73 |
-
True: "MoE",
|
74 |
-
False: "Dense"
|
75 |
-
}
|
76 |
-
# These classes are for user facing column names,
|
77 |
-
# to avoid having to change them all around the code
|
78 |
-
# when a modification is needed
|
79 |
-
@dataclass(frozen=True)
class ColumnContent:
    """Immutable description of a single table column."""

    # Column title.
    name: str
    # Value kind; this file uses "str", "number" and "markdown".
    type: str
    # Whether the column is part of the default selection.
    displayed_by_default: bool
    # Display flags, all off unless set explicitly.
    # NOTE(review): exact UI semantics are decided by the consumer; confirm there.
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
|
87 |
-
|
88 |
-
|
89 |
-
# Field specifications for AutoEvalColumn, one [attribute, type, default]
# entry per column, in display order.
# Disabled columns: "lazy" ("bool", hidden) and "direct_complete" ("bool").
auto_eval_column_dict = [
    [attribute, ColumnContent, content]
    for attribute, content in (
        # Init
        ("T", ColumnContent(column_map["T"], "str", True, never_hidden=True)),
        ("model", ColumnContent(column_map["model"], "markdown", True, never_hidden=True)),
        ("type", ColumnContent(column_map["type"], "str", False, True)),
        ("size_range", ColumnContent(column_map["size_range"], "str", False, True)),
        # Scores
        ("complete", ColumnContent(column_map["complete"], "number", True)),
        ("instruct", ColumnContent(column_map["instruct"], "number", True)),
        ("average", ColumnContent(column_map["average"], "number", True)),
        ("elo_mle", ColumnContent(column_map["elo_mle"], "number", True)),
        # Model information
        ("act_param", ColumnContent(column_map["act_param"], "number", True)),
        ("link", ColumnContent(column_map["link"], "str", False, True)),
        ("size", ColumnContent(column_map["size"], "number", False)),
        ("moe", ColumnContent(column_map["moe"], "str", False, True)),
        ("openness", ColumnContent(column_map["openness"], "str", False, True)),
    )
]

# The dataclass is generated from the list above, so each column is reachable
# as a class attribute (e.g. AutoEvalColumn.model) holding its ColumnContent.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
112 |
-
|
113 |
-
|
114 |
-
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptions for the evaluation-queue table.

    Both entries are plain class attributes (they carry no annotation), so
    the dataclass decorator does not turn them into instance fields.
    """

    model_link = ColumnContent(name="link", type="markdown", displayed_by_default=True)
    model_name = ColumnContent(name="model", type="str", displayed_by_default=True)
|
118 |
-
|
119 |
-
@dataclass
class ModelDetails:
    """A model name together with an optional emoji marker."""

    name: str
    # emoji, only for the model type
    symbol: str = ""
|
123 |
-
|
124 |
-
|
125 |
-
# Column selection
|
126 |
-
# Titles and value kinds of the leaderboard columns, in declaration order.
COLS = [column.name for column in fields(AutoEvalColumn)]
TYPES = [column.type for column in fields(AutoEvalColumn)]

# The same two lists for the evaluation-queue table.
EVAL_COLS = [column.name for column in fields(EvalQueueColumn)]
EVAL_TYPES = [column.type for column in fields(EvalQueueColumn)]
|
131 |
-
|
132 |
-
|
133 |
-
# Size-range label -> half-open interval (lower, upper] it covers.
# Consecutive intervals share their boundary, so they tile (-1, 10000].
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/envs.py
DELETED
@@ -1,42 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from huggingface_hub import HfApi
|
3 |
-
|
4 |
-
# clone / pull the lmeh eval data
|
5 |
-
# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

DATA_VERSION = "v0.1.0_hf"

# Hub repository identifiers.
REPO_ID = "bigcode/bigcodebench-evaluator"
QUEUE_REPO = "bigcode/bigcodebench-requests"
DATA_REPO = "bigcode/bigcodebench"
RESULT_REPO = "bigcode/bigcodebench-results"
HARD_RESULT_REPO = "bigcode/bigcodebench-hard-results"

ELO_REPO = "bigcode/bigcodebench-elo"
HARD_ELO_REPO = "bigcode/bigcodebench-hard-elo"
SOLVE_REPO = "bigcode/bigcodebench-solve-rate"
HARD_SOLVE_REPO = "bigcode/bigcodebench-hard-solve-rate"

VOTES_REPO = "bigcode/bigcodebench-votes"

# Base directory for local data; the current directory unless HF_HOME is set.
HF_HOME = os.environ.get("HF_HOME", ".")

# Check HF_HOME write access
print(f"Initial HF_HOME set to: {HF_HOME}")

if os.access(HF_HOME, os.W_OK):
    print("Write access confirmed for HF_HOME")
else:
    # Fall back to the working directory and publish the new value through
    # the environment as well.
    print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
    HF_HOME = "."
    os.environ["HF_HOME"] = HF_HOME

VOTES_PATH = os.path.join(HF_HOME, "model-votes")
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")

# Rate limit variables
RATE_LIMIT_PERIOD = 7
RATE_LIMIT_QUOTA = 5
HAS_HIGHER_RATE_LIMIT = []

# Shared Hub client, authenticated with HF_TOKEN.
API = HfApi(token=HF_TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/execute.py
DELETED
@@ -1,194 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import subprocess
|
3 |
-
import sys
|
4 |
-
import os
|
5 |
-
import threading
|
6 |
-
import time
|
7 |
-
import uuid
|
8 |
-
import glob
|
9 |
-
import shutil
|
10 |
-
from pathlib import Path
|
11 |
-
|
12 |
-
# Executable name that every generated command line starts with.
default_command = "bigcodebench.evaluate"
# Module-wide guard: set while run_bigcodebench is executing a command.
is_running = False
|
14 |
-
|
15 |
-
def generate_command(
    jsonl_file, split, subset, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the evaluator command line as one space-separated string.

    When ``jsonl_file`` is given, the file at ``jsonl_file.name`` is first
    copied into the current directory and passed via ``--samples`` by its
    base name. ``--parallel`` is emitted only for a value that is neither
    ``None`` nor 0. The three ``max_*`` limits and ``parallel`` are
    truncated to integers; ``min_time_limit`` is passed through ``str``.
    ``check_gt_only`` and ``no_gt`` add their bare flags when truthy.
    """
    argv = [default_command]

    if jsonl_file is not None:
        # Copy the uploaded file to the current directory
        uploaded_path = jsonl_file.name
        local_filename = os.path.basename(uploaded_path)
        shutil.copy(uploaded_path, local_filename)
        argv += ["--samples", local_filename]

    argv += ["--split", split, "--subset", subset]

    if not (parallel is None or parallel == 0):
        argv += ["--parallel", str(int(parallel))]

    argv += ["--min-time-limit", str(min_time_limit)]
    for option, limit in (
        ("--max-as-limit", max_as_limit),
        ("--max-data-limit", max_data_limit),
        ("--max-stack-limit", max_stack_limit),
    ):
        argv += [option, str(int(limit))]

    if check_gt_only:
        argv.append("--check-gt-only")
    if no_gt:
        argv.append("--no-gt")

    return " ".join(argv)
|
47 |
-
|
48 |
-
|
49 |
-
def cleanup_previous_files(jsonl_file):
    """Delete everything in the current directory except a fixed keep-list.

    Kept names are ``Dockerfile``, ``app.py``, ``README.md``,
    ``__pycache__`` and, when ``jsonl_file`` is given, the base name of
    ``jsonl_file.name``. Entries that cannot be removed with
    ``os.remove`` are reported on stdout and skipped.
    """
    keep = ['Dockerfile', 'app.py', 'README.md']
    if jsonl_file is not None:
        keep.append(os.path.basename(jsonl_file.name))
    keep.append("__pycache__")

    for entry in glob.glob("*"):
        if entry in keep:
            continue
        try:
            os.remove(entry)
        except Exception as e:
            print(f"Error during cleanup of {entry}: {e}")
|
60 |
-
|
61 |
-
def find_result_file():
    """Return the most recently modified ``*.json`` file in the current directory.

    Returns ``None`` when the directory contains no such file.
    """
    candidates = glob.glob("*.json")
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)
|
66 |
-
|
67 |
-
def run_bigcodebench(command):
    """Run *command* as a subprocess and yield its output line by line.

    ``command`` is split on whitespace to form the argument list, and
    stderr is merged into stdout. Only one command may run at a time: if
    the module-level ``is_running`` flag is already set, a single notice
    is yielded and nothing is started. The flag is always cleared when the
    generator finishes or is closed.

    A watchdog timer terminates the subprocess after 12 minutes. After the
    output is exhausted the generator yields status lines: a timeout
    notice if the watchdog fired, an error line for a non-zero exit code,
    a completion line, and whether a result file was found.
    """
    global is_running
    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return
    is_running = True

    try:
        yield f"Executing command: {command}\n"

        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
        timed_out = threading.Event()

        def kill_process():
            # Runs on the timer thread. It must be a plain function: a
            # callback containing ``yield`` is a generator function, and
            # calling it would only create a generator without terminating
            # anything. The timeout is reported by the main generator.
            if process.poll() is None:  # If the process is still running
                timed_out.set()
                process.terminate()

        # Start a timer to kill the process after 12 minutes
        timer = threading.Timer(720, kill_process)
        timer.start()

        for line in process.stdout:
            yield line

        # End of output does not set ``returncode``; wait() collects the
        # exit status. The watchdog stays armed until the process is gone.
        process.wait()
        timer.cancel()

        if timed_out.is_set():
            yield "Process terminated after 12 minutes timeout.\n"

        if process.returncode != 0:
            yield f"Error: Command exited with status {process.returncode}\n"

        yield "Evaluation completed.\n"

        result_file = find_result_file()
        if result_file:
            yield f"Result file found: {result_file}\n"
        else:
            yield "No result file found.\n"
    finally:
        is_running = False
|
108 |
-
|
109 |
-
def stream_logs(command, jsonl_file=None):
    """Clean the working directory, run *command*, and yield the growing log.

    If a command is already running, a single notice is yielded and
    nothing else happens. Otherwise previous files are cleaned up, a
    confirmation line is yielded, and after every chunk produced by
    ``run_bigcodebench`` the whole log accumulated so far is yielded.
    """
    global is_running

    if is_running:
        yield "A command is already running. Please wait for it to finish.\n"
        return

    cleanup_previous_files(jsonl_file)
    yield "Cleaned up previous files.\n"

    transcript = ""
    for chunk in run_bigcodebench(command):
        transcript += chunk
        yield transcript
|
123 |
-
|
124 |
-
# with gr.Blocks() as demo:
|
125 |
-
# gr.Markdown("# BigCodeBench Evaluator")
|
126 |
-
|
127 |
-
# with gr.Row():
|
128 |
-
# jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
|
129 |
-
# split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
|
130 |
-
# subset = gr.Dropdown(choices=["hard", "full"], label="Subset", value="hard")
|
131 |
-
|
132 |
-
# with gr.Row():
|
133 |
-
# parallel = gr.Number(label="Parallel (optional)", precision=0)
|
134 |
-
# min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
|
135 |
-
# max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)
|
136 |
-
|
137 |
-
# with gr.Row():
|
138 |
-
# max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
|
139 |
-
# max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
|
140 |
-
# check_gt_only = gr.Checkbox(label="Check GT Only")
|
141 |
-
# no_gt = gr.Checkbox(label="No GT")
|
142 |
-
|
143 |
-
# command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
|
144 |
-
# with gr.Row():
|
145 |
-
# submit_btn = gr.Button("Run Evaluation")
|
146 |
-
# download_btn = gr.DownloadButton(label="Download Result")
|
147 |
-
# log_output = gr.Textbox(label="Execution Logs", lines=20)
|
148 |
-
|
149 |
-
# input_components = [
|
150 |
-
# jsonl_file, split, subset, parallel,
|
151 |
-
# min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
|
152 |
-
# check_gt_only, no_gt
|
153 |
-
# ]
|
154 |
-
|
155 |
-
# for component in input_components:
|
156 |
-
# component.change(generate_command, inputs=input_components, outputs=command_output)
|
157 |
-
|
158 |
-
|
159 |
-
# def start_evaluation(command, jsonl_file, subset, split):
|
160 |
-
# extra = subset + "_" if subset != "full" else ""
|
161 |
-
# if jsonl_file is not None:
|
162 |
-
# result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
|
163 |
-
# else:
|
164 |
-
# result_path = None
|
165 |
-
|
166 |
-
# for log in stream_logs(command, jsonl_file):
|
167 |
-
# if jsonl_file is not None:
|
168 |
-
# yield log, gr.update(value=result_path, label=result_path), gr.update()
|
169 |
-
# else:
|
170 |
-
# yield log, gr.update(), gr.update()
|
171 |
-
# result_file = find_result_file()
|
172 |
-
# if result_file:
|
173 |
-
# return gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
|
174 |
-
# # gr.Button(visible=False)#,
|
175 |
-
# # gr.DownloadButton(label="Download Result", value=result_file, visible=True))
|
176 |
-
# else:
|
177 |
-
# return gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)
|
178 |
-
# # gr.Button("Run Evaluation", visible=True),
|
179 |
-
# # gr.DownloadButton(visible=False))
|
180 |
-
# submit_btn.click(start_evaluation,
|
181 |
-
# inputs=[command_output, jsonl_file, subset, split],
|
182 |
-
# outputs=[log_output, download_btn])
|
183 |
-
|
184 |
-
# REPO_ID = "bigcode/bigcodebench-evaluator"
|
185 |
-
# HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
186 |
-
# API = HfApi(token=HF_TOKEN)
|
187 |
-
|
188 |
-
# def restart_space():
|
189 |
-
# API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
190 |
-
|
191 |
-
# demo.queue(max_size=300).launch(share=True, server_name="0.0.0.0", server_port=7860)
|
192 |
-
# scheduler = BackgroundScheduler()
|
193 |
-
# scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h as backup in case automatic updates are not working
|
194 |
-
# scheduler.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/populate.py
DELETED
@@ -1,50 +0,0 @@
|
|
1 |
-
import pathlib
|
2 |
-
import pandas as pd
|
3 |
-
from datasets import Dataset
|
4 |
-
from src.display.formatting import has_no_nan_values, make_clickable_model
|
5 |
-
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
6 |
-
from src.display.utils import load_json_data, column_map, type_map, moe_map, NUMERIC_INTERVALS
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
def get_evaluation_queue_df(save_path, cols):
    """Generate dataframes for pending, running, and finished evaluation entries.

    Every ``*.json`` file below ``save_path`` is loaded. Entries that are
    JSON objects with a ``"status"`` key are grouped by that status;
    anything else (unreadable files, other JSON shapes) is skipped.

    Returns a 3-tuple of DataFrames restricted to ``cols``, in the order
    (finished, running, pending).
    """
    save_path = pathlib.Path(save_path)

    # Organizing data by status
    status_map = {
        "PENDING": ["PENDING", "RERUN"],
        "RUNNING": ["RUNNING"],
        "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
    }

    all_evals = []
    for path in save_path.rglob("*.json"):
        data = load_json_data(path)
        # Previously the loaded entry was dropped here, so the result was
        # always three empty frames. Keep only entries the grouping below
        # can classify.
        if isinstance(data, dict) and "status" in data:
            all_evals.append(data)

    status_dfs = {status: [] for status in status_map}
    for eval_data in all_evals:
        for status, extra_statuses in status_map.items():
            if eval_data["status"] in extra_statuses:
                status_dfs[status].append(eval_data)

    return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
|
30 |
-
|
31 |
-
|
32 |
-
def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list):
    """Retrieve and process leaderboard data.

    The dataset is converted to a DataFrame, its columns are renamed via
    ``column_map``, and the derived columns are filled in: the MoE and
    model-type labels, the average of the Complete and Instruct scores
    (``None`` when either is missing), and the size-range label. Rows are
    sorted by average, best first, and the ``cols`` selection is returned
    rounded to two decimals.
    """
    columns_by_name = leaderboard_dataset.to_dict()
    row_count = leaderboard_dataset.num_rows
    records = []
    for row_index in range(row_count):
        records.append({key: columns_by_name[key][row_index] for key in columns_by_name.keys()})

    df = pd.DataFrame.from_records(records)
    # map column names
    df = df.rename(columns=column_map)

    moe_col = AutoEvalColumn.moe.name
    type_col = AutoEvalColumn.type.name
    complete_col = AutoEvalColumn.complete.name
    instruct_col = AutoEvalColumn.instruct.name
    average_col = AutoEvalColumn.average.name

    df[moe_col] = df[moe_col].map(moe_map)
    # Keep the raw type marker in the "T" column before expanding the label.
    df[AutoEvalColumn.T.name] = df[type_col]
    df[type_col] = df[type_col].map(type_map)

    def _mean_score(row):
        complete_score = row[complete_col]
        instruct_score = row[instruct_col]
        if pd.isna(complete_score) or pd.isna(instruct_score):
            return None
        return round((complete_score + instruct_score) / 2, 1)

    df[average_col] = df.apply(_mean_score, axis=1)

    def _size_bucket(size):
        for label, interval in NUMERIC_INTERVALS.items():
            if size in interval:
                return label
        return "?"

    df[AutoEvalColumn.size_range.name] = df[AutoEvalColumn.size.name].apply(_size_bucket)

    df = make_clickable_model(df, AutoEvalColumn.model.name, AutoEvalColumn.link.name)
    df = df.sort_values(by=[average_col], ascending=False)
    return df[cols].round(decimals=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/tools/plots.py
DELETED
@@ -1,72 +0,0 @@
|
|
1 |
-
import plotly.graph_objects as go
|
2 |
-
import plotly.express as px
|
3 |
-
import numpy as np
|
4 |
-
|
5 |
-
|
6 |
-
def plot_elo_mle(df):
    """Return a scatter figure of ratings per model with asymmetric error bars.

    Reads the ``model``, ``rating``, ``error_y`` and ``error_y_minus``
    columns of ``df``.
    """
    scatter_options = {
        "x": "model",
        "y": "rating",
        "error_y": "error_y",
        "error_y_minus": "error_y_minus",
    }
    fig = px.scatter(df, **scatter_options)

    layout_options = {
        "xaxis_title": "Model",
        "yaxis_title": "Rating",
        "autosize": True,
    }
    fig.update_layout(**layout_options)
    return fig
|
18 |
-
|
19 |
-
|
20 |
-
def plot_solve_rate(df, task, rows=30, cols=38):
    """Return a heatmap figure of per-task solve rates.

    Tasks are ordered by the integer after the last ``/`` of ``task_id``
    and laid out row by row on a ``rows`` x ``cols`` grid; unused cells
    are masked. If there are more tasks than grid cells, the number of
    rows grows so that every task fits (previously this case raised).
    The title reports the percentage of tasks with a non-zero solve rate,
    which is 0.0 for empty input instead of a division by zero.

    Args:
        df: frame with ``task_id`` and ``solve_rate`` columns.
        task: label inserted into the figure title.
        rows: minimum number of grid rows.
        cols: number of grid columns.
    """
    keys = df["task_id"]
    values = df["solve_rate"]

    values = np.array(values, dtype=float)  # Ensure values are floats

    # Extract numerical IDs and sort by them
    ids = [int(key.split('/')[-1]) for key in keys]
    sorted_indices = np.argsort(ids)
    keys = np.array(keys)[sorted_indices]
    values = values[sorted_indices]

    n = len(values)
    if n > rows * cols:
        # Ceiling division: the smallest row count that holds all tasks.
        rows = -(-n // cols)
    pad_width = rows * cols - n

    # Create a masked array
    masked_values = np.ma.array(np.full(rows * cols, np.nan), mask=True)
    masked_values[:n] = values
    masked_values.mask[:n] = False
    masked_values = masked_values.reshape((rows, cols))

    keys_padded = np.pad(keys, (0, pad_width), 'constant', constant_values='')
    keys_reshaped = keys_padded.reshape((rows, cols))

    hover_text = np.empty_like(masked_values, dtype=object)
    for i in range(rows):
        for j in range(cols):
            if not masked_values.mask[i, j]:
                hover_text[i, j] = f"{keys_reshaped[i, j]}<br>Solve Rate: {masked_values[i, j]:.2f}"
            else:
                hover_text[i, j] = "NaN"

    upper_solve_rate = round(np.count_nonzero(values) / n * 100, 2) if n else 0.0

    fig = go.Figure(data=go.Heatmap(
        z=masked_values,
        text=hover_text,
        hoverinfo='text',
        colorscale='teal',
        zmin=0,
        zmax=100
    ))

    fig.update_layout(
        title=f'BigCodeBench-{task}<br><i>Lowest Upper Limit: {upper_solve_rate}%</i>',
        xaxis_nticks=cols,
        yaxis_nticks=rows,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False),
        autosize=True,
    )

    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/voting/vote_system.py
DELETED
@@ -1,150 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import logging
|
3 |
-
import pathlib
|
4 |
-
import pandas as pd
|
5 |
-
import gradio as gr
|
6 |
-
import schedule
|
7 |
-
import time
|
8 |
-
from datetime import datetime, timezone
|
9 |
-
|
10 |
-
from src.envs import API
|
11 |
-
|
12 |
-
# Set up logging
|
13 |
-
logging.basicConfig(level=logging.INFO)
|
14 |
-
logger = logging.getLogger(__name__)
|
15 |
-
|
16 |
-
class VoteManager:
    """Keeps per-user votes for models in a local JSONL file and uploads them.

    Votes live in ``<votes_path>/votes_data.jsonl``, one JSON object per
    line with the keys ``model``, ``revision``, ``username`` and
    ``timestamp``. A set of ``(model, revision, username)`` tuples is kept
    in memory to reject duplicate votes.
    """

    # Columns of the vote table; used so an empty table still has them.
    _VOTE_COLUMNS = ["model", "revision", "username", "timestamp"]

    def __init__(self, votes_path, eval_requests_path, repo_id):
        self.votes_path = votes_path
        self.eval_requests_path = eval_requests_path
        self.repo_id = repo_id
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)
        self.votes_to_upload = []

    def init_vote_dataset(self):
        """Reload the vote table and the duplicate-check set from disk."""
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)

    def read_vote_dataset(self):
        """Return all recorded votes as a DataFrame.

        Blank lines in the file are skipped. When the file is missing or
        holds no votes, an empty frame that still has the vote columns is
        returned, so callers can group and merge on them.
        """
        records = []
        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
        if votes_file.exists():
            with open(votes_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    records.append(json.loads(line))
        if not records:
            return pd.DataFrame(columns=self._VOTE_COLUMNS)
        return pd.DataFrame(records)

    def make_check_set(self, vote_dataset: pd.DataFrame):
        """Return the set of ``(model, revision, username)`` tuples in *vote_dataset*."""
        return {
            (row.model, row.revision, row.username)
            for row in vote_dataset.itertuples(index=False, name='vote')
        }

    def get_model_revision(self, selected_model: str) -> str:
        """Fetch the revision for the given model from the request files.

        Falls back to ``"main"`` when no request file names the model or
        the matching file has no ``revision`` entry.
        """
        for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
            if user_folder.is_dir():
                for file in user_folder.glob("*.json"):
                    with open(file, "r", encoding="utf-8") as f:
                        data = json.load(f)
                        if data.get("model") == selected_model:
                            return data.get("revision", "main")
        return "main"

    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
        """Attach vote counts to the pending-models table.

        The table is returned unchanged when it is empty or has no
        ``model_name`` column. Otherwise votes are re-read from disk,
        counted per (model, revision) and left-joined onto the table;
        models without votes get 0. Rows are sorted by vote count
        (descending), then model name, and the ``model_name`` and
        ``model`` columns are dropped.
        """
        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
            return pending_models_df
        self.vote_dataset = self.read_vote_dataset()
        vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')

        pending_models_df_votes = pd.merge(
            pending_models_df,
            vote_counts,
            left_on=["model_name", 'revision'],
            right_on=['model', 'revision'],
            how='left'
        )
        # Filling empty votes
        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
        # Removing useless columns
        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
        return pending_models_df_votes

    # Function to be called when a user votes for a model
    def add_vote(
        self,
        selected_model: str,
        pending_models_df: gr.Dataframe,
        profile: gr.OAuthProfile | None
    ):
        """Record one vote by the logged-in user for *selected_model*.

        A warning is shown and ``None`` is returned when no model is
        selected, the user is not logged in, the user already voted for
        this model revision, or the vote could not be written. On success
        the vote is appended to the local file, queued for upload, and
        the refreshed pending-models table is returned.
        """
        # model_name, revision, user_id, timestamp
        if selected_model in ["str", ""]:
            gr.Warning("No model selected")
            return

        if profile is None:
            gr.Warning("Hub Login required")
            return

        vote_username = profile.username
        model_revision = self.get_model_revision(selected_model)

        # Hashable tuple used to check whether this user already voted for the model
        check_tuple = (selected_model, model_revision, vote_username)
        if check_tuple in self.vote_check_set:
            gr.Warning("Already voted for this model")
            return

        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        vote_obj = {
            "model": selected_model,
            "revision": model_revision,
            "username": vote_username,
            "timestamp": current_time
        }

        # Append the vote to the JSONL file
        try:
            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
            with open(votes_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(vote_obj) + "\n")
            logger.info(f"Vote added locally: {vote_obj}")

            self.votes_to_upload.append(vote_obj)
        except Exception as e:
            logger.error(f"Failed to write vote to file: {e}")
            gr.Warning("Failed to record vote. Please try again")
            return

        self.vote_check_set.add(check_tuple)
        gr.Info(f"Voted for {selected_model}")

        return self.create_request_vote_df(pending_models_df)

    def upload_votes(self):
        """Upload the local votes file to the votes repository if votes are queued.

        The whole file is uploaded; the queue is cleared only after a
        successful upload. Failures are logged, not raised.
        """
        if self.votes_to_upload:
            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
            try:
                with open(votes_file, "rb") as f:
                    API.upload_file(
                        path_or_fileobj=f,
                        path_in_repo="votes_data.jsonl",
                        repo_id=self.repo_id,
                        repo_type="dataset",
                        commit_message="Updating votes_data.jsonl with new votes",
                    )
                logger.info("Votes uploaded to votes repository")
                self.votes_to_upload.clear()
            except Exception as e:
                logger.error(f"Failed to upload votes to repository: {e}")
|
146 |
-
|
147 |
-
def run_scheduler(vote_manager):
    """Run pending ``schedule`` jobs forever, polling once per second.

    This never returns. ``vote_manager`` is accepted but not used here.
    """
    poll_interval_seconds = 1
    while True:
        schedule.run_pending()
        time.sleep(poll_interval_seconds)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|