Spaces:

opencompass
/

opencompass-bigcodebench-evaluator

Running

App Files Files Community

dongsheng commited on 10 days ago

Commit

2a5a31f

verified ·

1 Parent(s): 42cafa3

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +5 -0
README.md +13 -8
app.py +296 -0
pyproject.toml +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,5 @@

+FROM bigcodebench/bigcodebench-gradio:latest
+COPY . /app
+EXPOSE 7860
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,12 +1,17 @@
 ---
-title: Opencompass Bigcodebench Evaluator
-emoji: 🏆
-colorFrom: red
-colorTo: gray
-sdk: gradio
-sdk_version: 5.20.0
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: BigCodeBench Evaluator
+emoji: 🥇
+colorFrom: green
+colorTo: indigo
+sdk: docker
 app_file: app.py
+disable_embedding: true
 pinned: false
+license: apache-2.0
+tags:
+- leaderboard
+- eval:code
+- test:public
+- judge:auto
 ---
+Paper:arxiv.org/abs/2406.15877

app.py ADDED Viewed

	@@ -0,0 +1,296 @@

+import gradio as gr
+import json
+import logging
+import multiprocessing
+import os
+import pickle
+import threading
+import time
+from collections import Counter, defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
+from datetime import datetime
+from typing import Any, Dict, List, Tuple
+from warnings import warn
+import gc
+import numpy as np
+from huggingface_hub import HfApi
+from bigcodebench.data import get_bigcodebench, get_bigcodebench_hash, load_solutions
+from bigcodebench.data.utils import CACHE_DIR
+from bigcodebench.eval import PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check
+from bigcodebench.gen.util import trusted_check
+from apscheduler.schedulers.background import BackgroundScheduler
+REPO_ID = "bigcode/bigcodebench-evaluator"
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+API = HfApi(token=HF_TOKEN)
+Result = Tuple[str, List[bool]]
+def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit):
+    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
+    if os.path.exists(cache_file):
+        with open(cache_file, "rb") as f:
+            return pickle.load(f)
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    tbegin = time.time()
+    with ProcessPoolExecutor(max_workers=n_workers) as executor:
+        futures = []
+        n_samples = 0
+        expected_time = dict()
+        for problem in problems.values():
+            args = (
+                problem["complete_prompt"] + "\n" + problem["canonical_solution"],
+                problem["test"],
+                problem["task_id"],
+                max_as_limit,
+                max_data_limit,
+                max_stack_limit,
+                min_time_limit,
+            )
+            futures.append(executor.submit(trusted_check, *args))
+            n_samples += 1
+        for future in as_completed(futures):
+            result = future.result()
+            expected_time[result["task_id"]] = result["time"]
+    if any(expected_time.values()):
+        with open(cache_file, "wb") as f:
+            pickle.dump(expected_time, f)
+    return expected_time
+def check_correctness(
+    completion_id: int,
+    problem: Dict[str, Any],
+    solution: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    identifier=None,
+    min_time_limit: float = 0.1,
+    gt_time_limit: float = 2.0,
+) -> Dict[str, Result]:
+    ret = {
+        "completion_id": completion_id,
+        "task_id": problem["task_id"],
+        "_identifier": identifier,
+        "solution": solution,
+    }
+    ret["base"] = untrusted_check(
+        solution,
+        problem["test"],
+        problem["entry_point"],
+        max_as_limit,
+        max_data_limit,
+        max_stack_limit,
+        min_time_limit,
+        gt_time_limit,
+    )
+    return ret
+def evaluate(
+    split: str,
+    subset: str,
+    samples: str,
+    pass_k: str="1,5,10",
+    parallel: int = -1,
+    min_time_limit: float = 1,
+    max_as_limit: int = 30 * 1024,
+    max_data_limit: int = 30 * 1024,
+    max_stack_limit: int = 10,
+    calibrated: bool = True,
+    check_gt_only: bool = False,
+    no_gt: bool = False,
+    selective_evaluate: str = "",
+):
+    passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]
+    if parallel < 1:
+        n_workers = max(1, multiprocessing.cpu_count() // 2)
+    else:
+        n_workers = parallel
+    if check_gt_only:
+        samples = "__dummy__.jsonl"
+    extra = subset + "_" if subset != "full" else ""
+    problems = get_bigcodebench(subset=subset)
+    # Add selective evaluation logic
+    if selective_evaluate:
+        selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
+        problems = {k: v for k, v in problems.items() if k in selected_ids}
+        if not problems:
+            raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")
+    dataset_hash = get_bigcodebench_hash(subset=subset)
+    if not no_gt:
+        expected_time = get_groundtruth(n_workers, problems, dataset_hash, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit)
+    else:
+        expected_time = {task_id: None for task_id in problems}
+    gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
+    failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]
+    pass_at_k = dict()
+    results = {
+        "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+        "eval": {},
+    }
+    if not check_gt_only:
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id ->
+            remainings = set()
+            for sample in load_solutions(samples):
+                task_id = sample["task_id"]
+                if task_id not in problems:
+                    continue
+                solution = (
+                    sample["solution"]
+                    if "solution" in sample
+                    else problems[task_id]["complete_prompt"] + sample["completion"]
+                )
+                if calibrated:
+                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    problems[task_id],
+                    solution,
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time[task_id] if expected_time[task_id] else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1
+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            assert len(completion_id) == len(problems), "Missing problems in samples"
+            for future in as_completed(futures):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)
+                del future, result
+                gc.collect()
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )
+        # Calculate pass@k.
+        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
+        base_correct = []
+        for key, res in results["eval"].items():
+            if key not in problems:
+                continue
+            bc = sum([r["status"] == PASS for r in res])
+            base_correct.append(bc)
+        base_correct = np.array(base_correct)
+        pass_at_k.update({
+            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
+            for k in passk
+            if total.min() >= k
+        })
+        del problems, futures
+        gc.collect()
+    pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
+    pass_at_k["split"] = split
+    pass_at_k["subset"] = subset
+    pass_at_k["calibrated"] = calibrated
+    pass_at_k["gt_pass_rate"] = gt_pass_rate
+    pass_at_k["failed_tasks"] = failed_tasks
+    return results, pass_at_k
+# def run_gradio():
+interface = gr.Interface(
+    fn=evaluate,
+    inputs=[
+        gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
+        gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
+        gr.File(label="Samples Path (.jsonl)"),
+        gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
+        gr.Slider(-1, multiprocessing.cpu_count(), step=1, label="Parallel Workers", value=-1),
+        gr.Slider(0.1, 10, step=0.1, label="Min Time Limit", value=1),
+        gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
+        gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
+        gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
+        gr.Checkbox(label="Calibrated", value=True),
+        gr.Checkbox(label="Check GT Only"),
+        gr.Checkbox(label="No GT"),
+        gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
+    ],
+    outputs=[
+        gr.JSON(label="Results"),
+        gr.JSON(label="Eval Results"),
+    ],
+    # concurrency_limit=None
+)
+interface.queue(default_concurrency_limit=None)
+def preload_gt():
+    evaluate(split="complete", subset="full", samples="", check_gt_only=True)
+    evaluate(split="complete", subset="hard", samples="", check_gt_only=True)
+def restart_space():
+    logging.info(f"Restarting space with repo ID: {REPO_ID}")
+    try:
+        # Now restart the space
+        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+        logging.info("Space restarted successfully.")
+    except Exception as e:
+        logging.error(f"Failed to restart space: {e}")
+# if __name__ == "__main__":
+while True:
+    try:
+        preload_gt()
+        break
+    except:
+        continue
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", hours=1)  # Restart every 2hs
+scheduler.start()
+interface.launch(show_error=True)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,13 @@

+[tool.ruff]
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E501"] # line too long (black is taking care of this)
+line-length = 119
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+[tool.isort]
+profile = "black"
+line_length = 119
+[tool.black]
+line-length = 119