alKoGolik's picture
Upload 169 files
c87c295 verified
"""This file checks two things:
1. Is the LLMs codegen completed for each benchmark?
2. Warn the code that are not compilable (it could be some impl issues).
"""
from termcolor import colored
from evalplus.data import load_solutions
from evalplus.sanitize import syntax_check
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--samples", type=str, required=True)
parser.add_argument(
"--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
)
parser.add_argument("--nsample", type=int, default=1)
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
# List[Dict{"task_id", "solution"}]
solutions = load_solutions(args.samples)
if args.dataset == "humaneval":
from evalplus.data import get_human_eval_plus
dataset = get_human_eval_plus()
dataset_name = "HumanEval"
elif args.dataset == "mbpp":
from evalplus.data import get_mbpp_plus
dataset = get_mbpp_plus()
dataset_name = "Mbpp"
id2solutions = {}
for solution in solutions:
task_id = solution["task_id"]
if task_id not in id2solutions:
id2solutions[task_id] = []
if "solution" not in solution:
assert "completion" in solution, "solution or completion must exist!"
solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
id2solutions[task_id].append(solution)
nsample = max(args.nsample, max(len(v) for v in id2solutions.values()))
print(colored("==============================", "blue"))
print(colored(" ::: Checking completeness... ", "blue"))
print(colored(" ::::: All tasks complete? ", "blue"))
ndone = 0
task_ids = dataset.keys()
ntask = len(task_ids)
for task_id in task_ids:
if task_id not in id2solutions:
print(colored(f" ⚠️ {task_id} is missing!", "red"))
continue
nfiles = len(id2solutions[task_id])
if nfiles == nsample:
ndone += 1
continue
print(
colored(
f" ⚠️ {task_id} only has {nfiles} samples! But {nsample} are expected.",
"red",
)
)
if ntask != ndone:
ntbd = ntask - ndone
print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
else:
print(colored(f" ::::: All {ntask} tasks complete!", "green"))
print(colored("==============================", "blue"))
print(colored(" ::: Checking compilation... ", "blue"))
print(colored(" ::::: All code compilable? ", "blue"))
ncode = 0
nwrong = 0
for task_id in task_ids:
# task_id must exist
if task_id not in id2solutions:
continue
for solution in id2solutions[task_id]:
ncode += 1
code = solution["solution"]
dbg_identifier = solution["_identifier"]
if code.strip() == "":
print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
nwrong += 1
elif not syntax_check(code, args.verbose):
print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
nwrong += 1
if 0 != nwrong:
print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
else:
print(colored(f" ::::: All {ncode} code are compilable!", "green"))