Spaces:

alKoGolik
/

codellama-CodeLlama-7b-hf

Running

App Files Files Community

codellama-CodeLlama-7b-hf / Llama2-Code-Interpreter-main /OpenCodeInterpreter /evaluation /evalplus /tools /checker.py

alKoGolik

Upload 169 files

c87c295 verified 12 months ago

raw

history blame contribute delete

3.51 kB

	"""This file checks two things:
	1. Is the LLM's code generation complete for each benchmark task?
	2. Warn about code that is not compilable (this could be due to implementation issues).
	"""

	from termcolor import colored

	from evalplus.data import load_solutions
	from evalplus.sanitize import syntax_check

	if __name__ == "__main__":
	    # Command-line entry point. Loads generated samples, then reports
	    # (1) which benchmark tasks are missing or have too few samples and
	    # (2) which samples are empty or fail the syntax check.
	    import argparse

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--samples", type=str, required=True)
	    parser.add_argument(
	        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
	    )
	    parser.add_argument("--nsample", type=int, default=1)
	    parser.add_argument("--verbose", action="store_true")

	    args = parser.parse_args()

	    # List[Dict{"task_id", "solution"}]
	    solutions = load_solutions(args.samples)

	    # argparse `choices` guarantees that exactly one branch is taken.
	    if args.dataset == "humaneval":
	        from evalplus.data import get_human_eval_plus

	        dataset = get_human_eval_plus()
	        dataset_name = "HumanEval"
	    elif args.dataset == "mbpp":
	        from evalplus.data import get_mbpp_plus

	        dataset = get_mbpp_plus()
	        dataset_name = "Mbpp"

	    # Group the samples by task id. A sample that only carries a
	    # "completion" is turned into a full "solution" by prepending the
	    # task prompt.
	    id2solutions = {}
	    for solution in solutions:
	        task_id = solution["task_id"]
	        if "solution" not in solution:
	            # Raise explicitly instead of using `assert`, which is stripped
	            # under `python -O` and would surface as an opaque KeyError.
	            if "completion" not in solution:
	                raise ValueError(
	                    f"{task_id}: solution or completion must exist!"
	                )
	            solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
	        id2solutions.setdefault(task_id, []).append(solution)

	    # Expected sample count per task: the requested --nsample, or the
	    # largest count actually observed if that is bigger. `default=0` keeps
	    # an empty samples input from crashing with "max() arg is an empty
	    # sequence"; every task is then simply reported as missing below.
	    nsample = max(
	        args.nsample,
	        max((len(v) for v in id2solutions.values()), default=0),
	    )
	    print(colored("==============================", "blue"))
	    print(colored(" ::: Checking completeness... ", "blue"))
	    print(colored(" ::::: All tasks complete? ", "blue"))
	    ndone = 0

	    task_ids = dataset.keys()
	    ntask = len(task_ids)
	    for task_id in task_ids:
	        if task_id not in id2solutions:
	            print(colored(f" ⚠️ {task_id} is missing!", "red"))
	            continue
	        nfiles = len(id2solutions[task_id])
	        if nfiles == nsample:
	            ndone += 1
	            continue

	        print(
	            colored(
	                f" ⚠️ {task_id} only has {nfiles} samples! But {nsample} are expected.",
	                "red",
	            )
	        )

	    if ntask != ndone:
	        ntbd = ntask - ndone
	        print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
	    else:
	        print(colored(f" ::::: All {ntask} tasks complete!", "green"))

	    print(colored("==============================", "blue"))
	    print(colored(" ::: Checking compilation... ", "blue"))
	    print(colored(" ::::: All code compilable? ", "blue"))
	    ncode = 0
	    nwrong = 0
	    for task_id in task_ids:
	        # Tasks without any sample were already reported as missing above.
	        if task_id not in id2solutions:
	            continue

	        for solution in id2solutions[task_id]:
	            ncode += 1
	            code = solution["solution"]
	            # "_identifier" is only used in messages; fall back to the task
	            # id if it is absent (presumably the loader sets it -- verify
	            # against load_solutions).
	            dbg_identifier = solution.get("_identifier", task_id)
	            if code.strip() == "":
	                print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
	                nwrong += 1
	            elif not syntax_check(code, args.verbose):
	                print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
	                nwrong += 1
	    if nwrong:
	        print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
	    else:
	        print(colored(f" ::::: All {ncode} code are compilable!", "green"))