import json
import os
from argparse import ArgumentParser

import torch
from human_eval.evaluation import evaluate_functional_correctness
from tqdm import tqdm
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code
from vllm import LLM, SamplingParams
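

# Pipeline: load the HumanEval prompts, generate completions with vLLM,
# clean up the generated code, and score pass@k with the human_eval package.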
class HumanEval:
"""
HumanEval evaluation class.
"""
def __init__(
self,
data_root,
language="python",
log_dir=None,
issft=False,
inference_increment=True,
n_sample=1,
k_sample=1,
):
self.data_root = data_root
self.k = k_sample
self.n_sample = n_sample
self.language = language
self.log_dir = log_dir
self.sft = issft
self.inference_increment = inference_increment
        os.makedirs(self.log_dir, exist_ok=True)

    @torch.no_grad()
def eval_model(self, args):
"""
Evaluate the model on HumanEval.
"""
assert (
self.log_dir is not None
), "log_dir should not be None when evaluating humaneval"
dataset = HumanEvalDataset(
self.data_root,
sample_num=self.n_sample,
language=self.language,
issft=self.sft,
)
model_name_or_path = args.model_path
print("model", model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        print(
            "Loaded tokenizer {} from {}.".format(
                tokenizer.__class__, model_name_or_path
            )
        )
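        # Build the vLLM engine: single GPU, 4096-token context, eager execution
        # (enforce_eager disables CUDA graph capture).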
llm = LLM(
model=model_name_or_path,
tensor_parallel_size=1,
max_model_len=4096,
trust_remote_code=True,
enforce_eager=True,
)
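        # Greedy decoding: temperature 0, up to 1024 new tokens, stop at EOS.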
sampling_params = SamplingParams(
temperature=0,
max_tokens=1024,
top_p=0.95,
stop_token_ids=[tokenizer.eos_token_id],
)
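        # Gather all prompts and generate completions in one batched call.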
messages_list = []
for i in range(len(dataset)):
data = dataset[i]
prompt = data["prompt"].strip()
messages_list.append(prompt)
outputs = llm.generate(messages_list, sampling_params=sampling_params)
        assert len(dataset) == len(outputs), "dataset and outputs have different lengths"
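        # Write one JSON record per problem; the file is scored and then deleted below.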
log_file = os.path.join(self.log_dir, f"{self.language}.json")
tmpfile = open(log_file, "w")
for i, output in enumerate(tqdm(outputs)):
data = dataset[i]
output = output.outputs[0].text
output = cleanup_code(
output,
self.language,
"humaneval",
self.sft,
dataset.stopwords,
)
            # In SFT mode the completion already stands alone; otherwise prepend the original prompt.
            if self.sft:
                suffixprediction = output
            else:
                suffixprediction = data["original_prompt"] + "\n" + output
            res = {
                "task_id": data["task_id"],
                "generation": suffixprediction,
                "prompt": data["original_prompt"],
            }
            tmpfile.write(json.dumps(res) + "\n")
tmpfile.close()
# calculate the final score of pass@k
self._calculate_final_score(log_file)
return

    def _calculate_final_score(self, logfilepath):
"""
Calculate the final score.
"""
res = evaluate_functional_correctness(
input_file=logfilepath,
problem_file=os.path.join(
self.data_root, f"humaneval-{self.language}.jsonl"
),
tmp_dir=self.log_dir,
language=self.language,
)
print("score is", res["pass@%d" % self.k])
os.remove(logfilepath)
return


if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--logdir", type=str, default="")
parser.add_argument(
"--model_path",
type=str,
help="model name or path",
default="/data0/pretrained-models/qwen2-7b",
)
parser.add_argument("--language", type=str, default="python")
parser.add_argument(
"--dataroot",
type=str,
default="HumanEval/data",
)
args = parser.parse_args()
logdir = args.logdir
language = args.language
if logdir == "":
logdir = "output/tmp/"
evaluator = HumanEval(
data_root=args.dataroot,
log_dir=logdir,
n_sample=1,
language=language,
)
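    # Disable tokenizer parallelism to avoid fork-related warnings/deadlocks
    # in the evaluation subprocesses.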
os.environ["TOKENIZERS_PARALLELISM"] = "false"
evaluator.eval_model(args)
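
# Example invocation (script name and model path are illustrative):
#   python eval_humaneval.py --model_path /path/to/model \
#       --dataroot HumanEval/data --language python --logdir output/tmp/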