import json
import os
from argparse import ArgumentParser

import torch
from human_eval.evaluation import evaluate_functional_correctness
from tqdm import tqdm
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code
from vllm import LLM, SamplingParams


class HumanEval:
    """
    HumanEval evaluation harness: generates completions with vLLM and scores
    them for functional correctness with the human_eval checker.
    """

    def __init__(
        self,
        data_root,
        language="python",
        log_dir=None,
        issft=False,
        inference_increment=True,
        n_sample=1,
        k_sample=1,
    ):
        self.data_root = data_root
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.inference_increment = inference_increment
        os.makedirs(self.log_dir, exist_ok=True)

    @torch.no_grad()
    def eval_model(self, args):
        """
        Generate a completion for every HumanEval problem and score the results.
        """
        assert (
            self.log_dir is not None
        ), "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(
            self.data_root,
            sample_num=self.n_sample,
            language=self.language,
            issft=self.sft,
        )
        model_name_or_path = args.model_path
        print("model", model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        print(
            "load tokenizer {} from {} over.".format(
                tokenizer.__class__, model_name_or_path
            )
        )

        llm = LLM(
            model=model_name_or_path,
            tensor_parallel_size=1,
            max_model_len=4096,
            trust_remote_code=True,
            enforce_eager=True,
        )
        # temperature=0 means greedy decoding, so top_p has no practical effect.
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=1024,
            top_p=0.95,
            stop_token_ids=[tokenizer.eos_token_id],
        )

        # Generate all completions in a single batched vLLM call.
        messages_list = [dataset[i]["prompt"].strip() for i in range(len(dataset))]
        outputs = llm.generate(messages_list, sampling_params=sampling_params)
        assert len(dataset) == len(outputs), "dataset and outputs have different lengths."

        log_file = os.path.join(self.log_dir, f"{self.language}.json")
        with open(log_file, "w") as tmpfile:
            for i, output in enumerate(tqdm(outputs)):
                data = dataset[i]
                output = cleanup_code(
                    output.outputs[0].text,
                    self.language,
                    "humaneval",
                    self.sft,
                    dataset.stopwords,
                )
                if not self.sft:
                    # Base-model completions continue the prompt, so prepend the
                    # original prompt to form the full program for evaluation.
                    suffixprediction = data["original_prompt"] + "\n" + output
                else:
                    # SFT outputs are assumed to already contain the complete solution.
                    suffixprediction = output
                res = {
                    "task_id": data["task_id"],
                    "generation": suffixprediction,
                    "prompt": data["original_prompt"],
                }
                tmpfile.write(json.dumps(res) + "\n")

        self._calculate_final_score(log_file)

    def _calculate_final_score(self, logfilepath):
        """
        Run the functional-correctness checker and print the pass@k score.
        """
        res = evaluate_functional_correctness(
            input_file=logfilepath,
            problem_file=os.path.join(
                self.data_root, f"humaneval-{self.language}.jsonl"
            ),
            tmp_dir=self.log_dir,
            language=self.language,
        )
        print("score is", res["pass@%d" % self.k])
        os.remove(logfilepath)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--logdir", type=str, default="")
    parser.add_argument(
        "--model_path",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/qwen2-7b",
    )
    parser.add_argument("--language", type=str, default="python")
    parser.add_argument(
        "--dataroot",
        type=str,
        default="HumanEval/data",
    )
    args = parser.parse_args()

    logdir = args.logdir
    language = args.language
    if logdir == "":
        logdir = "output/tmp/"

    evaluator = HumanEval(
        data_root=args.dataroot,
        log_dir=logdir,
        n_sample=1,
        language=language,
    )
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    evaluator.eval_model(args)
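
# Example invocation (a sketch; the script filename below is a placeholder and
# the paths should point at your own model checkpoint and data checkout):
#
#     python eval_humaneval.py \
#         --model_path /path/to/model \
#         --dataroot HumanEval/data \
#         --logdir output/humaneval \
#         --language python
#
# With n_sample=1 and temperature=0, the printed score is greedy pass@1.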