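"""Evaluate an instruct-tuned code model on multilingual HumanEval.

Generates completions with a DeepSeek-Coder-style chat template, extracts the
code blocks, and scores functional correctness via human_eval."""
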
import argparse
import json
import os
from pathlib import Path

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from human_eval.evaluation import evaluate_functional_correctness
from utils.utils import extract_generation_code, languge_settings  # sic: spelling matches utils/utils.py

data_abs_dir = Path(__file__).parent / "data"


def build_deepseekcoder_instruction(language: str, question: str):
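    """Build the completion instruction for one HumanEval problem, embedding
    the prompt code in a fenced block tagged with the target language."""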
return """
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
```{}
{}
```
""".strip().format(
languge.lower(), question.strip()
)
def generate_one(example, lang, tokenizer, model):
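    """Generate a completion for a single example with greedy decoding and
    extract the code block from the model output."""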
    prompt = build_deepseekcoder_instruction(
        languge_settings[lang]["full_name"], example["prompt"]
    )
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"

    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        # top_p=0.95,
        # temperature=temperature,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )
    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example["output"] = output
    return extract_generation_code(example, lang_code=lang)


def generate_main(args):
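    """Load the model and tokenizer, generate completions for every problem
    in the language's HumanEval file, save them, and score functional
    correctness."""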
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print(
        "Loaded tokenizer {} from {}.".format(
            tokenizer.__class__, model_name_or_path
        )
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # use_flash_attention_2=True
    )
    model.eval()

    examples = [json.loads(x) for x in open(problem_file, encoding="utf-8") if x.strip()]
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, args.language, tokenizer, model)
        generated_examples.append(gen_example)
    print("Generation finished.")

    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
    print(
        "Saved {} generated examples to {}.".format(
            len(generated_examples), saved_path
        )
    )

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result, model_name_or_path)


def evaluation_only(args):
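    """Re-extract code from an existing generation file and score it,
    without running the model again."""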
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(
        args.output_path
    )
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    output_examples = [
        json.loads(x) for x in open(args.output_path, encoding="utf-8") if x.strip()
    ]
    processed_examples = [
        extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")
    ]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, "w", encoding="utf-8") as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + "\n")
    print(
        "Saved {} processed examples to {}.".format(
            len(processed_examples), processed_path
        )
    )

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/deepseek-coder-6.7b-instruct",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="output path of your generation",
        default="/home/qyhuang/DeepSeek-Coder/outputs/deepseek-chat.json",
    )
    parser.add_argument("--language", type=str, help="language", default="python")
    parser.add_argument(
        "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
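
# Example invocation (script and path names are illustrative; adjust to your setup):
#   python eval_instruct.py \
#       --model deepseek-ai/deepseek-coder-6.7b-instruct \
#       --output_path outputs/deepseek-coder-python.jsonl \
#       --language python \
#       --temp_dir tmp
# To re-score an existing generation file without rerunning the model, call
# evaluation_only(args) in place of generate_main(args).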