|
import json |
|
import os |
|
|
|
import numpy as np |
|
|
|
|
|
class HumanEvalDataset:
    """Prompt records loaded from a ``humaneval-<language>.jsonl`` file.

    Each JSON line is reduced to a dict with the keys ``prompt``,
    ``task_id``, ``original_prompt`` and ``stopwords``.  Every record is
    repeated ``sample_num`` times in ``clean_data`` so that several
    completions can be drawn for the same prompt.

    Attributes set by ``__init__``:
        root: the directory passed by the caller.
        data: the raw lines of the JSONL file, as returned by ``readlines``.
        clean_data: the repeated list of record dicts.
        stopwords: the ``stopwords`` list of the first record, or ``[]``
            when the file holds no records.
    """

    def __init__(self, root, sample_num=1, language="python", issft=False):
        """
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        """
        self.root = root
        dataset_path = os.path.join(self.root, f"humaneval-{language}.jsonl")
        # A context manager guarantees the handle is closed even if reading
        # fails; JSON text is UTF-8, so do not depend on the locale default.
        with open(dataset_path, encoding="utf-8") as handle:
            self.data = handle.readlines()

        records = self.get_qa_only_data(self.data, issft)
        # Consecutive repeats of one record are the *same* dict object, not
        # copies, exactly as when appending ``tmp[i]`` repeatedly.
        self.clean_data = [
            record for record in records for _ in range(sample_num)
        ]
        # Take the stop words from the first parsed record; fall back to an
        # empty list instead of raising IndexError on an empty dataset.
        self.stopwords = records[0]["stopwords"] if records else []
        np.random.seed(1234)
        print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")

    def get_qa_only_data(self, data_json, sft=False):
        """
        data_json: an iterable of JSON strings, one HumanEval record per item
        sft: whether to wrap the prompt in the SFT instruction template
        return: a list of dict, each dict contains the prompt, task_id,
            original_prompt and stopwords
        """
        ans = []
        for raw_line in data_json:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records and would make json.loads raise.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            prompt = record["prompt"].strip()
            # "prefix", when present, takes precedence over the unstripped
            # prompt as the original text.
            origin_prompt = record.get("prefix", record["prompt"])

            if sft:
                prompt = (
                    "Below is an instruction that describes a task, paired "
                    "with an input that provides further context.\n"
                    "Write a response that appropriately completes the "
                    "request.\n\n"
                    "### Instruction:\n"
                    "Write a program to perform the given task.\n\n"
                    f"Input:\n{prompt}\n\n"
                    "### Response:\n"
                )

            ans.append(
                {
                    "prompt": prompt,
                    "task_id": record["task_id"],
                    "original_prompt": origin_prompt,
                    "stopwords": record.get("stop_tokens", []),
                }
            )
        return ans

    def __len__(self):
        """
        return the number of samples in the dataset
        """
        return len(self.clean_data)

    def __getitem__(self, index):
        """
        return the sample at index
        """
        return self.clean_data[index]
|
|