import json
import os

import numpy as np


class HumanEvalDataset:
    def __init__(self, root, sample_num=1, language="python", issft=False):
        """
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        """
        self.root = root
        with open(os.path.join(self.root, f"humaneval-{language}.jsonl")) as f:
            self.data = f.readlines()

        tmp = self.get_qa_only_data(self.data, issft)
        self.clean_data = []
        for i in range(len(tmp)):
            for j in range(sample_num):
                self.clean_data.append(tmp[i])
        self.stopwords = self.clean_data[0]["stopwords"]
        np.random.seed(1234)
        print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")

    def get_qa_only_data(self, data_json, sft=False):
        """
        data_json: the jsonl file of HumanEval
        sft: whether to use the SFT setting
        return: a list of dict, each dict contains the prompt, task_id and stopwords
        """
        ans = []
        for line in data_json:
            line = json.loads(line)
            prompt = line["prompt"].strip()
            if "prefix" in line:
                origin_prompt = line["prefix"]
            else:
                origin_prompt = line["prompt"]

            if sft:
                prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
            if "stop_tokens" in line:
                s = line["stop_tokens"]
            else:
                s = []
            ans.append(
                {
                    "prompt": prompt,
                    "task_id": line["task_id"],
                    "original_prompt": origin_prompt,
                    "stopwords": s,
                }
            )
        return ans

    def __len__(self):
        """
        return the number of samples in the dataset
        """
        return len(self.clean_data)

    def __getitem__(self, index):
        """
        return the sample at index
        """
        sample = self.clean_data[index]
        return sample
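

# A minimal usage sketch (not part of the original module): it assumes a local
# directory containing a "humaneval-python.jsonl" file; the path below is
# hypothetical and only illustrates how the dataset class is typically driven.
if __name__ == "__main__":
    dataset = HumanEvalDataset(
        root="data/humaneval",  # hypothetical path to the jsonl files
        sample_num=2,           # each prompt is duplicated sample_num times
        language="python",
        issft=False,
    )
    print(len(dataset))          # number of tasks * sample_num
    first = dataset[0]
    print(first["task_id"])      # task identifier from the jsonl record
    print(first["stopwords"])    # stop tokens used to truncate generations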