Text Generation
Safetensors
Chinese
English
conversational
Aurora / data /read_data.py
wangrongsheng's picture
add data
40d90bf
import json
from tqdm import tqdm
# Input: a JSON Lines file; each line is an object with a 'conversation' list
# whose items carry 'human' and 'assistant' fields.
jsonl_file_path = 'common_zh_70k.jsonl'
# Output: a single JSON array of instruction/input/output/history records.
output_file_path = './sharegpt-70k.json'


def conversation_to_record(conversation):
    """Convert one multi-turn conversation into a single training record.

    The last turn becomes the ``instruction`` / ``output`` pair and every
    earlier turn is kept, in order, as a ``[human, assistant]`` pair in
    ``history``. All values are coerced with ``str()``.

    Args:
        conversation: Sequence of dicts, each with ``'human'`` and
            ``'assistant'`` keys.

    Returns:
        A dict with keys ``instruction``, ``input`` (always ``""``),
        ``output`` and ``history`` (in that order), or ``None`` when the
        conversation has no turns.

    Raises:
        KeyError: If a turn lacks the ``'human'`` or ``'assistant'`` key.
    """
    if not conversation:
        # Nothing to convert; the caller skips empty conversations.
        return None

    *earlier_turns, final_turn = conversation
    history = [
        [str(turn['human']), str(turn['assistant'])]
        for turn in earlier_turns
    ]
    return {
        "instruction": str(final_turn['human']),
        "input": "",
        "output": str(final_turn['assistant']),
        "history": history,
    }


def records_from_lines(lines):
    """Parse JSON Lines text and convert every conversation to a record.

    Args:
        lines: Iterable of strings, one JSON object per line. Each object
            must contain a ``'conversation'`` key.

    Returns:
        List of records produced by ``conversation_to_record``; blank lines
        and empty conversations contribute nothing.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object has no ``'conversation'`` key.
    """
    records = []
    for line in lines:
        payload = line.strip()
        if not payload:
            # json.loads('') raises, so tolerate blank/whitespace-only lines
            # (e.g. a stray empty line at the end of the file).
            continue
        record = conversation_to_record(json.loads(payload)['conversation'])
        if record is not None:
            records.append(record)
    return records


def main():
    """Read ``jsonl_file_path``, convert it, and write ``output_file_path``.

    Returns:
        The list of records that was written to disk.
    """
    # Open the JSON Lines file and convert it line by line, with a progress bar.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        records = records_from_lines(tqdm(file))

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_file_path, 'w', encoding="utf-8") as f1:
        json.dump(records, f1, ensure_ascii=False, indent=4)
    return records


if __name__ == "__main__":
    results = main()