# 0816ChatGPT / modules / train_func.py
import os
import logging
import traceback
import openai
import gradio as gr
import ujson as json
import commentjson
import openpyxl
import modules.presets as presets
from modules.utils import get_file_hash, count_token
from modules.presets import i18n

def excel_to_jsonl(filepath, preview=False):
    # `preview` is kept for signature compatibility but is unused here
    # Open the Excel workbook
    workbook = openpyxl.load_workbook(filepath)
    # Use the first (active) worksheet
    sheet = workbook.active
    # Collect every row as a tuple of cell values
    data = []
    for row in sheet.iter_rows(values_only=True):
        data.append(row)
    # The first row holds the column headers; build one dict per data row,
    # skipping rows that are entirely empty
    headers = data[0]
    jsonl = []
    for row in data[1:]:
        row_data = dict(zip(headers, row))
        if any(row_data.values()):
            jsonl.append(row_data)
    # Convert each row into the chat-format record used for fine-tuning.
    # The expected column headers are Chinese: "提问" (question), "答案" (answer),
    # and the optional "系统" (system prompt).
    formatted_jsonl = []
    for i in jsonl:
        # Use .get() so a present-but-empty cell doesn't produce a None message
        if i.get("提问") and i.get("答案"):
            if i.get("系统"):
                formatted_jsonl.append({
                    "messages": [
                        {"role": "system", "content": i["系统"]},
                        {"role": "user", "content": i["提问"]},
                        {"role": "assistant", "content": i["答案"]}
                    ]
                })
            else:
                formatted_jsonl.append({
                    "messages": [
                        {"role": "user", "content": i["提问"]},
                        {"role": "assistant", "content": i["答案"]}
                    ]
                })
        else:
            logging.warning(f"Skipping a row with no question or answer: {i}")
    return formatted_jsonl
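
# For reference, a minimal sheet this function accepts looks like the following
# (header row first; the "系统" column is optional, cell values are illustrative):
#
#   提问          | 答案 | 系统
#   What is 2+2?  | 4    | You are a math tutor.
#
# which becomes one JSONL record:
#   {"messages": [{"role": "system", "content": "You are a math tutor."},
#                 {"role": "user", "content": "What is 2+2?"},
#                 {"role": "assistant", "content": "4"}]}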

def jsonl_save_to_disk(jsonl, filepath):
    # Name the file after the hash of the source file so repeat conversions reuse it
    file_hash = get_file_hash(file_paths=[filepath])
    os.makedirs("files", exist_ok=True)
    save_path = f"files/{file_hash}.jsonl"
    # Write UTF-8 explicitly so Chinese content survives on every platform
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("\n".join([json.dumps(i, ensure_ascii=False) for i in jsonl]))
    return save_path

def estimate_cost(ds):
    # Concatenate every message in the dataset and count tokens once
    dialogues = []
    for record in ds:
        for message in record["messages"]:
            dialogues.append(message["content"])
    dialogues = "\n".join(dialogues)
    tokens = count_token(dialogues)
    return f"Token count is roughly {tokens}; estimated cost per epoch is about ${tokens / 1000 * 0.008}."

def handle_dataset_selection(file_src):
    logging.info(f"Loading dataset {file_src.name}...")
    if file_src.name.endswith(".jsonl"):
        with open(file_src.name, "r", encoding="utf-8") as f:
            ds = [json.loads(line) for line in f]
    else:
        ds = excel_to_jsonl(file_src.name)
    # Show the first record as a preview, unlock the upload button, and report the cost
    preview = ds[0]
    return preview, gr.update(interactive=True), estimate_cost(ds)

def upload_to_openai(file_src):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    dspath = file_src.name
    logging.info(f"Uploading dataset {dspath}...")
    # Excel files are converted to JSONL on disk first; OpenAI only accepts JSONL
    if dspath.endswith(".xlsx"):
        jsonl = excel_to_jsonl(dspath)
        dspath = jsonl_save_to_disk(jsonl, dspath)
    try:
        with open(dspath, "rb") as f:
            uploaded = openai.File.create(file=f, purpose="fine-tune")
        return uploaded.id, "Upload successful."
    except Exception as e:
        traceback.print_exc()
        return "", f"Upload failed: {e}"

def build_event_description(id, status, trained_tokens, name=i18n("暂时未知")):
    # Render the job info as Markdown ("暂时未知" is an i18n lookup key meaning
    # "unknown for now", kept verbatim so the translation lookup still works)
    return f"""
#### Fine-tuning job {id}

Model name: {name}

Status: {status}

Trained tokens so far: {trained_tokens}
"""

def start_training(file_id, suffix, epochs):
    openai.api_key = os.getenv("OPENAI_API_KEY")
    try:
        job = openai.FineTuningJob.create(
            training_file=file_id,
            model="gpt-3.5-turbo",
            suffix=suffix,
            hyperparameters={"n_epochs": epochs},
        )
        return build_event_description(job.id, job.status, job.trained_tokens)
    except Exception as e:
        traceback.print_exc()
        if "is not ready" in str(e):
            return "Training failed because the file is not ready yet. OpenAI needs a little time to process it; try again in a few minutes."
        return f"Training failed: {e}"

def get_training_status():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    # Describe the 10 most recent jobs, skipping any that were cancelled
    active_jobs = [
        build_event_description(job["id"], job["status"], job["trained_tokens"], job["fine_tuned_model"])
        for job in openai.FineTuningJob.list(limit=10)["data"]
        if job["status"] != "cancelled"
    ]
    return "\n\n".join(active_jobs), gr.update(interactive=len(active_jobs) > 0)

def handle_dataset_clear():
    return gr.update(value=None), gr.update(interactive=False)

def add_to_models():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    succeeded_jobs = [job for job in openai.FineTuningJob.list()["data"] if job["status"] == "succeeded"]
    extra_models = [job["fine_tuned_model"] for job in succeeded_jobs]
    # Make the fine-tuned models selectable in the current session
    for i in extra_models:
        if i not in presets.MODELS:
            presets.MODELS.append(i)
    # Persist them to config.json (commentjson preserves comments in the file)
    with open("config.json", "r", encoding="utf-8") as f:
        data = commentjson.load(f)
    if "extra_models" in data:
        for i in extra_models:
            if i not in data["extra_models"]:
                data["extra_models"].append(i)
    else:
        data["extra_models"] = extra_models
    with open("config.json", "w", encoding="utf-8") as f:
        commentjson.dump(data, f, indent=4)
    return gr.update(choices=presets.MODELS), f"Successfully added {len(succeeded_jobs)} model(s)."
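
# After this runs, config.json would contain something like the following
# (the model id is illustrative; OpenAI names fine-tunes "ft:<base>:<org>::<id>"):
#
#   {
#       "extra_models": ["ft:gpt-3.5-turbo-0613:my-org::abc123"]
#   }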

def cancel_all_jobs():
    openai.api_key = os.getenv("OPENAI_API_KEY")
    # Only pending/running jobs can be cancelled; finished jobs would raise an error
    jobs = [
        job for job in openai.FineTuningJob.list()["data"]
        if job["status"] not in ["cancelled", "succeeded", "failed"]
    ]
    for job in jobs:
        openai.FineTuningJob.cancel(job["id"])
    return f"Successfully cancelled {len(jobs)} training job(s)."