|
import os, sys |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(__file__))) |
|
|
|
import json |
|
import re |
|
from typing import List, Dict |
|
|
|
# Relative directory that holds the chat JSON files read by this module.
DATA_DIR = "gpt_data_gen"

# Markers wrapping a code segment in the serialized conversation.
B_CODE, E_CODE = "[CODE_START_TOK]", "[/CODE_END_TOK]"

# Markers wrapping an execution-result segment.
B_RESULT, E_RESULT = "[RESULT_TOK]", "[/RESULT_TOK]"

# Instruction delimiters placed around user turns.
B_INST = "[INST]"
E_INST = "[/INST]"

# Delimiters placed around the system prompt.
B_SYS = "<<SYS>>"
E_SYS = "<</SYS>>"

# Markers that open a user turn and close an assistant turn.
BOS, EOS = "<s>", "</s>"

# System prompt embedded at the start of every serialized conversation.
CODE_SYS_PROMPT_FOR_TRAIN = """
You are 'CodeLLama', an advanced Language Model assistant that can generate, execute, and evaluate code.
Respond to user queries by providing code-based solutions and insights.
"""
|
|
|
|
|
# Fenced ```python blocks; DOTALL lets the captured body span several lines.
_CODE_FENCE_RE = re.compile(r"```python\n(.*?)```", re.DOTALL)
# Fenced ```RESULT / ```RESULTS blocks.
_RESULT_FENCE_RE = re.compile(r"```RESULTS?\n(.*?)```", re.DOTALL)
# Machine-specific absolute prefix that is rewritten to a relative "./".
_LOCAL_PROJECT_ROOT = "/home/seungyoun/llama_code_interpreter/"


def msg_to_code_result_tok_temp(msg: List[Dict]) -> str:
    """Serialize a chat message list into a single token-delimited string.

    The output starts with ``BOS``, ``B_INST`` and the fixed
    ``CODE_SYS_PROMPT_FOR_TRAIN`` wrapped in ``B_SYS``/``E_SYS``. Any
    ``system`` message in ``msg`` is ignored. The first user turn is appended
    followed by ``E_INST``; every later user turn is additionally prefixed
    with ``BOS`` and ``B_INST``. In assistant turns, fenced python blocks are
    rewritten to ``B_CODE``/``E_CODE`` and fenced RESULT(S) blocks to
    ``B_RESULT``/``E_RESULT``; each assistant turn is terminated with ``EOS``.

    Roles are matched case-insensitively; messages with any other role are
    skipped. The input dictionaries are never modified.

    Args:
        msg: Messages as dicts with a ``"role"`` key and, for user and
            assistant turns, a ``"content"`` string.

    Returns:
        The serialized conversation.

    Raises:
        KeyError: If a message lacks ``"role"``, or a user/assistant message
            lacks ``"content"``.
    """
    pieces = [f"{BOS}{B_INST} {B_SYS}\n{CODE_SYS_PROMPT_FOR_TRAIN}\n{E_SYS}\n\n"]
    seen_user_turn = False

    for chat in msg:
        role = chat["role"].lower()

        if role == "user":
            text = chat["content"]
            if seen_user_turn:
                pieces.append(f"{BOS}{B_INST}{text} {E_INST}")
            else:
                # The opening BOS/B_INST already sits in the header piece.
                pieces.append(f"{text} {E_INST}")
                seen_user_turn = True
        elif role == "assistant":
            # Work on a local copy so the caller's message is left untouched.
            text = chat["content"]
            # Callable replacements insert the constants verbatim, with no
            # template-escape processing of their contents.
            text = _CODE_FENCE_RE.sub(
                lambda m: f"{B_CODE}\n{m.group(1)}{E_CODE}", text
            )
            text = _RESULT_FENCE_RE.sub(
                lambda m: f"{B_RESULT}\n{m.group(1)}{E_RESULT}", text
            )
            pieces.append(f"{text}{EOS}")
        # "system" and unrecognized roles contribute nothing.

    full_str = "".join(pieces)
    # Drop a stray "()" directly after "')" -- presumably an artifact of the
    # generated data; TODO confirm against the data source.
    full_str = full_str.replace("')()", "')")
    # One pass over the whole string covers user and assistant content alike.
    return full_str.replace(_LOCAL_PROJECT_ROOT, "./")
|
|
|
|
|
def json_to_code_result_tok_temp(json_file_name: str = "425.json") -> str:
    """Load a chat JSON file from ``DATA_DIR`` and serialize it.

    Args:
        json_file_name: File name inside ``DATA_DIR``. The path is built with
            ``os.path.join`` and is therefore resolved against the current
            working directory.

    Returns:
        The string produced by ``msg_to_code_result_tok_temp`` for the
        message list stored in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    file_rel_path = os.path.join(DATA_DIR, json_file_name)

    # Decode explicitly as UTF-8; without it open() falls back to the
    # platform locale encoding, which can mis-decode non-ASCII content.
    with open(file_rel_path, "r", encoding="utf-8") as json_file:
        msg = json.load(json_file)

    return msg_to_code_result_tok_temp(msg)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serialize the default sample file and print it.
    rendered = json_to_code_result_tok_temp()
    print(rendered)
|
|