# Spaces: Sleeping -- hosting-page status banner captured along with this file; not part of the program.
import json
import os
import re
import sys
from typing import Dict, List

# Put the directory two levels above this file on the import search path.
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
# Directory that holds the JSON chat transcripts to convert.
DATA_DIR = "gpt_data_gen"

# Markers that wrap a code block in the converted text.
B_CODE = "[CODE_START_TOK]"
E_CODE = "[/CODE_END_TOK]"

# Markers that wrap an execution-result block in the converted text.
B_RESULT = "[RESULT_TOK]"
E_RESULT = "[/RESULT_TOK]"

# Delimiters around a user instruction and around the system prompt.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>", "<</SYS>>"

# Markers placed at the start of a user turn and at the end of an assistant turn.
BOS = "<s>"
EOS = "</s>"

# System prompt placed at the top of every converted conversation.
CODE_SYS_PROMPT_FOR_TRAIN = """
You are 'CodeLLama', an advanced Language Model assistant that can generate, execute, and evaluate code.
Respond to user queries by providing code-based solutions and insights.
"""
def msg_to_code_result_tok_temp(msg: List[Dict]) -> str:
    """Flatten a chat transcript into a single tagged training string.

    The output starts with ``BOS``, ``B_INST`` and the system prompt wrapped in
    ``B_SYS``/``E_SYS``. Each user turn is followed by ``E_INST``; every user
    turn after the first is additionally prefixed with ``BOS`` and ``B_INST``.
    Each assistant turn has its ```` ```python ```` fences replaced by
    ``B_CODE``/``E_CODE`` and its ```` ```RESULT ````/```` ```RESULTS ````
    fences replaced by ``B_RESULT``/``E_RESULT``, and is terminated by ``EOS``.
    Messages with role ``system`` (or any other role) contribute nothing.

    Args:
        msg: Chat messages, each a dict with a ``"role"`` string and, for
            user/assistant turns, a ``"content"`` string. The dicts are only
            read; they are not modified.

    Returns:
        The concatenated conversation text.

    Raises:
        KeyError: If a message lacks ``"role"``, or a user/assistant message
            lacks ``"content"``.
    """
    # Compiled once per call instead of once per assistant turn.
    code_pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
    result_pattern = re.compile(r"```RESULTS?\n(.*?)```", re.DOTALL)

    pieces = [f"{BOS}{B_INST} {B_SYS}\n{CODE_SYS_PROMPT_FOR_TRAIN}\n{E_SYS}\n\n"]
    first_user_turn = True

    for chat in msg:
        # Normalise once so every role is matched case-insensitively.
        role = chat["role"].lower()

        if role == "user":
            content = chat["content"]
            if first_user_turn:
                # The opening BOS/B_INST were already emitted with the system prompt.
                pieces.append(f"{content} {E_INST}")
                first_user_turn = False
            else:
                pieces.append(f"{BOS}{B_INST}{content} {E_INST}")
        elif role == "assistant":
            # Work on a local copy so the caller's message dict stays untouched.
            content = code_pattern.sub(
                lambda m: f"{B_CODE}\n{m.group(1)}{E_CODE}", chat["content"]
            )
            content = result_pattern.sub(
                lambda m: f"{B_RESULT}\n{m.group(1)}{E_RESULT}", content
            )
            pieces.append(f"{content}{EOS}")

    full_str = "".join(pieces)
    # Drop the stray "()" that can follow a closing "')" in the source data.
    full_str = full_str.replace("')()", "')")
    # Turn the absolute path into a relative one; this single pass over the
    # whole text also covers the assistant turns.
    full_str = full_str.replace("/home/seungyoun/llama_code_interpreter/", "./")
    return full_str
def json_to_code_result_tok_temp(json_file_name: str = "425.json") -> str:
    """Load a chat transcript from ``DATA_DIR`` and convert it to tagged text.

    Args:
        json_file_name: Name of the JSON file inside ``DATA_DIR``.

    Returns:
        The string produced by ``msg_to_code_result_tok_temp`` for the
        messages stored in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_rel_path = os.path.join(DATA_DIR, json_file_name)
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_rel_path, "r", encoding="utf-8") as json_file:
        msg = json.load(json_file)
    return msg_to_code_result_tok_temp(msg)
if __name__ == "__main__":
    # Script entry point: convert the default sample file and print the result.
    print(json_to_code_result_tok_temp())