|
import os |
|
import random |
|
import re |
|
import uuid |
|
|
|
import pandas as pd |
|
from datasets import load_dataset |
|
from tqdm import tqdm |
|
|
|
|
|
def prepare_default_dataset_causal_language_modeling(path):
    """Download OpenAssistant/oasst2 and export instruction/answer pairs.

    The train and validation splits are concatenated and every assistant
    message is paired with the prompter message referenced by its
    ``parent_id``. Four parquet files with the columns ``instruction``,
    ``output``, ``id`` and ``parent_id`` are written into ``path``:

    * ``train_full.pq``: rows with ``rank == 0.0`` and ``lang == "en"``
    * ``train_full_allrank.pq``: rows with ``lang == "en"``
    * ``train_full_multilang.pq``: rows with ``rank == 0.0``
    * ``train_full_multilang_allrank.pq``: all rows

    Args:
        path: Directory prefix the parquet file names are joined onto.

    Returns:
        The ``rank == 0.0`` / ``lang == "en"`` subset with the columns
        ``instruction``, ``output``, ``id``, ``parent_id``, ``lang`` and
        ``rank``.

    Raises:
        KeyError: If an assistant message references a ``parent_id`` that is
            not the ``message_id`` of any prompter message.
    """
    ds = load_dataset("OpenAssistant/oasst2")
    df = pd.concat(
        [ds["train"].to_pandas(), ds["validation"].to_pandas()], axis=0
    ).reset_index(drop=True)

    df_assistant = df[df.role == "assistant"].copy()
    df_prompter = df[df.role == "prompter"].set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    # Resolve all parents with a single label lookup instead of a per-row
    # loop; the result is aligned with the assistant rows.
    # NOTE(review): assumes message_id values are unique - confirm.
    parents = df_prompter.loc[df_assistant["parent_id"].tolist()]
    df_assistant["instruction"] = parents["text"].to_numpy()
    # After this assignment parent_id refers to the prompter's own parent,
    # i.e. the message preceding the instruction.
    df_assistant["parent_id"] = parents["parent_id"].to_numpy()

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    is_rank_zero = df_assistant["rank"] == 0.0
    is_english = df_assistant["lang"] == "en"
    export_columns = ["instruction", "output", "id", "parent_id"]

    # Insertion order of the dict fixes the order in which files are written.
    exports = {
        "train_full.pq": df_assistant[is_rank_zero & is_english],
        "train_full_allrank.pq": df_assistant[is_english],
        "train_full_multilang.pq": df_assistant[is_rank_zero],
        "train_full_multilang_allrank.pq": df_assistant,
    }
    for filename, subset in exports.items():
        subset[export_columns].to_parquet(
            os.path.join(path, filename), index=False
        )

    return df_assistant[is_rank_zero & is_english]
|
|
|
|
|
def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
    """Load the ``train`` split of ``Intel/orca_dpo_pairs`` as a DataFrame."""
    orca_pairs = load_dataset("Intel/orca_dpo_pairs")
    train_split = orca_pairs["train"]
    return train_split.to_pandas()
|
|
|
|
|
def extract_anthropic_prompt(prompt_and_response):
    r"""Extract the anthropic prompt from a prompt and response pair.

    The prompt is everything up to and including the last occurrence of
    ``"\n\nAssistant:"``; the text after it is the final response.

    Args:
        prompt_and_response: Dialogue string containing the marker.

    Returns:
        The prefix of ``prompt_and_response`` that ends with the last
        ``"\n\nAssistant:"`` marker.

    Raises:
        AssertionError: If the marker does not occur in the string.
    """
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``; with -1 the slice below would silently
    # return a meaningless prefix.
    if search_term_idx == -1:
        raise AssertionError(
            f"Prompt and response does not contain '{search_term}'"
        )
    return prompt_and_response[: search_term_idx + len(search_term)]
|
|
|
|
|
def _parse_row(prompt_and_response):
    r"""Split a preference record into prompt, chosen and rejected response.

    The prompt is the part of the ``"chosen"`` text up to and including its
    last ``"\n\nAssistant:"`` marker. Both responses are obtained by cutting
    ``len(prompt)`` characters off the front of the respective text.

    Args:
        prompt_and_response: Mapping with string values under the keys
            ``"chosen"`` and ``"rejected"``.

    Returns:
        Tuple ``(prompt, chosen_response, rejected_response)``.

    Raises:
        AssertionError: If the ``"chosen"`` text does not contain the marker.
    """
    search_term = "\n\nAssistant:"
    chosen = prompt_and_response["chosen"]
    search_term_idx = chosen.rfind(search_term)
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O``.
    if search_term_idx == -1:
        raise AssertionError(
            f"Prompt and response does not contain '{search_term}'"
        )
    prompt = chosen[: search_term_idx + len(search_term)]

    chosen_response = chosen[len(prompt) :]
    # The rejected text is cut at the same offset; it is not checked here
    # that it actually starts with the same prompt.
    rejected_response = prompt_and_response["rejected"][len(prompt) :]

    return prompt, chosen_response, rejected_response
|
|
|
|
|
def _split_up_prompt(prompt):
    r"""Split a dialogue prompt into ``(human, assistant)`` text pairs.

    Human turns are the texts following ``"\n\nHuman:"`` up to the next
    ``"\n\nAssistant:"`` (or the end of the string); assistant turns are
    found the other way round. Each text is stripped of surrounding
    whitespace, so a trailing ``"\n\nAssistant:"`` yields an empty string.

    Args:
        prompt: Dialogue string made of alternating marker-prefixed turns.

    Returns:
        List of ``(human_text, assistant_text)`` tuples in dialogue order.

    Raises:
        AssertionError: If the number of human and assistant turns differs;
            the offending prompt is the exception argument.
    """
    # Non-capturing lookaheads: ``findall`` then returns the turn text
    # directly instead of (text, lookahead) tuples.
    human_texts = [
        text.strip()
        for text in re.findall(
            r"\n\nHuman:(.*?)(?=\n\nAssistant:|$)", prompt, flags=re.DOTALL
        )
    ]
    assistant_texts = [
        text.strip()
        for text in re.findall(
            r"\n\nAssistant:(.*?)(?=\n\nHuman:|$)", prompt, flags=re.DOTALL
        )
    ]

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped under ``python -O`` (zip below would silently truncate).
    if len(human_texts) != len(assistant_texts):
        raise AssertionError(prompt)
    return list(zip(human_texts, assistant_texts))
|
|
|
|
|
def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame:
    """
    Adapted from
    https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py

    Load ``Anthropic/hh-rlhf`` and flatten every dialogue into one row per
    human/assistant turn. Each row gets an ``id`` (UUID drawn from a
    generator seeded with 123, so ids are reproducible) and a ``parent_id``
    pointing at the previous turn of the same dialogue (``None`` for the
    first turn). The last turn of a dialogue carries the parsed chosen and
    rejected responses; for earlier turns both columns hold the assistant
    text taken from the prompt. Records whose rejected response is empty
    are skipped.

    Args:
        split: Split specifier passed to ``load_dataset``.

    Returns:
        DataFrame with the columns ``instruction``, ``id``, ``parent_id``,
        ``chosen_response`` and ``rejected_response``.
    """
    dataset = load_dataset("Anthropic/hh-rlhf", split=split)
    rnd = random.Random(123)

    columns = [
        "instruction",
        "output",
        "id",
        "parent_id",
        "chosen_response",
        "rejected_response",
    ]
    # Collect plain rows and build one DataFrame at the end; creating and
    # concatenating a DataFrame per dialogue is slow and fails when no
    # dialogue survives the filter below.
    rows = []
    for row in tqdm(dataset):
        prompt, chosen_response, rejected_response = _parse_row(row)
        if not rejected_response:
            # Skip records without a rejected answer.
            continue

        parent_uuid = None
        turns = []
        for human_text, assistant_text in _split_up_prompt(prompt):
            random_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4))
            turns.append(
                [human_text, assistant_text, random_uuid, parent_uuid, None, None]
            )
            parent_uuid = random_uuid

        # Only the final turn of a dialogue carries the preference pair.
        turns[-1][-2] = chosen_response
        turns[-1][-1] = rejected_response
        rows.extend(turns)

    df = pd.DataFrame(rows, columns=columns)

    # Earlier turns have no preference pair: use the assistant text from the
    # prompt for both columns.
    df["chosen_response"] = df["chosen_response"].fillna(df["output"])
    df["rejected_response"] = df["rejected_response"].fillna(df["output"])
    del df["output"]
    return df
|
|