# H2OTest / llm_studio / app_utils / default_datasets.py
# Repository page header: elineve's picture - "Upload 301 files" - 07423df
# (raw / history / blame) - 5.07 kB
import os
import random
import re
import uuid
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
def prepare_default_dataset_causal_language_modeling(path):
    """Build the default OpenAssistant (oasst2) instruction dataset and save it.

    Loads the "OpenAssistant/oasst2" dataset, merges its train and validation
    splits, and pairs every assistant message with the prompter message it
    replies to. Four parquet files are written into ``path``:

    * ``train_full.pq`` -- rank 0.0 answers with ``lang == "en"``
    * ``train_full_allrank.pq`` -- all answers with ``lang == "en"``
    * ``train_full_multilang.pq`` -- rank 0.0 answers of every language
    * ``train_full_multilang_allrank.pq`` -- every answer

    Each file holds the columns ``instruction``, ``output``, ``id`` and
    ``parent_id``, where ``parent_id`` is the parent of the *prompter*
    message (not of the assistant message itself).

    Args:
        path: Directory the parquet files are written to.

    Returns:
        DataFrame with the rank 0.0 English answers and the columns
        ``instruction``, ``output``, ``id``, ``parent_id``, ``lang``, ``rank``.

    Raises:
        KeyError: If an assistant message points to a parent id that is not
            among the prompter messages.
    """
    ds = load_dataset("OpenAssistant/oasst2")
    df = pd.concat(
        [ds["train"].to_pandas(), ds["validation"].to_pandas()], axis=0
    ).reset_index(drop=True)

    df_assistant = df[df.role == "assistant"].copy()
    df_prompter = df[df.role == "prompter"].set_index("message_id")

    # Resolve every assistant message's parent in a single label-based
    # selection instead of one ``.loc`` lookup per row. The result is aligned
    # positionally with ``df_assistant``, hence the plain arrays below.
    parents = df_prompter.loc[df_assistant["parent_id"].to_numpy()]
    df_assistant["output"] = df_assistant["text"].to_numpy()
    df_assistant["instruction"] = parents["text"].to_numpy()
    df_assistant["parent_id"] = parents["parent_id"].to_numpy()

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    is_top_ranked = df_assistant["rank"] == 0.0
    is_english = df_assistant["lang"] == "en"
    export_columns = ["instruction", "output", "id", "parent_id"]

    # File name -> subset of answers stored in that file.
    exports = {
        "train_full.pq": df_assistant[is_top_ranked & is_english],
        "train_full_allrank.pq": df_assistant[is_english],
        "train_full_multilang.pq": df_assistant[is_top_ranked],
        "train_full_multilang_allrank.pq": df_assistant,
    }
    for file_name, subset in exports.items():
        subset[export_columns].to_parquet(
            os.path.join(path, file_name), index=False
        )

    return exports["train_full.pq"]
def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
    """Load the train split of "Intel/orca_dpo_pairs" as a pandas DataFrame."""
    orca_pairs = load_dataset("Intel/orca_dpo_pairs")
    return orca_pairs["train"].to_pandas()
def extract_anthropic_prompt(prompt_and_response):
    """Extract the anthropic prompt from a prompt and response pair.

    The prompt is everything up to and including the last occurrence of the
    assistant marker (two newlines followed by ``Assistant:``).

    Args:
        prompt_and_response: Full conversation text ending with a response.

    Returns:
        The leading part of ``prompt_and_response`` that ends with the last
        assistant marker.

    Raises:
        AssertionError: If the text does not contain the assistant marker.
    """
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    if search_term_idx == -1:
        # Raised explicitly rather than through ``assert`` so the check is
        # not stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            f"Prompt and response does not contain '{search_term}'"
        )
    return prompt_and_response[: search_term_idx + len(search_term)]
def _parse_row(prompt_and_response):
"""Extract the anthropic prompt from a prompt and response pair."""
search_term = "\n\nAssistant:"
search_term_idx = prompt_and_response["chosen"].rfind(search_term)
assert (
search_term_idx != -1
), f"Prompt and response does not contain '{search_term}'"
prompt = prompt_and_response["chosen"][: search_term_idx + len(search_term)]
chosen_response = prompt_and_response["chosen"][len(prompt) :]
rejected_response = prompt_and_response["rejected"][len(prompt) :]
return prompt, chosen_response, rejected_response
def _split_up_prompt(prompt):
human_texts = re.findall(
r"\n\nHuman:(.*?)(?=(\n\nAssistant:|$))", prompt, flags=re.DOTALL
)
assistant_texts = re.findall(
r"\n\nAssistant:(.*?)(?=(\n\nHuman:|$))", prompt, flags=re.DOTALL
)
human_texts = [text[0].strip() for text in human_texts]
assistant_texts = [text[0].strip() for text in assistant_texts]
assert len(human_texts) == len(assistant_texts), prompt
dialogue = list(zip(human_texts, assistant_texts))
return dialogue
def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame:
    """
    Build a preference DataFrame from the "Anthropic/hh-rlhf" dataset.

    Adapted from
    https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py

    Every dataset row is split into its conversation turns. Each turn becomes
    one output row whose ``parent_id`` is the ``id`` of the previous turn
    (``None`` for the first turn). For the last turn, ``chosen_response`` and
    ``rejected_response`` hold the two candidate answers; for earlier turns
    both hold the assistant text of that turn. Dataset rows with an empty
    rejected response are skipped.

    Args:
        split: Dataset split passed on to ``load_dataset``.

    Returns:
        DataFrame with the columns ``instruction``, ``id``, ``parent_id``,
        ``chosen_response`` and ``rejected_response``.
    """
    dataset = load_dataset("Anthropic/hh-rlhf", split=split)
    # Fixed seed: the generated ids are identical on every run.
    rnd = random.Random(123)

    columns = [
        "instruction",
        "output",
        "id",
        "parent_id",
        "chosen_response",
        "rejected_response",
    ]
    # Collect plain rows and build a single DataFrame at the end; creating
    # and concatenating one DataFrame per dataset row is needlessly slow and
    # fails outright when no row survives the filter below.
    records = []
    for row in tqdm(dataset):
        prompt, chosen_response, rejected_response = _parse_row(row)
        if not rejected_response:
            # remove rejected answers that are empty
            continue

        parent_uuid = None
        turns = []
        for human_text, assistant_text in _split_up_prompt(prompt):
            random_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4))
            turns.append(
                [human_text, assistant_text, random_uuid, parent_uuid, None, None]
            )
            parent_uuid = random_uuid
        # Only the final turn carries the preference pair.
        turns[-1][-2] = chosen_response
        turns[-1][-1] = rejected_response
        records.extend(turns)

    df = pd.DataFrame(records, columns=columns)
    # merge output into chosen and rejected response
    df["chosen_response"] = df["chosen_response"].fillna(df["output"])
    df["rejected_response"] = df["rejected_response"].fillna(df["output"])
    del df["output"]
    return df