alpaca-eval-explorer / prep_data.py
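"""Aggregate AlpacaEval results into a single table of per-model statistics.

For every model under the AlpacaEval results directory, this script computes
the mean/std output length in words and tokens, joins those statistics with
length-controlled win rates, and writes the combined table to
data/model_win_rates.jsonl.
"""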
import os

import pandas as pd
import tiktoken
from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate
# Path to the top-level AlpacaEval results directory (one subdirectory per model).
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"

# Map each model name to a dataframe of its generated outputs.
model_dataframes_outputs = {}
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if os.path.isdir(model_dir):
        model_output_file = os.path.join(model_dir, "model_outputs.json")
        if os.path.exists(model_output_file):
            df = pd.read_json(model_output_file)
            df["model_name"] = model_name
            model_dataframes_outputs[model_name] = df
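# Note: each record in model_outputs.json is assumed to carry an "output"
# field with the model's generated response; that is the only field the
# length statistics below rely on.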
def get_num_words(text):
    """Approximates word count by splitting on whitespace."""
    return len(str(text).split())


# cl100k_base is the encoding used by the GPT-4 / GPT-3.5-turbo model family.
ENCODING = tiktoken.get_encoding("cl100k_base")


def get_num_tokens(text):
    """Uses tiktoken to get the number of tokens in the text."""
    return len(ENCODING.encode(str(text)))
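# Quick sanity check (the token count assumes each of these common English
# words maps to a single cl100k_base token, which holds here):
#   get_num_words("The quick brown fox")   -> 4
#   get_num_tokens("The quick brown fox")  -> 4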
# Per-model mean/std of output length, in words and in tokens.
model_name_to_num_words = {}
model_name_to_num_tokens = {}
for model_name, model_dataframe in model_dataframes_outputs.items():
    print(f"Computing length statistics for {model_name}")
    model_dataframe["output_num_words"] = model_dataframe["output"].apply(get_num_words)
    model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(get_num_tokens)
    model_name_to_num_words[model_name] = {
        "mean": int(model_dataframe["output_num_words"].mean()),
        "std": int(model_dataframe["output_num_words"].std()),
    }
    model_name_to_num_tokens[model_name] = {
        "mean": int(model_dataframe["output_num_tokens"].mean()),
        "std": int(model_dataframe["output_num_tokens"].std()),
    }

# Transpose so each dataframe is indexed by model name with "mean"/"std" columns.
num_words_df = pd.DataFrame(model_name_to_num_words).T
num_tokens_df = pd.DataFrame(model_name_to_num_tokens).T
# Compute length-controlled win rates from the GPT-4-turbo annotations.
model_name_to_win_rate = {}
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    print(f"Computing win rate for {model_name}")
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if os.path.isdir(model_dir):
        annotations_file = os.path.join(
            model_dir, "weighted_alpaca_eval_gpt4_turbo", "annotations.json"
        )
        if os.path.exists(annotations_file):
            annotations_df = pd.read_json(annotations_file)
            # Returns a dict of metrics, including "length_controlled_winrate".
            model_name_to_win_rate[model_name] = get_length_controlled_winrate(
                annotations_df
            )

win_rate_df = pd.DataFrame(model_name_to_win_rate).T
# Join length statistics with win rates, renaming the generic "mean"/"std"
# columns after each join so the two sets of statistics don't collide.
df = num_words_df.join(win_rate_df, how="inner")
df = df.rename(columns={"mean": "num_words_mean", "std": "num_words_std"})
df = df.join(num_tokens_df, how="inner")
df = df.rename(columns={"mean": "num_tokens_mean", "std": "num_tokens_std"})

df["model_name"] = df.index
# Keep only models with a length-controlled win rate above 25%.
df = df[df["length_controlled_winrate"] > 25]

# Make sure the output directory exists before writing.
os.makedirs("data", exist_ok=True)
df.to_json("data/model_win_rates.jsonl", orient="records", lines=True)
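# Optional sanity check (not part of the original pipeline): read the export
# back and inspect a few columns.
#   loaded = pd.read_json("data/model_win_rates.jsonl", orient="records", lines=True)
#   print(loaded[["model_name", "length_controlled_winrate", "num_words_mean"]].head())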