Track large files with Git LFS, and expand app to include a data explorer and more length-based visualizations. (707a231)
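Since the exported JSONL tables can grow large, the Git LFS tracking mentioned in the commit message is a one-time setup step. A minimal sketch, assuming the exports live under data/ (the glob pattern is an assumption, not taken from the commit):

git lfs track "data/*.jsonl"   # appends: data/*.jsonl filter=lfs diff=lfs merge=lfs -text
git add .gitattributes

The script below aggregates the per-model alpaca_eval results into the JSONL files the app reads.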
import os

import pandas as pd
# Path to the top-level directory containing per-model alpaca_eval results.
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"

# Restrict aggregation to the models that appear in the precomputed win-rate table.
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
relevant_models = df["model_name"].unique().tolist()
# Collect the judging annotations (weighted_alpaca_eval_gpt4_turbo annotator)
# for each relevant model.
df_response_judging = pd.DataFrame()
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    if model_name not in relevant_models:
        continue
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if os.path.isdir(model_dir):
        model_output_file = os.path.join(
            model_dir, "weighted_alpaca_eval_gpt4_turbo/annotations.json"
        )
        if os.path.exists(model_output_file):
            df_response_judging = pd.concat(
                [df_response_judging, pd.read_json(model_output_file)]
            )
# Collect the raw model responses for each relevant model.
df_responses = pd.DataFrame()
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    if model_name not in relevant_models:
        continue
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if os.path.isdir(model_dir):
        model_output_file = os.path.join(model_dir, "model_outputs.json")
        if os.path.exists(model_output_file):
            df_responses = pd.concat([df_responses, pd.read_json(model_output_file)])
# Drop bookkeeping columns that the app does not need.
df_responses = df_responses.drop(
    columns=[
        "all_generated_texts",
        "Unnamed: 0.1",
        "index",
        "Unnamed: 0",
        "scores",
        "all_results_idx_best",
        "original_output",
        "new_prompt",
    ]
)
# Export the combined tables as JSONL for the app.
df_response_judging.to_json(
    "data/df_response_judging.jsonl", lines=True, orient="records"
)
df_responses.to_json("data/df_responses.jsonl", lines=True, orient="records")
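The app can then load these exports directly. A minimal sketch of the consuming side, assuming the standard alpaca_eval column names ("output" for the response text, "generator" for the model name); both names are assumptions here, not taken from this commit:

import pandas as pd

# Load the exported responses and derive a simple character-length column
# for length-based visualizations.
df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
df_responses["output_chars"] = df_responses["output"].str.len()

# Per-model length summary, e.g. to feed a box plot in the data explorer.
print(df_responses.groupby("generator")["output_chars"].describe())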