# NOTE(review): the lines below are residue from a paste/file viewer, not
# Python — commented out so the module parses. Original text preserved:
# Spaces:
# Runtime error
# Runtime error
# File size: 2,297 Bytes
# 707a231 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import os
import json
import tiktoken
from alpaca_eval import utils, metrics, annotators, constants, analyze, plotting, main
from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate
import os
import pandas as pd
import json
# Define the path to the top-level directory holding one results
# sub-directory per model.
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"

# Win-rate table; only models listed here are processed below.
df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
# A set gives O(1) membership tests in the directory-scanning loops below
# (the value is only ever used with `in`).
relevant_models = set(df["model_name"].unique())
# Collect the GPT-4-turbo judge annotations for every relevant model.
# Frames are accumulated in a list and concatenated once at the end:
# calling pd.concat inside the loop recopies all prior rows each
# iteration (quadratic), which the single final concat avoids.
judging_frames = []
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    if model_name not in relevant_models:
        continue
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    model_output_file = os.path.join(
        model_dir, "weighted_alpaca_eval_gpt4_turbo/annotations.json"
    )
    # Some model directories may lack annotations; skip them silently.
    if os.path.exists(model_output_file):
        judging_frames.append(pd.read_json(model_output_file))
df_response_judging = (
    pd.concat(judging_frames) if judging_frames else pd.DataFrame()
)
# Collect the raw model outputs for every relevant model, mirroring the
# judge-annotation pass above. Frames are accumulated and concatenated
# once to avoid quadratic pd.concat-in-a-loop behavior.
response_frames = []
for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    if model_name not in relevant_models:
        continue
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    model_output_file = os.path.join(model_dir, "model_outputs.json")
    # Skip directories that have no model_outputs.json.
    if os.path.exists(model_output_file):
        response_frames.append(pd.read_json(model_output_file))
df_responses = pd.concat(response_frames) if response_frames else pd.DataFrame()
# Drop bookkeeping/intermediate columns that should not be exported.
# One drop(columns=...) call replaces eight chained .drop(..., axis=1)
# calls with identical semantics: a KeyError is still raised if any
# column is missing, surfacing schema drift immediately.
# (A leftover `breakpoint()` that halted the script here was removed.)
df_responses = df_responses.drop(
    columns=[
        "all_generated_texts",
        "Unnamed: 0.1",
        "index",
        "Unnamed: 0",
        "scores",
        "all_results_idx_best",
        "original_output",
        "new_prompt",
    ]
)
# Whitelist.
# Export the merged judge annotations and model responses as JSON Lines
# for downstream analysis. (A stray trailing `|` paste artifact that made
# the file a syntax error was removed here.)
df_response_judging.to_json(
    "data/df_response_judging.jsonl", lines=True, orient="records"
)
df_responses.to_json("data/df_responses.jsonl", lines=True, orient="records")