#!/usr/bin/env python3
import os
import sys
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
from src.envs import QUEUE_REPO, RESULTS_REPO, API
from src.utils import my_snapshot_download
def find_json_files(json_path):
    """Recursively collect the paths of all .json files under json_path."""
    res = []
    for root, dirs, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                res.append(os.path.join(root, file))
    return res
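# Mirror the results and requests datasets from the Hub locally;
# max_workers only parallelises the snapshot download.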
my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
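# Index every per-model JSON file in both local mirrors.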
result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
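# Map each model name to its request metadata; the "likes" field is used
# below to restrict the plot to popular models.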
model_name_to_model_map = {}
for path in request_path_lst:
    with open(path, 'r') as f:
        data = json.load(f)
        model_name_to_model_map[data["model"]] = data
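# Parse every result file, keeping one headline value per
# (model, dataset, metric) and filtering out redundant metric variants.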
model_dataset_metric_to_result_map = {}
data_map = {}
for path in result_path_lst:
    with open(path, 'r') as f:
        data = json.load(f)
    model_name = data["config"]["model_name"]
    for dataset_name, results_dict in data["results"].items():
        for metric_name, value in results_dict.items():
            # Only consider aggregated metrics (names like "rougeL,none"),
            # skip stderr and F1 entries, and restrict to models with more
            # than 256 likes on the Hub.
            if ',' in metric_name and '_stderr' not in metric_name \
                    and 'f1' not in metric_name \
                    and model_name_to_model_map[model_name]["likes"] > 256:
                to_add = True
                # For SelfCheckGPT, keep only the "max" variant of the metric.
                if 'selfcheck' in dataset_name:
                    if 'max' not in metric_name:
                        to_add = False
                if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
                    to_add = False
                # For BERTScore, keep only the precision component.
                if 'bertscore' in metric_name:
                    if 'precision' not in metric_name:
                        to_add = False
                if 'correctness,' in metric_name or 'em,' in metric_name:
                    to_add = False
                # For ROUGE, keep only rougeL.
                if 'rouge' in metric_name:
                    if 'rougeL' not in metric_name:
                        to_add = False
                # For IFEval, keep only prompt-level strict accuracy.
                if 'ifeval' in dataset_name:
                    if 'prompt_level_strict_acc' not in metric_name:
                        to_add = False
                if 'squad' in dataset_name:
                    to_add = False
                if 'fever' in dataset_name:
                    to_add = False
                # ROUGE is reported on a 0-100 scale here; rescale to [0, 1]
                # to match the other metrics.
                if 'rouge' in metric_name:
                    value /= 100.0
                if to_add:
                    # Strip the aggregation suffix, e.g. "rougeL,none" -> "rougeL".
                    sanitised_metric_name = metric_name.split(',')[0]
                    model_dataset_metric_to_result_map[(model_name, dataset_name, sanitised_metric_name)] = value
                    if model_name not in data_map:
                        data_map[model_name] = {}
                    data_map[model_name][(dataset_name, sanitised_metric_name)] = value
                    print('model_name', model_name, 'dataset_name', dataset_name, 'metric_name', metric_name, 'value', value)
# Drop models with fewer than 8 recorded (dataset, metric) entries, so the
# heatmap is not dominated by mostly-empty rows.
model_name_lst = list(data_map.keys())
for m in model_name_lst:
    if len(data_map[m]) < 8:
        del data_map[m]
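# Assemble a models x (dataset, metric) matrix; rows are models, columns are
# (dataset, metric) pairs, and missing entries become NaN.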
df = pd.DataFrame.from_dict(data_map, orient='index')
o_df = df.copy(deep=True)
print(df)
# Check for NaN or infinite values and replace them
df.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace infinities with NaN
df.fillna(0, inplace=True) # Replace NaN with 0 (or use another imputation strategy)
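# The zeros introduced by fillna only feed the clustering step; cells that
# were NaN in o_df are masked out of the rendered heatmap below.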
# Optional per-column normalisation (left disabled in the original run):
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# df = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
sns.set_context("notebook", font_scale=1.0)
# fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
# Cluster with Ward linkage on Euclidean distance; cells that were NaN before
# imputation are masked out rather than drawn as 0.
fig = sns.clustermap(df, method='ward', metric='euclidean', cmap='coolwarm', figsize=(16, 12), annot=True, mask=o_df.isnull())
# Keep row labels horizontal and column labels vertical for readability.
plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
# Save the clustermap to file, creating the output directory if needed.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/clustermap.pdf')
fig.savefig('plots/clustermap.png')
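# Optional sketch (an assumption, not part of the original flow): `linkage`
# is imported above but never used; the row clustering could instead be
# precomputed and handed to clustermap explicitly, which makes the same
# dendrogram reusable elsewhere:
#
#   row_linkage = linkage(df.values, method='ward', metric='euclidean')
#   fig = sns.clustermap(df, row_linkage=row_linkage, method='ward',
#                        metric='euclidean', cmap='coolwarm', figsize=(16, 12))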