import json
import os
from typing import Any, Dict

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, metadata_load

from .dataset_handler import VIDORE_2_DATASETS_KEYWORDS, VIDORE_DATASETS_KEYWORDS, get_datasets_nickname

BLOCKLIST = ["impactframes"]

class ModelHandler:
    def __init__(self, model_infos_path="model_infos.json"):
        self.api = HfApi()
        self.model_infos_path = model_infos_path
        self.model_infos = self._load_model_infos()

    def _load_model_infos(self) -> Dict:
        # Load cached model infos from disk if available; start fresh otherwise.
        if os.path.exists(self.model_infos_path):
            with open(self.model_infos_path) as f:
                return json.load(f)
        return {}

    def _save_model_infos(self):
        with open(self.model_infos_path, "w") as f:
            json.dump(self.model_infos, f)

    def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
        # New-format result files wrap the scores in a top-level
        # {"metadata": ..., "metrics": ...} structure.
        return "metadata" in results and "metrics" in results

    def _is_baseline_repo(self, repo_id: str) -> bool:
        return repo_id == "vidore/baseline-results"
    def sanitize_model_name(self, model_name):
        # Make the model name safe to use as a dictionary key and file stem:
        # "/" and "." would otherwise clash with path and extension handling.
        return model_name.replace("/", "_").replace(".", "-thisisapoint-")

    def fuze_model_infos(self, model_name, results):
        # Merge new results into the existing entry, keeping the already
        # stored metrics whenever a dataset is present in both.
        for dataset, metrics in results.items():
            if dataset not in self.model_infos[model_name]["results"]:
                self.model_infos[model_name]["results"][dataset] = metrics
    def get_vidore_data(self, metric="ndcg_at_5"):
        models = self.api.list_models(filter="vidore")
        repositories = [model.modelId for model in models]  # type: ignore

        # Sort repositories to process non-baseline repos first
        # (to prioritize their results over the baseline ones).
        repositories.sort(key=lambda x: self._is_baseline_repo(x))

        for repo_id in repositories:
            org_name = repo_id.split("/")[0]
            if org_name in BLOCKLIST:
                continue

            files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
            if len(files) == 0:
                continue

            for file in files:
                if file.endswith("results.json"):
                    model_name = repo_id.replace("/", "_").replace(".", "-thisisapoint-")
                else:
                    model_name = file.split("_metrics.json")[0]
                    model_name = model_name.replace("/", "_").replace(".", "-thisisapoint-")

                readme_path = hf_hub_download(repo_id, filename="README.md")
                meta = metadata_load(readme_path)
                try:
                    result_path = hf_hub_download(repo_id, filename=file)
                    with open(result_path) as f:
                        results = json.load(f)

                    if self._are_results_in_new_vidore_format(results):
                        metadata = results["metadata"]
                        results = results["metrics"]

                    # Handles the case where the model is both in baseline and
                    # outside of it: merge instead of overwriting, so the
                    # non-baseline results (processed first) take priority.
                    if self._is_baseline_repo(repo_id) and self.sanitize_model_name(model_name) in self.model_infos:
                        self.fuze_model_infos(model_name, results)
                    else:
                        self.model_infos[model_name] = {"meta": meta, "results": results}
                except Exception as e:
                    print(f"Error loading {model_name} - {e}")
                    continue
    def filter_models_by_benchmark(self, benchmark_version=1):
        # Keep only the models that report results on at least one dataset
        # belonging to the requested benchmark version.
        filtered_model_infos = {}
        keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
        for model, info in self.model_infos.items():
            results = info["results"]
            if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
                filtered_model_infos[model] = info
        return filtered_model_infos
    def compute_averages(self, metric="ndcg_at_5", benchmark_version=1):
        # Collect the requested metric for each (model, dataset) pair of the
        # benchmark and return it as a models-by-datasets DataFrame.
        model_res = {}
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if len(filtered_model_infos) == 0:
            return pd.DataFrame()

        keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS
        for model, info in filtered_model_infos.items():
            res = info["results"]
            dataset_res = {}
            for dataset in res.keys():
                if not any(keyword in dataset for keyword in keywords):
                    continue
                dataset_nickname = get_datasets_nickname(dataset)
                dataset_res[dataset_nickname] = res[dataset][metric]
            model_res[model] = dataset_res

        return pd.DataFrame(model_res).T
    @staticmethod
    def add_rank(df, benchmark_version=1):
        df.fillna(0.0, inplace=True)
        cols_to_rank = [
            col
            for col in df.columns
            if col
            not in [
                "Model",
                "Model Size (Million Parameters)",
                "Memory Usage (GB, fp32)",
                "Embedding Dimensions",
                "Max Tokens",
            ]
        ]
        if len(cols_to_rank) == 1:
            df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
        else:
            df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
            df.sort_values("Average", ascending=False, inplace=True)
        df.insert(0, "Rank", list(range(1, len(df) + 1)))

        # Multiply values by 100 if they are floats and round to 1 decimal place.
        for col in df.columns:
            if df[col].dtype == "float64":
                df[col] = df[col].apply(lambda x: round(x * 100, 1))
        return df
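

# Usage sketch (hypothetical, not part of the original file): fetch results
# from the Hub, build the per-dataset score table, then add Average and Rank
# columns. Assumes network access to the Hugging Face Hub, that the module is
# imported as part of its package (for the relative dataset_handler import),
# and that model_infos.json is writable in the working directory.
if __name__ == "__main__":
    handler = ModelHandler()
    handler.get_vidore_data()
    df = handler.compute_averages(metric="ndcg_at_5", benchmark_version=1)
    df = ModelHandler.add_rank(df)
    print(df.head())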