import json
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import List
import pandas as pd
from src.benchmarks import get_safe_name
from src.display.utils import (
COL_NAME_RERANKING_MODEL,
COL_NAME_RETRIEVAL_MODEL,
COL_NAME_RERANKING_MODEL_LINK,
COL_NAME_RETRIEVAL_MODEL_LINK,
COL_NAME_REVISION,
COL_NAME_TIMESTAMP,
COL_NAME_IS_ANONYMOUS,
COLS_QA,
QA_BENCHMARK_COLS,
COLS_LONG_DOC,
LONG_DOC_BENCHMARK_COLS,
COL_NAME_AVG,
COL_NAME_RANK
)
from src.display.formatting import make_clickable_model


def calculate_mean(row):
    """Average a row of benchmark scores; return 0 if any score is missing."""
    if pd.isna(row).any():
        return 0
    else:
        return row.mean()
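# Examples (illustrative): calculate_mean(pd.Series([0.4, 0.6])) -> 0.5, while
# calculate_mean(pd.Series([0.4, float("nan")])) -> 0, so models missing any
# benchmark sink to the bottom of the average-based ranking.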


@dataclass
class EvalResult:
"""
Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
domains, languages, and datasets
"""
eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
retrieval_model: str
reranking_model: str
results: list # results on all the benchmarks stored as dict
task: str
metric: str
timestamp: str = "" # submission timestamp
revision: str = ""
is_anonymous: bool = False


@dataclass
class FullEvalResult:
"""
Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
"""
eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
retrieval_model: str
reranking_model: str
retrieval_model_link: str
reranking_model_link: str
results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
timestamp: str = ""
revision: str = ""
is_anonymous: bool = False
@classmethod
def init_from_json_file(cls, json_filepath):
"""
        Initialize a FullEvalResult from the results JSON file of a single model.
        The JSON file is only written once the evaluation status is FINISHED.
"""
with open(json_filepath) as fp:
model_data = json.load(fp)
# store all the results for different metrics and tasks
result_list = []
retrieval_model_link = ""
reranking_model_link = ""
revision = ""
for item in model_data:
config = item.get("config", {})
# eval results for different metrics
results = item.get("results", [])
retrieval_model_link = config["retrieval_model_link"]
if config["reranking_model_link"] is None:
reranking_model_link = ""
else:
reranking_model_link = config["reranking_model_link"]
eval_result = EvalResult(
eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
retrieval_model=config["retrieval_model"],
reranking_model=config["reranking_model"],
results=results,
task=config["task"],
metric=config["metric"],
timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
is_anonymous=config.get("is_anonymous", False)
)
result_list.append(eval_result)
return cls(
eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
retrieval_model=result_list[0].retrieval_model,
reranking_model=result_list[0].reranking_model,
retrieval_model_link=retrieval_model_link,
reranking_model_link=reranking_model_link,
results=result_list,
timestamp=result_list[0].timestamp,
revision=result_list[0].revision,
is_anonymous=result_list[0].is_anonymous
)

    def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
        """
        Convert the stored EvalResults for the given task and metric into a list of
        dicts compatible with the leaderboard DataFrame UI.
        """
results = defaultdict(dict)
for eval_result in self.results:
if eval_result.metric != metric:
continue
if eval_result.task != task:
continue
results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = (
make_clickable_model(self.retrieval_model, self.retrieval_model_link))
results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = (
make_clickable_model(self.reranking_model, self.reranking_model_link))
results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
results[eval_result.eval_name][COL_NAME_REVISION] = self.revision
results[eval_result.eval_name][COL_NAME_TIMESTAMP] = self.timestamp
results[eval_result.eval_name][COL_NAME_IS_ANONYMOUS] = self.is_anonymous
# print(f'result loaded: {eval_result.eval_name}')
for result in eval_result.results:
# add result for each domain, language, and dataset
domain = result["domain"]
lang = result["lang"]
dataset = result["dataset"]
value = result["value"] * 100
if dataset == 'default':
benchmark_name = f"{domain}_{lang}"
else:
benchmark_name = f"{domain}_{lang}_{dataset}"
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
return [v for v in results.values()]


def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Recursively load evaluation results from every results*.json file under results_path.
    """
model_result_filepaths = []
for root, dirs, files in os.walk(results_path):
if len(files) == 0:
continue
        # collect every results*.json file for this model
for file in files:
if not (file.startswith("results") and file.endswith(".json")):
print(f'skip {file}')
continue
model_result_filepaths.append(os.path.join(root, file))
eval_results = {}
for model_result_filepath in model_result_filepaths:
# create evaluation results
try:
eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"loading file failed: {model_result_filepath} ({e})")
continue
print(f'file loaded: {model_result_filepath}')
eval_name = eval_result.eval_name
eval_results[eval_name] = eval_result
results = []
for k, v in eval_results.items():
try:
v.to_dict()
results.append(v)
except KeyError:
print(f"loading failed: {k}")
continue
return results


def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
    """
    Create the leaderboard DataFrame for the given task and metric from the individual experiment results.
    """
cols = [COL_NAME_IS_ANONYMOUS, ]
if task == "qa":
cols += COLS_QA
benchmark_cols = QA_BENCHMARK_COLS
elif task == "long-doc":
cols += COLS_LONG_DOC
benchmark_cols = LONG_DOC_BENCHMARK_COLS
else:
        raise NotImplementedError(f"task {task} is not supported")
all_data_json = []
for v in raw_data:
all_data_json += v.to_dict(task=task, metric=metric)
df = pd.DataFrame.from_records(all_data_json)
# print(f'dataframe created: {df.shape}')
_benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
# calculate the average score for selected benchmarks
df[COL_NAME_AVG] = df[list(_benchmark_cols)].apply(calculate_mean, axis=1).round(decimals=2)
df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
df.reset_index(inplace=True, drop=True)
    # keep the configured column order instead of the arbitrary order of a set
    _cols = [col for col in cols if col in df.columns]
    df = df[_cols].round(decimals=2)
    # rank models by their average score (ties share the lowest rank)
df[COL_NAME_RANK] = df[COL_NAME_AVG].rank(ascending=False, method="min")
# shorten the revision
df[COL_NAME_REVISION] = df[COL_NAME_REVISION].str[:6]
return df
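

# Minimal usage sketch (the "./eval_results" path and the task/metric choices are
# assumptions for illustration, not fixed by this module):
if __name__ == "__main__":
    raw_results = get_raw_eval_results("./eval_results")
    leaderboard_df = get_leaderboard_df(raw_results, task="qa", metric="ndcg_at_3")
    print(leaderboard_df.head())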