Muennighoff committed
Commit 6181979 • 1 Parent(s): 3be8255

Fix metric names & metadata new format

Files changed (3)
  1. EXTERNAL_MODEL_RESULTS.json +0 -0
  2. app.py +24 -18
  3. config.yaml +9 -9
EXTERNAL_MODEL_RESULTS.json CHANGED
The diff for this file is too large to render. See raw diff
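The cached-results diff is not rendered, but the app.py changes below imply its new shape: each external model maps to its tasks, each task to the canonical (first-listed) metric name, and that key to a list of score rows. A rough, illustrative sketch written as a Python dict; the model name, dataset columns, and scores are placeholders, not values from the real file:

# Hypothetical excerpt of the new EXTERNAL_MODEL_RESULTS.json layout
# (all names and numbers below are made up for illustration).
EXTERNAL_MODEL_RESULTS = {
    "some-external-model": {
        "STS": {
            # keyed by the canonical metric name, even when the raw result
            # files still report legacy names such as "cos_sim_spearman"
            "spearman": [
                {"Model": "some-external-model", "STS12": 70.12, "STS13": 80.34},
            ],
        },
    },
}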
 
app.py CHANGED
@@ -23,7 +23,15 @@ PRETTY_NAMES = {
     "BitextMining": "Bitext Mining",
 }
 
-TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
+TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
+# Add legacy metric names
+TASK_TO_METRIC["STS"].append("cos_sim_spearman")
+TASK_TO_METRIC["STS"].append("cosine_spearman")
+TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
+TASK_TO_METRIC["Summarization"].append("cosine_spearman")
+TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
+TASK_TO_METRIC["PairClassification"].append("cosine_ap")
+
 
 def make_clickable_model(model_name, link=None):
     if link is None:
@@ -93,16 +101,16 @@ def add_task(examples):
         examples["mteb_task"] = "Unknown"
     return examples
 
-def filter_metric_external(x, task, metric):
+def filter_metric_external(x, task, metrics):
     # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
     if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
         return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
     else:
-        return x["mteb_task"] == task and x["metric"] == metric
+        return x["mteb_task"] == task and x["metric"] in metrics
 
-def filter_metric_fetched(name, metric, expected_metric):
+def filter_metric_fetched(name, metric, expected_metrics):
     # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
-    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric == expected_metric
+    return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics
 
 if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
     with open("EXTERNAL_MODEL_RESULTS.json") as f:
@@ -112,9 +120,9 @@ if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
     for model in EXTERNAL_MODELS:
         if model not in EXTERNAL_MODEL_RESULTS:
             models_to_run.append(model)
-            EXTERNAL_MODEL_RESULTS[model] = {k: {v: []} for k, v in TASK_TO_METRIC.items()}
+            EXTERNAL_MODEL_RESULTS[model] = {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
 else:
-    EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
+    EXTERNAL_MODEL_RESULTS = {model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
     models_to_run = EXTERNAL_MODELS
 
 pbar = tqdm(models_to_run, desc="Fetching external model results")
@@ -127,10 +135,11 @@ for model in pbar:
     ds = ds.map(add_task)
     base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
 
-    for task, metric in TASK_TO_METRIC.items():
-        ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metric))["test"].to_dict()
+    for task, metrics in TASK_TO_METRIC.items():
+        ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
-        EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
+        # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
+        EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append({**base_dict, **ds_dict})
 
 # Save & cache EXTERNAL_MODEL_RESULTS
 with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
@@ -204,9 +213,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         results_list = []
         for task in tasks:
             # Not all models have InstructionRetrieval, other new tasks
-            if task not in EXTERNAL_MODEL_RESULTS[model]:
-                continue
-            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]
+            if task not in EXTERNAL_MODEL_RESULTS[model]: continue
+            results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task][0]]
 
         if len(datasets) > 0:
             res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
@@ -262,7 +270,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # import pdb; pdb.set_trace()
         try:
             out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
-        except:
+        except Exception as e:
+            import pdb; pdb.set_trace()
            print("ERROR", model.modelId)
            continue
         out = {k: v for d in out for k, v in d.items()}
@@ -304,10 +313,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
     if len(datasets) > 0:
         # Update legacy column names to be merged with newer ones
         # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
-        #if ('MLSUMClusteringP2P (fr)' in datasets):
-        #    import pdb; pdb.set_trace()
         if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
-            #import pdb; pdb.set_trace()
             df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
             datasets.remove('MLSUMClusteringP2P')
         if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
@@ -656,7 +662,7 @@ with gr.Blocks(css=css) as block:
                     gr.Markdown(f"""
                     {item['description']}
 
-                    - **Metric:** {item.get('metric', metric)}
+                    - **Metric:** {specific_metric}
                     - **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
                     {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
                     """)
config.yaml CHANGED
@@ -16,12 +16,12 @@ tasks:
   Clustering:
     icon: "✨"
     metric: v_measure
-    metric_description: "Validity Measure (v_measure)"
+    metric_description: "Validity Measure (V-measure)"
     task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "🎭"
-    metric: cos_sim_ap
-    metric_description: "Average Precision based on Cosine Similarities (cos_sim_ap)"
+    metric: ap
+    metric_description: "Average Precision (AP) based on the model's similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:
     icon: "🥈"
@@ -31,22 +31,22 @@ tasks:
   Retrieval:
     icon: "🔎"
     metric: ndcg_at_10
-    metric_description: "Normalized Discounted Cumulative Gain @ k (ndcg_at_10)"
+    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
     task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "☘️"
-    metric: cos_sim_spearman
-    metric_description: "Spearman correlation based on cosine similarity"
+    metric: spearman
+    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "📜"
-    metric: cos_sim_spearman
-    metric_description: "Spearman correlation based on cosine similarity"
+    metric: spearman
+    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Summarization is the task of generating a summary of a text."
   InstructionRetrieval:
     icon: "🔎📋"
     metric: "p-MRR"
-    metric_description: "paired mean reciprocal rank"
+    metric_description: "paired mean reciprocal rank (p-MRR)"
     task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
 boards:
   en:
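For context, a minimal sketch of how the renamed keys under tasks: could be consumed; it assumes PyYAML and the TASKS_CONFIG assignment, neither of which is shown in this diff:

import yaml

# Read the leaderboard config; the "tasks" mapping is what feeds TASK_TO_METRIC in app.py.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

TASKS_CONFIG = config["tasks"]

# After this commit the canonical metric names are the short ones, e.g.
#   PairClassification -> "ap"        (was "cos_sim_ap")
#   STS, Summarization -> "spearman"  (was "cos_sim_spearman")
print({task: cfg["metric"] for task, cfg in TASKS_CONFIG.items()})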