Add voyage-lite-01-instruct

#48
by voyageai01 - opened
Files changed (7)
  1. .gitattributes +31 -0
  2. .gitignore +0 -6
  3. DESCRIPTION.md +1 -0
  4. Dockerfile +0 -19
  5. README.md +4 -14
  6. app.py +1781 -0
  7. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,6 +0,0 @@
- *.pyc
- model_infos.json
- space
- .venv
- results
- mteb
DESCRIPTION.md ADDED
@@ -0,0 +1 @@
+ Massive Text Embedding Benchmark (MTEB) Leaderboard.
Dockerfile DELETED
@@ -1,19 +0,0 @@
- FROM python:3.12-bookworm
-
- RUN apt update && apt install -y git make
- RUN useradd -m -u 1000 user
- ENV PATH="/home/user/.local/bin:$PATH"
-
- RUN git clone https://github.com/embeddings-benchmark/mteb.git
- RUN chown -R user:user /mteb
-
- USER user
- WORKDIR /mteb
-
- RUN pip install "pydantic<2.11"
- RUN pip install ".[leaderboard]"
- # ENV XDG_CACHE_HOME=/home/user/.cache
- ENV GRADIO_SERVER_NAME="0.0.0.0"
- EXPOSE 7860
-
- CMD ["make", "run-leaderboard"]
README.md CHANGED
@@ -1,20 +1,10 @@
  ---
- title: MTEB Leaderboard
+ title: MTEB Leaderboard
  emoji: 🥇
  colorFrom: blue
  colorTo: indigo
- sdk: docker
- app_port: 7860
+ sdk: gradio
+ sdk_version: 4.0.2
  app_file: app.py
- pinned: true
- tags:
- - leaderboard
- startup_duration_timeout: 1h
- fullWidth: true
- license: mit
- short_description: Embedding Leaderboard
+ pinned: false
  ---
-
- # MTEB Leaderboard
-
- Embedding Leaderboard
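
The README change above moves the Space from the Docker SDK back to the Gradio SDK, so Hugging Face runs the file named in `app_file` directly with the pinned `sdk_version` instead of building the deleted Dockerfile. As a rough sketch (not part of this PR, names are illustrative), a Space with this header only needs app.py to build and launch a Gradio app:

import gradio as gr

# Minimal stand-in for an `sdk: gradio` Space entry point; the real app.py in
# this PR builds the full MTEB leaderboard UI instead.
with gr.Blocks() as block:
    gr.Markdown("Massive Text Embedding Benchmark (MTEB) Leaderboard.")

block.queue()
block.launch()  # Spaces execute app.py and serve the launched app on port 7860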
app.py ADDED
@@ -0,0 +1,1781 @@
1
+ from functools import partial
2
+ import json
3
+
4
+ from datasets import load_dataset
5
+ import gradio as gr
6
+ from huggingface_hub import get_hf_file_metadata, HfApi, hf_hub_download, hf_hub_url
7
+ from huggingface_hub.repocard import metadata_load
8
+ import pandas as pd
9
+
10
+ TASKS = [
11
+ "BitextMining",
12
+ "Classification",
13
+ "Clustering",
14
+ "PairClassification",
15
+ "Reranking",
16
+ "Retrieval",
17
+ "STS",
18
+ "Summarization",
19
+ ]
20
+
21
+ TASK_LIST_BITEXT_MINING = ['BUCC (de-en)', 'BUCC (fr-en)', 'BUCC (ru-en)', 'BUCC (zh-en)', 'Tatoeba (afr-eng)', 'Tatoeba (amh-eng)', 'Tatoeba (ang-eng)', 'Tatoeba (ara-eng)', 'Tatoeba (arq-eng)', 'Tatoeba (arz-eng)', 'Tatoeba (ast-eng)', 'Tatoeba (awa-eng)', 'Tatoeba (aze-eng)', 'Tatoeba (bel-eng)', 'Tatoeba (ben-eng)', 'Tatoeba (ber-eng)', 'Tatoeba (bos-eng)', 'Tatoeba (bre-eng)', 'Tatoeba (bul-eng)', 'Tatoeba (cat-eng)', 'Tatoeba (cbk-eng)', 'Tatoeba (ceb-eng)', 'Tatoeba (ces-eng)', 'Tatoeba (cha-eng)', 'Tatoeba (cmn-eng)', 'Tatoeba (cor-eng)', 'Tatoeba (csb-eng)', 'Tatoeba (cym-eng)', 'Tatoeba (dan-eng)', 'Tatoeba (deu-eng)', 'Tatoeba (dsb-eng)', 'Tatoeba (dtp-eng)', 'Tatoeba (ell-eng)', 'Tatoeba (epo-eng)', 'Tatoeba (est-eng)', 'Tatoeba (eus-eng)', 'Tatoeba (fao-eng)', 'Tatoeba (fin-eng)', 'Tatoeba (fra-eng)', 'Tatoeba (fry-eng)', 'Tatoeba (gla-eng)', 'Tatoeba (gle-eng)', 'Tatoeba (glg-eng)', 'Tatoeba (gsw-eng)', 'Tatoeba (heb-eng)', 'Tatoeba (hin-eng)', 'Tatoeba (hrv-eng)', 'Tatoeba (hsb-eng)', 'Tatoeba (hun-eng)', 'Tatoeba (hye-eng)', 'Tatoeba (ido-eng)', 'Tatoeba (ile-eng)', 'Tatoeba (ina-eng)', 'Tatoeba (ind-eng)', 'Tatoeba (isl-eng)', 'Tatoeba (ita-eng)', 'Tatoeba (jav-eng)', 'Tatoeba (jpn-eng)', 'Tatoeba (kab-eng)', 'Tatoeba (kat-eng)', 'Tatoeba (kaz-eng)', 'Tatoeba (khm-eng)', 'Tatoeba (kor-eng)', 'Tatoeba (kur-eng)', 'Tatoeba (kzj-eng)', 'Tatoeba (lat-eng)', 'Tatoeba (lfn-eng)', 'Tatoeba (lit-eng)', 'Tatoeba (lvs-eng)', 'Tatoeba (mal-eng)', 'Tatoeba (mar-eng)', 'Tatoeba (max-eng)', 'Tatoeba (mhr-eng)', 'Tatoeba (mkd-eng)', 'Tatoeba (mon-eng)', 'Tatoeba (nds-eng)', 'Tatoeba (nld-eng)', 'Tatoeba (nno-eng)', 'Tatoeba (nob-eng)', 'Tatoeba (nov-eng)', 'Tatoeba (oci-eng)', 'Tatoeba (orv-eng)', 'Tatoeba (pam-eng)', 'Tatoeba (pes-eng)', 'Tatoeba (pms-eng)', 'Tatoeba (pol-eng)', 'Tatoeba (por-eng)', 'Tatoeba (ron-eng)', 'Tatoeba (rus-eng)', 'Tatoeba (slk-eng)', 'Tatoeba (slv-eng)', 'Tatoeba (spa-eng)', 'Tatoeba (sqi-eng)', 'Tatoeba (srp-eng)', 'Tatoeba (swe-eng)', 'Tatoeba (swg-eng)', 'Tatoeba (swh-eng)', 'Tatoeba (tam-eng)', 'Tatoeba (tat-eng)', 'Tatoeba (tel-eng)', 'Tatoeba (tgl-eng)', 'Tatoeba (tha-eng)', 'Tatoeba (tuk-eng)', 'Tatoeba (tur-eng)', 'Tatoeba (tzl-eng)', 'Tatoeba (uig-eng)', 'Tatoeba (ukr-eng)', 'Tatoeba (urd-eng)', 'Tatoeba (uzb-eng)', 'Tatoeba (vie-eng)', 'Tatoeba (war-eng)', 'Tatoeba (wuu-eng)', 'Tatoeba (xho-eng)', 'Tatoeba (yid-eng)', 'Tatoeba (yue-eng)', 'Tatoeba (zsm-eng)']
22
+ TASK_LIST_BITEXT_MINING_OTHER = ["BornholmBitextMining"]
23
+
24
+ TASK_LIST_CLASSIFICATION = [
25
+ "AmazonCounterfactualClassification (en)",
26
+ "AmazonPolarityClassification",
27
+ "AmazonReviewsClassification (en)",
28
+ "Banking77Classification",
29
+ "EmotionClassification",
30
+ "ImdbClassification",
31
+ "MassiveIntentClassification (en)",
32
+ "MassiveScenarioClassification (en)",
33
+ "MTOPDomainClassification (en)",
34
+ "MTOPIntentClassification (en)",
35
+ "ToxicConversationsClassification",
36
+ "TweetSentimentExtractionClassification",
37
+ ]
38
+
39
+ TASK_LIST_CLASSIFICATION_NORM = [x.replace(" (en)", "") for x in TASK_LIST_CLASSIFICATION]
40
+
41
+ TASK_LIST_CLASSIFICATION_DA = [
42
+ "AngryTweetsClassification",
43
+ "DanishPoliticalCommentsClassification",
44
+ "DKHateClassification",
45
+ "LccSentimentClassification",
46
+ "MassiveIntentClassification (da)",
47
+ "MassiveScenarioClassification (da)",
48
+ "NordicLangClassification",
49
+ "ScalaDaClassification",
50
+ ]
51
+
52
+ TASK_LIST_CLASSIFICATION_NB = [
53
+ "NoRecClassification",
54
+ "NordicLangClassification",
55
+ "NorwegianParliament",
56
+ "MassiveIntentClassification (nb)",
57
+ "MassiveScenarioClassification (nb)",
58
+ "ScalaNbClassification",
59
+ ]
60
+
61
+ TASK_LIST_CLASSIFICATION_PL = [
62
+ "AllegroReviews",
63
+ "CBD",
64
+ "MassiveIntentClassification (pl)",
65
+ "MassiveScenarioClassification (pl)",
66
+ "PAC",
67
+ "PolEmo2.0-IN",
68
+ "PolEmo2.0-OUT",
69
+ ]
70
+
71
+ TASK_LIST_CLASSIFICATION_SV = [
72
+ "DalajClassification",
73
+ "MassiveIntentClassification (sv)",
74
+ "MassiveScenarioClassification (sv)",
75
+ "NordicLangClassification",
76
+ "ScalaSvClassification",
77
+ "SweRecClassification",
78
+ ]
79
+
80
+ TASK_LIST_CLASSIFICATION_ZH = [
81
+ "AmazonReviewsClassification (zh)",
82
+ "IFlyTek",
83
+ "JDReview",
84
+ "MassiveIntentClassification (zh-CN)",
85
+ "MassiveScenarioClassification (zh-CN)",
86
+ "MultilingualSentiment",
87
+ "OnlineShopping",
88
+ "TNews",
89
+ "Waimai",
90
+ ]
91
+
92
+ TASK_LIST_CLASSIFICATION_OTHER = ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification (ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-TW)']
93
+
94
+ TASK_LIST_CLUSTERING = [
95
+ "ArxivClusteringP2P",
96
+ "ArxivClusteringS2S",
97
+ "BiorxivClusteringP2P",
98
+ "BiorxivClusteringS2S",
99
+ "MedrxivClusteringP2P",
100
+ "MedrxivClusteringS2S",
101
+ "RedditClustering",
102
+ "RedditClusteringP2P",
103
+ "StackExchangeClustering",
104
+ "StackExchangeClusteringP2P",
105
+ "TwentyNewsgroupsClustering",
106
+ ]
107
+
108
+
109
+ TASK_LIST_CLUSTERING_DE = [
110
+ "BlurbsClusteringP2P",
111
+ "BlurbsClusteringS2S",
112
+ "TenKGnadClusteringP2P",
113
+ "TenKGnadClusteringS2S",
114
+ ]
115
+
116
+ TASK_LIST_CLUSTERING_PL = [
117
+ "8TagsClustering",
118
+ ]
119
+
120
+ TASK_LIST_CLUSTERING_ZH = [
121
+ "CLSClusteringP2P",
122
+ "CLSClusteringS2S",
123
+ "ThuNewsClusteringP2P",
124
+ "ThuNewsClusteringS2S",
125
+ ]
126
+
127
+ TASK_LIST_PAIR_CLASSIFICATION = [
128
+ "SprintDuplicateQuestions",
129
+ "TwitterSemEval2015",
130
+ "TwitterURLCorpus",
131
+ ]
132
+
133
+ TASK_LIST_PAIR_CLASSIFICATION_PL = [
134
+ "CDSC-E",
135
+ "PPC",
136
+ "PSC",
137
+ "SICK-E-PL",
138
+ ]
139
+
140
+ TASK_LIST_PAIR_CLASSIFICATION_ZH = [
141
+ "Cmnli",
142
+ "Ocnli",
143
+ ]
144
+
145
+ TASK_LIST_RERANKING = [
146
+ "AskUbuntuDupQuestions",
147
+ "MindSmallReranking",
148
+ "SciDocsRR",
149
+ "StackOverflowDupQuestions",
150
+ ]
151
+
152
+ TASK_LIST_RERANKING_ZH = [
153
+ "CMedQAv1",
154
+ "CMedQAv2",
155
+ "MMarcoReranking",
156
+ "T2Reranking",
157
+ ]
158
+
159
+ TASK_LIST_RETRIEVAL = [
160
+ "ArguAna",
161
+ "ClimateFEVER",
162
+ "CQADupstackRetrieval",
163
+ "DBPedia",
164
+ "FEVER",
165
+ "FiQA2018",
166
+ "HotpotQA",
167
+ "MSMARCO",
168
+ "NFCorpus",
169
+ "NQ",
170
+ "QuoraRetrieval",
171
+ "SCIDOCS",
172
+ "SciFact",
173
+ "Touche2020",
174
+ "TRECCOVID",
175
+ ]
176
+
177
+ TASK_LIST_RETRIEVAL_PL = [
178
+ "ArguAna-PL",
179
+ "DBPedia-PL",
180
+ "FiQA-PL",
181
+ "HotpotQA-PL",
182
+ "MSMARCO-PL",
183
+ "NFCorpus-PL",
184
+ "NQ-PL",
185
+ "Quora-PL",
186
+ "SCIDOCS-PL",
187
+ "SciFact-PL",
188
+ "TRECCOVID-PL",
189
+ ]
190
+
191
+ TASK_LIST_RETRIEVAL_ZH = [
192
+ "CmedqaRetrieval",
193
+ "CovidRetrieval",
194
+ "DuRetrieval",
195
+ "EcomRetrieval",
196
+ "MedicalRetrieval",
197
+ "MMarcoRetrieval",
198
+ "T2Retrieval",
199
+ "VideoRetrieval",
200
+ ]
201
+
202
+ TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [
203
+ "CQADupstackAndroidRetrieval",
204
+ "CQADupstackEnglishRetrieval",
205
+ "CQADupstackGamingRetrieval",
206
+ "CQADupstackGisRetrieval",
207
+ "CQADupstackMathematicaRetrieval",
208
+ "CQADupstackPhysicsRetrieval",
209
+ "CQADupstackProgrammersRetrieval",
210
+ "CQADupstackStatsRetrieval",
211
+ "CQADupstackTexRetrieval",
212
+ "CQADupstackUnixRetrieval",
213
+ "CQADupstackWebmastersRetrieval",
214
+ "CQADupstackWordpressRetrieval"
215
+ ]
216
+
217
+ TASK_LIST_STS = [
218
+ "BIOSSES",
219
+ "SICK-R",
220
+ "STS12",
221
+ "STS13",
222
+ "STS14",
223
+ "STS15",
224
+ "STS16",
225
+ "STS17 (en-en)",
226
+ "STS22 (en)",
227
+ "STSBenchmark",
228
+ ]
229
+
230
+ TASK_LIST_STS_PL = [
231
+ "CDSC-R",
232
+ "SICK-R-PL",
233
+ "STS22 (pl)",
234
+ ]
235
+
236
+ TASK_LIST_STS_ZH = [
237
+ "AFQMC",
238
+ "ATEC",
239
+ "BQ",
240
+ "LCQMC",
241
+ "PAWSX",
242
+ "QBQTC",
243
+ "STS22 (zh)",
244
+ "STSB",
245
+ ]
246
+
247
+ TASK_LIST_STS_OTHER = ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark",]
248
+ TASK_LIST_STS_NORM = [x.replace(" (en)", "").replace(" (en-en)", "") for x in TASK_LIST_STS]
249
+
250
+ TASK_LIST_SUMMARIZATION = ["SummEval",]
251
+
252
+ TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
253
+ TASK_LIST_PL = TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL
254
+ TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH
255
+
256
+ TASK_TO_METRIC = {
257
+ "BitextMining": "f1",
258
+ "Clustering": "v_measure",
259
+ "Classification": "accuracy",
260
+ "PairClassification": "cos_sim_ap",
261
+ "Reranking": "map",
262
+ "Retrieval": "ndcg_at_10",
263
+ "STS": "cos_sim_spearman",
264
+ "Summarization": "cos_sim_spearman",
265
+ }
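# Illustrative only, not part of the diff: TASK_TO_METRIC pins one headline
# metric per task type, and the leaderboard code further down keeps exactly
# that metric for each result, roughly:
#   metric = TASK_TO_METRIC["Retrieval"]                      # "ndcg_at_10"
#   cell = [round(s["value"], 2) for s in res["metrics"] if s["type"] == metric][0]
# so every Retrieval column shows nDCG@10 rounded to two decimals.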
266
+
267
+ def make_clickable_model(model_name, link=None):
268
+ if link is None:
269
+ link = "https://huggingface.co/" + model_name
270
+ # Remove user from model name
271
+ return (
272
+ f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
273
+ )
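# Illustrative only, not part of the diff: the helper links to the Hub and
# shows just the part after the last "/", e.g.
#   make_clickable_model("intfloat/e5-large")
# returns an <a> tag pointing at https://huggingface.co/intfloat/e5-large
# with the visible text "e5-large".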
274
+
275
+ # Models without metadata, thus we cannot fetch their results naturally
276
+ EXTERNAL_MODELS = [
277
+ "all-MiniLM-L12-v2",
278
+ "all-MiniLM-L6-v2",
279
+ "all-mpnet-base-v2",
280
+ "allenai-specter",
281
+ "bert-base-swedish-cased",
282
+ "bert-base-uncased",
283
+ "bge-base-zh-v1.5",
284
+ "bge-large-zh-v1.5",
285
+ "bge-large-zh-noinstruct",
286
+ "bge-small-zh-v1.5",
287
+ "contriever-base-msmarco",
288
+ "cross-en-de-roberta-sentence-transformer",
289
+ "dfm-encoder-large-v1",
290
+ "dfm-sentence-encoder-large-1",
291
+ "distiluse-base-multilingual-cased-v2",
292
+ "DanskBERT",
293
+ "e5-base",
294
+ "e5-large",
295
+ "e5-small",
296
+ "electra-small-nordic",
297
+ "electra-small-swedish-cased-discriminator",
298
+ "gbert-base",
299
+ "gbert-large",
300
+ "gelectra-base",
301
+ "gelectra-large",
302
+ "gottbert-base",
303
+ "glove.6B.300d",
304
+ "gtr-t5-base",
305
+ "gtr-t5-large",
306
+ "gtr-t5-xl",
307
+ "gtr-t5-xxl",
308
+ "herbert-base-retrieval-v2",
309
+ "komninos",
310
+ "luotuo-bert-medium",
311
+ "LASER2",
312
+ "LaBSE",
313
+ "m3e-base",
314
+ "m3e-large",
315
+ "msmarco-bert-co-condensor",
316
+ "multilingual-e5-base",
317
+ "multilingual-e5-large",
318
+ "multilingual-e5-small",
319
+ "nb-bert-base",
320
+ "nb-bert-large",
321
+ "norbert3-base",
322
+ "norbert3-large",
323
+ "paraphrase-multilingual-MiniLM-L12-v2",
324
+ "paraphrase-multilingual-mpnet-base-v2",
325
+ "sentence-bert-swedish-cased",
326
+ "sentence-t5-base",
327
+ "sentence-t5-large",
328
+ "sentence-t5-xl",
329
+ "sentence-t5-xxl",
330
+ "sup-simcse-bert-base-uncased",
331
+ "st-polish-paraphrase-from-distilroberta",
332
+ "st-polish-paraphrase-from-mpnet",
333
+ "text2vec-base-chinese",
334
+ "text2vec-large-chinese",
335
+ "text-embedding-ada-002",
336
+ "text-similarity-ada-001",
337
+ "text-similarity-babbage-001",
338
+ "text-similarity-curie-001",
339
+ "text-similarity-davinci-001",
340
+ "text-search-ada-doc-001",
341
+ "text-search-ada-001",
342
+ "text-search-babbage-001",
343
+ "text-search-curie-001",
344
+ "text-search-davinci-001",
345
+ "titan-embed-text-v1",
346
+ "unsup-simcse-bert-base-uncased",
347
+ "use-cmlm-multilingual",
348
+ "xlm-roberta-base",
349
+ "xlm-roberta-large",
350
+ ]
351
+
352
+ EXTERNAL_MODEL_TO_LINK = {
353
+ "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
354
+ "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
355
+ "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
356
+ "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
357
+ "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
358
+ "bert-base-swedish-cased": "https://huggingface.co/KB/bert-base-swedish-cased",
359
+ "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
360
+ "bge-base-zh-v1.5": "https://huggingface.co/BAAI/bge-base-zh-v1.5",
361
+ "bge-large-zh-v1.5": "https://huggingface.co/BAAI/bge-large-zh-v1.5",
362
+ "bge-large-zh-noinstruct": "https://huggingface.co/BAAI/bge-large-zh-noinstruct",
363
+ "bge-small-zh-v1.5": "https://huggingface.co/BAAI/bge-small-zh-v1.5",
364
+ "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
365
+ "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
366
+ "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
367
+ "distiluse-base-multilingual-cased-v2": "https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2",
368
+ "dfm-encoder-large-v1": "https://huggingface.co/chcaa/dfm-encoder-large-v1",
369
+ "dfm-sentence-encoder-large-1": "https://huggingface.co/chcaa/dfm-encoder-large-v1",
370
+ "e5-base": "https://huggingface.co/intfloat/e5-base",
371
+ "e5-large": "https://huggingface.co/intfloat/e5-large",
372
+ "e5-small": "https://huggingface.co/intfloat/e5-small",
373
+ "electra-small-nordic": "https://huggingface.co/jonfd/electra-small-nordic",
374
+ "electra-small-swedish-cased-discriminator": "https://huggingface.co/KBLab/electra-small-swedish-cased-discriminator",
375
+ "gbert-base": "https://huggingface.co/deepset/gbert-base",
376
+ "gbert-large": "https://huggingface.co/deepset/gbert-large",
377
+ "gelectra-base": "https://huggingface.co/deepset/gelectra-base",
378
+ "gelectra-large": "https://huggingface.co/deepset/gelectra-large",
379
+ "glove.6B.300d": "https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d",
380
+ "gottbert-base": "https://huggingface.co/uklfr/gottbert-base",
381
+ "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base",
382
+ "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large",
383
+ "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
384
+ "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
385
+ "herbert-base-retrieval-v2": "https://huggingface.co/ipipan/herbert-base-retrieval-v2",
386
+ "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
387
+ "luotuo-bert-medium": "https://huggingface.co/silk-road/luotuo-bert-medium",
388
+ "LASER2": "https://github.com/facebookresearch/LASER",
389
+ "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
390
+ "m3e-base": "https://huggingface.co/moka-ai/m3e-base",
391
+ "m3e-large": "https://huggingface.co/moka-ai/m3e-large",
392
+ "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
393
+ "multilingual-e5-base": "https://huggingface.co/intfloat/multilingual-e5-base",
394
+ "multilingual-e5-large": "https://huggingface.co/intfloat/multilingual-e5-large",
395
+ "multilingual-e5-small": "https://huggingface.co/intfloat/multilingual-e5-small",
396
+ "nb-bert-base": "https://huggingface.co/NbAiLab/nb-bert-base",
397
+ "nb-bert-large": "https://huggingface.co/NbAiLab/nb-bert-large",
398
+ "norbert3-base": "https://huggingface.co/ltg/norbert3-base",
399
+ "norbert3-large": "https://huggingface.co/ltg/norbert3-large",
400
+ "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
401
+ "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
402
+ "sentence-bert-swedish-cased": "https://huggingface.co/KBLab/sentence-bert-swedish-cased",
403
+ "sentence-t5-base": "https://huggingface.co/sentence-transformers/sentence-t5-base",
404
+ "sentence-t5-large": "https://huggingface.co/sentence-transformers/sentence-t5-large",
405
+ "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
406
+ "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
407
+ "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
408
+ "st-polish-paraphrase-from-distilroberta": "https://huggingface.co/sdadas/st-polish-paraphrase-from-distilroberta",
409
+ "st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet",
410
+ "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese",
411
+ "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese",
412
+ "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
413
+ "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
414
+ "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
415
+ "text-similarity-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
416
+ "text-similarity-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
417
+ "text-search-ada-doc-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
418
+ "text-search-ada-query-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
419
+ "text-search-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
420
+ "text-search-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
421
+ "text-search-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
422
+ "text-search-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
423
+ "titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html",
424
+ "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
425
+ "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
426
+ "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base",
427
+ "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large",
428
+ }
429
+
430
+ EXTERNAL_MODEL_TO_DIM = {
431
+ "all-MiniLM-L12-v2": 384,
432
+ "all-MiniLM-L6-v2": 384,
433
+ "all-mpnet-base-v2": 768,
434
+ "allenai-specter": 768,
435
+ "bert-base-swedish-cased": 768,
436
+ "bert-base-uncased": 768,
437
+ "bge-base-zh-v1.5": 768,
438
+ "bge-large-zh-v1.5": 1024,
439
+ "bge-large-zh-noinstruct": 1024,
440
+ "bge-small-zh-v1.5": 512,
441
+ "contriever-base-msmarco": 768,
442
+ "cross-en-de-roberta-sentence-transformer": 768,
443
+ "DanskBERT": 768,
444
+ "distiluse-base-multilingual-cased-v2": 512,
445
+ "dfm-encoder-large-v1": 1024,
446
+ "dfm-sentence-encoder-large-1": 1024,
447
+ "e5-base": 768,
448
+ "e5-small": 384,
449
+ "e5-large": 1024,
450
+ "electra-small-nordic": 256,
451
+ "electra-small-swedish-cased-discriminator": 256,
452
+ "luotuo-bert-medium": 768,
453
+ "LASER2": 1024,
454
+ "LaBSE": 768,
455
+ "gbert-base": 768,
456
+ "gbert-large": 1024,
457
+ "gelectra-base": 768,
458
+ "gelectra-large": 1024,
459
+ "glove.6B.300d": 300,
460
+ "gottbert-base": 768,
461
+ "gtr-t5-base": 768,
462
+ "gtr-t5-large": 768,
463
+ "gtr-t5-xl": 768,
464
+ "gtr-t5-xxl": 768,
465
+ "herbert-base-retrieval-v2": 768,
466
+ "komninos": 300,
467
+ "m3e-base": 768,
468
+ "m3e-large": 768,
469
+ "msmarco-bert-co-condensor": 768,
470
+ "multilingual-e5-base": 768,
471
+ "multilingual-e5-small": 384,
472
+ "multilingual-e5-large": 1024,
473
+ "nb-bert-base": 768,
474
+ "nb-bert-large": 1024,
475
+ "norbert3-base": 768,
476
+ "norbert3-large": 1024,
477
+ "paraphrase-multilingual-MiniLM-L12-v2": 384,
478
+ "paraphrase-multilingual-mpnet-base-v2": 768,
479
+ "sentence-bert-swedish-cased": 768,
480
+ "sentence-t5-base": 768,
481
+ "sentence-t5-large": 768,
482
+ "sentence-t5-xl": 768,
483
+ "sentence-t5-xxl": 768,
484
+ "sup-simcse-bert-base-uncased": 768,
485
+ "st-polish-paraphrase-from-distilroberta": 768,
486
+ "st-polish-paraphrase-from-mpnet": 768,
487
+ "text2vec-base-chinese": 768,
488
+ "text2vec-large-chinese": 1024,
489
+ "text-embedding-ada-002": 1536,
490
+ "text-similarity-ada-001": 1024,
491
+ "text-similarity-babbage-001": 2048,
492
+ "text-similarity-curie-001": 4096,
493
+ "text-similarity-davinci-001": 12288,
494
+ "text-search-ada-doc-001": 1024,
495
+ "text-search-ada-query-001": 1024,
496
+ "text-search-ada-001": 1024,
497
+ "text-search-babbage-001": 2048,
498
+ "text-search-curie-001": 4096,
499
+ "text-search-davinci-001": 12288,
500
+ "titan-embed-text-v1": 1536,
501
+ "unsup-simcse-bert-base-uncased": 768,
502
+ "use-cmlm-multilingual": 768,
503
+ "xlm-roberta-base": 768,
504
+ "xlm-roberta-large": 1024,
505
+ }
506
+
507
+ EXTERNAL_MODEL_TO_SEQLEN = {
508
+ "all-MiniLM-L12-v2": 512,
509
+ "all-MiniLM-L6-v2": 512,
510
+ "all-mpnet-base-v2": 514,
511
+ "allenai-specter": 512,
512
+ "bert-base-swedish-cased": 512,
513
+ "bert-base-uncased": 512,
514
+ "bge-base-zh-v1.5": 512,
515
+ "bge-large-zh-v1.5": 512,
516
+ "bge-large-zh-noinstruct": 512,
517
+ "bge-small-zh-v1.5": 512,
518
+ "contriever-base-msmarco": 512,
519
+ "cross-en-de-roberta-sentence-transformer": 514,
520
+ "DanskBERT": 514,
521
+ "dfm-encoder-large-v1": 512,
522
+ "dfm-sentence-encoder-large-1": 512,
523
+ "distiluse-base-multilingual-cased-v2": 512,
524
+ "e5-base": 512,
525
+ "e5-large": 512,
526
+ "e5-small": 512,
527
+ "electra-small-nordic": 512,
528
+ "electra-small-swedish-cased-discriminator": 512,
529
+ "gbert-base": 512,
530
+ "gbert-large": 512,
531
+ "gelectra-base": 512,
532
+ "gelectra-large": 512,
533
+ "gottbert-base": 512,
534
+ "glove.6B.300d": "N/A",
535
+ "gtr-t5-base": 512,
536
+ "gtr-t5-large": 512,
537
+ "gtr-t5-xl": 512,
538
+ "gtr-t5-xxl": 512,
539
+ "herbert-base-retrieval-v2": 514,
540
+ "komninos": "N/A",
541
+ "luotuo-bert-medium": 512,
542
+ "LASER2": "N/A",
543
+ "LaBSE": 512,
544
+ "m3e-base": 512,
545
+ "m3e-large": 512,
546
+ "msmarco-bert-co-condensor": 512,
547
+ "multilingual-e5-base": 514,
548
+ "multilingual-e5-large": 514,
549
+ "multilingual-e5-small": 512,
550
+ "nb-bert-base": 512,
551
+ "nb-bert-large": 512,
552
+ "norbert3-base": 512,
553
+ "norbert3-large": 512,
554
+ "paraphrase-multilingual-MiniLM-L12-v2": 512,
555
+ "paraphrase-multilingual-mpnet-base-v2": 514,
556
+ "sentence-bert-swedish-cased": 512,
557
+ "sentence-t5-base": 512,
558
+ "sentence-t5-large": 512,
559
+ "sentence-t5-xl": 512,
560
+ "sentence-t5-xxl": 512,
561
+ "sup-simcse-bert-base-uncased": 512,
562
+ "st-polish-paraphrase-from-distilroberta": 514,
563
+ "st-polish-paraphrase-from-mpnet": 514,
564
+ "text2vec-base-chinese": 512,
565
+ "text2vec-large-chinese": 512,
566
+ "text-embedding-ada-002": 8191,
567
+ "text-similarity-ada-001": 2046,
568
+ "text-similarity-babbage-001": 2046,
569
+ "text-similarity-curie-001": 2046,
570
+ "text-similarity-davinci-001": 2046,
571
+ "text-search-ada-doc-001": 2046,
572
+ "text-search-ada-query-001": 2046,
573
+ "text-search-ada-001": 2046,
574
+ "text-search-babbage-001": 2046,
575
+ "text-search-curie-001": 2046,
576
+ "text-search-davinci-001": 2046,
577
+ "titan-embed-text-v1": 8000,
578
+ "use-cmlm-multilingual": 512,
579
+ "unsup-simcse-bert-base-uncased": 512,
580
+ "xlm-roberta-base": 514,
581
+ "xlm-roberta-large": 514,
582
+ }
583
+
584
+ EXTERNAL_MODEL_TO_SIZE = {
585
+ "allenai-specter": 0.44,
586
+ "all-MiniLM-L12-v2": 0.13,
587
+ "all-MiniLM-L6-v2": 0.09,
588
+ "all-mpnet-base-v2": 0.44,
589
+ "bert-base-uncased": 0.44,
590
+ "bert-base-swedish-cased": 0.50,
591
+ "bge-base-zh-v1.5": 0.41,
592
+ "bge-large-zh-v1.5": 1.30,
593
+ "bge-large-zh-noinstruct": 1.30,
594
+ "bge-small-zh-v1.5": 0.10,
595
+ "cross-en-de-roberta-sentence-transformer": 1.11,
596
+ "contriever-base-msmarco": 0.44,
597
+ "DanskBERT": 0.50,
598
+ "distiluse-base-multilingual-cased-v2": 0.54,
599
+ "dfm-encoder-large-v1": 1.42,
600
+ "dfm-sentence-encoder-large-1": 1.63,
601
+ "e5-base": 0.44,
602
+ "e5-small": 0.13,
603
+ "e5-large": 1.34,
604
+ "electra-small-nordic": 0.09,
605
+ "electra-small-swedish-cased-discriminator": 0.06,
606
+ "gbert-base": 0.44,
607
+ "gbert-large": 1.35,
608
+ "gelectra-base": 0.44,
609
+ "gelectra-large": 1.34,
610
+ "glove.6B.300d": 0.48,
611
+ "gottbert-base": 0.51,
612
+ "gtr-t5-base": 0.22,
613
+ "gtr-t5-large": 0.67,
614
+ "gtr-t5-xl": 2.48,
615
+ "gtr-t5-xxl": 9.73,
616
+ "herbert-base-retrieval-v2": 0.50,
617
+ "komninos": 0.27,
618
+ "luotuo-bert-medium": 1.31,
619
+ "LASER2": 0.17,
620
+ "LaBSE": 1.88,
621
+ "m3e-base": 0.41,
622
+ "m3e-large": 0.41,
623
+ "msmarco-bert-co-condensor": 0.44,
624
+ "multilingual-e5-base": 1.11,
625
+ "multilingual-e5-small": 0.47,
626
+ "multilingual-e5-large": 2.24,
627
+ "nb-bert-base": 0.71,
628
+ "nb-bert-large": 1.42,
629
+ "norbert3-base": 0.52,
630
+ "norbert3-large": 1.47,
631
+ "paraphrase-multilingual-mpnet-base-v2": 1.11,
632
+ "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
633
+ "sentence-bert-swedish-cased": 0.50,
634
+ "sentence-t5-base": 0.22,
635
+ "sentence-t5-large": 0.67,
636
+ "sentence-t5-xl": 2.48,
637
+ "sentence-t5-xxl": 9.73,
638
+ "sup-simcse-bert-base-uncased": 0.44,
639
+ "st-polish-paraphrase-from-distilroberta": 0.50,
640
+ "st-polish-paraphrase-from-mpnet": 0.50,
641
+ "text2vec-base-chinese": 0.41,
642
+ "text2vec-large-chinese": 1.30,
643
+ "unsup-simcse-bert-base-uncased": 0.44,
644
+ "use-cmlm-multilingual": 1.89,
645
+ "xlm-roberta-base": 1.12,
646
+ "xlm-roberta-large": 2.24,
647
+ }
648
+
649
+ MODELS_TO_SKIP = {
650
+ "baseplate/instructor-large-1", # Duplicate
651
+ "radames/e5-large", # Duplicate
652
+ "gentlebowl/instructor-large-safetensors", # Duplicate
653
+ "Consensus/instructor-base", # Duplicate
654
+ "GovCompete/instructor-xl", # Duplicate
655
+ "GovCompete/e5-large-v2", # Duplicate
656
+ "t12e/instructor-base", # Duplicate
657
+ "michaelfeil/ct2fast-e5-large-v2",
658
+ "michaelfeil/ct2fast-e5-large",
659
+ "michaelfeil/ct2fast-e5-small-v2",
660
+ "newsrx/instructor-xl-newsrx",
661
+ "newsrx/instructor-large-newsrx",
662
+ "fresha/e5-large-v2-endpoint",
663
+ "ggrn/e5-small-v2",
664
+ "michaelfeil/ct2fast-e5-small",
665
+ "jncraton/e5-small-v2-ct2-int8",
666
+ "anttip/ct2fast-e5-small-v2-hfie",
667
+ "newsrx/instructor-large",
668
+ "newsrx/instructor-xl",
669
+ "dmlls/all-mpnet-base-v2",
670
+ "cgldo/semanticClone",
671
+ "Malmuk1/e5-large-v2_Sharded",
672
+ "jncraton/gte-small-ct2-int8",
673
+ "Einas/einas_ashkar",
674
+ "gruber/e5-small-v2-ggml",
675
+ "jncraton/bge-small-en-ct2-int8",
676
+ "vectoriseai/bge-small-en",
677
+ "recipe/embeddings",
678
+ "dhairya0907/thenlper-get-large",
679
+ "Narsil/bge-base-en",
680
+ "kozistr/fused-large-en",
681
+ "sionic-ai/sionic-ai-v2", # Wait for https://huggingface.co/sionic-ai/sionic-ai-v2/discussions/1
682
+ "sionic-ai/sionic-ai-v1", # Wait for https://huggingface.co/sionic-ai/sionic-ai-v2/discussions/1
683
+ "BAAI/bge-large-en", # Deprecated in favor of v1.5
684
+ "BAAI/bge-base-en", # Deprecated in favor of v1.5
685
+ "BAAI/bge-small-en", # Deprecated in favor of v1.5
686
+ "d0rj/e5-large-en-ru",
687
+ "d0rj/e5-base-en-ru",
688
+ "d0rj/e5-small-en-ru",
689
+ "aident-ai/bge-base-en-onnx",
690
+ "barisaydin/bge-base-en",
691
+ "barisaydin/gte-large",
692
+ "barisaydin/gte-base",
693
+ "barisaydin/gte-small",
694
+ "barisaydin/bge-small-en",
695
+ "odunola/e5-base-v2",
696
+ "goldenrooster/multilingual-e5-large",
697
+ "davidpeer/gte-small",
698
+ "barisaydin/bge-large-en",
699
+ "jamesgpt1/english-large-v1",
700
+ "vectoriseai/bge-large-en-v1.5",
701
+ "vectoriseai/bge-base-en-v1.5",
702
+ "vectoriseai/instructor-large",
703
+ "vectoriseai/instructor-base",
704
+ "vectoriseai/gte-large",
705
+ "vectoriseai/gte-base",
706
+ "vectoriseai/e5-large-v2",
707
+ "vectoriseai/bge-small-en-v1.5",
708
+ "vectoriseai/e5-base-v2",
709
+ "vectoriseai/e5-large",
710
+ "vectoriseai/multilingual-e5-large",
711
+ "vectoriseai/gte-small",
712
+ "vectoriseai/ember-v1",
713
+ "vectoriseai/e5-base",
714
+ "vectoriseai/e5-small-v2",
715
+ "michaelfeil/ct2fast-bge-large-en-v1.5",
716
+ "michaelfeil/ct2fast-bge-large-en-v1.5",
717
+ "michaelfeil/ct2fast-bge-base-en-v1.5",
718
+ "michaelfeil/ct2fast-gte-large",
719
+ "michaelfeil/ct2fast-gte-base",
720
+ "michaelfeil/ct2fast-bge-small-en-v1.5",
721
+ "rizki/bgr-tf",
722
+ "ef-zulla/e5-multi-sml-torch",
723
+ "cherubhao/yogamodel",
724
+ "morgendigital/multilingual-e5-large-quantized",
725
+ "jncraton/gte-tiny-ct2-int8",
726
+ "Research2NLP/electrical_stella",
727
+ }
728
+
729
+ EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
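# Illustrative only, not part of the diff: each external model starts with one
# empty result list per (task, headline metric) pair, e.g.
#   EXTERNAL_MODEL_RESULTS["e5-large"]["Retrieval"] == {"ndcg_at_10": []}
# The loop below then fills these lists from the mteb/results dataset.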
730
+
731
+ def add_lang(examples):
732
+ if not(examples["eval_language"]):
733
+ examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
734
+ else:
735
+ examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
736
+ return examples
737
+
738
+ def add_task(examples):
739
+ # Could be added to the dataset loading script instead
740
+ if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_ZH:
741
+ examples["mteb_task"] = "Classification"
742
+ elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_PL + TASK_LIST_CLUSTERING_ZH:
743
+ examples["mteb_task"] = "Clustering"
744
+ elif examples["mteb_dataset_name"] in TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_PAIR_CLASSIFICATION_ZH:
745
+ examples["mteb_task"] = "PairClassification"
746
+ elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING + TASK_LIST_RERANKING_ZH:
747
+ examples["mteb_task"] = "Reranking"
748
+ elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH:
749
+ examples["mteb_task"] = "Retrieval"
750
+ elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM + TASK_LIST_STS_PL + TASK_LIST_STS_ZH:
751
+ examples["mteb_task"] = "STS"
752
+ elif examples["mteb_dataset_name"] in TASK_LIST_SUMMARIZATION:
753
+ examples["mteb_task"] = "Summarization"
754
+ elif examples["mteb_dataset_name"] in [x.split(" ")[0] for x in TASK_LIST_BITEXT_MINING + TASK_LIST_BITEXT_MINING_OTHER]:
755
+ examples["mteb_task"] = "BitextMining"
756
+ else:
757
+ print("WARNING: Task not found for dataset", examples["mteb_dataset_name"])
758
+ examples["mteb_task"] = "Unknown"
759
+ return examples
760
+
761
+ for model in EXTERNAL_MODELS:
762
+ ds = load_dataset("mteb/results", model)
763
+ # For local debugging:
764
+ #, download_mode='force_redownload', verification_mode="no_checks")
765
+ ds = ds.map(add_lang)
766
+ ds = ds.map(add_task)
767
+ base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))}
768
+ # For now only one metric per task - Could add more metrics lateron
769
+ for task, metric in TASK_TO_METRIC.items():
770
+ ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
771
+ ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
772
+ EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
773
+
774
+ def get_dim_seq_size(model):
775
+ filenames = [sib.rfilename for sib in model.siblings]
776
+ dim, seq, size = "", "", ""
777
+ if "1_Pooling/config.json" in filenames:
778
+ st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
779
+ dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
780
+ elif "2_Pooling/config.json" in filenames:
781
+ st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
782
+ dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
783
+ if "config.json" in filenames:
784
+ config_path = hf_hub_download(model.modelId, filename="config.json")
785
+ config = json.load(open(config_path))
786
+ if not dim:
787
+ dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
788
+ seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
789
+ # Get model file size without downloading
790
+ if "pytorch_model.bin" in filenames:
791
+ url = hf_hub_url(model.modelId, filename="pytorch_model.bin")
792
+ meta = get_hf_file_metadata(url)
793
+ size = round(meta.size / 1e9, 2)
794
+ elif "pytorch_model.bin.index.json" in filenames:
795
+ index_path = hf_hub_download(model.modelId, filename="pytorch_model.bin.index.json")
796
+ """
797
+ {
798
+ "metadata": {
799
+ "total_size": 28272820224
800
+ },....
801
+ """
802
+ size = json.load(open(index_path))
803
+ if ("metadata" in size) and ("total_size" in size["metadata"]):
804
+ size = round(size["metadata"]["total_size"] / 1e9, 2)
805
+ return dim, seq, size
806
+
807
+ def make_datasets_clickable(df):
808
+ """Does not work"""
809
+ if "BornholmBitextMining" in df.columns:
810
+ link = "https://huggingface.co/datasets/strombergnlp/bornholmsk_parallel"
811
+ df = df.rename(
812
+ columns={f'BornholmBitextMining': '<a target="_blank" style="text-decoration: underline" href="{link}">BornholmBitextMining</a>',})
813
+ return df
814
+
815
+ def add_rank(df):
816
+ cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
817
+ if len(cols_to_rank) == 1:
818
+ df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
819
+ else:
820
+ df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
821
+ df.sort_values("Average", ascending=False, inplace=True)
822
+ df.insert(0, "Rank", list(range(1, len(df) + 1)))
823
+ df = df.round(2)
824
+ # Fill NaN after averaging
825
+ df.fillna("", inplace=True)
826
+ return df
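# Illustrative only, not part of the diff: with more than one score column,
# add_rank inserts an "Average" (NaN if any score is missing, since
# skipna=False) and a 1-based "Rank", then sorts by the average, e.g.
#   add_rank(pd.DataFrame({"Model": ["a", "b"], "T1": [50, 70], "T2": [60, 80]}))
# puts model "b" first with Average 75.0.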
827
+
828
+ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC, rank=True):
829
+ api = HfApi()
830
+ models = api.list_models(filter="mteb")
831
+ # Initialize list to models that we cannot fetch metadata from
832
+ df_list = []
833
+ for model in EXTERNAL_MODEL_RESULTS:
834
+ results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
835
+ if len(datasets) > 0:
836
+ res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
837
+ elif langs:
838
+ # Would be cleaner to rely on an extra language column instead
839
+ langs_format = [f"({lang})" for lang in langs]
840
+ res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
841
+ else:
842
+ res = {k: v for d in results_list for k, v in d.items()}
843
+ # Model & at least one result
844
+ if len(res) > 1:
845
+ if add_emb_dim:
846
+ res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
847
+ res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
848
+ res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
849
+ df_list.append(res)
850
+
851
+ for model in models:
852
+ if model.modelId in MODELS_TO_SKIP: continue
853
+ print("MODEL", model)
854
+ readme_path = hf_hub_download(model.modelId, filename="README.md")
855
+ meta = metadata_load(readme_path)
856
+ # meta['model-index'][0]["results"] is list of elements like:
857
+ # {
858
+ # "task": {"type": "Classification"},
859
+ # "dataset": {
860
+ # "type": "mteb/amazon_massive_intent",
861
+ # "name": "MTEB MassiveIntentClassification (nb)",
862
+ # "config": "nb",
863
+ # "split": "test",
864
+ # },
865
+ # "metrics": [
866
+ # {"type": "accuracy", "value": 39.81506388702084},
867
+ # {"type": "f1", "value": 38.809586587791664},
868
+ # ],
869
+ # },
870
+ # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
871
+ if len(datasets) > 0:
872
+ task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
873
+ elif langs:
874
+ task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
875
+ else:
876
+ task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
877
+ out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
878
+ out = {k: v for d in out for k, v in d.items()}
879
+ out["Model"] = make_clickable_model(model.modelId)
880
+ # Model & at least one result
881
+ if len(out) > 1:
882
+ if add_emb_dim:
883
+ out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
884
+ df_list.append(out)
885
+ df = pd.DataFrame(df_list)
886
+ # If there are any models that are the same, merge them
887
+ # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
888
+ df = df.groupby("Model", as_index=False).first()
889
+ # Put 'Model' column first
890
+ cols = sorted(list(df.columns))
891
+ cols.insert(0, cols.pop(cols.index("Model")))
892
+ df = df[cols]
893
+ if rank:
894
+ df = add_rank(df)
895
+ if fillna:
896
+ df.fillna("", inplace=True)
897
+ return df
898
+
899
+ def get_mteb_average():
900
+ global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION
901
+ DATA_OVERALL = get_mteb_data(
902
+ tasks=[
903
+ "Classification",
904
+ "Clustering",
905
+ "PairClassification",
906
+ "Reranking",
907
+ "Retrieval",
908
+ "STS",
909
+ "Summarization",
910
+ ],
911
+ datasets=TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION,
912
+ fillna=False,
913
+ add_emb_dim=True,
914
+ rank=False,
915
+ )
916
+ # Debugging:
917
+ # DATA_OVERALL.to_csv("overall.csv")
918
+
919
+ DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
920
+ DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
921
+ DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
922
+ DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
923
+ DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
924
+ DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
925
+ DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
926
+ DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
927
+ DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)
928
+ # Start ranking from 1
929
+ DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
930
+
931
+ DATA_OVERALL = DATA_OVERALL.round(2)
932
+
933
+ DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
934
+ # Only keep rows with at least one score in addition to the "Model" & rank column
935
+ DATA_CLASSIFICATION_EN = DATA_CLASSIFICATION_EN[DATA_CLASSIFICATION_EN.iloc[:, 2:].ne("").any(axis=1)]
936
+
937
+ DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
938
+ DATA_CLUSTERING = DATA_CLUSTERING[DATA_CLUSTERING.iloc[:, 2:].ne("").any(axis=1)]
939
+
940
+ DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
941
+ DATA_PAIR_CLASSIFICATION = DATA_PAIR_CLASSIFICATION[DATA_PAIR_CLASSIFICATION.iloc[:, 2:].ne("").any(axis=1)]
942
+
943
+ DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
944
+ DATA_RERANKING = DATA_RERANKING[DATA_RERANKING.iloc[:, 2:].ne("").any(axis=1)]
945
+
946
+ DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
947
+ DATA_RETRIEVAL = DATA_RETRIEVAL[DATA_RETRIEVAL.iloc[:, 2:].ne("").any(axis=1)]
948
+
949
+ DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
950
+ DATA_STS_EN = DATA_STS_EN[DATA_STS_EN.iloc[:, 2:].ne("").any(axis=1)]
951
+
952
+ DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
953
+ DATA_SUMMARIZATION = DATA_SUMMARIZATION[DATA_SUMMARIZATION.iloc[:, 1:].ne("").any(axis=1)]
954
+
955
+ # Fill NaN after averaging
956
+ DATA_OVERALL.fillna("", inplace=True)
957
+
958
+ DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
959
+ DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
960
+
961
+ return DATA_OVERALL
962
+
963
+ def get_mteb_average_zh():
964
+ global DATA_OVERALL_ZH, DATA_CLASSIFICATION_ZH, DATA_CLUSTERING_ZH, DATA_PAIR_CLASSIFICATION_ZH, DATA_RERANKING_ZH, DATA_RETRIEVAL_ZH, DATA_STS_ZH
965
+ DATA_OVERALL_ZH = get_mteb_data(
966
+ tasks=[
967
+ "Classification",
968
+ "Clustering",
969
+ "PairClassification",
970
+ "Reranking",
971
+ "Retrieval",
972
+ "STS",
973
+ ],
974
+ datasets=TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH,
975
+ fillna=False,
976
+ add_emb_dim=True,
977
+ rank=False,
978
+ )
979
+ # Debugging:
980
+ # DATA_OVERALL_ZH.to_csv("overall.csv")
981
+
982
+ DATA_OVERALL_ZH.insert(1, f"Average ({len(TASK_LIST_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_ZH].mean(axis=1, skipna=False))
983
+ DATA_OVERALL_ZH.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLASSIFICATION_ZH].mean(axis=1, skipna=False))
984
+ DATA_OVERALL_ZH.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLUSTERING_ZH].mean(axis=1, skipna=False))
985
+ DATA_OVERALL_ZH.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_PAIR_CLASSIFICATION_ZH].mean(axis=1, skipna=False))
986
+ DATA_OVERALL_ZH.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RERANKING_ZH].mean(axis=1, skipna=False))
987
+ DATA_OVERALL_ZH.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RETRIEVAL_ZH].mean(axis=1, skipna=False))
988
+ DATA_OVERALL_ZH.insert(7, f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_STS_ZH].mean(axis=1, skipna=False))
989
+ DATA_OVERALL_ZH.sort_values(f"Average ({len(TASK_LIST_ZH)} datasets)", ascending=False, inplace=True)
990
+ # Start ranking from 1
991
+ DATA_OVERALL_ZH.insert(0, "Rank", list(range(1, len(DATA_OVERALL_ZH) + 1)))
992
+
993
+ DATA_OVERALL_ZH = DATA_OVERALL_ZH.round(2)
994
+
995
+ DATA_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_CLASSIFICATION_ZH])
996
+ # Only keep rows with at least one score in addition to the "Model" & rank column
997
+ DATA_CLASSIFICATION_ZH = DATA_CLASSIFICATION_ZH[DATA_CLASSIFICATION_ZH.iloc[:, 2:].ne("").any(axis=1)]
998
+
999
+ DATA_CLUSTERING_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_CLUSTERING_ZH])
1000
+ DATA_CLUSTERING_ZH = DATA_CLUSTERING_ZH[DATA_CLUSTERING_ZH.iloc[:, 2:].ne("").any(axis=1)]
1001
+
1002
+ DATA_PAIR_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_PAIR_CLASSIFICATION_ZH])
1003
+ DATA_PAIR_CLASSIFICATION_ZH = DATA_PAIR_CLASSIFICATION_ZH[DATA_PAIR_CLASSIFICATION_ZH.iloc[:, 2:].ne("").any(axis=1)]
1004
+
1005
+ DATA_RERANKING_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_RERANKING_ZH])
1006
+ DATA_RERANKING_ZH = DATA_RERANKING_ZH[DATA_RERANKING_ZH.iloc[:, 2:].ne("").any(axis=1)]
1007
+
1008
+ DATA_RETRIEVAL_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_RETRIEVAL_ZH])
1009
+ DATA_RETRIEVAL_ZH = DATA_RETRIEVAL_ZH[DATA_RETRIEVAL_ZH.iloc[:, 2:].ne("").any(axis=1)]
1010
+
1011
+ DATA_STS_ZH = add_rank(DATA_OVERALL_ZH[["Model"] + TASK_LIST_STS_ZH])
1012
+ DATA_STS_ZH = DATA_STS_ZH[DATA_STS_ZH.iloc[:, 2:].ne("").any(axis=1)]
1013
+
1014
+ # Fill NaN after averaging
1015
+ DATA_OVERALL_ZH.fillna("", inplace=True)
1016
+
1017
+ DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
1018
+ DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]
1019
+
1020
+ return DATA_OVERALL_ZH
1021
+
1022
+ def get_mteb_average_pl():
1023
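+ # Same as above for the Polish (PL-MTEB) table; Polish has no Reranking datasets, so that category is omitted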
+ global DATA_OVERALL_PL, DATA_CLASSIFICATION_PL, DATA_CLUSTERING_PL, DATA_PAIR_CLASSIFICATION_PL, DATA_RETRIEVAL_PL, DATA_STS_PL
1024
+ DATA_OVERALL_PL = get_mteb_data(
1025
+ tasks=[
1026
+ "Classification",
1027
+ "Clustering",
1028
+ "PairClassification",
1029
+ "Retrieval",
1030
+ "STS",
1031
+ ],
1032
+ datasets=TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL,
1033
+ fillna=False,
1034
+ add_emb_dim=True,
1035
+ rank=False,
1036
+ )
1037
+ # Debugging:
1038
+ # DATA_OVERALL_PL.to_csv("overall.csv")
1039
+
1040
+ DATA_OVERALL_PL.insert(1, f"Average ({len(TASK_LIST_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PL].mean(axis=1, skipna=False))
1041
+ DATA_OVERALL_PL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLASSIFICATION_PL].mean(axis=1, skipna=False))
1042
+ DATA_OVERALL_PL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLUSTERING_PL].mean(axis=1, skipna=False))
1043
+ DATA_OVERALL_PL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PAIR_CLASSIFICATION_PL].mean(axis=1, skipna=False))
1044
+ DATA_OVERALL_PL.insert(5, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_RETRIEVAL_PL].mean(axis=1, skipna=False))
1045
+ DATA_OVERALL_PL.insert(6, f"STS Average ({len(TASK_LIST_STS_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_STS_PL].mean(axis=1, skipna=False))
1046
+ DATA_OVERALL_PL.sort_values(f"Average ({len(TASK_LIST_PL)} datasets)", ascending=False, inplace=True)
1047
+ # Start ranking from 1
1048
+ DATA_OVERALL_PL.insert(0, "Rank", list(range(1, len(DATA_OVERALL_PL) + 1)))
1049
+
1050
+ DATA_OVERALL_PL = DATA_OVERALL_PL.round(2)
1051
+
1052
+ DATA_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLASSIFICATION_PL])
1053
+ # Only keep rows with at least one score in addition to the "Model" & rank column
1054
+ DATA_CLASSIFICATION_PL = DATA_CLASSIFICATION_PL[DATA_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
1055
+
1056
+ DATA_CLUSTERING_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_CLUSTERING_PL])
1057
+ DATA_CLUSTERING_PL = DATA_CLUSTERING_PL[DATA_CLUSTERING_PL.iloc[:, 2:].ne("").any(axis=1)]
1058
+
1059
+ DATA_PAIR_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION_PL])
1060
+ DATA_PAIR_CLASSIFICATION_PL = DATA_PAIR_CLASSIFICATION_PL[DATA_PAIR_CLASSIFICATION_PL.iloc[:, 2:].ne("").any(axis=1)]
1061
+
1062
+ DATA_RETRIEVAL_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_RETRIEVAL_PL])
1063
+ DATA_RETRIEVAL_PL = DATA_RETRIEVAL_PL[DATA_RETRIEVAL_PL.iloc[:, 2:].ne("").any(axis=1)]
1064
+
1065
+ DATA_STS_PL = add_rank(DATA_OVERALL_PL[["Model"] + TASK_LIST_STS_PL])
1066
+ DATA_STS_PL = DATA_STS_PL[DATA_STS_PL.iloc[:, 2:].ne("").any(axis=1)]
1067
+
1068
+ # Fill NaN after averaging
1069
+ DATA_OVERALL_PL.fillna("", inplace=True)
1070
+
1071
+ DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
1072
+ DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
1073
+
1074
+ return DATA_OVERALL_PL
1075
+
1076
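+ # Precompute all leaderboard tables once at startup; the Refresh buttons below re-fetch them on demand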
+ get_mteb_average()
1077
+ get_mteb_average_pl()
1078
+ get_mteb_average_zh()
1079
+ DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
1080
+ DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
1081
+ DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)
1082
+ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)
1083
+ DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
1084
+ DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
1085
+ DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
1086
+ DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)
1087
+
1088
+ # Exact count: sum all non-NaN score values for every dataset
1089
+ NUM_SCORES = 0
1090
+ DATASETS = []
1091
+ MODELS = []
1092
+ # LANGUAGES = []
1093
+ for d in [
1094
+ DATA_BITEXT_MINING,
1095
+ DATA_BITEXT_MINING_OTHER,
1096
+ DATA_CLASSIFICATION_EN,
1097
+ DATA_CLASSIFICATION_DA,
1098
+ DATA_CLASSIFICATION_NB,
1099
+ DATA_CLASSIFICATION_PL,
1100
+ DATA_CLASSIFICATION_SV,
1101
+ DATA_CLASSIFICATION_ZH,
1102
+ DATA_CLASSIFICATION_OTHER,
1103
+ DATA_CLUSTERING,
1104
+ DATA_CLUSTERING_DE,
1105
+ DATA_CLUSTERING_PL,
1106
+ DATA_CLUSTERING_ZH,
1107
+ DATA_PAIR_CLASSIFICATION,
1108
+ DATA_PAIR_CLASSIFICATION_PL,
1109
+ DATA_PAIR_CLASSIFICATION_ZH,
1110
+ DATA_RERANKING,
1111
+ DATA_RERANKING_ZH,
1112
+ DATA_RETRIEVAL,
1113
+ DATA_RETRIEVAL_PL,
1114
+ DATA_RETRIEVAL_ZH,
1115
+ DATA_STS_EN,
1116
+ DATA_STS_PL,
1117
+ DATA_STS_ZH,
1118
+ DATA_STS_OTHER,
1119
+ DATA_SUMMARIZATION,
1120
+ ]:
1121
+ # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
1122
+ cols_to_ignore = 3 if "Average" in d.columns else 2
1123
+ # Count number of scores including only non-nan floats & excluding the rank column
1124
+ NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
1125
+ # Exclude rank & model name column (first two); Do not count different language versions as different datasets
1126
+ DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
1127
+ # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
1128
+ MODELS += d["Model"].tolist()
1129
+
1130
+ NUM_DATASETS = len(set(DATASETS))
1131
+ # NUM_LANGUAGES = len(set(LANGUAGES))
1132
+ NUM_MODELS = len(set(MODELS))
1133
+
1134
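+ # Gradio UI: one tab per task category (and per language where available), each with a results Dataframe and a Refresh button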
+ block = gr.Blocks()
1135
+ with block:
1136
+ gr.Markdown(f"""
1137
+ Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 See the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
1138
+
1139
+ - **Total Datasets**: {NUM_DATASETS}
1140
+ - **Total Languages**: 113
1141
+ - **Total Scores**: {NUM_SCORES}
1142
+ - **Total Models**: {NUM_MODELS}
1143
+ """)
1144
+ with gr.Tabs():
1145
+ with gr.TabItem("Overall"):
1146
+ with gr.TabItem("English"):
1147
+ with gr.Row():
1148
+ gr.Markdown("""
1149
+ **Overall MTEB English leaderboard** 🔮
1150
+
1151
+ - **Metric:** Various, refer to task tabs
1152
+ - **Languages:** English
1153
+ """)
1154
+ with gr.Row():
1155
+ data_overall = gr.components.Dataframe(
1156
+ DATA_OVERALL,
1157
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
1158
+ type="pandas",
1159
+ wrap=True,
1160
+ )
1161
+ with gr.Row():
1162
+ data_run_overall = gr.Button("Refresh")
1163
+ data_run_overall.click(get_mteb_average, inputs=None, outputs=data_overall)
1164
+ with gr.TabItem("Chinese"):
1165
+ with gr.Row():
1166
+ gr.Markdown("""
1167
+ **Overall MTEB Chinese leaderboard (C-MTEB)** 🔮🇨🇳
1168
+
1169
+ - **Metric:** Various, refer to task tabs
1170
+ - **Languages:** Chinese
1171
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1172
+ """)
1173
+ with gr.Row():
1174
+ data_overall_zh = gr.components.Dataframe(
1175
+ DATA_OVERALL_ZH,
1176
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_ZH.columns),
1177
+ type="pandas",
1178
+ wrap=True,
1179
+ )
1180
+ with gr.Row():
1181
+ data_run_overall_zh = gr.Button("Refresh")
1182
+ data_run_overall_zh.click(get_mteb_average_zh, inputs=None, outputs=data_overall_zh)
1183
+ with gr.TabItem("Polish"):
1184
+ with gr.Row():
1185
+ gr.Markdown("""
1186
+ **Overall MTEB Polish leaderboard (PL-MTEB)** 🔮🇵🇱
1187
+
1188
+ - **Metric:** Various, refer to task tabs
1189
+ - **Languages:** Polish
1190
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata), [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
1191
+ """)
1192
+ with gr.Row():
1193
+ data_overall_pl = gr.components.Dataframe(
1194
+ DATA_OVERALL_PL,
1195
+ datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL_PL.columns),
1196
+ type="pandas",
1197
+ wrap=True,
1198
+ )
1199
+ with gr.Row():
1200
+ data_run_overall_pl = gr.Button("Refresh")
1201
+ data_run_overall_pl.click(get_mteb_average_pl, inputs=None, outputs=data_overall_pl)
1202
+ with gr.TabItem("Bitext Mining"):
1203
+ with gr.TabItem("English-X"):
1204
+ with gr.Row():
1205
+ gr.Markdown("""
1206
+ **Bitext Mining English-X Leaderboard** 🎌
1207
+
1208
+ - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
1209
+ - **Languages:** 117 (Pairs of: English & other language)
1210
+ """)
1211
+ with gr.Row():
1212
+ data_bitext_mining = gr.components.Dataframe(
1213
+ DATA_BITEXT_MINING,
1214
+ datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
1215
+ type="pandas",
1216
+ )
1217
+ with gr.Row():
1218
+ data_run_bitext_mining = gr.Button("Refresh")
1219
+ data_run_bitext_mining.click(
1220
+ partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING),
1221
+ outputs=data_bitext_mining,
1222
+ )
1223
+ with gr.TabItem("Danish"):
1224
+ with gr.Row():
1225
+ gr.Markdown("""
1226
+ **Bitext Mining Danish Leaderboard** 🎌🇩🇰
1227
+
1228
+ - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
1229
+ - **Languages:** Danish & Bornholmsk (Danish Dialect)
1230
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1231
+ """)
1232
+ with gr.Row():
1233
+ data_bitext_mining_da = gr.components.Dataframe(
1234
+ DATA_BITEXT_MINING_OTHER,
1235
+ datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
1236
+ type="pandas",
1237
+ )
1238
+ with gr.Row():
1239
+ data_run_bitext_mining_da = gr.Button("Refresh")
1240
+ data_run_bitext_mining_da.click(
1241
+ partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING_OTHER),
1242
+ outputs=data_bitext_mining_da,
1243
+ )
1244
+ with gr.TabItem("Classification"):
1245
+ with gr.TabItem("English"):
1246
+ with gr.Row():
1247
+ gr.Markdown("""
1248
+ **Classification English Leaderboard** ❤️
1249
+
1250
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1251
+ - **Languages:** English
1252
+ """)
1253
+ with gr.Row():
1254
+ data_classification_en = gr.components.Dataframe(
1255
+ DATA_CLASSIFICATION_EN,
1256
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
1257
+ type="pandas",
1258
+ )
1259
+ with gr.Row():
1260
+ data_run_classification_en = gr.Button("Refresh")
1261
+ data_run_classification_en.click(
1262
+ partial(get_mteb_data, tasks=["Classification"], langs=["en"]),
1263
+ outputs=data_classification_en,
1264
+ )
1265
+ with gr.TabItem("Chinese"):
1266
+ with gr.Row():
1267
+ gr.Markdown("""
1268
+ **Classification Chinese Leaderboard** 🧡🇨🇳
1269
+
1270
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1271
+ - **Languages:** Chinese
1272
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1273
+ """)
1274
+ with gr.Row():
1275
+ data_classification_zh = gr.components.Dataframe(
1276
+ DATA_CLASSIFICATION_ZH,
1277
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_ZH.columns),
1278
+ type="pandas",
1279
+ )
1280
+ with gr.Row():
1281
+ data_run_classification_zh = gr.Button("Refresh")
1282
+ data_run_classification_zh.click(
1283
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_ZH),
1284
+ outputs=data_classification_zh,
1285
+ )
1286
+ with gr.TabItem("Danish"):
1287
+ with gr.Row():
1288
+ gr.Markdown("""
1289
+ **Classification Danish Leaderboard** 🤍🇩🇰
1290
+
1291
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1292
+ - **Languages:** Danish
1293
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1294
+ """)
1295
+ with gr.Row():
1296
+ data_classification_da = gr.components.Dataframe(
1297
+ DATA_CLASSIFICATION_DA,
1298
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_DA.columns),
1299
+ type="pandas",
1300
+ )
1301
+ with gr.Row():
1302
+ data_run_classification_da = gr.Button("Refresh")
1303
+ data_run_classification_da.click(
1304
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_DA),
1305
+ outputs=data_classification_da,
1306
+ )
1307
+ with gr.TabItem("Norwegian"):
1308
+ with gr.Row():
1309
+ gr.Markdown("""
1310
+ **Classification Norwegian Leaderboard** 💙🇳🇴
1311
+
1312
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1313
+ - **Languages:** Norwegian Bokmål
1314
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1315
+ """)
1316
+ with gr.Row():
1317
+ data_classification_nb = gr.components.Dataframe(
1318
+ DATA_CLASSIFICATION_NB,
1319
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_NB.columns),
1320
+ type="pandas",
1321
+ )
1322
+ with gr.Row():
1323
+ data_run_classification_nb = gr.Button("Refresh")
1324
+ data_run_classification_nb.click(
1325
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_NB),
1326
+ outputs=data_classification_nb,
1327
+ )
1328
+ with gr.TabItem("Polish"):
1329
+ with gr.Row():
1330
+ gr.Markdown("""
1331
+ **Classification Polish Leaderboard** 🤍🇵🇱
1332
+
1333
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1334
+ - **Languages:** Polish
1335
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1336
+ """)
1337
+ with gr.Row():
1338
+ data_classification_pl = gr.components.Dataframe(
1339
+ DATA_CLASSIFICATION_PL,
1340
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_PL.columns),
1341
+ type="pandas",
1342
+ )
1343
+ with gr.Row():
1344
+ data_run_classification_pl = gr.Button("Refresh")
1345
+ data_run_classification_pl.click(
1346
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_PL),
1347
+ outputs=data_classification_pl,
1348
+ )
1349
+ with gr.TabItem("Swedish"):
1350
+ with gr.Row():
1351
+ gr.Markdown("""
1352
+ **Classification Swedish Leaderboard** 💛🇸🇪
1353
+
1354
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1355
+ - **Languages:** Swedish
1356
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
1357
+ """)
1358
+ with gr.Row():
1359
+ data_classification_sv = gr.components.Dataframe(
1360
+ DATA_CLASSIFICATION_SV,
1361
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_SV.columns),
1362
+ type="pandas",
1363
+ )
1364
+ with gr.Row():
1365
+ data_run_classification_sv = gr.Button("Refresh")
1366
+ data_run_classification_sv.click(
1367
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_SV),
1368
+ outputs=data_classification_sv,
1369
+ )
1370
+ with gr.TabItem("Other"):
1371
+ with gr.Row():
1372
+ gr.Markdown("""
1373
+ **Classification Other Languages Leaderboard** 💜💚💙
1374
+
1375
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
1376
+ - **Languages:** 47 (Only languages not included in the other tabs)
1377
+ """)
1378
+ with gr.Row():
1379
+ data_classification = gr.components.Dataframe(
1380
+ DATA_CLASSIFICATION_OTHER,
1381
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_OTHER.columns) * 10,
1382
+ type="pandas",
1383
+ )
1384
+ with gr.Row():
1385
+ data_run_classification = gr.Button("Refresh")
1386
+ data_run_classification.click(
1387
+ partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_OTHER),
1388
+ outputs=data_classification,
1389
+ )
1390
+ with gr.TabItem("Clustering"):
1391
+ with gr.TabItem("English"):
1392
+ with gr.Row():
1393
+ gr.Markdown("""
1394
+ **Clustering Leaderboard** ✨
1395
+
1396
+ - **Metric:** Validity Measure (v_measure)
1397
+ - **Languages:** English
1398
+ """)
1399
+ with gr.Row():
1400
+ data_clustering = gr.components.Dataframe(
1401
+ DATA_CLUSTERING,
1402
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
1403
+ type="pandas",
1404
+ )
1405
+ with gr.Row():
1406
+ data_run_clustering_en = gr.Button("Refresh")
1407
+ data_run_clustering_en.click(
1408
+ partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING),
1409
+ outputs=data_clustering,
1410
+ )
1411
+ with gr.TabItem("Chinese"):
1412
+ with gr.Row():
1413
+ gr.Markdown("""
1414
+ **Clustering Chinese Leaderboard** ✨🇨🇳
1415
+
1416
+ - **Metric:** Validity Measure (v_measure)
1417
+ - **Languages:** Chinese
1418
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1419
+ """)
1420
+ with gr.Row():
1421
+ data_clustering_zh = gr.components.Dataframe(
1422
+ DATA_CLUSTERING_ZH,
1423
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_ZH.columns),
1424
+ type="pandas",
1425
+ )
1426
+ with gr.Row():
1427
+ data_run_clustering_zh = gr.Button("Refresh")
1428
+ data_run_clustering_zh.click(
1429
+ partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_ZH),
1430
+ outputs=data_clustering_zh,
1431
+ )
1432
+ with gr.TabItem("German"):
1433
+ with gr.Row():
1434
+ gr.Markdown("""
1435
+ **Clustering German Leaderboard** ✨🇩🇪
1436
+
1437
+ - **Metric:** Validity Measure (v_measure)
1438
+ - **Languages:** German
1439
+ - **Credits:** [Silvan](https://github.com/slvnwhrl)
1440
+ """)
1441
+ with gr.Row():
1442
+ data_clustering_de = gr.components.Dataframe(
1443
+ DATA_CLUSTERING_DE,
1444
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_DE.columns) * 2,
1445
+ type="pandas",
1446
+ )
1447
+ with gr.Row():
1448
+ data_run_clustering_de = gr.Button("Refresh")
1449
+ data_run_clustering_de.click(
1450
+ partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_DE),
1451
+ outputs=data_clustering_de,
1452
+ )
1453
+ with gr.TabItem("Polish"):
1454
+ with gr.Row():
1455
+ gr.Markdown("""
1456
+ **Clustering Polish Leaderboard** ✨🇵🇱
1457
+
1458
+ - **Metric:** Validity Measure (v_measure)
1459
+ - **Languages:** Polish
1460
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1461
+ """)
1462
+ with gr.Row():
1463
+ data_clustering_pl = gr.components.Dataframe(
1464
+ DATA_CLUSTERING_PL,
1465
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_PL.columns) * 2,
1466
+ type="pandas",
1467
+ )
1468
+ with gr.Row():
1469
+ data_run_clustering_pl = gr.Button("Refresh")
1470
+ data_run_clustering_pl.click(
1471
+ partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_PL),
1472
+ outputs=data_clustering_pl,
1473
+ )
1474
+ with gr.TabItem("Pair Classification"):
1475
+ with gr.TabItem("English"):
1476
+ with gr.Row():
1477
+ gr.Markdown("""
1478
+ **Pair Classification English Leaderboard** 🎭
1479
+
1480
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1481
+ - **Languages:** English
1482
+ """)
1483
+ with gr.Row():
1484
+ data_pair_classification = gr.components.Dataframe(
1485
+ DATA_PAIR_CLASSIFICATION,
1486
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
1487
+ type="pandas",
1488
+ )
1489
+ with gr.Row():
1490
+ data_run_pair_classification = gr.Button("Refresh")
1491
+ data_run_pair_classification.click(
1492
+ partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION),
1493
+ outputs=data_pair_classification,
1494
+ )
1495
+ with gr.TabItem("Chinese"):
1496
+ with gr.Row():
1497
+ gr.Markdown("""
1498
+ **Pair Classification Chinese Leaderboard** 🎭🇨🇳
1499
+
1500
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1501
+ - **Languages:** Chinese
1502
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1503
+ """)
1504
+ with gr.Row():
1505
+ data_pair_classification_zh = gr.components.Dataframe(
1506
+ DATA_PAIR_CLASSIFICATION_ZH,
1507
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_ZH.columns),
1508
+ type="pandas",
1509
+ )
1510
+ with gr.Row():
1511
+ data_run_pair_classification_zh = gr.Button("Refresh")
1512
+ data_run_pair_classification_zh.click(
1513
+ partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_ZH),
1514
+ outputs=data_pair_classification_zh,
1515
+ )
1516
+ with gr.TabItem("Polish"):
1517
+ with gr.Row():
1518
+ gr.Markdown("""
1519
+ **Pair Classification Polish Leaderboard** 🎭🇵🇱
1520
+
1521
+ - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
1522
+ - **Languages:** Polish
1523
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1524
+ """)
1525
+ with gr.Row():
1526
+ data_pair_classification_pl = gr.components.Dataframe(
1527
+ DATA_PAIR_CLASSIFICATION_PL,
1528
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION_PL.columns),
1529
+ type="pandas",
1530
+ )
1531
+ with gr.Row():
1532
+ data_run_pair_classification_pl = gr.Button("Refresh")
1533
+ data_run_pair_classification_pl.click(
1534
+ partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_PL),
1535
+ outputs=data_pair_classification_pl,
1536
+ )
1537
+ with gr.TabItem("Reranking"):
1538
+ with gr.TabItem("English"):
1539
+ with gr.Row():
1540
+ gr.Markdown("""
1541
+ **Reranking English Leaderboard** 🥈
1542
+
1543
+ - **Metric:** Mean Average Precision (MAP)
1544
+ - **Languages:** English
1545
+ """)
1546
+ with gr.Row():
1547
+ data_reranking = gr.components.Dataframe(
1548
+ DATA_RERANKING,
1549
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
1550
+ type="pandas",
1551
+ )
1552
+ with gr.Row():
1553
+ data_run_reranking = gr.Button("Refresh")
1554
+ data_run_reranking.click(
1555
+ partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING),
1556
+ outputs=data_reranking,
1557
+ )
1558
+ with gr.TabItem("Chinese"):
1559
+ with gr.Row():
1560
+ gr.Markdown("""
1561
+ **Reranking Chinese Leaderboard** 🥈🇨🇳
1562
+
1563
+ - **Metric:** Mean Average Precision (MAP)
1564
+ - **Languages:** Chinese
1565
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1566
+ """)
1567
+ with gr.Row():
1568
+ data_reranking_zh = gr.components.Dataframe(
1569
+ DATA_RERANKING_ZH,
1570
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING_ZH.columns),
1571
+ type="pandas",
1572
+ )
1573
+ with gr.Row():
1574
+ data_run_reranking_zh = gr.Button("Refresh")
1575
+ data_run_reranking_zh.click(
1576
+ partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING_ZH),
1577
+ outputs=data_reranking_zh,
1578
+ )
1579
+ with gr.TabItem("Retrieval"):
1580
+ with gr.TabItem("English"):
1581
+ with gr.Row():
1582
+ gr.Markdown("""
1583
+ **Retrieval English Leaderboard** 🔎
1584
+
1585
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1586
+ - **Languages:** English
1587
+ """)
1588
+ with gr.Row():
1589
+ data_retrieval = gr.components.Dataframe(
1590
+ DATA_RETRIEVAL,
1591
+ # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1592
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
1593
+ type="pandas",
1594
+ )
1595
+ with gr.Row():
1596
+ data_run_retrieval = gr.Button("Refresh")
1597
+ data_run_retrieval.click(
1598
+ partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL),
1599
+ outputs=data_retrieval,
1600
+ )
1601
+ with gr.TabItem("Chinese"):
1602
+ with gr.Row():
1603
+ gr.Markdown("""
1604
+ **Retrieval Chinese Leaderboard** 🔎🇨🇳
1605
+
1606
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1607
+ - **Languages:** Chinese
1608
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1609
+ """)
1610
+ with gr.Row():
1611
+ data_retrieval_zh = gr.components.Dataframe(
1612
+ DATA_RETRIEVAL_ZH,
1613
+ # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1614
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL_ZH.columns) * 2,
1615
+ type="pandas",
1616
+ )
1617
+ with gr.Row():
1618
+ data_run_retrieval_zh = gr.Button("Refresh")
1619
+ data_run_retrieval_zh.click(
1620
+ partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_ZH),
1621
+ outputs=data_retrieval_zh,
1622
+ )
1623
+ with gr.TabItem("Polish"):
1624
+ with gr.Row():
1625
+ gr.Markdown("""
1626
+ **Retrieval Polish Leaderboard** 🔎🇵🇱
1627
+
1628
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1629
+ - **Languages:** Polish
1630
+ - **Credits:** [Konrad Wojtasik](https://github.com/kwojtasi) & [BEIR-PL](https://arxiv.org/abs/2305.19840)
1631
+ """)
1632
+ with gr.Row():
1633
+ data_retrieval_pl = gr.components.Dataframe(
1634
+ DATA_RETRIEVAL_PL,
1635
+ # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1636
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL_PL.columns) * 2,
1637
+ type="pandas",
1638
+ )
1639
+ with gr.Row():
1640
+ data_run_retrieval_pl = gr.Button("Refresh")
1641
+ data_run_retrieval_pl.click(
1642
+ partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_PL),
1643
+ outputs=data_retrieval_pl,
1644
+ )
1645
+ with gr.TabItem("STS"):
1646
+ with gr.TabItem("English"):
1647
+ with gr.Row():
1648
+ gr.Markdown("""
1649
+ **STS English Leaderboard** 🤖
1650
+
1651
+ - **Metric:** Spearman correlation based on cosine similarity
1652
+ - **Languages:** English
1653
+ """)
1654
+ with gr.Row():
1655
+ data_sts_en = gr.components.Dataframe(
1656
+ DATA_STS_EN,
1657
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_EN.columns),
1658
+ type="pandas",
1659
+ )
1660
+ with gr.Row():
1661
+ data_run_sts_en = gr.Button("Refresh")
1662
+ data_run_sts_en.click(
1663
+ partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS),
1664
+ outputs=data_sts_en,
1665
+ )
1666
+ with gr.TabItem("Chinese"):
1667
+ with gr.Row():
1668
+ gr.Markdown("""
1669
+ **STS Chinese Leaderboard** 🤖🇨🇳
1670
+
1671
+ - **Metric:** Spearman correlation based on cosine similarity
1672
+ - **Languages:** Chinese
1673
+ - **Credits:** [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)
1674
+ """)
1675
+ with gr.Row():
1676
+ data_sts_zh = gr.components.Dataframe(
1677
+ DATA_STS_ZH,
1678
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_ZH.columns),
1679
+ type="pandas",
1680
+ )
1681
+ with gr.Row():
1682
+ data_run_sts_zh = gr.Button("Refresh")
1683
+ data_run_sts_zh.click(
1684
+ partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_ZH),
1685
+ outputs=data_sts_zh,
1686
+ )
1687
+ with gr.TabItem("Polish"):
1688
+ with gr.Row():
1689
+ gr.Markdown("""
1690
+ **STS Polish Leaderboard** 🤖🇵🇱
1691
+
1692
+ - **Metric:** Spearman correlation based on cosine similarity
1693
+ - **Languages:** Polish
1694
+ - **Credits:** [Rafał Poświata](https://github.com/rafalposwiata)
1695
+ """)
1696
+ with gr.Row():
1697
+ data_sts_pl = gr.components.Dataframe(
1698
+ DATA_STS_PL,
1699
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_PL.columns),
1700
+ type="pandas",
1701
+ )
1702
+ with gr.Row():
1703
+ data_run_sts_pl = gr.Button("Refresh")
1704
+ data_run_sts_pl.click(
1705
+ partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_PL),
1706
+ outputs=data_sts_pl,
1707
+ )
1708
+ with gr.TabItem("Other"):
1709
+ with gr.Row():
1710
+ gr.Markdown("""
1711
+ **STS Other Leaderboard** 👽
1712
+
1713
+ - **Metric:** Spearman correlation based on cosine similarity
1714
+ - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)
1715
+ """)
1716
+ with gr.Row():
1717
+ data_sts_other = gr.components.Dataframe(
1718
+ DATA_STS_OTHER,
1719
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_OTHER.columns) * 2,
1720
+ type="pandas",
1721
+ )
1722
+ with gr.Row():
1723
+ data_run_sts_other = gr.Button("Refresh")
1724
+ data_run_sts_other.click(
1725
+ partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_OTHER),
1726
+ outputs=data_sts_other,
1727
+ )
1728
+ with gr.TabItem("Summarization"):
1729
+ with gr.Row():
1730
+ gr.Markdown("""
1731
+ **Summarization Leaderboard** 📜
1732
+
1733
+ - **Metric:** Spearman correlation based on cosine similarity
1734
+ - **Languages:** English
1735
+ """)
1736
+ with gr.Row():
1737
+ data_summarization = gr.components.Dataframe(
1738
+ DATA_SUMMARIZATION,
1739
+ datatype=["number", "markdown"] + ["number"] * 2,
1740
+ type="pandas",
1741
+ )
1742
+ with gr.Row():
1743
+ data_run = gr.Button("Refresh")
1744
+ data_run.click(
1745
+ partial(get_mteb_data, tasks=["Summarization"]),
1746
+ outputs=data_summarization,
1747
+ )
1748
+ gr.Markdown(r"""
1749
+
1750
+ Made with ❤️ for NLP. If this work is useful to you, please consider citing:
1751
+
1752
+ ```bibtex
1753
+ @article{muennighoff2022mteb,
1754
+ doi = {10.48550/ARXIV.2210.07316},
1755
+ url = {https://arxiv.org/abs/2210.07316},
1756
+ author = {Muennighoff, Niklas and Tazi, Nouamane and Magne, Lo{\"\i}c and Reimers, Nils},
1757
+ title = {MTEB: Massive Text Embedding Benchmark},
1758
+ publisher = {arXiv},
1759
+ journal={arXiv preprint arXiv:2210.07316},
1760
+ year = {2022}
1761
+ }
1762
+ ```
1763
+ """)
1764
+ # Running the functions on page load in addition to when the button is clicked
1765
+ # This is optional - if deactivated, the data loaded at build time is shown instead, as for the Overall tab
1766
+ """
1767
+ block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
1768
+ """
1769
+
1770
+ block.queue(max_size=10)
1771
+ block.launch()
1772
+
1773
+
1774
+ # Possible changes:
1775
+ # Could add graphs / other visual content
1776
+ # Could add verification marks
1777
+
1778
+ # Sources:
1779
+ # https://huggingface.co/spaces/gradio/leaderboard
1780
+ # https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
1781
+ # https://getemoji.com/
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ datasets
3
+ pandas
4
+ huggingface_hub