diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -5,344 +5,24 @@ import re from datasets import load_dataset import gradio as gr -from huggingface_hub import HfApi, hf_hub_download +from huggingface_hub import hf_hub_download from huggingface_hub.repocard import metadata_load import pandas as pd from tqdm.autonotebook import tqdm from utils.model_size import get_model_parameters_memory +from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API -TASKS = [ - "BitextMining", - "Classification", - "Clustering", - "PairClassification", - "Reranking", - "Retrieval", - "STS", - "Summarization", -] - -TASK_LIST_BITEXT_MINING = ['BUCC (de-en)', 'BUCC (fr-en)', 'BUCC (ru-en)', 'BUCC (zh-en)', 'Tatoeba (afr-eng)', 'Tatoeba (amh-eng)', 'Tatoeba (ang-eng)', 'Tatoeba (ara-eng)', 'Tatoeba (arq-eng)', 'Tatoeba (arz-eng)', 'Tatoeba (ast-eng)', 'Tatoeba (awa-eng)', 'Tatoeba (aze-eng)', 'Tatoeba (bel-eng)', 'Tatoeba (ben-eng)', 'Tatoeba (ber-eng)', 'Tatoeba (bos-eng)', 'Tatoeba (bre-eng)', 'Tatoeba (bul-eng)', 'Tatoeba (cat-eng)', 'Tatoeba (cbk-eng)', 'Tatoeba (ceb-eng)', 'Tatoeba (ces-eng)', 'Tatoeba (cha-eng)', 'Tatoeba (cmn-eng)', 'Tatoeba (cor-eng)', 'Tatoeba (csb-eng)', 'Tatoeba (cym-eng)', 'Tatoeba (dan-eng)', 'Tatoeba (deu-eng)', 'Tatoeba (dsb-eng)', 'Tatoeba (dtp-eng)', 'Tatoeba (ell-eng)', 'Tatoeba (epo-eng)', 'Tatoeba (est-eng)', 'Tatoeba (eus-eng)', 'Tatoeba (fao-eng)', 'Tatoeba (fin-eng)', 'Tatoeba (fra-eng)', 'Tatoeba (fry-eng)', 'Tatoeba (gla-eng)', 'Tatoeba (gle-eng)', 'Tatoeba (glg-eng)', 'Tatoeba (gsw-eng)', 'Tatoeba (heb-eng)', 'Tatoeba (hin-eng)', 'Tatoeba (hrv-eng)', 'Tatoeba (hsb-eng)', 'Tatoeba (hun-eng)', 'Tatoeba (hye-eng)', 'Tatoeba (ido-eng)', 'Tatoeba (ile-eng)', 'Tatoeba (ina-eng)', 'Tatoeba (ind-eng)', 'Tatoeba (isl-eng)', 'Tatoeba (ita-eng)', 'Tatoeba (jav-eng)', 'Tatoeba (jpn-eng)', 'Tatoeba (kab-eng)', 'Tatoeba (kat-eng)', 'Tatoeba (kaz-eng)', 'Tatoeba (khm-eng)', 'Tatoeba (kor-eng)', 'Tatoeba (kur-eng)', 'Tatoeba (kzj-eng)', 'Tatoeba (lat-eng)', 'Tatoeba (lfn-eng)', 'Tatoeba (lit-eng)', 'Tatoeba (lvs-eng)', 'Tatoeba (mal-eng)', 'Tatoeba (mar-eng)', 'Tatoeba (max-eng)', 'Tatoeba (mhr-eng)', 'Tatoeba (mkd-eng)', 'Tatoeba (mon-eng)', 'Tatoeba (nds-eng)', 'Tatoeba (nld-eng)', 'Tatoeba (nno-eng)', 'Tatoeba (nob-eng)', 'Tatoeba (nov-eng)', 'Tatoeba (oci-eng)', 'Tatoeba (orv-eng)', 'Tatoeba (pam-eng)', 'Tatoeba (pes-eng)', 'Tatoeba (pms-eng)', 'Tatoeba (pol-eng)', 'Tatoeba (por-eng)', 'Tatoeba (ron-eng)', 'Tatoeba (rus-eng)', 'Tatoeba (slk-eng)', 'Tatoeba (slv-eng)', 'Tatoeba (spa-eng)', 'Tatoeba (sqi-eng)', 'Tatoeba (srp-eng)', 'Tatoeba (swe-eng)', 'Tatoeba (swg-eng)', 'Tatoeba (swh-eng)', 'Tatoeba (tam-eng)', 'Tatoeba (tat-eng)', 'Tatoeba (tel-eng)', 'Tatoeba (tgl-eng)', 'Tatoeba (tha-eng)', 'Tatoeba (tuk-eng)', 'Tatoeba (tur-eng)', 'Tatoeba (tzl-eng)', 'Tatoeba (uig-eng)', 'Tatoeba (ukr-eng)', 'Tatoeba (urd-eng)', 'Tatoeba (uzb-eng)', 'Tatoeba (vie-eng)', 'Tatoeba (war-eng)', 'Tatoeba (wuu-eng)', 'Tatoeba (xho-eng)', 'Tatoeba (yid-eng)', 'Tatoeba (yue-eng)', 'Tatoeba (zsm-eng)'] -TASK_LIST_BITEXT_MINING_DA = ["BornholmBitextMining"] - -TASK_LIST_CLASSIFICATION = [ - "AmazonCounterfactualClassification (en)", - "AmazonPolarityClassification", - "AmazonReviewsClassification (en)", - "Banking77Classification", - "EmotionClassification", - "ImdbClassification", - "MassiveIntentClassification (en)", - "MassiveScenarioClassification (en)", - "MTOPDomainClassification (en)", - "MTOPIntentClassification (en)", - "ToxicConversationsClassification", - 
"TweetSentimentExtractionClassification", -] - -TASK_LIST_CLASSIFICATION_DA = [ - "AngryTweetsClassification", - "DanishPoliticalCommentsClassification", - "DKHateClassification", - "LccSentimentClassification", - "MassiveIntentClassification (da)", - "MassiveScenarioClassification (da)", - "NordicLangClassification", - "ScalaDaClassification", -] - -TASK_LIST_CLASSIFICATION_FR = [ - "AmazonReviewsClassification (fr)", - "MasakhaNEWSClassification (fra)", - "MassiveIntentClassification (fr)", - "MassiveScenarioClassification (fr)", - "MTOPDomainClassification (fr)", - "MTOPIntentClassification (fr)", -] - -TASK_LIST_CLASSIFICATION_NB = [ - "NoRecClassification", - "NordicLangClassification", - "NorwegianParliament", - "MassiveIntentClassification (nb)", - "MassiveScenarioClassification (nb)", - "ScalaNbClassification", -] - -TASK_LIST_CLASSIFICATION_PL = [ - "AllegroReviews", - "CBD", - "MassiveIntentClassification (pl)", - "MassiveScenarioClassification (pl)", - "PAC", - "PolEmo2.0-IN", - "PolEmo2.0-OUT", -] - -TASK_LIST_CLASSIFICATION_SV = [ - "DalajClassification", - "MassiveIntentClassification (sv)", - "MassiveScenarioClassification (sv)", - "NordicLangClassification", - "ScalaSvClassification", - "SweRecClassification", -] - -TASK_LIST_CLASSIFICATION_ZH = [ - "AmazonReviewsClassification (zh)", - "IFlyTek", - "JDReview", - "MassiveIntentClassification (zh-CN)", - "MassiveScenarioClassification (zh-CN)", - "MultilingualSentiment", - "OnlineShopping", - "TNews", - "Waimai", -] - -TASK_LIST_CLASSIFICATION_OTHER = ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 
'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification (ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-TW)'] - -TASK_LIST_CLUSTERING = [ - "ArxivClusteringP2P", - "ArxivClusteringS2S", - "BiorxivClusteringP2P", - "BiorxivClusteringS2S", - "MedrxivClusteringP2P", - "MedrxivClusteringS2S", - "RedditClustering", - "RedditClusteringP2P", - "StackExchangeClustering", - "StackExchangeClusteringP2P", - "TwentyNewsgroupsClustering", -] - - -TASK_LIST_CLUSTERING_DE = [ - "BlurbsClusteringP2P", - "BlurbsClusteringS2S", - "TenKGnadClusteringP2P", - "TenKGnadClusteringS2S", -] - -TASK_LIST_CLUSTERING_FR = [ - "AlloProfClusteringP2P", - "AlloProfClusteringS2S", - "HALClusteringS2S", - "MLSUMClusteringP2P", - "MLSUMClusteringS2S", - "MasakhaNEWSClusteringP2P (fra)", - "MasakhaNEWSClusteringS2S (fra)", -] - -TASK_LIST_CLUSTERING_PL = [ - "8TagsClustering", -] - -TASK_LIST_CLUSTERING_ZH = [ - "CLSClusteringP2P", - "CLSClusteringS2S", - "ThuNewsClusteringP2P", - "ThuNewsClusteringS2S", -] - -TASK_LIST_PAIR_CLASSIFICATION = [ - "SprintDuplicateQuestions", - "TwitterSemEval2015", - "TwitterURLCorpus", -] - -TASK_LIST_PAIR_CLASSIFICATION_FR = [ - "OpusparcusPC (fr)", - "PawsX (fr)", -] - -TASK_LIST_PAIR_CLASSIFICATION_PL = [ - "CDSC-E", - "PPC", - "PSC", - "SICK-E-PL", -] - -TASK_LIST_PAIR_CLASSIFICATION_ZH = [ - "Cmnli", - "Ocnli", -] - -TASK_LIST_RERANKING = [ - "AskUbuntuDupQuestions", - "MindSmallReranking", - "SciDocsRR", - "StackOverflowDupQuestions", -] - -TASK_LIST_RERANKING_FR = [ - "AlloprofReranking", - "SyntecReranking", -] - -TASK_LIST_RERANKING_ZH = [ - "CMedQAv1", - "CMedQAv2", - "MMarcoReranking", - "T2Reranking", -] +TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"] +BOARDS_CONFIG = LEADERBOARD_CONFIG["boards"] 
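# Illustrative sketch (assumed shape, not part of this diff): the TASKS_CONFIG/BOARDS_CONFIG
# lookups above only need LEADERBOARD_CONFIG to expose a per-task "metric" and
# "task_description" plus per-board task lists; the real structure ships with the repo and is
# loaded via envs.py. The field names are inferred from the accesses later in this diff
# (["metric"], ["task_description"], board_config["tasks"]), and the values are copied from
# the dicts this diff deletes.
LEADERBOARD_CONFIG = {
    "tasks": {
        "Classification": {
            "metric": "accuracy",
            "task_description": "Classification is the task of assigning a label to a text.",
        },
        "STS": {
            "metric": "cos_sim_spearman",
            "task_description": "Semantic Textual Similarity is the task of determining how similar two texts are.",
        },
    },
    "boards": {
        "en": {"tasks": {"Classification": ["Banking77Classification"], "STS": ["STS12", "STSBenchmark"]}},
    },
}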
-TASK_LIST_RETRIEVAL = [ - "ArguAna", - "ClimateFEVER", - "CQADupstackRetrieval", - "DBPedia", - "FEVER", - "FiQA2018", - "HotpotQA", - "MSMARCO", - "NFCorpus", - "NQ", - "QuoraRetrieval", - "SCIDOCS", - "SciFact", - "Touche2020", - "TRECCOVID", -] - -TASK_LIST_RETRIEVAL_FR = [ - "AlloprofRetrieval", - "BSARDRetrieval", - "MintakaRetrieval (fr)", -# "MultiLongDocRetrieval", - "SyntecRetrieval", - "XPQARetrieval (fr)", -] - -TASK_LIST_RETRIEVAL_LAW = [ - "AILACasedocs", - "AILAStatutes", - "GerDaLIRSmall", - "LeCaRDv2", - "LegalBenchConsumerContractsQA", - "LegalBenchCorporateLobbying", - "LegalQuAD", - "LegalSummarization", -] - -TASK_LIST_RETRIEVAL_INSTRUCTIONS = [ - "Robust04InstructionRetrieval", - "News21InstructionRetrieval", - "Core17InstructionRetrieval", -] - -TASK_LIST_RETRIEVAL_PL = [ - "ArguAna-PL", - "DBPedia-PL", - "FiQA-PL", - "HotpotQA-PL", - "MSMARCO-PL", - "NFCorpus-PL", - "NQ-PL", - "Quora-PL", - "SCIDOCS-PL", - "SciFact-PL", - "TRECCOVID-PL", -] - -TASK_LIST_RETRIEVAL_ZH = [ - "CmedqaRetrieval", - "CovidRetrieval", - "DuRetrieval", - "EcomRetrieval", - "MedicalRetrieval", - "MMarcoRetrieval", - "T2Retrieval", - "VideoRetrieval", -] - -TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [ - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval" -] - -TASK_LIST_STS = [ - "BIOSSES", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS16", - "STS17 (en-en)", - "STS22 (en)", - "STSBenchmark", -] - -TASK_LIST_STS_FR = [ - "STS22 (fr)", - "STSBenchmarkMultilingualSTS (fr)", - "SICKFr", -] - -TASK_LIST_STS_PL = [ - "CDSC-R", - "SICK-R-PL", - "STS22 (pl)", -] - -TASK_LIST_STS_ZH = [ - "AFQMC", - "ATEC", - "BQ", - "LCQMC", - "PAWSX", - "QBQTC", - "STS22 (zh)", - "STSB", -] - -TASK_LIST_STS_OTHER = ["STS17 (ar-ar)", "STS17 (en-ar)", "STS17 (en-de)", "STS17 (en-tr)", "STS17 (es-en)", "STS17 (es-es)", "STS17 (fr-en)", "STS17 (it-en)", "STS17 (ko-ko)", "STS17 (nl-en)", "STS22 (ar)", "STS22 (de)", "STS22 (de-en)", "STS22 (de-fr)", "STS22 (de-pl)", "STS22 (es)", "STS22 (es-en)", "STS22 (es-it)", "STS22 (fr)", "STS22 (fr-pl)", "STS22 (it)", "STS22 (pl)", "STS22 (pl-en)", "STS22 (ru)", "STS22 (tr)", "STS22 (zh-en)", "STSBenchmark",] - -TASK_LIST_SUMMARIZATION = ["SummEval",] - -TASK_LIST_SUMMARIZATION_FR = ["SummEvalFr"] - -TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION -TASK_LIST_FR = TASK_LIST_CLASSIFICATION_FR + TASK_LIST_CLUSTERING_FR + TASK_LIST_PAIR_CLASSIFICATION_FR + TASK_LIST_RERANKING_FR + TASK_LIST_RETRIEVAL_FR + TASK_LIST_STS_FR + TASK_LIST_SUMMARIZATION_FR -TASK_LIST_PL = TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL -TASK_LIST_ZH = TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH - -TASK_TO_METRIC = { - "BitextMining": "f1", - "Clustering": "v_measure", - "Classification": "accuracy", - "PairClassification": "cos_sim_ap", - "Reranking": "map", - "Retrieval": "ndcg_at_10", - "STS": "cos_sim_spearman", - 
"Summarization": "cos_sim_spearman", - "InstructionRetrieval": "p-MRR", +TASKS = list(TASKS_CONFIG.keys()) +PRETTY_NAMES = { + "InstructionRetrieval": "Retrieval w/Instructions", + "PairClassification": "Pair Classification" } -TASK_DESCRIPTION = { - "Bitext Mining": "Bitext mining is the task of finding parallel sentences in two languages.", - "Clustering": "Clustering is the task of grouping similar documents together.", - "Classification": "Classification is the task of assigning a label to a text.", - "Pair Classification": "Pair classification is the task of determining whether two texts are similar.", - "Reranking": "Reranking is the task of reordering a list of documents to improve relevance.", - "Retrieval": "Retrieval is the task of finding relevant documents for a query.", - "STS": "Semantic Textual Similarity is the task of determining how similar two texts are.", - "Summarization": "Summarization is the task of generating a summary of a text.", - "Retrieval w/Instructions": "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions.", - "Overall": "Overall performance across MTEB tasks.", -} +TASK_TO_METRIC = {k:v["metric"] for k,v in TASKS_CONFIG.items()} def make_clickable_model(model_name, link=None): if link is None: @@ -352,984 +32,36 @@ def make_clickable_model(model_name, link=None): f'{model_name.split("/")[-1]}' ) -# Models without metadata, thus we cannot fetch their results naturally -EXTERNAL_MODELS = [ - "Baichuan-text-embedding", - "Cohere-embed-english-v3.0", - "Cohere-embed-multilingual-v3.0", - "Cohere-embed-multilingual-light-v3.0", - "DanskBERT", - "FollowIR-7B", - "GritLM-7B", - "LASER2", - "LLM2Vec-Llama-supervised", - "LLM2Vec-Llama-unsupervised", - "LLM2Vec-Mistral-supervised", - "LLM2Vec-Mistral-unsupervised", - "LLM2Vec-Sheared-Llama-supervised", - "LLM2Vec-Sheared-Llama-unsupervised", - "LaBSE", - "OpenSearch-text-hybrid", - "all-MiniLM-L12-v2", - "all-MiniLM-L6-v2", - "all-mpnet-base-v2", - "allenai-specter", - "bert-base-10lang-cased", - "bert-base-15lang-cased", - "bert-base-25lang-cased", - "bert-base-multilingual-cased", - "bert-base-multilingual-uncased", - "bert-base-swedish-cased", - "bert-base-uncased", - "bge-base-zh-v1.5", - "bge-large-en-v1.5", - "bge-large-zh-v1.5", - "bge-large-zh-noinstruct", - "bge-small-zh-v1.5", - "bm25", - "contriever-base-msmarco", - "cross-en-de-roberta-sentence-transformer", - "dfm-encoder-large-v1", - "dfm-sentence-encoder-large-1", - "distiluse-base-multilingual-cased-v2", - "e5-base", - "e5-base-v2", - "e5-large", - "e5-large-v2", - "e5-mistral-7b-instruct", - "e5-small", - "electra-small-nordic", - "electra-small-swedish-cased-discriminator", - "flan-t5-base", - "flan-t5-large", - "flaubert_base_cased", - "flaubert_base_uncased", - "flaubert_large_cased", - "gbert-base", - "gbert-large", - "gelectra-base", - "gelectra-large", - "glove.6B.300d", - "google-gecko.text-embedding-preview-0409", - "google-gecko-256.text-embedding-preview-0409", - "gottbert-base", - "gtr-t5-base", - "gtr-t5-large", - "gtr-t5-xl", - "gtr-t5-xxl", - "herbert-base-retrieval-v2", - "instructor-base", - "instructor-xl", - "komninos", - "llama-2-7b-chat", - "luotuo-bert-medium", - "m3e-base", - "m3e-large", - "mistral-7b-instruct-v0.2", - "mistral-embed", - "monobert-large-msmarco", - "monot5-3b-msmarco-10k", - "monot5-base-msmarco-10k", - "msmarco-bert-co-condensor", - "multi-qa-MiniLM-L6-cos-v1", - "multilingual-e5-base", - "multilingual-e5-large", - "multilingual-e5-small", - 
"nb-bert-base", - "nb-bert-large", - "nomic-embed-text-v1.5-64", - "nomic-embed-text-v1.5-128", - "nomic-embed-text-v1.5-256", - "nomic-embed-text-v1.5-512", - "norbert3-base", - "norbert3-large", - "paraphrase-multilingual-MiniLM-L12-v2", - "paraphrase-multilingual-mpnet-base-v2", - "sentence-bert-swedish-cased", - "sentence-camembert-base", - "sentence-camembert-large", - "sentence-croissant-llm-base", - "sentence-t5-base", - "sentence-t5-large", - "sentence-t5-xl", - "sentence-t5-xxl", - "silver-retriever-base-v1", - "sup-simcse-bert-base-uncased", - "st-polish-paraphrase-from-distilroberta", - "st-polish-paraphrase-from-mpnet", - "tart-dual-contriever-msmarco", - "tart-full-flan-t5-xl", - "text2vec-base-chinese", - "text2vec-base-multilingual", - "text2vec-large-chinese", - "text-embedding-3-small", - "text-embedding-3-large", - "text-embedding-3-large-256", - "text-embedding-ada-002", - "text-similarity-ada-001", - "text-similarity-babbage-001", - "text-similarity-curie-001", - "text-similarity-davinci-001", - "text-search-ada-doc-001", - "text-search-ada-001", - "text-search-babbage-001", - "text-search-curie-001", - "text-search-davinci-001", - "titan-embed-text-v1", - "udever-bloom-1b1", - "udever-bloom-560m", - "universal-sentence-encoder-multilingual-3", - "universal-sentence-encoder-multilingual-large-3", - "unsup-simcse-bert-base-uncased", - "use-cmlm-multilingual", - "voyage-2", - "voyage-code-2", - "voyage-large-2-instruct", - "voyage-law-2", - "voyage-lite-01-instruct", - "voyage-lite-02-instruct", - "xlm-roberta-base", - "xlm-roberta-large", -] - -EXTERNAL_MODEL_TO_LINK = { - "Baichuan-text-embedding": "https://platform.baichuan-ai.com/docs/text-Embedding", - "Cohere-embed-english-v3.0": "https://huggingface.co/Cohere/Cohere-embed-english-v3.0", - "Cohere-embed-multilingual-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-v3.0", - "Cohere-embed-multilingual-light-v3.0": "https://huggingface.co/Cohere/Cohere-embed-multilingual-light-v3.0", - "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT", - "LASER2": "https://github.com/facebookresearch/LASER", - "LLM2Vec-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised", - "LLM2Vec-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp", - "LLM2Vec-Mistral-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised", - "LLM2Vec-Mistral-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp", - "LLM2Vec-Sheared-Llama-supervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised", - "LLM2Vec-Sheared-Llama-unsupervised": "https://huggingface.co/McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp", - "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE", - "FollowIR-7B": "https://huggingface.co/jhu-clsp/FollowIR-7B", - "GritLM-7B": "https://huggingface.co/GritLM/GritLM-7B", - "OpenSearch-text-hybrid": "https://help.aliyun.com/zh/open-search/vector-search-edition/hybrid-retrieval", - "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter", - "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter", - "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2", - "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2", - "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2", - "bert-base-10lang-cased": 
"https://huggingface.co/Geotrend/bert-base-10lang-cased", - "bert-base-15lang-cased": "https://huggingface.co/Geotrend/bert-base-15lang-cased", - "bert-base-25lang-cased": "https://huggingface.co/Geotrend/bert-base-25lang-cased", - "bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased", - "bert-base-multilingual-uncased": "https://huggingface.co/google-bert/bert-base-multilingual-uncased", - "bert-base-swedish-cased": "https://huggingface.co/KB/bert-base-swedish-cased", - "bert-base-uncased": "https://huggingface.co/bert-base-uncased", - "bge-base-zh-v1.5": "https://huggingface.co/BAAI/bge-base-zh-v1.5", - "bge-large-en-v1.5": "https://huggingface.co/BAAI/bge-large-en-v1.5", - "bge-large-zh-v1.5": "https://huggingface.co/BAAI/bge-large-zh-v1.5", - "bge-large-zh-noinstruct": "https://huggingface.co/BAAI/bge-large-zh-noinstruct", - "bge-small-zh-v1.5": "https://huggingface.co/BAAI/bge-small-zh-v1.5", - "bm25": "https://en.wikipedia.org/wiki/Okapi_BM25", - "camembert-base": "https://huggingface.co/almanach/camembert-base", - "camembert-large": "https://huggingface.co/almanach/camembert-large", - "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco", - "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer", - "distilbert-base-25lang-cased": "https://huggingface.co/Geotrend/distilbert-base-25lang-cased", - "distilbert-base-en-fr-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-cased", - "distilbert-base-en-fr-es-pt-it-cased": "https://huggingface.co/Geotrend/distilbert-base-en-fr-es-pt-it-cased", - "distilbert-base-fr-cased": "https://huggingface.co/Geotrend/distilbert-base-fr-cased", - "distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased", - "distiluse-base-multilingual-cased-v2": "https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2", - "dfm-encoder-large-v1": "https://huggingface.co/chcaa/dfm-encoder-large-v1", - "dfm-sentence-encoder-large-1": "https://huggingface.co/chcaa/dfm-encoder-large-v1", - "e5-base": "https://huggingface.co/intfloat/e5-base", - "e5-base-v2": "https://huggingface.co/intfloat/e5-base-v2", - "e5-large": "https://huggingface.co/intfloat/e5-large", - "e5-large-v2": "https://huggingface.co/intfloat/e5-large-v2", - "e5-mistral-7b-instruct": "https://huggingface.co/intfloat/e5-mistral-7b-instruct", - "e5-small": "https://huggingface.co/intfloat/e5-small", - "electra-small-nordic": "https://huggingface.co/jonfd/electra-small-nordic", - "electra-small-swedish-cased-discriminator": "https://huggingface.co/KBLab/electra-small-swedish-cased-discriminator", - "flan-t5-base": "https://huggingface.co/google/flan-t5-base", - "flan-t5-large": "https://huggingface.co/google/flan-t5-large", - "flaubert_base_cased": "https://huggingface.co/flaubert/flaubert_base_cased", - "flaubert_base_uncased": "https://huggingface.co/flaubert/flaubert_base_uncased", - "flaubert_large_cased": "https://huggingface.co/flaubert/flaubert_large_cased", - "gbert-base": "https://huggingface.co/deepset/gbert-base", - "gbert-large": "https://huggingface.co/deepset/gbert-large", - "gelectra-base": "https://huggingface.co/deepset/gelectra-base", - "gelectra-large": "https://huggingface.co/deepset/gelectra-large", - "glove.6B.300d": "https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d", - "google-gecko.text-embedding-preview-0409": 
"https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#latest_models", - "google-gecko-256.text-embedding-preview-0409": "https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#latest_models", - "gottbert-base": "https://huggingface.co/uklfr/gottbert-base", - "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base", - "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large", - "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl", - "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl", - "herbert-base-retrieval-v2": "https://huggingface.co/ipipan/herbert-base-retrieval-v2", - "instructor-base": "https://huggingface.co/hkunlp/instructor-base", - "instructor-xl": "https://huggingface.co/hkunlp/instructor-xl", - "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos", - "llama-2-7b-chat": "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf", - "luotuo-bert-medium": "https://huggingface.co/silk-road/luotuo-bert-medium", - "m3e-base": "https://huggingface.co/moka-ai/m3e-base", - "m3e-large": "https://huggingface.co/moka-ai/m3e-large", - "mistral-7b-instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", - "mistral-embed": "https://docs.mistral.ai/guides/embeddings", - "monobert-large-msmarco": "https://huggingface.co/castorini/monobert-large-msmarco", - "monot5-3b-msmarco-10k": "https://huggingface.co/castorini/monot5-3b-msmarco-10k", - "monot5-base-msmarco-10k": "https://huggingface.co/castorini/monot5-base-msmarco-10k", - "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor", - "multi-qa-MiniLM-L6-cos-v1": "https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1", - "multilingual-e5-base": "https://huggingface.co/intfloat/multilingual-e5-base", - "multilingual-e5-large": "https://huggingface.co/intfloat/multilingual-e5-large", - "multilingual-e5-small": "https://huggingface.co/intfloat/multilingual-e5-small", - "nb-bert-base": "https://huggingface.co/NbAiLab/nb-bert-base", - "nb-bert-large": "https://huggingface.co/NbAiLab/nb-bert-large", - "nomic-embed-text-v1.5-64": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", - "nomic-embed-text-v1.5-128": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", - "nomic-embed-text-v1.5-256": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", - "nomic-embed-text-v1.5-512": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", - "norbert3-base": "https://huggingface.co/ltg/norbert3-base", - "norbert3-large": "https://huggingface.co/ltg/norbert3-large", - "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2", - "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - "sentence-camembert-base": "https://huggingface.co/dangvantuan/sentence-camembert-base", - "sentence-camembert-large": "https://huggingface.co/dangvantuan/sentence-camembert-large", - "sentence-croissant-llm-base": "https://huggingface.co/Wissam42/sentence-croissant-llm-base", - "sentence-bert-swedish-cased": "https://huggingface.co/KBLab/sentence-bert-swedish-cased", - "sentence-t5-base": "https://huggingface.co/sentence-transformers/sentence-t5-base", - "sentence-t5-large": "https://huggingface.co/sentence-transformers/sentence-t5-large", - "sentence-t5-xl": 
"https://huggingface.co/sentence-transformers/sentence-t5-xl", - "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl", - "silver-retriever-base-v1": "https://huggingface.co/ipipan/silver-retriever-base-v1", - "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased", - "st-polish-paraphrase-from-distilroberta": "https://huggingface.co/sdadas/st-polish-paraphrase-from-distilroberta", - "st-polish-paraphrase-from-mpnet": "https://huggingface.co/sdadas/st-polish-paraphrase-from-mpnet", - "tart-dual-contriever-msmarco": "https://huggingface.co/orionweller/tart-dual-contriever-msmarco", - "tart-full-flan-t5-xl": "https://huggingface.co/facebook/tart-full-flan-t5-xl", - "text2vec-base-chinese": "https://huggingface.co/shibing624/text2vec-base-chinese", - "text2vec-large-chinese": "https://huggingface.co/GanymedeNil/text2vec-large-chinese", - "text-embedding-3-small": "https://openai.com/blog/new-embedding-models-and-api-updates", - "text-embedding-3-large": "https://openai.com/blog/new-embedding-models-and-api-updates", - "text-embedding-3-large-256": "https://openai.com/blog/new-embedding-models-and-api-updates", - "text-embedding-ada-002": "https://openai.com/blog/new-and-improved-embedding-model", - "text-similarity-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-similarity-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-similarity-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-similarity-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-ada-doc-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-ada-query-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-ada-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-curie-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-babbage-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "text-search-davinci-001": "https://openai.com/blog/introducing-text-and-code-embeddings", - "titan-embed-text-v1": "https://docs.aws.amazon.com/bedrock/latest/userguide/embeddings.html", - "udever-bloom-1b1": "https://huggingface.co/izhx/udever-bloom-1b1", - "udever-bloom-560m": "https://huggingface.co/izhx/udever-bloom-560m", - "universal-sentence-encoder-multilingual-3": "https://huggingface.co/vprelovac/universal-sentence-encoder-multilingual-3", - "universal-sentence-encoder-multilingual-large-3": "https://huggingface.co/vprelovac/universal-sentence-encoder-multilingual-large-3", - "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased", - "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual", - "voyage-2": "https://docs.voyageai.com/embeddings/", - "voyage-code-2": "https://docs.voyageai.com/embeddings/", - "voyage-large-2-instruct": "https://docs.voyageai.com/embeddings/", - "voyage-law-2": "https://docs.voyageai.com/embeddings/", - "voyage-lite-01-instruct": "https://docs.voyageai.com/embeddings/", - "voyage-lite-02-instruct": "https://docs.voyageai.com/embeddings/", - "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base", - "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large", -} - -EXTERNAL_MODEL_TO_DIM = { - "Baichuan-text-embedding": 1024, - "Cohere-embed-english-v3.0": 1024, - 
"Cohere-embed-multilingual-v3.0": 1024, - "Cohere-embed-multilingual-light-v3.0": 384, - "DanskBERT": 768, - "FollowIR-7B": -1, - "GritLM-7B": 4096, - "LASER2": 1024, - "LLM2Vec-Llama-supervised": 4096, - "LLM2Vec-Llama-unsupervised": 4096, - "LLM2Vec-Mistral-supervised": 4096, - "LLM2Vec-Mistral-unsupervised": 4096, - "LLM2Vec-Sheared-Llama-supervised": 2048, - "LLM2Vec-Sheared-Llama-unsupervised": 2048, - "LaBSE": 768, - "all-MiniLM-L12-v2": 384, - "all-MiniLM-L6-v2": 384, - "all-mpnet-base-v2": 768, - "allenai-specter": 768, - "bert-base-10lang-cased": 768, - "bert-base-15lang-cased": 768, - "bert-base-25lang-cased": 768, - "bert-base-multilingual-cased": 768, - "bert-base-multilingual-uncased": 768, - "bert-base-swedish-cased": 768, - "bert-base-uncased": 768, - "bge-base-zh-v1.5": 768, - "bge-large-en-v1.5": 1024, - "bge-large-zh-v1.5": 1024, - "bge-large-zh-noinstruct": 1024, - "bge-small-zh-v1.5": 512, - "bm25": -1, - "camembert-base": 512, - "camembert-large": 768, - "contriever-base-msmarco": 768, - "cross-en-de-roberta-sentence-transformer": 768, - "distilbert-base-25lang-cased": 768, - "distilbert-base-en-fr-cased": 768, - "distilbert-base-en-fr-es-pt-it-cased": 768, - "distilbert-base-fr-cased": 768, - "distilbert-base-uncased": 768, - "distiluse-base-multilingual-cased-v2": 512, - "dfm-encoder-large-v1": 1024, - "dfm-sentence-encoder-large-1": 1024, - "e5-base": 768, - "e5-base-v2": 768, - "e5-large": 1024, - "e5-large-v2": 1024, - "e5-mistral-7b-instruct": 4096, - "e5-small": 384, - "electra-small-nordic": 256, - "electra-small-swedish-cased-discriminator": 256, - "flan-t5-base": -1, - "flan-t5-large": -1, - "flaubert_base_cased": 768, - "flaubert_base_uncased": 768, - "flaubert_large_cased": 1024, - "luotuo-bert-medium": 768, - "gbert-base": 768, - "gbert-large": 1024, - "gelectra-base": 768, - "gelectra-large": 1024, - "glove.6B.300d": 300, - "google-gecko.text-embedding-preview-0409": 768, - "google-gecko-256.text-embedding-preview-0409": 256, - "gottbert-base": 768, - "gtr-t5-base": 768, - "gtr-t5-large": 768, - "gtr-t5-xl": 768, - "gtr-t5-xxl": 768, - "herbert-base-retrieval-v2": 768, - "instructor-base": 768, - "instructor-xl": 768, - "komninos": 300, - "llama-2-7b-chat": -1, - "m3e-base": 768, - "m3e-large": 768, - "mistral-7b-instruct-v0.2": -1, - "mistral-embed": 1024, - "monobert-large-msmarco": -1, - "monot5-3b-msmarco-10k": -1, - "monot5-base-msmarco-10k": -1, - "msmarco-bert-co-condensor": 768, - "multi-qa-MiniLM-L6-cos-v1": 384, - "multilingual-e5-base": 768, - "multilingual-e5-small": 384, - "multilingual-e5-large": 1024, - "nb-bert-base": 768, - "nb-bert-large": 1024, - "nomic-embed-text-v1.5-64": 64, - "nomic-embed-text-v1.5-128": 128, - "nomic-embed-text-v1.5-256": 256, - "nomic-embed-text-v1.5-512": 512, - "norbert3-base": 768, - "norbert3-large": 1024, - "OpenSearch-text-hybrid": 1792, - "paraphrase-multilingual-MiniLM-L12-v2": 384, - "paraphrase-multilingual-mpnet-base-v2": 768, - "sentence-camembert-base": 768, - "sentence-camembert-large": 1024, - "sentence-croissant-llm-base": 2048, - "sentence-bert-swedish-cased": 768, - "sentence-t5-base": 768, - "sentence-t5-large": 768, - "sentence-t5-xl": 768, - "sentence-t5-xxl": 768, - "silver-retriever-base-v1": 768, - "sup-simcse-bert-base-uncased": 768, - "st-polish-paraphrase-from-distilroberta": 768, - "st-polish-paraphrase-from-mpnet": 768, - "tart-dual-contriever-msmarco": 768, - "tart-full-flan-t5-xl": -1, - "text2vec-base-chinese": 768, - "text2vec-large-chinese": 1024, - "text-embedding-3-large": 3072, 
- "text-embedding-3-large-256": 256, - "text-embedding-3-small": 1536, - "text-embedding-ada-002": 1536, - "text-similarity-ada-001": 1024, - "text-similarity-babbage-001": 2048, - "text-similarity-curie-001": 4096, - "text-similarity-davinci-001": 12288, - "text-search-ada-doc-001": 1024, - "text-search-ada-query-001": 1024, - "text-search-ada-001": 1024, - "text-search-babbage-001": 2048, - "text-search-curie-001": 4096, - "text-search-davinci-001": 12288, - "titan-embed-text-v1": 1536, - "udever-bloom-1b1": 1536, - "udever-bloom-560m": 1024, - "universal-sentence-encoder-multilingual-3": 512, - "universal-sentence-encoder-multilingual-large-3": 512, - "unsup-simcse-bert-base-uncased": 768, - "use-cmlm-multilingual": 768, - "voyage-2": 1024, - "voyage-code-2": 1536, - "voyage-large-2-instruct": 1536, - "voyage-law-2": 1024, - "voyage-lite-01-instruct": 1024, - "voyage-lite-02-instruct": 1024, - "xlm-roberta-base": 768, - "xlm-roberta-large": 1024, -} - -EXTERNAL_MODEL_TO_SEQLEN = { - "Baichuan-text-embedding": 512, - "Cohere-embed-english-v3.0": 512, - "Cohere-embed-multilingual-v3.0": 512, - "Cohere-embed-multilingual-light-v3.0": 512, - "DanskBERT": 514, - "FollowIR-7B": 32768, - "GritLM-7B": 32768, - "LASER2": "N/A", - "LLM2Vec-Llama-supervised": 4096, - "LLM2Vec-Llama-unsupervised": 4096, - "LLM2Vec-Mistral-supervised": 32768, - "LLM2Vec-Mistral-unsupervised": 32768, - "LLM2Vec-Sheared-Llama-supervised": 4096, - "LLM2Vec-Sheared-Llama-unsupervised": 4096, - "LaBSE": 512, - "all-MiniLM-L12-v2": 512, - "all-MiniLM-L6-v2": 512, - "all-mpnet-base-v2": 514, - "allenai-specter": 512, - "bert-base-10lang-cased": 512, - "bert-base-15lang-cased": 512, - "bert-base-25lang-cased": 512, - "bert-base-multilingual-cased": 512, - "bert-base-multilingual-uncased": 512, - "bert-base-swedish-cased": 512, - "bert-base-uncased": 512, - "bge-base-zh-v1.5": 512, - "bge-large-en-v1.5": 512, - "bge-large-zh-v1.5": 512, - "bge-large-zh-noinstruct": 512, - "bge-small-zh-v1.5": 512, - "camembert-base": 512, - "camembert-large": 512, - "contriever-base-msmarco": 512, - "cross-en-de-roberta-sentence-transformer": 514, - "distilbert-base-25lang-cased": 512, - "distilbert-base-en-fr-cased": 512, - "distilbert-base-en-fr-es-pt-it-cased": 512, - "distilbert-base-fr-cased": 512, - "distilbert-base-uncased": 512, - "dfm-encoder-large-v1": 512, - "dfm-sentence-encoder-large-1": 512, - "distiluse-base-multilingual-cased-v2": 512, - "e5-base": 512, - "e5-base-v2": 512, - "e5-large": 512, - "e5-large-v2": 512, - "e5-mistral-7b-instruct": 32768, - "e5-small": 512, - "electra-small-nordic": 512, - "electra-small-swedish-cased-discriminator": 512, - "flan-t5-base": 512, - "flan-t5-large": 512, - "flaubert_base_cased": 512, - "flaubert_base_uncased": 512, - "flaubert_large_cased": 512, - "gbert-base": 512, - "gbert-large": 512, - "gelectra-base": 512, - "gelectra-large": 512, - "google-gecko.text-embedding-preview-0409": 2048, - "google-gecko-256.text-embedding-preview-0409": 2048, - "gottbert-base": 512, - "glove.6B.300d": "N/A", - "gtr-t5-base": 512, - "gtr-t5-large": 512, - "gtr-t5-xl": 512, - "gtr-t5-xxl": 512, - "herbert-base-retrieval-v2": 514, - "instructor-base": 512, - "instructor-xl": 512, - "komninos": "N/A", - "llama-2-7b-chat": 4096, - "luotuo-bert-medium": 512, - "m3e-base": 512, - "m3e-large": 512, - "mistral-7b-instruct-v0.2": 32768, -# "mistral-embed": "?", - "monobert-large-msmarco": 512, - "monot5-3b-msmarco-10k": 512, - "monot5-base-msmarco-10k": 512, - "msmarco-bert-co-condensor": 512, - 
"multi-qa-MiniLM-L6-cos-v1": 512, - "multilingual-e5-base": 514, - "multilingual-e5-large": 514, - "multilingual-e5-small": 512, - "nb-bert-base": 512, - "nb-bert-large": 512, - "nomic-embed-text-v1.5-64": 8192, - "nomic-embed-text-v1.5-128": 8192, - "nomic-embed-text-v1.5-256": 8192, - "nomic-embed-text-v1.5-512": 8192, - "norbert3-base": 512, - "norbert3-large": 512, - "OpenSearch-text-hybrid": 512, - "paraphrase-multilingual-MiniLM-L12-v2": 512, - "paraphrase-multilingual-mpnet-base-v2": 514, - "sentence-camembert-base": 512, - "sentence-camembert-large": 512, - "sentence-croissant-llm-base": 2048, - "sentence-bert-swedish-cased": 512, - "sentence-t5-base": 512, - "sentence-t5-large": 512, - "sentence-t5-xl": 512, - "sentence-t5-xxl": 512, - "silver-retriever-base-v1": 514, - "sup-simcse-bert-base-uncased": 512, - "st-polish-paraphrase-from-distilroberta": 514, - "st-polish-paraphrase-from-mpnet": 514, - "tart-dual-contriever-msmarco": 512, - "tart-full-flan-t5-xl": 512, - "text2vec-base-chinese": 512, - "text2vec-large-chinese": 512, - "text-embedding-3-large": 8191, - "text-embedding-3-large-256": 8191, - "text-embedding-3-small": 8191, - "text-embedding-ada-002": 8191, - "text-similarity-ada-001": 2046, - "text-similarity-babbage-001": 2046, - "text-similarity-curie-001": 2046, - "text-similarity-davinci-001": 2046, - "text-search-ada-doc-001": 2046, - "text-search-ada-query-001": 2046, - "text-search-ada-001": 2046, - "text-search-babbage-001": 2046, - "text-search-curie-001": 2046, - "text-search-davinci-001": 2046, - "titan-embed-text-v1": 8000, - "udever-bloom-1b1": 2048, - "udever-bloom-560m": 2048, - "universal-sentence-encoder-multilingual-3": 512, - "universal-sentence-encoder-multilingual-large-3": 512, - "use-cmlm-multilingual": 512, - "unsup-simcse-bert-base-uncased": 512, - "voyage-2": 1024, - "voyage-code-2": 16000, - "voyage-large-2-instruct": 16000, - "voyage-law-2": 4000, - "voyage-lite-01-instruct": 4000, - "voyage-lite-02-instruct": 4000, - "xlm-roberta-base": 514, - "xlm-roberta-large": 514, -} - -EXTERNAL_MODEL_TO_SIZE = { - "DanskBERT": 125, - "FollowIR-7B": 7242, - "GritLM-7B": 7242, - "LASER2": 43, - "LLM2Vec-Llama-supervised": 6607, - "LLM2Vec-Llama-unsupervised": 6607, - "LLM2Vec-Mistral-supervised": 7111, - "LLM2Vec-Mistral-unsupervised": 7111, - "LLM2Vec-Sheared-Llama-supervised": 1280, - "LLM2Vec-Sheared-Llama-unsupervised": 1280, - "LaBSE": 471, - "allenai-specter": 110, - "all-MiniLM-L12-v2": 33, - "all-MiniLM-L6-v2": 23, - "all-mpnet-base-v2": 110, - "bert-base-10lang-cased": 138, - "bert-base-15lang-cased": 138, - "bert-base-25lang-cased": 138, - "bert-base-multilingual-cased": 179, - "bert-base-multilingual-uncased": 168, - "bert-base-uncased": 110, - "bert-base-swedish-cased": 125, - "bge-base-zh-v1.5": 102, - "bge-large-zh-v1.5": 326, - "bge-large-zh-noinstruct": 326, - "bge-small-zh-v1.5": 24, - "bm25": 0, - "camembert-base": 111, - "camembert-large": 338, - "cross-en-de-roberta-sentence-transformer": 278, - "contriever-base-msmarco": 110, - "distilbert-base-25lang-cased": 110, - "distilbert-base-en-fr-cased": 110, - "distilbert-base-en-fr-es-pt-it-cased": 110, - "distilbert-base-fr-cased": 110, - "distilbert-base-uncased": 110, - "distiluse-base-multilingual-cased-v2": 135, - "dfm-encoder-large-v1": 355, - "dfm-sentence-encoder-large-1": 355, - "e5-base": 110, - "e5-base-v2": 110, - "e5-large": 335, - "e5-large-v2": 335, - "e5-mistral-7b-instruct": 7111, - "e5-small": 33, - "electra-small-nordic": 23, - "electra-small-swedish-cased-discriminator": 
16, - "flan-t5-base": 220, - "flan-t5-large": 770, - "flaubert_base_cased": 138, - "flaubert_base_uncased": 138, - "flaubert_large_cased": 372, - "gbert-base": 110, - "gbert-large": 337, - "gelectra-base": 110, - "gelectra-large": 335, - "glove.6B.300d": 120, - "google-gecko.text-embedding-preview-0409": 1200, - "google-gecko-256.text-embedding-preview-0409": 1200, - "gottbert-base": 127, - "gtr-t5-base": 110, - "gtr-t5-large": 168, - "gtr-t5-xl": 1240, - "gtr-t5-xxl": 4865, - "herbert-base-retrieval-v2": 125, - "instructor-base": 110, - "instructor-xl": 1241, - "komninos": 134, - "llama-2-7b-chat": 7000, - "luotuo-bert-medium": 328, - "m3e-base": 102, - "m3e-large": 102, - "mistral-7b-instruct-v0.2": 7111, - "msmarco-bert-co-condensor": 110, - "monobert-large-msmarco": 335, - "monot5-3b-msmarco-10k": 2480, - "monot5-base-msmarco-10k": 220, - "multi-qa-MiniLM-L6-cos-v1": 23, - "multilingual-e5-base": 278, - "multilingual-e5-small": 118, - "multilingual-e5-large": 560, - "nb-bert-base": 179, - "nb-bert-large": 355, - "nomic-embed-text-v1.5-64": 138, - "nomic-embed-text-v1.5-128": 138, - "nomic-embed-text-v1.5-256": 138, - "nomic-embed-text-v1.5-512": 138, - "norbert3-base": 131, - "norbert3-large": 368, - "paraphrase-multilingual-mpnet-base-v2": 278, - "paraphrase-multilingual-MiniLM-L12-v2": 118, - "sentence-camembert-base": 110, - "sentence-camembert-large": 337, - "sentence-croissant-llm-base": 1280, - "sentence-bert-swedish-cased": 125, - "sentence-t5-base": 110, - "sentence-t5-large": 168, - "sentence-t5-xl": 1240, - "sentence-t5-xxl": 4865, - "silver-retriever-base-v1": 125, - "sup-simcse-bert-base-uncased": 110, - "st-polish-paraphrase-from-distilroberta": 125, - "st-polish-paraphrase-from-mpnet": 125, - "tart-dual-contriever-msmarco": 110, - "tart-full-flan-t5-xl": 2480, - "text2vec-base-chinese": 102, - "text2vec-large-chinese": 326, - "unsup-simcse-bert-base-uncased": 110, - "use-cmlm-multilingual": 472, - #"voyage-law-2": 1220, - "voyage-lite-02-instruct": 1220, - "xlm-roberta-base": 279, - "xlm-roberta-large": 560, -} - -PROPRIETARY_MODELS = { - "Baichuan-text-embedding", - "Cohere-embed-english-v3.0", - "Cohere-embed-multilingual-v3.0", - "Cohere-embed-multilingual-light-v3.0", - "OpenSearch-text-hybrid", - "mistral-embed", - "text-embedding-3-small", - "text-embedding-3-large", - "text-embedding-3-large-256", - "text-embedding-ada-002", - "text-similarity-ada-001", - "text-similarity-babbage-001", - "text-similarity-curie-001", - "text-similarity-davinci-001", - "text-search-ada-doc-001", - "text-search-ada-query-001", - "text-search-ada-001", - "text-search-curie-001", - "text-search-babbage-001", - "text-search-davinci-001", - "titan-embed-text-v1", - "voyage-2", - "voyage-code-2", - "voyage-law-2", - "voyage-lite-01-instruct", - "voyage-lite-02-instruct", - "google-gecko.text-embedding-preview-0409", - "google-gecko-256.text-embedding-preview-0409", -} +EXTERNAL_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_external", False)} +EXTERNAL_MODEL_TO_LINK = {k: v["link"] for k,v in MODEL_META["model_meta"].items() if v.get("link", False)} +EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k,v in MODEL_META["model_meta"].items() if v.get("dim", False)} +EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k,v in MODEL_META["model_meta"].items() if v.get("seq_len", False)} +EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k,v in MODEL_META["model_meta"].items() if v.get("size", False)} +PROPRIETARY_MODELS = {k for k,v in MODEL_META["model_meta"].items() if 
v.get("is_proprietary", False)} +TASK_DESCRIPTIONS = {k: v["task_description"] for k,v in TASKS_CONFIG.items()} +TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks." +SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)} +MODELS_TO_SKIP = MODEL_META["models_to_skip"] +CROSS_ENCODERS = MODEL_META["cross_encoders"] PROPRIETARY_MODELS = { - make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard")) + make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}")) for model in PROPRIETARY_MODELS } SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = { - "allenai-specter", - "allenai-specter", - "all-MiniLM-L12-v2", - "all-MiniLM-L6-v2", - "all-mpnet-base-v2", - "bert-base-10lang-cased", - "bert-base-15lang-cased", - "bert-base-25lang-cased", - "bert-base-multilingual-cased", - "bert-base-multilingual-uncased", - "bert-base-swedish-cased", - "bert-base-uncased", - "bge-base-zh-v1.5", - "bge-large-zh-v1.5", - "bge-large-zh-noinstruct", - "bge-small-zh-v1.5", - "camembert-base", - "camembert-large", - "contriever-base-msmarco", - "cross-en-de-roberta-sentence-transformer", - "DanskBERT", - "distilbert-base-25lang-cased", - "distilbert-base-en-fr-cased", - "distilbert-base-en-fr-es-pt-it-cased", - "distilbert-base-fr-cased", - "distilbert-base-uncased", - "distiluse-base-multilingual-cased-v2", - "dfm-encoder-large-v1", - "dfm-sentence-encoder-large-1", - "e5-base", - "e5-base-v2", - "e5-large", - "e5-large-v2", - "e5-mistral-7b-instruct", - "e5-small", - "electra-small-nordic", - "electra-small-swedish-cased-discriminator", - "flaubert_base_cased", - "flaubert_base_uncased", - "flaubert_large_cased", - "gbert-base", - "gbert-large", - "gelectra-base", - "gelectra-large", - "glove.6B.300d", - "gottbert-base", - "gtr-t5-base", - "gtr-t5-large", - "gtr-t5-xl", - "gtr-t5-xxl", - "herbert-base-retrieval-v2", - "komninos", - "luotuo-bert-medium", - "LaBSE", - "m3e-base", - "m3e-large", - "msmarco-bert-co-condensor", - "multi-qa-MiniLM-L6-cos-v1", - "multilingual-e5-base", - "multilingual-e5-large", - "multilingual-e5-small", - "nb-bert-base", - "nb-bert-large", - "nomic-embed-text-v1.5-64", - "nomic-embed-text-v1.5-128", - "nomic-embed-text-v1.5-256", - "nomic-embed-text-v1.5-512", - "norbert3-base", - "norbert3-large", - "paraphrase-multilingual-mpnet-base-v2", - "paraphrase-multilingual-MiniLM-L12-v2", - "sentence-camembert-base", - "sentence-camembert-large", - "sentence-croissant-llm-base", - "sentence-bert-swedish-cased", - "sentence-t5-base", - "sentence-t5-large", - "sentence-t5-xl", - "sentence-t5-xxl", - "silver-retriever-base-v1", - "sup-simcse-bert-base-uncased", - "st-polish-paraphrase-from-distilroberta", - "st-polish-paraphrase-from-mpnet", - "tart-dual-contriever-msmarco", - "text2vec-base-chinese", - "text2vec-large-chinese", - "udever-bloom-1b1", - "udever-bloom-560m", - "universal-sentence-encoder-multilingual-3", - "universal-sentence-encoder-multilingual-large-3", - "unsup-simcse-bert-base-uncased", - "use-cmlm-multilingual", - "xlm-roberta-base", - "xlm-roberta-large", -} -SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = { - make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard")) + make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}")) for model in 
SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS } - CROSS_ENCODERS = { - "FollowIR-7B", - "flan-t5-base", - "flan-t5-large", - "monobert-large-msmarco", - "monot5-3b-msmarco-10k", - "monot5-base-msmarco-10k", -} - -MODELS_TO_SKIP = { - "baseplate/instructor-large-1", # Duplicate - "radames/e5-large", # Duplicate - "gentlebowl/instructor-large-safetensors", # Duplicate - "Consensus/instructor-base", # Duplicate - "GovCompete/instructor-xl", # Duplicate - "GovCompete/e5-large-v2", # Duplicate - "t12e/instructor-base", # Duplicate - "michaelfeil/ct2fast-e5-large-v2", - "michaelfeil/ct2fast-e5-large", - "michaelfeil/ct2fast-e5-small-v2", - "newsrx/instructor-xl-newsrx", - "newsrx/instructor-large-newsrx", - "fresha/e5-large-v2-endpoint", - "ggrn/e5-small-v2", - "michaelfeil/ct2fast-e5-small", - "jncraton/e5-small-v2-ct2-int8", - "anttip/ct2fast-e5-small-v2-hfie", - "newsrx/instructor-large", - "newsrx/instructor-xl", - "dmlls/all-mpnet-base-v2", - "cgldo/semanticClone", - "Malmuk1/e5-large-v2_Sharded", - "jncraton/gte-small-ct2-int8", - "Einas/einas_ashkar", - "gruber/e5-small-v2-ggml", - "jncraton/bge-small-en-ct2-int8", - "vectoriseai/bge-small-en", - "recipe/embeddings", - "dhairya0907/thenlper-get-large", - "Narsil/bge-base-en", - "kozistr/fused-large-en", - "sionic-ai/sionic-ai-v2", # Wait for https://huggingface.co/sionic-ai/sionic-ai-v2/discussions/1 - "sionic-ai/sionic-ai-v1", # Wait for https://huggingface.co/sionic-ai/sionic-ai-v2/discussions/1 - "BAAI/bge-large-en", # Deprecated in favor of v1.5 - "BAAI/bge-base-en", # Deprecated in favor of v1.5 - "BAAI/bge-small-en", # Deprecated in favor of v1.5 - "d0rj/e5-large-en-ru", - "d0rj/e5-base-en-ru", - "d0rj/e5-small-en-ru", - "aident-ai/bge-base-en-onnx", - "barisaydin/bge-base-en", - "barisaydin/gte-large", - "barisaydin/gte-base", - "barisaydin/gte-small", - "barisaydin/bge-small-en", - "odunola/e5-base-v2", - "goldenrooster/multilingual-e5-large", - "davidpeer/gte-small", - "barisaydin/bge-large-en", - "jamesgpt1/english-large-v1", - "vectoriseai/bge-large-en-v1.5", - "vectoriseai/bge-base-en-v1.5", - "vectoriseai/instructor-large", - "vectoriseai/instructor-base", - "vectoriseai/gte-large", - "vectoriseai/gte-base", - "vectoriseai/e5-large-v2", - "vectoriseai/bge-small-en-v1.5", - "vectoriseai/e5-base-v2", - "vectoriseai/e5-large", - "vectoriseai/multilingual-e5-large", - "vectoriseai/gte-small", - "vectoriseai/ember-v1", - "vectoriseai/e5-base", - "vectoriseai/e5-small-v2", - "michaelfeil/ct2fast-bge-large-en-v1.5", - "michaelfeil/ct2fast-bge-large-en-v1.5", - "michaelfeil/ct2fast-bge-base-en-v1.5", - "michaelfeil/ct2fast-gte-large", - "michaelfeil/ct2fast-gte-base", - "michaelfeil/ct2fast-bge-small-en-v1.5", - "rizki/bgr-tf", - "ef-zulla/e5-multi-sml-torch", - "cherubhao/yogamodel", - "morgendigital/multilingual-e5-large-quantized", - "jncraton/gte-tiny-ct2-int8", - "Research2NLP/electrical_stella", - "Intel/bge-base-en-v1.5-sts-int8-static", - "Intel/bge-base-en-v1.5-sts-int8-dynamic", - "Intel/bge-base-en-v1.5-sst2", - "Intel/bge-base-en-v1.5-sst2-int8-static", - "Intel/bge-base-en-v1.5-sst2-int8-dynamic", - "Intel/bge-small-en-v1.5-sst2", - "Intel/bge-small-en-v1.5-sst2-int8-dynamic", - "Intel/bge-small-en-v1.5-sst2-int8-static", - "binqiangliu/EmbeddingModlebgelargeENv1.5", - "DecisionOptimizationSystem/DeepFeatEmbeddingLargeContext", - "woody72/multilingual-e5-base", - "Severian/embed", - "Frazic/udever-bloom-3b-sentence", - "jamesgpt1/zzz", - "karrar-alwaili/UAE-Large-V1", - "odunola/UAE-Large-VI", - "shubham-bgi/UAE-Large", - 
"retrainai/instructor-xl", - "weakit-v/bge-base-en-v1.5-onnx", - "ieasybooks/multilingual-e5-large-onnx", - "gizmo-ai/Cohere-embed-multilingual-v3.0", - "jingyeom/korean_embedding_model", - "barisaydin/text2vec-base-multilingual", - "mlx-community/multilingual-e5-large-mlx", - "mlx-community/multilingual-e5-base-mlx", - "mlx-community/multilingual-e5-small-mlx", - "maiyad/multilingual-e5-small", - "khoa-klaytn/bge-base-en-v1.5-angle", - "khoa-klaytn/bge-small-en-v1.5-angle", - "mixamrepijey/instructor-small", - "mixamrepijey/instructor-models", - "lsf1000/bge-evaluation", # Empty - "giulio98/placeholder", # Empty - "Severian/nomic", # Copy - "atian-chapters/Chapters-SFR-Embedding-Mistral", # Copy - "rlsChapters/Chapters-SFR-Embedding-Mistral", # Copy - "TitanML/jina-v2-base-en-embed", # Copy - "MaziyarPanahi/GritLM-8x7B-GGUF", # GGUF variant - "Geolumina/instructor-xl", # Duplicate - "krilecy/e5-mistral-7b-instruct", - "beademiguelperez/sentence-transformers-multilingual-e5-small", - "arcdev/SFR-Embedding-Mistral", - "arcdev/e5-mistral-7b-instruct", - "Koat/gte-tiny", - "SmartComponents/bge-micro-v2", - "ildodeltaRule/multilingual-e5-large", - "hsikchi/dump", - "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised", - "McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-supervised", - "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-supervised", - "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse", - "McGill-NLP/LLM2Vec-Llama-2-7b-chat-hf-mntp-unsup-simcse", - "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse", - "jncraton/GIST-small-Embedding-v0-ct2-int8", - "jncraton/stella-base-en-v2-ct2-int8", - "lightbird-ai/nomic", - "jamesdborin/jina-v2-base-en-embed", - "iampanda/Test", + make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}")) + for model in CROSS_ENCODERS } +TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS} +for board_config in BOARDS_CONFIG.values(): + for task_category, task_list in board_config["tasks"].items(): + TASK_TO_TASK_TYPE[task_category].extend(task_list) def add_lang(examples): if not(examples["eval_language"]): @@ -1342,24 +74,14 @@ def norm(names): return set([name.split(" ")[0] for name in names]) def add_task(examples): # Could be added to the dataset loading script instead - if examples["mteb_dataset_name"] in norm(TASK_LIST_CLASSIFICATION + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_FR + TASK_LIST_CLASSIFICATION_NB + TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_ZH): - examples["mteb_task"] = "Classification" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE + TASK_LIST_CLUSTERING_FR + TASK_LIST_CLUSTERING_PL + TASK_LIST_CLUSTERING_ZH): - examples["mteb_task"] = "Clustering" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_PAIR_CLASSIFICATION_FR + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_PAIR_CLASSIFICATION_ZH): - examples["mteb_task"] = "PairClassification" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_RERANKING + TASK_LIST_RERANKING_FR + TASK_LIST_RERANKING_ZH): - examples["mteb_task"] = "Reranking" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_FR + TASK_LIST_RETRIEVAL_PL + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_RETRIEVAL_LAW): - examples["mteb_task"] = "Retrieval" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_STS + TASK_LIST_STS_FR + TASK_LIST_STS_PL + TASK_LIST_STS_ZH): - examples["mteb_task"] = 
"STS" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_SUMMARIZATION + TASK_LIST_SUMMARIZATION_FR): - examples["mteb_task"] = "Summarization" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_BITEXT_MINING + TASK_LIST_BITEXT_MINING_DA): - examples["mteb_task"] = "BitextMining" - elif examples["mteb_dataset_name"] in norm(TASK_LIST_RETRIEVAL_INSTRUCTIONS): - examples["mteb_task"] = "InstructionRetrieval" + task_name = examples["mteb_dataset_name"] + task_type = None + for task_category, task_list in TASK_TO_TASK_TYPE.items(): + if task_name in norm(task_list): + task_type = task_category + break + if task_type is not None: + examples["mteb_task"] = task_type else: print("WARNING: Task not found for dataset", examples["mteb_dataset_name"]) examples["mteb_task"] = "Unknown" @@ -1381,12 +103,12 @@ else: pbar = tqdm(models_to_run, desc="Fetching external model results") for model in pbar: pbar.set_description(f"Fetching external model results for {model!r}") - ds = load_dataset("mteb/results", model, trust_remote_code=True) + ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True) # For local debugging: #, download_mode='force_redownload', verification_mode="no_checks") ds = ds.map(add_lang) ds = ds.map(add_task) - base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))} + base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))} # For now only one metric per task - Could add more metrics lateron for task, metric in TASK_TO_METRIC.items(): ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict() @@ -1440,8 +162,15 @@ def add_rank(df): df.fillna("", inplace=True) return df -def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True): - api = HfApi() +model_infos_path = "model_infos.json" +MODEL_INFOS = {} +if os.path.exists(model_infos_path): + with open(model_infos_path) as f: + MODEL_INFOS = json.load(f) + +def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True, refresh=True): + global MODEL_INFOS + api = API models = api.list_models(filter="mteb") # Initialize list to models that we cannot fetch metadata from df_list = [] @@ -1469,12 +198,17 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_ res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "") res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "") df_list.append(res) - + for model in models: if model.modelId in MODELS_TO_SKIP: continue - print("MODEL", model) - readme_path = hf_hub_download(model.modelId, filename="README.md") - meta = metadata_load(readme_path) + print("MODEL", model.modelId) + if model.modelId not in MODEL_INFOS or refresh: + readme_path = hf_hub_download(model.modelId, filename="README.md") + meta = metadata_load(readme_path) + MODEL_INFOS[model.modelId] = { + "metadata": meta + } + meta = MODEL_INFOS[model.modelId]["metadata"] if "model-index" not in meta: continue # meta['model-index'][0]["results"] is list of elements like: @@ -1506,325 +240,107 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_ if add_emb_dim: try: # Fails on gated repos, so we only include scores for them - out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], 
out["Memory Usage (GB, fp32)"] = get_dim_seq_size(model) + if "dim_seq_size" not in MODEL_INFOS[model.modelId] or refresh: + MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model)) + out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"]) except: - pass + MODEL_INFOS[model.modelId]["dim_seq_size"] = "", "", "", "" df_list.append(out) if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model.siblings}: SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"]) + + # Save & cache MODEL_INFOS + with open("model_infos.json", "w") as f: + json.dump(MODEL_INFOS, f) + df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one - df = df.groupby("Model", as_index=False).first() + df = df.groupby("Model", as_index=False).first() # Put 'Model' column first cols = sorted(list(df.columns)) - cols.insert(0, cols.pop(cols.index("Model"))) + base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"] + if len(datasets) > 0: + #filter invalid columns + cols = [col for col in cols if col in base_columns + datasets] + i = 0 + for column in base_columns: + if column in cols: + cols.insert(i, cols.pop(cols.index(column))) + i += 1 df = df[cols] if rank: df = add_rank(df) if fillna: df.fillna("", inplace=True) - - if "instruction" in task.lower(): - df["Model"] = df.Model.apply(lambda x: "โŽ" + x if x.split(">")[1].split("<")[0] in CROSS_ENCODERS else x) return df -def get_mteb_average(): - global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION +# Get dict with a task list for each task category +# E.g. 
{"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]} +def get_mteb_average(task_dict: dict, refresh=True): + all_tasks = reduce(lambda x, y: x + y, task_dict.values()) DATA_OVERALL = get_mteb_data( - tasks=[ - "Classification", - "Clustering", - "PairClassification", - "Reranking", - "Retrieval", - "STS", - "Summarization", - ], - datasets=TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION, + tasks=list(task_dict.keys()), + datasets=all_tasks, fillna=False, add_emb_dim=True, rank=False, + refresh=refresh ) # Debugging: # DATA_OVERALL.to_csv("overall.csv") - DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False)) - DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False)) - DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True) + DATA_OVERALL.insert(1, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False)) + for i, (task_category, task_category_list) in enumerate(task_dict.items()): + DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False)) + DATA_OVERALL.sort_values(f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True) # Start ranking from 1 DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1))) DATA_OVERALL = DATA_OVERALL.round(2) - DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLASSIFICATION]) - # Only keep rows with at least one score in addition to the "Model" & rank column - DATA_CLASSIFICATION_EN = DATA_CLASSIFICATION_EN[DATA_CLASSIFICATION_EN.iloc[:, 4:].ne("").any(axis=1)] - - DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLUSTERING]) - DATA_CLUSTERING = DATA_CLUSTERING[DATA_CLUSTERING.iloc[:, 4:].ne("").any(axis=1)] - - DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_PAIR_CLASSIFICATION]) - DATA_PAIR_CLASSIFICATION = DATA_PAIR_CLASSIFICATION[DATA_PAIR_CLASSIFICATION.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RERANKING = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage 
(GB, fp32)"] + TASK_LIST_RERANKING]) - DATA_RERANKING = DATA_RERANKING[DATA_RERANKING.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RETRIEVAL]) - DATA_RETRIEVAL = DATA_RETRIEVAL[DATA_RETRIEVAL.iloc[:, 4:].ne("").any(axis=1)] - - DATA_STS_EN = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_STS]) - DATA_STS_EN = DATA_STS_EN[DATA_STS_EN.iloc[:, 4:].ne("").any(axis=1)] - - DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_SUMMARIZATION]) - DATA_SUMMARIZATION = DATA_SUMMARIZATION[DATA_SUMMARIZATION.iloc[:, 1:].ne("").any(axis=1)] + DATA_TASKS = {} + for task_category, task_category_list in task_dict.items(): + DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list]) + DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)] # Fill NaN after averaging DATA_OVERALL.fillna("", inplace=True) - DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]] - DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)] + data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(all_tasks)} datasets)"] + for task_category, task_category_list in task_dict.items(): + data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)") - return DATA_OVERALL - -def get_mteb_average_zh(): - global DATA_OVERALL_ZH, DATA_CLASSIFICATION_ZH, DATA_CLUSTERING_ZH, DATA_PAIR_CLASSIFICATION_ZH, DATA_RERANKING_ZH, DATA_RETRIEVAL_ZH, DATA_STS_ZH - DATA_OVERALL_ZH = get_mteb_data( - tasks=[ - "Classification", - "Clustering", - "PairClassification", - "Reranking", - "Retrieval", - "STS", - ], - datasets=TASK_LIST_CLASSIFICATION_ZH + TASK_LIST_CLUSTERING_ZH + TASK_LIST_PAIR_CLASSIFICATION_ZH + TASK_LIST_RERANKING_ZH + TASK_LIST_RETRIEVAL_ZH + TASK_LIST_STS_ZH, - fillna=False, - add_emb_dim=True, - rank=False, - ) - # Debugging: - # DATA_OVERALL_ZH.to_csv("overall.csv") - - DATA_OVERALL_ZH.insert(1, f"Average ({len(TASK_LIST_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLASSIFICATION_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_CLUSTERING_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_PAIR_CLASSIFICATION_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(5, 
f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RERANKING_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_RETRIEVAL_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.insert(7, f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)", DATA_OVERALL_ZH[TASK_LIST_STS_ZH].mean(axis=1, skipna=False)) - DATA_OVERALL_ZH.sort_values(f"Average ({len(TASK_LIST_ZH)} datasets)", ascending=False, inplace=True) - # Start ranking from 1 - DATA_OVERALL_ZH.insert(0, "Rank", list(range(1, len(DATA_OVERALL_ZH) + 1))) - - DATA_OVERALL_ZH = DATA_OVERALL_ZH.round(2) - - DATA_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLASSIFICATION_ZH]) - # Only keep rows with at least one score in addition to the "Model" & rank column - DATA_CLASSIFICATION_ZH = DATA_CLASSIFICATION_ZH[DATA_CLASSIFICATION_ZH.iloc[:, 4:].ne("").any(axis=1)] - - DATA_CLUSTERING_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLUSTERING_ZH]) - DATA_CLUSTERING_ZH = DATA_CLUSTERING_ZH[DATA_CLUSTERING_ZH.iloc[:, 4:].ne("").any(axis=1)] - - DATA_PAIR_CLASSIFICATION_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_PAIR_CLASSIFICATION_ZH]) - DATA_PAIR_CLASSIFICATION_ZH = DATA_PAIR_CLASSIFICATION_ZH[DATA_PAIR_CLASSIFICATION_ZH.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RERANKING_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RERANKING_ZH]) - DATA_RERANKING_ZH = DATA_RERANKING_ZH[DATA_RERANKING_ZH.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RETRIEVAL_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RETRIEVAL_ZH]) - DATA_RETRIEVAL_ZH = DATA_RETRIEVAL_ZH[DATA_RETRIEVAL_ZH.iloc[:, 4:].ne("").any(axis=1)] - - DATA_STS_ZH = add_rank(DATA_OVERALL_ZH[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_STS_ZH]) - DATA_STS_ZH = DATA_STS_ZH[DATA_STS_ZH.iloc[:, 4:].ne("").any(axis=1)] - - # Fill NaN after averaging - DATA_OVERALL_ZH.fillna("", inplace=True) - - DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]] - DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)] - - return DATA_OVERALL_ZH - -def get_mteb_average_fr(): - global DATA_OVERALL_FR, DATA_CLASSIFICATION_FR, DATA_CLUSTERING_FR, DATA_PAIR_CLASSIFICATION_FR, DATA_RERANKING_FR, DATA_RETRIEVAL_FR, DATA_STS_FR, DATA_SUMMARIZATION_FR - DATA_OVERALL_FR = get_mteb_data( - tasks=[ - "Classification", - "Clustering", - "PairClassification", - "Reranking", - "Retrieval", - "STS", - "Summarization" - ], - datasets=TASK_LIST_CLASSIFICATION_FR + TASK_LIST_CLUSTERING_FR + TASK_LIST_PAIR_CLASSIFICATION_FR + TASK_LIST_RERANKING_FR + TASK_LIST_RETRIEVAL_FR + 
TASK_LIST_STS_FR + TASK_LIST_SUMMARIZATION_FR, - fillna=False, - add_emb_dim=True, - rank=False, - ) - # Debugging: - # DATA_OVERALL_FR.to_csv("overall.csv") - - DATA_OVERALL_FR.insert(1, f"Average ({len(TASK_LIST_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_CLASSIFICATION_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_CLUSTERING_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_PAIR_CLASSIFICATION_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_RERANKING_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_RETRIEVAL_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(7, f"STS Average ({len(TASK_LIST_STS_FR)} datasets)", DATA_OVERALL_FR[TASK_LIST_STS_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION_FR)} dataset)", DATA_OVERALL_FR[TASK_LIST_SUMMARIZATION_FR].mean(axis=1, skipna=False)) - DATA_OVERALL_FR.sort_values(f"Average ({len(TASK_LIST_FR)} datasets)", ascending=False, inplace=True) - # Start ranking from 1 - DATA_OVERALL_FR.insert(0, "Rank", list(range(1, len(DATA_OVERALL_FR) + 1))) - DATA_OVERALL_FR = DATA_OVERALL_FR.round(2) - - DATA_CLASSIFICATION_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLASSIFICATION_FR]) - DATA_CLASSIFICATION_FR = DATA_CLASSIFICATION_FR[DATA_CLASSIFICATION_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_CLUSTERING_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLUSTERING_FR]) - DATA_CLUSTERING_FR = DATA_CLUSTERING_FR[DATA_CLUSTERING_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_PAIR_CLASSIFICATION_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_PAIR_CLASSIFICATION_FR]) - DATA_PAIR_CLASSIFICATION_FR = DATA_PAIR_CLASSIFICATION_FR[DATA_PAIR_CLASSIFICATION_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RERANKING_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RERANKING_FR]) - DATA_RERANKING_FR = DATA_RERANKING_FR[DATA_RERANKING_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RETRIEVAL_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RETRIEVAL_FR]) - DATA_RETRIEVAL_FR = DATA_RETRIEVAL_FR[DATA_RETRIEVAL_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_STS_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_STS_FR]) - DATA_STS_FR = DATA_STS_FR[DATA_STS_FR.iloc[:, 4:].ne("").any(axis=1)] - - DATA_SUMMARIZATION_FR = add_rank(DATA_OVERALL_FR[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_SUMMARIZATION_FR]) - DATA_SUMMARIZATION_FR = DATA_SUMMARIZATION_FR[DATA_SUMMARIZATION_FR.iloc[:, 1:].ne("").any(axis=1)] - - # Fill NaN after averaging - DATA_OVERALL_FR.fillna("", inplace=True) - - DATA_OVERALL_FR = DATA_OVERALL_FR[["Rank", 
"Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_FR)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_FR)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_FR)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_FR)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_FR)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_FR)} datasets)", f"STS Average ({len(TASK_LIST_STS_FR)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION_FR)} dataset)"]] - DATA_OVERALL_FR = DATA_OVERALL_FR[DATA_OVERALL_FR.iloc[:, 5:].ne("").any(axis=1)] - - return DATA_OVERALL_FR - -def get_mteb_average_pl(): - global DATA_OVERALL_PL, DATA_CLASSIFICATION_PL, DATA_CLUSTERING_PL, DATA_PAIR_CLASSIFICATION_PL, DATA_RETRIEVAL_PL, DATA_STS_PL - DATA_OVERALL_PL = get_mteb_data( - tasks=[ - "Classification", - "Clustering", - "PairClassification", - "Retrieval", - "STS", - ], - datasets=TASK_LIST_CLASSIFICATION_PL + TASK_LIST_CLUSTERING_PL + TASK_LIST_PAIR_CLASSIFICATION_PL + TASK_LIST_RETRIEVAL_PL + TASK_LIST_STS_PL, - fillna=False, - add_emb_dim=True, - rank=False, - ) - # Debugging: - # DATA_OVERALL_PL.to_csv("overall.csv") - - DATA_OVERALL_PL.insert(1, f"Average ({len(TASK_LIST_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLASSIFICATION_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_CLUSTERING_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_PAIR_CLASSIFICATION_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.insert(5, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_RETRIEVAL_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.insert(6, f"STS Average ({len(TASK_LIST_STS_PL)} datasets)", DATA_OVERALL_PL[TASK_LIST_STS_PL].mean(axis=1, skipna=False)) - DATA_OVERALL_PL.sort_values(f"Average ({len(TASK_LIST_PL)} datasets)", ascending=False, inplace=True) - # Start ranking from 1 - DATA_OVERALL_PL.insert(0, "Rank", list(range(1, len(DATA_OVERALL_PL) + 1))) - - DATA_OVERALL_PL = DATA_OVERALL_PL.round(2) + DATA_OVERALL = DATA_OVERALL[data_overall_rows] + DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)] - DATA_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLASSIFICATION_PL]) - # Only keep rows with at least one score in addition to the "Model" & rank column - DATA_CLASSIFICATION_PL = DATA_CLASSIFICATION_PL[DATA_CLASSIFICATION_PL.iloc[:, 4:].ne("").any(axis=1)] - - DATA_CLUSTERING_PL = add_rank(DATA_OVERALL_PL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_CLUSTERING_PL]) - DATA_CLUSTERING_PL = DATA_CLUSTERING_PL[DATA_CLUSTERING_PL.iloc[:, 4:].ne("").any(axis=1)] - - DATA_PAIR_CLASSIFICATION_PL = add_rank(DATA_OVERALL_PL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_PAIR_CLASSIFICATION_PL]) - DATA_PAIR_CLASSIFICATION_PL = DATA_PAIR_CLASSIFICATION_PL[DATA_PAIR_CLASSIFICATION_PL.iloc[:, 4:].ne("").any(axis=1)] - - DATA_RETRIEVAL_PL = add_rank(DATA_OVERALL_PL[["Model", 
"Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_RETRIEVAL_PL]) - DATA_RETRIEVAL_PL = DATA_RETRIEVAL_PL[DATA_RETRIEVAL_PL.iloc[:, 4:].ne("").any(axis=1)] - - DATA_STS_PL = add_rank(DATA_OVERALL_PL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_STS_PL]) - DATA_STS_PL = DATA_STS_PL[DATA_STS_PL.iloc[:, 4:].ne("").any(axis=1)] + return DATA_OVERALL, DATA_TASKS - # Fill NaN after averaging - DATA_OVERALL_PL.fillna("", inplace=True) - - DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]] - DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)] - - return DATA_OVERALL_PL - -get_mteb_average() -get_mteb_average_fr() -get_mteb_average_pl() -get_mteb_average_zh() -DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_BITEXT_MINING] -DATA_BITEXT_MINING_DA = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_DA)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + TASK_LIST_BITEXT_MINING_DA] -DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_CLASSIFICATION_DA] -DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_CLASSIFICATION_NB] -DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_CLASSIFICATION_SV] -DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_CLASSIFICATION_OTHER] -DATA_CLUSTERING_DE = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_CLUSTERING_DE] -DATA_STS_OTHER = get_mteb_data(["STS"], [], TASK_LIST_STS_OTHER)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_STS_OTHER] -DATA_RETRIEVAL_LAW = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_LAW)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_RETRIEVAL_LAW] -DATA_RETRIEVAL_INSTRUCTIONS = get_mteb_data(["InstructionRetrieval"], [], TASK_LIST_RETRIEVAL_INSTRUCTIONS)[["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Average"] + TASK_LIST_RETRIEVAL_INSTRUCTIONS] +boards_data = {} +all_data_tasks = [] +for board, board_config in BOARDS_CONFIG.items(): + boards_data[board] = { + "data_overall": None, + "data_tasks": {} + } + if board_config["has_overall"]: + data_overall, data_tasks = 
get_mteb_average(board_config["tasks"], refresh=False) + boards_data[board]["data_overall"] = data_overall + boards_data[board]["data_tasks"] = data_tasks + all_data_tasks.extend(data_tasks.values()) + else: + for task_category, task_category_list in board_config["tasks"].items(): + data_task_category = get_mteb_data(tasks=[task_category], datasets=task_category_list, refresh=False) + data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True) + boards_data[board]["data_tasks"][task_category] = data_task_category + all_data_tasks.append(data_task_category) # Exact, add all non-nan integer values for every dataset NUM_SCORES = 0 DATASETS = [] MODELS = [] # LANGUAGES = [] -for d in [ - DATA_BITEXT_MINING, - DATA_BITEXT_MINING_DA, - DATA_CLASSIFICATION_EN, - DATA_CLASSIFICATION_DA, - DATA_CLASSIFICATION_FR, - DATA_CLASSIFICATION_NB, - DATA_CLASSIFICATION_PL, - DATA_CLASSIFICATION_SV, - DATA_CLASSIFICATION_ZH, - DATA_CLASSIFICATION_OTHER, - DATA_CLUSTERING, - DATA_CLUSTERING_DE, - DATA_CLUSTERING_FR, - DATA_CLUSTERING_PL, - DATA_CLUSTERING_ZH, - DATA_PAIR_CLASSIFICATION, - DATA_PAIR_CLASSIFICATION_FR, - DATA_PAIR_CLASSIFICATION_PL, - DATA_PAIR_CLASSIFICATION_ZH, - DATA_RERANKING, - DATA_RERANKING_FR, - DATA_RERANKING_ZH, - DATA_RETRIEVAL, - DATA_RETRIEVAL_FR, - DATA_RETRIEVAL_PL, - DATA_RETRIEVAL_ZH, - DATA_RETRIEVAL_LAW, - DATA_RETRIEVAL_INSTRUCTIONS, - DATA_STS_EN, - DATA_STS_FR, - DATA_STS_PL, - DATA_STS_ZH, - DATA_STS_OTHER, - DATA_SUMMARIZATION, - DATA_SUMMARIZATION_FR, -]: +for d in all_data_tasks: # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum() cols_to_ignore = 4 if "Average" in d.columns else 3 # Count number of scores including only non-nan floats & excluding the rank column @@ -1870,332 +386,54 @@ Each inner tab can have the following keys: - refresh: The function to refresh the leaderboard """ -chinese_credits = "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)" -french_credits = "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)" -danish_credits = "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)" -norwegian_credits = "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)" -polish_credits = "[Rafaล‚ Poล›wiata](https://github.com/rafalposwiata)" -instruction_credits = "[Orion Weller, FollowIR paper](https://arxiv.org/abs/2403.15246)" +def get_refresh_function(task_category, task_list): + def _refresh(): + data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list) + data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True) + return data_task_category + return _refresh data = { - "Overall": { - "metric": "Various, refer to task tabs", - "data": [ - { - "language": "English", - "description": "**Overall MTEB English leaderboard** ๐Ÿ”ฎ", - "data": DATA_OVERALL, - "refresh": get_mteb_average, - }, - { - "language": "Chinese", - "data": DATA_OVERALL_ZH, - "description": "**Overall MTEB Chinese leaderboard (C-MTEB)** ๐Ÿ”ฎ๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, 
- "refresh": get_mteb_average_zh, - }, - { - "language": "French", - "data": DATA_OVERALL_FR, - "description": "**Overall MTEB French leaderboard (F-MTEB)** ๐Ÿ”ฎ๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "refresh": get_mteb_average_fr, - }, - { - "language": "Polish", - "data": DATA_OVERALL_PL, - "description": "**Overall MTEB Polish leaderboard** ๐Ÿ”ฎ๐Ÿ‡ต๐Ÿ‡ฑ", - "refresh": get_mteb_average_pl, - }, - ] - }, - "Bitext Mining": { - "metric": "[F1](https://huggingface.co/spaces/evaluate-metric/f1)", - "data": [ - { - "language": "English-X", - "language_long": "117 (Pairs of: English & other language)", - "description": "**Bitext Mining English-X Leaderboard** ๐ŸŽŒ", - "data": DATA_BITEXT_MINING, - "refresh": partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING), - }, - { - "language": "Danish", - "language_long": "Danish & Bornholmsk (Danish Dialect)", - "description": "**Bitext Mining Danish Leaderboard** ๐ŸŽŒ๐Ÿ‡ฉ๐Ÿ‡ฐ", - "credits": danish_credits, - "data": DATA_BITEXT_MINING_DA, - "refresh": partial(get_mteb_data, tasks=["BitextMining"], datasets=TASK_LIST_BITEXT_MINING_DA), - } - ] - }, - "Classification": { - "metric": "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)", - "data": [ - { - "language": "English", - "description": "**Classification English Leaderboard** โค๏ธ", - "data": DATA_CLASSIFICATION_EN, - "refresh": partial(get_mteb_data, tasks=["Classification"], langs=["en"]) - }, - { - "language": "Chinese", - "description": "**Classification Chinese Leaderboard** ๐Ÿงก๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_CLASSIFICATION_ZH, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_ZH) - }, - { - "language": "Danish", - "description": "**Classification Danish Leaderboard** ๐Ÿค๐Ÿ‡ฉ๐Ÿ‡ฐ", - "credits": danish_credits, - "data": DATA_CLASSIFICATION_DA, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_DA) - }, - { - "language": "French", - "description": "**Classification French Leaderboard** ๐Ÿ’™๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_CLASSIFICATION_FR, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_FR) - }, - { - "language": "Norwegian", - "language_long": "Norwegian Bokmรฅl", - "description": "**Classification Norwegian Leaderboard** ๐Ÿ’™๐Ÿ‡ณ๐Ÿ‡ด", - "credits": norwegian_credits, - "data": DATA_CLASSIFICATION_NB, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_NB) - }, - { - "language": "Polish", - "description": "**Classification Polish Leaderboard** ๐Ÿค๐Ÿ‡ต๐Ÿ‡ฑ", - "credits": polish_credits, - "data": DATA_CLASSIFICATION_PL, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_PL) - }, - { - "language": "Swedish", - "description": "**Classification Swedish Leaderboard** ๐Ÿ’›๐Ÿ‡ธ๐Ÿ‡ช", - "credits": norwegian_credits, - "data": DATA_CLASSIFICATION_SV, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_SV) - }, - { - "language": "Other", - "language_long": "47 (Only languages not included in the other tabs)", - "description": "**Classification Other Languages Leaderboard** ๐Ÿ’œ๐Ÿ’š๐Ÿ’™", - "data": DATA_CLASSIFICATION_OTHER, - "refresh": partial(get_mteb_data, tasks=["Classification"], datasets=TASK_LIST_CLASSIFICATION_OTHER) - } - ] - }, - "Clustering": { - "metric": "Validity Measure (v_measure)", - "data": [ - { - "language": "English", 
- "description": "**Clustering Leaderboard** โœจ", - "data": DATA_CLUSTERING, - "refresh": partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING) - }, - { - "language": "Chinese", - "description": "**Clustering Chinese Leaderboard** โœจ๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_CLUSTERING_ZH, - "refresh": partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_ZH) - }, - { - "language": "French", - "description": "**Clustering French Leaderboard** โœจ๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_CLUSTERING_FR, - "refresh": partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_FR) - }, - { - "language": "German", - "description": "**Clustering German Leaderboard** โœจ๐Ÿ‡ฉ๐Ÿ‡ช", - "credits": "[Silvan](https://github.com/slvnwhrl)", - "data": DATA_CLUSTERING_DE, - "refresh": partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_DE) - }, - { - "language": "Polish", - "description": "**Clustering Polish Leaderboard** โœจ๐Ÿ‡ต๐Ÿ‡ฑ", - "credits": polish_credits, - "data": DATA_CLUSTERING_PL, - "refresh": partial(get_mteb_data, tasks=["Clustering"], datasets=TASK_LIST_CLUSTERING_PL) - }, - ] - }, - "Pair Classification": { - "metric": "Average Precision based on Cosine Similarities (cos_sim_ap)", - "data": [ - { - "language": "English", - "description": "**Pair Classification English Leaderboard** ๐ŸŽญ", - "data": DATA_PAIR_CLASSIFICATION, - "refresh": partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION) - }, - { - "language": "Chinese", - "description": "**Pair Classification Chinese Leaderboard** ๐ŸŽญ๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_PAIR_CLASSIFICATION_ZH, - "refresh": partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_ZH) - }, - { - "language": "French", - "description": "**Pair Classification French Leaderboard** ๐ŸŽญ๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_PAIR_CLASSIFICATION_FR, - "refresh": partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_FR) - }, - { - "language": "Polish", - "description": "**Pair Classification Polish Leaderboard** ๐ŸŽญ๐Ÿ‡ต๐Ÿ‡ฑ", - "credits": polish_credits, - "data": DATA_PAIR_CLASSIFICATION_PL, - "refresh": partial(get_mteb_data, tasks=["PairClassification"], datasets=TASK_LIST_PAIR_CLASSIFICATION_PL) - }, - ] - }, - "Reranking": { - "metric": "Mean Average Precision (MAP)", - "data": [ - { - "language": "English", - "description": "**Reranking English Leaderboard** ๐Ÿฅˆ", - "data": DATA_RERANKING, - "refresh": partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING) - }, - { - "language": "Chinese", - "description": "**Reranking Chinese Leaderboard** ๐Ÿฅˆ๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_RERANKING_ZH, - "refresh": partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING_ZH) - }, - { - "language": "French", - "description": "**Reranking French Leaderboard** ๐Ÿฅˆ๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_RERANKING_FR, - "refresh": partial(get_mteb_data, tasks=["Reranking"], datasets=TASK_LIST_RERANKING_FR) - } - ] - }, - "Retrieval": { - "metric": "Normalized Discounted Cumulative Gain @ k (ndcg_at_10)", - "data": [ - { - "language": "English", - "description": "**Retrieval English Leaderboard** ๐Ÿ”Ž", - "data": DATA_RETRIEVAL, - "refresh": partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL) - }, - { - "language": 
"Chinese", - "description": "**Retrieval Chinese Leaderboard** ๐Ÿ”Ž๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_RETRIEVAL_ZH, - "refresh": partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_ZH) - }, - { - "language": "French", - "description": "**Retrieval French Leaderboard** ๐Ÿ”Ž๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_RETRIEVAL_FR, - "refresh": partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_FR) - }, - { - "language": "Law", - "language_long": "English, German, Chinese", - "description": "**Retrieval Law Leaderboard** ๐Ÿ”Žโš–๏ธ", - "credits": "[Voyage AI](https://www.voyageai.com/)", - "data": DATA_RETRIEVAL_LAW, - "refresh": partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_LAW) - }, - { - "language": "Polish", - "description": "**Retrieval Polish Leaderboard** ๐Ÿ”Ž๐Ÿ‡ต๐Ÿ‡ฑ", - "credits": polish_credits, - "data": DATA_RETRIEVAL_PL, - "refresh": partial(get_mteb_data, tasks=["Retrieval"], datasets=TASK_LIST_RETRIEVAL_PL) - } - ] - }, - "STS": { - "metric": "Spearman correlation based on cosine similarity", - "data": [ - { - "language": "English", - "description": "**STS English Leaderboard** ๐Ÿค–", - "data": DATA_STS_EN, - "refresh": partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS) - }, - { - "language": "Chinese", - "description": "**STS Chinese Leaderboard** ๐Ÿค–๐Ÿ‡จ๐Ÿ‡ณ", - "credits": chinese_credits, - "data": DATA_STS_ZH, - "refresh": partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_ZH) - }, - { - "language": "French", - "description": "**STS French Leaderboard** ๐Ÿค–๐Ÿ‡ซ๐Ÿ‡ท", - "credits": french_credits, - "data": DATA_STS_FR, - "refresh": partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_FR) - }, - { - "language": "Polish", - "description": "**STS Polish Leaderboard** ๐Ÿค–๐Ÿ‡ต๐Ÿ‡ฑ", - "credits": polish_credits, - "data": DATA_STS_PL, - "refresh": partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_PL) - }, - { - "language": "Other", - "language_long": "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)", - "description": "**STS Other Leaderboard** ๐Ÿ‘ฝ", - "data": DATA_STS_OTHER, - "refresh": partial(get_mteb_data, tasks=["STS"], datasets=TASK_LIST_STS_OTHER) - }, - ] - }, - "Summarization": { - "metric": "Spearman correlation based on cosine similarity", - "data": [ - { - "language": "English", - "description": "**Summarization Leaderboard** ๐Ÿ“œ", - "data": DATA_SUMMARIZATION, - "refresh": partial(get_mteb_data, tasks=TASK_LIST_SUMMARIZATION) - }, - { - "language": "French", - "description": "**Summarization Leaderboard** ๐Ÿ“œ", - "credits": french_credits, - "data": DATA_SUMMARIZATION_FR, - "refresh": partial(get_mteb_data, tasks=TASK_LIST_SUMMARIZATION_FR) - } - ] - }, - "Retrieval w/Instructions": { - "metric": "paired mean reciprocal rank (p-MRR)", - "data": [ - { - "language": "English", - "description": "**Retrieval with Instructions Leaderboard** ๐Ÿ”Ž๐Ÿ“‹", - "credits": instruction_credits, - "data": DATA_RETRIEVAL_INSTRUCTIONS, - "refresh": partial(get_mteb_data, tasks=TASK_LIST_RETRIEVAL_INSTRUCTIONS) - } - ] - } + "Overall": {"metric": "Various, refer to task tabs", "data": []} } +for task in TASKS: + data[task] = {"metric": TASKS_CONFIG[task]["metric_description"], "data": []} + +for board, board_config in BOARDS_CONFIG.items(): + init_name = board_config["title"] + if init_name in PRETTY_NAMES: + init_name = PRETTY_NAMES[init_name] + 
board_pretty_name = f"{init_name} leaderboard" + acronym = board_config.get("acronym", None) + board_icon = board_config.get("icon", None) + if board_icon is None: + board_icon = "" + credits = board_config.get("credits", None) + + if board_config["has_overall"]: + overall_pretty_name = board_pretty_name + if acronym is not None: + overall_pretty_name += f" ({board_config['acronym']})" + data["Overall"]["data"].append({ + "language": board_config["title"], + "language_long": board_config["language_long"], + "description": f"**Overall MTEB {overall_pretty_name}** ๐Ÿ”ฎ{board_icon}", + "data": boards_data[board]["data_overall"], + "refresh": lambda: get_mteb_average(board_config["tasks"])[0],#partial(get_mteb_average, board_config["tasks"]), + "credits": credits, + }) + for task_category, task_category_list in board_config["tasks"].items(): + task_icon = TASKS_CONFIG[task_category]['icon'] + if "special_icons" in board_config and isinstance(board_config["special_icons"], dict): + task_icon = board_config["special_icons"].get(task_category, task_icon) + data[task_category]["data"].append({ + "language": board_config["title"], + "language_long": board_config["language_long"], + "description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}", + "data": boards_data[board]["data_tasks"][task_category], + "refresh": get_refresh_function(task_category, task_category_list), + "credits": credits, + }) dataframes = [] full_dataframes = [] @@ -2243,6 +481,7 @@ MODEL_TYPES = [ "Open", "Proprietary", "Sentence Transformers", + "Cross Encoders" ] def filter_data(search_query, model_types, model_sizes, *full_dataframes): @@ -2266,6 +505,8 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes): masks.append(df["Model"].isin(PROPRIETARY_MODELS)) elif model_type == "Sentence Transformers": masks.append(df["Model"].isin(SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS)) + elif model_type == "Cross Encoders": + masks.append(df["Model"].isin(CROSS_ENCODERS)) if masks: df = df[reduce(lambda a, b: a | b, masks)] else: @@ -2281,6 +522,7 @@ def filter_data(search_query, model_types, model_sizes, *full_dataframes): output_dataframes.append(df) return output_dataframes + with gr.Blocks(css=css) as block: # Store the current task and language for updating the URL. This is a bit hacky, but it works @@ -2316,16 +558,16 @@ with gr.Blocks(css=css) as block: with gr.Tabs() as outer_tabs: # Store the tabs for updating them on load based on URL parameters tabs.append(outer_tabs) - for task, task_values in data.items(): metric = task_values["metric"] task_tab_id = task.lower().replace(" ", "-") # Overall, Bitext Mining, Classification, etc. 
- with gr.Tab(task, id=task_tab_id) as task_tab: + pretty_task_name = task if task not in ["InstructionRetrieval", "PairClassification"] else PRETTY_NAMES[task] + with gr.Tab(pretty_task_name, id=task_tab_id) as task_tab: # For updating the 'task' in the URL task_tab.select(update_url_task, [current_task_language, language_per_task], [current_task_language, language_per_task]).then(None, [current_task_language], [], js=set_window_url_params) - gr.Markdown(TASK_DESCRIPTION[task]) + gr.Markdown(TASK_DESCRIPTIONS[task]) with gr.Tabs() as task_tabs: # Store the task tabs for updating them on load based on URL parameters tabs.append(task_tabs) @@ -2344,7 +586,7 @@ with gr.Blocks(css=css) as block: - **Metric:** {metric} - **Languages:** {item['language_long'] if 'language_long' in item else item['language']} - {"- **Credits:** " + item['credits'] if "credits" in item else ''} + {"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''} """) with gr.Row():
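+                # Note: the "refresh" callables wired into `data` bind their loop
+                # variables at definition time (get_refresh_function via its arguments,
+                # the Overall lambda via a default argument). A plain closure would not:
+                #     fns = [lambda: cfg for cfg in cfgs]      # all return the last cfg
+                #     fns = [lambda c=cfg: c for cfg in cfgs]  # each returns its own cfg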