# LeaderboardsExplorer / leaderboards_metadata.py
# Author: Clémentine
# Hardcoded leaderboard metadata (Hugging Face Hub revision 509661e)
from enum import Enum, auto
#from dataclasses import dataclass
class SubmissionType(Enum):
    """How entries get onto a leaderboard (open pipeline, curated, arena, ...)."""

    Automatic = auto()
    SemiAutomatic = auto()
    Manual = auto()
    Closed = auto()
    Arena = auto()
class Evaluators(Enum):
    """Who (or what) produces the scores shown on a leaderboard."""

    Humans = auto()  # Arena
    Automatic = auto()
    Model = auto()
# Visibility of the evaluation data. The functional Enum API is required
# here: "N/A" is not a valid Python identifier, so it cannot be declared
# as a class-based member (it is reachable only via TestSet["N/A"]).
TestSet = Enum("TestSet", ["Private", "Public", "Mix", "Rolling", "N/A"])
class Categories(Enum):
    """Broad topic / modality tags a leaderboard can be filed under."""

    Text = auto()
    Image = auto()
    Audio = auto()
    Video = auto()
    Multimodal = auto()
    Generation = auto()
    Math = auto()
    Code = auto()
    LanguageSpecific = auto()
    Performance = auto()
    Safety = auto()
    VibeCheck = auto()
    Tools = auto()
    Artefacts = auto()
class Languages(Enum):
    """Languages with a dedicated (language-specific) leaderboard tag."""

    Chinese = auto()
    Korean = auto()
    Dutch = auto()
    Portuguese = auto()
    Italian = auto()
    Malay = auto()
    Polish = auto()
    Turkish = auto()
# Maps a Hugging Face Space id ("org/space-name") to the list of tags
# describing that leaderboard (submission type, evaluator, test-set
# visibility, categories, language) — presumably consumed by the
# explorer app to filter/display entries; verify against the caller.
# Most tags are members of the enums defined above, but a few ad-hoc
# traits with no enum member are plain strings ("Embeddings",
# "Hallucinations", "OCR", "Models").
# NOTE(review): consider promoting those strings to Categories members
# so every tag has a uniform type.
leaderboard_to_tags = {
"HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
"bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
"optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
"lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
"llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
"mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],  # bare-string tag
"gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
"opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
"upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
"BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
"vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],  # bare-string tag
"facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
"mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
"AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
"AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
"mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
"echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],  # bare-string tag
"NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
"HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
"devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
"WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
"Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
"eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
"FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
"mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
"TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
"q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
"OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
"speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
"malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
"allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
"hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
"opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
"livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
"allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],  # bare-string tag
"TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
}