Clémentine committed
Commit 509661e
1 Parent(s): 270109b

hardcoded metadata

Files changed (1)
  1. leaderboards_metadata.py +107 -0
leaderboards_metadata.py ADDED
@@ -0,0 +1,107 @@
+ from enum import Enum, auto
+ #from dataclasses import dataclass
+
+ SubmissionType = Enum(
+     "SubmissionType",
+     [
+         "Automatic",
+         "SemiAutomatic",
+         "Manual",
+         "Closed",
+         "Arena"
+     ]
+ )
+
+ Evaluators = Enum(
+     "Evaluators",
+     [
+         "Humans",  # Arena
+         "Automatic",
+         "Model"
+     ]
+ )
+
+ TestSet = Enum(
+     "TestSet",
+     [
+         "Private",
+         "Public",
+         "Mix",
+         "Rolling",
+         "N/A"
+     ]
+ )
+
+ Categories = Enum(
+     "Categories",
+     [
+         "Text",
+         "Image",
+         "Audio",
+         "Video",
+         "Multimodal",
+         "Generation",
+         "Math",
+         "Code",
+         "LanguageSpecific",
+         "Performance",
+         "Safety",
+         "VibeCheck",
+         "Tools",
+         "Artefacts"
+     ]
+ )
+
+ Languages = Enum(
+     "Languages",
+     [
+         "Chinese",
+         "Korean",
+         "Dutch",
+         "Portuguese",
+         "Italian",
+         "Malay",
+         "Polish",
+         "Turkish"
+
+     ]
+ )
+
+ leaderboard_to_tags = {
+     "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
+     "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
+     "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
+     "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
+     "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
+     "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
+     "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
+     "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
+     "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
+     "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
+     "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
+     "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
+     "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
+     "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
+     "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
+     "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
+     "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
+     "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
+     "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
+     "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
+     "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
+     "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
+     "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
+     "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
+     "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
+     "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
+     "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
+     "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
+     "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
+     "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
+     "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
+     "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
+     "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
+     "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
+     "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
+     "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
+ }
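
Since the enums are declared through the functional Enum API, their members behave like ordinary enum members, with one wrinkle: a member whose name is not a valid Python identifier (here TestSet's "N/A") can only be reached by subscript lookup, never as an attribute. A minimal usage sketch of the new module follows; the find_by_tag helper is hypothetical, not part of this commit:

    from leaderboards_metadata import Categories, TestSet, leaderboard_to_tags

    # Subscript lookup works for every member; attribute access only
    # for members whose names are valid identifiers.
    assert TestSet["Rolling"] is TestSet.Rolling
    not_applicable = TestSet["N/A"]  # no attribute form exists for this one

    def find_by_tag(tag):
        # Hypothetical helper: repo ids whose tag list contains `tag`.
        # Handles enum members and the ad-hoc string tags alike.
        return [repo for repo, tags in leaderboard_to_tags.items() if tag in tags]

    print(find_by_tag(Categories.Code))
    # ['bigcode/bigcode-models-leaderboard', 'facebook/CyberSecEval',
    #  'mike-ravkine/can-ai-code-results', 'livecodebench/benchmarks']
    print(find_by_tag("OCR"))
    # ['echo840/ocrbench-leaderboard']

Note that a few tags are plain strings rather than enum members ("Embeddings", "Hallucinations", "OCR", "Models"), presumably one-off labels not yet promoted into Categories, so any consumer of leaderboard_to_tags has to tolerate both types.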