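"""Tag taxonomy for leaderboard Spaces on the Hugging Face Hub.

Defines enums for submission type, evaluators, test-set visibility,
categories, and languages, plus ``leaderboard_to_tags``, which maps each
leaderboard Space id to its list of tags.
"""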
from enum import Enum

SubmissionType = Enum(
    "SubmissionType", 
    [
        "Automatic", 
        "SemiAutomatic", 
        "Manual", 
        "Closed", 
        "Arena"
    ]
)
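# Functional-API members behave like ordinary enum members: access them as
# SubmissionType.Automatic or SubmissionType["Automatic"].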

Evaluators = Enum(
    "Evaluators", 
    [
        "Humans", # Arena
        "Automatic",
        "Model"
    ]
)

TestSet = Enum(
    "TestSet",
    [
        "Private",
        "Public",
        "Mix",
        "Rolling",
        "N/A"
    ]
)
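# Note: "N/A" is not a valid Python identifier, so this member cannot be
# reached with attribute access; use TestSet["N/A"] instead.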

Categories = Enum(
    "Categories",
    [
        "Text",
        "Image",
        "Audio",
        "Video",
        "Multimodal",
        "Generation",
        "Math",
        "Code",
        "LanguageSpecific",
        "Performance",
        "Safety",
        "VibeCheck",
        "Tools",
        "Artefacts"
    ]
)

Languages = Enum(
    "Languages",
    [
        "Chinese",
        "Korean",
        "Dutch",
        "Portuguese",
        "Italian",
        "Malay",
        "Polish",
        "Turkish"

    ]
)  

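# Maps each leaderboard Space id to its tags. A few ad-hoc string tags
# ("Embeddings", "Hallucinations", "OCR", "Models") appear alongside the enum
# members where the Categories enum has no matching entry.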
leaderboard_to_tags = {
    "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
    "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
    "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
    "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
    "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
    "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
    "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
    "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
    "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
    "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
    "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
    "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
    "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
    "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
    "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
    "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
    "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
    "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
    "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
    "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
    "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
    "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
    "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
    "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
    "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
    "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
    "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
    "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
    "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
    "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
    "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
    "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
    "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
    "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
    "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
    "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
}
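
# Illustrative sketch (not part of the original taxonomy): one way to query
# the mapping above. The helper name and behaviour below are assumptions for
# demonstration only.
def leaderboards_with_tag(tag):
    """Return the Space ids whose tag list contains ``tag``.

    ``tag`` may be an enum member (e.g. Categories.Code) or one of the
    ad-hoc string tags (e.g. "OCR").
    """
    return [
        space_id
        for space_id, tags in leaderboard_to_tags.items()
        if tag in tags
    ]


if __name__ == "__main__":
    # For example, list every code-focused leaderboard.
    print(leaderboards_with_tag(Categories.Code))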