SondosMB committed on
Commit b782462
1 Parent(s): 640cdfd

Upload 5 files

Files changed (5)
  1. app.py +99 -0
  2. big (1).json +68 -0
  3. constants.py +26 -0
  4. requirements (1).txt +2 -0
  5. small (1).json +134 -0
app.py ADDED
@@ -0,0 +1,99 @@
+ import gradio as gr
+ import pandas as pd
+ from constants import INTRODUCTION_TEXT, CITATION_TEXT
+
+
+ # Round numeric cells to two decimals; leave non-numeric values unchanged
+ def formatter(x):
+     try:
+         return round(x, 2)
+     except TypeError:
+         return x
+
+
+ # Leaderboard DataFrames (file names match the JSON files uploaded in this commit)
+
+ jsond_data = pd.read_json('big (1).json')
+ original_df = pd.DataFrame(jsond_data)
+ print(original_df)
+
+ jsond_data2 = pd.read_json('small (1).json')
+ Small_original_df = pd.DataFrame(jsond_data2)
+ print(Small_original_df)
+
+ # Apply formatter to every cell of both DataFrames
+ original_df = original_df.applymap(formatter)
+ Small_original_df = Small_original_df.applymap(formatter)
+
+
+ # Data types for the Gradio DataFrame component (one entry per column)
+ TYPES = ['str'] + ['number'] * 8
+
+
+ LAST_UPDATED = "May 10th 2024"
+
+ # CSS for styling
36
+ css = """
37
+ .markdown-text{font-size: 200pt}
38
+ .markdown-text-small{font-size: 13pt}
39
+ th {
40
+ text-align: center;
41
+ }
42
+ td {
43
+ font-size: 15px; /* Adjust the font size as needed */
44
+ text-align: center;
45
+ }
46
+ #od-benchmark-tab-table-button{
47
+ font-size: 15pt;
48
+ font-weight: bold;
49
+ }
50
+
51
+ #Intro{
52
+ font-size: 100pt;
53
+ }
54
+ """
+
+
+ def build_demo(original_df, Small_original_df, TYPES):
+     with gr.Blocks(css=css) as demo:
+         gr.Markdown(INTRODUCTION_TEXT, elem_id="Intro")
+         with gr.Tabs():
+             with gr.TabItem("🏅Leaderboard_Large", elem_id="od-benchmark-tab-table", id=0):
+                 leaderboard_table = gr.components.Dataframe(
+                     value=original_df,
+                     datatype=TYPES,
+                     label="Leaderboard_Big",
+                     height=1000,
+                     wrap=False,
+                     interactive=False,
+                     visible=True,
+                     min_width=60,
+                 )
+
+             with gr.TabItem("🏅 Leaderboard_Small", elem_id="od-benchmark-tab-table", id=1):
+                 leaderboard_table = gr.components.Dataframe(
+                     value=Small_original_df,
+                     datatype=TYPES,
+                     label="Leaderboard_small",
+                     height=1000,
+                     wrap=False,
+                     interactive=False,
+                     visible=True,
+                     min_width=60,
+                 )
+
+
+         gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
+
+         with gr.Row():
+             with gr.Accordion("📙 Citation", open=False):
+                 gr.Textbox(
+                     value=CITATION_TEXT, lines=18,
+                     label="",
+                     elem_id="citation-button",
+                     show_copy_button=True)
+
+     return demo
+
+ demo = build_demo(original_df, Small_original_df, TYPES)
+ demo.launch(share=True)  # share expects a boolean, not the string 'True'
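As a quick local sanity check (not part of this commit), a short script along these lines can confirm that both uploaded JSON files expose the columns app.py assumes before launching the Space with `python app.py`. The script name and the expected column list are illustrative assumptions, not code from the repository:

```python
# check_schema.py (illustrative sketch, not one of the uploaded files)
import pandas as pd

# Columns the leaderboard tables are assumed to share (taken from the JSON files below)
EXPECTED_COLUMNS = [
    "model", "Average", "MMLU", "WinoGrande", "PiQA",
    "CommonsenseQA", "Race", "MedMCQA", "OpenkookQA",
]

for path in ["big (1).json", "small (1).json"]:
    df = pd.read_json(path)
    missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
    assert not missing, f"{path} is missing columns: {missing}"
    print(f"{path}: {len(df)} rows, schema OK")
```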
big (1).json ADDED
@@ -0,0 +1,68 @@
+ [
+   {
+     "model": "GPT-4",
+     "Average": 65.94,
+     "MMLU": 74.8,
+     "WinoGrande": 66.2,
+     "PiQA": 61.6,
+     "CommonsenseQA": 63.0,
+     "Race": 67.0,
+     "MedMCQA": 51.8,
+     "OpenkookQA": 60.3
+   },
+   {
+     "model": "Claude-3 Opus",
+     "Average": 62.64,
+     "MMLU": 70.4,
+     "WinoGrande": 63.5,
+     "PiQA": 59.1,
+     "CommonsenseQA": 63.7,
+     "Race": 66.2,
+     "MedMCQA": 49.1,
+     "OpenkookQA": 54.0
+   },
+   {
+     "model": "Mistral Large",
+     "Average": 61.45,
+     "MMLU": 67.8,
+     "WinoGrande": 56.8,
+     "PiQA": 61.2,
+     "CommonsenseQA": 55.4,
+     "Race": 70.1,
+     "MedMCQA": 43.4,
+     "OpenkookQA": 58.7
+   },
+   {
+     "model": "GPT-3.5",
+     "Average": 59.06,
+     "MMLU": 65.4,
+     "WinoGrande": 54.6,
+     "PiQA": 54.9,
+     "CommonsenseQA": 67.9,
+     "Race": 60.1,
+     "MedMCQA": 41.4,
+     "OpenkookQA": 49.9
+   },
+   {
+     "model": "Gemini Pro",
+     "Average": 54.45,
+     "MMLU": 57.7,
+     "WinoGrande": 56.4,
+     "PiQA": 47.7,
+     "CommonsenseQA": 50.6,
+     "Race": 61.0,
+     "MedMCQA": 37.5,
+     "OpenkookQA": 52.5
+   },
+   {
+     "model": "Llama3-70b-instruct",
+     "Average": 54.06,
+     "MMLU": 64.67,
+     "WinoGrande": 57.14,
+     "PiQA": 43.1,
+     "CommonsenseQA": 55.49,
+     "Race": 58.21,
+     "MedMCQA": 41.67,
+     "OpenkookQA": 41.93
+   }
+ ]
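Each record in this file is a flat JSON object with one key per benchmark plus "model" and "Average". A new row could be appended programmatically along the lines below; the model name and scores are placeholders for illustration only, not real results:

```python
# add_row.py (illustrative sketch; the entry values are placeholders, not real scores)
import json

new_entry = {
    "model": "Example-Model", "Average": 0.0, "MMLU": 0.0, "WinoGrande": 0.0,
    "PiQA": 0.0, "CommonsenseQA": 0.0, "Race": 0.0, "MedMCQA": 0.0, "OpenkookQA": 0.0,
}

with open("big (1).json") as f:
    rows = json.load(f)
rows.append(new_entry)
with open("big (1).json", "w") as f:
    json.dump(rows, f, indent=2)
```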
constants.py ADDED
@@ -0,0 +1,26 @@
+
+ from pathlib import Path
+
+
+
+ banner_url = "https://huggingface.co/spaces/WildEval/WildBench-Leaderboard/resolve/main/%E2%80%8Eleaderboard_logo_v2.png"  # the same repo here.
+ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
+
+ INTRODUCTION_TEXT = """
+ # OS Benchmark (Evaluating LLMs with OS and MCQ)
+ 🔗 [Website](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 💻 [GitHub](https://github.com/VILA-Lab/MBZUAI-LLM-Leaderboard) | 📖 [Paper](#) | 🐦 [Tweet 1](#) | 🐦 [Tweet 2](#)
+
+ > ### MBZUAI-LLM-Leaderboard is a new framework for evaluating large language models (LLMs) that moves from multiple-choice questions (MCQs) to open-style questions.
+ This approach addresses the inherent biases and limitations of MCQs, such as selection bias and the effect of random guessing. By using open-style questions,
+ the framework aims to provide a more accurate assessment of LLMs' abilities across various benchmarks and to ensure that the evaluation reflects true capabilities,
+ particularly in terms of language understanding and reasoning.
+
+ """
+
+ CITATION_TEXT = """@article{..,
+ title={MBZUAI-LLM-Leaderboard: From Multi-choice to Open-style Questions for LLMs Evaluation, Benchmark, and Arena},
+ author={},
+ year={2024},
+ archivePrefix={arXiv}
+ }
+ """
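BANNER is defined here but is not referenced anywhere in app.py. If the banner image is wanted at the top of the page, one option (a sketch under that assumption, not code from this commit) is to render it inside the Blocks layout with Gradio's HTML component:

```python
# placed inside build_demo() in app.py, before the introduction text (illustrative)
from constants import BANNER

gr.HTML(BANNER)  # renders the raw <div>/<img> markup defined above
```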
requirements (1).txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ pandas
small (1).json ADDED
@@ -0,0 +1,134 @@
+ [
+   {
+     "model": "OPT (1.3B)",
+     "Average": 7.84,
+     "MMLU": 7.4,
+     "WinoGrande": 12.47,
+     "PiQA": 4.45,
+     "CommonsenseQA": 7.61,
+     "Race": 13.61,
+     "MedMCQA": 1.25,
+     "OpenkookQA": 4.48
+   },
+   {
+     "model": "SlimPajama",
+     "Average": 9.54,
+     "MMLU": 9.22,
+     "WinoGrande": 14.76,
+     "PiQA": 5.32,
+     "CommonsenseQA": 9.01,
+     "Race": 16.19,
+     "MedMCQA": 1.68,
+     "OpenkookQA": 5.7
+   },
+   {
+     "model": "OLMo (1B)",
+     "Average": 8.8,
+     "MMLU": 8.54,
+     "WinoGrande": 6.16,
+     "PiQA": 8.05,
+     "CommonsenseQA": 13.1,
+     "Race": 13.61,
+     "MedMCQA": 2.1,
+     "OpenkookQA": 6.11
+   },
+   {
+     "model": "GPT-Neo (1.3B)",
+     "Average": 7.38,
+     "MMLU": 6.94,
+     "WinoGrande": 10.81,
+     "PiQA": 4.31,
+     "CommonsenseQA": 6.34,
+     "Race": 13.75,
+     "MedMCQA": 2.63,
+     "OpenkookQA": 4.89
+   },
+   {
+     "model": "Cerebras-GPT (1.3B)",
+     "Average": 4.84,
+     "MMLU": 5.37,
+     "WinoGrande": 9.31,
+     "PiQA": 2.16,
+     "CommonsenseQA": 6.2,
+     "Race": 6.9,
+     "MedMCQA": 1.04,
+     "OpenkookQA": 3.46
+   },
+   {
+     "model": "RedPajama (1B)",
+     "Average": 9.01,
+     "MMLU": 9.21,
+     "WinoGrande": 16.97,
+     "PiQA": 1.39,
+     "CommonsenseQA": 11.41,
+     "Race": 14.35,
+     "MedMCQA": 1.86,
+     "OpenkookQA": 3.87
+   },
+   {
+     "model": "Pythia (1.4B)",
+     "Average": 8.73,
+     "MMLU": 9.66,
+     "WinoGrande": 11.52,
+     "PiQA": 4.17,
+     "CommonsenseQA": 9.01,
+     "Race": 12.76,
+     "MedMCQA": 3.19,
+     "OpenkookQA": 5.3
+   },
+   {
+     "model": "TinyLLama (1.1B)",
+     "Average": 8.39,
+     "MMLU": 8.94,
+     "WinoGrande": 12.23,
+     "PiQA": 3.59,
+     "CommonsenseQA": 6.06,
+     "Race": 16.7,
+     "MedMCQA": 2.07,
+     "OpenkookQA": 4.68
+   },
+   {
+     "model": "OELM (1B)",
+     "Average": 8.99,
+     "MMLU": 9.03,
+     "WinoGrande": 10.18,
+     "PiQA": 9.05,
+     "CommonsenseQA": 7.75,
+     "Race": 12.78,
+     "MedMCQA": 2.5,
+     "OpenkookQA": 6.31
+   },
+   {
+     "model": "Phi-3-mini-128k-instruct (3.8B)",
+     "Average": 39.73,
+     "MMLU": 36.97,
+     "WinoGrande": 46.88,
+     "PiQA": 32.04,
+     "CommonsenseQA": 49.15,
+     "Race": 37.81,
+     "MedMCQA": 22.61,
+     "OpenkookQA": 33.6
+   },
+   {
+     "model": "Gemma (2B)",
+     "Average": 17.37,
+     "MMLU": 17.52,
+     "WinoGrande": 22.68,
+     "PiQA": 15.09,
+     "CommonsenseQA": 27.46,
+     "Race": 14.32,
+     "MedMCQA": 4.57,
+     "OpenkookQA": 14.26
+   },
+   {
+     "model": "Qwen (1.8B)",
+     "Average": 21.61,
+     "MMLU": 10.0,
+     "WinoGrande": 40.97,
+     "PiQA": 15.52,
+     "CommonsenseQA": 31.13,
+     "Race": 34.91,
+     "MedMCQA": 4.7,
+     "OpenkookQA": 20.37
+   }
+ ]