Adding Evaluation Results

#2
by T145 - opened
Files changed (1) hide show
  1. README.md +84 -1
README.md CHANGED
@@ -80,6 +80,9 @@ model-index:
80
  args:
81
  num_few_shot: 0
82
  metrics:
 
 
 
83
  - type: acc_norm
84
  value: 9.09
85
  name: acc_norm
@@ -97,12 +100,79 @@ model-index:
97
  args:
98
  num_few_shot: 5
99
  metrics:
 
 
 
100
  - type: acc
101
  value: 32.26
102
  name: accuracy
103
  source:
104
  url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=T145/ZEUS-8B-V10
105
  name: Open LLM Leaderboard
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  ---
107
  # ZEUS 8B 🌩️ V10
108
 
@@ -183,4 +253,17 @@ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le
183
  |MATH Lvl 5 (4-Shot)|-1.06|
184
  |GPQA (0-shot) |+3.02|
185
  |MuSR (0-shot) |+0.85|
186
- |MMLU-PRO (5-shot) |+0.08|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  args:
81
  num_few_shot: 0
82
  metrics:
83
+ - type: acc_norm
84
+ value: 9.09
85
+ name: acc_norm
86
  - type: acc_norm
87
  value: 9.09
88
  name: acc_norm
 
100
  args:
101
  num_few_shot: 5
102
  metrics:
103
+ - type: acc
104
+ value: 32.26
105
+ name: accuracy
106
  - type: acc
107
  value: 32.26
108
  name: accuracy
109
  source:
110
  url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=T145/ZEUS-8B-V10
111
  name: Open LLM Leaderboard
112
+ - task:
113
+ type: text-generation
114
+ name: Text Generation
115
+ dataset:
116
+ name: IFEval (0-Shot)
117
+ type: wis-k/instruction-following-eval
118
+ split: train
119
+ args:
120
+ num_few_shot: 0
121
+ metrics:
122
+ - type: inst_level_strict_acc and prompt_level_strict_acc
123
+ value: 77.07
124
+ name: averaged accuracy
125
+ source:
126
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V10
127
+ name: Open LLM Leaderboard
128
+ - task:
129
+ type: text-generation
130
+ name: Text Generation
131
+ dataset:
132
+ name: BBH (3-Shot)
133
+ type: SaylorTwift/bbh
134
+ split: test
135
+ args:
136
+ num_few_shot: 3
137
+ metrics:
138
+ - type: acc_norm
139
+ value: 32.7
140
+ name: normalized accuracy
141
+ source:
142
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V10
143
+ name: Open LLM Leaderboard
144
+ - task:
145
+ type: text-generation
146
+ name: Text Generation
147
+ dataset:
148
+ name: MATH Lvl 5 (4-Shot)
149
+ type: lighteval/MATH-Hard
150
+ split: test
151
+ args:
152
+ num_few_shot: 4
153
+ metrics:
154
+ - type: exact_match
155
+ value: 20.09
156
+ name: exact match
157
+ source:
158
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V10
159
+ name: Open LLM Leaderboard
160
+ - task:
161
+ type: text-generation
162
+ name: Text Generation
163
+ dataset:
164
+ name: GPQA (0-shot)
165
+ type: Idavidrein/gpqa
166
+ split: train
167
+ args:
168
+ num_few_shot: 0
169
+ metrics:
170
+ - type: acc_norm
171
+ value: 9.96
172
+ name: acc_norm
173
+ source:
174
+ url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search=T145%2FZEUS-8B-V10
175
+ name: Open LLM Leaderboard
176
  ---
177
  # ZEUS 8B 🌩️ V10
178
 
 
253
  |MATH Lvl 5 (4-Shot)|-1.06|
254
  |GPQA (0-shot) |+3.02|
255
  |MuSR (0-shot) |+0.85|
256
+ |MMLU-PRO (5-shot) |+0.08|
+
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
257
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/T145__ZEUS-8B-V10-details)!
258
+ Summarized results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/contents/viewer/default/train?q=T145/ZEUS-8B-V10)!
259
+
260
+ | Metric |Value (%)|
261
+ |-------------------|--------:|
262
+ |**Average** | 30.19|
263
+ |IFEval (0-Shot) | 77.07|
264
+ |BBH (3-Shot) | 32.70|
265
+ |MATH Lvl 5 (4-Shot)| 20.09|
266
+ |GPQA (0-shot) | 9.96|
267
+ |MuSR (0-shot) | 9.09|
268
+ |MMLU-PRO (5-shot) | 32.26|
269
+