Update README.md
README.md (CHANGED)
@@ -65,4 +65,395 @@ generated_ids = [
response = tokenizer.batch_decode(generated_ids)[0]
print(response)
```
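
If the decoded text still contains special tokens from the chat template (for example end-of-turn markers), a minimal variant of the decode step above is to strip them while decoding. This reuses the same `generated_ids` from the snippet and only adds the standard `skip_special_tokens` flag:

```python
# Same decode step as above, but with special tokens stripped from the output.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
```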

## Benchmarks

Nous Benchmark:

| Model |AGIEval|GPT4All|TruthfulQA|Bigbench|Average|
|---------------------------------------------------|------:|------:|---------:|-------:|------:|
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)| 43.55| 71.48| 48.54| 41.43| 51.25|

### AGIEval
| Task |Version| Metric |Value| |Stderr|
|------------------------------|------:|--------|----:|---|-----:|
|agieval_aqua_rat | 0|acc |35.83|± | 3.01|
| | |acc_norm|31.89|± | 2.93|
|agieval_logiqa_en | 0|acc |38.25|± | 1.91|
| | |acc_norm|37.79|± | 1.90|
|agieval_lsat_ar | 0|acc |23.04|± | 2.78|
| | |acc_norm|20.43|± | 2.66|
|agieval_lsat_lr | 0|acc |48.04|± | 2.21|
| | |acc_norm|42.75|± | 2.19|
|agieval_lsat_rc | 0|acc |61.34|± | 2.97|
| | |acc_norm|52.79|± | 3.05|
|agieval_sat_en | 0|acc |79.13|± | 2.84|
| | |acc_norm|72.33|± | 3.12|
|agieval_sat_en_without_passage| 0|acc |44.17|± | 3.47|
| | |acc_norm|42.72|± | 3.45|
|agieval_sat_math | 0|acc |52.27|± | 3.38|
| | |acc_norm|47.73|± | 3.38|

Average: 43.55%

### GPT4All
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |54.95|± | 1.45|
| | |acc_norm|58.70|± | 1.44|
|arc_easy | 0|acc |82.28|± | 0.78|
| | |acc_norm|81.10|± | 0.80|
|boolq | 1|acc |86.15|± | 0.60|
|hellaswag | 0|acc |59.16|± | 0.49|
| | |acc_norm|77.53|± | 0.42|
|openbookqa | 0|acc |37.40|± | 2.17|
| | |acc_norm|44.00|± | 2.22|
|piqa | 0|acc |79.00|± | 0.95|
| | |acc_norm|80.25|± | 0.93|
|winogrande | 0|acc |72.61|± | 1.25|

Average: 71.48%

### TruthfulQA
| Task |Version|Metric|Value| |Stderr|
|-------------|------:|------|----:|---|-----:|
|truthfulqa_mc| 1|mc1 |33.05|± | 1.65|
| | |mc2 |48.54|± | 1.54|

Average: 48.54%

### Bigbench
| Task |Version| Metric |Value| |Stderr|
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|bigbench_causal_judgement | 0|multiple_choice_grade|54.74|± | 3.62|
|bigbench_date_understanding | 0|multiple_choice_grade|68.02|± | 2.43|
|bigbench_disambiguation_qa | 0|multiple_choice_grade|40.31|± | 3.06|
|bigbench_geometric_shapes | 0|multiple_choice_grade|30.36|± | 2.43|
| | |exact_str_match | 2.23|± | 0.78|
|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|26.00|± | 1.96|
|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|20.71|± | 1.53|
|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|44.00|± | 2.87|
|bigbench_movie_recommendation | 0|multiple_choice_grade|35.00|± | 2.14|
|bigbench_navigate | 0|multiple_choice_grade|58.40|± | 1.56|
|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|61.80|± | 1.09|
|bigbench_ruin_names | 0|multiple_choice_grade|42.41|± | 2.34|
|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|31.56|± | 1.47|
|bigbench_snarks | 0|multiple_choice_grade|55.25|± | 3.71|
|bigbench_sports_understanding | 0|multiple_choice_grade|69.37|± | 1.47|
|bigbench_temporal_sequences | 0|multiple_choice_grade|27.70|± | 1.42|
|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|21.36|± | 1.16|
|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|14.69|± | 0.85|
|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|44.00|± | 2.87|

Average: 41.43%

Average score: 51.25%
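
The suite and overall averages above appear to be plain unweighted means of the per-task scores (acc_norm where reported for AGIEval and GPT4All, mc2 for TruthfulQA, multiple_choice_grade for Bigbench). A minimal sketch of that arithmetic, using the values from the tables above:

```python
# Unweighted means that reproduce the reported suite and overall averages.
agieval = [31.89, 37.79, 20.43, 42.75, 52.79, 72.33, 42.72, 47.73]   # acc_norm
gpt4all = [58.70, 81.10, 86.15, 77.53, 44.00, 80.25, 72.61]          # acc_norm (acc for boolq, winogrande)
truthfulqa = [48.54]                                                 # mc2
bigbench = [54.74, 68.02, 40.31, 30.36, 26.00, 20.71, 44.00, 35.00,  # multiple_choice_grade
            58.40, 61.80, 42.41, 31.56, 55.25, 69.37, 27.70, 21.36,
            14.69, 44.00]

suite_means = [sum(s) / len(s) for s in (agieval, gpt4all, truthfulqa, bigbench)]
print([round(m, 2) for m in suite_means])             # [43.55, 71.48, 48.54, 41.43]
print(round(sum(suite_means) / len(suite_means), 2))  # 51.25
```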

OpenLLM Benchmark:

| Model |ARC |HellaSwag|MMLU |TruthfulQA|Winogrande|GSM8K|Average|
|---------------------------------------------------|---:|--------:|----:|---------:|---------:|----:|------:|
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)|61.6| 79.89|69.95| 48.59| 77.35|67.48| 67.48|
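
The per-task tables below follow the output format of EleutherAI's lm-evaluation-harness. As a rough sketch only (harness version, task names, few-shot count, and batch size here are illustrative assumptions, not the exact settings behind the reported numbers), a single task such as ARC-Challenge could be run from Python roughly like this:

```python
# Hedged sketch: evaluating one OpenLLM task with lm-evaluation-harness (v0.4+ API).
# The few-shot count, batch size, and dtype are illustrative assumptions.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=qnguyen3/Master-Yi-9B,dtype=bfloat16",
    tasks=["arc_challenge"],
    num_fewshot=25,
    batch_size=8,
)
print(results["results"]["arc_challenge"])
```

The returned dictionary should mirror the tables below, with one entry per metric (acc,none, acc_norm,none, and their standard errors).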

### ARC
| Task |Version| Metric | Value | |Stderr|
|-------------|------:|--------------------|-------------|---|------|
|arc_challenge| 1|acc,none | 0.59| | |
| | |acc_stderr,none | 0.01| | |
| | |acc_norm,none | 0.62| | |
| | |acc_norm_stderr,none| 0.01| | |
| | |alias |arc_challenge| | |

Average: 61.6%

### HellaSwag
| Task |Version| Metric | Value | |Stderr|
|---------|------:|--------------------|---------|---|------|
|hellaswag| 1|acc,none | 0.61| | |
| | |acc_stderr,none | 0| | |
| | |acc_norm,none | 0.80| | |
| | |acc_norm_stderr,none| 0| | |
| | |alias |hellaswag| | |

Average: 79.89%

### MMLU
| Task |Version| Metric | Value | |Stderr|
|----------------------------------------|-------|---------------|---------------------------------------|---|------|
|mmlu |N/A |acc,none | 0.7| | |
| | |acc_stderr,none| 0| | |
| | |alias |mmlu | | |
|mmlu_abstract_algebra | 0|alias | - abstract_algebra | | |
| | |acc,none |0.46 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_anatomy | 0|alias | - anatomy | | |
| | |acc,none |0.64 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_astronomy | 0|alias | - astronomy | | |
| | |acc,none |0.77 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_business_ethics | 0|alias | - business_ethics | | |
| | |acc,none |0.76 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_clinical_knowledge | 0|alias | - clinical_knowledge | | |
| | |acc,none |0.71 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_college_biology | 0|alias | - college_biology | | |
| | |acc,none |0.82 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_college_chemistry | 0|alias | - college_chemistry | | |
| | |acc,none |0.52 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_college_computer_science | 0|alias | - college_computer_science | | |
| | |acc,none |0.56 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_college_mathematics | 0|alias | - college_mathematics | | |
| | |acc,none |0.44 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_college_medicine | 0|alias | - college_medicine | | |
| | |acc,none |0.72 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_college_physics | 0|alias | - college_physics | | |
| | |acc,none |0.45 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_computer_security | 0|alias | - computer_security | | |
| | |acc,none |0.81 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_conceptual_physics | 0|alias | - conceptual_physics | | |
| | |acc,none |0.74 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_econometrics | 0|alias | - econometrics | | |
| | |acc,none |0.65 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_electrical_engineering | 0|alias | - electrical_engineering | | |
| | |acc,none |0.72 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_elementary_mathematics | 0|alias | - elementary_mathematics | | |
| | |acc,none |0.62 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_formal_logic | 0|alias | - formal_logic | | |
| | |acc,none |0.57 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_global_facts | 0|alias | - global_facts | | |
| | |acc,none |0.46 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_high_school_biology | 0|alias | - high_school_biology | | |
| | |acc,none |0.86 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_chemistry | 0|alias | - high_school_chemistry | | |
| | |acc,none |0.67 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_high_school_computer_science | 0|alias | - high_school_computer_science | | |
| | |acc,none |0.84 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_high_school_european_history | 0|alias | - high_school_european_history | | |
| | |acc,none |0.82 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_high_school_geography | 0|alias | - high_school_geography | | |
| | |acc,none |0.86 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_government_and_politics| 0|alias | - high_school_government_and_politics| | |
| | |acc,none |0.90 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_macroeconomics | 0|alias | - high_school_macroeconomics | | |
| | |acc,none |0.75 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_mathematics | 0|alias | - high_school_mathematics | | |
| | |acc,none |0.43 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_high_school_microeconomics | 0|alias | - high_school_microeconomics | | |
| | |acc,none |0.86 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_physics | 0|alias | - high_school_physics | | |
| | |acc,none |0.45 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_high_school_psychology | 0|alias | - high_school_psychology | | |
| | |acc,none |0.87 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_high_school_statistics | 0|alias | - high_school_statistics | | |
| | |acc,none |0.68 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_high_school_us_history | 0|alias | - high_school_us_history | | |
| | |acc,none |0.85 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_high_school_world_history | 0|alias | - high_school_world_history | | |
| | |acc,none |0.85 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_human_aging | 0|alias | - human_aging | | |
| | |acc,none |0.76 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_human_sexuality | 0|alias | - human_sexuality | | |
| | |acc,none |0.78 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_humanities |N/A |alias | - humanities | | |
| | |acc,none |0.63 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_international_law | 0|alias | - international_law | | |
| | |acc,none |0.79 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_jurisprudence | 0|alias | - jurisprudence | | |
| | |acc,none |0.79 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_logical_fallacies | 0|alias | - logical_fallacies | | |
| | |acc,none |0.80 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_machine_learning | 0|alias | - machine_learning | | |
| | |acc,none |0.52 | | |
| | |acc_stderr,none|0.05 | | |
|mmlu_management | 0|alias | - management | | |
| | |acc,none |0.83 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_marketing | 0|alias | - marketing | | |
| | |acc,none |0.89 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_medical_genetics | 0|alias | - medical_genetics | | |
| | |acc,none |0.78 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_miscellaneous | 0|alias | - miscellaneous | | |
| | |acc,none |0.85 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_moral_disputes | 0|alias | - moral_disputes | | |
| | |acc,none |0.75 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_moral_scenarios | 0|alias | - moral_scenarios | | |
| | |acc,none |0.48 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_nutrition | 0|alias | - nutrition | | |
| | |acc,none |0.77 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_other |N/A |alias | - other | | |
| | |acc,none |0.75 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_philosophy | 0|alias | - philosophy | | |
| | |acc,none |0.78 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_prehistory | 0|alias | - prehistory | | |
| | |acc,none |0.77 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_professional_accounting | 0|alias | - professional_accounting | | |
| | |acc,none |0.57 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_professional_law | 0|alias | - professional_law | | |
| | |acc,none |0.50 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_professional_medicine | 0|alias | - professional_medicine | | |
| | |acc,none |0.71 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_professional_psychology | 0|alias | - professional_psychology | | |
| | |acc,none |0.73 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_public_relations | 0|alias | - public_relations | | |
| | |acc,none |0.76 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_security_studies | 0|alias | - security_studies | | |
| | |acc,none |0.78 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_social_sciences |N/A |alias | - social_sciences | | |
| | |acc,none |0.81 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_sociology | 0|alias | - sociology | | |
| | |acc,none |0.86 | | |
| | |acc_stderr,none|0.02 | | |
|mmlu_stem |N/A |alias | - stem | | |
| | |acc,none |0.65 | | |
| | |acc_stderr,none|0.01 | | |
|mmlu_us_foreign_policy | 0|alias | - us_foreign_policy | | |
| | |acc,none |0.92 | | |
| | |acc_stderr,none|0.03 | | |
|mmlu_virology | 0|alias | - virology | | |
| | |acc,none |0.58 | | |
| | |acc_stderr,none|0.04 | | |
|mmlu_world_religions | 0|alias | - world_religions | | |
| | |acc,none |0.82 | | |
| | |acc_stderr,none|0.03 | | |

Average: 69.95%

### TruthfulQA
| Task |Version| Metric | Value | |Stderr|
|--------------|-------|-----------------------|-----------------|---|------|
|truthfulqa |N/A |bleu_acc,none | 0.45| | |
| | |bleu_acc_stderr,none | 0.02| | |
| | |rouge1_acc,none | 0.45| | |
| | |rouge1_acc_stderr,none | 0.02| | |
| | |rouge2_diff,none | 0.92| | |
| | |rouge2_diff_stderr,none| 1.07| | |
| | |bleu_max,none | 23.77| | |
| | |bleu_max_stderr,none | 0.81| | |
| | |rouge2_acc,none | 0.38| | |
| | |rouge2_acc_stderr,none | 0.02| | |
| | |acc,none | 0.41| | |
| | |acc_stderr,none | 0.01| | |
| | |rougeL_diff,none | 1.57| | |
| | |rougeL_diff_stderr,none| 0.93| | |
| | |rougeL_acc,none | 0.46| | |
| | |rougeL_acc_stderr,none | 0.02| | |
| | |bleu_diff,none | 1.38| | |
| | |bleu_diff_stderr,none | 0.75| | |
| | |rouge2_max,none | 33.01| | |
| | |rouge2_max_stderr,none | 1.05| | |
| | |rouge1_diff,none | 1.72| | |
| | |rouge1_diff_stderr,none| 0.92| | |
| | |rougeL_max,none | 45.25| | |
| | |rougeL_max_stderr,none | 0.92| | |
| | |rouge1_max,none | 48.29| | |
| | |rouge1_max_stderr,none | 0.90| | |
| | |alias |truthfulqa | | |
|truthfulqa_gen| 3|bleu_max,none | 23.77| | |
| | |bleu_max_stderr,none | 0.81| | |
| | |bleu_acc,none | 0.45| | |
| | |bleu_acc_stderr,none | 0.02| | |
| | |bleu_diff,none | 1.38| | |
| | |bleu_diff_stderr,none | 0.75| | |
| | |rouge1_max,none | 48.29| | |
| | |rouge1_max_stderr,none | 0.90| | |
| | |rouge1_acc,none | 0.45| | |
| | |rouge1_acc_stderr,none | 0.02| | |
| | |rouge1_diff,none | 1.72| | |
| | |rouge1_diff_stderr,none| 0.92| | |
| | |rouge2_max,none | 33.01| | |
| | |rouge2_max_stderr,none | 1.05| | |
| | |rouge2_acc,none | 0.38| | |
| | |rouge2_acc_stderr,none | 0.02| | |
| | |rouge2_diff,none | 0.92| | |
| | |rouge2_diff_stderr,none| 1.07| | |
| | |rougeL_max,none | 45.25| | |
| | |rougeL_max_stderr,none | 0.92| | |
| | |rougeL_acc,none | 0.46| | |
| | |rougeL_acc_stderr,none | 0.02| | |
| | |rougeL_diff,none | 1.57| | |
| | |rougeL_diff_stderr,none| 0.93| | |
| | |alias | - truthfulqa_gen| | |
|truthfulqa_mc1| 2|acc,none | 0.33| | |
| | |acc_stderr,none | 0.02| | |
| | |alias | - truthfulqa_mc1| | |
|truthfulqa_mc2| 2|acc,none | 0.49| | |
| | |acc_stderr,none | 0.02| | |
| | |alias | - truthfulqa_mc2| | |

Average: 48.59%

### Winogrande
| Task |Version| Metric | Value | |Stderr|
|----------|------:|---------------|----------|---|------|
|winogrande| 1|acc,none | 0.77| | |
| | |acc_stderr,none| 0.01| | |
| | |alias |winogrande| | |

Average: 77.35%

### GSM8K
|Task |Version| Metric |Value| |Stderr|
|-----|------:|-----------------------------------|-----|---|------|
|gsm8k| 3|exact_match,strict-match | 0.67| | |
| | |exact_match_stderr,strict-match | 0.01| | |
| | |exact_match,flexible-extract | 0.68| | |
| | |exact_match_stderr,flexible-extract| 0.01| | |
| | |alias |gsm8k| | |

Average: 67.48%

Average score: 67.48%