eval
README.md CHANGED
@@ -210,6 +210,97 @@ litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4
```bash
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|---------------------------------------|------:|--------------|-----:|-----------|---|-----:|---|-----:|
|mmlu | 2|none | |acc |↑ |0.2459|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2480|± |0.0063|
| - formal_logic | 1|none | 0|acc |↑ |0.3175|± |0.0416|
| - high_school_european_history | 1|none | 0|acc |↑ |0.2424|± |0.0335|
| - high_school_us_history | 1|none | 0|acc |↑ |0.2402|± |0.0300|
| - high_school_world_history | 1|none | 0|acc |↑ |0.2743|± |0.0290|
| - international_law | 1|none | 0|acc |↑ |0.2314|± |0.0385|
| - jurisprudence | 1|none | 0|acc |↑ |0.2315|± |0.0408|
| - logical_fallacies | 1|none | 0|acc |↑ |0.2209|± |0.0326|
| - moral_disputes | 1|none | 0|acc |↑ |0.2081|± |0.0219|
| - moral_scenarios | 1|none | 0|acc |↑ |0.2670|± |0.0148|
| - philosophy | 1|none | 0|acc |↑ |0.2090|± |0.0231|
| - prehistory | 1|none | 0|acc |↑ |0.2160|± |0.0229|
| - professional_law | 1|none | 0|acc |↑ |0.2516|± |0.0111|
| - world_religions | 1|none | 0|acc |↑ |0.3041|± |0.0353|
| - other | 2|none | |acc |↑ |0.2549|± |0.0078|
| - business_ethics | 1|none | 0|acc |↑ |0.2700|± |0.0446|
| - clinical_knowledge | 1|none | 0|acc |↑ |0.2264|± |0.0258|
| - college_medicine | 1|none | 0|acc |↑ |0.2428|± |0.0327|
| - global_facts | 1|none | 0|acc |↑ |0.1600|± |0.0368|
| - human_aging | 1|none | 0|acc |↑ |0.2242|± |0.0280|
| - management | 1|none | 0|acc |↑ |0.1845|± |0.0384|
| - marketing | 1|none | 0|acc |↑ |0.2949|± |0.0299|
| - medical_genetics | 1|none | 0|acc |↑ |0.2200|± |0.0416|
| - miscellaneous | 1|none | 0|acc |↑ |0.2478|± |0.0154|
| - nutrition | 1|none | 0|acc |↑ |0.2353|± |0.0243|
| - professional_accounting | 1|none | 0|acc |↑ |0.2553|± |0.0260|
| - professional_medicine | 1|none | 0|acc |↑ |0.4118|± |0.0299|
| - virology | 1|none | 0|acc |↑ |0.2229|± |0.0324|
| - social sciences | 2|none | |acc |↑ |0.2525|± |0.0078|
| - econometrics | 1|none | 0|acc |↑ |0.2368|± |0.0400|
| - high_school_geography | 1|none | 0|acc |↑ |0.2172|± |0.0294|
| - high_school_government_and_politics| 1|none | 0|acc |↑ |0.2539|± |0.0314|
| - high_school_macroeconomics | 1|none | 0|acc |↑ |0.2410|± |0.0217|
| - high_school_microeconomics | 1|none | 0|acc |↑ |0.2311|± |0.0274|
| - high_school_psychology | 1|none | 0|acc |↑ |0.2495|± |0.0186|
| - human_sexuality | 1|none | 0|acc |↑ |0.2824|± |0.0395|
| - professional_psychology | 1|none | 0|acc |↑ |0.2565|± |0.0177|
| - public_relations | 1|none | 0|acc |↑ |0.2636|± |0.0422|
| - security_studies | 1|none | 0|acc |↑ |0.2898|± |0.0290|
| - sociology | 1|none | 0|acc |↑ |0.2537|± |0.0308|
| - us_foreign_policy | 1|none | 0|acc |↑ |0.2800|± |0.0451|
| - stem | 2|none | |acc |↑ |0.2274|± |0.0075|
| - abstract_algebra | 1|none | 0|acc |↑ |0.2200|± |0.0416|
| - anatomy | 1|none | 0|acc |↑ |0.2444|± |0.0371|
| - astronomy | 1|none | 0|acc |↑ |0.2697|± |0.0361|
| - college_biology | 1|none | 0|acc |↑ |0.2500|± |0.0362|
| - college_chemistry | 1|none | 0|acc |↑ |0.2100|± |0.0409|
| - college_computer_science | 1|none | 0|acc |↑ |0.2800|± |0.0451|
| - college_mathematics | 1|none | 0|acc |↑ |0.1900|± |0.0394|
| - college_physics | 1|none | 0|acc |↑ |0.2549|± |0.0434|
| - computer_security | 1|none | 0|acc |↑ |0.1900|± |0.0394|
| - conceptual_physics | 1|none | 0|acc |↑ |0.2298|± |0.0275|
| - electrical_engineering | 1|none | 0|acc |↑ |0.2483|± |0.0360|
| - elementary_mathematics | 1|none | 0|acc |↑ |0.1931|± |0.0203|
| - high_school_biology | 1|none | 0|acc |↑ |0.2258|± |0.0238|
| - high_school_chemistry | 1|none | 0|acc |↑ |0.2217|± |0.0292|
| - high_school_computer_science | 1|none | 0|acc |↑ |0.2400|± |0.0429|
| - high_school_mathematics | 1|none | 0|acc |↑ |0.2074|± |0.0247|
| - high_school_physics | 1|none | 0|acc |↑ |0.2185|± |0.0337|
| - high_school_statistics | 1|none | 0|acc |↑ |0.1991|± |0.0272|
| - machine_learning | 1|none | 0|acc |↑ |0.3393|± |0.0449|
|mmlu_pro | 2|custom-extract| |exact_match|↑ |0.0000|± |0.0000|
| - biology | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - business | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - chemistry | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - computer_science | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - economics | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - engineering | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - health | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - history | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - law | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - math | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - other | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - philosophy | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - physics | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
| - psychology | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|

| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr|
|------------------|------:|--------------|------|-----------|---|-----:|---|-----:|
|mmlu | 2|none | |acc |↑ |0.2459|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2480|± |0.0063|
| - other | 2|none | |acc |↑ |0.2549|± |0.0078|
| - social sciences| 2|none | |acc |↑ |0.2525|± |0.0078|
| - stem | 2|none | |acc |↑ |0.2274|± |0.0075|
|mmlu_pro | 2|custom-extract| |exact_match|↑ |0.0000|± |0.0000|

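For downstream processing it can be handy to pull these numbers out of the files that `litgpt evaluate` leaves in `--out_dir` instead of copying the table by hand. The sketch below is illustrative only: it assumes the lm-evaluation-harness report was saved as a `results*.json` somewhere under `evaluate-mmlu/`, and the exact filename and metric-key layout depend on the harness version.

```python
# Illustrative sketch: summarize per-task scores from an lm-eval results file.
# Assumes a results*.json was written under the --out_dir used above; the exact
# filename and key layout depend on the lm-evaluation-harness version.
import json
from pathlib import Path

out_dir = Path("evaluate-mmlu")  # --out_dir from the command above
results_file = next(out_dir.glob("**/results*.json"), None)
if results_file is None:
    raise SystemExit(f"no results file found under {out_dir}")

report = json.loads(results_file.read_text())
for task, metrics in report.get("results", {}).items():
    # Metric keys typically look like "acc,none" or "exact_match,custom-extract".
    for key, value in metrics.items():
        if key.split(",")[0] in {"acc", "exact_match"} and isinstance(value, (int, float)):
            print(f"{task:45s} {key:30s} {value:.4f}")
```
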
```bash
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
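
The suites in this section can also be driven from a single script. The following sketch simply shells out to the same `litgpt evaluate` invocations shown above; the task lists, output directories, batch size, dtype, and the `out/pretrain/final/` checkpoint path are copied verbatim from those commands.

```python
# Run the evaluation suites from this section back to back.
# All flags and paths mirror the CLI commands shown above.
import subprocess

suites = {
    "evaluate-mmlu/": "mmlu,mmlu_pro",
    "evaluate-reasoning/": "arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande",
}

for out_dir, tasks in suites.items():
    subprocess.run(
        [
            "litgpt", "evaluate",
            "--tasks", tasks,
            "--out_dir", out_dir,
            "--batch_size", "4",
            "--dtype", "bfloat16",
            "out/pretrain/final/",
        ],
        check=True,
    )
```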