eval
README.md CHANGED
@@ -358,3 +358,96 @@ litgpt evaluate --tasks 'wikitext,qasper' --out_dir 'evaluate-long/' --batch_siz
|wikitext | 2|none | 0|bits_per_byte |↓ | 2.2154|± | N/A|
| | |none | 0|byte_perplexity|↓ | 4.6441|± | N/A|
| | |none | 0|word_perplexity|↓ |3683.1019|± | N/A|

## Continued Pretrain Evaluation

### lm-evaluation-harness

```bash
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-contrain-quick/' --batch_size 4 --dtype 'bfloat16' out/contrain/final/
```
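
Running all six tasks on a full checkpoint takes a while. A quick way to confirm that the checkpoint loads and the harness runs end to end is to evaluate only a handful of examples first. The sketch below is a hypothetical smoke test: it assumes your installed litgpt version forwards the harness's `limit` option as `--limit` (check `litgpt evaluate --help`); the checkpoint path and remaining flags mirror the full command above.

```bash
# Hypothetical smoke test: score only 20 examples per task to verify the
# checkpoint and task configs before launching the full suite above.
# --limit is assumed to be forwarded to lm-evaluation-harness; confirm it is
# available via `litgpt evaluate --help` in your litgpt version.
litgpt evaluate \
  --tasks 'hellaswag,arc_challenge' \
  --limit 20 \
  --out_dir 'evaluate-contrain-smoke/' \
  --batch_size 4 \
  --dtype 'bfloat16' \
  out/contrain/final/
```

The full run above produced the results below.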

| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
|---------------------------------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|arc_challenge | 1|none | 0|acc |↑ |0.1894|± |0.0115|
| | |none | 0|acc_norm |↑ |0.2193|± |0.0121|
|gsm8k | 3|flexible-extract| 5|exact_match|↑ |0.0182|± |0.0037|
| | |strict-match | 5|exact_match|↑ |0.0000|± |0.0000|
|hellaswag | 1|none | 0|acc |↑ |0.2638|± |0.0044|
| | |none | 0|acc_norm |↑ |0.2655|± |0.0044|
|mmlu | 2|none | |acc |↑ |0.2376|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2438|± |0.0063|
| - formal_logic | 1|none | 0|acc |↑ |0.2222|± |0.0372|
| - high_school_european_history | 1|none | 0|acc |↑ |0.2485|± |0.0337|
| - high_school_us_history | 1|none | 0|acc |↑ |0.2304|± |0.0296|
| - high_school_world_history | 1|none | 0|acc |↑ |0.2489|± |0.0281|
| - international_law | 1|none | 0|acc |↑ |0.2397|± |0.0390|
| - jurisprudence | 1|none | 0|acc |↑ |0.2407|± |0.0413|
| - logical_fallacies | 1|none | 0|acc |↑ |0.2025|± |0.0316|
| - moral_disputes | 1|none | 0|acc |↑ |0.1965|± |0.0214|
| - moral_scenarios | 1|none | 0|acc |↑ |0.2726|± |0.0149|
| - philosophy | 1|none | 0|acc |↑ |0.1897|± |0.0223|
| - prehistory | 1|none | 0|acc |↑ |0.2191|± |0.0230|
| - professional_law | 1|none | 0|acc |↑ |0.2529|± |0.0111|
| - world_religions | 1|none | 0|acc |↑ |0.3158|± |0.0357|
| - other | 2|none | |acc |↑ |0.2407|± |0.0077|
| - business_ethics | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - clinical_knowledge | 1|none | 0|acc |↑ |0.2302|± |0.0259|
| - college_medicine | 1|none | 0|acc |↑ |0.2370|± |0.0324|
| - global_facts | 1|none | 0|acc |↑ |0.1900|± |0.0394|
| - human_aging | 1|none | 0|acc |↑ |0.3004|± |0.0308|
| - management | 1|none | 0|acc |↑ |0.1845|± |0.0384|
| - marketing | 1|none | 0|acc |↑ |0.2863|± |0.0296|
| - medical_genetics | 1|none | 0|acc |↑ |0.3000|± |0.0461|
| - miscellaneous | 1|none | 0|acc |↑ |0.2375|± |0.0152|
| - nutrition | 1|none | 0|acc |↑ |0.2353|± |0.0243|
| - professional_accounting | 1|none | 0|acc |↑ |0.2305|± |0.0251|
| - professional_medicine | 1|none | 0|acc |↑ |0.2096|± |0.0247|
| - virology | 1|none | 0|acc |↑ |0.2289|± |0.0327|
| - social sciences | 2|none | |acc |↑ |0.2382|± |0.0077|
| - econometrics | 1|none | 0|acc |↑ |0.2368|± |0.0400|
| - high_school_geography | 1|none | 0|acc |↑ |0.1818|± |0.0275|
| - high_school_government_and_politics| 1|none | 0|acc |↑ |0.2280|± |0.0303|
| - high_school_macroeconomics | 1|none | 0|acc |↑ |0.2410|± |0.0217|
| - high_school_microeconomics | 1|none | 0|acc |↑ |0.2479|± |0.0280|
| - high_school_psychology | 1|none | 0|acc |↑ |0.2055|± |0.0173|
| - human_sexuality | 1|none | 0|acc |↑ |0.2824|± |0.0395|
| - professional_psychology | 1|none | 0|acc |↑ |0.2565|± |0.0177|
| - public_relations | 1|none | 0|acc |↑ |0.2091|± |0.0390|
| - security_studies | 1|none | 0|acc |↑ |0.2694|± |0.0284|
| - sociology | 1|none | 0|acc |↑ |0.2438|± |0.0304|
| - us_foreign_policy | 1|none | 0|acc |↑ |0.2900|± |0.0456|
| - stem | 2|none | |acc |↑ |0.2249|± |0.0074|
| - abstract_algebra | 1|none | 0|acc |↑ |0.1800|± |0.0386|
| - anatomy | 1|none | 0|acc |↑ |0.1704|± |0.0325|
| - astronomy | 1|none | 0|acc |↑ |0.2105|± |0.0332|
| - college_biology | 1|none | 0|acc |↑ |0.2500|± |0.0362|
| - college_chemistry | 1|none | 0|acc |↑ |0.1900|± |0.0394|
| - college_computer_science | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - college_mathematics | 1|none | 0|acc |↑ |0.2000|± |0.0402|
| - college_physics | 1|none | 0|acc |↑ |0.2353|± |0.0422|
| - computer_security | 1|none | 0|acc |↑ |0.2800|± |0.0451|
| - conceptual_physics | 1|none | 0|acc |↑ |0.2596|± |0.0287|
| - electrical_engineering | 1|none | 0|acc |↑ |0.2345|± |0.0353|
| - elementary_mathematics | 1|none | 0|acc |↑ |0.2434|± |0.0221|
| - high_school_biology | 1|none | 0|acc |↑ |0.1871|± |0.0222|
| - high_school_chemistry | 1|none | 0|acc |↑ |0.2118|± |0.0287|
| - high_school_computer_science | 1|none | 0|acc |↑ |0.2600|± |0.0441|
| - high_school_mathematics | 1|none | 0|acc |↑ |0.2222|± |0.0253|
| - high_school_physics | 1|none | 0|acc |↑ |0.1921|± |0.0322|
| - high_school_statistics | 1|none | 0|acc |↑ |0.2130|± |0.0279|
| - machine_learning | 1|none | 0|acc |↑ |0.3036|± |0.0436|
|truthfulqa_mc2 | 2|none | 0|acc |↑ |0.4931|± |0.0161|
|winogrande | 1|none | 0|acc |↑ |0.5012|± |0.0141|

| Groups |Version|Filter|n-shot|Metric| |Value | |Stderr|
|------------------|------:|------|------|------|---|-----:|---|-----:|
|mmlu | 2|none | |acc |↑ |0.2376|± |0.0036|
| - humanities | 2|none | |acc |↑ |0.2438|± |0.0063|
| - other | 2|none | |acc |↑ |0.2407|± |0.0077|
| - social sciences| 2|none | |acc |↑ |0.2382|± |0.0077|
| - stem | 2|none | |acc |↑ |0.2249|± |0.0074|

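Besides the console tables, the harness writes its summary into the `--out_dir`. The exact filename depends on the litgpt and lm-evaluation-harness versions, so the `results.json` name in the sketch below is an assumption; list the directory first to see what was actually written.

```bash
# List the artifacts left behind by the quick evaluation run above.
ls evaluate-contrain-quick/
# Pretty-print the harness summary; 'results.json' is an assumed filename and
# may differ between litgpt / lm-evaluation-harness versions.
python -m json.tool evaluate-contrain-quick/results.json | head -n 40
```

For a math-focused look at the continued-pretraining checkpoint:
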
```bash
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-contrain-math/' --batch_size 4 --dtype 'bfloat16' out/contrain/final/
```
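
To see what continued pretraining changed on these math tasks, the same command can be pointed at the checkpoint from before continued pretraining. The `out/pretrain/final/` path below is an assumption modeled on the `out/contrain/final/` convention used above; substitute the directory your pretraining run actually wrote.

```bash
# Hypothetical comparison run: score the base (pre-continued-pretraining)
# checkpoint on the same math tasks so the two tables can be compared directly.
# 'out/pretrain/final/' is an assumed path -- point it at your own base checkpoint.
litgpt evaluate \
  --tasks 'gsm8k,mathqa' \
  --out_dir 'evaluate-pretrain-math/' \
  --batch_size 4 \
  --dtype 'bfloat16' \
  out/pretrain/final/
```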