mtasic85 committed
Commit c7d2182
1 Parent(s): 68a8a9f
Files changed (1): README.md +91 -0
README.md CHANGED
@@ -210,6 +210,97 @@ litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4
  litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
  ```
 
+ | Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
+ |---------------------------------------|------:|--------------|-----:|-----------|---|-----:|---|-----:|
+ |mmlu | 2|none | |acc |↑ |0.2459|± |0.0036|
+ | - humanities | 2|none | |acc |↑ |0.2480|± |0.0063|
+ | - formal_logic | 1|none | 0|acc |↑ |0.3175|± |0.0416|
+ | - high_school_european_history | 1|none | 0|acc |↑ |0.2424|± |0.0335|
+ | - high_school_us_history | 1|none | 0|acc |↑ |0.2402|± |0.0300|
+ | - high_school_world_history | 1|none | 0|acc |↑ |0.2743|± |0.0290|
+ | - international_law | 1|none | 0|acc |↑ |0.2314|± |0.0385|
+ | - jurisprudence | 1|none | 0|acc |↑ |0.2315|± |0.0408|
+ | - logical_fallacies | 1|none | 0|acc |↑ |0.2209|± |0.0326|
+ | - moral_disputes | 1|none | 0|acc |↑ |0.2081|± |0.0219|
+ | - moral_scenarios | 1|none | 0|acc |↑ |0.2670|± |0.0148|
+ | - philosophy | 1|none | 0|acc |↑ |0.2090|± |0.0231|
+ | - prehistory | 1|none | 0|acc |↑ |0.2160|± |0.0229|
+ | - professional_law | 1|none | 0|acc |↑ |0.2516|± |0.0111|
+ | - world_religions | 1|none | 0|acc |↑ |0.3041|± |0.0353|
+ | - other | 2|none | |acc |↑ |0.2549|± |0.0078|
+ | - business_ethics | 1|none | 0|acc |↑ |0.2700|± |0.0446|
+ | - clinical_knowledge | 1|none | 0|acc |↑ |0.2264|± |0.0258|
+ | - college_medicine | 1|none | 0|acc |↑ |0.2428|± |0.0327|
+ | - global_facts | 1|none | 0|acc |↑ |0.1600|± |0.0368|
+ | - human_aging | 1|none | 0|acc |↑ |0.2242|± |0.0280|
+ | - management | 1|none | 0|acc |↑ |0.1845|± |0.0384|
+ | - marketing | 1|none | 0|acc |↑ |0.2949|± |0.0299|
+ | - medical_genetics | 1|none | 0|acc |↑ |0.2200|± |0.0416|
+ | - miscellaneous | 1|none | 0|acc |↑ |0.2478|± |0.0154|
+ | - nutrition | 1|none | 0|acc |↑ |0.2353|± |0.0243|
+ | - professional_accounting | 1|none | 0|acc |↑ |0.2553|± |0.0260|
+ | - professional_medicine | 1|none | 0|acc |↑ |0.4118|± |0.0299|
+ | - virology | 1|none | 0|acc |↑ |0.2229|± |0.0324|
+ | - social sciences | 2|none | |acc |↑ |0.2525|± |0.0078|
+ | - econometrics | 1|none | 0|acc |↑ |0.2368|± |0.0400|
+ | - high_school_geography | 1|none | 0|acc |↑ |0.2172|± |0.0294|
+ | - high_school_government_and_politics| 1|none | 0|acc |↑ |0.2539|± |0.0314|
+ | - high_school_macroeconomics | 1|none | 0|acc |↑ |0.2410|± |0.0217|
+ | - high_school_microeconomics | 1|none | 0|acc |↑ |0.2311|± |0.0274|
+ | - high_school_psychology | 1|none | 0|acc |↑ |0.2495|± |0.0186|
+ | - human_sexuality | 1|none | 0|acc |↑ |0.2824|± |0.0395|
+ | - professional_psychology | 1|none | 0|acc |↑ |0.2565|± |0.0177|
+ | - public_relations | 1|none | 0|acc |↑ |0.2636|± |0.0422|
+ | - security_studies | 1|none | 0|acc |↑ |0.2898|± |0.0290|
+ | - sociology | 1|none | 0|acc |↑ |0.2537|± |0.0308|
+ | - us_foreign_policy | 1|none | 0|acc |↑ |0.2800|± |0.0451|
+ | - stem | 2|none | |acc |↑ |0.2274|± |0.0075|
+ | - abstract_algebra | 1|none | 0|acc |↑ |0.2200|± |0.0416|
+ | - anatomy | 1|none | 0|acc |↑ |0.2444|± |0.0371|
+ | - astronomy | 1|none | 0|acc |↑ |0.2697|± |0.0361|
+ | - college_biology | 1|none | 0|acc |↑ |0.2500|± |0.0362|
+ | - college_chemistry | 1|none | 0|acc |↑ |0.2100|± |0.0409|
+ | - college_computer_science | 1|none | 0|acc |↑ |0.2800|± |0.0451|
+ | - college_mathematics | 1|none | 0|acc |↑ |0.1900|± |0.0394|
+ | - college_physics | 1|none | 0|acc |↑ |0.2549|± |0.0434|
+ | - computer_security | 1|none | 0|acc |↑ |0.1900|± |0.0394|
+ | - conceptual_physics | 1|none | 0|acc |↑ |0.2298|± |0.0275|
+ | - electrical_engineering | 1|none | 0|acc |↑ |0.2483|± |0.0360|
+ | - elementary_mathematics | 1|none | 0|acc |↑ |0.1931|± |0.0203|
+ | - high_school_biology | 1|none | 0|acc |↑ |0.2258|± |0.0238|
+ | - high_school_chemistry | 1|none | 0|acc |↑ |0.2217|± |0.0292|
+ | - high_school_computer_science | 1|none | 0|acc |↑ |0.2400|± |0.0429|
+ | - high_school_mathematics | 1|none | 0|acc |↑ |0.2074|± |0.0247|
+ | - high_school_physics | 1|none | 0|acc |↑ |0.2185|± |0.0337|
+ | - high_school_statistics | 1|none | 0|acc |↑ |0.1991|± |0.0272|
+ | - machine_learning | 1|none | 0|acc |↑ |0.3393|± |0.0449|
+ |mmlu_pro | 2|custom-extract| |exact_match|↑ |0.0000|± |0.0000|
+ | - biology | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - business | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - chemistry | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - computer_science | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - economics | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - engineering | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - health | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - history | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - law | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - math | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - other | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - philosophy | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - physics | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+ | - psychology | 1|custom-extract| 5|exact_match|↑ |0.0000|± |0.0000|
+
+ | Groups |Version| Filter |n-shot| Metric | |Value | |Stderr|
+ |------------------|------:|--------------|------|-----------|---|-----:|---|-----:|
+ |mmlu | 2|none | |acc |↑ |0.2459|± |0.0036|
+ | - humanities | 2|none | |acc |↑ |0.2480|± |0.0063|
+ | - other | 2|none | |acc |↑ |0.2549|± |0.0078|
+ | - social sciences| 2|none | |acc |↑ |0.2525|± |0.0078|
+ | - stem | 2|none | |acc |↑ |0.2274|± |0.0075|
+ |mmlu_pro | 2|custom-extract| |exact_match|↑ |0.0000|± |0.0000|
+
+
+
  ```bash
  litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
  ```
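
The tables added above are the markdown summary printed by lm-evaluation-harness, which `litgpt evaluate` wraps. As a minimal sketch (not part of this commit), the same headline numbers could be pulled back out of an evaluation run programmatically, assuming the harness dumped its usual `results.json` into the `--out_dir` passed on the command line; the file name and metric keys such as `acc,none` and `exact_match,custom-extract` are assumptions about the harness output, not something this README guarantees:

```python
import json
from pathlib import Path

# Assumption: `litgpt evaluate` (via lm-evaluation-harness) wrote a results.json
# into the directory given as --out_dir, e.g. evaluate-mmlu/.
report = json.loads(Path("evaluate-mmlu/results.json").read_text())

# lm-evaluation-harness keys per-task metrics as "<metric>,<filter>", e.g.
# "acc,none" for the mmlu subtasks or "exact_match,custom-extract" for mmlu_pro.
for task, metrics in sorted(report.get("results", {}).items()):
    value = metrics.get("acc,none", metrics.get("exact_match,custom-extract"))
    if value is not None:
        print(f"{task:45s} {value:.4f}")
```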