# smol_llama-101M-GQA evals (`101m-gqa.md`)
hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-101M-GQA,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 0, batch_size: 64
|     Task     |Version| Metric | Value |   |Stderr|
|--------------|------:|--------|------:|---|-----:|
|arc_easy      |      0|acc     | 0.4322|±  |0.0102|
|              |       |acc_norm| 0.3868|±  |0.0100|
|boolq         |      1|acc     | 0.6092|±  |0.0085|
|lambada_openai|      0|ppl     |74.2399|±  |2.9038|
|              |       |acc     | 0.2604|±  |0.0061|
|openbookqa    |      0|acc     | 0.1440|±  |0.0157|
|              |       |acc_norm| 0.2780|±  |0.0201|
|piqa          |      0|acc     | 0.5909|±  |0.0115|
|              |       |acc_norm| 0.5871|±  |0.0115|
|winogrande    |      0|acc     | 0.5225|±  |0.0140|

hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-101M-GQA,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 25, batch_size: 64
|    Task     |Version| Metric |Value |   |Stderr|
|-------------|------:|--------|-----:|---|-----:|
|arc_challenge|      0|acc     |0.1817|±  |0.0113|
|             |       |acc_norm|0.2329|±  |0.0124|

hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-101M-GQA,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 10, batch_size: 64
|  Task   |Version| Metric |Value |   |Stderr|
|---------|------:|--------|-----:|---|-----:|
|hellaswag|      0|acc     |0.2792|±  |0.0045|
|         |       |acc_norm|0.2865|±  |0.0045|

hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-101M-GQA,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 0, batch_size: 64
|    Task     |Version|Metric|Value |   |Stderr|
|-------------|------:|------|-----:|---|-----:|
|truthfulqa_mc|      1|mc1   |0.2485|±  |0.0151|
|             |       |mc2   |0.4594|±  |0.0151|

hf-causal-experimental (pretrained=BEE-spoke-data/smol_llama-101M-GQA,trust_remote_code=True,dtype=float), limit: None, provide_description: False, num_fewshot: 5, batch_size: 64
|                      Task                       |Version| Metric |Value |   |Stderr|
|-------------------------------------------------|------:|--------|-----:|---|-----:|
|hendrycksTest-abstract_algebra                   |      1|acc     |0.2200|±  |0.0416|
|                                                 |       |acc_norm|0.2200|±  |0.0416|
|hendrycksTest-anatomy                            |      1|acc     |0.2741|±  |0.0385|
|                                                 |       |acc_norm|0.2741|±  |0.0385|
|hendrycksTest-astronomy                          |      1|acc     |0.1776|±  |0.0311|
|                                                 |       |acc_norm|0.1776|±  |0.0311|
|hendrycksTest-business_ethics                    |      1|acc     |0.2100|±  |0.0409|
|                                                 |       |acc_norm|0.2100|±  |0.0409|
|hendrycksTest-clinical_knowledge                 |      1|acc     |0.2264|±  |0.0258|
|                                                 |       |acc_norm|0.2264|±  |0.0258|
|hendrycksTest-college_biology                    |      1|acc     |0.2500|±  |0.0362|
|                                                 |       |acc_norm|0.2500|±  |0.0362|
|hendrycksTest-college_chemistry                  |      1|acc     |0.1500|±  |0.0359|
|                                                 |       |acc_norm|0.1500|±  |0.0359|
|hendrycksTest-college_computer_science           |      1|acc     |0.1600|±  |0.0368|
|                                                 |       |acc_norm|0.1600|±  |0.0368|
|hendrycksTest-college_mathematics                |      1|acc     |0.3000|±  |0.0461|
|                                                 |       |acc_norm|0.3000|±  |0.0461|
|hendrycksTest-college_medicine                   |      1|acc     |0.1908|±  |0.0300|
|                                                 |       |acc_norm|0.1908|±  |0.0300|
|hendrycksTest-college_physics                    |      1|acc     |0.2157|±  |0.0409|
|                                                 |       |acc_norm|0.2157|±  |0.0409|
|hendrycksTest-computer_security                  |      1|acc     |0.2200|±  |0.0416|
|                                                 |       |acc_norm|0.2200|±  |0.0416|
|hendrycksTest-conceptual_physics                 |      1|acc     |0.2383|±  |0.0279|
|                                                 |       |acc_norm|0.2383|±  |0.0279|
|hendrycksTest-econometrics                       |      1|acc     |0.2456|±  |0.0405|
|                                                 |       |acc_norm|0.2456|±  |0.0405|
|hendrycksTest-electrical_engineering             |      1|acc     |0.2276|±  |0.0349|
|                                                 |       |acc_norm|0.2276|±  |0.0349|
|hendrycksTest-elementary_mathematics             |      1|acc     |0.1772|±  |0.0197|
|                                                 |       |acc_norm|0.1772|±  |0.0197|
|hendrycksTest-formal_logic                       |      1|acc     |0.2460|±  |0.0385|
|                                                 |       |acc_norm|0.2460|±  |0.0385|
|hendrycksTest-global_facts                       |      1|acc     |0.2400|±  |0.0429|
|                                                 |       |acc_norm|0.2400|±  |0.0429|
|hendrycksTest-high_school_biology                |      1|acc     |0.3065|±  |0.0262|
|                                                 |       |acc_norm|0.3065|±  |0.0262|
|hendrycksTest-high_school_chemistry              |      1|acc     |0.2759|±  |0.0314|
|                                                 |       |acc_norm|0.2759|±  |0.0314|
|hendrycksTest-high_school_computer_science       |      1|acc     |0.1600|±  |0.0368|
|                                                 |       |acc_norm|0.1600|±  |0.0368|
|hendrycksTest-high_school_european_history       |      1|acc     |0.2242|±  |0.0326|
|                                                 |       |acc_norm|0.2242|±  |0.0326|
|hendrycksTest-high_school_geography              |      1|acc     |0.2828|±  |0.0321|
|                                                 |       |acc_norm|0.2828|±  |0.0321|
|hendrycksTest-high_school_government_and_politics|      1|acc     |0.3472|±  |0.0344|
|                                                 |       |acc_norm|0.3472|±  |0.0344|
|hendrycksTest-high_school_macroeconomics         |      1|acc     |0.3026|±  |0.0233|
|                                                 |       |acc_norm|0.3026|±  |0.0233|
|hendrycksTest-high_school_mathematics            |      1|acc     |0.2667|±  |0.0270|
|                                                 |       |acc_norm|0.2667|±  |0.0270|
|hendrycksTest-high_school_microeconomics         |      1|acc     |0.2983|±  |0.0297|
|                                                 |       |acc_norm|0.2983|±  |0.0297|
|hendrycksTest-high_school_physics                |      1|acc     |0.1722|±  |0.0308|
|                                                 |       |acc_norm|0.1722|±  |0.0308|
|hendrycksTest-high_school_psychology             |      1|acc     |0.2312|±  |0.0181|
|                                                 |       |acc_norm|0.2312|±  |0.0181|
|hendrycksTest-high_school_statistics             |      1|acc     |0.4167|±  |0.0336|
|                                                 |       |acc_norm|0.4167|±  |0.0336|
|hendrycksTest-high_school_us_history             |      1|acc     |0.2451|±  |0.0302|
|                                                 |       |acc_norm|0.2451|±  |0.0302|
|hendrycksTest-high_school_world_history          |      1|acc     |0.2489|±  |0.0281|
|                                                 |       |acc_norm|0.2489|±  |0.0281|
|hendrycksTest-human_aging                        |      1|acc     |0.2422|±  |0.0288|
|                                                 |       |acc_norm|0.2422|±  |0.0288|
|hendrycksTest-human_sexuality                    |      1|acc     |0.2214|±  |0.0364|
|                                                 |       |acc_norm|0.2214|±  |0.0364|
|hendrycksTest-international_law                  |      1|acc     |0.3223|±  |0.0427|
|                                                 |       |acc_norm|0.3223|±  |0.0427|
|hendrycksTest-jurisprudence                      |      1|acc     |0.2500|±  |0.0419|
|                                                 |       |acc_norm|0.2500|±  |0.0419|
|hendrycksTest-logical_fallacies                  |      1|acc     |0.2454|±  |0.0338|
|                                                 |       |acc_norm|0.2454|±  |0.0338|
|hendrycksTest-machine_learning                   |      1|acc     |0.1964|±  |0.0377|
|                                                 |       |acc_norm|0.1964|±  |0.0377|
|hendrycksTest-management                         |      1|acc     |0.2427|±  |0.0425|
|                                                 |       |acc_norm|0.2427|±  |0.0425|
|hendrycksTest-marketing                          |      1|acc     |0.2009|±  |0.0262|
|                                                 |       |acc_norm|0.2009|±  |0.0262|
|hendrycksTest-medical_genetics                   |      1|acc     |0.2400|±  |0.0429|
|                                                 |       |acc_norm|0.2400|±  |0.0429|
|hendrycksTest-miscellaneous                      |      1|acc     |0.2593|±  |0.0157|
|                                                 |       |acc_norm|0.2593|±  |0.0157|
|hendrycksTest-moral_disputes                     |      1|acc     |0.2486|±  |0.0233|
|                                                 |       |acc_norm|0.2486|±  |0.0233|
|hendrycksTest-moral_scenarios                    |      1|acc     |0.2469|±  |0.0144|
|                                                 |       |acc_norm|0.2469|±  |0.0144|
|hendrycksTest-nutrition                          |      1|acc     |0.2157|±  |0.0236|
|                                                 |       |acc_norm|0.2157|±  |0.0236|
|hendrycksTest-philosophy                         |      1|acc     |0.2830|±  |0.0256|
|                                                 |       |acc_norm|0.2830|±  |0.0256|
|hendrycksTest-prehistory                         |      1|acc     |0.2377|±  |0.0237|
|                                                 |       |acc_norm|0.2377|±  |0.0237|
|hendrycksTest-professional_accounting            |      1|acc     |0.2801|±  |0.0268|
|                                                 |       |acc_norm|0.2801|±  |0.0268|
|hendrycksTest-professional_law                   |      1|acc     |0.2458|±  |0.0110|
|                                                 |       |acc_norm|0.2458|±  |0.0110|
|hendrycksTest-professional_medicine              |      1|acc     |0.2794|±  |0.0273|
|                                                 |       |acc_norm|0.2794|±  |0.0273|
|hendrycksTest-professional_psychology            |      1|acc     |0.2598|±  |0.0177|
|                                                 |       |acc_norm|0.2598|±  |0.0177|
|hendrycksTest-public_relations                   |      1|acc     |0.2273|±  |0.0401|
|                                                 |       |acc_norm|0.2273|±  |0.0401|
|hendrycksTest-security_studies                   |      1|acc     |0.3388|±  |0.0303|
|                                                 |       |acc_norm|0.3388|±  |0.0303|
|hendrycksTest-sociology                          |      1|acc     |0.2189|±  |0.0292|
|                                                 |       |acc_norm|0.2189|±  |0.0292|
|hendrycksTest-us_foreign_policy                  |      1|acc     |0.2100|±  |0.0409|
|                                                 |       |acc_norm|0.2100|±  |0.0409|
|hendrycksTest-virology                           |      1|acc     |0.2169|±  |0.0321|
|                                                 |       |acc_norm|0.2169|±  |0.0321|
|hendrycksTest-world_religions                    |      1|acc     |0.2047|±  |0.0309|
|                                                 |       |acc_norm|0.2047|±  |0.0309|