Muennighoff
commited on
Commit
•
f5f9b77
1
Parent(s):
e989436
Add files
Browse files- 8b7178b44b/evaluation/rankeval/8b7178b44b_3.json +15 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json +15 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_4.json +34 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json +34 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_5.json +39 -1
- 8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json +39 -1
8b7178b44b/evaluation/rankeval/8b7178b44b_3.json
CHANGED
@@ -54,6 +54,18 @@
|
|
54 |
"acc_stderr": 0.013203196088537369,
|
55 |
"acc_norm": 0.32081911262798635,
|
56 |
"acc_norm_stderr": 0.013640943091946524
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
},
|
59 |
"versions": {
|
@@ -68,6 +80,8 @@
|
|
68 |
"storycloze_2016": 0,
|
69 |
"boolq": 1,
|
70 |
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
|
|
|
|
72 |
}
|
73 |
}
|
|
|
54 |
"acc_stderr": 0.013203196088537369,
|
55 |
"acc_norm": 0.32081911262798635,
|
56 |
"acc_norm_stderr": 0.013640943091946524
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.923,
|
60 |
+
"acc_stderr": 0.008434580140240651,
|
61 |
+
"acc_norm": 0.925,
|
62 |
+
"acc_norm_stderr": 0.00833333333333335
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7442872687704026,
|
66 |
+
"acc_stderr": 0.010178690109459862,
|
67 |
+
"acc_norm": 0.7519042437431991,
|
68 |
+
"acc_norm_stderr": 0.010077118315574703
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_3_lm-eval_global_step84877_2023-01-31-11-38-06_3shots_backup.json
CHANGED
@@ -54,6 +54,18 @@
|
|
54 |
"acc_stderr": 0.013203196088537369,
|
55 |
"acc_norm": 0.32081911262798635,
|
56 |
"acc_norm_stderr": 0.013640943091946524
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
},
|
59 |
"versions": {
|
@@ -68,6 +80,8 @@
|
|
68 |
"storycloze_2016": 0,
|
69 |
"boolq": 1,
|
70 |
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
|
|
|
|
72 |
}
|
73 |
}
|
|
|
54 |
"acc_stderr": 0.013203196088537369,
|
55 |
"acc_norm": 0.32081911262798635,
|
56 |
"acc_norm_stderr": 0.013640943091946524
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.923,
|
60 |
+
"acc_stderr": 0.008434580140240651,
|
61 |
+
"acc_norm": 0.925,
|
62 |
+
"acc_norm_stderr": 0.00833333333333335
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7442872687704026,
|
66 |
+
"acc_stderr": 0.010178690109459862,
|
67 |
+
"acc_norm": 0.7519042437431991,
|
68 |
+
"acc_norm_stderr": 0.010077118315574703
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_4.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7097808658471406,
|
40 |
"acc_stderr": 0.010495529690730063
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7097808658471406,
|
40 |
"acc_stderr": 0.010495529690730063
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6241590214067279,
|
44 |
+
"acc_stderr": 0.008471147248160114
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6401515151515151,
|
48 |
+
"acc_stderr": 0.009848484848484843,
|
49 |
+
"acc_norm": 0.6346801346801347,
|
50 |
+
"acc_norm_stderr": 0.009880576614806924
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.28924914675767915,
|
54 |
+
"acc_stderr": 0.013250012579393443,
|
55 |
+
"acc_norm": 0.318259385665529,
|
56 |
+
"acc_norm_stderr": 0.013611993916971453
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.927,
|
60 |
+
"acc_stderr": 0.008230354715244055,
|
61 |
+
"acc_norm": 0.928,
|
62 |
+
"acc_norm_stderr": 0.008178195576218681
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7453754080522307,
|
66 |
+
"acc_stderr": 0.010164432237060487,
|
67 |
+
"acc_norm": 0.7448313384113167,
|
68 |
+
"acc_norm_stderr": 0.010171571592521834
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_4_lm-eval_global_step84877_2023-01-31-11-38-06_4shots_backup.json
CHANGED
@@ -38,6 +38,34 @@
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7097808658471406,
|
40 |
"acc_stderr": 0.010495529690730063
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
}
|
42 |
},
|
43 |
"versions": {
|
@@ -49,6 +77,11 @@
|
|
49 |
"hellaswag": 0,
|
50 |
"rte": 0,
|
51 |
"winogrande": 0,
|
52 |
-
"storycloze_2016": 0
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
}
|
|
|
38 |
"storycloze_2016": {
|
39 |
"acc": 0.7097808658471406,
|
40 |
"acc_stderr": 0.010495529690730063
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6241590214067279,
|
44 |
+
"acc_stderr": 0.008471147248160114
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6401515151515151,
|
48 |
+
"acc_stderr": 0.009848484848484843,
|
49 |
+
"acc_norm": 0.6346801346801347,
|
50 |
+
"acc_norm_stderr": 0.009880576614806924
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.28924914675767915,
|
54 |
+
"acc_stderr": 0.013250012579393443,
|
55 |
+
"acc_norm": 0.318259385665529,
|
56 |
+
"acc_norm_stderr": 0.013611993916971453
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.927,
|
60 |
+
"acc_stderr": 0.008230354715244055,
|
61 |
+
"acc_norm": 0.928,
|
62 |
+
"acc_norm_stderr": 0.008178195576218681
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7453754080522307,
|
66 |
+
"acc_stderr": 0.010164432237060487,
|
67 |
+
"acc_norm": 0.7448313384113167,
|
68 |
+
"acc_norm_stderr": 0.010171571592521834
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_5.json
CHANGED
@@ -34,6 +34,38 @@
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.569060773480663,
|
36 |
"acc_stderr": 0.01391779662333596
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
}
|
38 |
},
|
39 |
"versions": {
|
@@ -44,6 +76,12 @@
|
|
44 |
"copa": 0,
|
45 |
"hellaswag": 0,
|
46 |
"rte": 0,
|
47 |
-
"winogrande": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
}
|
49 |
}
|
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.569060773480663,
|
36 |
"acc_stderr": 0.01391779662333596
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.7097808658471406,
|
40 |
+
"acc_stderr": 0.010495529690730063
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6223241590214067,
|
44 |
+
"acc_stderr": 0.008479309208281643
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6456228956228957,
|
48 |
+
"acc_stderr": 0.00981500403025175,
|
49 |
+
"acc_norm": 0.6506734006734006,
|
50 |
+
"acc_norm_stderr": 0.0097828534493993
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.29180887372013653,
|
54 |
+
"acc_stderr": 0.01328452529240351,
|
55 |
+
"acc_norm": 0.33532423208191126,
|
56 |
+
"acc_norm_stderr": 0.013796182947785562
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.931,
|
60 |
+
"acc_stderr": 0.00801893405031515,
|
61 |
+
"acc_norm": 0.936,
|
62 |
+
"acc_norm_stderr": 0.007743640226919298
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7388465723612623,
|
66 |
+
"acc_stderr": 0.010248738649935581,
|
67 |
+
"acc_norm": 0.7459194776931447,
|
68 |
+
"acc_norm_stderr": 0.010157271999135055
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
76 |
"copa": 0,
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
8b7178b44b/evaluation/rankeval/8b7178b44b_5_lm-eval_global_step84877_2023-01-31-11-38-06_5shots_backup.json
CHANGED
@@ -34,6 +34,38 @@
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.569060773480663,
|
36 |
"acc_stderr": 0.01391779662333596
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
}
|
38 |
},
|
39 |
"versions": {
|
@@ -44,6 +76,12 @@
|
|
44 |
"copa": 0,
|
45 |
"hellaswag": 0,
|
46 |
"rte": 0,
|
47 |
-
"winogrande": 0
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
}
|
49 |
}
|
|
|
34 |
"winogrande": {
|
35 |
"acc": 0.569060773480663,
|
36 |
"acc_stderr": 0.01391779662333596
|
37 |
+
},
|
38 |
+
"storycloze_2016": {
|
39 |
+
"acc": 0.7097808658471406,
|
40 |
+
"acc_stderr": 0.010495529690730063
|
41 |
+
},
|
42 |
+
"boolq": {
|
43 |
+
"acc": 0.6223241590214067,
|
44 |
+
"acc_stderr": 0.008479309208281643
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6456228956228957,
|
48 |
+
"acc_stderr": 0.00981500403025175,
|
49 |
+
"acc_norm": 0.6506734006734006,
|
50 |
+
"acc_norm_stderr": 0.0097828534493993
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.29180887372013653,
|
54 |
+
"acc_stderr": 0.01328452529240351,
|
55 |
+
"acc_norm": 0.33532423208191126,
|
56 |
+
"acc_norm_stderr": 0.013796182947785562
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.931,
|
60 |
+
"acc_stderr": 0.00801893405031515,
|
61 |
+
"acc_norm": 0.936,
|
62 |
+
"acc_norm_stderr": 0.007743640226919298
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7388465723612623,
|
66 |
+
"acc_stderr": 0.010248738649935581,
|
67 |
+
"acc_norm": 0.7459194776931447,
|
68 |
+
"acc_norm_stderr": 0.010157271999135055
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
76 |
"copa": 0,
|
77 |
"hellaswag": 0,
|
78 |
"rte": 0,
|
79 |
+
"winogrande": 0,
|
80 |
+
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|