Spaces:
Runtime error
Runtime error
hunterhector
commited on
Commit
·
e74bc72
1
Parent(s):
2c39f2b
fix data columns
Browse files- data/txt360_eval/CKPT Eval - BoolQ.csv +0 -68
- data/txt360_eval/CKPT Eval - GSM8K.csv +0 -68
- data/txt360_eval/CKPT Eval - HellaSwag.csv +68 -69
- data/txt360_eval/CKPT Eval - MATH.csv +0 -68
- data/txt360_eval/CKPT Eval - MMLU.csv +68 -68
- data/txt360_eval/CKPT Eval - MedQA.csv +68 -68
- data/txt360_eval/CKPT Eval - NQ.csv +68 -68
- data/txt360_eval/CKPT Eval - PIQA.csv +68 -69
- data/txt360_eval/CKPT Eval - TriviaQA.csv +68 -68
- data/txt360_eval/CKPT Eval - WinoGrande.csv +68 -69
- main.py +13 -1
- results.py +9 -12
data/txt360_eval/CKPT Eval - BoolQ.csv
DELETED
@@ -1,68 +0,0 @@
|
|
1 |
-
0-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
-
hf-time: 4 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
-
5k,0.5761,0.5624,,0.6116,0.5514,0.5945,0.5446,0.5336,0.5902,0.5908,0.5394,0.5865,0.5284
|
4 |
-
10k,0.6242,0.5853,,0.6131,,0.5358,0.6122,0.6080,0.5471,0.5511,0.6138,0.5902,0.5780
|
5 |
-
15k,0.6480,0.6291,,0.6061,0.6217,0.5468,0.6205,0.6242,0.6248,0.5917,0.6211,0.5933,0.5713
|
6 |
-
20k,0.6541,0.6474,,0.5865,0.6187,0.6122,0.6199,0.6116,0.6119,0.5636,0.6239,0.5988,0.5850
|
7 |
-
25k,0.6670,0.6012,,0.6398,0.6251,0.6162,0.6349,0.6239,0.6291,0.5630,0.6336,0.6232,0.6312
|
8 |
-
30k,0.6777,0.6523,,0.6379,0.6083,0.6260,0.6437,0.6263,0.6107,0.5835,0.5865,0.6391,0.6425
|
9 |
-
35k,0.6495,0.6584,,0.6388,,0.6333,0.6346,0.6343,0.6144,0.4933,0.6043,0.6278,0.6480
|
10 |
-
40k,0.6771,0.6930,,0.6489,0.6410,0.6596,0.6330,0.6214,0.6520,0.5685,0.5768,0.6343,0.6505
|
11 |
-
45k,0.6624,0.6887,,0.6590,0.6422,0.6223,0.6401,0.6131,0.6153,0.5578,0.6058,0.6336,0.6529
|
12 |
-
50k,0.6761,0.6951,,0.6575,0.6566,0.6593,0.6557,0.6058,0.6541,0.5972,0.6018,0.6177,0.6563
|
13 |
-
55k,0.6847,0.6725,,0.6752,0.6321,0.6688,0.6523,0.6520,0.6679,0.5908,0.5343,0.6214,0.6618
|
14 |
-
60k,0.6920,0.6697,,0.6566,0.6226,0.6642,0.6401,0.6162,0.6361,0.5908,0.5972,0.6226,0.6645
|
15 |
-
65k,0.6979,0.6905,,0.6865,0.6352,0.6758,0.6688,0.6691,0.6942,0.6315,0.5682,0.6196,0.6352
|
16 |
-
70k,0.7104,0.6966,,0.6795,0.6456,0.6746,0.6651,0.6624,0.6575,0.5997,0.5324,0.6358,0.6526
|
17 |
-
75k,0.7269,0.6850,,0.6862,0.6514,,0.6621,0.6774,0.6817,0.6217,0.6009,0.6453,0.6535
|
18 |
-
80k,0.6997,0.6817,,0.6945,0.6327,0.6664,0.6667,0.6709,0.6703,0.6275,0.5896,0.6502,0.6612
|
19 |
-
85k,0.7346,0.6939,,0.6853,0.6746,0.6902,0.6602,0.6330,0.6737,0.6272,0.5239,0.6489,0.6703
|
20 |
-
90k,0.7254,0.6908,,0.6936,0.6612,0.6713,0.6755,0.6835,0.6315,0.6275,0.5428,0.6128,0.6807
|
21 |
-
95k,0.7165,0.7229,,0.7003,0.6587,,0.6823,0.6404,0.6670,0.6089,0.6138,0.6456,0.6612
|
22 |
-
100k,0.7153,0.7073,,0.6869,,0.6676,0.6746,0.6618,0.6587,0.6006,0.5584,0.6566,0.6810
|
23 |
-
105k,0.7333,0.7147,,0.6682,,0.6899,0.6609,0.6853,0.6853,0.6544,0.5740,0.6520,0.6755
|
24 |
-
110k,0.7376,0.7095,,0.6954,0.6664,0.6703,0.6810,0.6612,0.6798,0.6618,,0.6346,0.6434
|
25 |
-
115k,0.7168,0.7095,,0.7156,0.6645,0.6746,0.6997,0.6829,0.6813,0.6523,,0.6596,0.6920
|
26 |
-
120k,0.7370,0.7226,,0.7177,0.6648,0.6752,0.7015,,0.6841,0.6633,,0.6587,0.6890
|
27 |
-
125k,0.7361,0.7144,,0.7034,0.6636,0.6826,0.6869,0.6657,,0.6593,,0.6593,0.6795
|
28 |
-
130k,0.7284,0.7269,,0.6939,0.6786,0.6554,0.6988,0.6719,0.6777,0.6260,,,0.7018
|
29 |
-
135k,0.7483,0.7141,,0.7128,,0.6847,0.7028,0.6838,0.6933,0.6602,,,0.6966
|
30 |
-
140k,,0.7312,,0.7080,,0.6777,0.6997,0.6957,0.7040,0.6624,,,0.6884
|
31 |
-
145k,,,,0.7281,,0.6844,0.6908,0.6743,0.6914,0.6657,,,0.7061
|
32 |
-
150k,,,,0.7297,,0.6795,,0.6807,0.6991,0.6526,,,0.7024
|
33 |
-
155k,,,,0.7162,,0.7021,0.6976,0.6792,0.6927,0.6587,,,0.7028
|
34 |
-
160k,,,,0.6902,,0.6810,0.6985,0.6930,0.6893,0.6434,,,0.7098
|
35 |
-
165k,,,,0.7239,,0.6896,0.7037,,0.7021,0.6581,,,0.7080
|
36 |
-
170k,,,,0.7471,,0.6780,0.7141,,0.6911,0.6761,,,0.7058
|
37 |
-
175k,,,,0.7486,,0.6817,0.6942,,0.7095,0.6557,,,0.7021
|
38 |
-
180k,,,,0.6985,,0.6979,0.7162,,0.7067,0.6468,,,0.6523
|
39 |
-
185k,,,,0.7187,,0.6887,0.7031,,0.6917,0.6642,,,0.6914
|
40 |
-
190k,,,,0.7333,,0.6963,,,0.7113,0.6563,,,0.718
|
41 |
-
195k,,,,0.7269,,0.7021,,,0.7199,0.6817,,,0.7165
|
42 |
-
200k,,,,0.7135,,0.7080,,,0.707,0.6709,,,0.7015
|
43 |
-
205k,,,,0.7388,,0.7015,,,0.7168,0.6722,,,0.722
|
44 |
-
210k,,,,0.7489,,0.7089,,,,0.6765,,,0.6948
|
45 |
-
215k,,,,0.7538,,0.7183,,,0.7309,0.6869,,,0.6835
|
46 |
-
220k,,,,0.7474,,0.7171,,,0.7398,0.6893,,,
|
47 |
-
225k,,,,0.7251,,0.7131,,,0.7061,0.6801,,,
|
48 |
-
230k,,,,0.7083,,,,,0.7232,0.6765,,,
|
49 |
-
235k,,,,0.6930,,,,,0.6884,0.6434,,,
|
50 |
-
240k,,,,0.7541,,,,,,0.6875,,,
|
51 |
-
245k,,,,0.7541,,,,,,0.6713,,,
|
52 |
-
250k,,,,0.7498,,,,,,0.6798,,,
|
53 |
-
255k,,,,0.7749,,,,,,0.6578,,,
|
54 |
-
260k,,,,0.7615,,,,,,0.6954,,,
|
55 |
-
265k,,,,0.7486,,,,,,0.6807,,,
|
56 |
-
270k,,,,0.7226,,,,,,0.6869,,,
|
57 |
-
275k,,,,0.7269,,,,,,0.6841,,,
|
58 |
-
280k,,,,0.7517,,,,,,0.6804,,,
|
59 |
-
285k,,,,0.7150,,,,,,0.7006,,,
|
60 |
-
290k,,,,,,,,,,0.6826,,,
|
61 |
-
300k,,,,,,,,,,0.6706,,,
|
62 |
-
305k,,,,,,,,,,0.7006,,,
|
63 |
-
310k,,,,,,,,,,0.6777,,,
|
64 |
-
315k,,,,,,,,,,0.6859,,,
|
65 |
-
320k,,,,,,,,,,0.6939,,,
|
66 |
-
325k,,,,,,,,,,,,,
|
67 |
-
330k,,,,,,,,,,,,,
|
68 |
-
335k,,,,,,,,,,,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/txt360_eval/CKPT Eval - GSM8K.csv
DELETED
@@ -1,68 +0,0 @@
|
|
1 |
-
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
-
hf-time: 115 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
-
5k,0.0152,0.0099,,,0.0076,0.0015,0.0045,0.0030,,0.0152,0.0106,0.0197,0.0197
|
4 |
-
10k,0.0152,0.0190,,0.0015,,0.0091,0.0000,0.0212,0.0144,0.0159,0.0136,0.0174,0.0243
|
5 |
-
15k,0.0182,0.0167,,0.0053,0.0068,0.0045,0.0083,0.0212,0.0068,0.0174,0.0190,0.0174,0.0136
|
6 |
-
20k,0.0250,0.0212,,,,,0.0030,0.0159,0.0220,0.0167,0.0190,0.0220,0.0174
|
7 |
-
25k,0.0288,0.0114,,,,0.0129,0.0053,0.0258,0.0144,0.0152,0.0144,0.0144,0.0144
|
8 |
-
30k,0.0220,0.0265,,0.0197,0.0038,0.0152,0.0167,0.0227,0.0220,0.0205,0.0129,0.0167,0.0038
|
9 |
-
35k,0.0296,0.0212,,0.0136,0.0045,0.0190,0.0045,0.0227,0.0220,0.0174,0.0174,0.0243,0.0182
|
10 |
-
40k,0.0235,0.0288,,0.0068,0.0121,0.0220,0.0015,0.0243,0.0265,0.0152,0.0212,0.0190,0.0182
|
11 |
-
45k,0.0387,0.0250,,0.0258,0.0038,0.0273,0.0106,0.0296,0.0273,0.0182,0.0152,0.0174,0.0129
|
12 |
-
50k,0.0318,0.0303,,0.0015,0.0243,0.0227,0.0121,0.0190,0.0220,0.0197,0.0205,0.0182,0.0068
|
13 |
-
55k,0.0296,0.0311,,0.0023,0.0235,0.0235,0.0250,0.0326,0.0197,0.0182,0.0174,0.0250,0.0091
|
14 |
-
60k,0.0432,0.0326,,0.0167,0.0212,0.0212,0.0182,0.0349,0.0220,0.0182,0.0099,0.0190,0.0197
|
15 |
-
65k,0.0470,0.0379,,0.0015,0.0159,0.0281,0.0136,0.0296,0.0212,0.0212,0.0129,0.0205,0.0114
|
16 |
-
70k,0.0432,0.0417,,0.0136,0.0197,0.0174,0.0114,0.0341,0.0243,0.0205,0.0136,0.0250,0.0091
|
17 |
-
75k,0.0508,0.0470,,0.0174,0.0121,0.0250,0.0182,0.0356,0.0288,0.0281,0.0174,0.0190,0.0106
|
18 |
-
80k,0.0561,0.0417,,0.0068,0.0000,0.0190,0.0083,0.0318,0.0356,0.0273,0.0167,0.0265,0.0182
|
19 |
-
85k,0.0728,0.0341,,0.0341,0.0190,0.0296,0.0205,0.0265,0.0250,0.0220,0.0129,0.0235,0.0083
|
20 |
-
90k,0.0690,0.0425,,0.0197,0.0190,0.0281,0.0061,0.0417,0.0265,0.0273,0.0167,0.0190,0.0182
|
21 |
-
95k,0.0735,0.0447,,0.0167,0.0250,0.0281,0.0136,0.0349,0.0281,0.0174,0.0106,0.0288,0.0159
|
22 |
-
100k,0.0637,0.0470,,0.0159,,0.0227,0.0045,0.0409,0.0311,0.0265,0.0205,0.0190,0.0190
|
23 |
-
105k,0.0637,0.0447,,0.0341,,0.0303,0.0129,0.0371,0.0311,0.0273,0.0205,0.0311,0.0129
|
24 |
-
110k,0.0872,0.0576,,0.0038,0.0273,0.0129,0.0205,0.0478,0.0296,0.0212,,0.0281,0.0182
|
25 |
-
115k,0.0788,0.0576,,0.0091,0.0167,0.0311,0.0167,0.0508,0.0349,0.0220,,0.0220,0.0174
|
26 |
-
120k,0.0834,0.0455,,0.0227,0.0265,0.0167,0.0212,0.0371,0.0318,0.0167,,0.0220,0.0152
|
27 |
-
125k,0.1001,0.0493,,0.0288,0.0250,0.0205,0.0387,0.0402,0.0318,0.0182,,0.0235,0.0144
|
28 |
-
130k,0.0766,0.0470,,0.0068,0.0258,0.0288,0.0174,,0.0341,0.0243,,,0.0205
|
29 |
-
135k,0.0879,0.0607,,0.0190,,0.0349,0.0258,0.0409,0.0288,0.0212,,,0.0281
|
30 |
-
140k,,0.0569,,0.0379,,0.0356,0.0227,0.0440,0.0341,0.0144,,,0.0144
|
31 |
-
145k,,,,0.0341,,0.0379,0.0015,0.0387,,0.0174,,,0.0273
|
32 |
-
150k,,,,,,0.0281,,0.0470,0.0265,0.0220,,,0.0258
|
33 |
-
155k,,,,0.0318,,0.0303,0.0121,0.0561,0.0523,0.0227,,,0.0243
|
34 |
-
160k,,,,0.0356,,0.0243,0.0061,0.0425,0.0432,0.0220,,,0.0303
|
35 |
-
165k,,,,0.0167,,0.0409,0.0015,,0.0470,0.0281,,,
|
36 |
-
170k,,,,0.0334,,0.0281,0.0129,,0.0455,0.0273,,,0.0235
|
37 |
-
175k,,,,0.0371,,0.0326,0.0190,,0.0409,0.0190,,,0.0273
|
38 |
-
180k,,,,0.0425,,0.0364,0.0227,,0.0356,0.0243,,,0.0288
|
39 |
-
185k,,,,0.0341,,0.0318,0.0341,,0.0546,0.0235,,,0.0364
|
40 |
-
190k,,,,0.0296,,0.0364,,,0.0425,0.0220,,,0.0349
|
41 |
-
195k,,,,0.0250,,0.0303,,,0.0493,0.0258,,,
|
42 |
-
200k,,,,0.0250,,0.0371,,,0.0493,0.0273,,,0.0205
|
43 |
-
205k,,,,0.0455,,0.0409,,,0.0553,0.0220,,,0.0258
|
44 |
-
210k,,,,0.0462,,0.0371,,,0.0523,0.0281,,,
|
45 |
-
215k,,,,0.0349,,0.0265,,,0.0500,0.0235,,,0.0281
|
46 |
-
220k,,,,0.0432,,0.0167,,,0.0462,0.0326,,,
|
47 |
-
225k,,,,0.0447,,0.0212,,,,0.0265,,,
|
48 |
-
230k,,,,0.0440,,,,,0.0493,0.0273,,,
|
49 |
-
235k,,,,0.0402,,,,,0.0508,0.0220,,,
|
50 |
-
240k,,,,0.0341,,,,,,0.0281,,,
|
51 |
-
245k,,,,0.0462,,,,,,0.0356,,,
|
52 |
-
250k,,,,0.0500,,,,,,,,,
|
53 |
-
255k,,,,0.0569,,,,,,0.0303,,,
|
54 |
-
260k,,,,0.0500,,,,,,0.0334,,,
|
55 |
-
265k,,,,0.0455,,,,,,0.0318,,,
|
56 |
-
270k,,,,0.0538,,,,,,0.0273,,,
|
57 |
-
275k,,,,0.0470,,,,,,,,,
|
58 |
-
280k,,,,0.0553,,,,,,0.0364,,,
|
59 |
-
285k,,,,0.0531,,,,,,0.0349,,,
|
60 |
-
290k,,,,,,,,,,0.0311,,,
|
61 |
-
300k,,,,,,,,,,,,,
|
62 |
-
305k,,,,,,,,,,0.0311,,,
|
63 |
-
310k,,,,,,,,,,0.0273,,,
|
64 |
-
315k,,,,,,,,,,,,,
|
65 |
-
320k,,,,,,,,,,,,,
|
66 |
-
325k,,,,,,,,,,,,,
|
67 |
-
330k,,,,,,,,,,,,,
|
68 |
-
335k,,,,,,,,,,,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/txt360_eval/CKPT Eval - HellaSwag.csv
CHANGED
@@ -1,69 +1,68 @@
|
|
1 |
-
ga,
|
2 |
-
0-shot: 5 min,Llama-8x8B-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
1 |
+
ga,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
0-shot: 5 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.5622,0.5254,0.5324,0.5366
|
4 |
+
10k,0.6433,0.5836,0.6046,0.6139
|
5 |
+
15k,0.6716,0.6114,0.6336,0.6388
|
6 |
+
20k,0.6855,0.6271,0.6492,0.6548
|
7 |
+
25k,0.6945,0.6413,0.6665,0.6683
|
8 |
+
30k,0.7059,,0.6746,0.6741
|
9 |
+
35k,0.7158,0.6547,0.6832,0.6864
|
10 |
+
40k,0.7184,0.6642,0.6821,0.6917
|
11 |
+
45k,0.722,0.6698,0.6905,0.6933
|
12 |
+
50k,0.725,0.6689,0.6964,0.7018
|
13 |
+
55k,0.7305,0.6697,0.6959,0.7052
|
14 |
+
60k,0.7236,0.6748,0.6904,0.704
|
15 |
+
65k,0.7355,0.6752,0.7061,0.7074
|
16 |
+
70k,0.7399,0.6773,0.7054,0.7074
|
17 |
+
75k,0.7374,0.6854,0.7065,0.7027
|
18 |
+
80k,0.7422,0.6862,0.7118,0.7139
|
19 |
+
85k,0.7444,0.6887,0.7126,0.7178
|
20 |
+
90k,0.7443,0.6917,0.7148,0.7146
|
21 |
+
95k,0.7376,0.6901,0.7115,0.724
|
22 |
+
100k,0.7457,,0.7117,0.7241
|
23 |
+
105k,0.7476,,0.7132,0.7263
|
24 |
+
110k,0.7486,0.6942,0.7166,0.7284
|
25 |
+
115k,0.7522,0.6957,0.7179,0.7274
|
26 |
+
120k,0.752,0.7022,0.7224,0.7329
|
27 |
+
125k,0.7533,0.7029,0.7221,0.7285
|
28 |
+
130k,0.7573,0.7032,0.7261,0.7337
|
29 |
+
135k,0.758,,0.7198,0.7324
|
30 |
+
140k,0.7596,,0.7245,0.7338
|
31 |
+
145k,0.7573,,0.7247,0.7431
|
32 |
+
150k,0.7614,,,0.7386
|
33 |
+
155k,0.7579,,0.7294,0.7448
|
34 |
+
160k,0.7606,,0.7279,0.7385
|
35 |
+
165k,,,0.7297,0.7493
|
36 |
+
170k,0.7696,,0.7323,0.7499
|
37 |
+
175k,0.7745,,0.7338,0.7502
|
38 |
+
180k,0.7676,,0.7316,0.7457
|
39 |
+
185k,0.7678,,0.7354,0.7519
|
40 |
+
190k,0.7701,,,0.7493
|
41 |
+
195k,0.773,,,0.7579
|
42 |
+
200k,0.7753,,,0.7567
|
43 |
+
205k,0.7744,,,0.756
|
44 |
+
210k,0.7729,,,0.7658
|
45 |
+
215k,0.7804,,,0.7621
|
46 |
+
220k,0.7752,,,0.7678
|
47 |
+
225k,0.7808,,,0.7649
|
48 |
+
230k,0.7786,,,0.7662
|
49 |
+
235k,0.7844,,,0.7676
|
50 |
+
240k,0.7866,,,
|
51 |
+
245k,0.7857,,,
|
52 |
+
250k,0.7851,,,
|
53 |
+
255k,0.7845,,,
|
54 |
+
260k,0.7893,,,
|
55 |
+
265k,0.7918,,,
|
56 |
+
270k,0.7917,,,
|
57 |
+
275k,0.7925,,,
|
58 |
+
280k,0.7943,,,
|
59 |
+
285k,0.7946,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
|
data/txt360_eval/CKPT Eval - MATH.csv
DELETED
@@ -1,68 +0,0 @@
|
|
1 |
-
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
-
time: 5 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
-
5k,0.2335,0.2308,,0.2251,,0.2157,0.2221,0.2231,0.2211,0.2251,0.2191,0.2271,0.2238
|
4 |
-
10k,0.2489,0.2519,,0.2379,0.2211,0.2332,0.2415,0.2342,0.2399,0.2285,0.2342,0.2402,0.2224
|
5 |
-
15k,0.2626,0.2469,,0.2526,,0.2389,0.2322,0.2479,0.2580,0.2375,0.2271,0.2355,0.2375
|
6 |
-
20k,0.2737,0.2606,,0.2469,0.2399,,0.2419,0.2526,0.2663,0.2469,0.2499,0.2439,0.2322
|
7 |
-
25k,0.2700,0.2653,,0.2523,0.2395,0.2600,0.2526,0.2616,0.2559,0.2369,0.2476,0.2462,0.2355
|
8 |
-
30k,0.2687,0.2556,,0.2402,,0.2452,0.2533,0.2606,0.2503,0.2456,0.2452,0.2446,0.2372
|
9 |
-
35k,0.2765,0.2533,,0.2683,0.2596,0.2590,0.2509,0.2630,0.2737,0.2392,0.2405,0.2536,0.2402
|
10 |
-
40k,0.2667,0.2683,,0.2496,0.2496,0.2593,0.2529,0.2697,0.2663,0.2379,0.2486,0.2526,0.2422
|
11 |
-
45k,0.2750,0.2620,,0.2616,0.2586,0.2563,0.2503,0.2683,0.2673,0.2479,0.2496,0.2513,0.2472
|
12 |
-
50k,0.2861,0.2697,,0.2693,0.2553,0.2596,0.2553,0.2700,0.2771,0.2442,0.2425,0.2546,0.2395
|
13 |
-
55k,0.2848,0.2693,,0.2640,0.2630,0.2566,0.2479,0.2630,0.2757,0.2526,0.2506,0.2586,0.2509
|
14 |
-
60k,0.2945,0.2784,,0.2727,0.2596,0.2633,0.2590,0.2690,0.2714,0.2519,0.2563,0.2553,0.2479
|
15 |
-
65k,0.3008,0.2767,,0.2680,0.2623,0.2704,0.2610,0.2492,0.2727,0.2529,0.2559,0.2647,0.2462
|
16 |
-
70k,0.2891,0.2824,,0.2730,0.2596,0.2710,0.2700,0.2677,0.2807,0.2469,0.2459,0.2626,0.2576
|
17 |
-
75k,0.2982,0.2938,,0.2784,0.2647,0.2630,0.2697,0.2777,0.2620,0.2626,0.2499,0.2583,0.2549
|
18 |
-
80k,0.2948,0.2801,,0.2737,0.2727,0.2643,0.2553,0.2657,0.2704,0.2509,0.2590,0.2549,0.2563
|
19 |
-
85k,0.2992,0.2938,,0.2754,0.2620,0.2704,0.2677,0.2600,0.2771,0.2496,0.2385,0.2620,0.2529
|
20 |
-
90k,0.3002,0.2888,,0.2764,0.2714,0.2737,0.2573,0.2693,0.2918,0.2616,0.2492,0.2566,0.2516
|
21 |
-
95k,0.3025,0.2817,,0.2616,0.2690,0.2737,0.2523,0.2690,0.2791,0.2492,0.2576,0.2576,0.2549
|
22 |
-
100k,0.2951,0.2894,,0.2616,,0.2817,0.2660,0.2757,0.2861,0.2546,0.2479,0.2667,0.2559
|
23 |
-
105k,0.3052,0.2928,,0.2653,,0.2710,0.2707,0.2771,0.2868,0.2529,0.2482,0.2640,0.2633
|
24 |
-
110k,0.3052,0.2985,,0.2600,0.2764,0.2781,0.2600,0.2764,0.2824,0.2536,,0.2727,0.2606
|
25 |
-
115k,0.3025,0.2985,,0.2690,0.2791,0.2720,0.2704,0.2744,0.2918,0.2623,,0.2807,0.2496
|
26 |
-
120k,0.3042,0.2985,,0.2750,0.2647,0.2650,0.2814,0.2754,0.2955,0.2677,,0.2626,0.2586
|
27 |
-
125k,0.3149,0.3018,,0.2683,0.2707,0.2647,0.2757,0.2760,0.2804,0.2509,,0.2704,0.2496
|
28 |
-
130k,0.3179,0.2978,,0.2781,0.2747,0.2653,0.2760,0.2774,0.2767,0.2593,,,0.2513
|
29 |
-
135k,0.3226,0.2945,,0.2747,,0.2717,0.2673,0.2784,0.2884,0.2606,,,0.2533
|
30 |
-
140k,,0.3018,,0.2771,,0.2757,0.2794,0.2787,0.2821,0.2459,,,0.2596
|
31 |
-
145k,,,,0.2724,,0.2650,0.2720,0.2888,0.2801,0.2543,,,0.2633
|
32 |
-
150k,,,,0.2720,,0.2814,,0.2864,0.2901,0.2590,,,0.2543
|
33 |
-
155k,,,,,,0.2784,0.2720,0.2874,0.2938,0.2580,,,0.2566
|
34 |
-
160k,,,,0.2817,,0.2834,0.2653,0.2807,0.2814,0.2563,,,0.2549
|
35 |
-
165k,,,,0.2834,,0.2821,0.2804,,0.2955,0.2559,,,0.2536
|
36 |
-
170k,,,,0.2854,,0.2824,0.2804,,0.3119,0.2536,,,0.2626
|
37 |
-
175k,,,,0.2804,,0.2915,0.2750,,0.2988,0.2489,,,0.2657
|
38 |
-
180k,,,,0.2767,,0.2901,0.2958,,0.3099,0.2623,,,0.2643
|
39 |
-
185k,,,,0.2767,,0.2948,0.2804,,0.3055,0.2570,,,0.2643
|
40 |
-
190k,,,,0.2787,,0.2925,,,0.3065,0.2573,,,0.2760
|
41 |
-
195k,,,,0.2858,,0.2898,,,0.3119,0.2640,,,0.2657
|
42 |
-
200k,,,,0.2771,,0.3028,,,0.3112,0.2610,,,0.2687
|
43 |
-
205k,,,,0.2851,,0.2921,,,0.3002,0.2680,,,0.2667
|
44 |
-
210k,,,,0.2838,,0.2817,,,0.3022,0.2650,,,0.2714
|
45 |
-
215k,,,,0.2838,,0.2851,,,0.3069,0.2653,,,0.2600
|
46 |
-
220k,,,,0.2938,,0.2814,,,0.3002,0.2549,,,
|
47 |
-
225k,,,,0.2935,,0.2898,,,0.3049,0.2633,,,
|
48 |
-
230k,,,,0.2888,,,,,0.3132,0.2653,,,
|
49 |
-
235k,,,,0.3055,,,,,0.2951,0.2717,,,
|
50 |
-
240k,,,,0.2995,,,,,,0.2667,,,
|
51 |
-
245k,,,,0.2928,,,,,,0.2610,,,
|
52 |
-
250k,,,,0.3092,,,,,,0.2650,,,
|
53 |
-
255k,,,,0.3152,,,,,,0.2643,,,
|
54 |
-
260k,,,,0.2951,,,,,,0.2616,,,
|
55 |
-
265k,,,,0.3045,,,,,,0.2610,,,
|
56 |
-
270k,,,,0.3018,,,,,,,,,
|
57 |
-
275k,,,,0.3065,,,,,,,,,
|
58 |
-
280k,,,,0.3015,,,,,,,,,
|
59 |
-
285k,,,,0.2965,,,,,,0.2586,,,
|
60 |
-
290k,,,,,,,,,,0.2623,,,
|
61 |
-
300k,,,,,,,,,,0.2603,,,
|
62 |
-
305k,,,,,,,,,,0.2630,,,
|
63 |
-
310k,,,,,,,,,,0.2710,,,
|
64 |
-
315k,,,,,,,,,,0.2677,,,
|
65 |
-
320k,,,,,,,,,,0.2650,,,
|
66 |
-
325k,,,,,,,,,,,,,
|
67 |
-
330k,,,,,,,,,,,,,
|
68 |
-
335k,,,,,,,,,,,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/txt360_eval/CKPT Eval - MMLU.csv
CHANGED
@@ -1,68 +1,68 @@
|
|
1 |
-
5-shot,
|
2 |
-
time: 20 min,Llama-8x8B-
|
3 |
-
5k
|
4 |
-
10k,0.
|
5 |
-
15k
|
6 |
-
20k,0.
|
7 |
-
25k
|
8 |
-
30k
|
9 |
-
35k,0.
|
10 |
-
40k,0.
|
11 |
-
45k,0.
|
12 |
-
50k,0.
|
13 |
-
55k,0.
|
14 |
-
60k,0.
|
15 |
-
65k,0.
|
16 |
-
70k,0.
|
17 |
-
75k,0.
|
18 |
-
80k,0.
|
19 |
-
85k,0.
|
20 |
-
90k,0.
|
21 |
-
95k,0.
|
22 |
-
100k,0.
|
23 |
-
105k,0.
|
24 |
-
110k,0.
|
25 |
-
115k,0.
|
26 |
-
120k,0.
|
27 |
-
125k,0.
|
28 |
-
130k,0.
|
29 |
-
135k,0.
|
30 |
-
140k
|
31 |
-
145k
|
32 |
-
150k
|
33 |
-
155k
|
34 |
-
160k
|
35 |
-
165k
|
36 |
-
170k
|
37 |
-
175k
|
38 |
-
180k
|
39 |
-
185k
|
40 |
-
190k
|
41 |
-
195k
|
42 |
-
200k
|
43 |
-
205k
|
44 |
-
210k
|
45 |
-
215k
|
46 |
-
220k
|
47 |
-
225k
|
48 |
-
230k
|
49 |
-
235k
|
50 |
-
240k
|
51 |
-
245k
|
52 |
-
250k
|
53 |
-
255k
|
54 |
-
260k
|
55 |
-
265k
|
56 |
-
270k
|
57 |
-
275k
|
58 |
-
280k
|
59 |
-
285k
|
60 |
-
290k
|
61 |
-
300k
|
62 |
-
305k
|
63 |
-
310k
|
64 |
-
315k
|
65 |
-
320k
|
66 |
-
325k
|
67 |
-
330k
|
68 |
-
335k
|
|
|
1 |
+
5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
time: 20 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,,0.2579,0.2482,0.2456
|
4 |
+
10k,0.2594,0.2612,0.2628,0.2525
|
5 |
+
15k,,,0.2334,0.2503
|
6 |
+
20k,0.2495,0.2467,0.2449,0.254
|
7 |
+
25k,,0.2431,0.2571,0.2534
|
8 |
+
30k,,,0.2678,0.2557
|
9 |
+
35k,0.2426,0.2591,0.2562,0.2494
|
10 |
+
40k,0.2467,0.2485,0.2408,0.2686
|
11 |
+
45k,0.2418,0.2296,0.2712,0.2503
|
12 |
+
50k,0.2382,0.2441,0.2558,0.2322
|
13 |
+
55k,0.2408,0.2536,0.244,0.2747
|
14 |
+
60k,0.2718,0.2539,0.2339,0.2432
|
15 |
+
65k,0.2637,0.2423,0.2342,0.2478
|
16 |
+
70k,0.2534,0.2359,0.2673,0.2478
|
17 |
+
75k,0.2529,0.2372,0.2579,0.2478
|
18 |
+
80k,0.2504,0.2344,0.2535,0.2718
|
19 |
+
85k,0.2547,0.2496,0.2418,0.2465
|
20 |
+
90k,0.2595,0.2464,0.2359,0.2475
|
21 |
+
95k,0.2621,0.2469,0.2534,0.2424
|
22 |
+
100k,0.255,,0.2461,0.2497
|
23 |
+
105k,0.2659,,0.2729,0.2468
|
24 |
+
110k,0.2551,0.2629,0.2604,0.2522
|
25 |
+
115k,0.2624,0.2324,0.259,0.2584
|
26 |
+
120k,0.2626,0.2663,0.2629,0.2748
|
27 |
+
125k,0.2712,0.2733,0.2768,0.257
|
28 |
+
130k,0.2404,0.2635,0.2676,0.2812
|
29 |
+
135k,0.2641,,0.2735,0.2882
|
30 |
+
140k,0.2553,,0.2765,0.3019
|
31 |
+
145k,0.2492,,0.2708,0.309
|
32 |
+
150k,0.2595,,,0.3199
|
33 |
+
155k,0.2681,,0.2463,0.3116
|
34 |
+
160k,0.2605,,0.2821,0.324
|
35 |
+
165k,0.2725,,0.2816,0.3478
|
36 |
+
170k,0.2514,,0.2893,0.3423
|
37 |
+
175k,0.2535,,0.3317,0.3156
|
38 |
+
180k,0.2561,,0.2624,0.2893
|
39 |
+
185k,0.2523,,0.3026,0.3876
|
40 |
+
190k,0.2653,,,0.3131
|
41 |
+
195k,0.2681,,,0.3473
|
42 |
+
200k,0.2515,,,0.3257
|
43 |
+
205k,0.2619,,,0.3836
|
44 |
+
210k,0.2687,,,0.3063
|
45 |
+
215k,0.2653,,,0.3947
|
46 |
+
220k,0.2631,,,0.3621
|
47 |
+
225k,0.2737,,,0.4151
|
48 |
+
230k,0.2833,,,0.3825
|
49 |
+
235k,0.2703,,,0.3897
|
50 |
+
240k,0.2572,,,
|
51 |
+
245k,0.27,,,
|
52 |
+
250k,0.2639,,,
|
53 |
+
255k,0.268,,,
|
54 |
+
260k,0.2897,,,
|
55 |
+
265k,0.2815,,,
|
56 |
+
270k,0.2693,,,
|
57 |
+
275k,0.2789,,,
|
58 |
+
280k,0.3052,,,
|
59 |
+
285k,0.285,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
data/txt360_eval/CKPT Eval - MedQA.csv
CHANGED
@@ -1,68 +1,68 @@
|
|
1 |
-
0-shot,
|
2 |
-
time: 3 min,Llama-8x8B-
|
3 |
-
5k,0.
|
4 |
-
10k,0.
|
5 |
-
15k,0.
|
6 |
-
20k,0.
|
7 |
-
25k,0.
|
8 |
-
30k,0.
|
9 |
-
35k,0.
|
10 |
-
40k,0.
|
11 |
-
45k,0.
|
12 |
-
50k,0.
|
13 |
-
55k,0.
|
14 |
-
60k,0.
|
15 |
-
65k,0.
|
16 |
-
70k,0.
|
17 |
-
75k,0.
|
18 |
-
80k,0.
|
19 |
-
85k,0.
|
20 |
-
90k,0.
|
21 |
-
95k,0.
|
22 |
-
100k,0.
|
23 |
-
105k,0.
|
24 |
-
110k,0.
|
25 |
-
115k,0.
|
26 |
-
120k,0.
|
27 |
-
125k,0.
|
28 |
-
130k,0.
|
29 |
-
135k
|
30 |
-
140k
|
31 |
-
145k
|
32 |
-
150k
|
33 |
-
155k
|
34 |
-
160k
|
35 |
-
165k
|
36 |
-
170k
|
37 |
-
175k
|
38 |
-
180k
|
39 |
-
185k
|
40 |
-
190k
|
41 |
-
195k
|
42 |
-
200k
|
43 |
-
205k
|
44 |
-
210k
|
45 |
-
215k
|
46 |
-
220k
|
47 |
-
225k
|
48 |
-
230k
|
49 |
-
235k
|
50 |
-
240k
|
51 |
-
245k
|
52 |
-
250k
|
53 |
-
255k
|
54 |
-
260k
|
55 |
-
265k
|
56 |
-
270k
|
57 |
-
275k
|
58 |
-
280k
|
59 |
-
285k
|
60 |
-
290k
|
61 |
-
300k
|
62 |
-
305k
|
63 |
-
310k
|
64 |
-
315k
|
65 |
-
320k
|
66 |
-
325k
|
67 |
-
330k
|
68 |
-
335k
|
|
|
1 |
+
0-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
time: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.2152,,0.2482,0.2687
|
4 |
+
10k,0.238,0.2372,0.2616,0.2718
|
5 |
+
15k,0.227,,0.2797,0.2639
|
6 |
+
20k,0.2419,,0.2317,0.2757
|
7 |
+
25k,0.2184,,0.2569,0.2474
|
8 |
+
30k,0.2679,0.2522,0.2097,0.2608
|
9 |
+
35k,0.2647,0.2655,0.2467,0.2694
|
10 |
+
40k,0.2671,0.2396,0.2569,0.2482
|
11 |
+
45k,0.2742,0.2734,0.2255,0.2333
|
12 |
+
50k,0.2749,0.2537,0.2372,0.2655
|
13 |
+
55k,0.2797,0.2561,0.2294,0.2537
|
14 |
+
60k,0.2294,0.2325,,0.2639
|
15 |
+
65k,0.2663,0.2757,0.2749,0.2726
|
16 |
+
70k,0.2592,0.2757,0.2632,0.2435
|
17 |
+
75k,0.249,0.2679,0.2616,0.2765
|
18 |
+
80k,0.2797,0.2419,0.2522,0.2789
|
19 |
+
85k,0.2655,0.2844,0.2687,0.2553
|
20 |
+
90k,0.231,0.2364,0.2624,0.2679
|
21 |
+
95k,0.2742,0.282,0.2647,0.2749
|
22 |
+
100k,0.2679,,0.2702,0.2663
|
23 |
+
105k,0.2655,,0.2632,0.2726
|
24 |
+
110k,0.2718,0.2474,0.2537,0.2537
|
25 |
+
115k,0.2655,0.2718,0.2247,0.2867
|
26 |
+
120k,0.293,0.2537,,0.2844
|
27 |
+
125k,0.2624,0.2364,0.2145,0.2883
|
28 |
+
130k,0.2828,0.2412,0.2891,0.2922
|
29 |
+
135k,,,0.2765,0.2702
|
30 |
+
140k,0.2529,,0.2545,0.293
|
31 |
+
145k,0.249,,0.2718,0.3024
|
32 |
+
150k,,,,0.3244
|
33 |
+
155k,0.2608,,0.2624,
|
34 |
+
160k,0.2529,,0.2726,0.2852
|
35 |
+
165k,0.2388,,0.2742,0.2561
|
36 |
+
170k,0.2435,,0.2506,0.3056
|
37 |
+
175k,0.2632,,0.2647,0.3126
|
38 |
+
180k,0.2608,,0.2899,0.3166
|
39 |
+
185k,0.271,,0.2561,0.3268
|
40 |
+
190k,0.2812,,,0.304
|
41 |
+
195k,0.2482,,,0.3472
|
42 |
+
200k,0.2639,,,0.3339
|
43 |
+
205k,0.2514,,,0.3409
|
44 |
+
210k,0.2742,,,0.3378
|
45 |
+
215k,0.2592,,,0.3362
|
46 |
+
220k,0.2262,,,0.3559
|
47 |
+
225k,0.249,,,0.3213
|
48 |
+
230k,0.2357,,,0.3472
|
49 |
+
235k,0.2514,,,0.3614
|
50 |
+
240k,0.2624,,,
|
51 |
+
245k,0.2482,,,
|
52 |
+
250k,0.2592,,,
|
53 |
+
255k,0.2537,,,
|
54 |
+
260k,0.2639,,,
|
55 |
+
265k,0.2844,,,
|
56 |
+
270k,0.2624,,,
|
57 |
+
275k,0.2757,,,
|
58 |
+
280k,0.2852,,,
|
59 |
+
285k,0.2726,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
data/txt360_eval/CKPT Eval - NQ.csv
CHANGED
@@ -1,68 +1,68 @@
|
|
1 |
-
5-shot,
|
2 |
-
time: 22 min,Llama-8x8B-
|
3 |
-
5k,0.
|
4 |
-
10k,0.
|
5 |
-
15k,0.
|
6 |
-
20k,0.
|
7 |
-
25k,0.
|
8 |
-
30k,0.
|
9 |
-
35k,0.
|
10 |
-
40k,0.
|
11 |
-
45k,0.
|
12 |
-
50k,0.
|
13 |
-
55k,0.
|
14 |
-
60k,0.
|
15 |
-
65k,0.
|
16 |
-
70k,0.
|
17 |
-
75k,0.
|
18 |
-
80k,0.
|
19 |
-
85k,0.
|
20 |
-
90k,0.
|
21 |
-
95k,0.
|
22 |
-
100k,0.
|
23 |
-
105k,0.
|
24 |
-
110k,0.
|
25 |
-
115k,0.
|
26 |
-
120k,0.
|
27 |
-
125k,0.
|
28 |
-
130k,0.
|
29 |
-
135k,0.
|
30 |
-
140k
|
31 |
-
145k
|
32 |
-
150k
|
33 |
-
155k
|
34 |
-
160k
|
35 |
-
165k
|
36 |
-
170k
|
37 |
-
175k
|
38 |
-
180k
|
39 |
-
185k
|
40 |
-
190k
|
41 |
-
195k
|
42 |
-
200k
|
43 |
-
205k
|
44 |
-
210k
|
45 |
-
215k
|
46 |
-
220k
|
47 |
-
225k
|
48 |
-
230k
|
49 |
-
235k
|
50 |
-
240k
|
51 |
-
245k
|
52 |
-
250k
|
53 |
-
255k
|
54 |
-
260k
|
55 |
-
265k
|
56 |
-
270k
|
57 |
-
275k
|
58 |
-
280k
|
59 |
-
285k
|
60 |
-
290k
|
61 |
-
300k
|
62 |
-
305k
|
63 |
-
310k
|
64 |
-
315k
|
65 |
-
320k
|
66 |
-
325k
|
67 |
-
330k
|
68 |
-
335k
|
|
|
1 |
+
5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
time: 22 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.0341,0.0416,0.0565,0.0526
|
4 |
+
10k,0.0715,,0.0931,0.0767
|
5 |
+
15k,0.0765,,0.1061,0.1127
|
6 |
+
20k,0.0787,,0.1183,0.1247
|
7 |
+
25k,0.0892,0.115,0.1352,0.1343
|
8 |
+
30k,0.0911,0.1366,0.1271,0.1421
|
9 |
+
35k,0.097,0.1488,0.1485,0.1524
|
10 |
+
40k,0.1028,0.1355,0.1488,0.1562
|
11 |
+
45k,0.1078,0.1488,0.162,0.1598
|
12 |
+
50k,0.105,0.154,0.159,0.1698
|
13 |
+
55k,0.1097,0.1607,0.1662,0.1704
|
14 |
+
60k,0.1211,0.1654,0.1612,0.1801
|
15 |
+
65k,0.1089,0.1573,0.1693,0.1823
|
16 |
+
70k,0.1222,0.1634,0.1679,0.1767
|
17 |
+
75k,0.1097,0.1709,0.1881,0.1762
|
18 |
+
80k,0.1277,0.1573,0.1776,0.1964
|
19 |
+
85k,0.128,0.1776,0.1889,0.1889
|
20 |
+
90k,0.1158,0.1598,0.1806,0.1773
|
21 |
+
95k,0.1235,0.1762,0.1781,0.1917
|
22 |
+
100k,0.1258,,0.1928,0.1947
|
23 |
+
105k,0.1366,,0.1814,0.2094
|
24 |
+
110k,0.1377,0.1756,0.1859,
|
25 |
+
115k,0.1346,0.1831,0.1947,0.2119
|
26 |
+
120k,0.1402,0.2014,,0.2119
|
27 |
+
125k,0.1307,0.203,0.1992,0.1787
|
28 |
+
130k,0.1368,0.1997,0.1994,0.2086
|
29 |
+
135k,0.1363,,0.2014,0.2069
|
30 |
+
140k,0.1435,,0.1986,0.2058
|
31 |
+
145k,0.1532,,0.1953,0.2102
|
32 |
+
150k,0.1404,,,0.2075
|
33 |
+
155k,0.1418,,0.1931,0.2205
|
34 |
+
160k,0.1346,,0.2116,0.2208
|
35 |
+
165k,0.1524,,0.2139,0.2213
|
36 |
+
170k,0.1388,,,0.2169
|
37 |
+
175k,0.1438,,0.2222,0.2321
|
38 |
+
180k,0.1471,,0.2249,0.236
|
39 |
+
185k,0.1499,,0.2222,0.2366
|
40 |
+
190k,0.1504,,,0.2274
|
41 |
+
195k,0.1554,,,0.2454
|
42 |
+
200k,0.1565,,,0.2346
|
43 |
+
205k,0.1726,,,0.2316
|
44 |
+
210k,0.1623,,,0.2493
|
45 |
+
215k,0.1576,,,0.2355
|
46 |
+
220k,0.1693,,,0.2427
|
47 |
+
225k,0.1596,,,0.244
|
48 |
+
230k,0.1693,,,0.2554
|
49 |
+
235k,0.172,,,0.2535
|
50 |
+
240k,0.1712,,,
|
51 |
+
245k,0.1704,,,
|
52 |
+
250k,0.1784,,,
|
53 |
+
255k,0.174,,,
|
54 |
+
260k,0.1756,,,
|
55 |
+
265k,0.1886,,,
|
56 |
+
270k,0.182,,,
|
57 |
+
275k,0.187,,,
|
58 |
+
280k,0.1704,,,
|
59 |
+
285k,0.1903,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
data/txt360_eval/CKPT Eval - PIQA.csv
CHANGED
@@ -1,69 +1,68 @@
|
|
1 |
-
,
|
2 |
-
0-shot: 3 min,Llama-8x8B-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
1 |
+
,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
0-shot: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.747,,0.7378,0.7318
|
4 |
+
10k,0.765,0.7573,0.7557,0.7612
|
5 |
+
15k,0.7775,0.7628,0.7769,0.7655
|
6 |
+
20k,0.7807,0.7671,0.7709,0.784
|
7 |
+
25k,0.7878,,0.7913,0.7791
|
8 |
+
30k,0.7862,0.778,0.7829,0.7889
|
9 |
+
35k,0.7933,0.7769,0.7824,0.7987
|
10 |
+
40k,0.7905,,0.7943,0.7878
|
11 |
+
45k,0.7982,0.7786,0.7829,0.7949
|
12 |
+
50k,0.7992,0.7775,0.7943,0.7933
|
13 |
+
55k,0.8079,0.7786,0.7884,0.7943
|
14 |
+
60k,0.7922,0.7818,0.7905,0.8003
|
15 |
+
65k,0.7976,0.79,0.7835,0.7943
|
16 |
+
70k,0.8052,0.7916,0.79,0.7976
|
17 |
+
75k,0.803,0.7878,0.8079,0.802
|
18 |
+
80k,0.7971,0.7829,0.7992,0.7933
|
19 |
+
85k,0.8003,0.8014,0.7949,0.7965
|
20 |
+
90k,0.7976,0.7873,0.7856,0.7998
|
21 |
+
95k,0.8041,0.7905,0.7954,0.8003
|
22 |
+
100k,0.8069,,0.7998,0.8009
|
23 |
+
105k,0.8074,,0.8063,0.796
|
24 |
+
110k,0.8085,0.7856,0.7938,0.7998
|
25 |
+
115k,0.8118,0.7911,0.8041,0.8052
|
26 |
+
120k,0.8074,0.7982,0.8025,0.7949
|
27 |
+
125k,0.8107,0.8009,0.8047,0.8003
|
28 |
+
130k,0.8079,0.7916,0.8014,0.7922
|
29 |
+
135k,0.8074,,0.8052,0.8014
|
30 |
+
140k,0.8123,,0.8063,0.7987
|
31 |
+
145k,0.8069,,0.8052,0.803
|
32 |
+
150k,0.8058,,,0.7987
|
33 |
+
155k,0.8096,,0.7954,0.8107
|
34 |
+
160k,0.8101,,0.802,0.8079
|
35 |
+
165k,0.8112,,0.8058,0.8101
|
36 |
+
170k,,,0.8041,0.8036
|
37 |
+
175k,0.8194,,0.7982,0.8118
|
38 |
+
180k,0.8118,,0.8025,0.8172
|
39 |
+
185k,0.8259,,0.8036,0.8096
|
40 |
+
190k,0.8139,,,0.8128
|
41 |
+
195k,0.8188,,,0.8161
|
42 |
+
200k,0.8112,,,0.8128
|
43 |
+
205k,0.8188,,,0.8177
|
44 |
+
210k,0.8188,,,0.8161
|
45 |
+
215k,0.8188,,,0.8085
|
46 |
+
220k,0.8199,,,0.8096
|
47 |
+
225k,0.8199,,,0.8134
|
48 |
+
230k,0.8172,,,0.8134
|
49 |
+
235k,0.8199,,,0.8205
|
50 |
+
240k,0.8166,,,
|
51 |
+
245k,0.8215,,,
|
52 |
+
250k,0.8172,,,
|
53 |
+
255k,0.8254,,,
|
54 |
+
260k,0.8215,,,
|
55 |
+
265k,0.821,,,
|
56 |
+
270k,0.8145,,,
|
57 |
+
275k,0.8161,,,
|
58 |
+
280k,0.8248,,,
|
59 |
+
285k,0.821,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
|
data/txt360_eval/CKPT Eval - TriviaQA.csv
CHANGED
@@ -1,68 +1,68 @@
|
|
1 |
-
5-shot,
|
2 |
-
time: 76 min,Llama-8x8B-
|
3 |
-
5k,0.
|
4 |
-
10k,0.
|
5 |
-
15k,0.
|
6 |
-
20k,0.
|
7 |
-
25k,0.
|
8 |
-
30k,0.
|
9 |
-
35k,0.
|
10 |
-
40k,0.
|
11 |
-
45k,0.
|
12 |
-
50k,0.
|
13 |
-
55k,0.
|
14 |
-
60k,0.
|
15 |
-
65k,0.
|
16 |
-
70k,0.
|
17 |
-
75k,0.
|
18 |
-
80k,0.
|
19 |
-
85k,0.
|
20 |
-
90k,0.
|
21 |
-
95k,0.
|
22 |
-
100k,0.
|
23 |
-
105k,0.
|
24 |
-
110k,0.
|
25 |
-
115k,0.
|
26 |
-
120k,0.
|
27 |
-
125k,0.
|
28 |
-
130k,0.
|
29 |
-
135k,0.
|
30 |
-
140k
|
31 |
-
145k
|
32 |
-
150k
|
33 |
-
155k
|
34 |
-
160k
|
35 |
-
165k
|
36 |
-
170k
|
37 |
-
175k
|
38 |
-
180k
|
39 |
-
185k
|
40 |
-
190k
|
41 |
-
195k
|
42 |
-
200k
|
43 |
-
205k
|
44 |
-
210k
|
45 |
-
215k
|
46 |
-
220k
|
47 |
-
225k
|
48 |
-
230k
|
49 |
-
235k
|
50 |
-
240k
|
51 |
-
245k
|
52 |
-
250k
|
53 |
-
255k
|
54 |
-
260k
|
55 |
-
265k
|
56 |
-
270k
|
57 |
-
275k
|
58 |
-
280k
|
59 |
-
285k
|
60 |
-
290k
|
61 |
-
300k
|
62 |
-
305k
|
63 |
-
310k
|
64 |
-
315k
|
65 |
-
320k
|
66 |
-
325k
|
67 |
-
330k
|
68 |
-
335k
|
|
|
1 |
+
5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
time: 76 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.1025,0.1232,0.126,
|
4 |
+
10k,0.2073,,0.115,0.2604
|
5 |
+
15k,0.3005,,0.1872,0.3244
|
6 |
+
20k,0.3506,0.2795,0.2719,0.3637
|
7 |
+
25k,0.307,,0.4093,0.412
|
8 |
+
30k,0.2461,0.2974,0.4195,0.4294
|
9 |
+
35k,0.3639,0.3572,0.3587,0.4428
|
10 |
+
40k,0.3537,0.0346,0.4434,0.4623
|
11 |
+
45k,0.3602,0.2674,0.4366,0.4792
|
12 |
+
50k,0.2407,0.3689,0.4051,0.4795
|
13 |
+
55k,0.2081,0.4101,0.323,0.494
|
14 |
+
60k,0.4068,0.4107,0.4469,0.513
|
15 |
+
65k,0.3145,0.4477,0.4907,0.5087
|
16 |
+
70k,0.4102,0.4736,0.492,0.5129
|
17 |
+
75k,0.282,0.4226,0.2245,0.5042
|
18 |
+
80k,0.0975,0.4217,,0.5301
|
19 |
+
85k,0.0722,0.4763,0.5029,0.535
|
20 |
+
90k,0.3388,0.1472,0.0317,0.522
|
21 |
+
95k,0.5283,0.4938,0.518,0.5446
|
22 |
+
100k,0.4317,0.11,0.5358,0.5514
|
23 |
+
105k,0.1886,,0.5153,0.5562
|
24 |
+
110k,0.351,,0.5182,0.5654
|
25 |
+
115k,0.3692,0.4759,0.5132,0.5577
|
26 |
+
120k,0.369,0.4352,0.5483,0.5658
|
27 |
+
125k,0.3365,0.5206,0.5211,0.5658
|
28 |
+
130k,0.355,0.0088,0.5245,0.5609
|
29 |
+
135k,0.3892,,0.3977,0.5774
|
30 |
+
140k,0.393,,0.4991,0.5675
|
31 |
+
145k,0.4538,,0.4872,0.5639
|
32 |
+
150k,0.2883,,,0.5844
|
33 |
+
155k,0.4185,,0.1586,0.5755
|
34 |
+
160k,0.272,,0.563,0.5864
|
35 |
+
165k,0.4252,,0.5642,0.5853
|
36 |
+
170k,0.1507,,0.5739,
|
37 |
+
175k,0.3242,,0.564,0.5979
|
38 |
+
180k,0.2653,,0.5912,0.6054
|
39 |
+
185k,0.2651,,0.5852,0.6064
|
40 |
+
190k,0.238,,,0.5996
|
41 |
+
195k,0.4048,,,0.6243
|
42 |
+
200k,0.5058,,,0.6248
|
43 |
+
205k,0.0945,,,0.6224
|
44 |
+
210k,0.1557,,,0.6311
|
45 |
+
215k,0.2483,,,0.6293
|
46 |
+
220k,0.1725,,,0.6375
|
47 |
+
225k,0.2467,,,0.634
|
48 |
+
230k,0.1653,,,0.6436
|
49 |
+
235k,0.1884,,,0.6411
|
50 |
+
240k,0.0719,,,
|
51 |
+
245k,0.3757,,,
|
52 |
+
250k,0.5859,,,
|
53 |
+
255k,0.4987,,,
|
54 |
+
260k,0.394,,,
|
55 |
+
265k,0.3607,,,
|
56 |
+
270k,0.3898,,,
|
57 |
+
275k,0.4123,,,
|
58 |
+
280k,0.2413,,,
|
59 |
+
285k,0.3665,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
data/txt360_eval/CKPT Eval - WinoGrande.csv
CHANGED
@@ -1,69 +1,68 @@
|
|
1 |
-
,
|
2 |
-
0-shot: 3 min,Llama-8x8B-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
|
|
1 |
+
,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
|
2 |
+
0-shot: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
|
3 |
+
5k,0.5691,0.5351,0.5367,0.5383
|
4 |
+
10k,0.5904,0.5604,0.5817,0.5667
|
5 |
+
15k,0.5927,0.5919,0.588,0.5896
|
6 |
+
20k,0.6448,0.6006,0.618,0.5935
|
7 |
+
25k,0.6196,0.6125,0.6062,0.6101
|
8 |
+
30k,0.6488,,0.614,0.6322
|
9 |
+
35k,0.644,0.603,0.6259,0.6212
|
10 |
+
40k,0.6496,0.6338,0.6267,0.6433
|
11 |
+
45k,0.6456,0.6172,0.6393,0.6393
|
12 |
+
50k,0.6464,0.6401,0.6164,0.6472
|
13 |
+
55k,0.6567,0.6235,0.6314,0.6464
|
14 |
+
60k,0.648,0.6251,0.6219,0.6369
|
15 |
+
65k,0.6654,0.6283,0.6401,0.6504
|
16 |
+
70k,0.6709,0.6322,0.6417,0.6559
|
17 |
+
75k,0.6709,0.648,0.6527,0.6527
|
18 |
+
80k,0.6843,0.6504,0.6369,0.6519
|
19 |
+
85k,0.6875,0.6409,0.6575,0.6393
|
20 |
+
90k,0.674,0.6369,0.6488,0.6527
|
21 |
+
95k,0.6835,0.6369,0.6654,0.6409
|
22 |
+
100k,0.6756,,0.659,0.6511
|
23 |
+
105k,0.6772,,0.6732,0.674
|
24 |
+
110k,0.6669,0.6559,0.6567,0.6551
|
25 |
+
115k,0.6732,0.6456,0.6661,0.6622
|
26 |
+
120k,0.6764,0.6519,0.659,0.6519
|
27 |
+
125k,0.6985,0.6393,0.6646,0.6803
|
28 |
+
130k,0.6811,0.6614,0.659,0.6559
|
29 |
+
135k,0.6827,,0.6551,0.6677
|
30 |
+
140k,0.6867,,0.6567,0.6638
|
31 |
+
145k,0.6819,,0.6669,0.6725
|
32 |
+
150k,0.6835,,,0.6788
|
33 |
+
155k,0.6748,,0.663,0.6922
|
34 |
+
160k,0.6875,,0.6748,0.6811
|
35 |
+
165k,0.6788,,0.6725,
|
36 |
+
170k,0.6938,,0.6725,0.6717
|
37 |
+
175k,0.6938,,0.6693,0.689
|
38 |
+
180k,0.6977,,0.674,0.6685
|
39 |
+
185k,0.6875,,0.6811,0.6851
|
40 |
+
190k,0.6914,,,0.6693
|
41 |
+
195k,0.6859,,,0.6756
|
42 |
+
200k,0.6875,,,0.7017
|
43 |
+
205k,0.7072,,,0.6827
|
44 |
+
210k,0.6859,,,0.6882
|
45 |
+
215k,0.7017,,,0.6922
|
46 |
+
220k,0.704,,,0.6969
|
47 |
+
225k,0.7111,,,0.6756
|
48 |
+
230k,0.7103,,,0.7096
|
49 |
+
235k,0.704,,,0.7096
|
50 |
+
240k,0.708,,,
|
51 |
+
245k,0.6985,,,
|
52 |
+
250k,0.7127,,,
|
53 |
+
255k,0.7119,,,
|
54 |
+
260k,0.7056,,,
|
55 |
+
265k,0.704,,,
|
56 |
+
270k,0.7111,,,
|
57 |
+
275k,0.7127,,,
|
58 |
+
280k,0.7064,,,
|
59 |
+
285k,0.7096,,,
|
60 |
+
290k,,,,
|
61 |
+
300k,,,,
|
62 |
+
305k,,,,
|
63 |
+
310k,,,,
|
64 |
+
315k,,,,
|
65 |
+
320k,,,,
|
66 |
+
325k,,,,
|
67 |
+
330k,,,,
|
68 |
+
335k,,,,
|
|
main.py
CHANGED
@@ -54,7 +54,7 @@ front_matter = {
|
|
54 |
"author": "Nikhil Ranjan",
|
55 |
"authorURL": "https://huggingface.co/nikhilranjan",
|
56 |
"affiliation": "MBZUAI",
|
57 |
-
"affiliationURL": "",
|
58 |
},
|
59 |
{
|
60 |
"author": "Omkar Pangarkar",
|
@@ -62,6 +62,12 @@ front_matter = {
|
|
62 |
"affiliation": "Petuum, Inc.",
|
63 |
"affiliationURL": "",
|
64 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
{
|
66 |
"author": "Zhen Wang",
|
67 |
"authorURL": "",
|
@@ -74,6 +80,12 @@ front_matter = {
|
|
74 |
"affiliation": "UCSD",
|
75 |
"affiliationURL": "",
|
76 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
{
|
78 |
"author": "Zhoujun Cheng",
|
79 |
"authorURL": "https://huggingface.co/zhoujun",
|
|
|
54 |
"author": "Nikhil Ranjan",
|
55 |
"authorURL": "https://huggingface.co/nikhilranjan",
|
56 |
"affiliation": "MBZUAI",
|
57 |
+
"affiliationURL": "LLM360.ai",
|
58 |
},
|
59 |
{
|
60 |
"author": "Omkar Pangarkar",
|
|
|
62 |
"affiliation": "Petuum, Inc.",
|
63 |
"affiliationURL": "",
|
64 |
},
|
65 |
+
{
|
66 |
+
"author": "Xuezhi Liang",
|
67 |
+
"authorURL": "",
|
68 |
+
"affiliation": "MBZUAI",
|
69 |
+
"affiliationURL": "",
|
70 |
+
},
|
71 |
{
|
72 |
"author": "Zhen Wang",
|
73 |
"authorURL": "",
|
|
|
80 |
"affiliation": "UCSD",
|
81 |
"affiliationURL": "",
|
82 |
},
|
83 |
+
{
|
84 |
+
"author": "Bhaskar Rao",
|
85 |
+
"authorURL": "",
|
86 |
+
"affiliation": "MBZUAI",
|
87 |
+
"affiliationURL": "",
|
88 |
+
},
|
89 |
{
|
90 |
"author": "Zhoujun Cheng",
|
91 |
"authorURL": "https://huggingface.co/zhoujun",
|
results.py
CHANGED
@@ -25,10 +25,10 @@ for fname in os.listdir("data/txt360_eval"):
|
|
25 |
df = pd.read_csv(os.path.join("data/txt360_eval", fname))
|
26 |
|
27 |
# slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
|
28 |
-
fineweb_res = df.iloc[2:,
|
29 |
-
txt360_base = df.iloc[2:,
|
30 |
-
txt360_web_up = df.iloc[2:,
|
31 |
-
txt360_all_up_stack = df.iloc[2:,
|
32 |
|
33 |
# each row is 20B tokens.
|
34 |
# all_eval_results[metric_name]["slimpajama"] = slimpajama_res
|
@@ -66,10 +66,6 @@ for metric_name, res in all_eval_results.items():
|
|
66 |
mode='lines', name='TxT360 - Full Upsampled + Stack V2'
|
67 |
))
|
68 |
|
69 |
-
print(all_eval_results[metric_name]["token"])
|
70 |
-
print(all_eval_results[metric_name]["fineweb"].tolist())
|
71 |
-
print(all_eval_results[metric_name]["txt360-web-only-upsampled"].tolist())
|
72 |
-
|
73 |
# Update layout
|
74 |
fig_res.update_layout(
|
75 |
title=f"{metric_name} Performance",
|
@@ -825,7 +821,7 @@ table_div_1 = Div(NotStr(table_html),
|
|
825 |
intro_div = Div(
|
826 |
H2("TxT360 Studies"),
|
827 |
H3("What This Section Contains"),
|
828 |
-
P("This section shows the learning curve when pre-training on TxT360, with a proper upsampling approach. We compare several simple strategies and demonstrate that one particular upsampling method, inspired by the natural data distribution, performs exceptionally well. In our preliminary experiments, the model learns significantly faster on TxT360 compared to a similarly scaled dataset, FineWeb. We believe that a more carefully designed upsampling strategy could further enhance the use of our data."),
|
829 |
P("In addition to the training results, we also provide an analysis of the dataset, including perplexity trends over time across the CommonCrawl snapshots. This section is organized into the following topic areas:"),
|
830 |
Ul(
|
831 |
Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
|
@@ -865,17 +861,18 @@ upsampling_exp = Div(
|
|
865 |
"Evaluation results are the most direct indicator of model quality. We assess the intermediate results of the models across multiple metrics and plot the learning curves. Our findings indicate that the model learns significantly faster with TxT360. For a fair comparison, we evaluate TxT360 against FineWeb using only the CommonCrawl data sources, and we also show the curves after incorporating the 14 curated sources and coding data (Stack V2), demonstrating the full potential of the dataset. Due to computation resource constraints, we stop running experiments when we can observe clear trends."
|
866 |
),
|
867 |
P(
|
868 |
-
"Based on the metrics, we find that TxT360’s CommonCrawl portion
|
869 |
),
|
870 |
plotly2fasthtml(all_eval_res_figs["MMLU"]),
|
871 |
plotly2fasthtml(all_eval_res_figs["NQ"]),
|
872 |
-
# plotly2fasthtml(all_eval_res_figs["GSM8K"]),
|
873 |
plotly2fasthtml(all_eval_res_figs["HellaSwag"]),
|
|
|
|
|
|
|
874 |
plotly2fasthtml(all_eval_res_figs["MedQA"]),
|
875 |
plotly2fasthtml(all_eval_res_figs["PIQA"]),
|
876 |
plotly2fasthtml(all_eval_res_figs["TriviaQA"]),
|
877 |
plotly2fasthtml(all_eval_res_figs["WinoGrande"]),
|
878 |
-
|
879 |
H3("Comparing the Loss Curves"),
|
880 |
P(
|
881 |
"We also plot the training and validation loss curves for each dataset, showing that TxT360 achieves both lower training and validation losses compared to FineWeb. Although training loss may not correlate directly with final model performance, we observe that the loss curve for TxT360 exhibits fewer spikes compared to FineWeb, indicating more stable training dynamics."
|
|
|
25 |
df = pd.read_csv(os.path.join("data/txt360_eval", fname))
|
26 |
|
27 |
# slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
|
28 |
+
fineweb_res = df.iloc[2:, 1].astype(float).fillna(method="bfill") # fineweb
|
29 |
+
txt360_base = df.iloc[2:, 2].astype(float).fillna(method="bfill") # txt360-dedup-only
|
30 |
+
txt360_web_up = df.iloc[2:, 3].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
|
31 |
+
txt360_all_up_stack = df.iloc[2:, 4].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
|
32 |
|
33 |
# each row is 20B tokens.
|
34 |
# all_eval_results[metric_name]["slimpajama"] = slimpajama_res
|
|
|
66 |
mode='lines', name='TxT360 - Full Upsampled + Stack V2'
|
67 |
))
|
68 |
|
|
|
|
|
|
|
|
|
69 |
# Update layout
|
70 |
fig_res.update_layout(
|
71 |
title=f"{metric_name} Performance",
|
|
|
821 |
intro_div = Div(
|
822 |
H2("TxT360 Studies"),
|
823 |
H3("What This Section Contains"),
|
824 |
+
P("This section shows the learning curve when pre-training on TxT360, with a proper upsampling approach. We compare several simple strategies and demonstrate that one particular upsampling method, inspired by the natural data distribution, performs exceptionally well. In our preliminary experiments, the model learns significantly faster on TxT360 compared to a similarly scaled dataset, FineWeb, on several important evaluation metrics. We believe that a more carefully designed upsampling strategy could further enhance the use of our data."),
|
825 |
P("In addition to the training results, we also provide an analysis of the dataset, including perplexity trends over time across the CommonCrawl snapshots. This section is organized into the following topic areas:"),
|
826 |
Ul(
|
827 |
Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
|
|
|
861 |
"Evaluation results are the most direct indicator of model quality. We assess the intermediate results of the models across multiple metrics and plot the learning curves. Our findings indicate that the model learns significantly faster with TxT360. For a fair comparison, we evaluate TxT360 against FineWeb using only the CommonCrawl data sources, and we also show the curves after incorporating the 14 curated sources and coding data (Stack V2), demonstrating the full potential of the dataset. Due to computation resource constraints, we stop running experiments when we can observe clear trends."
|
862 |
),
|
863 |
P(
|
864 |
+
"Based on the metrics, we find that TxT360’s CommonCrawl portion with the umsampling strategy outperforms FineWeb on key metrics at MMLU, NQ, falls slightly behind on HellaSwag. Furhter, we show that by combining TxT360 with coding data (Stack V2), the learning curve is significantly more stable and we observe improved results across most all of the metrics. Apparently the dataset preference here may depend on the set of metrics one would use."
|
865 |
),
|
866 |
plotly2fasthtml(all_eval_res_figs["MMLU"]),
|
867 |
plotly2fasthtml(all_eval_res_figs["NQ"]),
|
|
|
868 |
plotly2fasthtml(all_eval_res_figs["HellaSwag"]),
|
869 |
+
P(
|
870 |
+
"Similar to the findings in DCLM, adding the curated non-CommonCrawl data sources produces mixed results (some preliminary figures are not shown here). Yet such data can help with domain specific tasks like MedQA."
|
871 |
+
),
|
872 |
plotly2fasthtml(all_eval_res_figs["MedQA"]),
|
873 |
plotly2fasthtml(all_eval_res_figs["PIQA"]),
|
874 |
plotly2fasthtml(all_eval_res_figs["TriviaQA"]),
|
875 |
plotly2fasthtml(all_eval_res_figs["WinoGrande"]),
|
|
|
876 |
H3("Comparing the Loss Curves"),
|
877 |
P(
|
878 |
"We also plot the training and validation loss curves for each dataset, showing that TxT360 achieves both lower training and validation losses compared to FineWeb. Although training loss may not correlate directly with final model performance, we observe that the loss curve for TxT360 exhibits fewer spikes compared to FineWeb, indicating more stable training dynamics."
|