hunterhector commited on
Commit
e74bc72
1 Parent(s): 2c39f2b

fix data columns

Browse files
data/txt360_eval/CKPT Eval - BoolQ.csv DELETED
@@ -1,68 +0,0 @@
1
- 0-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
2
- hf-time: 4 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
3
- 5k,0.5761,0.5624,,0.6116,0.5514,0.5945,0.5446,0.5336,0.5902,0.5908,0.5394,0.5865,0.5284
4
- 10k,0.6242,0.5853,,0.6131,,0.5358,0.6122,0.6080,0.5471,0.5511,0.6138,0.5902,0.5780
5
- 15k,0.6480,0.6291,,0.6061,0.6217,0.5468,0.6205,0.6242,0.6248,0.5917,0.6211,0.5933,0.5713
6
- 20k,0.6541,0.6474,,0.5865,0.6187,0.6122,0.6199,0.6116,0.6119,0.5636,0.6239,0.5988,0.5850
7
- 25k,0.6670,0.6012,,0.6398,0.6251,0.6162,0.6349,0.6239,0.6291,0.5630,0.6336,0.6232,0.6312
8
- 30k,0.6777,0.6523,,0.6379,0.6083,0.6260,0.6437,0.6263,0.6107,0.5835,0.5865,0.6391,0.6425
9
- 35k,0.6495,0.6584,,0.6388,,0.6333,0.6346,0.6343,0.6144,0.4933,0.6043,0.6278,0.6480
10
- 40k,0.6771,0.6930,,0.6489,0.6410,0.6596,0.6330,0.6214,0.6520,0.5685,0.5768,0.6343,0.6505
11
- 45k,0.6624,0.6887,,0.6590,0.6422,0.6223,0.6401,0.6131,0.6153,0.5578,0.6058,0.6336,0.6529
12
- 50k,0.6761,0.6951,,0.6575,0.6566,0.6593,0.6557,0.6058,0.6541,0.5972,0.6018,0.6177,0.6563
13
- 55k,0.6847,0.6725,,0.6752,0.6321,0.6688,0.6523,0.6520,0.6679,0.5908,0.5343,0.6214,0.6618
14
- 60k,0.6920,0.6697,,0.6566,0.6226,0.6642,0.6401,0.6162,0.6361,0.5908,0.5972,0.6226,0.6645
15
- 65k,0.6979,0.6905,,0.6865,0.6352,0.6758,0.6688,0.6691,0.6942,0.6315,0.5682,0.6196,0.6352
16
- 70k,0.7104,0.6966,,0.6795,0.6456,0.6746,0.6651,0.6624,0.6575,0.5997,0.5324,0.6358,0.6526
17
- 75k,0.7269,0.6850,,0.6862,0.6514,,0.6621,0.6774,0.6817,0.6217,0.6009,0.6453,0.6535
18
- 80k,0.6997,0.6817,,0.6945,0.6327,0.6664,0.6667,0.6709,0.6703,0.6275,0.5896,0.6502,0.6612
19
- 85k,0.7346,0.6939,,0.6853,0.6746,0.6902,0.6602,0.6330,0.6737,0.6272,0.5239,0.6489,0.6703
20
- 90k,0.7254,0.6908,,0.6936,0.6612,0.6713,0.6755,0.6835,0.6315,0.6275,0.5428,0.6128,0.6807
21
- 95k,0.7165,0.7229,,0.7003,0.6587,,0.6823,0.6404,0.6670,0.6089,0.6138,0.6456,0.6612
22
- 100k,0.7153,0.7073,,0.6869,,0.6676,0.6746,0.6618,0.6587,0.6006,0.5584,0.6566,0.6810
23
- 105k,0.7333,0.7147,,0.6682,,0.6899,0.6609,0.6853,0.6853,0.6544,0.5740,0.6520,0.6755
24
- 110k,0.7376,0.7095,,0.6954,0.6664,0.6703,0.6810,0.6612,0.6798,0.6618,,0.6346,0.6434
25
- 115k,0.7168,0.7095,,0.7156,0.6645,0.6746,0.6997,0.6829,0.6813,0.6523,,0.6596,0.6920
26
- 120k,0.7370,0.7226,,0.7177,0.6648,0.6752,0.7015,,0.6841,0.6633,,0.6587,0.6890
27
- 125k,0.7361,0.7144,,0.7034,0.6636,0.6826,0.6869,0.6657,,0.6593,,0.6593,0.6795
28
- 130k,0.7284,0.7269,,0.6939,0.6786,0.6554,0.6988,0.6719,0.6777,0.6260,,,0.7018
29
- 135k,0.7483,0.7141,,0.7128,,0.6847,0.7028,0.6838,0.6933,0.6602,,,0.6966
30
- 140k,,0.7312,,0.7080,,0.6777,0.6997,0.6957,0.7040,0.6624,,,0.6884
31
- 145k,,,,0.7281,,0.6844,0.6908,0.6743,0.6914,0.6657,,,0.7061
32
- 150k,,,,0.7297,,0.6795,,0.6807,0.6991,0.6526,,,0.7024
33
- 155k,,,,0.7162,,0.7021,0.6976,0.6792,0.6927,0.6587,,,0.7028
34
- 160k,,,,0.6902,,0.6810,0.6985,0.6930,0.6893,0.6434,,,0.7098
35
- 165k,,,,0.7239,,0.6896,0.7037,,0.7021,0.6581,,,0.7080
36
- 170k,,,,0.7471,,0.6780,0.7141,,0.6911,0.6761,,,0.7058
37
- 175k,,,,0.7486,,0.6817,0.6942,,0.7095,0.6557,,,0.7021
38
- 180k,,,,0.6985,,0.6979,0.7162,,0.7067,0.6468,,,0.6523
39
- 185k,,,,0.7187,,0.6887,0.7031,,0.6917,0.6642,,,0.6914
40
- 190k,,,,0.7333,,0.6963,,,0.7113,0.6563,,,0.718
41
- 195k,,,,0.7269,,0.7021,,,0.7199,0.6817,,,0.7165
42
- 200k,,,,0.7135,,0.7080,,,0.707,0.6709,,,0.7015
43
- 205k,,,,0.7388,,0.7015,,,0.7168,0.6722,,,0.722
44
- 210k,,,,0.7489,,0.7089,,,,0.6765,,,0.6948
45
- 215k,,,,0.7538,,0.7183,,,0.7309,0.6869,,,0.6835
46
- 220k,,,,0.7474,,0.7171,,,0.7398,0.6893,,,
47
- 225k,,,,0.7251,,0.7131,,,0.7061,0.6801,,,
48
- 230k,,,,0.7083,,,,,0.7232,0.6765,,,
49
- 235k,,,,0.6930,,,,,0.6884,0.6434,,,
50
- 240k,,,,0.7541,,,,,,0.6875,,,
51
- 245k,,,,0.7541,,,,,,0.6713,,,
52
- 250k,,,,0.7498,,,,,,0.6798,,,
53
- 255k,,,,0.7749,,,,,,0.6578,,,
54
- 260k,,,,0.7615,,,,,,0.6954,,,
55
- 265k,,,,0.7486,,,,,,0.6807,,,
56
- 270k,,,,0.7226,,,,,,0.6869,,,
57
- 275k,,,,0.7269,,,,,,0.6841,,,
58
- 280k,,,,0.7517,,,,,,0.6804,,,
59
- 285k,,,,0.7150,,,,,,0.7006,,,
60
- 290k,,,,,,,,,,0.6826,,,
61
- 300k,,,,,,,,,,0.6706,,,
62
- 305k,,,,,,,,,,0.7006,,,
63
- 310k,,,,,,,,,,0.6777,,,
64
- 315k,,,,,,,,,,0.6859,,,
65
- 320k,,,,,,,,,,0.6939,,,
66
- 325k,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/txt360_eval/CKPT Eval - GSM8K.csv DELETED
@@ -1,68 +0,0 @@
1
- 5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
2
- hf-time: 115 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
3
- 5k,0.0152,0.0099,,,0.0076,0.0015,0.0045,0.0030,,0.0152,0.0106,0.0197,0.0197
4
- 10k,0.0152,0.0190,,0.0015,,0.0091,0.0000,0.0212,0.0144,0.0159,0.0136,0.0174,0.0243
5
- 15k,0.0182,0.0167,,0.0053,0.0068,0.0045,0.0083,0.0212,0.0068,0.0174,0.0190,0.0174,0.0136
6
- 20k,0.0250,0.0212,,,,,0.0030,0.0159,0.0220,0.0167,0.0190,0.0220,0.0174
7
- 25k,0.0288,0.0114,,,,0.0129,0.0053,0.0258,0.0144,0.0152,0.0144,0.0144,0.0144
8
- 30k,0.0220,0.0265,,0.0197,0.0038,0.0152,0.0167,0.0227,0.0220,0.0205,0.0129,0.0167,0.0038
9
- 35k,0.0296,0.0212,,0.0136,0.0045,0.0190,0.0045,0.0227,0.0220,0.0174,0.0174,0.0243,0.0182
10
- 40k,0.0235,0.0288,,0.0068,0.0121,0.0220,0.0015,0.0243,0.0265,0.0152,0.0212,0.0190,0.0182
11
- 45k,0.0387,0.0250,,0.0258,0.0038,0.0273,0.0106,0.0296,0.0273,0.0182,0.0152,0.0174,0.0129
12
- 50k,0.0318,0.0303,,0.0015,0.0243,0.0227,0.0121,0.0190,0.0220,0.0197,0.0205,0.0182,0.0068
13
- 55k,0.0296,0.0311,,0.0023,0.0235,0.0235,0.0250,0.0326,0.0197,0.0182,0.0174,0.0250,0.0091
14
- 60k,0.0432,0.0326,,0.0167,0.0212,0.0212,0.0182,0.0349,0.0220,0.0182,0.0099,0.0190,0.0197
15
- 65k,0.0470,0.0379,,0.0015,0.0159,0.0281,0.0136,0.0296,0.0212,0.0212,0.0129,0.0205,0.0114
16
- 70k,0.0432,0.0417,,0.0136,0.0197,0.0174,0.0114,0.0341,0.0243,0.0205,0.0136,0.0250,0.0091
17
- 75k,0.0508,0.0470,,0.0174,0.0121,0.0250,0.0182,0.0356,0.0288,0.0281,0.0174,0.0190,0.0106
18
- 80k,0.0561,0.0417,,0.0068,0.0000,0.0190,0.0083,0.0318,0.0356,0.0273,0.0167,0.0265,0.0182
19
- 85k,0.0728,0.0341,,0.0341,0.0190,0.0296,0.0205,0.0265,0.0250,0.0220,0.0129,0.0235,0.0083
20
- 90k,0.0690,0.0425,,0.0197,0.0190,0.0281,0.0061,0.0417,0.0265,0.0273,0.0167,0.0190,0.0182
21
- 95k,0.0735,0.0447,,0.0167,0.0250,0.0281,0.0136,0.0349,0.0281,0.0174,0.0106,0.0288,0.0159
22
- 100k,0.0637,0.0470,,0.0159,,0.0227,0.0045,0.0409,0.0311,0.0265,0.0205,0.0190,0.0190
23
- 105k,0.0637,0.0447,,0.0341,,0.0303,0.0129,0.0371,0.0311,0.0273,0.0205,0.0311,0.0129
24
- 110k,0.0872,0.0576,,0.0038,0.0273,0.0129,0.0205,0.0478,0.0296,0.0212,,0.0281,0.0182
25
- 115k,0.0788,0.0576,,0.0091,0.0167,0.0311,0.0167,0.0508,0.0349,0.0220,,0.0220,0.0174
26
- 120k,0.0834,0.0455,,0.0227,0.0265,0.0167,0.0212,0.0371,0.0318,0.0167,,0.0220,0.0152
27
- 125k,0.1001,0.0493,,0.0288,0.0250,0.0205,0.0387,0.0402,0.0318,0.0182,,0.0235,0.0144
28
- 130k,0.0766,0.0470,,0.0068,0.0258,0.0288,0.0174,,0.0341,0.0243,,,0.0205
29
- 135k,0.0879,0.0607,,0.0190,,0.0349,0.0258,0.0409,0.0288,0.0212,,,0.0281
30
- 140k,,0.0569,,0.0379,,0.0356,0.0227,0.0440,0.0341,0.0144,,,0.0144
31
- 145k,,,,0.0341,,0.0379,0.0015,0.0387,,0.0174,,,0.0273
32
- 150k,,,,,,0.0281,,0.0470,0.0265,0.0220,,,0.0258
33
- 155k,,,,0.0318,,0.0303,0.0121,0.0561,0.0523,0.0227,,,0.0243
34
- 160k,,,,0.0356,,0.0243,0.0061,0.0425,0.0432,0.0220,,,0.0303
35
- 165k,,,,0.0167,,0.0409,0.0015,,0.0470,0.0281,,,
36
- 170k,,,,0.0334,,0.0281,0.0129,,0.0455,0.0273,,,0.0235
37
- 175k,,,,0.0371,,0.0326,0.0190,,0.0409,0.0190,,,0.0273
38
- 180k,,,,0.0425,,0.0364,0.0227,,0.0356,0.0243,,,0.0288
39
- 185k,,,,0.0341,,0.0318,0.0341,,0.0546,0.0235,,,0.0364
40
- 190k,,,,0.0296,,0.0364,,,0.0425,0.0220,,,0.0349
41
- 195k,,,,0.0250,,0.0303,,,0.0493,0.0258,,,
42
- 200k,,,,0.0250,,0.0371,,,0.0493,0.0273,,,0.0205
43
- 205k,,,,0.0455,,0.0409,,,0.0553,0.0220,,,0.0258
44
- 210k,,,,0.0462,,0.0371,,,0.0523,0.0281,,,
45
- 215k,,,,0.0349,,0.0265,,,0.0500,0.0235,,,0.0281
46
- 220k,,,,0.0432,,0.0167,,,0.0462,0.0326,,,
47
- 225k,,,,0.0447,,0.0212,,,,0.0265,,,
48
- 230k,,,,0.0440,,,,,0.0493,0.0273,,,
49
- 235k,,,,0.0402,,,,,0.0508,0.0220,,,
50
- 240k,,,,0.0341,,,,,,0.0281,,,
51
- 245k,,,,0.0462,,,,,,0.0356,,,
52
- 250k,,,,0.0500,,,,,,,,,
53
- 255k,,,,0.0569,,,,,,0.0303,,,
54
- 260k,,,,0.0500,,,,,,0.0334,,,
55
- 265k,,,,0.0455,,,,,,0.0318,,,
56
- 270k,,,,0.0538,,,,,,0.0273,,,
57
- 275k,,,,0.0470,,,,,,,,,
58
- 280k,,,,0.0553,,,,,,0.0364,,,
59
- 285k,,,,0.0531,,,,,,0.0349,,,
60
- 290k,,,,,,,,,,0.0311,,,
61
- 300k,,,,,,,,,,,,,
62
- 305k,,,,,,,,,,0.0311,,,
63
- 310k,,,,,,,,,,0.0273,,,
64
- 315k,,,,,,,,,,,,,
65
- 320k,,,,,,,,,,,,,
66
- 325k,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/txt360_eval/CKPT Eval - HellaSwag.csv CHANGED
@@ -1,69 +1,68 @@
1
- ga,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
2
- 0-shot: 5 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
3
- 10-shot: 36 min,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot
4
- 5k,0.5315,0.5301,0.5165,0.5693,,,0.5622,0.5376,0.5254,0.5119,0.5356,0.5291,0.5324,0.5210,0.5377,0.5339,0.5366,0.5278,0.4507,0.4300,0.4413,0.4262,0.4497,0.4397,0.4624,0.4469
5
- 10k,0.6076,0.6008,0.5949,0.5693,,,0.6433,0.6202,0.5836,0.5827,,0.5975,0.6046,0.5886,0.6036,0.5987,0.6139,0.5901,0.5279,0.4889,0.5141,0.4872,0.5219,0.5028,0.5454,0.5150
6
- 15k,0.6422,0.6278,0.6314,0.5998,,,0.6716,0.6367,0.6114,0.6002,0.6281,0.6079,0.6336,0.6118,0.6399,0.6266,0.6388,0.6172,0.5495,0.5211,0.5444,0.5142,0.5469,0.5096,0.5785,0.5484
7
- 20k,0.6616,0.6424,0.6496,0.6244,,,0.6855,,0.6271,0.6223,0.6461,0.6230,0.6492,0.6329,0.6511,0.6475,0.6548,0.6382,0.5685,0.5310,0.5579,0.5270,0.5813,0.5377,0.5946,0.5649
8
- 25k,0.6738,0.6577,0.6683,0.6390,,,0.6945,0.6662,0.6413,,0.6612,0.6404,0.6665,0.6417,0.6652,0.6629,0.6683,0.6499,0.5759,0.5369,0.5787,0.5486,0.5864,0.5598,0.6105,0.5796
9
- 30k,0.6863,0.6656,0.6758,0.6368,,,0.7059,0.6639,,0.6387,0.6692,0.6425,0.6746,0.6485,0.6708,0.6584,0.6741,0.6587,0.5891,0.5490,0.5915,0.5437,0.5990,0.5625,0.6197,0.5897
10
- 35k,0.6956,0.6762,0.6850,0.6430,,,0.7158,0.6602,0.6547,0.6420,,0.6348,0.6832,0.6572,0.6816,0.6705,0.6864,0.6682,0.5985,0.5590,0.5954,0.5553,0.6090,0.5667,0.6343,0.6046
11
- 40k,0.7022,0.6812,0.6966,0.6524,,,0.7184,0.6814,0.6642,0.6452,0.6751,0.6347,0.6821,0.6533,0.6865,0.6717,0.6917,0.6646,0.6015,0.5595,0.6033,0.5592,0.6112,0.5704,0.6429,0.6100
12
- 45k,0.7048,0.6954,0.6991,0.6583,,,0.7220,0.6921,0.6698,0.6479,0.6802,,0.6905,0.6616,0.6919,0.6812,0.6933,0.6704,0.6103,0.5663,0.6040,0.5623,0.6175,0.5773,0.6473,0.6212
13
- 50k,0.7171,0.6998,0.7041,0.6574,,,0.7250,0.6785,0.6689,0.6611,0.6931,0.6726,0.6964,0.6720,0.6905,0.6715,0.7018,0.6902,0.6106,0.5510,0.6138,0.5676,0.6230,0.5934,0.6477,0.6109
14
- 55k,0.7187,0.7012,0.7080,0.6768,,,0.7305,0.6967,0.6697,0.6571,0.6899,0.6614,0.6959,0.6764,0.7047,0.6816,0.7052,0.6799,0.6182,0.5759,0.6200,0.5753,0.6260,0.5857,0.6518,0.6145
15
- 60k,0.7240,0.7037,0.7129,0.6748,,,0.7236,0.6955,0.6748,0.6573,0.6941,0.6584,0.6904,0.6850,0.6982,0.6731,0.7040,0.6767,0.6207,0.5849,0.6217,0.5711,0.6318,0.5744,0.6566,0.6204
16
- 65k,0.7297,0.7130,0.7142,0.6700,,,0.7355,0.6994,0.6752,0.6590,0.6907,0.6598,0.7061,0.6772,0.6963,0.6824,0.7074,0.6857,0.6235,0.5766,0.6299,0.5750,0.6381,0.5973,0.6544,0.6264
17
- 70k,0.7298,0.7148,0.7224,0.6796,,,0.7399,0.7034,0.6773,0.6631,0.6968,0.6735,0.7054,0.6789,0.7043,0.6936,0.7074,0.6883,0.6294,0.5982,0.6341,0.5872,0.6403,0.5928,0.6617,0.6220
18
- 75k,0.7329,0.7144,0.7261,0.6972,,,0.7374,0.6934,0.6854,0.6661,0.7014,0.6622,0.7065,0.6843,0.7029,0.6837,0.7027,0.6853,0.6285,0.5932,0.6336,0.5830,0.6376,0.5907,0.6706,0.6237
19
- 80k,0.7414,0.7271,0.7316,0.6937,,,0.7422,0.6989,0.6862,0.6717,0.7051,0.6762,0.7118,0.6954,0.7178,0.6909,0.7139,0.6908,0.6315,0.5898,0.6363,0.5877,0.6491,0.5968,0.6710,0.6032
20
- 85k,0.7449,0.7278,0.7334,0.7011,,,0.7444,0.7101,0.6887,0.6635,0.7086,0.6739,0.7126,0.6872,0.7052,0.6927,0.7178,0.7048,0.6359,0.5970,0.6375,0.5941,0.6380,0.5897,0.6789,0.6203
21
- 90k,0.7483,0.7379,0.7379,0.6949,,,0.7443,0.7064,0.6917,0.6818,0.7079,0.6804,0.7148,0.6926,0.7106,0.6976,0.7146,0.6818,0.6400,0.6052,0.6327,0.5846,0.6521,0.6120,0.6781,0.6271
22
- 95k,0.7510,0.7411,0.7427,0.6987,,,0.7376,0.6943,0.6901,0.6719,0.7097,0.6616,0.7115,0.6946,0.7221,0.6979,0.7240,0.6953,0.6388,0.5870,0.6373,0.5899,0.6460,0.5970,0.6798,0.6320
23
- 100k,0.7550,0.7419,0.7437,0.7070,,,0.7457,0.7153,,,0.7060,0.6902,0.7117,0.6955,0.7167,0.7002,0.7241,0.7013,0.6447,0.6079,0.6431,0.5916,0.6490,0.6095,0.6854,0.6467
24
- 105k,0.7547,0.7424,0.7445,0.7042,,,0.7476,0.7158,,,0.7141,0.6804,0.7132,0.6953,0.7222,0.6980,0.7263,0.6912,0.6470,0.6060,0.6473,0.5908,0.6588,0.6023,0.6809,0.6144
25
- 110k,0.7605,0.7491,0.7540,0.7070,,,0.7486,0.7210,0.6942,0.6850,0.7107,0.6696,0.7166,0.6883,0.7221,0.7020,0.7284,0.7013,0.6482,0.6196,,,0.6620,0.6166,0.6888,0.6269
26
- 115k,0.7626,0.7491,0.7540,0.7070,,,0.7522,0.7213,0.6957,0.6832,0.7120,0.6698,0.7179,0.6955,0.7284,0.7101,0.7274,0.7045,0.6511,0.6004,,,0.6636,0.5998,0.6882,0.6250
27
- 120k,0.7641,0.7545,0.7532,0.7110,,,0.7520,0.7217,0.7022,0.6911,0.7139,0.6855,0.7224,0.7017,0.7132,0.6866,0.7329,0.7089,0.6532,0.6145,,,0.6611,0.6085,0.6874,0.6250
28
- 125k,0.7636,0.7552,0.7538,0.7126,,,0.7533,0.7195,0.7029,0.6946,0.7211,0.6944,0.7221,0.6937,0.7250,0.7155,0.7285,0.7194,0.6571,0.6184,,,0.6624,0.6071,0.6896,0.6294
29
- 130k,0.7619,0.7547,0.7539,0.7168,,,0.7573,0.7178,0.7032,0.6929,0.7195,0.6969,0.7261,0.7103,0.7320,0.7221,0.7337,0.7096,0.6593,0.6174,,,,,0.6929,0.6273
30
- 135k,0.7641,0.7570,0.7543,0.7162,,,0.7580,0.7324,,,0.7177,0.6978,0.7198,0.6969,0.7249,0.7162,0.7324,0.7107,0.6584,0.6316,,,,,0.6941,0.6392
31
- 140k,,,0.7615,0.7250,,,0.7596,0.7329,,,0.7236,0.7106,0.7245,0.7140,0.7306,0.7228,0.7338,0.7099,0.6577,0.6142,,,,,0.6925,0.6334
32
- 145k,,,,,,,0.7573,0.7207,,,0.7194,0.7040,0.7247,0.7077,0.7347,0.7231,0.7431,0.7195,0.6628,0.6295,,,,,0.6984,0.6543
33
- 150k,,,,,,,0.7614,0.7352,,,0.7170,0.7029,,,0.7304,0.7116,0.7386,0.7233,0.6592,0.6212,,,,,0.6978,0.6291
34
- 155k,,,,,,,0.7579,0.7360,,,0.7245,0.7127,0.7294,0.7058,0.7378,0.7162,0.7448,0.7139,0.6662,0.6246,,,,,0.6929,0.6396
35
- 160k,,,,,,,0.7606,0.7356,,,0.7199,0.6983,0.7279,0.7109,0.7343,0.7230,0.7385,0.7172,0.6666,0.6169,,,,,0.7009,0.6266
36
- 165k,,,,,,,,0.7403,,,0.7249,0.7058,0.7297,0.7119,,,0.7493,0.7234,0.6680,0.6268,,,,,0.7003,0.6104
37
- 170k,,,,,,,0.7696,0.7422,,,0.7262,0.7070,0.7323,0.7031,,,0.7499,0.7260,0.6710,0.6319,,,,,0.7010,0.6514
38
- 175k,,,,,,,0.7745,0.7450,,,0.7303,0.7180,0.7338,0.7206,,,0.7502,0.7257,0.6707,0.6205,,,,,0.7047,0.6401
39
- 180k,,,,,,,0.7676,0.7384,,,0.7299,0.7249,0.7316,0.7250,,,0.7457,0.7270,0.6721,0.6327,,,,,0.7079,0.6421
40
- 185k,,,,,,,0.7678,0.7441,,,0.7319,0.7232,0.7354,0.7340,,,0.7519,0.7309,0.6732,0.6275,,,,,0.7050,0.6130
41
- 190k,,,,,,,0.7701,0.7505,,,0.7336,0.7193,,,,,0.7493,0.7305,0.6729,0.6343,,,,,0.7097,0.6568
42
- 195k,,,,,,,0.7730,0.7504,,,0.7293,0.7137,,,,,0.7579,0.7376,0.6774,0.6251,,,,,0.7074,0.6363
43
- 200k,,,,,,,0.7753,0.7521,,,0.7366,0.7138,,,,,0.7567,0.7372,0.6795,0.6279,,,,,0.7122,0.6430
44
- 205k,,,,,,,0.7744,0.7537,,,0.7360,0.7312,,,,,0.7560,0.7453,,0.6293,,,,,0.7175,0.6647
45
- 210k,,,,,,,0.7729,0.7539,,,0.7368,0.7284,,,,,0.7658,0.7465,,0.6431,,,,,0.7179,0.6109
46
- 215k,,,,,,,0.7804,0.7596,,,0.7359,0.7295,,,,,0.7621,0.7357,0.6819,0.6370,,,,,0.7136,0.6287
47
- 220k,,,,,,,0.7752,0.7633,,,0.7384,0.7436,,,,,0.7678,0.7457,0.6860,0.6384,,,,,,
48
- 225k,,,,,,,0.7808,0.7607,,,0.7340,0.7366,,,,,0.7649,0.7427,0.6805,0.6354,,,,,,
49
- 230k,,,,,,,0.7786,0.7614,,,,,,,,,0.7662,0.7561,0.6855,0.6483,,,,,,
50
- 235k,,,,,,,0.7844,0.7619,,,,,,,,,0.7676,0.7532,0.6880,0.6471,,,,,,
51
- 240k,,,,,,,0.7866,0.7677,,,,,,,,,,,0.6841,0.6509,,,,,,
52
- 245k,,,,,,,0.7857,0.7684,,,,,,,,,,,0.6850,0.6487,,,,,,
53
- 250k,,,,,,,0.7851,0.7738,,,,,,,,,,,0.6892,0.6541,,,,,,
54
- 255k,,,,,,,0.7845,0.7716,,,,,,,,,,,0.6875,0.6448,,,,,,
55
- 260k,,,,,,,0.7893,0.7705,,,,,,,,,,,0.6945,0.6480,,,,,,
56
- 265k,,,,,,,0.7918,0.7727,,,,,,,,,,,0.6933,0.6552,,,,,,
57
- 270k,,,,,,,0.7917,0.7725,,,,,,,,,,,0.6980,0.6548,,,,,,
58
- 275k,,,,,,,0.7925,0.7741,,,,,,,,,,,0.6950,0.6604,,,,,,
59
- 280k,,,,,,,0.7943,0.7769,,,,,,,,,,,,0.6574,,,,,,
60
- 285k,,,,,,,0.7946,0.7781,,,,,,,,,,,0.6970,0.6644,,,,,,
61
- 290k,,,,,,,,,,,,,,,,,,,0.6970,0.6674,,,,,,
62
- 300k,,,,,,,,,,,,,,,,,,,0.6969,0.6592,,,,,,
63
- 305k,,,,,,,,,,,,,,,,,,,0.6997,0.6655,,,,,,
64
- 310k,,,,,,,,,,,,,,,,,,,0.6988,0.6639,,,,,,
65
- 315k,,,,,,,,,,,,,,,,,,,0.7023,0.6749,,,,,,
66
- 320k,,,,,,,,,,,,,,,,,,,0.7012,0.6706,,,,,,
67
- 325k,,,,,,,,,,,,,,,,,,,,,,,,,,
68
- 330k,,,,,,,,,,,,,,,,,,,,,,,,,,
69
- 335k,,,,,,,,,,,,,,,,,,,,,,,,,,
 
1
+ ga,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ 0-shot: 5 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.5622,0.5254,0.5324,0.5366
4
+ 10k,0.6433,0.5836,0.6046,0.6139
5
+ 15k,0.6716,0.6114,0.6336,0.6388
6
+ 20k,0.6855,0.6271,0.6492,0.6548
7
+ 25k,0.6945,0.6413,0.6665,0.6683
8
+ 30k,0.7059,,0.6746,0.6741
9
+ 35k,0.7158,0.6547,0.6832,0.6864
10
+ 40k,0.7184,0.6642,0.6821,0.6917
11
+ 45k,0.722,0.6698,0.6905,0.6933
12
+ 50k,0.725,0.6689,0.6964,0.7018
13
+ 55k,0.7305,0.6697,0.6959,0.7052
14
+ 60k,0.7236,0.6748,0.6904,0.704
15
+ 65k,0.7355,0.6752,0.7061,0.7074
16
+ 70k,0.7399,0.6773,0.7054,0.7074
17
+ 75k,0.7374,0.6854,0.7065,0.7027
18
+ 80k,0.7422,0.6862,0.7118,0.7139
19
+ 85k,0.7444,0.6887,0.7126,0.7178
20
+ 90k,0.7443,0.6917,0.7148,0.7146
21
+ 95k,0.7376,0.6901,0.7115,0.724
22
+ 100k,0.7457,,0.7117,0.7241
23
+ 105k,0.7476,,0.7132,0.7263
24
+ 110k,0.7486,0.6942,0.7166,0.7284
25
+ 115k,0.7522,0.6957,0.7179,0.7274
26
+ 120k,0.752,0.7022,0.7224,0.7329
27
+ 125k,0.7533,0.7029,0.7221,0.7285
28
+ 130k,0.7573,0.7032,0.7261,0.7337
29
+ 135k,0.758,,0.7198,0.7324
30
+ 140k,0.7596,,0.7245,0.7338
31
+ 145k,0.7573,,0.7247,0.7431
32
+ 150k,0.7614,,,0.7386
33
+ 155k,0.7579,,0.7294,0.7448
34
+ 160k,0.7606,,0.7279,0.7385
35
+ 165k,,,0.7297,0.7493
36
+ 170k,0.7696,,0.7323,0.7499
37
+ 175k,0.7745,,0.7338,0.7502
38
+ 180k,0.7676,,0.7316,0.7457
39
+ 185k,0.7678,,0.7354,0.7519
40
+ 190k,0.7701,,,0.7493
41
+ 195k,0.773,,,0.7579
42
+ 200k,0.7753,,,0.7567
43
+ 205k,0.7744,,,0.756
44
+ 210k,0.7729,,,0.7658
45
+ 215k,0.7804,,,0.7621
46
+ 220k,0.7752,,,0.7678
47
+ 225k,0.7808,,,0.7649
48
+ 230k,0.7786,,,0.7662
49
+ 235k,0.7844,,,0.7676
50
+ 240k,0.7866,,,
51
+ 245k,0.7857,,,
52
+ 250k,0.7851,,,
53
+ 255k,0.7845,,,
54
+ 260k,0.7893,,,
55
+ 265k,0.7918,,,
56
+ 270k,0.7917,,,
57
+ 275k,0.7925,,,
58
+ 280k,0.7943,,,
59
+ 285k,0.7946,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
 
data/txt360_eval/CKPT Eval - MATH.csv DELETED
@@ -1,68 +0,0 @@
1
- 5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
2
- time: 5 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
3
- 5k,0.2335,0.2308,,0.2251,,0.2157,0.2221,0.2231,0.2211,0.2251,0.2191,0.2271,0.2238
4
- 10k,0.2489,0.2519,,0.2379,0.2211,0.2332,0.2415,0.2342,0.2399,0.2285,0.2342,0.2402,0.2224
5
- 15k,0.2626,0.2469,,0.2526,,0.2389,0.2322,0.2479,0.2580,0.2375,0.2271,0.2355,0.2375
6
- 20k,0.2737,0.2606,,0.2469,0.2399,,0.2419,0.2526,0.2663,0.2469,0.2499,0.2439,0.2322
7
- 25k,0.2700,0.2653,,0.2523,0.2395,0.2600,0.2526,0.2616,0.2559,0.2369,0.2476,0.2462,0.2355
8
- 30k,0.2687,0.2556,,0.2402,,0.2452,0.2533,0.2606,0.2503,0.2456,0.2452,0.2446,0.2372
9
- 35k,0.2765,0.2533,,0.2683,0.2596,0.2590,0.2509,0.2630,0.2737,0.2392,0.2405,0.2536,0.2402
10
- 40k,0.2667,0.2683,,0.2496,0.2496,0.2593,0.2529,0.2697,0.2663,0.2379,0.2486,0.2526,0.2422
11
- 45k,0.2750,0.2620,,0.2616,0.2586,0.2563,0.2503,0.2683,0.2673,0.2479,0.2496,0.2513,0.2472
12
- 50k,0.2861,0.2697,,0.2693,0.2553,0.2596,0.2553,0.2700,0.2771,0.2442,0.2425,0.2546,0.2395
13
- 55k,0.2848,0.2693,,0.2640,0.2630,0.2566,0.2479,0.2630,0.2757,0.2526,0.2506,0.2586,0.2509
14
- 60k,0.2945,0.2784,,0.2727,0.2596,0.2633,0.2590,0.2690,0.2714,0.2519,0.2563,0.2553,0.2479
15
- 65k,0.3008,0.2767,,0.2680,0.2623,0.2704,0.2610,0.2492,0.2727,0.2529,0.2559,0.2647,0.2462
16
- 70k,0.2891,0.2824,,0.2730,0.2596,0.2710,0.2700,0.2677,0.2807,0.2469,0.2459,0.2626,0.2576
17
- 75k,0.2982,0.2938,,0.2784,0.2647,0.2630,0.2697,0.2777,0.2620,0.2626,0.2499,0.2583,0.2549
18
- 80k,0.2948,0.2801,,0.2737,0.2727,0.2643,0.2553,0.2657,0.2704,0.2509,0.2590,0.2549,0.2563
19
- 85k,0.2992,0.2938,,0.2754,0.2620,0.2704,0.2677,0.2600,0.2771,0.2496,0.2385,0.2620,0.2529
20
- 90k,0.3002,0.2888,,0.2764,0.2714,0.2737,0.2573,0.2693,0.2918,0.2616,0.2492,0.2566,0.2516
21
- 95k,0.3025,0.2817,,0.2616,0.2690,0.2737,0.2523,0.2690,0.2791,0.2492,0.2576,0.2576,0.2549
22
- 100k,0.2951,0.2894,,0.2616,,0.2817,0.2660,0.2757,0.2861,0.2546,0.2479,0.2667,0.2559
23
- 105k,0.3052,0.2928,,0.2653,,0.2710,0.2707,0.2771,0.2868,0.2529,0.2482,0.2640,0.2633
24
- 110k,0.3052,0.2985,,0.2600,0.2764,0.2781,0.2600,0.2764,0.2824,0.2536,,0.2727,0.2606
25
- 115k,0.3025,0.2985,,0.2690,0.2791,0.2720,0.2704,0.2744,0.2918,0.2623,,0.2807,0.2496
26
- 120k,0.3042,0.2985,,0.2750,0.2647,0.2650,0.2814,0.2754,0.2955,0.2677,,0.2626,0.2586
27
- 125k,0.3149,0.3018,,0.2683,0.2707,0.2647,0.2757,0.2760,0.2804,0.2509,,0.2704,0.2496
28
- 130k,0.3179,0.2978,,0.2781,0.2747,0.2653,0.2760,0.2774,0.2767,0.2593,,,0.2513
29
- 135k,0.3226,0.2945,,0.2747,,0.2717,0.2673,0.2784,0.2884,0.2606,,,0.2533
30
- 140k,,0.3018,,0.2771,,0.2757,0.2794,0.2787,0.2821,0.2459,,,0.2596
31
- 145k,,,,0.2724,,0.2650,0.2720,0.2888,0.2801,0.2543,,,0.2633
32
- 150k,,,,0.2720,,0.2814,,0.2864,0.2901,0.2590,,,0.2543
33
- 155k,,,,,,0.2784,0.2720,0.2874,0.2938,0.2580,,,0.2566
34
- 160k,,,,0.2817,,0.2834,0.2653,0.2807,0.2814,0.2563,,,0.2549
35
- 165k,,,,0.2834,,0.2821,0.2804,,0.2955,0.2559,,,0.2536
36
- 170k,,,,0.2854,,0.2824,0.2804,,0.3119,0.2536,,,0.2626
37
- 175k,,,,0.2804,,0.2915,0.2750,,0.2988,0.2489,,,0.2657
38
- 180k,,,,0.2767,,0.2901,0.2958,,0.3099,0.2623,,,0.2643
39
- 185k,,,,0.2767,,0.2948,0.2804,,0.3055,0.2570,,,0.2643
40
- 190k,,,,0.2787,,0.2925,,,0.3065,0.2573,,,0.2760
41
- 195k,,,,0.2858,,0.2898,,,0.3119,0.2640,,,0.2657
42
- 200k,,,,0.2771,,0.3028,,,0.3112,0.2610,,,0.2687
43
- 205k,,,,0.2851,,0.2921,,,0.3002,0.2680,,,0.2667
44
- 210k,,,,0.2838,,0.2817,,,0.3022,0.2650,,,0.2714
45
- 215k,,,,0.2838,,0.2851,,,0.3069,0.2653,,,0.2600
46
- 220k,,,,0.2938,,0.2814,,,0.3002,0.2549,,,
47
- 225k,,,,0.2935,,0.2898,,,0.3049,0.2633,,,
48
- 230k,,,,0.2888,,,,,0.3132,0.2653,,,
49
- 235k,,,,0.3055,,,,,0.2951,0.2717,,,
50
- 240k,,,,0.2995,,,,,,0.2667,,,
51
- 245k,,,,0.2928,,,,,,0.2610,,,
52
- 250k,,,,0.3092,,,,,,0.2650,,,
53
- 255k,,,,0.3152,,,,,,0.2643,,,
54
- 260k,,,,0.2951,,,,,,0.2616,,,
55
- 265k,,,,0.3045,,,,,,0.2610,,,
56
- 270k,,,,0.3018,,,,,,,,,
57
- 275k,,,,0.3065,,,,,,,,,
58
- 280k,,,,0.3015,,,,,,,,,
59
- 285k,,,,0.2965,,,,,,0.2586,,,
60
- 290k,,,,,,,,,,0.2623,,,
61
- 300k,,,,,,,,,,0.2603,,,
62
- 305k,,,,,,,,,,0.2630,,,
63
- 310k,,,,,,,,,,0.2710,,,
64
- 315k,,,,,,,,,,0.2677,,,
65
- 320k,,,,,,,,,,0.2650,,,
66
- 325k,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/txt360_eval/CKPT Eval - MMLU.csv CHANGED
@@ -1,68 +1,68 @@
1
- 5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base,Comments
2
- time: 20 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192,"1. Comparing with upsample2-155k, the social science acc of dclm-195k is much higher"
3
- 5k,0.2398,0.2671,,,0.2579,0.2418,0.2482,0.2690,0.2456,0.2512,0.2532,0.2428,0.2530,
4
- 10k,0.2535,0.2520,,0.2594,0.2612,,0.2628,0.2319,0.2525,0.2462,0.2582,0.2713,0.2529,
5
- 15k,0.2527,0.2347,,,,0.2489,0.2334,0.2483,0.2503,0.2549,0.2380,0.2653,0.2507,
6
- 20k,0.2530,0.2478,,0.2495,0.2467,0.2677,0.2449,0.2507,0.2540,0.2416,0.2612,0.2482,0.2553,
7
- 25k,0.2503,0.2488,,,0.2431,0.2597,0.2571,0.2506,0.2534,0.2505,0.2464,0.2577,0.2552,
8
- 30k,0.2297,0.2539,,,,0.2592,0.2678,0.2389,0.2557,0.2468,0.2517,0.2485,0.2556,
9
- 35k,0.2356,0.2374,,0.2426,0.2591,0.2550,0.2562,0.2594,0.2494,0.2403,0.2451,0.2547,0.2443,
10
- 40k,0.2406,0.2462,,0.2467,0.2485,0.2344,0.2408,0.2555,0.2686,0.2552,0.2500,0.2553,0.2775,
11
- 45k,0.2470,0.2428,,0.2418,0.2296,0.2512,0.2712,0.2630,0.2503,0.2368,0.2536,0.2557,0.2393,
12
- 50k,0.2421,0.2368,,0.2382,0.2441,0.2727,0.2558,0.2558,0.2322,0.2499,0.2563,0.2305,0.2485,
13
- 55k,0.2460,0.2551,,0.2408,0.2536,0.2389,0.2440,0.2444,0.2747,0.2552,0.2516,0.2339,0.2595,
14
- 60k,0.2415,0.2397,,0.2718,0.2539,0.2518,0.2339,0.2551,0.2432,0.2517,0.2589,0.2379,0.2589,
15
- 65k,0.2490,0.2641,,0.2637,0.2423,0.2589,0.2342,0.2303,0.2478,0.2485,0.2643,0.2485,0.2798,
16
- 70k,0.2578,0.2641,,0.2534,0.2359,0.2716,0.2673,0.2307,0.2478,0.2483,0.2426,0.2499,0.2583,
17
- 75k,0.2587,0.2599,,0.2529,0.2372,0.2514,0.2579,0.2519,0.2478,0.2742,0.2594,0.2371,0.2653,
18
- 80k,0.2493,0.2519,,0.2504,0.2344,0.2582,0.2535,0.2433,0.2718,0.2596,0.2536,0.2553,0.2573,
19
- 85k,0.2527,0.2789,,0.2547,0.2496,0.2564,0.2418,0.2572,0.2465,0.2663,0.2552,0.2485,0.2584,
20
- 90k,0.2679,0.2668,,0.2595,0.2464,0.2608,0.2359,0.2777,0.2475,0.2543,0.2514,0.2411,0.2499,
21
- 95k,0.2551,0.2763,,0.2621,0.2469,0.2505,0.2534,0.2584,0.2424,0.2607,0.2742,0.2385,0.2521,
22
- 100k,0.2594,0.2564,,0.2550,,0.2614,0.2461,0.2611,0.2497,0.2675,0.2545,0.2540,0.2574,
23
- 105k,0.2787,0.2473,,0.2659,,0.2542,0.2729,0.2666,0.2468,0.2610,0.2726,0.2465,0.2798,
24
- 110k,0.3079,0.2458,,0.2551,0.2629,0.2512,0.2604,0.3027,0.2522,0.2673,,0.2410,0.2540,
25
- 115k,0.3185,0.2458,,0.2624,0.2324,0.2569,0.2590,0.2863,0.2584,0.2624,,0.2396,0.2771,
26
- 120k,0.3139,0.2832,,0.2626,0.2663,0.2718,0.2629,0.3190,0.2748,0.2419,,0.2544,0.2772,
27
- 125k,0.2960,0.2928,,0.2712,0.2733,0.2663,0.2768,0.2788,0.2570,0.2616,,0.2466,0.2856,
28
- 130k,0.3033,0.2844,,0.2404,0.2635,0.2767,0.2676,0.3191,0.2812,0.2538,,,0.2973,
29
- 135k,0.2934,0.2895,,0.2641,,0.2713,0.2735,0.3119,0.2882,0.2661,,,0.3203,
30
- 140k,,0.3045,,0.2553,,0.2811,0.2765,0.2866,0.3019,0.2730,,,0.2772,
31
- 145k,,,,0.2492,,0.2850,0.2708,0.3107,0.3090,0.2582,,,0.3435,
32
- 150k,,,,0.2595,,0.2780,,0.3225,0.3199,0.2541,,,0.3112,
33
- 155k,,,,0.2681,,0.2664,0.2463,0.3618,0.3116,0.2594,,,0.3361,
34
- 160k,,,,0.2605,,0.2793,0.2821,0.3047,0.3240,0.2688,,,0.3392,
35
- 165k,,,,0.2725,,0.2933,0.2816,,0.3478,0.2653,,,0.3485,
36
- 170k,,,,0.2514,,0.2656,0.2893,,0.3423,0.2537,,,0.3355,
37
- 175k,,,,0.2535,,0.3007,0.3317,,0.3156,0.2621,,,0.3162,
38
- 180k,,,,0.2561,,0.2785,0.2624,,0.2893,0.2555,,,0.3398,
39
- 185k,,,,0.2523,,0.3131,0.3026,,0.3876,0.2461,,,0.3631,
40
- 190k,,,,0.2653,,0.3226,,,0.3131,0.2540,,,0.3930,
41
- 195k,,,,0.2681,,0.3136,,,0.3473,0.2550,,,0.3972,
42
- 200k,,,,0.2515,,0.2811,,,0.3257,0.2481,,,0.3660,
43
- 205k,,,,0.2619,,0.3004,,,0.3836,,,,0.3748,
44
- 210k,,,,0.2687,,0.2996,,,0.3063,0.2646,,,0.3668,
45
- 215k,,,,0.2653,,0.3329,,,0.3947,0.2626,,,0.3641,
46
- 220k,,,,0.2631,,0.3590,,,0.3621,0.2600,,,,
47
- 225k,,,,0.2737,,0.3453,,,0.4151,0.2589,,,,
48
- 230k,,,,0.2833,,,,,0.3825,0.2587,,,,
49
- 235k,,,,0.2703,,,,,0.3897,,,,,
50
- 240k,,,,0.2572,,,,,,0.2610,,,,
51
- 245k,,,,0.2700,,,,,,0.2612,,,,
52
- 250k,,,,0.2639,,,,,,0.2583,,,,
53
- 255k,,,,0.2680,,,,,,0.2564,,,,
54
- 260k,,,,0.2897,,,,,,0.2631,,,,
55
- 265k,,,,0.2815,,,,,,0.2635,,,,
56
- 270k,,,,0.2693,,,,,,,,,,
57
- 275k,,,,0.2789,,,,,,0.2643,,,,
58
- 280k,,,,0.3052,,,,,,0.2687,,,,
59
- 285k,,,,0.2850,,,,,,0.2605,,,,
60
- 290k,,,,,,,,,,0.2779,,,,
61
- 300k,,,,,,,,,,0.2755,,,,
62
- 305k,,,,,,,,,,,,,,
63
- 310k,,,,,,,,,,0.2614,,,,
64
- 315k,,,,,,,,,,0.2646,,,,
65
- 320k,,,,,,,,,,0.2745,,,,
66
- 325k,,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,,
 
1
+ 5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ time: 20 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,,0.2579,0.2482,0.2456
4
+ 10k,0.2594,0.2612,0.2628,0.2525
5
+ 15k,,,0.2334,0.2503
6
+ 20k,0.2495,0.2467,0.2449,0.254
7
+ 25k,,0.2431,0.2571,0.2534
8
+ 30k,,,0.2678,0.2557
9
+ 35k,0.2426,0.2591,0.2562,0.2494
10
+ 40k,0.2467,0.2485,0.2408,0.2686
11
+ 45k,0.2418,0.2296,0.2712,0.2503
12
+ 50k,0.2382,0.2441,0.2558,0.2322
13
+ 55k,0.2408,0.2536,0.244,0.2747
14
+ 60k,0.2718,0.2539,0.2339,0.2432
15
+ 65k,0.2637,0.2423,0.2342,0.2478
16
+ 70k,0.2534,0.2359,0.2673,0.2478
17
+ 75k,0.2529,0.2372,0.2579,0.2478
18
+ 80k,0.2504,0.2344,0.2535,0.2718
19
+ 85k,0.2547,0.2496,0.2418,0.2465
20
+ 90k,0.2595,0.2464,0.2359,0.2475
21
+ 95k,0.2621,0.2469,0.2534,0.2424
22
+ 100k,0.255,,0.2461,0.2497
23
+ 105k,0.2659,,0.2729,0.2468
24
+ 110k,0.2551,0.2629,0.2604,0.2522
25
+ 115k,0.2624,0.2324,0.259,0.2584
26
+ 120k,0.2626,0.2663,0.2629,0.2748
27
+ 125k,0.2712,0.2733,0.2768,0.257
28
+ 130k,0.2404,0.2635,0.2676,0.2812
29
+ 135k,0.2641,,0.2735,0.2882
30
+ 140k,0.2553,,0.2765,0.3019
31
+ 145k,0.2492,,0.2708,0.309
32
+ 150k,0.2595,,,0.3199
33
+ 155k,0.2681,,0.2463,0.3116
34
+ 160k,0.2605,,0.2821,0.324
35
+ 165k,0.2725,,0.2816,0.3478
36
+ 170k,0.2514,,0.2893,0.3423
37
+ 175k,0.2535,,0.3317,0.3156
38
+ 180k,0.2561,,0.2624,0.2893
39
+ 185k,0.2523,,0.3026,0.3876
40
+ 190k,0.2653,,,0.3131
41
+ 195k,0.2681,,,0.3473
42
+ 200k,0.2515,,,0.3257
43
+ 205k,0.2619,,,0.3836
44
+ 210k,0.2687,,,0.3063
45
+ 215k,0.2653,,,0.3947
46
+ 220k,0.2631,,,0.3621
47
+ 225k,0.2737,,,0.4151
48
+ 230k,0.2833,,,0.3825
49
+ 235k,0.2703,,,0.3897
50
+ 240k,0.2572,,,
51
+ 245k,0.27,,,
52
+ 250k,0.2639,,,
53
+ 255k,0.268,,,
54
+ 260k,0.2897,,,
55
+ 265k,0.2815,,,
56
+ 270k,0.2693,,,
57
+ 275k,0.2789,,,
58
+ 280k,0.3052,,,
59
+ 285k,0.285,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
data/txt360_eval/CKPT Eval - MedQA.csv CHANGED
@@ -1,68 +1,68 @@
1
- 0-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
2
- time: 3 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
3
- 5k,0.2679,0.2946,,0.2152,,0.2781,0.2482,0.2160,0.2687,0.2639,0.2215,0.2584,0.2396
4
- 10k,0.2749,0.2506,,0.2380,0.2372,0.2624,0.2616,0.2734,0.2718,0.2647,0.2435,0.2789,0.2435
5
- 15k,0.2388,0.2773,,0.2270,,0.2749,0.2797,0.2545,0.2639,0.2726,0.2773,0.2647,0.2412
6
- 20k,0.2412,0.2773,,0.2419,,,0.2317,0.2789,0.2757,0.2372,0.2506,0.2742,0.2577
7
- 25k,0.2302,0.2624,,0.2184,,0.2561,0.2569,0.2804,0.2474,0.2616,0.2498,0.2624,0.2419
8
- 30k,0.2545,0.2553,,0.2679,0.2522,0.2239,0.2097,0.2742,0.2608,0.2592,0.2694,0.2655,0.2765
9
- 35k,0.2757,0.2577,,0.2647,0.2655,0.2671,0.2467,0.2907,0.2694,0.2797,0.2498,0.2749,0.2514
10
- 40k,0.2710,0.2608,,0.2671,0.2396,0.2490,0.2569,0.2734,0.2482,0.2647,0.2742,0.2608,0.2671
11
- 45k,0.2765,0.2506,,0.2742,0.2734,0.2600,0.2255,0.2859,0.2333,0.2671,0.2435,0.2474,0.2749
12
- 50k,0.2742,0.2624,,0.2749,0.2537,0.2482,0.2372,0.2412,0.2655,0.2789,0.2412,0.2702,0.2577
13
- 55k,0.2679,0.2545,,0.2797,0.2561,0.2632,0.2294,0.2718,0.2537,0.2647,0.2647,0.2757,0.2632
14
- 60k,0.2773,0.2427,,0.2294,0.2325,0.2789,,0.2419,0.2639,0.2679,0.2506,0.2687,0.2419
15
- 65k,0.2404,0.2687,,0.2663,0.2757,0.2310,0.2749,0.2836,0.2726,0.2734,0.2537,0.2608,0.2459
16
- 70k,0.2600,0.2592,,0.2592,0.2757,0.2797,0.2632,0.2569,0.2435,0.2773,0.2765,0.2702,0.2584
17
- 75k,0.2679,0.2584,,0.2490,0.2679,0.2789,0.2616,0.2710,0.2765,0.2742,0.2710,0.2687,0.2789
18
- 80k,0.2632,0.2702,,0.2797,0.2419,0.2757,0.2522,0.2616,0.2789,0.2655,0.2694,0.2435,0.2757
19
- 85k,0.2734,0.2451,,0.2655,0.2844,0.2608,0.2687,0.2742,0.2553,0.2663,0.2749,0.2639,0.2773
20
- 90k,0.2797,0.2506,,0.2310,0.2364,0.2679,0.2624,,0.2679,0.2608,0.2561,0.2765,0.2820
21
- 95k,0.2529,0.2545,,0.2742,0.2820,0.2797,0.2647,0.2757,0.2749,0.2663,0.2105,0.2655,0.2749
22
- 100k,0.2694,0.2459,,0.2679,,0.2168,0.2702,0.2459,0.2663,0.2655,0.2537,0.2655,0.2781
23
- 105k,0.2537,0.2529,,0.2655,,0.2773,0.2632,0.2592,0.2726,0.2687,0.2671,0.2749,0.2812
24
- 110k,0.2663,0.2419,,0.2718,0.2474,0.2584,0.2537,0.2569,0.2537,0.2349,,0.2537,0.2765
25
- 115k,0.2459,0.2419,,0.2655,0.2718,0.2773,0.2247,0.2852,0.2867,0.2490,,0.2561,0.2364
26
- 120k,0.2624,0.2561,,0.2930,0.2537,0.2671,,0.2718,0.2844,0.2545,,0.2608,0.2443
27
- 125k,0.2451,0.2742,,0.2624,0.2364,0.2451,0.2145,0.2985,0.2883,0.2726,,0.2498,0.2867
28
- 130k,0.2655,0.2797,,0.2828,0.2412,0.2836,0.2891,0.2930,0.2922,0.2522,,,0.2765
29
- 135k,0.2749,0.2655,,,,0.2443,0.2765,0.2883,0.2702,0.2679,,,0.2679
30
- 140k,,0.2781,,0.2529,,0.2427,0.2545,0.2962,0.2930,0.2569,,,0.2820
31
- 145k,,,,0.2490,,0.2427,0.2718,0.3048,0.3024,0.2639,,,0.2632
32
- 150k,,,,,,0.2694,,0.2482,0.3244,0.2655,,,0.3150
33
- 155k,,,,0.2608,,0.2789,0.2624,0.3134,,0.2490,,,0.3009
34
- 160k,,,,0.2529,,0.2765,0.2726,0.3079,0.2852,0.2577,,,0.2757
35
- 165k,,,,0.2388,,0.2592,0.2742,,0.2561,0.2380,,,0.3009
36
- 170k,,,,0.2435,,0.2852,0.2506,,0.3056,0.2380,,,0.2836
37
- 175k,,,,0.2632,,0.2757,0.2647,,0.3126,0.2671,,,0.2993
38
- 180k,,,,0.2608,,0.2592,0.2899,,0.3166,0.2396,,,0.3071
39
- 185k,,,,0.2710,,0.2859,0.2561,,0.3268,0.2537,,,0.2490
40
- 190k,,,,0.2812,,0.2914,,,0.3040,0.2577,,,0.2828
41
- 195k,,,,0.2482,,0.2797,,,0.3472,0.2694,,,0.2883
42
- 200k,,,,0.2639,,0.2584,,,0.3339,0.2639,,,0.3126
43
- 205k,,,,0.2514,,0.3158,,,0.3409,,,,0.2710
44
- 210k,,,,0.2742,,0.3016,,,0.3378,0.2624,,,0.2962
45
- 215k,,,,0.2592,,0.2859,,,0.3362,,,,0.2859
46
- 220k,,,,0.2262,,0.3001,,,0.3559,0.2781,,,
47
- 225k,,,,0.2490,,0.3134,,,0.3213,0.2608,,,
48
- 230k,,,,0.2357,,,,,0.3472,0.2828,,,
49
- 235k,,,,0.2514,,,,,0.3614,0.2639,,,
50
- 240k,,,,0.2624,,,,,,0.2867,,,
51
- 245k,,,,0.2482,,,,,,0.2718,,,
52
- 250k,,,,0.2592,,,,,,0.2624,,,
53
- 255k,,,,0.2537,,,,,,0.2781,,,
54
- 260k,,,,0.2639,,,,,,0.2679,,,
55
- 265k,,,,0.2844,,,,,,0.2616,,,
56
- 270k,,,,0.2624,,,,,,,,,
57
- 275k,,,,0.2757,,,,,,,,,
58
- 280k,,,,0.2852,,,,,,0.2592,,,
59
- 285k,,,,0.2726,,,,,,0.2781,,,
60
- 290k,,,,,,,,,,0.2671,,,
61
- 300k,,,,,,,,,,0.2742,,,
62
- 305k,,,,,,,,,,0.2624,,,
63
- 310k,,,,,,,,,,0.2718,,,
64
- 315k,,,,,,,,,,0.2694,,,
65
- 320k,,,,,,,,,,0.2749,,,
66
- 325k,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,
 
1
+ 0-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ time: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.2152,,0.2482,0.2687
4
+ 10k,0.238,0.2372,0.2616,0.2718
5
+ 15k,0.227,,0.2797,0.2639
6
+ 20k,0.2419,,0.2317,0.2757
7
+ 25k,0.2184,,0.2569,0.2474
8
+ 30k,0.2679,0.2522,0.2097,0.2608
9
+ 35k,0.2647,0.2655,0.2467,0.2694
10
+ 40k,0.2671,0.2396,0.2569,0.2482
11
+ 45k,0.2742,0.2734,0.2255,0.2333
12
+ 50k,0.2749,0.2537,0.2372,0.2655
13
+ 55k,0.2797,0.2561,0.2294,0.2537
14
+ 60k,0.2294,0.2325,,0.2639
15
+ 65k,0.2663,0.2757,0.2749,0.2726
16
+ 70k,0.2592,0.2757,0.2632,0.2435
17
+ 75k,0.249,0.2679,0.2616,0.2765
18
+ 80k,0.2797,0.2419,0.2522,0.2789
19
+ 85k,0.2655,0.2844,0.2687,0.2553
20
+ 90k,0.231,0.2364,0.2624,0.2679
21
+ 95k,0.2742,0.282,0.2647,0.2749
22
+ 100k,0.2679,,0.2702,0.2663
23
+ 105k,0.2655,,0.2632,0.2726
24
+ 110k,0.2718,0.2474,0.2537,0.2537
25
+ 115k,0.2655,0.2718,0.2247,0.2867
26
+ 120k,0.293,0.2537,,0.2844
27
+ 125k,0.2624,0.2364,0.2145,0.2883
28
+ 130k,0.2828,0.2412,0.2891,0.2922
29
+ 135k,,,0.2765,0.2702
30
+ 140k,0.2529,,0.2545,0.293
31
+ 145k,0.249,,0.2718,0.3024
32
+ 150k,,,,0.3244
33
+ 155k,0.2608,,0.2624,
34
+ 160k,0.2529,,0.2726,0.2852
35
+ 165k,0.2388,,0.2742,0.2561
36
+ 170k,0.2435,,0.2506,0.3056
37
+ 175k,0.2632,,0.2647,0.3126
38
+ 180k,0.2608,,0.2899,0.3166
39
+ 185k,0.271,,0.2561,0.3268
40
+ 190k,0.2812,,,0.304
41
+ 195k,0.2482,,,0.3472
42
+ 200k,0.2639,,,0.3339
43
+ 205k,0.2514,,,0.3409
44
+ 210k,0.2742,,,0.3378
45
+ 215k,0.2592,,,0.3362
46
+ 220k,0.2262,,,0.3559
47
+ 225k,0.249,,,0.3213
48
+ 230k,0.2357,,,0.3472
49
+ 235k,0.2514,,,0.3614
50
+ 240k,0.2624,,,
51
+ 245k,0.2482,,,
52
+ 250k,0.2592,,,
53
+ 255k,0.2537,,,
54
+ 260k,0.2639,,,
55
+ 265k,0.2844,,,
56
+ 270k,0.2624,,,
57
+ 275k,0.2757,,,
58
+ 280k,0.2852,,,
59
+ 285k,0.2726,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
data/txt360_eval/CKPT Eval - NQ.csv CHANGED
@@ -1,68 +1,68 @@
1
- 5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
2
- time: 22 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
3
- 5k,0.0615,0.0537,,0.0341,0.0416,0.0634,0.0565,0.0579,0.0526,0.0219,0.0213,0.0205,0.0274
4
- 10k,0.1075,0.1053,,0.0715,,0.0906,0.0931,0.0828,0.0767,0.0391,0.0418,0.0529,0.0554
5
- 15k,0.1382,0.1136,,0.0765,,0.1147,0.1061,0.1152,0.1127,0.0607,0.0560,0.0629,0.0587
6
- 20k,0.1490,0.1393,,0.0787,,0.1161,0.1183,0.1285,0.1247,0.0529,0.0623,0.0668,0.0709
7
- 25k,0.1687,0.1416,,0.0892,0.1150,0.1402,0.1352,0.1380,0.1343,0.0584,0.0687,0.0762,0.0828
8
- 30k,0.1767,0.1557,,0.0911,0.1366,0.1454,0.1271,0.1501,0.1421,0.0723,0.0687,0.0723,0.0839
9
- 35k,0.1706,0.1756,,0.0970,0.1488,0.1573,0.1485,0.1565,0.1524,0.0803,0.0798,0.0803,0.0778
10
- 40k,0.1942,0.1759,,0.1028,0.1355,0.1560,0.1488,0.1554,0.1562,0.0759,0.0848,0.0845,0.0886
11
- 45k,0.1798,0.1820,,0.1078,0.1488,0.1715,0.1620,0.1684,0.1598,0.0881,0.0911,0.0867,0.0848
12
- 50k,0.1972,0.1809,,0.1050,0.1540,,0.1590,0.1657,0.1698,0.0864,0.0909,0.0909,0.0884
13
- 55k,0.2158,0.1956,,0.1097,0.1607,0.1659,0.1662,0.1751,0.1704,0.0892,0.0898,0.0745,0.0931
14
- 60k,0.2039,0.2036,,0.1211,0.1654,0.1734,0.1612,0.1745,0.1801,0.0817,0.0850,0.0922,0.0986
15
- 65k,0.2244,0.2044,,0.1089,0.1573,0.1765,0.1693,0.1776,0.1823,0.0920,0.0967,0.1025,0.1066
16
- 70k,0.2233,0.2233,,0.1222,0.1634,0.1845,0.1679,0.1859,0.1767,0.1022,0.0925,0.1039,0.1177
17
- 75k,0.2305,0.2277,,0.1097,0.1709,0.1825,0.1881,0.1737,0.1762,0.1069,0.0936,0.1116,0.1199
18
- 80k,0.2457,0.2252,,0.1277,0.1573,0.1900,0.1776,0.1787,0.1964,0.1047,0.0981,0.1033,0.1097
19
- 85k,0.2501,0.2285,,0.1280,0.1776,0.1914,0.1889,0.1870,0.1889,0.0942,0.0964,0.1144,0.1213
20
- 90k,0.2504,0.2521,,0.1158,0.1598,0.1911,0.1806,0.1898,0.1773,0.1058,0.0964,0.1186,0.1163
21
- 95k,0.2579,0.2443,,0.1235,0.1762,0.1911,0.1781,0.1989,0.1917,0.1097,0.0928,0.1213,0.1169
22
- 100k,0.2526,0.2446,,0.1258,,0.2097,0.1928,0.1903,0.1947,0.1125,0.1025,0.1127,0.1188
23
- 105k,0.2679,0.2482,,0.1366,,0.2028,0.1814,0.1922,0.2094,0.1199,0.1069,0.1186,0.1269
24
- 110k,0.2717,0.2562,,0.1377,0.1756,0.2019,0.1859,0.1975,,0.1152,,0.1252,0.1252
25
- 115k,0.2745,0.2562,,0.1346,0.1831,0.1956,0.1947,0.1903,0.2119,0.1127,,0.1285,0.1111
26
- 120k,0.2801,0.2612,,0.1402,0.2014,0.2000,,0.2044,0.2119,0.1188,,0.1166,0.1219
27
- 125k,0.2751,0.2657,,0.1307,0.2030,0.2014,0.1992,0.2053,0.1787,0.1230,,0.1274,0.1418
28
- 130k,0.2884,0.2673,,0.1368,0.1997,0.2125,0.1994,0.2011,0.2086,0.1127,,,0.1335
29
- 135k,0.2842,0.2673,,0.1363,,0.2069,0.2014,0.2036,0.2069,0.1255,,,0.1299
30
- 140k,,0.2679,,0.1435,,0.2039,0.1986,0.2042,0.2058,0.1263,,,0.1299
31
- 145k,,,,0.1532,,0.2172,0.1953,0.2078,0.2102,0.1274,,,0.1443
32
- 150k,,,,0.1404,,0.2125,,0.2127,0.2075,0.1263,,,0.1410
33
- 155k,,,,0.1418,,0.2235,0.1931,0.2066,0.2205,0.1418,,,0.1460
34
- 160k,,,,0.1346,,0.2183,0.2116,0.2069,0.2208,0.1319,,,0.1413
35
- 165k,,,,0.1524,,0.2219,0.2139,,0.2213,0.1296,,,0.1424
36
- 170k,,,,0.1388,,0.2175,,,0.2169,0.1366,,,0.1454
37
- 175k,,,,0.1438,,0.2235,0.2222,,0.2321,0.1349,,,0.1399
38
- 180k,,,,0.1471,,0.2260,0.2249,,0.236,0.1465,,,0.1421
39
- 185k,,,,0.1499,,0.2341,0.2222,,0.2366,0.1449,,,0.1421
40
- 190k,,,,0.1504,,0.2233,,,0.2274,0.1413,,,0.1471
41
- 195k,,,,0.1554,,0.2330,,,0.2454,0.1440,,,0.1407
42
- 200k,,,,0.1565,,0.2238,,,0.2346,0.1407,,,0.1449
43
- 205k,,,,0.1726,,0.2271,,,0.2316,0.1382,,,0.1501
44
- 210k,,,,0.1623,,0.2305,,,0.2493,0.1526,,,0.1424
45
- 215k,,,,0.1576,,0.2299,,,0.2355,0.1518,,,0.1535
46
- 220k,,,,0.1693,,0.2330,,,0.2427,0.1529,,,
47
- 225k,,,,0.1596,,0.2366,,,0.2440,0.1479,,,
48
- 230k,,,,0.1693,,,,,0.2554,0.1560,,,
49
- 235k,,,,0.1720,,,,,0.2535,0.1540,,,
50
- 240k,,,,0.1712,,,,,,0.1554,,,
51
- 245k,,,,0.1704,,,,,,0.1532,,,
52
- 250k,,,,0.1784,,,,,,0.1551,,,
53
- 255k,,,,0.1740,,,,,,0.1623,,,
54
- 260k,,,,0.1756,,,,,,0.1618,,,
55
- 265k,,,,0.1886,,,,,,0.1604,,,
56
- 270k,,,,0.1820,,,,,,0.1612,,,
57
- 275k,,,,0.1870,,,,,,0.1629,,,
58
- 280k,,,,0.1704,,,,,,0.1645,,,
59
- 285k,,,,0.1903,,,,,,0.1665,,,
60
- 290k,,,,,,,,,,0.1648,,,
61
- 300k,,,,,,,,,,0.1712,,,
62
- 305k,,,,,,,,,,0.1690,,,
63
- 310k,,,,,,,,,,0.1712,,,
64
- 315k,,,,,,,,,,,,,
65
- 320k,,,,,,,,,,,,,
66
- 325k,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,
 
1
+ 5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ time: 22 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.0341,0.0416,0.0565,0.0526
4
+ 10k,0.0715,,0.0931,0.0767
5
+ 15k,0.0765,,0.1061,0.1127
6
+ 20k,0.0787,,0.1183,0.1247
7
+ 25k,0.0892,0.115,0.1352,0.1343
8
+ 30k,0.0911,0.1366,0.1271,0.1421
9
+ 35k,0.097,0.1488,0.1485,0.1524
10
+ 40k,0.1028,0.1355,0.1488,0.1562
11
+ 45k,0.1078,0.1488,0.162,0.1598
12
+ 50k,0.105,0.154,0.159,0.1698
13
+ 55k,0.1097,0.1607,0.1662,0.1704
14
+ 60k,0.1211,0.1654,0.1612,0.1801
15
+ 65k,0.1089,0.1573,0.1693,0.1823
16
+ 70k,0.1222,0.1634,0.1679,0.1767
17
+ 75k,0.1097,0.1709,0.1881,0.1762
18
+ 80k,0.1277,0.1573,0.1776,0.1964
19
+ 85k,0.128,0.1776,0.1889,0.1889
20
+ 90k,0.1158,0.1598,0.1806,0.1773
21
+ 95k,0.1235,0.1762,0.1781,0.1917
22
+ 100k,0.1258,,0.1928,0.1947
23
+ 105k,0.1366,,0.1814,0.2094
24
+ 110k,0.1377,0.1756,0.1859,
25
+ 115k,0.1346,0.1831,0.1947,0.2119
26
+ 120k,0.1402,0.2014,,0.2119
27
+ 125k,0.1307,0.203,0.1992,0.1787
28
+ 130k,0.1368,0.1997,0.1994,0.2086
29
+ 135k,0.1363,,0.2014,0.2069
30
+ 140k,0.1435,,0.1986,0.2058
31
+ 145k,0.1532,,0.1953,0.2102
32
+ 150k,0.1404,,,0.2075
33
+ 155k,0.1418,,0.1931,0.2205
34
+ 160k,0.1346,,0.2116,0.2208
35
+ 165k,0.1524,,0.2139,0.2213
36
+ 170k,0.1388,,,0.2169
37
+ 175k,0.1438,,0.2222,0.2321
38
+ 180k,0.1471,,0.2249,0.236
39
+ 185k,0.1499,,0.2222,0.2366
40
+ 190k,0.1504,,,0.2274
41
+ 195k,0.1554,,,0.2454
42
+ 200k,0.1565,,,0.2346
43
+ 205k,0.1726,,,0.2316
44
+ 210k,0.1623,,,0.2493
45
+ 215k,0.1576,,,0.2355
46
+ 220k,0.1693,,,0.2427
47
+ 225k,0.1596,,,0.244
48
+ 230k,0.1693,,,0.2554
49
+ 235k,0.172,,,0.2535
50
+ 240k,0.1712,,,
51
+ 245k,0.1704,,,
52
+ 250k,0.1784,,,
53
+ 255k,0.174,,,
54
+ 260k,0.1756,,,
55
+ 265k,0.1886,,,
56
+ 270k,0.182,,,
57
+ 275k,0.187,,,
58
+ 280k,0.1704,,,
59
+ 285k,0.1903,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
data/txt360_eval/CKPT Eval - PIQA.csv CHANGED
@@ -1,69 +1,68 @@
1
- ,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
2
- 0-shot: 3 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
3
- 5-shot: 4 min,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot
4
- 5k,0.7236,0.7073,0.7176,0.7133,,,0.7470,0.7263,,0.7106,0.7280,0.7242,0.7378,0.7296,0.7356,0.7323,0.7318,0.7263,0.7078,0.7002,0.7057,0.6844,0.7116,0.6975,0.7046,0.6942
5
- 10k,0.7492,0.7421,0.7427,0.7318,,,0.7650,0.7524,0.7573,0.7454,0.7568,0.7486,0.7557,0.7492,0.7726,0.7568,0.7612,0.7486,0.7198,0.7089,0.7209,0.7127,0.7280,0.7236,0.7367,0.7144
6
- 15k,0.7688,0.7541,0.7639,0.7481,,,0.7775,0.7655,0.7628,,0.7748,0.7622,0.7769,0.7622,0.7786,0.7737,0.7655,0.7661,0.7367,0.7329,0.7378,0.7312,0.7350,0.7318,0.7443,0.7133
7
- 20k,0.7639,0.7655,0.7682,0.7579,,,0.7807,0.7612,0.7671,0.7590,0.7845,,0.7709,0.7693,0.7813,0.7650,0.7840,0.7758,0.7465,0.7378,0.7470,0.7312,0.7486,0.7394,0.7514,0.7323
8
- 25k,0.7639,0.7677,0.7682,0.7671,,,0.7878,0.7748,,0.7590,,0.7693,0.7913,0.7671,0.7818,0.7715,0.7791,0.7715,0.7470,0.7378,0.7503,0.7437,0.7470,0.7492,0.7497,0.7345
9
- 30k,0.7764,0.7677,0.7797,0.7563,,,0.7862,0.7704,0.7780,0.7617,0.7802,0.7737,0.7829,0.7655,0.7813,0.7661,0.7889,0.7731,0.7524,0.7388,0.7497,0.7497,0.7546,0.7437,0.7563,0.7427
10
- 35k,0.7802,0.7677,0.7769,0.7622,,,0.7933,0.7726,0.7769,0.7699,0.7878,0.7682,0.7824,0.7737,0.7813,0.7797,0.7987,0.7780,0.7573,0.7361,0.7508,0.7421,0.7612,0.7579,0.7655,0.7486
11
- 40k,0.7873,0.7802,0.7802,0.7682,,,0.7905,0.7758,,0.7731,0.7889,0.7731,0.7943,0.7704,0.7835,0.7775,0.7878,0.7769,0.7573,0.7383,0.7579,0.7481,0.7606,0.7492,0.7650,0.7519
12
- 45k,0.7813,0.7786,0.7764,0.7699,,,0.7982,0.7824,0.7786,0.7661,0.7911,0.7775,0.7829,0.7737,0.7894,0.7835,0.7949,0.7780,0.7579,0.7465,0.7639,0.7465,0.7612,0.7514,0.7677,0.7497
13
- 50k,0.7818,0.7797,0.7878,0.7753,,,0.7992,0.7780,0.7775,0.7748,0.7856,0.7775,0.7943,0.7699,0.7998,0.7851,0.7933,0.7786,0.7557,0.7437,0.7524,0.7443,0.7677,0.7579,0.7802,0.7601
14
- 55k,0.7900,0.7780,0.7905,0.7829,,,0.8079,0.7737,0.7786,0.7775,0.7878,0.7731,0.7884,0.7780,0.7976,0.7905,0.7943,0.7824,0.7661,0.7546,0.7655,0.7541,0.7704,0.7606,0.7704,0.7606
15
- 60k,0.7916,0.7851,0.7911,0.7797,,,0.7922,0.7797,0.7818,0.7813,0.7900,0.7699,0.7905,0.7661,0.7943,0.7878,0.8003,0.7818,0.7650,0.7530,0.7628,0.7557,0.7661,0.7628,0.7661,0.7579
16
- 65k,0.7938,0.7840,0.7927,0.7840,,,0.7976,0.7769,0.7900,0.7780,0.7933,0.7731,0.7835,0.7671,0.7960,0.7845,0.7943,0.7780,0.7720,0.7492,0.7606,0.7535,0.7748,0.7639,0.7704,0.7584
17
- 70k,0.7922,0.7835,0.7922,0.7845,,,0.8052,0.7916,0.7916,0.7818,0.7949,0.7807,0.7900,0.7726,0.7889,0.7845,0.7976,0.7900,0.7633,0.7524,0.7612,0.7552,0.7644,0.7622,0.7699,0.7568
18
- 75k,0.7938,0.7927,0.7949,0.7840,,,0.8030,0.7873,0.7878,0.7715,0.7938,0.7807,0.8079,0.7922,0.7927,0.7905,0.8020,0.7933,0.7655,0.7541,0.7682,0.7508,0.7737,0.7568,0.7737,0.7573
19
- 80k,0.7911,0.7878,0.7873,0.7894,,,0.7971,0.7742,0.7829,0.7797,0.7987,0.7824,0.7992,0.7894,0.8003,0.7900,0.7933,0.7884,0.7671,0.7497,0.7682,0.7524,0.7748,0.7563,0.7742,0.7628
20
- 85k,0.7949,0.7894,0.7900,0.7889,,,0.8003,0.7840,0.8014,0.7786,0.8025,0.7894,0.7949,0.7818,0.7992,0.7894,0.7965,0.7851,0.7682,0.7530,0.7731,0.7563,0.7829,0.7622,0.7780,0.7704
21
- 90k,0.7982,0.7894,0.7916,0.7943,,,0.7976,0.7797,0.7873,0.7720,0.7971,0.7862,0.7856,0.7845,0.7960,0.7976,0.7998,0.7878,0.7731,0.7535,0.7650,0.7552,0.7737,0.7622,0.7742,0.7617
22
- 95k,0.8058,0.7992,0.8020,0.7873,,,0.8041,0.7742,0.7905,0.7840,0.8014,0.7807,0.7954,0.7829,0.8025,0.7911,0.8003,0.7884,0.7709,0.7535,0.7699,0.7519,0.7731,0.7612,0.7753,0.7704
23
- 100k,0.8069,0.7992,0.8052,0.7873,,,0.8069,0.7856,,,0.8041,0.7851,0.7998,0.7824,0.8014,0.7927,0.8009,0.7905,0.7628,0.7508,0.7715,0.7628,0.7748,0.7584,0.7758,0.7720
24
- 105k,0.8058,0.7965,0.8025,0.7943,,,0.8074,0.7916,,,0.8030,0.7900,0.8063,0.7927,0.8036,0.7949,0.7960,0.7905,0.7688,0.7568,0.7644,0.7601,0.7753,0.7682,0.7797,0.7639
25
- 110k,0.8041,0.7987,0.8069,0.7982,,,0.8085,0.7797,0.7856,0.7856,0.8009,0.7922,0.7938,0.7856,0.8020,0.7911,0.7998,0.7916,0.7682,0.7563,,,0.7791,0.7699,0.7845,0.7633
26
- 115k,0.8090,0.8009,0.8069,0.7982,,,0.8118,0.7867,0.7911,0.7802,0.8020,0.7867,0.8041,0.7922,0.8052,0.7916,0.8052,0.7938,0.7612,0.7541,,,0.7780,0.7633,0.7709,0.7639
27
- 120k,0.8145,0.7949,0.8041,0.7911,,,0.8074,0.7878,0.7982,0.7851,0.7976,0.7922,0.8025,0.7905,0.7938,0.7927,0.7949,0.7905,0.7704,0.7715,,,0.7813,0.7720,0.7867,0.7758
28
- 125k,0.8079,0.8009,0.8058,0.7900,,,0.8107,0.7829,0.8009,0.7900,0.8020,0.7894,0.8047,0.7916,0.8047,0.7976,0.8003,0.7922,0.7677,0.7671,,,0.7824,0.7737,0.7764,0.7699
29
- 130k,0.8069,0.8058,0.8041,0.7982,,,0.8079,0.7845,0.7916,0.7797,0.8036,0.7916,0.8014,0.7949,0.8058,0.8014,0.7922,0.7943,0.7835,0.7622,,,,,0.7748,0.7720
30
- 135k,0.8063,0.8047,0.8090,0.8020,,,0.8074,0.7878,,,0.8009,0.7878,0.8052,0.7835,0.8014,0.8030,0.8014,0.7927,0.7764,0.7682,,,,,0.7867,0.7813
31
- 140k,,,0.8090,0.7992,,,0.8123,0.7911,,,0.8047,0.7916,0.8063,0.7971,0.8079,0.8036,0.7987,0.7976,0.7764,0.7628,,,,,0.7862,0.7720
32
- 145k,,,,,,,0.8069,0.7807,,,0.8047,0.7922,0.8052,0.7845,0.7982,0.8025,0.8030,0.8085,0.7748,0.7688,,,,,0.7791,0.7699
33
- 150k,,,,,,,0.8058,0.7949,,,0.8058,0.7878,,,0.8090,0.7998,0.7987,0.8025,0.7693,0.7579,,,,,0.7916,0.7769
34
- 155k,,,,,,,0.8096,0.8041,,,0.8096,0.7922,0.7954,0.7775,0.8101,0.8041,0.8107,0.7965,0.7769,0.7639,,,,,0.7933,0.7731
35
- 160k,,,,,,,0.8101,0.7900,,,0.8014,0.7976,0.8020,0.7894,0.8128,0.8036,0.8079,0.8009,0.7753,0.7715,,,,,0.7987,0.7709
36
- 165k,,,,,,,0.8112,0.7933,,,0.8030,0.7971,0.8058,0.7878,,,0.8101,0.8009,0.7824,0.7709,,,,,0.7873,0.7682
37
- 170k,,,,,,,,0.7916,,,0.8047,0.7954,0.8041,0.7922,,,0.8036,0.8041,0.7797,0.7720,,,,,0.7884,0.7715
38
- 175k,,,,,,,0.8194,0.7965,,,0.8030,0.7911,0.7982,0.7927,,,0.8118,0.8096,0.7709,0.7666,,,,,0.7911,0.7802
39
- 180k,,,,,,,0.8118,0.7845,,,0.8041,0.7954,0.8025,0.7987,,,0.8172,0.7976,0.7775,0.7677,,,,,0.7884,0.7851
40
- 185k,,,,,,,0.8259,0.7982,,,0.8025,0.7960,0.8036,0.7905,,,0.8096,0.7987,0.7851,0.7737,,,,,0.7927,0.7813
41
- 190k,,,,,,,0.8139,0.8025,,,0.7998,0.7987,,,,,0.8128,0.7998,0.7840,0.7758,,,,,0.7922,0.7867
42
- 195k,,,,,,,0.8188,0.7965,,,0.8090,0.7878,,,,,0.8161,0.8052,0.7748,0.7677,,,,,0.7884,0.7769
43
- 200k,,,,,,,0.8112,0.8025,,,0.8079,0.8009,,,,,0.8128,0.8041,0.7802,0.7726,,,,,0.7916,0.7802
44
- 205k,,,,,,,0.8188,0.8009,,,0.8003,0.7938,,,,,0.8177,0.8145,0.7813,0.7726,,,,,0.7949,0.7748
45
- 210k,,,,,,,0.8188,0.7971,,,0.8047,0.7889,,,,,0.8161,0.8101,0.7818,0.7786,,,,,0.7894,0.7867
46
- 215k,,,,,,,0.8188,0.7992,,,0.8030,0.7922,,,,,0.8085,0.8085,0.7813,0.7748,,,,,0.7845,0.7802
47
- 220k,,,,,,,0.8199,0.8030,,,0.8085,0.7976,,,,,0.8096,0.8074,0.7769,0.7704,,,,,,
48
- 225k,,,,,,,0.8199,0.8041,,,0.8052,0.8014,,,,,0.8134,0.8101,0.7829,0.7731,,,,,,
49
- 230k,,,,,,,0.8172,0.8041,,,,,,,,,0.8134,0.8107,0.7824,0.7802,,,,,,
50
- 235k,,,,,,,0.8199,0.8085,,,,,,,,,0.8205,0.8118,0.7813,0.7764,,,,,,
51
- 240k,,,,,,,0.8166,0.8101,,,,,,,,,,,0.7829,0.7824,,,,,,
52
- 245k,,,,,,,0.8215,0.8090,,,,,,,,,,,0.7873,0.7753,,,,,,
53
- 250k,,,,,,,0.8172,0.8107,,,,,,,,,,,0.7807,0.7797,,,,,,
54
- 255k,,,,,,,0.8254,0.8128,,,,,,,,,,,0.7824,0.7737,,,,,,
55
- 260k,,,,,,,0.8215,0.809,,,,,,,,,,,0.7807,0.7797,,,,,,
56
- 265k,,,,,,,0.8210,0.8139,,,,,,,,,,,0.7775,0.7753,,,,,,
57
- 270k,,,,,,,0.8145,0.8079,,,,,,,,,,,0.7824,,,,,,,
58
- 275k,,,,,,,0.8161,0.8139,,,,,,,,,,,0.7889,0.7769,,,,,,
59
- 280k,,,,,,,0.8248,0.8150,,,,,,,,,,,0.7807,0.7726,,,,,,
60
- 285k,,,,,,,0.8210,0.8101,,,,,,,,,,,0.7916,0.7818,,,,,,
61
- 290k,,,,,,,,,,,,,,,,,,,0.7851,0.7758,,,,,,
62
- 300k,,,,,,,,,,,,,,,,,,,0.7840,0.7780,,,,,,
63
- 305k,,,,,,,,,,,,,,,,,,,0.7873,0.7829,,,,,,
64
- 310k,,,,,,,,,,,,,,,,,,,0.7813,0.7829,,,,,,
65
- 315k,,,,,,,,,,,,,,,,,,,0.7851,0.7791,,,,,,
66
- 320k,,,,,,,,,,,,,,,,,,,0.7873,0.7813,,,,,,
67
- 325k,,,,,,,,,,,,,,,,,,,,,,,,,,
68
- 330k,,,,,,,,,,,,,,,,,,,,,,,,,,
69
- 335k,,,,,,,,,,,,,,,,,,,,,,,,,,
 
1
+ ,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ 0-shot: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.747,,0.7378,0.7318
4
+ 10k,0.765,0.7573,0.7557,0.7612
5
+ 15k,0.7775,0.7628,0.7769,0.7655
6
+ 20k,0.7807,0.7671,0.7709,0.784
7
+ 25k,0.7878,,0.7913,0.7791
8
+ 30k,0.7862,0.778,0.7829,0.7889
9
+ 35k,0.7933,0.7769,0.7824,0.7987
10
+ 40k,0.7905,,0.7943,0.7878
11
+ 45k,0.7982,0.7786,0.7829,0.7949
12
+ 50k,0.7992,0.7775,0.7943,0.7933
13
+ 55k,0.8079,0.7786,0.7884,0.7943
14
+ 60k,0.7922,0.7818,0.7905,0.8003
15
+ 65k,0.7976,0.79,0.7835,0.7943
16
+ 70k,0.8052,0.7916,0.79,0.7976
17
+ 75k,0.803,0.7878,0.8079,0.802
18
+ 80k,0.7971,0.7829,0.7992,0.7933
19
+ 85k,0.8003,0.8014,0.7949,0.7965
20
+ 90k,0.7976,0.7873,0.7856,0.7998
21
+ 95k,0.8041,0.7905,0.7954,0.8003
22
+ 100k,0.8069,,0.7998,0.8009
23
+ 105k,0.8074,,0.8063,0.796
24
+ 110k,0.8085,0.7856,0.7938,0.7998
25
+ 115k,0.8118,0.7911,0.8041,0.8052
26
+ 120k,0.8074,0.7982,0.8025,0.7949
27
+ 125k,0.8107,0.8009,0.8047,0.8003
28
+ 130k,0.8079,0.7916,0.8014,0.7922
29
+ 135k,0.8074,,0.8052,0.8014
30
+ 140k,0.8123,,0.8063,0.7987
31
+ 145k,0.8069,,0.8052,0.803
32
+ 150k,0.8058,,,0.7987
33
+ 155k,0.8096,,0.7954,0.8107
34
+ 160k,0.8101,,0.802,0.8079
35
+ 165k,0.8112,,0.8058,0.8101
36
+ 170k,,,0.8041,0.8036
37
+ 175k,0.8194,,0.7982,0.8118
38
+ 180k,0.8118,,0.8025,0.8172
39
+ 185k,0.8259,,0.8036,0.8096
40
+ 190k,0.8139,,,0.8128
41
+ 195k,0.8188,,,0.8161
42
+ 200k,0.8112,,,0.8128
43
+ 205k,0.8188,,,0.8177
44
+ 210k,0.8188,,,0.8161
45
+ 215k,0.8188,,,0.8085
46
+ 220k,0.8199,,,0.8096
47
+ 225k,0.8199,,,0.8134
48
+ 230k,0.8172,,,0.8134
49
+ 235k,0.8199,,,0.8205
50
+ 240k,0.8166,,,
51
+ 245k,0.8215,,,
52
+ 250k,0.8172,,,
53
+ 255k,0.8254,,,
54
+ 260k,0.8215,,,
55
+ 265k,0.821,,,
56
+ 270k,0.8145,,,
57
+ 275k,0.8161,,,
58
+ 280k,0.8248,,,
59
+ 285k,0.821,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
 
data/txt360_eval/CKPT Eval - TriviaQA.csv CHANGED
@@ -1,68 +1,68 @@
1
- 5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base,Comments
2
- time: 76 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192,1. Takes 25min to load checkpoints.
3
- 5k,0.1944,0.1764,,0.1025,0.1232,0.1086,0.1260,0.1647,,0.0841,0.0762,0.0824,0.1066,2. GPU utility is only 20%
4
- 10k,0.3372,0.3292,,0.2073,,0.2636,0.1150,0.2659,0.2604,0.1348,0.1343,0.1585,0.1850,
5
- 15k,0.4050,0.3909,,0.3005,,0.3250,0.1872,0.3445,0.3244,0.1821,0.1930,0.1968,0.2443,
6
- 20k,0.4451,0.4497,,0.3506,0.2795,,0.2719,0.3802,0.3637,0.2086,0.2196,0.2231,0.2772,
7
- 25k,0.4899,0.4601,,0.3070,,0.3975,0.4093,0.4105,0.4120,0.2261,0.2375,0.2574,0.3146,
8
- 30k,0.5125,0.4824,,0.2461,0.2974,0.0303,0.4195,0.4330,0.4294,0.2352,0.2484,0.2675,0.3328,
9
- 35k,0.5249,0.5091,,0.3639,0.3572,0.1983,0.3587,0.4434,0.4428,0.2433,,0.2863,0.3507,
10
- 40k,0.5555,0.5166,,0.3537,0.0346,0.4571,0.4434,0.4618,0.4623,0.2708,0.2828,0.3020,0.3606,
11
- 45k,0.5664,0.5403,,0.3602,0.2674,0.2654,0.4366,0.4746,0.4792,0.2668,0.3018,0.3065,0.3726,
12
- 50k,0.5690,0.5217,,0.2407,0.3689,0.4355,0.4051,0.4885,0.4795,0.2906,0.2952,0.3187,0.3807,
13
- 55k,0.5843,0.5680,,0.2081,0.4101,0.4341,0.3230,0.4931,0.4940,0.2940,0.3117,0.3242,0.3984,
14
- 60k,0.5916,0.5814,,0.4068,0.4107,0.4861,0.4469,0.4955,0.5130,0.3137,0.3090,0.3422,0.4081,
15
- 65k,0.6032,0.5774,,0.3145,0.4477,0.4858,0.4907,0.5039,0.5087,0.3097,0.3184,0.3397,0.4156,
16
- 70k,0.6030,0.5920,,0.4102,0.4736,0.5080,0.4920,0.5164,0.5129,0.3236,0.3360,0.3375,0.4242,
17
- 75k,0.6216,0.6187,,0.2820,0.4226,0.4777,0.2245,0.5190,0.5042,0.3265,0.3341,0.3483,0.4220,
18
- 80k,0.6397,0.6127,,0.0975,0.4217,0.3698,,0.5185,0.5301,0.3352,0.3412,0.3532,0.4306,
19
- 85k,0.6416,0.6254,,0.0722,0.4763,0.3700,0.5029,0.5249,0.5350,0.3448,0.3423,0.3530,0.4340,
20
- 90k,0.6510,0.6317,,0.3388,0.1472,0.4793,0.0317,0.5337,0.5220,0.3440,0.3559,0.3644,0.4418,
21
- 95k,0.6655,0.6479,,0.5283,0.4938,0.5144,0.5180,0.5432,0.5446,0.3331,0.3393,0.3683,0.4454,
22
- 100k,0.6723,0.6486,,0.4317,0.1100,0.5121,0.5358,0.5383,0.5514,0.3520,0.3544,0.3698,0.4378,
23
- 105k,0.6755,0.6582,,0.1886,,0.5280,0.5153,0.5499,0.5562,0.3626,0.3642,0.3683,0.4525,
24
- 110k,0.6798,0.6668,,0.3510,,0.5468,0.5182,0.5541,0.5654,0.3694,,0.3903,0.4566,
25
- 115k,0.6796,0.6668,,0.3692,0.4759,0.5347,0.5132,0.5508,0.5577,0.3741,,0.3908,0.4482,
26
- 120k,0.6822,0.6688,,0.3690,0.4352,0.5376,0.5483,0.5567,0.5658,0.3881,,0.3950,0.4524,
27
- 125k,0.6894,0.6743,,0.3365,0.5206,0.4855,0.5211,0.5617,0.5658,0.3725,,0.3880,0.4592,
28
- 130k,0.6914,0.6709,,0.3550,0.0088,0.5238,0.5245,0.5597,0.5609,0.3698,,,0.4594,
29
- 135k,0.6915,0.6721,,0.3892,,0.5467,0.3977,0.5541,0.5774,0.3782,,,0.4636,
30
- 140k,,0.6773,,0.3930,,0.3110,0.4991,0.5572,0.5675,0.3906,,,0.4741,
31
- 145k,,,,0.4538,,0.5720,0.4872,0.5642,0.5639,,,,0.4720,
32
- 150k,,,,0.2883,,0.5612,,0.5701,0.5844,0.3899,,,0.4651,
33
- 155k,,,,0.4185,,0.5030,0.1586,0.5790,0.5755,0.4044,,,0.4784,
34
- 160k,,,,0.2720,,0.5701,0.5630,0.5819,0.5864,0.4049,,,0.4665,
35
- 165k,,,,0.4252,,0.5388,0.5642,,0.5853,0.4007,,,0.4793,
36
- 170k,,,,0.1507,,0.5951,0.5739,,,0.4150,,,0.4846,
37
- 175k,,,,0.3242,,0.5437,0.5640,,0.5979,0.4092,,,0.4908,
38
- 180k,,,,0.2653,,0.5580,0.5912,,0.6054,0.4189,,,,
39
- 185k,,,,0.2651,,0.5709,0.5852,,0.6064,,,,0.5030,
40
- 190k,,,,0.2380,,0.5142,,,0.5996,0.4193,,,0.5115,
41
- 195k,,,,0.4048,,0.5964,,,0.6243,0.4265,,,,
42
- 200k,,,,0.5058,,0.5684,,,0.6248,0.4256,,,,
43
- 205k,,,,0.0945,,0.5878,,,0.6224,0.4190,,,0.5105,
44
- 210k,,,,0.1557,,0.6020,,,0.6311,0.4415,,,0.5164,
45
- 215k,,,,0.2483,,0.5995,,,0.6293,0.4353,,,0.5163,
46
- 220k,,,,0.1725,,0.5924,,,0.6375,,,,,
47
- 225k,,,,0.2467,,0.4832,,,0.6340,0.4556,,,,
48
- 230k,,,,0.1653,,,,,0.6436,0.4622,,,,
49
- 235k,,,,0.1884,,,,,0.6411,0.4608,,,,
50
- 240k,,,,0.0719,,,,,,0.4536,,,,
51
- 245k,,,,0.3757,,,,,,0.4641,,,,
52
- 250k,,,,0.5859,,,,,,,,,,
53
- 255k,,,,0.4987,,,,,,0.4741,,,,
54
- 260k,,,,0.3940,,,,,,0.4712,,,,
55
- 265k,,,,0.3607,,,,,,0.4767,,,,
56
- 270k,,,,0.3898,,,,,,0.4795,,,,
57
- 275k,,,,0.4123,,,,,,,,,,
58
- 280k,,,,0.2413,,,,,,0.4787,,,,
59
- 285k,,,,0.3665,,,,,,0.4843,,,,
60
- 290k,,,,,,,,,,0.4818,,,,
61
- 300k,,,,,,,,,,0.4969,,,,
62
- 305k,,,,,,,,,,0.4941,,,,
63
- 310k,,,,,,,,,,0.4963,,,,
64
- 315k,,,,,,,,,,,,,,
65
- 320k,,,,,,,,,,,,,,
66
- 325k,,,,,,,,,,,,,,
67
- 330k,,,,,,,,,,,,,,
68
- 335k,,,,,,,,,,,,,,
 
1
+ 5-shot,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ time: 76 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.1025,0.1232,0.126,
4
+ 10k,0.2073,,0.115,0.2604
5
+ 15k,0.3005,,0.1872,0.3244
6
+ 20k,0.3506,0.2795,0.2719,0.3637
7
+ 25k,0.307,,0.4093,0.412
8
+ 30k,0.2461,0.2974,0.4195,0.4294
9
+ 35k,0.3639,0.3572,0.3587,0.4428
10
+ 40k,0.3537,0.0346,0.4434,0.4623
11
+ 45k,0.3602,0.2674,0.4366,0.4792
12
+ 50k,0.2407,0.3689,0.4051,0.4795
13
+ 55k,0.2081,0.4101,0.323,0.494
14
+ 60k,0.4068,0.4107,0.4469,0.513
15
+ 65k,0.3145,0.4477,0.4907,0.5087
16
+ 70k,0.4102,0.4736,0.492,0.5129
17
+ 75k,0.282,0.4226,0.2245,0.5042
18
+ 80k,0.0975,0.4217,,0.5301
19
+ 85k,0.0722,0.4763,0.5029,0.535
20
+ 90k,0.3388,0.1472,0.0317,0.522
21
+ 95k,0.5283,0.4938,0.518,0.5446
22
+ 100k,0.4317,0.11,0.5358,0.5514
23
+ 105k,0.1886,,0.5153,0.5562
24
+ 110k,0.351,,0.5182,0.5654
25
+ 115k,0.3692,0.4759,0.5132,0.5577
26
+ 120k,0.369,0.4352,0.5483,0.5658
27
+ 125k,0.3365,0.5206,0.5211,0.5658
28
+ 130k,0.355,0.0088,0.5245,0.5609
29
+ 135k,0.3892,,0.3977,0.5774
30
+ 140k,0.393,,0.4991,0.5675
31
+ 145k,0.4538,,0.4872,0.5639
32
+ 150k,0.2883,,,0.5844
33
+ 155k,0.4185,,0.1586,0.5755
34
+ 160k,0.272,,0.563,0.5864
35
+ 165k,0.4252,,0.5642,0.5853
36
+ 170k,0.1507,,0.5739,
37
+ 175k,0.3242,,0.564,0.5979
38
+ 180k,0.2653,,0.5912,0.6054
39
+ 185k,0.2651,,0.5852,0.6064
40
+ 190k,0.238,,,0.5996
41
+ 195k,0.4048,,,0.6243
42
+ 200k,0.5058,,,0.6248
43
+ 205k,0.0945,,,0.6224
44
+ 210k,0.1557,,,0.6311
45
+ 215k,0.2483,,,0.6293
46
+ 220k,0.1725,,,0.6375
47
+ 225k,0.2467,,,0.634
48
+ 230k,0.1653,,,0.6436
49
+ 235k,0.1884,,,0.6411
50
+ 240k,0.0719,,,
51
+ 245k,0.3757,,,
52
+ 250k,0.5859,,,
53
+ 255k,0.4987,,,
54
+ 260k,0.394,,,
55
+ 265k,0.3607,,,
56
+ 270k,0.3898,,,
57
+ 275k,0.4123,,,
58
+ 280k,0.2413,,,
59
+ 285k,0.3665,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
data/txt360_eval/CKPT Eval - WinoGrande.csv CHANGED
@@ -1,69 +1,68 @@
1
- ,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
2
- 0-shot: 3 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
3
- 5-shot: 3 min,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot
4
- 5k,0.5454,0.5367,0.5572,0.5367,,,0.5691,0.5335,0.5351,0.5233,0.5241,,0.5367,0.5391,0.5470,0.5414,0.5383,0.5320,0.5067,0.5201,0.5257,0.5122,0.5020,0.5217,0.5217,0.5312
5
- 10k,0.5801,0.6054,0.5730,0.5525,,,0.5904,0.5620,0.5604,0.5738,0.5825,0.5691,0.5817,0.5809,0.5777,0.5706,0.5667,0.5620,0.5383,0.5375,0.5241,0.5430,0.5343,0.5241,0.5722,0.5517
6
- 15k,0.6172,0.6038,0.5848,0.6046,,,0.5927,0.5951,0.5919,0.5699,,0.6014,0.5880,0.5856,0.5991,0.5943,0.5896,0.5841,0.5596,0.5470,0.5391,0.5414,0.5509,0.5454,0.5620,0.5612
7
- 20k,0.6109,0.6267,0.5935,0.6085,,,0.6448,0.6022,0.6006,,0.6204,,0.6180,0.6235,0.6014,0.6062,0.5935,0.6014,0.5580,0.5691,0.5533,0.5328,0.5659,0.5580,0.5943,0.5714
8
- 25k,0.6417,0.6369,0.5998,0.6140,,,0.6196,0.6164,0.6125,0.5998,0.6093,0.6117,0.6062,0.6117,0.6212,0.6014,0.6101,0.6188,0.5785,0.5770,0.5556,0.5572,0.5762,0.5691,0.5904,0.5864
9
- 30k,0.6377,0.6456,0.6251,0.6361,,,0.6488,0.6251,,,0.6330,0.5983,0.6140,0.6022,0.6338,0.6109,0.6322,0.6188,0.5667,0.5770,0.5612,0.5493,0.5919,0.5604,0.6054,0.5935
10
- 35k,0.6472,0.6456,0.6196,0.5935,,,0.6440,0.6227,0.6030,0.6172,0.6393,0.6243,0.6259,0.6093,0.6172,0.6164,0.6212,0.6188,0.5817,0.5572,0.5848,0.5612,0.5777,0.5714,0.6172,0.5975
11
- 40k,0.6606,0.6630,0.6369,0.6259,,,0.6496,0.6077,0.6338,0.5951,,,0.6267,0.6101,0.6417,0.6235,0.6433,0.6361,0.5856,0.5770,0.5817,0.5777,0.5927,0.5635,0.6117,0.5959
12
- 45k,0.6717,0.6496,0.6196,0.6425,,,0.6456,0.6417,0.6172,0.6109,0.6448,0.6235,0.6393,0.6417,0.6425,0.6369,0.6393,0.6346,0.5777,0.5699,0.5872,0.5809,0.5951,0.5833,0.6156,0.6085
13
- 50k,0.6756,0.6725,0.6369,0.6377,,,0.6464,0.6275,0.6401,0.5975,0.6322,0.6322,0.6164,0.614,0.6425,0.6425,0.6472,0.6393,0.5833,0.5683,0.5872,0.5675,0.5967,0.5580,0.6338,0.6022
14
- 55k,0.6661,0.6590,0.6496,0.6614,,,0.6567,0.6314,0.6235,0.6062,0.6519,0.6133,0.6314,0.6235,0.6377,0.648,0.6464,0.6322,0.5738,0.5612,0.5935,0.5722,0.6148,0.5533,0.6140,0.614
15
- 60k,0.6622,0.6511,0.6377,0.6582,,,0.6480,0.6464,0.6251,0.6117,0.6535,0.6251,0.6219,0.6235,0.6480,0.6196,0.6369,0.6338,0.5864,0.5730,0.5746,0.5683,0.5896,0.5785,0.6062,0.6283
16
- 65k,0.6669,0.6772,0.6590,0.6685,,,0.6654,0.6354,0.6283,0.6180,0.6519,0.6196,0.6401,0.6393,0.6559,0.633,0.6504,0.6275,0.5919,0.5754,0.5825,0.5793,0.6101,0.5959,0.6290,0.6125
17
- 70k,0.6827,0.6811,0.6567,0.6701,,,0.6709,0.6401,0.6322,0.6235,0.6622,,0.6417,0.6409,0.6433,0.6338,0.6559,0.6361,0.5975,0.5927,0.5738,0.5588,0.5983,0.5738,0.6330,0.6101
18
- 75k,0.6788,0.6819,0.6543,0.6685,,,0.6709,0.6283,0.6480,0.6172,0.6654,0.6409,0.6527,0.6267,0.6488,0.6551,0.6527,0.6638,0.5959,0.5817,0.5880,0.5517,0.6164,0.5793,0.6196,0.6164
19
- 80k,0.6835,0.6882,0.6748,0.6638,,,0.6843,0.6464,0.6504,0.6275,0.6677,0.6401,0.6369,0.6298,0.6606,0.6488,0.6519,0.6440,0.5872,0.5675,0.5856,0.5564,0.6148,0.6038,0.6188,0.5983
20
- 85k,0.6867,0.6882,0.6638,0.6590,,,0.6875,0.6504,0.6409,0.6188,,0.6535,0.6575,0.6283,0.6606,0.6393,0.6393,0.6543,0.6085,0.5919,0.5872,0.5738,0.6156,0.5817,0.6361,0.6196
21
- 90k,0.6827,0.6803,0.6740,0.6598,,,0.6740,0.6393,0.6369,0.6306,0.6496,0.6543,0.6488,0.6409,0.6606,0.6314,0.6527,0.6409,0.5927,0.5825,0.5951,0.5888,0.6148,0.6077,0.6259,0.6164
22
- 95k,0.6859,0.6859,0.6764,0.6575,,,0.6835,0.6401,0.6369,0.6361,0.6551,0.6322,0.6654,0.6338,0.6630,0.6322,0.6409,0.6582,0.6156,0.5927,0.5864,0.5919,0.6164,0.5809,0.6283,0.6014
23
- 100k,0.6898,,0.6661,0.6851,,,0.6756,0.6567,,,0.6567,0.6472,0.6590,0.6488,0.6748,0.6204,0.6511,0.6519,0.6109,0.5817,0.5919,0.5746,0.6133,0.6030,0.6488,0.6338
24
- 105k,0.6811,0.6772,0.6654,0.6646,,,0.6772,0.6519,,,0.6661,0.6472,0.6732,0.6369,0.6638,0.633,0.6740,0.6638,0.6140,0.5991,0.5912,0.5833,0.6046,0.5943,0.6496,0.618
25
- 110k,0.7017,0.6867,0.6701,0.6654,,,0.6669,0.6480,0.6559,0.6456,0.6756,0.6551,0.6567,0.6401,0.6661,0.6456,0.6551,0.6535,0.6196,0.6006,,,0.6219,0.6069,0.6417,0.6338
26
- 115k,0.6890,0.7040,0.6701,0.6654,,,0.6732,0.6511,0.6456,0.6227,0.6559,0.6456,0.6661,0.6488,0.6748,0.6527,0.6622,0.6448,0.6156,0.6014,,,0.6338,0.6069,0.6575,0.6196
27
- 120k,0.6930,0.6953,0.6717,0.6701,,,0.6764,0.6464,0.6519,0.6275,0.6622,0.6480,0.6590,0.6322,0.6732,0.6377,0.6519,0.6622,0.5872,0.5612,,,0.6227,0.6077,0.6504,0.6275
28
- 125k,0.6961,0.6977,0.6811,0.6819,,,0.6985,0.6433,0.6393,0.6417,0.6685,0.6433,0.6646,0.6338,0.6740,0.6559,0.6803,0.6693,0.6014,0.5888,,,0.6298,0.6243,0.6488,0.6117
29
- 130k,0.6922,0.7056,0.6859,0.6717,,,0.6811,0.6330,0.6614,0.6393,0.6780,0.6322,0.6590,0.6361,0.6748,0.6456,0.6559,0.6472,0.6085,0.5880,,,,,0.6614,0.6235
30
- 135k,0.6961,0.6953,0.6788,0.6756,,,0.6827,0.6614,,,0.6606,0.6575,0.6551,0.6464,0.6748,0.629,0.6677,0.6535,0.5991,0.5959,,,,,0.6433,0.6417
31
- 140k,,,0.6819,0.6827,,,0.6867,0.6630,,,0.6598,0.6551,0.6567,0.6369,0.6709,0.6551,0.6638,0.6519,0.6038,0.5809,,,,,0.6472,0.6314
32
- 145k,,,,,,,0.6819,0.6504,,,0.6717,0.6480,0.6669,0.6551,0.6661,0.6433,0.6725,0.6630,0.6180,0.5801,,,,,0.6606,0.644
33
- 150k,,,,,,,0.6835,0.6646,,,0.6693,0.6654,,,0.6732,0.6148,0.6788,0.6409,0.6062,0.5991,,,,,0.6567,0.6361
34
- 155k,,,,,,,0.6748,0.6590,,,0.6772,0.6677,0.6630,0.648,0.6851,0.6409,0.6922,0.6764,0.6204,0.6006,,,,,0.6677,0.6401
35
- 160k,,,,,,,0.6875,0.6614,,,0.6709,0.6669,0.6748,0.648,0.6622,0.6638,0.6811,0.6803,0.6133,0.5864,,,,,0.6567,0.6322
36
- 165k,,,,,,,0.6788,0.6661,,,0.6709,0.6717,0.6725,0.6433,,,,,0.6006,0.5856,,,,,0.6669,0.6472
37
- 170k,,,,,,,0.6938,0.6709,,,0.6701,0.6598,0.6725,0.6354,,,0.6717,0.6867,0.6085,0.5833,,,,,0.6685,0.6575
38
- 175k,,,,,,,0.6938,0.6693,,,0.6590,0.6622,0.6693,0.6614,,,0.6890,0.6867,0.6133,0.5754,,,,,0.6622,0.6598
39
- 180k,,,,,,,0.6977,0.6646,,,0.6661,0.6646,0.6740,0.6661,,,0.6685,0.6504,0.6235,0.5967,,,,,0.6732,0.6535
40
- 185k,,,,,,,0.6875,0.6519,,,0.6930,0.6535,0.6811,0.663,,,0.6851,0.6819,0.6156,0.5833,,,,,0.6661,0.648
41
- 190k,,,,,,,0.6914,0.6859,,,0.6819,0.6606,,,,,0.6693,0.6638,0.6361,0.6006,,,,,0.6701,0.6488
42
- 195k,,,,,,,0.6859,0.6614,,,0.6946,0.6732,,,,,0.6756,0.6638,0.6259,0.5841,,,,,0.6606,0.6543
43
- 200k,,,,,,,0.6875,0.6669,,,0.6898,0.6780,,,,,0.7017,0.6701,0.6227,0.5872,,,,,0.6590,0.648
44
- 205k,,,,,,,0.7072,0.6906,,,0.6969,0.6780,,,,,0.6827,0.6748,0.6306,0.5888,,,,,0.6725,0.659
45
- 210k,,,,,,,0.6859,0.6661,,,0.6827,0.6748,,,,,0.6882,0.6717,0.6322,0.5919,,,,,0.6669,0.6488
46
- 215k,,,,,,,0.7017,0.6780,,,0.6748,0.6772,,,,,0.6922,0.6709,0.6346,0.6006,,,,,0.6709,0.6661
47
- 220k,,,,,,,0.7040,0.6788,,,0.6859,0.6732,,,,,0.6969,0.6638,0.6346,0.5983,,,,,,
48
- 225k,,,,,,,0.7111,0.6717,,,0.6843,0.6685,,,,,0.6756,0.6606,0.6188,0.5935,,,,,,
49
- 230k,,,,,,,0.7103,0.6811,,,,,,,,,0.7096,0.6701,0.6235,0.5935,,,,,,
50
- 235k,,,,,,,0.7040,0.6772,,,,,,,,,0.7096,0.6764,0.6306,0.6062,,,,,,
51
- 240k,,,,,,,0.7080,0.6851,,,,,,,,,,,0.6219,,,,,,,
52
- 245k,,,,,,,0.6985,0.6938,,,,,,,,,,,0.6267,0.5888,,,,,,
53
- 250k,,,,,,,0.7127,0.6938,,,,,,,,,,,0.6361,0.6006,,,,,,
54
- 255k,,,,,,,0.7119,0.6827,,,,,,,,,,,0.6440,0.5998,,,,,,
55
- 260k,,,,,,,0.7056,0.6867,,,,,,,,,,,0.6322,0.5975,,,,,,
56
- 265k,,,,,,,0.7040,0.6756,,,,,,,,,,,0.6338,0.6069,,,,,,
57
- 270k,,,,,,,0.7111,0.6819,,,,,,,,,,,0.6314,0.5991,,,,,,
58
- 275k,,,,,,,0.7127,0.6811,,,,,,,,,,,0.6306,0.6148,,,,,,
59
- 280k,,,,,,,0.7064,0.6914,,,,,,,,,,,0.6251,0.6054,,,,,,
60
- 285k,,,,,,,0.7096,0.6977,,,,,,,,,,,0.6385,,,,,,,
61
- 290k,,,,,,,,,,,,,,,,,,,0.6338,0.6077,,,,,,
62
- 300k,,,,,,,,,,,,,,,,,,,0.6227,0.6093,,,,,,
63
- 305k,,,,,,,,,,,,,,,,,,,0.6290,0.6069,,,,,,
64
- 310k,,,,,,,,,,,,,,,,,,,0.6267,0.6156,,,,,,
65
- 315k,,,,,,,,,,,,,,,,,,,0.6314,0.6101,,,,,,
66
- 320k,,,,,,,,,,,,,,,,,,,0.6401,0.5991,,,,,,
67
- 325k,,,,,,,,,,,,,,,,,,,,,,,,,,
68
- 330k,,,,,,,,,,,,,,,,,,,,,,,,,,
69
- 335k,,,,,,,,,,,,,,,,,,,,,,,,,,
 
1
+ ,FineWeb-1.5T,Ours-Base,Ours-Upsampling2,All-Upsampling1
2
+ 0-shot: 3 min,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192
3
+ 5k,0.5691,0.5351,0.5367,0.5383
4
+ 10k,0.5904,0.5604,0.5817,0.5667
5
+ 15k,0.5927,0.5919,0.588,0.5896
6
+ 20k,0.6448,0.6006,0.618,0.5935
7
+ 25k,0.6196,0.6125,0.6062,0.6101
8
+ 30k,0.6488,,0.614,0.6322
9
+ 35k,0.644,0.603,0.6259,0.6212
10
+ 40k,0.6496,0.6338,0.6267,0.6433
11
+ 45k,0.6456,0.6172,0.6393,0.6393
12
+ 50k,0.6464,0.6401,0.6164,0.6472
13
+ 55k,0.6567,0.6235,0.6314,0.6464
14
+ 60k,0.648,0.6251,0.6219,0.6369
15
+ 65k,0.6654,0.6283,0.6401,0.6504
16
+ 70k,0.6709,0.6322,0.6417,0.6559
17
+ 75k,0.6709,0.648,0.6527,0.6527
18
+ 80k,0.6843,0.6504,0.6369,0.6519
19
+ 85k,0.6875,0.6409,0.6575,0.6393
20
+ 90k,0.674,0.6369,0.6488,0.6527
21
+ 95k,0.6835,0.6369,0.6654,0.6409
22
+ 100k,0.6756,,0.659,0.6511
23
+ 105k,0.6772,,0.6732,0.674
24
+ 110k,0.6669,0.6559,0.6567,0.6551
25
+ 115k,0.6732,0.6456,0.6661,0.6622
26
+ 120k,0.6764,0.6519,0.659,0.6519
27
+ 125k,0.6985,0.6393,0.6646,0.6803
28
+ 130k,0.6811,0.6614,0.659,0.6559
29
+ 135k,0.6827,,0.6551,0.6677
30
+ 140k,0.6867,,0.6567,0.6638
31
+ 145k,0.6819,,0.6669,0.6725
32
+ 150k,0.6835,,,0.6788
33
+ 155k,0.6748,,0.663,0.6922
34
+ 160k,0.6875,,0.6748,0.6811
35
+ 165k,0.6788,,0.6725,
36
+ 170k,0.6938,,0.6725,0.6717
37
+ 175k,0.6938,,0.6693,0.689
38
+ 180k,0.6977,,0.674,0.6685
39
+ 185k,0.6875,,0.6811,0.6851
40
+ 190k,0.6914,,,0.6693
41
+ 195k,0.6859,,,0.6756
42
+ 200k,0.6875,,,0.7017
43
+ 205k,0.7072,,,0.6827
44
+ 210k,0.6859,,,0.6882
45
+ 215k,0.7017,,,0.6922
46
+ 220k,0.704,,,0.6969
47
+ 225k,0.7111,,,0.6756
48
+ 230k,0.7103,,,0.7096
49
+ 235k,0.704,,,0.7096
50
+ 240k,0.708,,,
51
+ 245k,0.6985,,,
52
+ 250k,0.7127,,,
53
+ 255k,0.7119,,,
54
+ 260k,0.7056,,,
55
+ 265k,0.704,,,
56
+ 270k,0.7111,,,
57
+ 275k,0.7127,,,
58
+ 280k,0.7064,,,
59
+ 285k,0.7096,,,
60
+ 290k,,,,
61
+ 300k,,,,
62
+ 305k,,,,
63
+ 310k,,,,
64
+ 315k,,,,
65
+ 320k,,,,
66
+ 325k,,,,
67
+ 330k,,,,
68
+ 335k,,,,
 
main.py CHANGED
@@ -54,7 +54,7 @@ front_matter = {
54
  "author": "Nikhil Ranjan",
55
  "authorURL": "https://huggingface.co/nikhilranjan",
56
  "affiliation": "MBZUAI",
57
- "affiliationURL": "",
58
  },
59
  {
60
  "author": "Omkar Pangarkar",
@@ -62,6 +62,12 @@ front_matter = {
62
  "affiliation": "Petuum, Inc.",
63
  "affiliationURL": "",
64
  },
 
 
 
 
 
 
65
  {
66
  "author": "Zhen Wang",
67
  "authorURL": "",
@@ -74,6 +80,12 @@ front_matter = {
74
  "affiliation": "UCSD",
75
  "affiliationURL": "",
76
  },
 
 
 
 
 
 
77
  {
78
  "author": "Zhoujun Cheng",
79
  "authorURL": "https://huggingface.co/zhoujun",
 
54
  "author": "Nikhil Ranjan",
55
  "authorURL": "https://huggingface.co/nikhilranjan",
56
  "affiliation": "MBZUAI",
57
+ "affiliationURL": "LLM360.ai",
58
  },
59
  {
60
  "author": "Omkar Pangarkar",
 
62
  "affiliation": "Petuum, Inc.",
63
  "affiliationURL": "",
64
  },
65
+ {
66
+ "author": "Xuezhi Liang",
67
+ "authorURL": "",
68
+ "affiliation": "MBZUAI",
69
+ "affiliationURL": "",
70
+ },
71
  {
72
  "author": "Zhen Wang",
73
  "authorURL": "",
 
80
  "affiliation": "UCSD",
81
  "affiliationURL": "",
82
  },
83
+ {
84
+ "author": "Bhaskar Rao",
85
+ "authorURL": "",
86
+ "affiliation": "MBZUAI",
87
+ "affiliationURL": "",
88
+ },
89
  {
90
  "author": "Zhoujun Cheng",
91
  "authorURL": "https://huggingface.co/zhoujun",
results.py CHANGED
@@ -25,10 +25,10 @@ for fname in os.listdir("data/txt360_eval"):
25
  df = pd.read_csv(os.path.join("data/txt360_eval", fname))
26
 
27
  # slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
28
- fineweb_res = df.iloc[2:, 4].astype(float).fillna(method="bfill") # fineweb
29
- txt360_base = df.iloc[2:, 5].astype(float).fillna(method="bfill") # txt360-dedup-only
30
- txt360_web_up = df.iloc[2:, 7].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
31
- txt360_all_up_stack = df.iloc[2:, 9].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
32
 
33
  # each row is 20B tokens.
34
  # all_eval_results[metric_name]["slimpajama"] = slimpajama_res
@@ -66,10 +66,6 @@ for metric_name, res in all_eval_results.items():
66
  mode='lines', name='TxT360 - Full Upsampled + Stack V2'
67
  ))
68
 
69
- print(all_eval_results[metric_name]["token"])
70
- print(all_eval_results[metric_name]["fineweb"].tolist())
71
- print(all_eval_results[metric_name]["txt360-web-only-upsampled"].tolist())
72
-
73
  # Update layout
74
  fig_res.update_layout(
75
  title=f"{metric_name} Performance",
@@ -825,7 +821,7 @@ table_div_1 = Div(NotStr(table_html),
825
  intro_div = Div(
826
  H2("TxT360 Studies"),
827
  H3("What This Section Contains"),
828
- P("This section shows the learning curve when pre-training on TxT360, with a proper upsampling approach. We compare several simple strategies and demonstrate that one particular upsampling method, inspired by the natural data distribution, performs exceptionally well. In our preliminary experiments, the model learns significantly faster on TxT360 compared to a similarly scaled dataset, FineWeb. We believe that a more carefully designed upsampling strategy could further enhance the use of our data."),
829
  P("In addition to the training results, we also provide an analysis of the dataset, including perplexity trends over time across the CommonCrawl snapshots. This section is organized into the following topic areas:"),
830
  Ul(
831
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
@@ -865,17 +861,18 @@ upsampling_exp = Div(
865
  "Evaluation results are the most direct indicator of model quality. We assess the intermediate results of the models across multiple metrics and plot the learning curves. Our findings indicate that the model learns significantly faster with TxT360. For a fair comparison, we evaluate TxT360 against FineWeb using only the CommonCrawl data sources, and we also show the curves after incorporating the 14 curated sources and coding data (Stack V2), demonstrating the full potential of the dataset. Due to computation resource constraints, we stop running experiments when we can observe clear trends."
866
  ),
867
  P(
868
- "Based on the metrics, we find that TxT360’s CommonCrawl portion consistently outperforms FineWeb after upsampling, particularly on challenging tasks like MMLU and generation tasks such as NQ. Similar to the findings in DCLM, adding non-CommonCrawl data sources produces mixed results, especially when testing with that specific version of the data. We have since updated the non-CC data to further reduce noise."
869
  ),
870
  plotly2fasthtml(all_eval_res_figs["MMLU"]),
871
  plotly2fasthtml(all_eval_res_figs["NQ"]),
872
- # plotly2fasthtml(all_eval_res_figs["GSM8K"]),
873
  plotly2fasthtml(all_eval_res_figs["HellaSwag"]),
 
 
 
874
  plotly2fasthtml(all_eval_res_figs["MedQA"]),
875
  plotly2fasthtml(all_eval_res_figs["PIQA"]),
876
  plotly2fasthtml(all_eval_res_figs["TriviaQA"]),
877
  plotly2fasthtml(all_eval_res_figs["WinoGrande"]),
878
-
879
  H3("Comparing the Loss Curves"),
880
  P(
881
  "We also plot the training and validation loss curves for each dataset, showing that TxT360 achieves both lower training and validation losses compared to FineWeb. Although training loss may not correlate directly with final model performance, we observe that the loss curve for TxT360 exhibits fewer spikes compared to FineWeb, indicating more stable training dynamics."
 
25
  df = pd.read_csv(os.path.join("data/txt360_eval", fname))
26
 
27
  # slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
28
+ fineweb_res = df.iloc[2:, 1].astype(float).fillna(method="bfill") # fineweb
29
+ txt360_base = df.iloc[2:, 2].astype(float).fillna(method="bfill") # txt360-dedup-only
30
+ txt360_web_up = df.iloc[2:, 3].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
31
+ txt360_all_up_stack = df.iloc[2:, 4].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
32
 
33
  # each row is 20B tokens.
34
  # all_eval_results[metric_name]["slimpajama"] = slimpajama_res
 
66
  mode='lines', name='TxT360 - Full Upsampled + Stack V2'
67
  ))
68
 
 
 
 
 
69
  # Update layout
70
  fig_res.update_layout(
71
  title=f"{metric_name} Performance",
 
821
  intro_div = Div(
822
  H2("TxT360 Studies"),
823
  H3("What This Section Contains"),
824
+ P("This section shows the learning curve when pre-training on TxT360, with a proper upsampling approach. We compare several simple strategies and demonstrate that one particular upsampling method, inspired by the natural data distribution, performs exceptionally well. In our preliminary experiments, the model learns significantly faster on TxT360 compared to a similarly scaled dataset, FineWeb, on several important evaluation metrics. We believe that a more carefully designed upsampling strategy could further enhance the use of our data."),
825
  P("In addition to the training results, we also provide an analysis of the dataset, including perplexity trends over time across the CommonCrawl snapshots. This section is organized into the following topic areas:"),
826
  Ul(
827
  Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
 
861
  "Evaluation results are the most direct indicator of model quality. We assess the intermediate results of the models across multiple metrics and plot the learning curves. Our findings indicate that the model learns significantly faster with TxT360. For a fair comparison, we evaluate TxT360 against FineWeb using only the CommonCrawl data sources, and we also show the curves after incorporating the 14 curated sources and coding data (Stack V2), demonstrating the full potential of the dataset. Due to computation resource constraints, we stop running experiments when we can observe clear trends."
862
  ),
863
  P(
864
+ "Based on the metrics, we find that TxT360’s CommonCrawl portion with the upsampling strategy outperforms FineWeb on key metrics such as MMLU and NQ, but falls slightly behind on HellaSwag. Further, we show that by combining TxT360 with coding data (Stack V2), the learning curve is significantly more stable and we observe improved results across almost all of the metrics. Apparently, the dataset preference here may depend on the set of metrics one would use."
865
  ),
866
  plotly2fasthtml(all_eval_res_figs["MMLU"]),
867
  plotly2fasthtml(all_eval_res_figs["NQ"]),
 
868
  plotly2fasthtml(all_eval_res_figs["HellaSwag"]),
869
+ P(
870
+ "Similar to the findings in DCLM, adding the curated non-CommonCrawl data sources produces mixed results (some preliminary figures are not shown here). Yet such data can help with domain-specific tasks like MedQA."
871
+ ),
872
  plotly2fasthtml(all_eval_res_figs["MedQA"]),
873
  plotly2fasthtml(all_eval_res_figs["PIQA"]),
874
  plotly2fasthtml(all_eval_res_figs["TriviaQA"]),
875
  plotly2fasthtml(all_eval_res_figs["WinoGrande"]),
 
876
  H3("Comparing the Loss Curves"),
877
  P(
878
  "We also plot the training and validation loss curves for each dataset, showing that TxT360 achieves both lower training and validation losses compared to FineWeb. Although training loss may not correlate directly with final model performance, we observe that the loss curve for TxT360 exhibits fewer spikes compared to FineWeb, indicating more stable training dynamics."