|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 25000, |
|
"global_step": 125000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.6675467491149902, |
|
"learning_rate": 0.0002988, |
|
"loss": 0.2058, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.40607237815856934, |
|
"learning_rate": 0.00029759999999999997, |
|
"loss": 0.2096, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9344772100448608, |
|
"learning_rate": 0.0002964, |
|
"loss": 0.237, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.254749059677124, |
|
"learning_rate": 0.00029519999999999997, |
|
"loss": 0.2275, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8561826348304749, |
|
"learning_rate": 0.000294, |
|
"loss": 0.2355, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.0209927558898926, |
|
"learning_rate": 0.00029279999999999996, |
|
"loss": 0.2275, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9906947612762451, |
|
"learning_rate": 0.0002916, |
|
"loss": 0.2188, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9113153219223022, |
|
"learning_rate": 0.00029039999999999996, |
|
"loss": 0.2272, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.4784849882125854, |
|
"learning_rate": 0.0002892, |
|
"loss": 0.2437, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9436748623847961, |
|
"learning_rate": 0.00028799999999999995, |
|
"loss": 0.2378, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9457006454467773, |
|
"learning_rate": 0.0002868, |
|
"loss": 0.2244, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.6395999193191528, |
|
"learning_rate": 0.00028559999999999995, |
|
"loss": 0.2251, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4920419156551361, |
|
"learning_rate": 0.0002844, |
|
"loss": 0.2249, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.67146235704422, |
|
"learning_rate": 0.00028319999999999994, |
|
"loss": 0.2223, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.5474592447280884, |
|
"learning_rate": 0.00028199999999999997, |
|
"loss": 0.2231, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.2913522720336914, |
|
"learning_rate": 0.0002808, |
|
"loss": 0.2344, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.6871858835220337, |
|
"learning_rate": 0.00027959999999999997, |
|
"loss": 0.2325, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6918925046920776, |
|
"learning_rate": 0.0002784, |
|
"loss": 0.2329, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9310051798820496, |
|
"learning_rate": 0.0002772, |
|
"loss": 0.2309, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.6937859058380127, |
|
"learning_rate": 0.000276, |
|
"loss": 0.2362, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.504669666290283, |
|
"learning_rate": 0.0002748, |
|
"loss": 0.2295, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7235252261161804, |
|
"learning_rate": 0.0002736, |
|
"loss": 0.2286, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.0144588947296143, |
|
"learning_rate": 0.0002724, |
|
"loss": 0.2302, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7903403043746948, |
|
"learning_rate": 0.0002712, |
|
"loss": 0.2266, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.6691497564315796, |
|
"learning_rate": 0.00027, |
|
"loss": 0.2356, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6409358978271484, |
|
"learning_rate": 0.0002688, |
|
"loss": 0.2292, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7465757131576538, |
|
"learning_rate": 0.0002676, |
|
"loss": 0.2347, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.411289930343628, |
|
"learning_rate": 0.00026639999999999997, |
|
"loss": 0.2275, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5167527198791504, |
|
"learning_rate": 0.0002652, |
|
"loss": 0.2364, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.128173589706421, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 0.2197, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8312121033668518, |
|
"learning_rate": 0.0002628, |
|
"loss": 0.236, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7007321715354919, |
|
"learning_rate": 0.00026159999999999996, |
|
"loss": 0.2229, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.0870940685272217, |
|
"learning_rate": 0.0002604, |
|
"loss": 0.2201, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8884884715080261, |
|
"learning_rate": 0.00025919999999999996, |
|
"loss": 0.2178, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5293123126029968, |
|
"learning_rate": 0.000258, |
|
"loss": 0.2248, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8335347771644592, |
|
"learning_rate": 0.00025679999999999995, |
|
"loss": 0.2236, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7105595469474792, |
|
"learning_rate": 0.0002556, |
|
"loss": 0.2294, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.0978660583496094, |
|
"learning_rate": 0.00025439999999999995, |
|
"loss": 0.2236, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7025924921035767, |
|
"learning_rate": 0.0002532, |
|
"loss": 0.219, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.9559372067451477, |
|
"learning_rate": 0.00025199999999999995, |
|
"loss": 0.2147, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.550178587436676, |
|
"learning_rate": 0.00025079999999999997, |
|
"loss": 0.2141, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.0202014446258545, |
|
"learning_rate": 0.00024959999999999994, |
|
"loss": 0.2166, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5799145102500916, |
|
"learning_rate": 0.00024839999999999997, |
|
"loss": 0.2219, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.9509211778640747, |
|
"learning_rate": 0.0002472, |
|
"loss": 0.2144, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6535309553146362, |
|
"learning_rate": 0.00024599999999999996, |
|
"loss": 0.2189, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.4835234880447388, |
|
"learning_rate": 0.0002448, |
|
"loss": 0.2105, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.50291907787323, |
|
"learning_rate": 0.00024359999999999999, |
|
"loss": 0.2076, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6879478693008423, |
|
"learning_rate": 0.00024239999999999998, |
|
"loss": 0.2231, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3261083364486694, |
|
"learning_rate": 0.00024119999999999998, |
|
"loss": 0.2255, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.1070926189422607, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 0.229, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.14503039419651031, |
|
"eval_runtime": 197.5245, |
|
"eval_samples_per_second": 101.253, |
|
"eval_steps_per_second": 25.313, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4342971742153168, |
|
"learning_rate": 0.0002388, |
|
"loss": 0.2178, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.339428901672363, |
|
"learning_rate": 0.0002376, |
|
"loss": 0.2107, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.38554310798645, |
|
"learning_rate": 0.0002364, |
|
"loss": 0.2227, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.249841332435608, |
|
"learning_rate": 0.0002352, |
|
"loss": 0.2119, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.1185526847839355, |
|
"learning_rate": 0.000234, |
|
"loss": 0.2085, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.008499264717102, |
|
"learning_rate": 0.0002328, |
|
"loss": 0.214, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0491580963134766, |
|
"learning_rate": 0.0002316, |
|
"loss": 0.2086, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.0968835353851318, |
|
"learning_rate": 0.0002304, |
|
"loss": 0.2094, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.4313267767429352, |
|
"learning_rate": 0.0002292, |
|
"loss": 0.2048, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5634821653366089, |
|
"learning_rate": 0.00022799999999999999, |
|
"loss": 0.211, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8901171684265137, |
|
"learning_rate": 0.00022679999999999998, |
|
"loss": 0.2123, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5377653241157532, |
|
"learning_rate": 0.00022559999999999998, |
|
"loss": 0.2097, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8237270712852478, |
|
"learning_rate": 0.00022439999999999998, |
|
"loss": 0.2066, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.5759382247924805, |
|
"learning_rate": 0.00022319999999999998, |
|
"loss": 0.2098, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7323036193847656, |
|
"learning_rate": 0.00022199999999999998, |
|
"loss": 0.2087, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.8658179640769958, |
|
"learning_rate": 0.00022079999999999997, |
|
"loss": 0.2038, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.8712558150291443, |
|
"learning_rate": 0.00021959999999999997, |
|
"loss": 0.2202, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.005462884902954, |
|
"learning_rate": 0.00021839999999999997, |
|
"loss": 0.2052, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.4281785786151886, |
|
"learning_rate": 0.00021719999999999997, |
|
"loss": 0.2049, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5366469621658325, |
|
"learning_rate": 0.00021599999999999996, |
|
"loss": 0.2067, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.9057306051254272, |
|
"learning_rate": 0.00021479999999999996, |
|
"loss": 0.2036, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.9041026830673218, |
|
"learning_rate": 0.00021359999999999996, |
|
"loss": 0.2092, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.8272420167922974, |
|
"learning_rate": 0.00021239999999999996, |
|
"loss": 0.2043, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.302566409111023, |
|
"learning_rate": 0.00021119999999999996, |
|
"loss": 0.2247, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.45851215720176697, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 0.2119, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.9625339508056641, |
|
"learning_rate": 0.00020879999999999998, |
|
"loss": 0.2036, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.465646505355835, |
|
"learning_rate": 0.00020759999999999998, |
|
"loss": 0.1981, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.8826301097869873, |
|
"learning_rate": 0.00020639999999999998, |
|
"loss": 0.2008, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9201263785362244, |
|
"learning_rate": 0.0002052, |
|
"loss": 0.1959, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.8154922723770142, |
|
"learning_rate": 0.000204, |
|
"loss": 0.2032, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0132577419281006, |
|
"learning_rate": 0.0002028, |
|
"loss": 0.2012, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.34237733483314514, |
|
"learning_rate": 0.0002016, |
|
"loss": 0.2068, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5986165404319763, |
|
"learning_rate": 0.0002004, |
|
"loss": 0.204, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2128850221633911, |
|
"learning_rate": 0.0001992, |
|
"loss": 0.1969, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.2548407316207886, |
|
"learning_rate": 0.000198, |
|
"loss": 0.2055, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9521819949150085, |
|
"learning_rate": 0.00019679999999999999, |
|
"loss": 0.2018, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6435508728027344, |
|
"learning_rate": 0.00019559999999999998, |
|
"loss": 0.2013, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.0959296226501465, |
|
"learning_rate": 0.00019439999999999998, |
|
"loss": 0.2032, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5862271189689636, |
|
"learning_rate": 0.00019319999999999998, |
|
"loss": 0.1961, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.31843170523643494, |
|
"learning_rate": 0.00019199999999999998, |
|
"loss": 0.196, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.3819515705108643, |
|
"learning_rate": 0.00019079999999999998, |
|
"loss": 0.1966, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.03080415725708, |
|
"learning_rate": 0.00018959999999999997, |
|
"loss": 0.1985, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.47353965044021606, |
|
"learning_rate": 0.00018839999999999997, |
|
"loss": 0.1928, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.774486780166626, |
|
"learning_rate": 0.0001872, |
|
"loss": 0.1985, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.371840000152588, |
|
"learning_rate": 0.000186, |
|
"loss": 0.2053, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9633282423019409, |
|
"learning_rate": 0.0001848, |
|
"loss": 0.1967, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7216082215309143, |
|
"learning_rate": 0.0001836, |
|
"loss": 0.1935, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8321841955184937, |
|
"learning_rate": 0.0001824, |
|
"loss": 0.1909, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.924881637096405, |
|
"learning_rate": 0.00018119999999999999, |
|
"loss": 0.1914, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.1322112083435059, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 0.1823, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.1376880556344986, |
|
"eval_runtime": 197.1453, |
|
"eval_samples_per_second": 101.448, |
|
"eval_steps_per_second": 25.362, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5036086440086365, |
|
"learning_rate": 0.00017879999999999998, |
|
"loss": 0.19, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8745198249816895, |
|
"learning_rate": 0.00017759999999999998, |
|
"loss": 0.1952, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.9014063477516174, |
|
"learning_rate": 0.00017639999999999998, |
|
"loss": 0.1825, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5580583214759827, |
|
"learning_rate": 0.00017519999999999998, |
|
"loss": 0.1958, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6560271382331848, |
|
"learning_rate": 0.00017399999999999997, |
|
"loss": 0.1986, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9042519330978394, |
|
"learning_rate": 0.00017279999999999997, |
|
"loss": 0.2037, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7758679986000061, |
|
"learning_rate": 0.00017159999999999997, |
|
"loss": 0.1891, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9086706042289734, |
|
"learning_rate": 0.00017039999999999997, |
|
"loss": 0.1864, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.0545058250427246, |
|
"learning_rate": 0.00016919999999999997, |
|
"loss": 0.199, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.20504042506217957, |
|
"learning_rate": 0.000168, |
|
"loss": 0.1902, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7427380084991455, |
|
"learning_rate": 0.0001668, |
|
"loss": 0.192, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.42541056871414185, |
|
"learning_rate": 0.0001656, |
|
"loss": 0.1939, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1101672649383545, |
|
"learning_rate": 0.0001644, |
|
"loss": 0.195, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7569801807403564, |
|
"learning_rate": 0.0001632, |
|
"loss": 0.1838, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.2808530032634735, |
|
"learning_rate": 0.000162, |
|
"loss": 0.1948, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3836987018585205, |
|
"learning_rate": 0.0001608, |
|
"loss": 0.1908, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.8910863995552063, |
|
"learning_rate": 0.0001596, |
|
"loss": 0.1916, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.40731602907180786, |
|
"learning_rate": 0.0001584, |
|
"loss": 0.178, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.8736116886138916, |
|
"learning_rate": 0.0001572, |
|
"loss": 0.1932, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.457201600074768, |
|
"learning_rate": 0.000156, |
|
"loss": 0.1815, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6906099319458008, |
|
"learning_rate": 0.0001548, |
|
"loss": 0.1923, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.0163846015930176, |
|
"learning_rate": 0.0001536, |
|
"loss": 0.1843, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.36051931977272034, |
|
"learning_rate": 0.0001524, |
|
"loss": 0.1722, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3554840087890625, |
|
"learning_rate": 0.0001512, |
|
"loss": 0.1868, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.4623011350631714, |
|
"learning_rate": 0.00015, |
|
"loss": 0.1833, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.31331172585487366, |
|
"learning_rate": 0.00014879999999999998, |
|
"loss": 0.1826, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.1543946266174316, |
|
"learning_rate": 0.00014759999999999998, |
|
"loss": 0.181, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.758339524269104, |
|
"learning_rate": 0.00014639999999999998, |
|
"loss": 0.1813, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7100640535354614, |
|
"learning_rate": 0.00014519999999999998, |
|
"loss": 0.1822, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7758503556251526, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 0.1809, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.6668974161148071, |
|
"learning_rate": 0.00014279999999999997, |
|
"loss": 0.1888, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6467080116271973, |
|
"learning_rate": 0.00014159999999999997, |
|
"loss": 0.1809, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.47351643443107605, |
|
"learning_rate": 0.0001404, |
|
"loss": 0.1822, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6044811606407166, |
|
"learning_rate": 0.0001392, |
|
"loss": 0.1822, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.485547423362732, |
|
"learning_rate": 0.000138, |
|
"loss": 0.1789, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.9173816442489624, |
|
"learning_rate": 0.0001368, |
|
"loss": 0.184, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2925781011581421, |
|
"learning_rate": 0.0001356, |
|
"loss": 0.183, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.29401320219039917, |
|
"learning_rate": 0.0001344, |
|
"loss": 0.1799, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5869783163070679, |
|
"learning_rate": 0.00013319999999999999, |
|
"loss": 0.1768, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.3995858430862427, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 0.1848, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.703525185585022, |
|
"learning_rate": 0.00013079999999999998, |
|
"loss": 0.1734, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.8634682893753052, |
|
"learning_rate": 0.00012959999999999998, |
|
"loss": 0.1789, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5816720724105835, |
|
"learning_rate": 0.00012839999999999998, |
|
"loss": 0.1772, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1784312725067139, |
|
"learning_rate": 0.00012719999999999997, |
|
"loss": 0.1759, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.4554961025714874, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 0.184, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8070880174636841, |
|
"learning_rate": 0.00012479999999999997, |
|
"loss": 0.1813, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.8661233186721802, |
|
"learning_rate": 0.0001236, |
|
"loss": 0.1733, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.788370668888092, |
|
"learning_rate": 0.0001224, |
|
"loss": 0.1788, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.25436270236969, |
|
"learning_rate": 0.00012119999999999999, |
|
"loss": 0.1808, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.562129020690918, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 0.1757, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.1267654001712799, |
|
"eval_runtime": 198.6188, |
|
"eval_samples_per_second": 100.695, |
|
"eval_steps_per_second": 25.174, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4803348481655121, |
|
"learning_rate": 0.0001188, |
|
"loss": 0.1827, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7015312910079956, |
|
"learning_rate": 0.0001176, |
|
"loss": 0.1731, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7549095153808594, |
|
"learning_rate": 0.0001164, |
|
"loss": 0.1815, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.0105984210968018, |
|
"learning_rate": 0.0001152, |
|
"loss": 0.1695, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.8554935455322266, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 0.1767, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7114706039428711, |
|
"learning_rate": 0.00011279999999999999, |
|
"loss": 0.1667, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6958938241004944, |
|
"learning_rate": 0.00011159999999999999, |
|
"loss": 0.1765, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.3407191336154938, |
|
"learning_rate": 0.00011039999999999999, |
|
"loss": 0.1834, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1113251447677612, |
|
"learning_rate": 0.00010919999999999998, |
|
"loss": 0.173, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9301660060882568, |
|
"learning_rate": 0.00010799999999999998, |
|
"loss": 0.1731, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.45441198348999023, |
|
"learning_rate": 0.00010679999999999998, |
|
"loss": 0.1747, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.413922518491745, |
|
"learning_rate": 0.00010559999999999998, |
|
"loss": 0.1783, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6205990314483643, |
|
"learning_rate": 0.00010439999999999999, |
|
"loss": 0.174, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4416259527206421, |
|
"learning_rate": 0.00010319999999999999, |
|
"loss": 0.1773, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6363853216171265, |
|
"learning_rate": 0.000102, |
|
"loss": 0.1781, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5858229398727417, |
|
"learning_rate": 0.0001008, |
|
"loss": 0.1675, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.49044686555862427, |
|
"learning_rate": 9.96e-05, |
|
"loss": 0.1672, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8010206818580627, |
|
"learning_rate": 9.839999999999999e-05, |
|
"loss": 0.1658, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.1232587099075317, |
|
"learning_rate": 9.719999999999999e-05, |
|
"loss": 0.171, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5122153759002686, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 0.1768, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.524728775024414, |
|
"learning_rate": 9.479999999999999e-05, |
|
"loss": 0.1741, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6309654116630554, |
|
"learning_rate": 9.36e-05, |
|
"loss": 0.168, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7848625183105469, |
|
"learning_rate": 9.24e-05, |
|
"loss": 0.1727, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.2840665578842163, |
|
"learning_rate": 9.12e-05, |
|
"loss": 0.1681, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.561613917350769, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 0.174, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.3210696578025818, |
|
"learning_rate": 8.879999999999999e-05, |
|
"loss": 0.1665, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6899012327194214, |
|
"learning_rate": 8.759999999999999e-05, |
|
"loss": 0.1713, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5571711659431458, |
|
"learning_rate": 8.639999999999999e-05, |
|
"loss": 0.1742, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.3232167065143585, |
|
"learning_rate": 8.519999999999998e-05, |
|
"loss": 0.1642, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5947475433349609, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.1607, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.0395877361297607, |
|
"learning_rate": 8.28e-05, |
|
"loss": 0.1668, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8020333051681519, |
|
"learning_rate": 8.16e-05, |
|
"loss": 0.1705, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.420878142118454, |
|
"learning_rate": 8.04e-05, |
|
"loss": 0.1495, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.2120175063610077, |
|
"learning_rate": 7.92e-05, |
|
"loss": 0.1684, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.321204662322998, |
|
"learning_rate": 7.8e-05, |
|
"loss": 0.1677, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5741947889328003, |
|
"learning_rate": 7.68e-05, |
|
"loss": 0.1643, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2746628224849701, |
|
"learning_rate": 7.56e-05, |
|
"loss": 0.169, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.5869113206863403, |
|
"learning_rate": 7.439999999999999e-05, |
|
"loss": 0.158, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5085694789886475, |
|
"learning_rate": 7.319999999999999e-05, |
|
"loss": 0.1766, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8071849942207336, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 0.1623, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6275350451469421, |
|
"learning_rate": 7.079999999999999e-05, |
|
"loss": 0.1624, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.35898515582084656, |
|
"learning_rate": 6.96e-05, |
|
"loss": 0.1659, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5679383873939514, |
|
"learning_rate": 6.84e-05, |
|
"loss": 0.1621, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2387058734893799, |
|
"learning_rate": 6.72e-05, |
|
"loss": 0.1603, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.4777892529964447, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 0.1584, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.789397120475769, |
|
"learning_rate": 6.479999999999999e-05, |
|
"loss": 0.1621, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8871356248855591, |
|
"learning_rate": 6.359999999999999e-05, |
|
"loss": 0.1559, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0920441150665283, |
|
"learning_rate": 6.239999999999999e-05, |
|
"loss": 0.169, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.3875184059143066, |
|
"learning_rate": 6.12e-05, |
|
"loss": 0.1623, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.1163883209228516, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 0.1663, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.1176806390285492, |
|
"eval_runtime": 202.7443, |
|
"eval_samples_per_second": 98.646, |
|
"eval_steps_per_second": 24.662, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.8637445569038391, |
|
"learning_rate": 5.88e-05, |
|
"loss": 0.1663, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.794600486755371, |
|
"learning_rate": 5.76e-05, |
|
"loss": 0.1638, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7344540953636169, |
|
"learning_rate": 5.6399999999999995e-05, |
|
"loss": 0.1589, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.591451108455658, |
|
"learning_rate": 5.519999999999999e-05, |
|
"loss": 0.1613, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.9407438635826111, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 0.1588, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5834544897079468, |
|
"learning_rate": 5.279999999999999e-05, |
|
"loss": 0.1708, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6752551198005676, |
|
"learning_rate": 5.1599999999999994e-05, |
|
"loss": 0.1548, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6357858180999756, |
|
"learning_rate": 5.04e-05, |
|
"loss": 0.1641, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6341969966888428, |
|
"learning_rate": 4.9199999999999997e-05, |
|
"loss": 0.162, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.3042418956756592, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 0.1519, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0411969423294067, |
|
"learning_rate": 4.68e-05, |
|
"loss": 0.1552, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.1914784461259842, |
|
"learning_rate": 4.56e-05, |
|
"loss": 0.1614, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5716798305511475, |
|
"learning_rate": 4.4399999999999995e-05, |
|
"loss": 0.1639, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.2206032276153564, |
|
"learning_rate": 4.319999999999999e-05, |
|
"loss": 0.1504, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9124048352241516, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.1575, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.4684576392173767, |
|
"learning_rate": 4.08e-05, |
|
"loss": 0.1678, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4757069945335388, |
|
"learning_rate": 3.96e-05, |
|
"loss": 0.1609, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.4235193431377411, |
|
"learning_rate": 3.84e-05, |
|
"loss": 0.1633, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6009053587913513, |
|
"learning_rate": 3.7199999999999996e-05, |
|
"loss": 0.1528, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.9195963144302368, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 0.1616, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5454314947128296, |
|
"learning_rate": 3.48e-05, |
|
"loss": 0.1611, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5214249491691589, |
|
"learning_rate": 3.36e-05, |
|
"loss": 0.1586, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.0026371479034424, |
|
"learning_rate": 3.2399999999999995e-05, |
|
"loss": 0.1641, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.3591657876968384, |
|
"learning_rate": 3.119999999999999e-05, |
|
"loss": 0.1599, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6785110831260681, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 0.1585, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.6680885553359985, |
|
"learning_rate": 2.88e-05, |
|
"loss": 0.1534, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.4302424192428589, |
|
"learning_rate": 2.7599999999999997e-05, |
|
"loss": 0.1603, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9608228206634521, |
|
"learning_rate": 2.6399999999999995e-05, |
|
"loss": 0.1509, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.16929888725280762, |
|
"learning_rate": 2.52e-05, |
|
"loss": 0.1576, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.19129659235477448, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 0.1556, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7054457068443298, |
|
"learning_rate": 2.28e-05, |
|
"loss": 0.1615, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.2099674940109253, |
|
"learning_rate": 2.1599999999999996e-05, |
|
"loss": 0.152, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6213043332099915, |
|
"learning_rate": 2.04e-05, |
|
"loss": 0.159, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5257019996643066, |
|
"learning_rate": 1.92e-05, |
|
"loss": 0.1643, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.7175723314285278, |
|
"learning_rate": 1.7999999999999997e-05, |
|
"loss": 0.1561, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.37755417823791504, |
|
"learning_rate": 1.68e-05, |
|
"loss": 0.1594, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6101952195167542, |
|
"learning_rate": 1.5599999999999996e-05, |
|
"loss": 0.1574, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.0712097883224487, |
|
"learning_rate": 1.44e-05, |
|
"loss": 0.1468, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5005315542221069, |
|
"learning_rate": 1.3199999999999997e-05, |
|
"loss": 0.1586, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.7147447466850281, |
|
"learning_rate": 1.1999999999999999e-05, |
|
"loss": 0.1553, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.2048006057739258, |
|
"learning_rate": 1.0799999999999998e-05, |
|
"loss": 0.1603, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6054685711860657, |
|
"learning_rate": 9.6e-06, |
|
"loss": 0.161, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.4145253300666809, |
|
"learning_rate": 8.4e-06, |
|
"loss": 0.1549, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.2331937849521637, |
|
"learning_rate": 7.2e-06, |
|
"loss": 0.1584, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.44909703731536865, |
|
"learning_rate": 5.999999999999999e-06, |
|
"loss": 0.161, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 6.39445686340332, |
|
"learning_rate": 4.8e-06, |
|
"loss": 0.159, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5847737193107605, |
|
"learning_rate": 3.6e-06, |
|
"loss": 0.1469, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.45996806025505066, |
|
"learning_rate": 2.4e-06, |
|
"loss": 0.1606, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5662907361984253, |
|
"learning_rate": 1.2e-06, |
|
"loss": 0.1511, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9263651967048645, |
|
"learning_rate": 0.0, |
|
"loss": 0.1543, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.11344572901725769, |
|
"eval_runtime": 200.0914, |
|
"eval_samples_per_second": 99.954, |
|
"eval_steps_per_second": 24.989, |
|
"step": 125000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 125000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 62500, |
|
"total_flos": 6.527143521828864e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|