|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 10, |
|
"global_step": 1250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 8.746766090393066, |
|
"learning_rate": 2.9998815663057244e-06, |
|
"loss": 1.3502, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 6.427896499633789, |
|
"learning_rate": 2.99952628392495e-06, |
|
"loss": 0.6921, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_loss": 0.6720694303512573, |
|
"eval_runtime": 13.3015, |
|
"eval_samples_per_second": 150.359, |
|
"eval_steps_per_second": 9.397, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 4.996713638305664, |
|
"learning_rate": 2.9989342089608837e-06, |
|
"loss": 0.6382, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 5.306614875793457, |
|
"learning_rate": 2.9981054349090266e-06, |
|
"loss": 0.6141, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_loss": 0.6662230491638184, |
|
"eval_runtime": 12.0717, |
|
"eval_samples_per_second": 165.676, |
|
"eval_steps_per_second": 10.355, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.872737407684326, |
|
"learning_rate": 2.9970400926424076e-06, |
|
"loss": 0.6373, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 4.585391998291016, |
|
"learning_rate": 2.995738350390921e-06, |
|
"loss": 0.6665, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"eval_loss": 0.6603240370750427, |
|
"eval_runtime": 12.0003, |
|
"eval_samples_per_second": 166.663, |
|
"eval_steps_per_second": 10.416, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 4.579038619995117, |
|
"learning_rate": 2.9942004137147588e-06, |
|
"loss": 0.6827, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 4.587752819061279, |
|
"learning_rate": 2.9924265254719506e-06, |
|
"loss": 0.6116, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_loss": 0.661081075668335, |
|
"eval_runtime": 12.0453, |
|
"eval_samples_per_second": 166.04, |
|
"eval_steps_per_second": 10.377, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 4.197625160217285, |
|
"learning_rate": 2.9904169657800125e-06, |
|
"loss": 0.6483, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.366331100463867, |
|
"learning_rate": 2.988172051971717e-06, |
|
"loss": 0.6102, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.6579549908638, |
|
"eval_runtime": 12.021, |
|
"eval_samples_per_second": 166.376, |
|
"eval_steps_per_second": 10.398, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 4.392234802246094, |
|
"learning_rate": 2.985692138544977e-06, |
|
"loss": 0.6567, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 4.914165019989014, |
|
"learning_rate": 2.982977617106871e-06, |
|
"loss": 0.6886, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_loss": 0.6593431234359741, |
|
"eval_runtime": 12.0251, |
|
"eval_samples_per_second": 166.319, |
|
"eval_steps_per_second": 10.395, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 4.842955589294434, |
|
"learning_rate": 2.980028916311802e-06, |
|
"loss": 0.6573, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 4.415096282958984, |
|
"learning_rate": 2.9768465017938084e-06, |
|
"loss": 0.6415, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"eval_loss": 0.6595576405525208, |
|
"eval_runtime": 12.0577, |
|
"eval_samples_per_second": 165.869, |
|
"eval_steps_per_second": 10.367, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.4561028480529785, |
|
"learning_rate": 2.9734308760930334e-06, |
|
"loss": 0.696, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 4.971661567687988, |
|
"learning_rate": 2.9697825785763704e-06, |
|
"loss": 0.6214, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_loss": 0.6594560742378235, |
|
"eval_runtime": 12.0017, |
|
"eval_samples_per_second": 166.643, |
|
"eval_steps_per_second": 10.415, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 4.301797389984131, |
|
"learning_rate": 2.9659021853522904e-06, |
|
"loss": 0.6265, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 4.3297319412231445, |
|
"learning_rate": 2.961790309179866e-06, |
|
"loss": 0.6816, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"eval_loss": 0.6583801507949829, |
|
"eval_runtime": 12.0447, |
|
"eval_samples_per_second": 166.048, |
|
"eval_steps_per_second": 10.378, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 4.100594997406006, |
|
"learning_rate": 2.957447599372011e-06, |
|
"loss": 0.6395, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.249173641204834, |
|
"learning_rate": 2.9528747416929465e-06, |
|
"loss": 0.6481, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.6597488522529602, |
|
"eval_runtime": 12.1055, |
|
"eval_samples_per_second": 165.214, |
|
"eval_steps_per_second": 10.326, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 4.542850017547607, |
|
"learning_rate": 2.9480724582499107e-06, |
|
"loss": 0.6964, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 3.8737800121307373, |
|
"learning_rate": 2.943041507379129e-06, |
|
"loss": 0.6022, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"eval_loss": 0.6589648723602295, |
|
"eval_runtime": 12.0836, |
|
"eval_samples_per_second": 165.514, |
|
"eval_steps_per_second": 10.345, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 3.998706340789795, |
|
"learning_rate": 2.937782683526064e-06, |
|
"loss": 0.6373, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 4.192784309387207, |
|
"learning_rate": 2.9322968171199645e-06, |
|
"loss": 0.6703, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_loss": 0.6606726050376892, |
|
"eval_runtime": 12.0359, |
|
"eval_samples_per_second": 166.17, |
|
"eval_steps_per_second": 10.386, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.3980231285095215, |
|
"learning_rate": 2.9265847744427307e-06, |
|
"loss": 0.5916, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 4.94151496887207, |
|
"learning_rate": 2.9206474574921165e-06, |
|
"loss": 0.6742, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"eval_loss": 0.6614590287208557, |
|
"eval_runtime": 12.0042, |
|
"eval_samples_per_second": 166.608, |
|
"eval_steps_per_second": 10.413, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 3.713902473449707, |
|
"learning_rate": 2.914485803839297e-06, |
|
"loss": 0.6717, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 4.350754737854004, |
|
"learning_rate": 2.9081007864808113e-06, |
|
"loss": 0.6369, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_loss": 0.6614840030670166, |
|
"eval_runtime": 11.9865, |
|
"eval_samples_per_second": 166.854, |
|
"eval_steps_per_second": 10.428, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 3.956383466720581, |
|
"learning_rate": 2.9014934136849183e-06, |
|
"loss": 0.642, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.635798454284668, |
|
"learning_rate": 2.894664728832377e-06, |
|
"loss": 0.7142, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.660183846950531, |
|
"eval_runtime": 12.0567, |
|
"eval_samples_per_second": 165.883, |
|
"eval_steps_per_second": 10.368, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 4.187668800354004, |
|
"learning_rate": 2.887615810251687e-06, |
|
"loss": 0.6241, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 4.165324687957764, |
|
"learning_rate": 2.8803477710488056e-06, |
|
"loss": 0.6707, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_loss": 0.6611347794532776, |
|
"eval_runtime": 12.5135, |
|
"eval_samples_per_second": 159.828, |
|
"eval_steps_per_second": 9.989, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 4.027588844299316, |
|
"learning_rate": 2.8728617589313763e-06, |
|
"loss": 0.6436, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 3.7992372512817383, |
|
"learning_rate": 2.8651589560274937e-06, |
|
"loss": 0.6629, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"eval_loss": 0.6608501672744751, |
|
"eval_runtime": 12.007, |
|
"eval_samples_per_second": 166.57, |
|
"eval_steps_per_second": 10.411, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.8681631088256836, |
|
"learning_rate": 2.8572405786990296e-06, |
|
"loss": 0.6505, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 3.890542507171631, |
|
"learning_rate": 2.8491078773495566e-06, |
|
"loss": 0.6299, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_loss": 0.6610468626022339, |
|
"eval_runtime": 12.01, |
|
"eval_samples_per_second": 166.528, |
|
"eval_steps_per_second": 10.408, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 3.706123113632202, |
|
"learning_rate": 2.840762136226896e-06, |
|
"loss": 0.6188, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 4.024528503417969, |
|
"learning_rate": 2.832204673220317e-06, |
|
"loss": 0.6351, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"eval_loss": 0.6607259511947632, |
|
"eval_runtime": 12.56, |
|
"eval_samples_per_second": 159.235, |
|
"eval_steps_per_second": 9.952, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 3.7646734714508057, |
|
"learning_rate": 2.8234368396524304e-06, |
|
"loss": 0.5825, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.327579021453857, |
|
"learning_rate": 2.814460020065795e-06, |
|
"loss": 0.5885, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.6609508991241455, |
|
"eval_runtime": 12.0117, |
|
"eval_samples_per_second": 166.504, |
|
"eval_steps_per_second": 10.407, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 3.8449947834014893, |
|
"learning_rate": 2.8052756320042887e-06, |
|
"loss": 0.6157, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 4.472646236419678, |
|
"learning_rate": 2.795885125789253e-06, |
|
"loss": 0.6613, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"eval_loss": 0.6619015336036682, |
|
"eval_runtime": 12.0457, |
|
"eval_samples_per_second": 166.034, |
|
"eval_steps_per_second": 10.377, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 4.221950054168701, |
|
"learning_rate": 2.7862899842904785e-06, |
|
"loss": 0.662, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 3.822866201400757, |
|
"learning_rate": 2.776491722692038e-06, |
|
"loss": 0.6151, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_loss": 0.6602036356925964, |
|
"eval_runtime": 12.088, |
|
"eval_samples_per_second": 165.453, |
|
"eval_steps_per_second": 10.341, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 4.046660423278809, |
|
"learning_rate": 2.7664918882530226e-06, |
|
"loss": 0.6348, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 3.865205764770508, |
|
"learning_rate": 2.756292060063213e-06, |
|
"loss": 0.6342, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"eval_loss": 0.6608572602272034, |
|
"eval_runtime": 12.111, |
|
"eval_samples_per_second": 165.138, |
|
"eval_steps_per_second": 10.321, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 3.76444673538208, |
|
"learning_rate": 2.745893848793719e-06, |
|
"loss": 0.658, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 4.313933849334717, |
|
"learning_rate": 2.735298896442641e-06, |
|
"loss": 0.6376, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_loss": 0.6601213216781616, |
|
"eval_runtime": 12.0967, |
|
"eval_samples_per_second": 165.335, |
|
"eval_steps_per_second": 10.333, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 3.985441207885742, |
|
"learning_rate": 2.7245088760757763e-06, |
|
"loss": 0.6508, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.7975800037384033, |
|
"learning_rate": 2.713525491562421e-06, |
|
"loss": 0.679, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6601463556289673, |
|
"eval_runtime": 12.1046, |
|
"eval_samples_per_second": 165.227, |
|
"eval_steps_per_second": 10.327, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 4.147428512573242, |
|
"learning_rate": 2.702350477306315e-06, |
|
"loss": 0.6564, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 4.010830879211426, |
|
"learning_rate": 2.690985597971753e-06, |
|
"loss": 0.6911, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"eval_loss": 0.6592859029769897, |
|
"eval_runtime": 12.2518, |
|
"eval_samples_per_second": 163.242, |
|
"eval_steps_per_second": 10.203, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 4.228829383850098, |
|
"learning_rate": 2.679432648204928e-06, |
|
"loss": 0.6194, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 3.865734577178955, |
|
"learning_rate": 2.6676934523505355e-06, |
|
"loss": 0.6717, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"eval_loss": 0.6591557860374451, |
|
"eval_runtime": 12.0618, |
|
"eval_samples_per_second": 165.813, |
|
"eval_steps_per_second": 10.363, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 4.116940498352051, |
|
"learning_rate": 2.655769864163684e-06, |
|
"loss": 0.6292, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 3.9206840991973877, |
|
"learning_rate": 2.643663766517172e-06, |
|
"loss": 0.6758, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"eval_loss": 0.6602749228477478, |
|
"eval_runtime": 12.1339, |
|
"eval_samples_per_second": 164.827, |
|
"eval_steps_per_second": 10.302, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 3.7658212184906006, |
|
"learning_rate": 2.6313770711041557e-06, |
|
"loss": 0.6698, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 3.8347365856170654, |
|
"learning_rate": 2.6189117181362736e-06, |
|
"loss": 0.6243, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"eval_loss": 0.660269021987915, |
|
"eval_runtime": 12.0287, |
|
"eval_samples_per_second": 166.269, |
|
"eval_steps_per_second": 10.392, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 4.060953140258789, |
|
"learning_rate": 2.606269676037261e-06, |
|
"loss": 0.7274, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.1027655601501465, |
|
"learning_rate": 2.5934529411321173e-06, |
|
"loss": 0.643, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.6585854291915894, |
|
"eval_runtime": 12.0789, |
|
"eval_samples_per_second": 165.578, |
|
"eval_steps_per_second": 10.349, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 3.78674578666687, |
|
"learning_rate": 2.5804635373318606e-06, |
|
"loss": 0.6707, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 3.7603418827056885, |
|
"learning_rate": 2.5673035158139285e-06, |
|
"loss": 0.603, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"eval_loss": 0.6572903394699097, |
|
"eval_runtime": 12.2284, |
|
"eval_samples_per_second": 163.554, |
|
"eval_steps_per_second": 10.222, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 4.131699562072754, |
|
"learning_rate": 2.553974954698274e-06, |
|
"loss": 0.6348, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 4.052087783813477, |
|
"learning_rate": 2.5404799587192076e-06, |
|
"loss": 0.6336, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_loss": 0.6567848920822144, |
|
"eval_runtime": 12.1155, |
|
"eval_samples_per_second": 165.077, |
|
"eval_steps_per_second": 10.317, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.040383338928223, |
|
"learning_rate": 2.526820658893033e-06, |
|
"loss": 0.6919, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 4.098529815673828, |
|
"learning_rate": 2.5129992121815365e-06, |
|
"loss": 0.6198, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"eval_loss": 0.6569080948829651, |
|
"eval_runtime": 12.4865, |
|
"eval_samples_per_second": 160.173, |
|
"eval_steps_per_second": 10.011, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 3.941558837890625, |
|
"learning_rate": 2.4990178011513777e-06, |
|
"loss": 0.6361, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 4.143987655639648, |
|
"learning_rate": 2.484878633629435e-06, |
|
"loss": 0.6989, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"eval_loss": 0.657779335975647, |
|
"eval_runtime": 12.0123, |
|
"eval_samples_per_second": 166.496, |
|
"eval_steps_per_second": 10.406, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 3.3898260593414307, |
|
"learning_rate": 2.4705839423541666e-06, |
|
"loss": 0.658, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.7825887203216553, |
|
"learning_rate": 2.456135984623035e-06, |
|
"loss": 0.6353, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.6569732427597046, |
|
"eval_runtime": 12.0405, |
|
"eval_samples_per_second": 166.107, |
|
"eval_steps_per_second": 10.382, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 4.2369537353515625, |
|
"learning_rate": 2.441537041936051e-06, |
|
"loss": 0.654, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 3.7942652702331543, |
|
"learning_rate": 2.4267894196355018e-06, |
|
"loss": 0.6746, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"eval_loss": 0.6568124890327454, |
|
"eval_runtime": 12.3613, |
|
"eval_samples_per_second": 161.795, |
|
"eval_steps_per_second": 10.112, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 4.185107231140137, |
|
"learning_rate": 2.4118954465419083e-06, |
|
"loss": 0.5986, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 3.844982385635376, |
|
"learning_rate": 2.3968574745862785e-06, |
|
"loss": 0.6883, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"eval_loss": 0.6570600867271423, |
|
"eval_runtime": 12.0276, |
|
"eval_samples_per_second": 166.284, |
|
"eval_steps_per_second": 10.393, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.494849443435669, |
|
"learning_rate": 2.3816778784387097e-06, |
|
"loss": 0.6193, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 4.09348726272583, |
|
"learning_rate": 2.3663590551334015e-06, |
|
"loss": 0.6772, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"eval_loss": 0.6566088795661926, |
|
"eval_runtime": 12.1153, |
|
"eval_samples_per_second": 165.08, |
|
"eval_steps_per_second": 10.317, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 3.824082612991333, |
|
"learning_rate": 2.350903423690135e-06, |
|
"loss": 0.6153, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 4.054186820983887, |
|
"learning_rate": 2.3353134247322823e-06, |
|
"loss": 0.6563, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"eval_loss": 0.6563527584075928, |
|
"eval_runtime": 12.0218, |
|
"eval_samples_per_second": 166.364, |
|
"eval_steps_per_second": 10.398, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 3.5578935146331787, |
|
"learning_rate": 2.3195915201014038e-06, |
|
"loss": 0.7107, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.8796043395996094, |
|
"learning_rate": 2.303740192468495e-06, |
|
"loss": 0.6077, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.6553810238838196, |
|
"eval_runtime": 12.1088, |
|
"eval_samples_per_second": 165.169, |
|
"eval_steps_per_second": 10.323, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 3.7618532180786133, |
|
"learning_rate": 2.2877619449419438e-06, |
|
"loss": 0.6272, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 3.750394344329834, |
|
"learning_rate": 2.2716593006722595e-06, |
|
"loss": 0.6291, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"eval_loss": 0.65521240234375, |
|
"eval_runtime": 12.0523, |
|
"eval_samples_per_second": 165.943, |
|
"eval_steps_per_second": 10.371, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 4.360326290130615, |
|
"learning_rate": 2.2554348024536415e-06, |
|
"loss": 0.699, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 3.571743965148926, |
|
"learning_rate": 2.2390910123224374e-06, |
|
"loss": 0.6073, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"eval_loss": 0.6547145843505859, |
|
"eval_runtime": 12.0682, |
|
"eval_samples_per_second": 165.725, |
|
"eval_steps_per_second": 10.358, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.8361449241638184, |
|
"learning_rate": 2.222630511152573e-06, |
|
"loss": 0.5729, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 3.7221009731292725, |
|
"learning_rate": 2.2060558982479992e-06, |
|
"loss": 0.6598, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"eval_loss": 0.6550743579864502, |
|
"eval_runtime": 12.0566, |
|
"eval_samples_per_second": 165.885, |
|
"eval_steps_per_second": 10.368, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 4.329174518585205, |
|
"learning_rate": 2.1893697909322322e-06, |
|
"loss": 0.6447, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 3.9657375812530518, |
|
"learning_rate": 2.1725748241350487e-06, |
|
"loss": 0.593, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"eval_loss": 0.6547417640686035, |
|
"eval_runtime": 12.1368, |
|
"eval_samples_per_second": 164.789, |
|
"eval_steps_per_second": 10.299, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 3.378925323486328, |
|
"learning_rate": 2.1556736499763994e-06, |
|
"loss": 0.6351, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.746727228164673, |
|
"learning_rate": 2.138668937347609e-06, |
|
"loss": 0.6352, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.6547327637672424, |
|
"eval_runtime": 12.0347, |
|
"eval_samples_per_second": 166.187, |
|
"eval_steps_per_second": 10.387, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 3.8396756649017334, |
|
"learning_rate": 2.1215633714899263e-06, |
|
"loss": 0.683, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 3.8135344982147217, |
|
"learning_rate": 2.1043596535704943e-06, |
|
"loss": 0.6216, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"eval_loss": 0.6539892554283142, |
|
"eval_runtime": 12.0015, |
|
"eval_samples_per_second": 166.645, |
|
"eval_steps_per_second": 10.415, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 3.2499091625213623, |
|
"learning_rate": 2.0870605002558037e-06, |
|
"loss": 0.6512, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 3.963940382003784, |
|
"learning_rate": 2.069668643282702e-06, |
|
"loss": 0.6937, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"eval_loss": 0.6535360813140869, |
|
"eval_runtime": 13.6104, |
|
"eval_samples_per_second": 146.947, |
|
"eval_steps_per_second": 9.184, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.249936103820801, |
|
"learning_rate": 2.0521868290270174e-06, |
|
"loss": 0.659, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 3.558105230331421, |
|
"learning_rate": 2.034617818069876e-06, |
|
"loss": 0.669, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_loss": 0.6529609560966492, |
|
"eval_runtime": 12.1045, |
|
"eval_samples_per_second": 165.227, |
|
"eval_steps_per_second": 10.327, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 4.027170658111572, |
|
"learning_rate": 2.0169643847617756e-06, |
|
"loss": 0.6846, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 3.395838737487793, |
|
"learning_rate": 1.99922931678448e-06, |
|
"loss": 0.6052, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"eval_loss": 0.6525455117225647, |
|
"eval_runtime": 12.7081, |
|
"eval_samples_per_second": 157.38, |
|
"eval_steps_per_second": 9.836, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 3.7162697315216064, |
|
"learning_rate": 1.981415414710814e-06, |
|
"loss": 0.6307, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.496490955352783, |
|
"learning_rate": 1.963525491562421e-06, |
|
"loss": 0.6218, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.652510404586792, |
|
"eval_runtime": 12.0132, |
|
"eval_samples_per_second": 166.484, |
|
"eval_steps_per_second": 10.405, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 4.182400703430176, |
|
"learning_rate": 1.9455623723655522e-06, |
|
"loss": 0.6504, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 3.4778692722320557, |
|
"learning_rate": 1.927528893704964e-06, |
|
"loss": 0.6341, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"eval_loss": 0.6526325941085815, |
|
"eval_runtime": 12.1893, |
|
"eval_samples_per_second": 164.078, |
|
"eval_steps_per_second": 10.255, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 3.5207061767578125, |
|
"learning_rate": 1.909427903275988e-06, |
|
"loss": 0.6774, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 3.6349990367889404, |
|
"learning_rate": 1.8912622594348455e-06, |
|
"loss": 0.6681, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"eval_loss": 0.6522479057312012, |
|
"eval_runtime": 12.3219, |
|
"eval_samples_per_second": 162.312, |
|
"eval_steps_per_second": 10.145, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.17547082901001, |
|
"learning_rate": 1.8730348307472826e-06, |
|
"loss": 0.639, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 3.4751803874969482, |
|
"learning_rate": 1.8547484955355872e-06, |
|
"loss": 0.6203, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"eval_loss": 0.6515837907791138, |
|
"eval_runtime": 13.3997, |
|
"eval_samples_per_second": 149.257, |
|
"eval_steps_per_second": 9.329, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 3.854713201522827, |
|
"learning_rate": 1.836406141424072e-06, |
|
"loss": 0.6283, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 3.728891134262085, |
|
"learning_rate": 1.8180106648830824e-06, |
|
"loss": 0.6682, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"eval_loss": 0.6505850553512573, |
|
"eval_runtime": 14.2953, |
|
"eval_samples_per_second": 139.906, |
|
"eval_steps_per_second": 8.744, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 3.6460957527160645, |
|
"learning_rate": 1.7995649707716105e-06, |
|
"loss": 0.6677, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.6266672611236572, |
|
"learning_rate": 1.7810719718785873e-06, |
|
"loss": 0.6212, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.6500898599624634, |
|
"eval_runtime": 13.827, |
|
"eval_samples_per_second": 144.644, |
|
"eval_steps_per_second": 9.04, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 3.6777870655059814, |
|
"learning_rate": 1.7625345884629143e-06, |
|
"loss": 0.6827, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 3.7290449142456055, |
|
"learning_rate": 1.7439557477923257e-06, |
|
"loss": 0.6887, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"eval_loss": 0.6501717567443848, |
|
"eval_runtime": 13.4717, |
|
"eval_samples_per_second": 148.459, |
|
"eval_steps_per_second": 9.279, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 3.18540620803833, |
|
"learning_rate": 1.7253383836811356e-06, |
|
"loss": 0.575, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 4.008258819580078, |
|
"learning_rate": 1.706685436026957e-06, |
|
"loss": 0.64, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"eval_loss": 0.6503917574882507, |
|
"eval_runtime": 13.6, |
|
"eval_samples_per_second": 147.058, |
|
"eval_steps_per_second": 9.191, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.878329277038574, |
|
"learning_rate": 1.6879998503464564e-06, |
|
"loss": 0.6653, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 3.8032970428466797, |
|
"learning_rate": 1.6692845773102223e-06, |
|
"loss": 0.6176, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"eval_loss": 0.6500183343887329, |
|
"eval_runtime": 13.5348, |
|
"eval_samples_per_second": 147.768, |
|
"eval_steps_per_second": 9.235, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 3.704298973083496, |
|
"learning_rate": 1.6505425722768222e-06, |
|
"loss": 0.6302, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 3.4791512489318848, |
|
"learning_rate": 1.6317767948261151e-06, |
|
"loss": 0.6285, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"eval_loss": 0.6499763131141663, |
|
"eval_runtime": 13.3791, |
|
"eval_samples_per_second": 149.487, |
|
"eval_steps_per_second": 9.343, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 3.9690394401550293, |
|
"learning_rate": 1.6129902082918993e-06, |
|
"loss": 0.6118, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.715287208557129, |
|
"learning_rate": 1.5941857792939703e-06, |
|
"loss": 0.6661, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.6488626599311829, |
|
"eval_runtime": 13.0584, |
|
"eval_samples_per_second": 153.158, |
|
"eval_steps_per_second": 9.572, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 3.739778518676758, |
|
"learning_rate": 1.5753664772696545e-06, |
|
"loss": 0.6188, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 3.546541213989258, |
|
"learning_rate": 1.556535274004902e-06, |
|
"loss": 0.6537, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"eval_loss": 0.6488344669342041, |
|
"eval_runtime": 12.2525, |
|
"eval_samples_per_second": 163.231, |
|
"eval_steps_per_second": 10.202, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 3.7501473426818848, |
|
"learning_rate": 1.5376951431650064e-06, |
|
"loss": 0.6508, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 3.7694437503814697, |
|
"learning_rate": 1.518849059825029e-06, |
|
"loss": 0.657, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"eval_loss": 0.648171603679657, |
|
"eval_runtime": 11.9787, |
|
"eval_samples_per_second": 166.963, |
|
"eval_steps_per_second": 10.435, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.7407610416412354, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.582, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 3.2185583114624023, |
|
"learning_rate": 1.481150940174971e-06, |
|
"loss": 0.4004, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"eval_loss": 0.6502845287322998, |
|
"eval_runtime": 11.9952, |
|
"eval_samples_per_second": 166.733, |
|
"eval_steps_per_second": 10.421, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 3.261690855026245, |
|
"learning_rate": 1.4623048568349939e-06, |
|
"loss": 0.3738, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 5.511118412017822, |
|
"learning_rate": 1.4434647259950982e-06, |
|
"loss": 0.4014, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"eval_loss": 0.7170341610908508, |
|
"eval_runtime": 11.9625, |
|
"eval_samples_per_second": 167.189, |
|
"eval_steps_per_second": 10.449, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 4.647660255432129, |
|
"learning_rate": 1.4246335227303458e-06, |
|
"loss": 0.3661, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.8759918212890625, |
|
"learning_rate": 1.40581422070603e-06, |
|
"loss": 0.4179, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 0.6922851800918579, |
|
"eval_runtime": 11.9429, |
|
"eval_samples_per_second": 167.463, |
|
"eval_steps_per_second": 10.466, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 4.271312236785889, |
|
"learning_rate": 1.3870097917081012e-06, |
|
"loss": 0.3519, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 4.502732276916504, |
|
"learning_rate": 1.3682232051738854e-06, |
|
"loss": 0.3998, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"eval_loss": 0.6920506358146667, |
|
"eval_runtime": 12.2751, |
|
"eval_samples_per_second": 162.931, |
|
"eval_steps_per_second": 10.183, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 4.114953994750977, |
|
"learning_rate": 1.3494574277231775e-06, |
|
"loss": 0.399, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 4.416531085968018, |
|
"learning_rate": 1.3307154226897775e-06, |
|
"loss": 0.3705, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"eval_loss": 0.7054136991500854, |
|
"eval_runtime": 12.407, |
|
"eval_samples_per_second": 161.199, |
|
"eval_steps_per_second": 10.075, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.7480475902557373, |
|
"learning_rate": 1.3120001496535434e-06, |
|
"loss": 0.3634, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 4.652655124664307, |
|
"learning_rate": 1.293314563973043e-06, |
|
"loss": 0.3513, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"eval_loss": 0.7035665512084961, |
|
"eval_runtime": 12.0092, |
|
"eval_samples_per_second": 166.539, |
|
"eval_steps_per_second": 10.409, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 3.6166396141052246, |
|
"learning_rate": 1.2746616163188644e-06, |
|
"loss": 0.3481, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 4.5175981521606445, |
|
"learning_rate": 1.2560442522076746e-06, |
|
"loss": 0.3815, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"eval_loss": 0.7024725079536438, |
|
"eval_runtime": 12.0834, |
|
"eval_samples_per_second": 165.516, |
|
"eval_steps_per_second": 10.345, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 4.5786213874816895, |
|
"learning_rate": 1.2374654115370858e-06, |
|
"loss": 0.3672, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 4.556806564331055, |
|
"learning_rate": 1.2189280281214128e-06, |
|
"loss": 0.3684, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.704890787601471, |
|
"eval_runtime": 11.9719, |
|
"eval_samples_per_second": 167.058, |
|
"eval_steps_per_second": 10.441, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 4.46769380569458, |
|
"learning_rate": 1.2004350292283896e-06, |
|
"loss": 0.3424, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 5.269836902618408, |
|
"learning_rate": 1.1819893351169183e-06, |
|
"loss": 0.3914, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"eval_loss": 0.7068949937820435, |
|
"eval_runtime": 11.9987, |
|
"eval_samples_per_second": 166.685, |
|
"eval_steps_per_second": 10.418, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 4.130289554595947, |
|
"learning_rate": 1.1635938585759285e-06, |
|
"loss": 0.3516, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 4.418029308319092, |
|
"learning_rate": 1.1452515044644133e-06, |
|
"loss": 0.4082, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"eval_loss": 0.7017790675163269, |
|
"eval_runtime": 11.9747, |
|
"eval_samples_per_second": 167.019, |
|
"eval_steps_per_second": 10.439, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.497861623764038, |
|
"learning_rate": 1.1269651692527181e-06, |
|
"loss": 0.3627, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 4.658440589904785, |
|
"learning_rate": 1.108737740565155e-06, |
|
"loss": 0.3494, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"eval_loss": 0.7041600346565247, |
|
"eval_runtime": 11.9932, |
|
"eval_samples_per_second": 166.762, |
|
"eval_steps_per_second": 10.423, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 4.345962047576904, |
|
"learning_rate": 1.0905720967240124e-06, |
|
"loss": 0.3762, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 3.5613420009613037, |
|
"learning_rate": 1.0724711062950359e-06, |
|
"loss": 0.3715, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"eval_loss": 0.707082986831665, |
|
"eval_runtime": 12.0082, |
|
"eval_samples_per_second": 166.553, |
|
"eval_steps_per_second": 10.41, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 4.606480598449707, |
|
"learning_rate": 1.0544376276344478e-06, |
|
"loss": 0.386, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 4.682101249694824, |
|
"learning_rate": 1.036474508437579e-06, |
|
"loss": 0.3675, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7084864377975464, |
|
"eval_runtime": 12.0192, |
|
"eval_samples_per_second": 166.4, |
|
"eval_steps_per_second": 10.4, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 5.7233710289001465, |
|
"learning_rate": 1.018584585289186e-06, |
|
"loss": 0.354, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 4.105285167694092, |
|
"learning_rate": 1.0007706832155202e-06, |
|
"loss": 0.3319, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"eval_loss": 0.7111691236495972, |
|
"eval_runtime": 12.5364, |
|
"eval_samples_per_second": 159.535, |
|
"eval_steps_per_second": 9.971, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 4.163219451904297, |
|
"learning_rate": 9.830356152382247e-07, |
|
"loss": 0.357, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 5.115902423858643, |
|
"learning_rate": 9.65382181930124e-07, |
|
"loss": 0.3823, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"eval_loss": 0.7141273021697998, |
|
"eval_runtime": 12.0055, |
|
"eval_samples_per_second": 166.591, |
|
"eval_steps_per_second": 10.412, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.919798851013184, |
|
"learning_rate": 9.478131709729831e-07, |
|
"loss": 0.3669, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 3.693423271179199, |
|
"learning_rate": 9.303313567172986e-07, |
|
"loss": 0.3571, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"eval_loss": 0.7113474011421204, |
|
"eval_runtime": 11.9538, |
|
"eval_samples_per_second": 167.311, |
|
"eval_steps_per_second": 10.457, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 4.305978298187256, |
|
"learning_rate": 9.129394997441964e-07, |
|
"loss": 0.3765, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 4.349783897399902, |
|
"learning_rate": 8.956403464295061e-07, |
|
"loss": 0.3503, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"eval_loss": 0.7126505970954895, |
|
"eval_runtime": 12.0036, |
|
"eval_samples_per_second": 166.617, |
|
"eval_steps_per_second": 10.414, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 4.4210662841796875, |
|
"learning_rate": 8.784366285100739e-07, |
|
"loss": 0.3583, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 4.530226707458496, |
|
"learning_rate": 8.613310626523911e-07, |
|
"loss": 0.3742, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"eval_loss": 0.7158926129341125, |
|
"eval_runtime": 11.9999, |
|
"eval_samples_per_second": 166.668, |
|
"eval_steps_per_second": 10.417, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 3.9921834468841553, |
|
"learning_rate": 8.443263500236011e-07, |
|
"loss": 0.371, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 5.109288215637207, |
|
"learning_rate": 8.274251758649519e-07, |
|
"loss": 0.4087, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"eval_loss": 0.7139343619346619, |
|
"eval_runtime": 12.0284, |
|
"eval_samples_per_second": 166.274, |
|
"eval_steps_per_second": 10.392, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 4.64515495300293, |
|
"learning_rate": 8.106302090677683e-07, |
|
"loss": 0.3806, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 6.393331527709961, |
|
"learning_rate": 7.939441017520012e-07, |
|
"loss": 0.3781, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"eval_loss": 0.7073184847831726, |
|
"eval_runtime": 12.02, |
|
"eval_samples_per_second": 166.389, |
|
"eval_steps_per_second": 10.399, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 4.550942897796631, |
|
"learning_rate": 7.773694888474268e-07, |
|
"loss": 0.3891, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 4.257821083068848, |
|
"learning_rate": 7.609089876775628e-07, |
|
"loss": 0.3475, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"eval_loss": 0.7128930687904358, |
|
"eval_runtime": 12.01, |
|
"eval_samples_per_second": 166.528, |
|
"eval_steps_per_second": 10.408, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 3.6469428539276123, |
|
"learning_rate": 7.445651975463588e-07, |
|
"loss": 0.3546, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 6.628065586090088, |
|
"learning_rate": 7.283406993277403e-07, |
|
"loss": 0.3724, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"eval_loss": 0.7112878561019897, |
|
"eval_runtime": 12.3921, |
|
"eval_samples_per_second": 161.393, |
|
"eval_steps_per_second": 10.087, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 4.343158721923828, |
|
"learning_rate": 7.122380550580563e-07, |
|
"loss": 0.3659, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 5.028806686401367, |
|
"learning_rate": 6.962598075315047e-07, |
|
"loss": 0.3612, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"eval_loss": 0.7130332589149475, |
|
"eval_runtime": 12.01, |
|
"eval_samples_per_second": 166.528, |
|
"eval_steps_per_second": 10.408, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 4.212679386138916, |
|
"learning_rate": 6.804084798985965e-07, |
|
"loss": 0.3495, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 4.199646949768066, |
|
"learning_rate": 6.646865752677186e-07, |
|
"loss": 0.3254, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"eval_loss": 0.7138640880584717, |
|
"eval_runtime": 12.0153, |
|
"eval_samples_per_second": 166.455, |
|
"eval_steps_per_second": 10.403, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 4.8986921310424805, |
|
"learning_rate": 6.490965763098655e-07, |
|
"loss": 0.3986, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 4.102914810180664, |
|
"learning_rate": 6.336409448665989e-07, |
|
"loss": 0.3626, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"eval_loss": 0.7144864797592163, |
|
"eval_runtime": 11.9979, |
|
"eval_samples_per_second": 166.696, |
|
"eval_steps_per_second": 10.418, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 5.214079856872559, |
|
"learning_rate": 6.183221215612905e-07, |
|
"loss": 0.3739, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 5.322539806365967, |
|
"learning_rate": 6.031425254137223e-07, |
|
"loss": 0.351, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"eval_loss": 0.7146536707878113, |
|
"eval_runtime": 12.0493, |
|
"eval_samples_per_second": 165.985, |
|
"eval_steps_per_second": 10.374, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 4.931739330291748, |
|
"learning_rate": 5.881045534580923e-07, |
|
"loss": 0.3971, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 3.9749743938446045, |
|
"learning_rate": 5.732105803644987e-07, |
|
"loss": 0.3357, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"eval_loss": 0.7105372548103333, |
|
"eval_runtime": 12.0068, |
|
"eval_samples_per_second": 166.572, |
|
"eval_steps_per_second": 10.411, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 4.823004245758057, |
|
"learning_rate": 5.584629580639495e-07, |
|
"loss": 0.4003, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4.934309482574463, |
|
"learning_rate": 5.438640153769653e-07, |
|
"loss": 0.371, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 0.7078642845153809, |
|
"eval_runtime": 12.0222, |
|
"eval_samples_per_second": 166.359, |
|
"eval_steps_per_second": 10.397, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 4.136751174926758, |
|
"learning_rate": 5.29416057645834e-07, |
|
"loss": 0.3825, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 4.164857864379883, |
|
"learning_rate": 5.151213663705655e-07, |
|
"loss": 0.3566, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"eval_loss": 0.7069818377494812, |
|
"eval_runtime": 12.3015, |
|
"eval_samples_per_second": 162.582, |
|
"eval_steps_per_second": 10.161, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 4.379539489746094, |
|
"learning_rate": 5.009821988486227e-07, |
|
"loss": 0.3733, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 4.559187412261963, |
|
"learning_rate": 4.870007878184633e-07, |
|
"loss": 0.3762, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"eval_loss": 0.711800754070282, |
|
"eval_runtime": 11.9873, |
|
"eval_samples_per_second": 166.843, |
|
"eval_steps_per_second": 10.428, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 4.373340606689453, |
|
"learning_rate": 4.731793411069669e-07, |
|
"loss": 0.3768, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 4.8367838859558105, |
|
"learning_rate": 4.5952004128079276e-07, |
|
"loss": 0.3755, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"eval_loss": 0.7125899791717529, |
|
"eval_runtime": 12.4233, |
|
"eval_samples_per_second": 160.988, |
|
"eval_steps_per_second": 10.062, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 6.753279685974121, |
|
"learning_rate": 4.460250453017264e-07, |
|
"loss": 0.374, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 4.952857971191406, |
|
"learning_rate": 4.3269648418607197e-07, |
|
"loss": 0.3595, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"eval_loss": 0.7107095718383789, |
|
"eval_runtime": 11.9826, |
|
"eval_samples_per_second": 166.909, |
|
"eval_steps_per_second": 10.432, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 4.235396385192871, |
|
"learning_rate": 4.1953646266813963e-07, |
|
"loss": 0.4008, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 4.504904747009277, |
|
"learning_rate": 4.06547058867883e-07, |
|
"loss": 0.3828, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 0.7118133902549744, |
|
"eval_runtime": 12.4744, |
|
"eval_samples_per_second": 160.328, |
|
"eval_steps_per_second": 10.02, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 3.8867733478546143, |
|
"learning_rate": 3.9373032396273926e-07, |
|
"loss": 0.3498, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 4.430863857269287, |
|
"learning_rate": 3.8108828186372685e-07, |
|
"loss": 0.3793, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"eval_loss": 0.7172751426696777, |
|
"eval_runtime": 12.065, |
|
"eval_samples_per_second": 165.768, |
|
"eval_steps_per_second": 10.361, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 4.033322334289551, |
|
"learning_rate": 3.686229288958442e-07, |
|
"loss": 0.3693, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 4.288609027862549, |
|
"learning_rate": 3.56336233482828e-07, |
|
"loss": 0.3446, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"eval_loss": 0.7149741053581238, |
|
"eval_runtime": 12.0179, |
|
"eval_samples_per_second": 166.418, |
|
"eval_steps_per_second": 10.401, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 4.438851356506348, |
|
"learning_rate": 3.442301358363163e-07, |
|
"loss": 0.3672, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 4.459841728210449, |
|
"learning_rate": 3.32306547649465e-07, |
|
"loss": 0.3707, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"eval_loss": 0.7135123014450073, |
|
"eval_runtime": 11.9963, |
|
"eval_samples_per_second": 166.718, |
|
"eval_steps_per_second": 10.42, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 3.922914743423462, |
|
"learning_rate": 3.2056735179507165e-07, |
|
"loss": 0.3938, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 4.287975788116455, |
|
"learning_rate": 3.0901440202824693e-07, |
|
"loss": 0.3604, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"eval_loss": 0.714073657989502, |
|
"eval_runtime": 11.9966, |
|
"eval_samples_per_second": 166.714, |
|
"eval_steps_per_second": 10.42, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 4.261760711669922, |
|
"learning_rate": 2.976495226936849e-07, |
|
"loss": 0.3538, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 4.7926812171936035, |
|
"learning_rate": 2.86474508437579e-07, |
|
"loss": 0.3441, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.7137433290481567, |
|
"eval_runtime": 12.033, |
|
"eval_samples_per_second": 166.21, |
|
"eval_steps_per_second": 10.388, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 4.072969436645508, |
|
"learning_rate": 2.754911239242241e-07, |
|
"loss": 0.3653, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 4.100124835968018, |
|
"learning_rate": 2.647011035573588e-07, |
|
"loss": 0.3705, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"eval_loss": 0.7153717279434204, |
|
"eval_runtime": 12.9324, |
|
"eval_samples_per_second": 154.65, |
|
"eval_steps_per_second": 9.666, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 4.28076696395874, |
|
"learning_rate": 2.5410615120628085e-07, |
|
"loss": 0.3299, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 4.782041072845459, |
|
"learning_rate": 2.437079399367875e-07, |
|
"loss": 0.3857, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"eval_loss": 0.7188824415206909, |
|
"eval_runtime": 13.608, |
|
"eval_samples_per_second": 146.972, |
|
"eval_steps_per_second": 9.186, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 4.323799133300781, |
|
"learning_rate": 2.3350811174697772e-07, |
|
"loss": 0.3702, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 4.875490188598633, |
|
"learning_rate": 2.235082773079624e-07, |
|
"loss": 0.3952, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"eval_loss": 0.7147963643074036, |
|
"eval_runtime": 14.1342, |
|
"eval_samples_per_second": 141.501, |
|
"eval_steps_per_second": 8.844, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 4.45432710647583, |
|
"learning_rate": 2.1371001570952186e-07, |
|
"loss": 0.4075, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 4.542659759521484, |
|
"learning_rate": 2.0411487421074708e-07, |
|
"loss": 0.3815, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"eval_loss": 0.7115746140480042, |
|
"eval_runtime": 15.5186, |
|
"eval_samples_per_second": 128.878, |
|
"eval_steps_per_second": 8.055, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 4.134467601776123, |
|
"learning_rate": 1.9472436799571147e-07, |
|
"loss": 0.3594, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 4.484969615936279, |
|
"learning_rate": 1.8553997993420495e-07, |
|
"loss": 0.3507, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"eval_loss": 0.7108085751533508, |
|
"eval_runtime": 13.7067, |
|
"eval_samples_per_second": 145.914, |
|
"eval_steps_per_second": 9.12, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 5.112241744995117, |
|
"learning_rate": 1.7656316034757024e-07, |
|
"loss": 0.3736, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 4.2886786460876465, |
|
"learning_rate": 1.6779532677968329e-07, |
|
"loss": 0.3662, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"eval_loss": 0.7123615741729736, |
|
"eval_runtime": 13.8684, |
|
"eval_samples_per_second": 144.212, |
|
"eval_steps_per_second": 9.013, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 4.763908863067627, |
|
"learning_rate": 1.5923786377310435e-07, |
|
"loss": 0.3736, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 4.227341651916504, |
|
"learning_rate": 1.508921226504434e-07, |
|
"loss": 0.3581, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"eval_loss": 0.7136136293411255, |
|
"eval_runtime": 13.5861, |
|
"eval_samples_per_second": 147.209, |
|
"eval_steps_per_second": 9.201, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 4.712426662445068, |
|
"learning_rate": 1.4275942130097098e-07, |
|
"loss": 0.374, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 4.427713871002197, |
|
"learning_rate": 1.348410439725065e-07, |
|
"loss": 0.3867, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"eval_loss": 0.7132411003112793, |
|
"eval_runtime": 13.1772, |
|
"eval_samples_per_second": 151.778, |
|
"eval_steps_per_second": 9.486, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 4.723916530609131, |
|
"learning_rate": 1.271382410686237e-07, |
|
"loss": 0.3859, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 4.608342170715332, |
|
"learning_rate": 1.1965222895119444e-07, |
|
"loss": 0.3707, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"eval_loss": 0.7126567959785461, |
|
"eval_runtime": 13.2064, |
|
"eval_samples_per_second": 151.442, |
|
"eval_steps_per_second": 9.465, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 4.681605815887451, |
|
"learning_rate": 1.123841897483131e-07, |
|
"loss": 0.3439, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 5.060537338256836, |
|
"learning_rate": 1.0533527116762298e-07, |
|
"loss": 0.4078, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 0.7122145891189575, |
|
"eval_runtime": 12.046, |
|
"eval_samples_per_second": 166.03, |
|
"eval_steps_per_second": 10.377, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 4.388428211212158, |
|
"learning_rate": 9.850658631508197e-08, |
|
"loss": 0.3318, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 4.610175609588623, |
|
"learning_rate": 9.18992135191889e-08, |
|
"loss": 0.3713, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"eval_loss": 0.711074709892273, |
|
"eval_runtime": 12.0327, |
|
"eval_samples_per_second": 166.213, |
|
"eval_steps_per_second": 10.388, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 4.19280481338501, |
|
"learning_rate": 8.551419616070322e-08, |
|
"loss": 0.3503, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 4.062910079956055, |
|
"learning_rate": 7.935254250788366e-08, |
|
"loss": 0.3525, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"eval_loss": 0.7110276818275452, |
|
"eval_runtime": 12.001, |
|
"eval_samples_per_second": 166.653, |
|
"eval_steps_per_second": 10.416, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 4.942102909088135, |
|
"learning_rate": 7.341522555726971e-08, |
|
"loss": 0.3293, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 4.574679851531982, |
|
"learning_rate": 6.770318288003558e-08, |
|
"loss": 0.3873, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"eval_loss": 0.7115213871002197, |
|
"eval_runtime": 12.0697, |
|
"eval_samples_per_second": 165.704, |
|
"eval_steps_per_second": 10.357, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 4.616149425506592, |
|
"learning_rate": 6.221731647393609e-08, |
|
"loss": 0.3684, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 5.204924583435059, |
|
"learning_rate": 5.6958492620871105e-08, |
|
"loss": 0.4008, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"eval_loss": 0.7119244337081909, |
|
"eval_runtime": 12.3474, |
|
"eval_samples_per_second": 161.977, |
|
"eval_steps_per_second": 10.124, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 4.088404178619385, |
|
"learning_rate": 5.192754175008918e-08, |
|
"loss": 0.3352, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 4.6251606941223145, |
|
"learning_rate": 4.712525830705339e-08, |
|
"loss": 0.3889, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"eval_loss": 0.711883008480072, |
|
"eval_runtime": 12.5273, |
|
"eval_samples_per_second": 159.651, |
|
"eval_steps_per_second": 9.978, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 4.4832072257995605, |
|
"learning_rate": 4.255240062798904e-08, |
|
"loss": 0.3426, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 4.493223190307617, |
|
"learning_rate": 3.820969082013415e-08, |
|
"loss": 0.3591, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"eval_loss": 0.711625337600708, |
|
"eval_runtime": 12.5893, |
|
"eval_samples_per_second": 158.865, |
|
"eval_steps_per_second": 9.929, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 3.978353261947632, |
|
"learning_rate": 3.409781464770978e-08, |
|
"loss": 0.3594, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 4.505449295043945, |
|
"learning_rate": 3.021742142362971e-08, |
|
"loss": 0.3843, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"eval_loss": 0.7115885019302368, |
|
"eval_runtime": 12.4393, |
|
"eval_samples_per_second": 160.781, |
|
"eval_steps_per_second": 10.049, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 4.039797782897949, |
|
"learning_rate": 2.6569123906967087e-08, |
|
"loss": 0.3353, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 5.2295732498168945, |
|
"learning_rate": 2.3153498206192002e-08, |
|
"loss": 0.3713, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"eval_loss": 0.7115111351013184, |
|
"eval_runtime": 12.0362, |
|
"eval_samples_per_second": 166.166, |
|
"eval_steps_per_second": 10.385, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 4.890398979187012, |
|
"learning_rate": 1.9971083688197945e-08, |
|
"loss": 0.3765, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 4.141822814941406, |
|
"learning_rate": 1.7022382893129074e-08, |
|
"loss": 0.3659, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"eval_loss": 0.7115259766578674, |
|
"eval_runtime": 12.0459, |
|
"eval_samples_per_second": 166.032, |
|
"eval_steps_per_second": 10.377, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 4.194301128387451, |
|
"learning_rate": 1.430786145502322e-08, |
|
"loss": 0.3534, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 4.5958051681518555, |
|
"learning_rate": 1.1827948028283353e-08, |
|
"loss": 0.3588, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_loss": 0.7115083932876587, |
|
"eval_runtime": 12.3331, |
|
"eval_samples_per_second": 162.165, |
|
"eval_steps_per_second": 10.135, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 4.880290508270264, |
|
"learning_rate": 9.583034219987408e-09, |
|
"loss": 0.3672, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 3.973886251449585, |
|
"learning_rate": 7.57347452804974e-09, |
|
"loss": 0.3556, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"eval_loss": 0.7114896774291992, |
|
"eval_runtime": 12.1956, |
|
"eval_samples_per_second": 163.993, |
|
"eval_steps_per_second": 10.25, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 3.817405939102173, |
|
"learning_rate": 5.799586285241243e-09, |
|
"loss": 0.3265, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 4.495524883270264, |
|
"learning_rate": 4.261649609079099e-09, |
|
"loss": 0.3278, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"eval_loss": 0.7115508317947388, |
|
"eval_runtime": 11.9967, |
|
"eval_samples_per_second": 166.712, |
|
"eval_steps_per_second": 10.42, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 4.464719772338867, |
|
"learning_rate": 2.9599073575926614e-09, |
|
"loss": 0.3817, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 4.08857536315918, |
|
"learning_rate": 1.8945650909737986e-09, |
|
"loss": 0.3642, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"eval_loss": 0.7115161418914795, |
|
"eval_runtime": 12.0169, |
|
"eval_samples_per_second": 166.433, |
|
"eval_steps_per_second": 10.402, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 4.385359764099121, |
|
"learning_rate": 1.0657910391161929e-09, |
|
"loss": 0.3696, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 4.141005992889404, |
|
"learning_rate": 4.737160750500902e-10, |
|
"loss": 0.3718, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"eval_loss": 0.7114637494087219, |
|
"eval_runtime": 12.0025, |
|
"eval_samples_per_second": 166.632, |
|
"eval_steps_per_second": 10.414, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 5.190242767333984, |
|
"learning_rate": 1.184336942758324e-10, |
|
"loss": 0.3882, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 4.157950401306152, |
|
"learning_rate": 0.0, |
|
"loss": 0.3611, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7115355134010315, |
|
"eval_runtime": 11.9974, |
|
"eval_samples_per_second": 166.703, |
|
"eval_steps_per_second": 10.419, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 1250, |
|
"total_flos": 1.8775693783885414e+17, |
|
"train_loss": 0.5100525261878968, |
|
"train_runtime": 5131.1185, |
|
"train_samples_per_second": 3.898, |
|
"train_steps_per_second": 0.244 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8775693783885414e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|