|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 121000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 53.65376281738281, |
|
"learning_rate": 1.9917355371900827e-05, |
|
"loss": 9.7879, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 92.14137268066406, |
|
"learning_rate": 1.9834710743801656e-05, |
|
"loss": 7.9084, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 7.1100754737854, |
|
"eval_runtime": 137.7524, |
|
"eval_samples_per_second": 6.984, |
|
"eval_steps_per_second": 6.984, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 116.5856704711914, |
|
"learning_rate": 1.9752066115702482e-05, |
|
"loss": 7.3127, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 31.839763641357422, |
|
"learning_rate": 1.9669421487603307e-05, |
|
"loss": 7.1385, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 6.323878765106201, |
|
"eval_runtime": 139.7775, |
|
"eval_samples_per_second": 6.882, |
|
"eval_steps_per_second": 6.882, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 81.93315887451172, |
|
"learning_rate": 1.9586776859504133e-05, |
|
"loss": 6.5815, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 91.25088500976562, |
|
"learning_rate": 1.950413223140496e-05, |
|
"loss": 6.3278, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 5.6174092292785645, |
|
"eval_runtime": 137.9688, |
|
"eval_samples_per_second": 6.973, |
|
"eval_steps_per_second": 6.973, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 42.85502243041992, |
|
"learning_rate": 1.9421487603305788e-05, |
|
"loss": 5.9866, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 128.11082458496094, |
|
"learning_rate": 1.9338842975206613e-05, |
|
"loss": 5.6448, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 5.013957977294922, |
|
"eval_runtime": 141.6906, |
|
"eval_samples_per_second": 6.789, |
|
"eval_steps_per_second": 6.789, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 94.7215576171875, |
|
"learning_rate": 1.925619834710744e-05, |
|
"loss": 5.2837, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 109.5147705078125, |
|
"learning_rate": 1.9173553719008268e-05, |
|
"loss": 5.0691, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 4.529860973358154, |
|
"eval_runtime": 139.3388, |
|
"eval_samples_per_second": 6.904, |
|
"eval_steps_per_second": 6.904, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 136.12649536132812, |
|
"learning_rate": 1.9090909090909094e-05, |
|
"loss": 4.8339, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 118.48357391357422, |
|
"learning_rate": 1.900826446280992e-05, |
|
"loss": 4.6329, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 4.244248390197754, |
|
"eval_runtime": 141.3143, |
|
"eval_samples_per_second": 6.808, |
|
"eval_steps_per_second": 6.808, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 297.8464660644531, |
|
"learning_rate": 1.8925619834710745e-05, |
|
"loss": 4.5749, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 155.1725311279297, |
|
"learning_rate": 1.884297520661157e-05, |
|
"loss": 4.6391, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 4.004535675048828, |
|
"eval_runtime": 142.1183, |
|
"eval_samples_per_second": 6.769, |
|
"eval_steps_per_second": 6.769, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 82.23986053466797, |
|
"learning_rate": 1.87603305785124e-05, |
|
"loss": 4.299, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 83.66497039794922, |
|
"learning_rate": 1.8677685950413225e-05, |
|
"loss": 4.1757, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 3.7286927700042725, |
|
"eval_runtime": 141.6853, |
|
"eval_samples_per_second": 6.79, |
|
"eval_steps_per_second": 6.79, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 185.3876190185547, |
|
"learning_rate": 1.859504132231405e-05, |
|
"loss": 4.1474, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 26.485153198242188, |
|
"learning_rate": 1.851239669421488e-05, |
|
"loss": 4.3622, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 3.6635849475860596, |
|
"eval_runtime": 138.132, |
|
"eval_samples_per_second": 6.964, |
|
"eval_steps_per_second": 6.964, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 158.5999755859375, |
|
"learning_rate": 1.8429752066115705e-05, |
|
"loss": 4.0511, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 48.88996505737305, |
|
"learning_rate": 1.834710743801653e-05, |
|
"loss": 4.0734, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 3.5613434314727783, |
|
"eval_runtime": 141.6568, |
|
"eval_samples_per_second": 6.791, |
|
"eval_steps_per_second": 6.791, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 46.01389694213867, |
|
"learning_rate": 1.8264462809917356e-05, |
|
"loss": 3.9621, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 311.6304016113281, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 3.8738, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 3.4829487800598145, |
|
"eval_runtime": 137.0137, |
|
"eval_samples_per_second": 7.021, |
|
"eval_steps_per_second": 7.021, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 148.29006958007812, |
|
"learning_rate": 1.809917355371901e-05, |
|
"loss": 3.7787, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 87.70325469970703, |
|
"learning_rate": 1.8016528925619837e-05, |
|
"loss": 3.8316, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 3.322986364364624, |
|
"eval_runtime": 140.8967, |
|
"eval_samples_per_second": 6.828, |
|
"eval_steps_per_second": 6.828, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 87.01234436035156, |
|
"learning_rate": 1.7933884297520662e-05, |
|
"loss": 3.9134, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 308.3322448730469, |
|
"learning_rate": 1.7851239669421488e-05, |
|
"loss": 3.7965, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 3.3614585399627686, |
|
"eval_runtime": 141.5793, |
|
"eval_samples_per_second": 6.795, |
|
"eval_steps_per_second": 6.795, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 142.58859252929688, |
|
"learning_rate": 1.7768595041322317e-05, |
|
"loss": 3.7439, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 134.673828125, |
|
"learning_rate": 1.7685950413223143e-05, |
|
"loss": 3.7948, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 3.228428363800049, |
|
"eval_runtime": 136.8867, |
|
"eval_samples_per_second": 7.028, |
|
"eval_steps_per_second": 7.028, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 69.1788330078125, |
|
"learning_rate": 1.7603305785123968e-05, |
|
"loss": 3.5873, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 161.53973388671875, |
|
"learning_rate": 1.7520661157024794e-05, |
|
"loss": 3.8479, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 3.2672481536865234, |
|
"eval_runtime": 138.7625, |
|
"eval_samples_per_second": 6.933, |
|
"eval_steps_per_second": 6.933, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 128.98785400390625, |
|
"learning_rate": 1.743801652892562e-05, |
|
"loss": 3.6716, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 170.5436553955078, |
|
"learning_rate": 1.735537190082645e-05, |
|
"loss": 3.7273, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 3.1376287937164307, |
|
"eval_runtime": 136.1603, |
|
"eval_samples_per_second": 7.065, |
|
"eval_steps_per_second": 7.065, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 129.81439208984375, |
|
"learning_rate": 1.7272727272727274e-05, |
|
"loss": 3.5567, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 135.2463836669922, |
|
"learning_rate": 1.71900826446281e-05, |
|
"loss": 3.6128, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 3.1712100505828857, |
|
"eval_runtime": 140.5239, |
|
"eval_samples_per_second": 6.846, |
|
"eval_steps_per_second": 6.846, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 99.10409545898438, |
|
"learning_rate": 1.710743801652893e-05, |
|
"loss": 3.5886, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 42.10710144042969, |
|
"learning_rate": 1.7024793388429754e-05, |
|
"loss": 3.4751, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 3.1437861919403076, |
|
"eval_runtime": 141.4419, |
|
"eval_samples_per_second": 6.801, |
|
"eval_steps_per_second": 6.801, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 60.31798553466797, |
|
"learning_rate": 1.694214876033058e-05, |
|
"loss": 3.3902, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 446.0960388183594, |
|
"learning_rate": 1.6859504132231405e-05, |
|
"loss": 3.5544, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 3.0222690105438232, |
|
"eval_runtime": 140.8408, |
|
"eval_samples_per_second": 6.83, |
|
"eval_steps_per_second": 6.83, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 125.9988784790039, |
|
"learning_rate": 1.677685950413223e-05, |
|
"loss": 3.4624, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 41.044036865234375, |
|
"learning_rate": 1.669421487603306e-05, |
|
"loss": 3.3734, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 2.9934887886047363, |
|
"eval_runtime": 140.0157, |
|
"eval_samples_per_second": 6.871, |
|
"eval_steps_per_second": 6.871, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 98.76380157470703, |
|
"learning_rate": 1.6611570247933886e-05, |
|
"loss": 3.4481, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 96.16433715820312, |
|
"learning_rate": 1.652892561983471e-05, |
|
"loss": 3.6288, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 2.9670004844665527, |
|
"eval_runtime": 141.8861, |
|
"eval_samples_per_second": 6.78, |
|
"eval_steps_per_second": 6.78, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 147.28933715820312, |
|
"learning_rate": 1.644628099173554e-05, |
|
"loss": 3.622, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 85.05192565917969, |
|
"learning_rate": 1.6363636363636366e-05, |
|
"loss": 3.194, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 2.9449918270111084, |
|
"eval_runtime": 140.4537, |
|
"eval_samples_per_second": 6.849, |
|
"eval_steps_per_second": 6.849, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 231.34249877929688, |
|
"learning_rate": 1.628099173553719e-05, |
|
"loss": 3.3939, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 17.25453758239746, |
|
"learning_rate": 1.6198347107438017e-05, |
|
"loss": 3.4362, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 2.9837887287139893, |
|
"eval_runtime": 140.5356, |
|
"eval_samples_per_second": 6.845, |
|
"eval_steps_per_second": 6.845, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 209.98016357421875, |
|
"learning_rate": 1.6115702479338843e-05, |
|
"loss": 3.5168, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 94.24089050292969, |
|
"learning_rate": 1.6033057851239672e-05, |
|
"loss": 3.2853, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 2.9006521701812744, |
|
"eval_runtime": 139.8116, |
|
"eval_samples_per_second": 6.881, |
|
"eval_steps_per_second": 6.881, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 59.02330780029297, |
|
"learning_rate": 1.5950413223140497e-05, |
|
"loss": 3.2102, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 114.82744598388672, |
|
"learning_rate": 1.5867768595041323e-05, |
|
"loss": 3.3839, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 2.8425700664520264, |
|
"eval_runtime": 135.9965, |
|
"eval_samples_per_second": 7.074, |
|
"eval_steps_per_second": 7.074, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 151.8341064453125, |
|
"learning_rate": 1.5785123966942152e-05, |
|
"loss": 3.3901, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 69.12097930908203, |
|
"learning_rate": 1.5702479338842978e-05, |
|
"loss": 3.4662, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 2.8814632892608643, |
|
"eval_runtime": 139.0607, |
|
"eval_samples_per_second": 6.918, |
|
"eval_steps_per_second": 6.918, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 81.52751159667969, |
|
"learning_rate": 1.5619834710743803e-05, |
|
"loss": 3.325, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 206.8463592529297, |
|
"learning_rate": 1.553719008264463e-05, |
|
"loss": 3.3232, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 2.836251735687256, |
|
"eval_runtime": 138.3679, |
|
"eval_samples_per_second": 6.952, |
|
"eval_steps_per_second": 6.952, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 82.50727844238281, |
|
"learning_rate": 1.5454545454545454e-05, |
|
"loss": 3.2616, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 131.51040649414062, |
|
"learning_rate": 1.5371900826446283e-05, |
|
"loss": 3.3857, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 2.780090093612671, |
|
"eval_runtime": 140.6306, |
|
"eval_samples_per_second": 6.841, |
|
"eval_steps_per_second": 6.841, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 66.1187515258789, |
|
"learning_rate": 1.528925619834711e-05, |
|
"loss": 3.2072, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 131.09405517578125, |
|
"learning_rate": 1.5206611570247936e-05, |
|
"loss": 3.0476, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 2.8148860931396484, |
|
"eval_runtime": 141.206, |
|
"eval_samples_per_second": 6.813, |
|
"eval_steps_per_second": 6.813, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 51.353572845458984, |
|
"learning_rate": 1.5123966942148762e-05, |
|
"loss": 3.3377, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 66.42013549804688, |
|
"learning_rate": 1.504132231404959e-05, |
|
"loss": 3.0664, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 2.776242971420288, |
|
"eval_runtime": 141.6364, |
|
"eval_samples_per_second": 6.792, |
|
"eval_steps_per_second": 6.792, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 170.49464416503906, |
|
"learning_rate": 1.4958677685950413e-05, |
|
"loss": 3.0958, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 137.87022399902344, |
|
"learning_rate": 1.487603305785124e-05, |
|
"loss": 3.2919, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 2.8100545406341553, |
|
"eval_runtime": 141.0359, |
|
"eval_samples_per_second": 6.821, |
|
"eval_steps_per_second": 6.821, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 211.2021942138672, |
|
"learning_rate": 1.4793388429752066e-05, |
|
"loss": 3.4836, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 113.34933471679688, |
|
"learning_rate": 1.4710743801652893e-05, |
|
"loss": 3.334, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 2.754260301589966, |
|
"eval_runtime": 140.5105, |
|
"eval_samples_per_second": 6.846, |
|
"eval_steps_per_second": 6.846, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 114.3346939086914, |
|
"learning_rate": 1.462809917355372e-05, |
|
"loss": 3.0715, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 91.6365966796875, |
|
"learning_rate": 1.4545454545454546e-05, |
|
"loss": 3.2307, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 2.7607641220092773, |
|
"eval_runtime": 139.2517, |
|
"eval_samples_per_second": 6.908, |
|
"eval_steps_per_second": 6.908, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.1861701011657715, |
|
"learning_rate": 1.4462809917355374e-05, |
|
"loss": 2.9537, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 97.13253021240234, |
|
"learning_rate": 1.4380165289256201e-05, |
|
"loss": 2.9517, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 2.805582046508789, |
|
"eval_runtime": 138.7882, |
|
"eval_samples_per_second": 6.931, |
|
"eval_steps_per_second": 6.931, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 55.73282241821289, |
|
"learning_rate": 1.4297520661157025e-05, |
|
"loss": 3.5026, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 105.15763854980469, |
|
"learning_rate": 1.4214876033057852e-05, |
|
"loss": 3.3557, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 2.7643611431121826, |
|
"eval_runtime": 140.4066, |
|
"eval_samples_per_second": 6.852, |
|
"eval_steps_per_second": 6.852, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 34.19166564941406, |
|
"learning_rate": 1.4132231404958678e-05, |
|
"loss": 3.1291, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 129.49188232421875, |
|
"learning_rate": 1.4049586776859505e-05, |
|
"loss": 3.2801, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 2.7544772624969482, |
|
"eval_runtime": 138.7006, |
|
"eval_samples_per_second": 6.936, |
|
"eval_steps_per_second": 6.936, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 36.190162658691406, |
|
"learning_rate": 1.3966942148760332e-05, |
|
"loss": 3.1862, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 126.45512390136719, |
|
"learning_rate": 1.3884297520661158e-05, |
|
"loss": 3.1573, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 2.7139012813568115, |
|
"eval_runtime": 139.7912, |
|
"eval_samples_per_second": 6.882, |
|
"eval_steps_per_second": 6.882, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 168.74777221679688, |
|
"learning_rate": 1.3801652892561985e-05, |
|
"loss": 3.1524, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 64.23265075683594, |
|
"learning_rate": 1.3719008264462813e-05, |
|
"loss": 3.2022, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 2.726412773132324, |
|
"eval_runtime": 137.2349, |
|
"eval_samples_per_second": 7.01, |
|
"eval_steps_per_second": 7.01, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 143.0786895751953, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 3.3005, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.35411542654037476, |
|
"learning_rate": 1.3553719008264464e-05, |
|
"loss": 3.1852, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 2.7224111557006836, |
|
"eval_runtime": 140.911, |
|
"eval_samples_per_second": 6.827, |
|
"eval_steps_per_second": 6.827, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 56.73325729370117, |
|
"learning_rate": 1.347107438016529e-05, |
|
"loss": 3.0389, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 182.5022430419922, |
|
"learning_rate": 1.3388429752066117e-05, |
|
"loss": 2.9861, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 2.7215094566345215, |
|
"eval_runtime": 142.3112, |
|
"eval_samples_per_second": 6.76, |
|
"eval_steps_per_second": 6.76, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 129.25076293945312, |
|
"learning_rate": 1.3305785123966944e-05, |
|
"loss": 3.2484, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 28.573749542236328, |
|
"learning_rate": 1.322314049586777e-05, |
|
"loss": 3.176, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 2.678982973098755, |
|
"eval_runtime": 140.2555, |
|
"eval_samples_per_second": 6.859, |
|
"eval_steps_per_second": 6.859, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 29.228971481323242, |
|
"learning_rate": 1.3140495867768597e-05, |
|
"loss": 2.898, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.7074639797210693, |
|
"learning_rate": 1.3057851239669424e-05, |
|
"loss": 2.9024, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 2.7119204998016357, |
|
"eval_runtime": 141.0803, |
|
"eval_samples_per_second": 6.819, |
|
"eval_steps_per_second": 6.819, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 122.27530670166016, |
|
"learning_rate": 1.2975206611570248e-05, |
|
"loss": 3.024, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 19.63726806640625, |
|
"learning_rate": 1.2892561983471074e-05, |
|
"loss": 3.2979, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 2.6496329307556152, |
|
"eval_runtime": 141.7782, |
|
"eval_samples_per_second": 6.785, |
|
"eval_steps_per_second": 6.785, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 220.49954223632812, |
|
"learning_rate": 1.2809917355371901e-05, |
|
"loss": 3.0016, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 77.60582733154297, |
|
"learning_rate": 1.2727272727272728e-05, |
|
"loss": 3.0776, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 2.6587836742401123, |
|
"eval_runtime": 139.8192, |
|
"eval_samples_per_second": 6.88, |
|
"eval_steps_per_second": 6.88, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 94.2193832397461, |
|
"learning_rate": 1.2644628099173554e-05, |
|
"loss": 3.0418, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 30.321847915649414, |
|
"learning_rate": 1.2561983471074381e-05, |
|
"loss": 3.2145, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 2.7039036750793457, |
|
"eval_runtime": 138.137, |
|
"eval_samples_per_second": 6.964, |
|
"eval_steps_per_second": 6.964, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 102.64441680908203, |
|
"learning_rate": 1.2479338842975209e-05, |
|
"loss": 2.9082, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 95.51321411132812, |
|
"learning_rate": 1.2396694214876034e-05, |
|
"loss": 3.2535, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 2.648103952407837, |
|
"eval_runtime": 139.7372, |
|
"eval_samples_per_second": 6.884, |
|
"eval_steps_per_second": 6.884, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 90.64472961425781, |
|
"learning_rate": 1.231404958677686e-05, |
|
"loss": 3.1841, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 128.23159790039062, |
|
"learning_rate": 1.2231404958677686e-05, |
|
"loss": 3.0074, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 2.6026036739349365, |
|
"eval_runtime": 139.947, |
|
"eval_samples_per_second": 6.874, |
|
"eval_steps_per_second": 6.874, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 254.29718017578125, |
|
"learning_rate": 1.2148760330578513e-05, |
|
"loss": 3.3493, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 33.48649215698242, |
|
"learning_rate": 1.206611570247934e-05, |
|
"loss": 3.007, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 2.6409358978271484, |
|
"eval_runtime": 140.003, |
|
"eval_samples_per_second": 6.871, |
|
"eval_steps_per_second": 6.871, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 165.1754608154297, |
|
"learning_rate": 1.1983471074380166e-05, |
|
"loss": 3.0267, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 57.49995803833008, |
|
"learning_rate": 1.1900826446280993e-05, |
|
"loss": 3.0913, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 2.676673412322998, |
|
"eval_runtime": 139.7897, |
|
"eval_samples_per_second": 6.882, |
|
"eval_steps_per_second": 6.882, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 269.28326416015625, |
|
"learning_rate": 1.181818181818182e-05, |
|
"loss": 3.0674, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 81.5179672241211, |
|
"learning_rate": 1.1735537190082646e-05, |
|
"loss": 3.1847, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 2.6336779594421387, |
|
"eval_runtime": 140.9712, |
|
"eval_samples_per_second": 6.824, |
|
"eval_steps_per_second": 6.824, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 81.88212585449219, |
|
"learning_rate": 1.1652892561983472e-05, |
|
"loss": 2.7742, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 34.32734298706055, |
|
"learning_rate": 1.1570247933884297e-05, |
|
"loss": 2.9359, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 2.649772882461548, |
|
"eval_runtime": 139.5432, |
|
"eval_samples_per_second": 6.894, |
|
"eval_steps_per_second": 6.894, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 44.86323165893555, |
|
"learning_rate": 1.1487603305785125e-05, |
|
"loss": 3.1545, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 70.11674499511719, |
|
"learning_rate": 1.1404958677685952e-05, |
|
"loss": 2.9488, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 2.5968177318573, |
|
"eval_runtime": 142.9712, |
|
"eval_samples_per_second": 6.729, |
|
"eval_steps_per_second": 6.729, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 43.42722702026367, |
|
"learning_rate": 1.1322314049586777e-05, |
|
"loss": 3.02, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 178.18666076660156, |
|
"learning_rate": 1.1239669421487605e-05, |
|
"loss": 3.0279, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 2.625840425491333, |
|
"eval_runtime": 142.8837, |
|
"eval_samples_per_second": 6.733, |
|
"eval_steps_per_second": 6.733, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 33.83938217163086, |
|
"learning_rate": 1.1157024793388432e-05, |
|
"loss": 2.9362, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 24.059724807739258, |
|
"learning_rate": 1.1074380165289258e-05, |
|
"loss": 2.9282, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 2.671956777572632, |
|
"eval_runtime": 141.943, |
|
"eval_samples_per_second": 6.777, |
|
"eval_steps_per_second": 6.777, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 150.78285217285156, |
|
"learning_rate": 1.0991735537190083e-05, |
|
"loss": 3.1242, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 129.85336303710938, |
|
"learning_rate": 1.0909090909090909e-05, |
|
"loss": 2.9939, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 2.6020476818084717, |
|
"eval_runtime": 137.1321, |
|
"eval_samples_per_second": 7.015, |
|
"eval_steps_per_second": 7.015, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 97.6047134399414, |
|
"learning_rate": 1.0826446280991736e-05, |
|
"loss": 3.2276, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 54.10629653930664, |
|
"learning_rate": 1.0743801652892562e-05, |
|
"loss": 3.1815, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 2.6137943267822266, |
|
"eval_runtime": 140.5161, |
|
"eval_samples_per_second": 6.846, |
|
"eval_steps_per_second": 6.846, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 116.71500396728516, |
|
"learning_rate": 1.0661157024793389e-05, |
|
"loss": 2.7576, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 132.3123016357422, |
|
"learning_rate": 1.0578512396694216e-05, |
|
"loss": 2.8927, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 2.58026385307312, |
|
"eval_runtime": 137.5438, |
|
"eval_samples_per_second": 6.994, |
|
"eval_steps_per_second": 6.994, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.3877036571502686, |
|
"learning_rate": 1.0495867768595042e-05, |
|
"loss": 3.1726, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 23.99924659729004, |
|
"learning_rate": 1.041322314049587e-05, |
|
"loss": 3.0603, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 2.5942938327789307, |
|
"eval_runtime": 139.9432, |
|
"eval_samples_per_second": 6.874, |
|
"eval_steps_per_second": 6.874, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 89.54573059082031, |
|
"learning_rate": 1.0330578512396693e-05, |
|
"loss": 2.9213, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 117.28192138671875, |
|
"learning_rate": 1.024793388429752e-05, |
|
"loss": 2.9974, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 2.574179172515869, |
|
"eval_runtime": 137.8963, |
|
"eval_samples_per_second": 6.976, |
|
"eval_steps_per_second": 6.976, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 230.2947540283203, |
|
"learning_rate": 1.0165289256198348e-05, |
|
"loss": 2.9884, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 114.89617156982422, |
|
"learning_rate": 1.0082644628099174e-05, |
|
"loss": 3.1789, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 2.551656723022461, |
|
"eval_runtime": 139.8913, |
|
"eval_samples_per_second": 6.877, |
|
"eval_steps_per_second": 6.877, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 19.36899185180664, |
|
"learning_rate": 1e-05, |
|
"loss": 3.2153, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 95.15733337402344, |
|
"learning_rate": 9.917355371900828e-06, |
|
"loss": 3.0478, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 2.5420753955841064, |
|
"eval_runtime": 138.826, |
|
"eval_samples_per_second": 6.93, |
|
"eval_steps_per_second": 6.93, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 164.1331329345703, |
|
"learning_rate": 9.834710743801654e-06, |
|
"loss": 3.0467, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 120.298828125, |
|
"learning_rate": 9.75206611570248e-06, |
|
"loss": 3.0664, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 2.561354875564575, |
|
"eval_runtime": 138.2294, |
|
"eval_samples_per_second": 6.959, |
|
"eval_steps_per_second": 6.959, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 67.15013122558594, |
|
"learning_rate": 9.669421487603307e-06, |
|
"loss": 3.0473, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 116.40657043457031, |
|
"learning_rate": 9.586776859504134e-06, |
|
"loss": 2.8695, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 2.559004783630371, |
|
"eval_runtime": 139.5498, |
|
"eval_samples_per_second": 6.894, |
|
"eval_steps_per_second": 6.894, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 16.178442001342773, |
|
"learning_rate": 9.50413223140496e-06, |
|
"loss": 3.1084, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 9.640996932983398, |
|
"learning_rate": 9.421487603305785e-06, |
|
"loss": 3.0757, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 2.553495407104492, |
|
"eval_runtime": 140.0088, |
|
"eval_samples_per_second": 6.871, |
|
"eval_steps_per_second": 6.871, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.591582775115967, |
|
"learning_rate": 9.338842975206613e-06, |
|
"loss": 2.9386, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 62.728607177734375, |
|
"learning_rate": 9.25619834710744e-06, |
|
"loss": 2.9709, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 2.530237913131714, |
|
"eval_runtime": 139.6525, |
|
"eval_samples_per_second": 6.889, |
|
"eval_steps_per_second": 6.889, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 27.372655868530273, |
|
"learning_rate": 9.173553719008265e-06, |
|
"loss": 2.976, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 89.07634735107422, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 2.8215, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 2.565800905227661, |
|
"eval_runtime": 140.4523, |
|
"eval_samples_per_second": 6.849, |
|
"eval_steps_per_second": 6.849, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 20.640071868896484, |
|
"learning_rate": 9.008264462809918e-06, |
|
"loss": 3.0081, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 69.5301284790039, |
|
"learning_rate": 8.925619834710744e-06, |
|
"loss": 3.077, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 2.5604488849639893, |
|
"eval_runtime": 140.6864, |
|
"eval_samples_per_second": 6.838, |
|
"eval_steps_per_second": 6.838, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 46.04232406616211, |
|
"learning_rate": 8.842975206611571e-06, |
|
"loss": 3.0368, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 142.82720947265625, |
|
"learning_rate": 8.760330578512397e-06, |
|
"loss": 2.9914, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 2.561345100402832, |
|
"eval_runtime": 139.7552, |
|
"eval_samples_per_second": 6.883, |
|
"eval_steps_per_second": 6.883, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 12.521268844604492, |
|
"learning_rate": 8.677685950413224e-06, |
|
"loss": 2.8843, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.718876600265503, |
|
"learning_rate": 8.59504132231405e-06, |
|
"loss": 2.8383, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 2.55441951751709, |
|
"eval_runtime": 140.0164, |
|
"eval_samples_per_second": 6.871, |
|
"eval_steps_per_second": 6.871, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 68.07179260253906, |
|
"learning_rate": 8.512396694214877e-06, |
|
"loss": 2.9206, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 53.18625259399414, |
|
"learning_rate": 8.429752066115703e-06, |
|
"loss": 3.0139, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 2.539428234100342, |
|
"eval_runtime": 141.3662, |
|
"eval_samples_per_second": 6.805, |
|
"eval_steps_per_second": 6.805, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 38.117530822753906, |
|
"learning_rate": 8.34710743801653e-06, |
|
"loss": 2.9579, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 11.0903902053833, |
|
"learning_rate": 8.264462809917356e-06, |
|
"loss": 2.9588, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 2.5550858974456787, |
|
"eval_runtime": 140.3679, |
|
"eval_samples_per_second": 6.853, |
|
"eval_steps_per_second": 6.853, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 136.44300842285156, |
|
"learning_rate": 8.181818181818183e-06, |
|
"loss": 2.9754, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 118.072265625, |
|
"learning_rate": 8.099173553719009e-06, |
|
"loss": 3.1051, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 2.5240180492401123, |
|
"eval_runtime": 140.2239, |
|
"eval_samples_per_second": 6.86, |
|
"eval_steps_per_second": 6.86, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 124.3985824584961, |
|
"learning_rate": 8.016528925619836e-06, |
|
"loss": 2.9229, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 39.26608657836914, |
|
"learning_rate": 7.933884297520661e-06, |
|
"loss": 2.9566, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 2.5205495357513428, |
|
"eval_runtime": 140.8561, |
|
"eval_samples_per_second": 6.83, |
|
"eval_steps_per_second": 6.83, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 101.39036560058594, |
|
"learning_rate": 7.851239669421489e-06, |
|
"loss": 3.1112, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 66.84815216064453, |
|
"learning_rate": 7.768595041322314e-06, |
|
"loss": 2.8258, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 2.53057599067688, |
|
"eval_runtime": 138.4074, |
|
"eval_samples_per_second": 6.95, |
|
"eval_steps_per_second": 6.95, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 27.408185958862305, |
|
"learning_rate": 7.685950413223142e-06, |
|
"loss": 3.0526, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 51.945796966552734, |
|
"learning_rate": 7.603305785123968e-06, |
|
"loss": 3.1191, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 2.5579235553741455, |
|
"eval_runtime": 140.5154, |
|
"eval_samples_per_second": 6.846, |
|
"eval_steps_per_second": 6.846, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 54.32289123535156, |
|
"learning_rate": 7.520661157024795e-06, |
|
"loss": 2.6603, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 68.38690185546875, |
|
"learning_rate": 7.43801652892562e-06, |
|
"loss": 2.9808, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 2.4875569343566895, |
|
"eval_runtime": 139.4265, |
|
"eval_samples_per_second": 6.9, |
|
"eval_steps_per_second": 6.9, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 13.368736267089844, |
|
"learning_rate": 7.355371900826447e-06, |
|
"loss": 2.9716, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 223.064208984375, |
|
"learning_rate": 7.272727272727273e-06, |
|
"loss": 2.866, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 2.4790120124816895, |
|
"eval_runtime": 138.7203, |
|
"eval_samples_per_second": 6.935, |
|
"eval_steps_per_second": 6.935, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 45.701168060302734, |
|
"learning_rate": 7.1900826446281005e-06, |
|
"loss": 2.8411, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 45.346946716308594, |
|
"learning_rate": 7.107438016528926e-06, |
|
"loss": 3.0315, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 2.4591948986053467, |
|
"eval_runtime": 136.0903, |
|
"eval_samples_per_second": 7.069, |
|
"eval_steps_per_second": 7.069, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 110.85874938964844, |
|
"learning_rate": 7.0247933884297525e-06, |
|
"loss": 2.9017, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 38.339019775390625, |
|
"learning_rate": 6.942148760330579e-06, |
|
"loss": 2.9615, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 2.519632339477539, |
|
"eval_runtime": 136.2766, |
|
"eval_samples_per_second": 7.059, |
|
"eval_steps_per_second": 7.059, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 136.72471618652344, |
|
"learning_rate": 6.859504132231406e-06, |
|
"loss": 2.7923, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 177.0449676513672, |
|
"learning_rate": 6.776859504132232e-06, |
|
"loss": 2.894, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 2.4873228073120117, |
|
"eval_runtime": 140.1936, |
|
"eval_samples_per_second": 6.862, |
|
"eval_steps_per_second": 6.862, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 52.91669464111328, |
|
"learning_rate": 6.694214876033058e-06, |
|
"loss": 2.9347, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 29.3709716796875, |
|
"learning_rate": 6.611570247933885e-06, |
|
"loss": 3.0159, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 2.5049378871917725, |
|
"eval_runtime": 136.3273, |
|
"eval_samples_per_second": 7.057, |
|
"eval_steps_per_second": 7.057, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 39.67999267578125, |
|
"learning_rate": 6.528925619834712e-06, |
|
"loss": 2.7904, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 67.77261352539062, |
|
"learning_rate": 6.446280991735537e-06, |
|
"loss": 2.8427, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 2.483983039855957, |
|
"eval_runtime": 140.506, |
|
"eval_samples_per_second": 6.847, |
|
"eval_steps_per_second": 6.847, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 31.837095260620117, |
|
"learning_rate": 6.363636363636364e-06, |
|
"loss": 2.9611, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 90.8124008178711, |
|
"learning_rate": 6.280991735537191e-06, |
|
"loss": 3.0876, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 2.4686708450317383, |
|
"eval_runtime": 140.2415, |
|
"eval_samples_per_second": 6.86, |
|
"eval_steps_per_second": 6.86, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 203.96783447265625, |
|
"learning_rate": 6.198347107438017e-06, |
|
"loss": 2.8774, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 68.90469360351562, |
|
"learning_rate": 6.115702479338843e-06, |
|
"loss": 2.9447, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 2.4458138942718506, |
|
"eval_runtime": 140.8702, |
|
"eval_samples_per_second": 6.829, |
|
"eval_steps_per_second": 6.829, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 96.91522979736328, |
|
"learning_rate": 6.03305785123967e-06, |
|
"loss": 3.1052, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 54.14280319213867, |
|
"learning_rate": 5.9504132231404965e-06, |
|
"loss": 3.133, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 2.498807668685913, |
|
"eval_runtime": 140.2236, |
|
"eval_samples_per_second": 6.86, |
|
"eval_steps_per_second": 6.86, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 32.362709045410156, |
|
"learning_rate": 5.867768595041323e-06, |
|
"loss": 3.0148, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 89.885009765625, |
|
"learning_rate": 5.785123966942149e-06, |
|
"loss": 2.8195, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 2.465299606323242, |
|
"eval_runtime": 138.3234, |
|
"eval_samples_per_second": 6.955, |
|
"eval_steps_per_second": 6.955, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9571592211723328, |
|
"learning_rate": 5.702479338842976e-06, |
|
"loss": 2.896, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 76.42816162109375, |
|
"learning_rate": 5.619834710743802e-06, |
|
"loss": 3.0033, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 2.4510247707366943, |
|
"eval_runtime": 137.4476, |
|
"eval_samples_per_second": 6.999, |
|
"eval_steps_per_second": 6.999, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 31.637996673583984, |
|
"learning_rate": 5.537190082644629e-06, |
|
"loss": 3.0171, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 276.10009765625, |
|
"learning_rate": 5.4545454545454545e-06, |
|
"loss": 2.7309, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 2.4777259826660156, |
|
"eval_runtime": 138.3526, |
|
"eval_samples_per_second": 6.953, |
|
"eval_steps_per_second": 6.953, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 64.81793212890625, |
|
"learning_rate": 5.371900826446281e-06, |
|
"loss": 3.035, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 74.73489379882812, |
|
"learning_rate": 5.289256198347108e-06, |
|
"loss": 2.778, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 2.4714279174804688, |
|
"eval_runtime": 135.884, |
|
"eval_samples_per_second": 7.08, |
|
"eval_steps_per_second": 7.08, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 423.29888916015625, |
|
"learning_rate": 5.206611570247935e-06, |
|
"loss": 3.1729, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 68.26463317871094, |
|
"learning_rate": 5.12396694214876e-06, |
|
"loss": 2.6937, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 2.511780023574829, |
|
"eval_runtime": 139.7573, |
|
"eval_samples_per_second": 6.883, |
|
"eval_steps_per_second": 6.883, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 63.75825500488281, |
|
"learning_rate": 5.041322314049587e-06, |
|
"loss": 2.8172, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 95.2588882446289, |
|
"learning_rate": 4.958677685950414e-06, |
|
"loss": 2.6397, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 2.4570491313934326, |
|
"eval_runtime": 139.6678, |
|
"eval_samples_per_second": 6.888, |
|
"eval_steps_per_second": 6.888, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 146.0210418701172, |
|
"learning_rate": 4.87603305785124e-06, |
|
"loss": 2.7785, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.548470973968506, |
|
"learning_rate": 4.793388429752067e-06, |
|
"loss": 2.8252, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 2.455515146255493, |
|
"eval_runtime": 140.0032, |
|
"eval_samples_per_second": 6.871, |
|
"eval_steps_per_second": 6.871, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.6395989656448364, |
|
"learning_rate": 4.710743801652893e-06, |
|
"loss": 2.8768, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 127.5672607421875, |
|
"learning_rate": 4.62809917355372e-06, |
|
"loss": 2.8688, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 2.456037998199463, |
|
"eval_runtime": 140.078, |
|
"eval_samples_per_second": 6.868, |
|
"eval_steps_per_second": 6.868, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 76.23936462402344, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 2.8035, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.9954880475997925, |
|
"learning_rate": 4.462809917355372e-06, |
|
"loss": 2.8295, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 2.470505714416504, |
|
"eval_runtime": 140.8327, |
|
"eval_samples_per_second": 6.831, |
|
"eval_steps_per_second": 6.831, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 121.44882202148438, |
|
"learning_rate": 4.3801652892561984e-06, |
|
"loss": 2.7538, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 222.09165954589844, |
|
"learning_rate": 4.297520661157025e-06, |
|
"loss": 2.885, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 2.462240695953369, |
|
"eval_runtime": 140.1637, |
|
"eval_samples_per_second": 6.863, |
|
"eval_steps_per_second": 6.863, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 54.475791931152344, |
|
"learning_rate": 4.214876033057851e-06, |
|
"loss": 2.9929, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 132.60020446777344, |
|
"learning_rate": 4.132231404958678e-06, |
|
"loss": 2.9288, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 2.461066484451294, |
|
"eval_runtime": 137.924, |
|
"eval_samples_per_second": 6.975, |
|
"eval_steps_per_second": 6.975, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 43.78097915649414, |
|
"learning_rate": 4.049586776859504e-06, |
|
"loss": 2.8568, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 29.313560485839844, |
|
"learning_rate": 3.966942148760331e-06, |
|
"loss": 2.8473, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 2.4508566856384277, |
|
"eval_runtime": 139.8571, |
|
"eval_samples_per_second": 6.878, |
|
"eval_steps_per_second": 6.878, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 32.94176483154297, |
|
"learning_rate": 3.884297520661157e-06, |
|
"loss": 3.0909, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 26.41152000427246, |
|
"learning_rate": 3.801652892561984e-06, |
|
"loss": 2.8863, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 2.4427459239959717, |
|
"eval_runtime": 139.5787, |
|
"eval_samples_per_second": 6.892, |
|
"eval_steps_per_second": 6.892, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 75.68090057373047, |
|
"learning_rate": 3.71900826446281e-06, |
|
"loss": 2.8636, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 30.334091186523438, |
|
"learning_rate": 3.6363636363636366e-06, |
|
"loss": 2.7238, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 2.4287164211273193, |
|
"eval_runtime": 140.5273, |
|
"eval_samples_per_second": 6.846, |
|
"eval_steps_per_second": 6.846, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 191.05506896972656, |
|
"learning_rate": 3.553719008264463e-06, |
|
"loss": 3.0945, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 30.880615234375, |
|
"learning_rate": 3.4710743801652895e-06, |
|
"loss": 2.842, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 2.418543815612793, |
|
"eval_runtime": 137.122, |
|
"eval_samples_per_second": 7.016, |
|
"eval_steps_per_second": 7.016, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 169.966796875, |
|
"learning_rate": 3.388429752066116e-06, |
|
"loss": 2.6592, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 31.075923919677734, |
|
"learning_rate": 3.3057851239669424e-06, |
|
"loss": 2.947, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 2.415428400039673, |
|
"eval_runtime": 137.232, |
|
"eval_samples_per_second": 7.01, |
|
"eval_steps_per_second": 7.01, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.725558280944824, |
|
"learning_rate": 3.2231404958677685e-06, |
|
"loss": 2.7334, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 96.62724304199219, |
|
"learning_rate": 3.1404958677685953e-06, |
|
"loss": 2.9315, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 2.4265284538269043, |
|
"eval_runtime": 139.3641, |
|
"eval_samples_per_second": 6.903, |
|
"eval_steps_per_second": 6.903, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 245.8263702392578, |
|
"learning_rate": 3.0578512396694214e-06, |
|
"loss": 2.7418, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 166.16859436035156, |
|
"learning_rate": 2.9752066115702483e-06, |
|
"loss": 2.8382, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 2.4436089992523193, |
|
"eval_runtime": 138.7814, |
|
"eval_samples_per_second": 6.932, |
|
"eval_steps_per_second": 6.932, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 155.92645263671875, |
|
"learning_rate": 2.8925619834710743e-06, |
|
"loss": 2.8438, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 229.83657836914062, |
|
"learning_rate": 2.809917355371901e-06, |
|
"loss": 2.8605, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 2.4522359371185303, |
|
"eval_runtime": 140.2918, |
|
"eval_samples_per_second": 6.857, |
|
"eval_steps_per_second": 6.857, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 114.02568817138672, |
|
"learning_rate": 2.7272727272727272e-06, |
|
"loss": 2.7917, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.14724546670913696, |
|
"learning_rate": 2.644628099173554e-06, |
|
"loss": 2.8771, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 2.4402596950531006, |
|
"eval_runtime": 137.5715, |
|
"eval_samples_per_second": 6.993, |
|
"eval_steps_per_second": 6.993, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 62.19240188598633, |
|
"learning_rate": 2.56198347107438e-06, |
|
"loss": 2.8352, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 285.7218017578125, |
|
"learning_rate": 2.479338842975207e-06, |
|
"loss": 2.9687, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 2.443941116333008, |
|
"eval_runtime": 138.5628, |
|
"eval_samples_per_second": 6.943, |
|
"eval_steps_per_second": 6.943, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 7.110346794128418, |
|
"learning_rate": 2.3966942148760335e-06, |
|
"loss": 2.7594, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 55.28038787841797, |
|
"learning_rate": 2.31404958677686e-06, |
|
"loss": 2.8578, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 2.435091257095337, |
|
"eval_runtime": 141.2615, |
|
"eval_samples_per_second": 6.81, |
|
"eval_steps_per_second": 6.81, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 89.96186828613281, |
|
"learning_rate": 2.231404958677686e-06, |
|
"loss": 2.6923, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 65.65628051757812, |
|
"learning_rate": 2.1487603305785124e-06, |
|
"loss": 2.8386, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 2.4371442794799805, |
|
"eval_runtime": 138.8387, |
|
"eval_samples_per_second": 6.929, |
|
"eval_steps_per_second": 6.929, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 235.8939971923828, |
|
"learning_rate": 2.066115702479339e-06, |
|
"loss": 2.8686, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 66.96907806396484, |
|
"learning_rate": 1.9834710743801654e-06, |
|
"loss": 2.9078, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 2.4459149837493896, |
|
"eval_runtime": 140.0398, |
|
"eval_samples_per_second": 6.869, |
|
"eval_steps_per_second": 6.869, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 204.19281005859375, |
|
"learning_rate": 1.900826446280992e-06, |
|
"loss": 2.8531, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 121.73772430419922, |
|
"learning_rate": 1.8181818181818183e-06, |
|
"loss": 2.9152, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 2.436737060546875, |
|
"eval_runtime": 140.5818, |
|
"eval_samples_per_second": 6.843, |
|
"eval_steps_per_second": 6.843, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 555.6137084960938, |
|
"learning_rate": 1.7355371900826448e-06, |
|
"loss": 2.8121, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 75.70996856689453, |
|
"learning_rate": 1.6528925619834712e-06, |
|
"loss": 2.8674, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 2.4294447898864746, |
|
"eval_runtime": 140.7437, |
|
"eval_samples_per_second": 6.835, |
|
"eval_steps_per_second": 6.835, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 64.46011352539062, |
|
"learning_rate": 1.5702479338842977e-06, |
|
"loss": 2.7017, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 7.751842975616455, |
|
"learning_rate": 1.4876033057851241e-06, |
|
"loss": 3.2391, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 2.433881998062134, |
|
"eval_runtime": 138.8671, |
|
"eval_samples_per_second": 6.927, |
|
"eval_steps_per_second": 6.927, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 81.50652313232422, |
|
"learning_rate": 1.4049586776859506e-06, |
|
"loss": 2.9904, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 57.682987213134766, |
|
"learning_rate": 1.322314049586777e-06, |
|
"loss": 2.8742, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 2.4209775924682617, |
|
"eval_runtime": 138.7625, |
|
"eval_samples_per_second": 6.933, |
|
"eval_steps_per_second": 6.933, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 52.617713928222656, |
|
"learning_rate": 1.2396694214876035e-06, |
|
"loss": 2.9336, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 123.95398712158203, |
|
"learning_rate": 1.15702479338843e-06, |
|
"loss": 2.7248, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 2.4292454719543457, |
|
"eval_runtime": 138.6689, |
|
"eval_samples_per_second": 6.937, |
|
"eval_steps_per_second": 6.937, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 122.3961410522461, |
|
"learning_rate": 1.0743801652892562e-06, |
|
"loss": 2.9363, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.238726258277893, |
|
"learning_rate": 9.917355371900827e-07, |
|
"loss": 3.0417, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 2.4144928455352783, |
|
"eval_runtime": 139.5698, |
|
"eval_samples_per_second": 6.893, |
|
"eval_steps_per_second": 6.893, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 29.5744571685791, |
|
"learning_rate": 9.090909090909091e-07, |
|
"loss": 2.7703, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 57.384883880615234, |
|
"learning_rate": 8.264462809917356e-07, |
|
"loss": 3.0549, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 2.4143543243408203, |
|
"eval_runtime": 139.2695, |
|
"eval_samples_per_second": 6.907, |
|
"eval_steps_per_second": 6.907, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 108.1037826538086, |
|
"learning_rate": 7.438016528925621e-07, |
|
"loss": 2.9309, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 68.91290283203125, |
|
"learning_rate": 6.611570247933885e-07, |
|
"loss": 2.8769, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 2.4317731857299805, |
|
"eval_runtime": 139.2728, |
|
"eval_samples_per_second": 6.907, |
|
"eval_steps_per_second": 6.907, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 79.56616973876953, |
|
"learning_rate": 5.78512396694215e-07, |
|
"loss": 2.9857, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.2761176824569702, |
|
"learning_rate": 4.958677685950413e-07, |
|
"loss": 2.9639, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 2.425485849380493, |
|
"eval_runtime": 139.7857, |
|
"eval_samples_per_second": 6.882, |
|
"eval_steps_per_second": 6.882, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 73.10523986816406, |
|
"learning_rate": 4.132231404958678e-07, |
|
"loss": 2.6807, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 17.360837936401367, |
|
"learning_rate": 3.3057851239669426e-07, |
|
"loss": 2.7351, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"eval_loss": 2.436006546020508, |
|
"eval_runtime": 139.6881, |
|
"eval_samples_per_second": 6.887, |
|
"eval_steps_per_second": 6.887, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.4898810386657715, |
|
"learning_rate": 2.4793388429752067e-07, |
|
"loss": 2.9617, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 76.47006225585938, |
|
"learning_rate": 1.6528925619834713e-07, |
|
"loss": 2.8396, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_loss": 2.429133176803589, |
|
"eval_runtime": 138.7508, |
|
"eval_samples_per_second": 6.933, |
|
"eval_steps_per_second": 6.933, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 143.42933654785156, |
|
"learning_rate": 8.264462809917357e-08, |
|
"loss": 2.9364, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 37.042301177978516, |
|
"learning_rate": 0.0, |
|
"loss": 2.6683, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.4457602500915527, |
|
"eval_runtime": 136.676, |
|
"eval_samples_per_second": 7.039, |
|
"eval_steps_per_second": 7.039, |
|
"step": 121000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 121000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"total_flos": 3.0771197488774195e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|