{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996415770609319,
  "eval_steps": 500,
  "global_step": 1254,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023894862604540025,
      "grad_norm": 4.236361518666415,
      "learning_rate": 5e-06,
      "loss": 0.881,
      "step": 10
    },
    {
      "epoch": 0.04778972520908005,
      "grad_norm": 5.741199257309677,
      "learning_rate": 5e-06,
      "loss": 0.7803,
      "step": 20
    },
    {
      "epoch": 0.07168458781362007,
      "grad_norm": 19.255957544263854,
      "learning_rate": 5e-06,
      "loss": 0.7636,
      "step": 30
    },
    {
      "epoch": 0.0955794504181601,
      "grad_norm": 1.4389740219584857,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 40
    },
    {
      "epoch": 0.11947431302270012,
      "grad_norm": 2.052033228969374,
      "learning_rate": 5e-06,
      "loss": 0.7307,
      "step": 50
    },
    {
      "epoch": 0.14336917562724014,
      "grad_norm": 1.4883942502737415,
      "learning_rate": 5e-06,
      "loss": 0.7194,
      "step": 60
    },
    {
      "epoch": 0.16726403823178015,
      "grad_norm": 0.7721229562083435,
      "learning_rate": 5e-06,
      "loss": 0.7186,
      "step": 70
    },
    {
      "epoch": 0.1911589008363202,
      "grad_norm": 0.5683129469939435,
      "learning_rate": 5e-06,
      "loss": 0.6965,
      "step": 80
    },
    {
      "epoch": 0.21505376344086022,
      "grad_norm": 0.527733611127623,
      "learning_rate": 5e-06,
      "loss": 0.6922,
      "step": 90
    },
    {
      "epoch": 0.23894862604540024,
      "grad_norm": 0.5540046789983225,
      "learning_rate": 5e-06,
      "loss": 0.693,
      "step": 100
    },
    {
      "epoch": 0.2628434886499403,
      "grad_norm": 0.5451390307514128,
      "learning_rate": 5e-06,
      "loss": 0.6802,
      "step": 110
    },
    {
      "epoch": 0.2867383512544803,
      "grad_norm": 0.5143838898116624,
      "learning_rate": 5e-06,
      "loss": 0.688,
      "step": 120
    },
    {
      "epoch": 0.3106332138590203,
      "grad_norm": 0.6502984472755421,
      "learning_rate": 5e-06,
      "loss": 0.6838,
      "step": 130
    },
    {
      "epoch": 0.3345280764635603,
      "grad_norm": 0.5635569077666838,
      "learning_rate": 5e-06,
      "loss": 0.6733,
      "step": 140
    },
    {
      "epoch": 0.35842293906810035,
      "grad_norm": 0.6029469287016763,
      "learning_rate": 5e-06,
      "loss": 0.6776,
      "step": 150
    },
    {
      "epoch": 0.3823178016726404,
      "grad_norm": 0.486292600711864,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 160
    },
    {
      "epoch": 0.4062126642771804,
      "grad_norm": 0.6615883711779132,
      "learning_rate": 5e-06,
      "loss": 0.6652,
      "step": 170
    },
    {
      "epoch": 0.43010752688172044,
      "grad_norm": 0.4717863479299739,
      "learning_rate": 5e-06,
      "loss": 0.6655,
      "step": 180
    },
    {
      "epoch": 0.4540023894862604,
      "grad_norm": 0.4888275284899482,
      "learning_rate": 5e-06,
      "loss": 0.662,
      "step": 190
    },
    {
      "epoch": 0.4778972520908005,
      "grad_norm": 0.5394213188181476,
      "learning_rate": 5e-06,
      "loss": 0.6687,
      "step": 200
    },
    {
      "epoch": 0.5017921146953405,
      "grad_norm": 0.45576158948311507,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 210
    },
    {
      "epoch": 0.5256869772998806,
      "grad_norm": 0.45151984287636476,
      "learning_rate": 5e-06,
      "loss": 0.6597,
      "step": 220
    },
    {
      "epoch": 0.5495818399044206,
      "grad_norm": 0.4799647706900106,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 230
    },
    {
      "epoch": 0.5734767025089605,
      "grad_norm": 0.47076133511342133,
      "learning_rate": 5e-06,
      "loss": 0.6571,
      "step": 240
    },
    {
      "epoch": 0.5973715651135006,
      "grad_norm": 0.5919116297131423,
      "learning_rate": 5e-06,
      "loss": 0.6615,
      "step": 250
    },
    {
      "epoch": 0.6212664277180406,
      "grad_norm": 0.5500231129527917,
      "learning_rate": 5e-06,
      "loss": 0.6585,
      "step": 260
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 0.5242998976170237,
      "learning_rate": 5e-06,
      "loss": 0.6633,
      "step": 270
    },
    {
      "epoch": 0.6690561529271206,
      "grad_norm": 0.44132900428051,
      "learning_rate": 5e-06,
      "loss": 0.6588,
      "step": 280
    },
    {
      "epoch": 0.6929510155316607,
      "grad_norm": 0.6925054556015406,
      "learning_rate": 5e-06,
      "loss": 0.6553,
      "step": 290
    },
    {
      "epoch": 0.7168458781362007,
      "grad_norm": 0.4625241785333385,
      "learning_rate": 5e-06,
      "loss": 0.6574,
      "step": 300
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.4229402308269957,
      "learning_rate": 5e-06,
      "loss": 0.6527,
      "step": 310
    },
    {
      "epoch": 0.7646356033452808,
      "grad_norm": 0.5130609463277542,
      "learning_rate": 5e-06,
      "loss": 0.6561,
      "step": 320
    },
    {
      "epoch": 0.7885304659498208,
      "grad_norm": 0.6838274381409521,
      "learning_rate": 5e-06,
      "loss": 0.6555,
      "step": 330
    },
    {
      "epoch": 0.8124253285543608,
      "grad_norm": 0.4426103821343896,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 340
    },
    {
      "epoch": 0.8363201911589009,
      "grad_norm": 0.4768048776745041,
      "learning_rate": 5e-06,
      "loss": 0.6526,
      "step": 350
    },
    {
      "epoch": 0.8602150537634409,
      "grad_norm": 0.47979657505843953,
      "learning_rate": 5e-06,
      "loss": 0.6507,
      "step": 360
    },
    {
      "epoch": 0.8841099163679809,
      "grad_norm": 0.43210991398577236,
      "learning_rate": 5e-06,
      "loss": 0.6545,
      "step": 370
    },
    {
      "epoch": 0.9080047789725209,
      "grad_norm": 0.4219482631866451,
      "learning_rate": 5e-06,
      "loss": 0.6561,
      "step": 380
    },
    {
      "epoch": 0.931899641577061,
      "grad_norm": 0.4889263682317913,
      "learning_rate": 5e-06,
      "loss": 0.6415,
      "step": 390
    },
    {
      "epoch": 0.955794504181601,
      "grad_norm": 0.4994356501839893,
      "learning_rate": 5e-06,
      "loss": 0.6434,
      "step": 400
    },
    {
      "epoch": 0.9796893667861409,
      "grad_norm": 0.5756138907013993,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 410
    },
    {
      "epoch": 0.998805256869773,
      "eval_loss": 0.6466529965400696,
      "eval_runtime": 225.9354,
      "eval_samples_per_second": 49.895,
      "eval_steps_per_second": 0.394,
      "step": 418
    },
    {
      "epoch": 1.003584229390681,
      "grad_norm": 0.7256416169536216,
      "learning_rate": 5e-06,
      "loss": 0.6438,
      "step": 420
    },
    {
      "epoch": 1.027479091995221,
      "grad_norm": 0.6564158902335233,
      "learning_rate": 5e-06,
      "loss": 0.6108,
      "step": 430
    },
    {
      "epoch": 1.0513739545997611,
      "grad_norm": 0.4999679801637927,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 440
    },
    {
      "epoch": 1.075268817204301,
      "grad_norm": 0.5241048691611577,
      "learning_rate": 5e-06,
      "loss": 0.6124,
      "step": 450
    },
    {
      "epoch": 1.099163679808841,
      "grad_norm": 0.5456228664692746,
      "learning_rate": 5e-06,
      "loss": 0.6042,
      "step": 460
    },
    {
      "epoch": 1.1230585424133812,
      "grad_norm": 0.5456744152195628,
      "learning_rate": 5e-06,
      "loss": 0.6028,
      "step": 470
    },
    {
      "epoch": 1.146953405017921,
      "grad_norm": 0.4664933079979728,
      "learning_rate": 5e-06,
      "loss": 0.6095,
      "step": 480
    },
    {
      "epoch": 1.1708482676224612,
      "grad_norm": 0.4894583019401931,
      "learning_rate": 5e-06,
      "loss": 0.6019,
      "step": 490
    },
    {
      "epoch": 1.194743130227001,
      "grad_norm": 0.4942642519947347,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 500
    },
    {
      "epoch": 1.2186379928315412,
      "grad_norm": 0.46554339302452813,
      "learning_rate": 5e-06,
      "loss": 0.6099,
      "step": 510
    },
    {
      "epoch": 1.2425328554360813,
      "grad_norm": 0.5215764597896382,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 520
    },
    {
      "epoch": 1.2664277180406214,
      "grad_norm": 0.5142341654295087,
      "learning_rate": 5e-06,
      "loss": 0.6105,
      "step": 530
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 0.4429903840954624,
      "learning_rate": 5e-06,
      "loss": 0.61,
      "step": 540
    },
    {
      "epoch": 1.3142174432497014,
      "grad_norm": 0.4244756990330428,
      "learning_rate": 5e-06,
      "loss": 0.6113,
      "step": 550
    },
    {
      "epoch": 1.3381123058542412,
      "grad_norm": 0.4664930270424248,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 560
    },
    {
      "epoch": 1.3620071684587813,
      "grad_norm": 0.6747787167132405,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 570
    },
    {
      "epoch": 1.3859020310633214,
      "grad_norm": 0.8515989236641928,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 580
    },
    {
      "epoch": 1.4097968936678615,
      "grad_norm": 0.634857639704424,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 590
    },
    {
      "epoch": 1.4336917562724014,
      "grad_norm": 0.5282115500074044,
      "learning_rate": 5e-06,
      "loss": 0.6096,
      "step": 600
    },
    {
      "epoch": 1.4575866188769415,
      "grad_norm": 0.5576953727126037,
      "learning_rate": 5e-06,
      "loss": 0.616,
      "step": 610
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.45965397939992636,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 620
    },
    {
      "epoch": 1.5053763440860215,
      "grad_norm": 0.5729607655893968,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 630
    },
    {
      "epoch": 1.5292712066905616,
      "grad_norm": 0.4420855639504453,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 640
    },
    {
      "epoch": 1.5531660692951017,
      "grad_norm": 0.4815965030552482,
      "learning_rate": 5e-06,
      "loss": 0.6053,
      "step": 650
    },
    {
      "epoch": 1.5770609318996416,
      "grad_norm": 0.5446732967871324,
      "learning_rate": 5e-06,
      "loss": 0.6076,
      "step": 660
    },
    {
      "epoch": 1.6009557945041815,
      "grad_norm": 0.5773921107864519,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 670
    },
    {
      "epoch": 1.6248506571087216,
      "grad_norm": 0.44904612161350127,
      "learning_rate": 5e-06,
      "loss": 0.5988,
      "step": 680
    },
    {
      "epoch": 1.6487455197132617,
      "grad_norm": 0.4659803956684399,
      "learning_rate": 5e-06,
      "loss": 0.598,
      "step": 690
    },
    {
      "epoch": 1.6726403823178018,
      "grad_norm": 0.4361474003132107,
      "learning_rate": 5e-06,
      "loss": 0.6081,
      "step": 700
    },
    {
      "epoch": 1.6965352449223416,
      "grad_norm": 0.4702827100539838,
      "learning_rate": 5e-06,
      "loss": 0.5997,
      "step": 710
    },
    {
      "epoch": 1.7204301075268817,
      "grad_norm": 0.46962735672309736,
      "learning_rate": 5e-06,
      "loss": 0.6135,
      "step": 720
    },
    {
      "epoch": 1.7443249701314216,
      "grad_norm": 0.5064462322593579,
      "learning_rate": 5e-06,
      "loss": 0.6034,
      "step": 730
    },
    {
      "epoch": 1.7682198327359617,
      "grad_norm": 0.6442892941899157,
      "learning_rate": 5e-06,
      "loss": 0.6044,
      "step": 740
    },
    {
      "epoch": 1.7921146953405018,
      "grad_norm": 0.449859458258856,
      "learning_rate": 5e-06,
      "loss": 0.6062,
      "step": 750
    },
    {
      "epoch": 1.816009557945042,
      "grad_norm": 0.47467567108778363,
      "learning_rate": 5e-06,
      "loss": 0.6035,
      "step": 760
    },
    {
      "epoch": 1.8399044205495818,
      "grad_norm": 0.43550415026449085,
      "learning_rate": 5e-06,
      "loss": 0.5987,
      "step": 770
    },
    {
      "epoch": 1.863799283154122,
      "grad_norm": 0.48913780227876247,
      "learning_rate": 5e-06,
      "loss": 0.6031,
      "step": 780
    },
    {
      "epoch": 1.8876941457586618,
      "grad_norm": 0.5594004132295759,
      "learning_rate": 5e-06,
      "loss": 0.5995,
      "step": 790
    },
    {
      "epoch": 1.911589008363202,
      "grad_norm": 0.4971730954697683,
      "learning_rate": 5e-06,
      "loss": 0.6056,
      "step": 800
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 0.46186692571258725,
      "learning_rate": 5e-06,
      "loss": 0.6004,
      "step": 810
    },
    {
      "epoch": 1.959378733572282,
      "grad_norm": 0.4508830943663248,
      "learning_rate": 5e-06,
      "loss": 0.604,
      "step": 820
    },
    {
      "epoch": 1.983273596176822,
      "grad_norm": 0.501207898912081,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 830
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6365451812744141,
      "eval_runtime": 226.5918,
      "eval_samples_per_second": 49.75,
      "eval_steps_per_second": 0.393,
      "step": 837
    },
    {
      "epoch": 2.007168458781362,
      "grad_norm": 0.85054207213812,
      "learning_rate": 5e-06,
      "loss": 0.592,
      "step": 840
    },
    {
      "epoch": 2.031063321385902,
      "grad_norm": 0.5201507086108782,
      "learning_rate": 5e-06,
      "loss": 0.5608,
      "step": 850
    },
    {
      "epoch": 2.054958183990442,
      "grad_norm": 0.7188535226812537,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 860
    },
    {
      "epoch": 2.078853046594982,
      "grad_norm": 0.4763195641282365,
      "learning_rate": 5e-06,
      "loss": 0.5644,
      "step": 870
    },
    {
      "epoch": 2.1027479091995223,
      "grad_norm": 0.5081415859208832,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 880
    },
    {
      "epoch": 2.126642771804062,
      "grad_norm": 0.5931792000293172,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 890
    },
    {
      "epoch": 2.150537634408602,
      "grad_norm": 0.49851033855755,
      "learning_rate": 5e-06,
      "loss": 0.5611,
      "step": 900
    },
    {
      "epoch": 2.174432497013142,
      "grad_norm": 0.5379278365329638,
      "learning_rate": 5e-06,
      "loss": 0.5573,
      "step": 910
    },
    {
      "epoch": 2.198327359617682,
      "grad_norm": 0.5350268044233742,
      "learning_rate": 5e-06,
      "loss": 0.5678,
      "step": 920
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5689123372030673,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 930
    },
    {
      "epoch": 2.2461170848267624,
      "grad_norm": 0.6671996296787344,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 940
    },
    {
      "epoch": 2.270011947431302,
      "grad_norm": 0.4506810352733908,
      "learning_rate": 5e-06,
      "loss": 0.5642,
      "step": 950
    },
    {
      "epoch": 2.293906810035842,
      "grad_norm": 0.5358151280205125,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 960
    },
    {
      "epoch": 2.3178016726403823,
      "grad_norm": 0.5566771627404731,
      "learning_rate": 5e-06,
      "loss": 0.5634,
      "step": 970
    },
    {
      "epoch": 2.3416965352449224,
      "grad_norm": 0.49963936030628325,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 980
    },
    {
      "epoch": 2.3655913978494625,
      "grad_norm": 0.48679480824629434,
      "learning_rate": 5e-06,
      "loss": 0.5583,
      "step": 990
    },
    {
      "epoch": 2.389486260454002,
      "grad_norm": 0.5074816823498985,
      "learning_rate": 5e-06,
      "loss": 0.5636,
      "step": 1000
    },
    {
      "epoch": 2.4133811230585422,
      "grad_norm": 0.5739148795335686,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 1010
    },
    {
      "epoch": 2.4372759856630823,
      "grad_norm": 0.6501742104516552,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 1020
    },
    {
      "epoch": 2.4611708482676224,
      "grad_norm": 0.43406800220014613,
      "learning_rate": 5e-06,
      "loss": 0.5645,
      "step": 1030
    },
    {
      "epoch": 2.4850657108721625,
      "grad_norm": 0.47946981158627366,
      "learning_rate": 5e-06,
      "loss": 0.5612,
      "step": 1040
    },
    {
      "epoch": 2.5089605734767026,
      "grad_norm": 0.5508677225984592,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 1050
    },
    {
      "epoch": 2.5328554360812428,
      "grad_norm": 0.6172108213167418,
      "learning_rate": 5e-06,
      "loss": 0.5656,
      "step": 1060
    },
    {
      "epoch": 2.5567502986857824,
      "grad_norm": 0.6149816712572169,
      "learning_rate": 5e-06,
      "loss": 0.5637,
      "step": 1070
    },
    {
      "epoch": 2.5806451612903225,
      "grad_norm": 0.5494076230620691,
      "learning_rate": 5e-06,
      "loss": 0.5748,
      "step": 1080
    },
    {
      "epoch": 2.6045400238948626,
      "grad_norm": 0.5098015036653776,
      "learning_rate": 5e-06,
      "loss": 0.5665,
      "step": 1090
    },
    {
      "epoch": 2.6284348864994027,
      "grad_norm": 0.4763003977246298,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 1100
    },
    {
      "epoch": 2.652329749103943,
      "grad_norm": 0.45015059391064355,
      "learning_rate": 5e-06,
      "loss": 0.5613,
      "step": 1110
    },
    {
      "epoch": 2.6762246117084825,
      "grad_norm": 0.5195016081388676,
      "learning_rate": 5e-06,
      "loss": 0.5661,
      "step": 1120
    },
    {
      "epoch": 2.7001194743130226,
      "grad_norm": 0.461979850463992,
      "learning_rate": 5e-06,
      "loss": 0.5703,
      "step": 1130
    },
    {
      "epoch": 2.7240143369175627,
      "grad_norm": 0.4611698536891998,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 1140
    },
    {
      "epoch": 2.7479091995221028,
      "grad_norm": 0.5474996121575114,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 1150
    },
    {
      "epoch": 2.771804062126643,
      "grad_norm": 0.477411852958178,
      "learning_rate": 5e-06,
      "loss": 0.5745,
      "step": 1160
    },
    {
      "epoch": 2.795698924731183,
      "grad_norm": 0.48004817339516165,
      "learning_rate": 5e-06,
      "loss": 0.5625,
      "step": 1170
    },
    {
      "epoch": 2.819593787335723,
      "grad_norm": 0.5043226922994581,
      "learning_rate": 5e-06,
      "loss": 0.5664,
      "step": 1180
    },
    {
      "epoch": 2.8434886499402627,
      "grad_norm": 0.4988305698181874,
      "learning_rate": 5e-06,
      "loss": 0.5649,
      "step": 1190
    },
    {
      "epoch": 2.867383512544803,
      "grad_norm": 0.4569103859069353,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 1200
    },
    {
      "epoch": 2.891278375149343,
      "grad_norm": 0.46286445346886024,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 1210
    },
    {
      "epoch": 2.915173237753883,
      "grad_norm": 0.5296890930558641,
      "learning_rate": 5e-06,
      "loss": 0.5668,
      "step": 1220
    },
    {
      "epoch": 2.9390681003584227,
      "grad_norm": 0.5546209266748766,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 1230
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.5910470543653078,
      "learning_rate": 5e-06,
      "loss": 0.5668,
      "step": 1240
    },
    {
      "epoch": 2.986857825567503,
      "grad_norm": 0.5524701632367459,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 1250
    },
    {
      "epoch": 2.996415770609319,
      "eval_loss": 0.6373269557952881,
      "eval_runtime": 227.0283,
      "eval_samples_per_second": 49.655,
      "eval_steps_per_second": 0.392,
      "step": 1254
    },
    {
      "epoch": 2.996415770609319,
      "step": 1254,
      "total_flos": 2100077946470400.0,
      "train_loss": 0.6173035094612523,
      "train_runtime": 37751.1113,
      "train_samples_per_second": 17.021,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1254,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2100077946470400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}