|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997049277072882, |
|
"eval_steps": 500, |
|
"global_step": 847, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011802891708468575, |
|
"grad_norm": 1.0318430662155151, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 2.0002, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02360578341693715, |
|
"grad_norm": 0.7277640104293823, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 1.5384, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03540867512540572, |
|
"grad_norm": 0.5723221898078918, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 1.2868, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0472115668338743, |
|
"grad_norm": 0.5436434149742126, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 1.195, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05901445854234287, |
|
"grad_norm": 0.5486343502998352, |
|
"learning_rate": 2.9411764705882354e-05, |
|
"loss": 1.099, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07081735025081144, |
|
"grad_norm": 0.540969967842102, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 1.0529, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08262024195928003, |
|
"grad_norm": 0.5404631495475769, |
|
"learning_rate": 4.11764705882353e-05, |
|
"loss": 0.9962, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0944231336677486, |
|
"grad_norm": 0.7088269591331482, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 1.0458, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10622602537621717, |
|
"grad_norm": 0.5803569555282593, |
|
"learning_rate": 4.9994688411216076e-05, |
|
"loss": 1.0237, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11802891708468574, |
|
"grad_norm": 0.5251590013504028, |
|
"learning_rate": 4.99522092422138e-05, |
|
"loss": 1.005, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1298318087931543, |
|
"grad_norm": 0.5402054786682129, |
|
"learning_rate": 4.986732309873992e-05, |
|
"loss": 0.9771, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14163470050162288, |
|
"grad_norm": 0.5150293111801147, |
|
"learning_rate": 4.9740174247159156e-05, |
|
"loss": 0.9695, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15343759221009148, |
|
"grad_norm": 0.6688190698623657, |
|
"learning_rate": 4.95709787804856e-05, |
|
"loss": 0.9605, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16524048391856005, |
|
"grad_norm": 0.4473928213119507, |
|
"learning_rate": 4.936002425112657e-05, |
|
"loss": 0.8765, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17704337562702863, |
|
"grad_norm": 0.5843707919120789, |
|
"learning_rate": 4.910766918217935e-05, |
|
"loss": 0.9371, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1888462673354972, |
|
"grad_norm": 0.5653939247131348, |
|
"learning_rate": 4.881434245811115e-05, |
|
"loss": 0.9091, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20064915904396577, |
|
"grad_norm": 0.725297212600708, |
|
"learning_rate": 4.8480542595858025e-05, |
|
"loss": 0.9217, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21245205075243434, |
|
"grad_norm": 0.4762970805168152, |
|
"learning_rate": 4.810683689758147e-05, |
|
"loss": 0.9, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2242549424609029, |
|
"grad_norm": 0.6104872226715088, |
|
"learning_rate": 4.7693860486522604e-05, |
|
"loss": 0.8662, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23605783416937148, |
|
"grad_norm": 0.5987859964370728, |
|
"learning_rate": 4.7242315227592496e-05, |
|
"loss": 0.8754, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24786072587784008, |
|
"grad_norm": 0.5462383031845093, |
|
"learning_rate": 4.675296853453326e-05, |
|
"loss": 0.8838, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2596636175863086, |
|
"grad_norm": 0.7473201155662537, |
|
"learning_rate": 4.6226652065676974e-05, |
|
"loss": 0.8798, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2714665092947772, |
|
"grad_norm": 0.6000027656555176, |
|
"learning_rate": 4.566426031051922e-05, |
|
"loss": 0.9065, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28326940100324577, |
|
"grad_norm": 0.6105000972747803, |
|
"learning_rate": 4.506674906950929e-05, |
|
"loss": 0.9111, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29507229271171437, |
|
"grad_norm": 0.5564777851104736, |
|
"learning_rate": 4.4435133829640645e-05, |
|
"loss": 0.8646, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.30687518442018297, |
|
"grad_norm": 0.6954275369644165, |
|
"learning_rate": 4.3770488038602555e-05, |
|
"loss": 0.8485, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3186780761286515, |
|
"grad_norm": 0.8191194534301758, |
|
"learning_rate": 4.30739412804258e-05, |
|
"loss": 0.826, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3304809678371201, |
|
"grad_norm": 0.6449839472770691, |
|
"learning_rate": 4.234667735572323e-05, |
|
"loss": 0.8685, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34228385954558865, |
|
"grad_norm": 0.7887718677520752, |
|
"learning_rate": 4.158993226978757e-05, |
|
"loss": 0.8229, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.35408675125405725, |
|
"grad_norm": 0.764539361000061, |
|
"learning_rate": 4.080499213196607e-05, |
|
"loss": 0.8303, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3658896429625258, |
|
"grad_norm": 0.6305603384971619, |
|
"learning_rate": 3.999319096988183e-05, |
|
"loss": 0.829, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3776925346709944, |
|
"grad_norm": 0.5482339859008789, |
|
"learning_rate": 3.915590846221669e-05, |
|
"loss": 0.8356, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.389495426379463, |
|
"grad_norm": 0.6555970311164856, |
|
"learning_rate": 3.8294567593908915e-05, |
|
"loss": 0.8281, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.40129831808793154, |
|
"grad_norm": 0.8127148151397705, |
|
"learning_rate": 3.741063223775066e-05, |
|
"loss": 0.8543, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41310120979640014, |
|
"grad_norm": 0.8948593735694885, |
|
"learning_rate": 3.650560466649538e-05, |
|
"loss": 0.8639, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4249041015048687, |
|
"grad_norm": 0.6402966976165771, |
|
"learning_rate": 3.5581022999703464e-05, |
|
"loss": 0.8324, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4367069932133373, |
|
"grad_norm": 0.6675844192504883, |
|
"learning_rate": 3.4638458589665194e-05, |
|
"loss": 0.8265, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4485098849218058, |
|
"grad_norm": 0.6756200194358826, |
|
"learning_rate": 3.367951335084379e-05, |
|
"loss": 0.7834, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4603127766302744, |
|
"grad_norm": 0.7358006834983826, |
|
"learning_rate": 3.270581703737716e-05, |
|
"loss": 0.8107, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47211566833874297, |
|
"grad_norm": 0.6496703028678894, |
|
"learning_rate": 3.171902447326536e-05, |
|
"loss": 0.8055, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.48391856004721157, |
|
"grad_norm": 0.6885930895805359, |
|
"learning_rate": 3.07208127399511e-05, |
|
"loss": 0.8249, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.49572145175568016, |
|
"grad_norm": 0.7303836941719055, |
|
"learning_rate": 2.9712878326073168e-05, |
|
"loss": 0.8054, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5075243434641488, |
|
"grad_norm": 0.6711559295654297, |
|
"learning_rate": 2.869693424423673e-05, |
|
"loss": 0.7779, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5193272351726173, |
|
"grad_norm": 0.6829948425292969, |
|
"learning_rate": 2.767470711970067e-05, |
|
"loss": 0.7729, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5311301268810859, |
|
"grad_norm": 0.6073248386383057, |
|
"learning_rate": 2.6647934255929933e-05, |
|
"loss": 0.7867, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5429330185895545, |
|
"grad_norm": 0.7291135787963867, |
|
"learning_rate": 2.5618360681999876e-05, |
|
"loss": 0.7751, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.554735910298023, |
|
"grad_norm": 0.6531949043273926, |
|
"learning_rate": 2.4587736186870766e-05, |
|
"loss": 0.7979, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5665388020064915, |
|
"grad_norm": 0.5947457551956177, |
|
"learning_rate": 2.3557812345572718e-05, |
|
"loss": 0.7807, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5783416937149601, |
|
"grad_norm": 0.7103855609893799, |
|
"learning_rate": 2.2530339542355145e-05, |
|
"loss": 0.8293, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5901445854234287, |
|
"grad_norm": 1.0487534999847412, |
|
"learning_rate": 2.150706399585999e-05, |
|
"loss": 0.798, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6019474771318973, |
|
"grad_norm": 0.8106992244720459, |
|
"learning_rate": 2.048972479137449e-05, |
|
"loss": 0.7426, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6137503688403659, |
|
"grad_norm": 0.6543154120445251, |
|
"learning_rate": 1.948005092520735e-05, |
|
"loss": 0.7813, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6255532605488344, |
|
"grad_norm": 0.6375657916069031, |
|
"learning_rate": 1.8479758366211334e-05, |
|
"loss": 0.7701, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.637356152257303, |
|
"grad_norm": 0.6001560091972351, |
|
"learning_rate": 1.7490547139446407e-05, |
|
"loss": 0.7777, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6491590439657716, |
|
"grad_norm": 0.7287290096282959, |
|
"learning_rate": 1.6514098436939835e-05, |
|
"loss": 0.7693, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6609619356742402, |
|
"grad_norm": 0.6269923448562622, |
|
"learning_rate": 1.555207176045349e-05, |
|
"loss": 0.7672, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6727648273827088, |
|
"grad_norm": 0.622016966342926, |
|
"learning_rate": 1.4606102101114391e-05, |
|
"loss": 0.7504, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6845677190911773, |
|
"grad_norm": 0.5838598012924194, |
|
"learning_rate": 1.367779716070179e-05, |
|
"loss": 0.7865, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6963706107996459, |
|
"grad_norm": 0.656366765499115, |
|
"learning_rate": 1.2768734619313147e-05, |
|
"loss": 0.7696, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7081735025081145, |
|
"grad_norm": 0.6976104378700256, |
|
"learning_rate": 1.188045945405299e-05, |
|
"loss": 0.7652, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7199763942165831, |
|
"grad_norm": 0.7407099604606628, |
|
"learning_rate": 1.1014481313301172e-05, |
|
"loss": 0.7533, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7317792859250516, |
|
"grad_norm": 0.5191411375999451, |
|
"learning_rate": 1.017227195102352e-05, |
|
"loss": 0.7578, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7435821776335202, |
|
"grad_norm": 0.6771509051322937, |
|
"learning_rate": 9.355262725484901e-06, |
|
"loss": 0.7768, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7553850693419888, |
|
"grad_norm": 0.6330916881561279, |
|
"learning_rate": 8.564842166616047e-06, |
|
"loss": 0.7071, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7671879610504574, |
|
"grad_norm": 0.693899929523468, |
|
"learning_rate": 7.802353616168229e-06, |
|
"loss": 0.7544, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.778990852758926, |
|
"grad_norm": 0.6973963379859924, |
|
"learning_rate": 7.069092944666586e-06, |
|
"loss": 0.7418, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7907937444673945, |
|
"grad_norm": 0.758264422416687, |
|
"learning_rate": 6.3663063490420336e-06, |
|
"loss": 0.7564, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8025966361758631, |
|
"grad_norm": 0.6236333847045898, |
|
"learning_rate": 5.695188234684898e-06, |
|
"loss": 0.7431, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8143995278843317, |
|
"grad_norm": 0.6301143169403076, |
|
"learning_rate": 5.056879185519714e-06, |
|
"loss": 0.7307, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8262024195928003, |
|
"grad_norm": 0.5712493062019348, |
|
"learning_rate": 4.452464025551037e-06, |
|
"loss": 0.7157, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8380053113012688, |
|
"grad_norm": 0.6849854588508606, |
|
"learning_rate": 3.8829699751748885e-06, |
|
"loss": 0.7367, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8498082030097374, |
|
"grad_norm": 0.6399794816970825, |
|
"learning_rate": 3.3493649053890326e-06, |
|
"loss": 0.7288, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.861611094718206, |
|
"grad_norm": 0.8012081384658813, |
|
"learning_rate": 2.8525556928693186e-06, |
|
"loss": 0.7237, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8734139864266746, |
|
"grad_norm": 0.7375155687332153, |
|
"learning_rate": 2.3933866787074627e-06, |
|
"loss": 0.7543, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8852168781351432, |
|
"grad_norm": 0.6023644208908081, |
|
"learning_rate": 1.9726382334298883e-06, |
|
"loss": 0.74, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8970197698436116, |
|
"grad_norm": 0.6464205980300903, |
|
"learning_rate": 1.5910254307362705e-06, |
|
"loss": 0.7578, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9088226615520802, |
|
"grad_norm": 0.6287794709205627, |
|
"learning_rate": 1.2491968322118685e-06, |
|
"loss": 0.7513, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9206255532605488, |
|
"grad_norm": 0.6092919707298279, |
|
"learning_rate": 9.477333850790554e-07, |
|
"loss": 0.7187, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9324284449690174, |
|
"grad_norm": 0.6522098183631897, |
|
"learning_rate": 6.871474348613266e-07, |
|
"loss": 0.7519, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9442313366774859, |
|
"grad_norm": 0.5721604228019714, |
|
"learning_rate": 4.678818546378333e-07, |
|
"loss": 0.7502, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9560342283859545, |
|
"grad_norm": 0.8141267895698547, |
|
"learning_rate": 2.903092923682266e-07, |
|
"loss": 0.7512, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9678371200944231, |
|
"grad_norm": 0.5925819277763367, |
|
"learning_rate": 1.5473153756709046e-07, |
|
"loss": 0.795, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9796400118028917, |
|
"grad_norm": 0.599314272403717, |
|
"learning_rate": 6.137900840425815e-08, |
|
"loss": 0.7319, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9914429035113603, |
|
"grad_norm": 0.7140465974807739, |
|
"learning_rate": 1.0410360102702799e-08, |
|
"loss": 0.7747, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9997049277072882, |
|
"step": 847, |
|
"total_flos": 6.067900108221972e+17, |
|
"train_loss": 0.8547630963105941, |
|
"train_runtime": 5769.0005, |
|
"train_samples_per_second": 4.699, |
|
"train_steps_per_second": 0.147 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 847, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.067900108221972e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|