|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 7679, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013022528975126969, |
|
"grad_norm": 0.13363803923130035, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 2.4964, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.026045057950253938, |
|
"grad_norm": 0.2602158486843109, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 2.484, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03906758692538091, |
|
"grad_norm": 0.3781226873397827, |
|
"learning_rate": 2e-05, |
|
"loss": 2.4035, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.052090115900507876, |
|
"grad_norm": 0.4238300323486328, |
|
"learning_rate": 1.999093831153269e-05, |
|
"loss": 2.3381, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06511264487563485, |
|
"grad_norm": 0.4826453924179077, |
|
"learning_rate": 1.9963769668970327e-05, |
|
"loss": 2.3359, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07813517385076182, |
|
"grad_norm": 0.5577328205108643, |
|
"learning_rate": 1.991854331106791e-05, |
|
"loss": 2.3252, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09115770282588878, |
|
"grad_norm": 0.6093964576721191, |
|
"learning_rate": 1.9855341203258605e-05, |
|
"loss": 2.3003, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.10418023180101575, |
|
"grad_norm": 0.6448187828063965, |
|
"learning_rate": 1.9774277889104696e-05, |
|
"loss": 2.2692, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.11720276077614272, |
|
"grad_norm": 0.6290944218635559, |
|
"learning_rate": 1.967550028270599e-05, |
|
"loss": 2.2529, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1302252897512697, |
|
"grad_norm": 0.6941649317741394, |
|
"learning_rate": 1.9559187402441825e-05, |
|
"loss": 2.2357, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.14324781872639666, |
|
"grad_norm": 0.752715528011322, |
|
"learning_rate": 1.942555004652934e-05, |
|
"loss": 2.2442, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.15627034770152365, |
|
"grad_norm": 0.6779051423072815, |
|
"learning_rate": 1.9274830410985915e-05, |
|
"loss": 2.2162, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.1692928766766506, |
|
"grad_norm": 0.6820743083953857, |
|
"learning_rate": 1.9107301650688188e-05, |
|
"loss": 2.2573, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.18231540565177756, |
|
"grad_norm": 1.0248582363128662, |
|
"learning_rate": 1.8923267384323184e-05, |
|
"loss": 2.2348, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.19533793462690455, |
|
"grad_norm": 0.6467209458351135, |
|
"learning_rate": 1.8723061144128728e-05, |
|
"loss": 2.2301, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2083604636020315, |
|
"grad_norm": 0.918159008026123, |
|
"learning_rate": 1.8507045771420383e-05, |
|
"loss": 2.2039, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2213829925771585, |
|
"grad_norm": 0.7963811755180359, |
|
"learning_rate": 1.8275612759000486e-05, |
|
"loss": 2.2139, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.23440552155228545, |
|
"grad_norm": 0.731287956237793, |
|
"learning_rate": 1.8029181541640952e-05, |
|
"loss": 2.1956, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.24742805052741243, |
|
"grad_norm": 0.7656735777854919, |
|
"learning_rate": 1.7768198735925848e-05, |
|
"loss": 2.2251, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.2604505795025394, |
|
"grad_norm": 0.9210675358772278, |
|
"learning_rate": 1.7493137330831318e-05, |
|
"loss": 2.1839, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.27347310847766637, |
|
"grad_norm": 0.6853137016296387, |
|
"learning_rate": 1.7204495830509832e-05, |
|
"loss": 2.1842, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.28649563745279333, |
|
"grad_norm": 0.6904870271682739, |
|
"learning_rate": 1.6902797350832318e-05, |
|
"loss": 2.1832, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.2995181664279203, |
|
"grad_norm": 1.0184141397476196, |
|
"learning_rate": 1.6588588671325554e-05, |
|
"loss": 2.1457, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.3125406954030473, |
|
"grad_norm": 0.8124328851699829, |
|
"learning_rate": 1.626243924422303e-05, |
|
"loss": 2.1382, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.32556322437817425, |
|
"grad_norm": 0.9830845594406128, |
|
"learning_rate": 1.592494016242518e-05, |
|
"loss": 2.1469, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3385857533533012, |
|
"grad_norm": 0.9020227193832397, |
|
"learning_rate": 1.5576703088239455e-05, |
|
"loss": 2.1907, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.35160828232842817, |
|
"grad_norm": 1.0550016164779663, |
|
"learning_rate": 1.5218359144841666e-05, |
|
"loss": 2.1908, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.3646308113035551, |
|
"grad_norm": 0.7753348350524902, |
|
"learning_rate": 1.4850557772467655e-05, |
|
"loss": 2.1503, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.37765334027868214, |
|
"grad_norm": 0.9372894167900085, |
|
"learning_rate": 1.4473965551408284e-05, |
|
"loss": 2.1639, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.3906758692538091, |
|
"grad_norm": 0.7956731915473938, |
|
"learning_rate": 1.4089264993940843e-05, |
|
"loss": 2.1498, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.40369839822893605, |
|
"grad_norm": 0.7356022000312805, |
|
"learning_rate": 1.3697153307386327e-05, |
|
"loss": 2.1705, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.416720927204063, |
|
"grad_norm": 0.864648699760437, |
|
"learning_rate": 1.3298341130534323e-05, |
|
"loss": 2.1514, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.42974345617919, |
|
"grad_norm": 1.0018072128295898, |
|
"learning_rate": 1.2893551245725551e-05, |
|
"loss": 2.1069, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.442765985154317, |
|
"grad_norm": 0.9439816474914551, |
|
"learning_rate": 1.2483517268926188e-05, |
|
"loss": 2.1194, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.45578851412944393, |
|
"grad_norm": 0.8040792346000671, |
|
"learning_rate": 1.2068982320167986e-05, |
|
"loss": 2.1365, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4688110431045709, |
|
"grad_norm": 1.0141490697860718, |
|
"learning_rate": 1.1650697676763833e-05, |
|
"loss": 2.1117, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.4818335720796979, |
|
"grad_norm": 1.0436880588531494, |
|
"learning_rate": 1.1229421411739574e-05, |
|
"loss": 2.1248, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.49485610105482486, |
|
"grad_norm": 1.00360906124115, |
|
"learning_rate": 1.0805917019949665e-05, |
|
"loss": 2.1349, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.5078786300299518, |
|
"grad_norm": 0.9457325339317322, |
|
"learning_rate": 1.0380952034366703e-05, |
|
"loss": 2.1253, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.5209011590050788, |
|
"grad_norm": 0.871446967124939, |
|
"learning_rate": 9.955296635052454e-06, |
|
"loss": 2.1322, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5339236879802057, |
|
"grad_norm": 1.199268102645874, |
|
"learning_rate": 9.529722253331522e-06, |
|
"loss": 2.1386, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.5469462169553327, |
|
"grad_norm": 0.8790176510810852, |
|
"learning_rate": 9.105000173697276e-06, |
|
"loss": 2.1618, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.5599687459304596, |
|
"grad_norm": 1.089074969291687, |
|
"learning_rate": 8.681900135983885e-06, |
|
"loss": 2.132, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.5729912749055867, |
|
"grad_norm": 1.0114529132843018, |
|
"learning_rate": 8.26118894033779e-06, |
|
"loss": 2.1433, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.5860138038807137, |
|
"grad_norm": 1.2954638004302979, |
|
"learning_rate": 7.843629057516935e-06, |
|
"loss": 2.1213, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5990363328558406, |
|
"grad_norm": 1.0429459810256958, |
|
"learning_rate": 7.429977247036231e-06, |
|
"loss": 2.0845, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.6120588618309676, |
|
"grad_norm": 0.9565938115119934, |
|
"learning_rate": 7.020983185663779e-06, |
|
"loss": 2.1291, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.6250813908060946, |
|
"grad_norm": 0.900711715221405, |
|
"learning_rate": 6.617388108753403e-06, |
|
"loss": 2.1065, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.6381039197812215, |
|
"grad_norm": 1.0982881784439087, |
|
"learning_rate": 6.219923466875894e-06, |
|
"loss": 2.1607, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.6511264487563485, |
|
"grad_norm": 1.1638540029525757, |
|
"learning_rate": 5.829309600183536e-06, |
|
"loss": 2.0958, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6641489777314754, |
|
"grad_norm": 0.9292285442352295, |
|
"learning_rate": 5.446254432910526e-06, |
|
"loss": 2.1075, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.6771715067066024, |
|
"grad_norm": 0.8159144520759583, |
|
"learning_rate": 5.071452190375194e-06, |
|
"loss": 2.1218, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.6901940356817294, |
|
"grad_norm": 1.1008979082107544, |
|
"learning_rate": 4.705582140809275e-06, |
|
"loss": 2.1127, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.7032165646568563, |
|
"grad_norm": 0.885331928730011, |
|
"learning_rate": 4.349307364294512e-06, |
|
"loss": 2.1082, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.7162390936319833, |
|
"grad_norm": 0.9930873513221741, |
|
"learning_rate": 4.0032735510376055e-06, |
|
"loss": 2.1218, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7292616226071102, |
|
"grad_norm": 0.9115188717842102, |
|
"learning_rate": 3.668107831161537e-06, |
|
"loss": 2.1659, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.7422841515822373, |
|
"grad_norm": 1.0631247758865356, |
|
"learning_rate": 3.344417638133999e-06, |
|
"loss": 2.1263, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.7553066805573643, |
|
"grad_norm": 1.2672806978225708, |
|
"learning_rate": 3.032789607892811e-06, |
|
"loss": 2.1333, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.7683292095324912, |
|
"grad_norm": 1.0414445400238037, |
|
"learning_rate": 2.733788515663528e-06, |
|
"loss": 2.1135, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.7813517385076182, |
|
"grad_norm": 1.0180047750473022, |
|
"learning_rate": 2.447956252395974e-06, |
|
"loss": 2.1743, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7943742674827452, |
|
"grad_norm": 0.8480224609375, |
|
"learning_rate": 2.1758108426748847e-06, |
|
"loss": 2.1312, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.8073967964578721, |
|
"grad_norm": 1.0538519620895386, |
|
"learning_rate": 1.9178455058843938e-06, |
|
"loss": 2.0941, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.8204193254329991, |
|
"grad_norm": 0.8907907605171204, |
|
"learning_rate": 1.6745277623279766e-06, |
|
"loss": 2.1227, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.833441854408126, |
|
"grad_norm": 0.8479962348937988, |
|
"learning_rate": 1.446298585923771e-06, |
|
"loss": 2.1258, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.846464383383253, |
|
"grad_norm": 1.204111933708191, |
|
"learning_rate": 1.2335716050109182e-06, |
|
"loss": 2.1328, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.85948691235838, |
|
"grad_norm": 1.111570119857788, |
|
"learning_rate": 1.0367323527153462e-06, |
|
"loss": 2.1176, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.8725094413335069, |
|
"grad_norm": 0.7793114185333252, |
|
"learning_rate": 8.561375682335393e-07, |
|
"loss": 2.1142, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.885531970308634, |
|
"grad_norm": 0.8252727389335632, |
|
"learning_rate": 6.92114550300661e-07, |
|
"loss": 2.1273, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.8985544992837609, |
|
"grad_norm": 0.9007234573364258, |
|
"learning_rate": 5.449605640147038e-07, |
|
"loss": 2.1098, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.9115770282588879, |
|
"grad_norm": 0.9859702587127686, |
|
"learning_rate": 4.149423020917587e-07, |
|
"loss": 2.1, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9245995572340149, |
|
"grad_norm": 1.1626622676849365, |
|
"learning_rate": 3.022954015287449e-07, |
|
"loss": 2.1123, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.9376220862091418, |
|
"grad_norm": 0.9756370186805725, |
|
"learning_rate": 2.0722401654960644e-07, |
|
"loss": 2.1186, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.9506446151842688, |
|
"grad_norm": 1.004597783088684, |
|
"learning_rate": 1.299004486089095e-07, |
|
"loss": 2.0744, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.9636671441593958, |
|
"grad_norm": 1.021781086921692, |
|
"learning_rate": 7.046483412342708e-08, |
|
"loss": 2.086, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.9766896731345227, |
|
"grad_norm": 0.9634168148040771, |
|
"learning_rate": 2.9024890497625356e-08, |
|
"loss": 2.1365, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.9897122021096497, |
|
"grad_norm": 1.1906155347824097, |
|
"learning_rate": 5.655720903351425e-09, |
|
"loss": 2.1136, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 7679, |
|
"total_flos": 1.3987105112064e+17, |
|
"train_loss": 2.1709037589008475, |
|
"train_runtime": 4004.8154, |
|
"train_samples_per_second": 3.835, |
|
"train_steps_per_second": 1.917 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 7679, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3987105112064e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|