{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 780,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 3.158679656326677,
      "learning_rate": 8.974358974358974e-08,
      "loss": 0.5861,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.0322708666943594,
      "learning_rate": 1.7948717948717948e-07,
      "loss": 0.6075,
      "step": 2
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.822065509661876,
      "learning_rate": 2.692307692307692e-07,
      "loss": 0.5896,
      "step": 3
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.01990383007436,
      "learning_rate": 3.5897435897435896e-07,
      "loss": 0.6534,
      "step": 4
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.1514015311739887,
      "learning_rate": 4.4871794871794865e-07,
      "loss": 0.6318,
      "step": 5
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.770082837335288,
      "learning_rate": 5.384615384615384e-07,
      "loss": 0.5567,
      "step": 6
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.1473432471953484,
      "learning_rate": 6.282051282051282e-07,
      "loss": 0.556,
      "step": 7
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.867277473433583,
      "learning_rate": 7.179487179487179e-07,
      "loss": 0.626,
      "step": 8
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.116538344911058,
      "learning_rate": 8.076923076923077e-07,
      "loss": 0.5956,
      "step": 9
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.883927442084639,
      "learning_rate": 8.974358974358973e-07,
      "loss": 0.6348,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.7296086958649717,
      "learning_rate": 9.871794871794872e-07,
      "loss": 0.6012,
      "step": 11
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.888291481225885,
      "learning_rate": 1.0769230769230769e-06,
      "loss": 0.6158,
      "step": 12
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.6798480814733354,
      "learning_rate": 1.1666666666666666e-06,
      "loss": 0.5465,
      "step": 13
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.5286944008685213,
      "learning_rate": 1.2564102564102565e-06,
      "loss": 0.538,
      "step": 14
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.8054433121795896,
      "learning_rate": 1.3461538461538462e-06,
      "loss": 0.5605,
      "step": 15
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.301783853298887,
      "learning_rate": 1.4358974358974359e-06,
      "loss": 0.5839,
      "step": 16
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.0055412267413795,
      "learning_rate": 1.5256410256410255e-06,
      "loss": 0.563,
      "step": 17
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.337406004341634,
      "learning_rate": 1.6153846153846154e-06,
      "loss": 0.5564,
      "step": 18
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.120974098777836,
      "learning_rate": 1.7051282051282051e-06,
      "loss": 0.5074,
      "step": 19
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.2235871141473913,
      "learning_rate": 1.7948717948717946e-06,
      "loss": 0.5682,
      "step": 20
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.325580776574494,
      "learning_rate": 1.8846153846153845e-06,
      "loss": 0.7112,
      "step": 21
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.459952422821355,
      "learning_rate": 1.9743589743589744e-06,
      "loss": 0.6661,
      "step": 22
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.95839275930406,
      "learning_rate": 2.064102564102564e-06,
      "loss": 0.4998,
      "step": 23
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.974144020858437,
      "learning_rate": 2.1538461538461538e-06,
      "loss": 0.5123,
      "step": 24
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.7091650511198182,
      "learning_rate": 2.243589743589744e-06,
      "loss": 0.4679,
      "step": 25
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.8800999304301238,
      "learning_rate": 2.333333333333333e-06,
      "loss": 0.4695,
      "step": 26
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.11180387597458,
      "learning_rate": 2.423076923076923e-06,
      "loss": 0.5301,
      "step": 27
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.9925416173236044,
      "learning_rate": 2.512820512820513e-06,
      "loss": 0.5425,
      "step": 28
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.9501180051573095,
      "learning_rate": 2.6025641025641026e-06,
      "loss": 0.5811,
      "step": 29
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.8590839448588985,
      "learning_rate": 2.6923076923076923e-06,
      "loss": 0.5084,
      "step": 30
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.9713208570956429,
      "learning_rate": 2.782051282051282e-06,
      "loss": 0.5148,
      "step": 31
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.6902323535320234,
      "learning_rate": 2.8717948717948717e-06,
      "loss": 0.4844,
      "step": 32
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.8356045743488203,
      "learning_rate": 2.9615384615384614e-06,
      "loss": 0.4298,
      "step": 33
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.7798007609043585,
      "learning_rate": 3.051282051282051e-06,
      "loss": 0.4968,
      "step": 34
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.102982471446415,
      "learning_rate": 3.141025641025641e-06,
      "loss": 0.5953,
      "step": 35
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.1598493191848167,
      "learning_rate": 3.230769230769231e-06,
      "loss": 0.5369,
      "step": 36
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.681878732897283,
      "learning_rate": 3.32051282051282e-06,
      "loss": 0.4529,
      "step": 37
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.7703525285846058,
      "learning_rate": 3.4102564102564103e-06,
      "loss": 0.4955,
      "step": 38
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.7683469960023375,
      "learning_rate": 3.5e-06,
      "loss": 0.5052,
      "step": 39
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.7419535992669049,
      "learning_rate": 3.5897435897435892e-06,
      "loss": 0.5162,
      "step": 40
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.7449819975072198,
      "learning_rate": 3.6794871794871797e-06,
      "loss": 0.5277,
      "step": 41
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.7648482981202087,
      "learning_rate": 3.769230769230769e-06,
      "loss": 0.4876,
      "step": 42
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6946980645629792,
      "learning_rate": 3.858974358974359e-06,
      "loss": 0.441,
      "step": 43
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.8314518184385051,
      "learning_rate": 3.948717948717949e-06,
      "loss": 0.4556,
      "step": 44
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.9141285726499875,
      "learning_rate": 4.038461538461538e-06,
      "loss": 0.4898,
      "step": 45
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.7938139036996628,
      "learning_rate": 4.128205128205128e-06,
      "loss": 0.4583,
      "step": 46
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.6929510536316692,
      "learning_rate": 4.217948717948718e-06,
      "loss": 0.4323,
      "step": 47
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.8092163175147657,
      "learning_rate": 4.3076923076923076e-06,
      "loss": 0.4285,
      "step": 48
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.766659033823101,
      "learning_rate": 4.397435897435897e-06,
      "loss": 0.4975,
      "step": 49
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.6616034395207262,
      "learning_rate": 4.487179487179488e-06,
      "loss": 0.3902,
      "step": 50
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.717963883722435,
      "learning_rate": 4.576923076923077e-06,
      "loss": 0.4552,
      "step": 51
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.9526480455423094,
      "learning_rate": 4.666666666666666e-06,
      "loss": 0.6078,
      "step": 52
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.7249377723052146,
      "learning_rate": 4.756410256410257e-06,
      "loss": 0.4963,
      "step": 53
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6946332682174237,
      "learning_rate": 4.846153846153846e-06,
      "loss": 0.523,
      "step": 54
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.7252362147444964,
      "learning_rate": 4.935897435897436e-06,
      "loss": 0.4866,
      "step": 55
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.6674378519984692,
      "learning_rate": 5.025641025641026e-06,
      "loss": 0.4364,
      "step": 56
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.9277904713149823,
      "learning_rate": 5.115384615384615e-06,
      "loss": 0.4977,
      "step": 57
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.8110319448592889,
      "learning_rate": 5.205128205128205e-06,
      "loss": 0.4954,
      "step": 58
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.7639887815910555,
      "learning_rate": 5.294871794871795e-06,
      "loss": 0.4816,
      "step": 59
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.5942284653278298,
      "learning_rate": 5.384615384615385e-06,
      "loss": 0.4693,
      "step": 60
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.5325212943981656,
      "learning_rate": 5.474358974358974e-06,
      "loss": 0.3896,
      "step": 61
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.5026157299230636,
      "learning_rate": 5.564102564102564e-06,
      "loss": 0.4682,
      "step": 62
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4154344705468702,
      "learning_rate": 5.653846153846154e-06,
      "loss": 0.3658,
      "step": 63
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.7736758802266976,
      "learning_rate": 5.743589743589743e-06,
      "loss": 0.5029,
      "step": 64
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.7765092780230023,
      "learning_rate": 5.833333333333333e-06,
      "loss": 0.4608,
      "step": 65
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.9404511561666338,
      "learning_rate": 5.923076923076923e-06,
      "loss": 0.4652,
      "step": 66
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.753353616574486,
      "learning_rate": 6.0128205128205125e-06,
      "loss": 0.5038,
      "step": 67
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.8599416931364385,
      "learning_rate": 6.102564102564102e-06,
      "loss": 0.5354,
      "step": 68
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.5941444460994902,
      "learning_rate": 6.192307692307692e-06,
      "loss": 0.4343,
      "step": 69
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.017933692057373,
      "learning_rate": 6.282051282051282e-06,
      "loss": 0.5443,
      "step": 70
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.7072140285820805,
      "learning_rate": 6.371794871794871e-06,
      "loss": 0.4895,
      "step": 71
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.5695015424217655,
      "learning_rate": 6.461538461538462e-06,
      "loss": 0.4482,
      "step": 72
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.7562460888971851,
      "learning_rate": 6.5512820512820515e-06,
      "loss": 0.4835,
      "step": 73
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.7316693884254104,
      "learning_rate": 6.64102564102564e-06,
      "loss": 0.4877,
      "step": 74
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.6249360298584317,
      "learning_rate": 6.730769230769231e-06,
      "loss": 0.4522,
      "step": 75
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.6936430758296457,
      "learning_rate": 6.8205128205128205e-06,
      "loss": 0.4546,
      "step": 76
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.611309437065777,
      "learning_rate": 6.91025641025641e-06,
      "loss": 0.4106,
      "step": 77
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.7957088623001527,
      "learning_rate": 7e-06,
      "loss": 0.4938,
      "step": 78
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.6681604508744847,
      "learning_rate": 6.9999649520318915e-06,
      "loss": 0.4654,
      "step": 79
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.6377954005820725,
      "learning_rate": 6.999859808829483e-06,
      "loss": 0.3843,
      "step": 80
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.701805354673672,
      "learning_rate": 6.999684572498523e-06,
      "loss": 0.523,
      "step": 81
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.8137342603270725,
      "learning_rate": 6.999439246548541e-06,
      "loss": 0.4227,
      "step": 82
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.610322923292699,
      "learning_rate": 6.999123835892781e-06,
      "loss": 0.3851,
      "step": 83
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.7424356339897609,
      "learning_rate": 6.998738346848099e-06,
      "loss": 0.5343,
      "step": 84
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.5684238404352413,
      "learning_rate": 6.998282787134845e-06,
      "loss": 0.4003,
      "step": 85
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.6444093877109798,
      "learning_rate": 6.997757165876698e-06,
      "loss": 0.5267,
      "step": 86
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.658881843691408,
      "learning_rate": 6.9971614936004935e-06,
      "loss": 0.4371,
      "step": 87
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.752404442163129,
      "learning_rate": 6.996495782236003e-06,
      "loss": 0.4437,
      "step": 88
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.7726886879652404,
      "learning_rate": 6.9957600451157e-06,
      "loss": 0.4805,
      "step": 89
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.7130722569388794,
      "learning_rate": 6.9949542969744955e-06,
      "loss": 0.4753,
      "step": 90
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.66475783464997,
      "learning_rate": 6.9940785539494385e-06,
      "loss": 0.4647,
      "step": 91
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.6802006865988512,
      "learning_rate": 6.9931328335793926e-06,
      "loss": 0.4353,
      "step": 92
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.7344675321937646,
      "learning_rate": 6.992117154804688e-06,
      "loss": 0.4741,
      "step": 93
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.683261676978249,
      "learning_rate": 6.991031537966741e-06,
      "loss": 0.4209,
      "step": 94
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.8314669990022636,
      "learning_rate": 6.989876004807644e-06,
      "loss": 0.4895,
      "step": 95
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.6286010700850748,
      "learning_rate": 6.9886505784697354e-06,
      "loss": 0.4316,
      "step": 96
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.79287017890556,
      "learning_rate": 6.98735528349513e-06,
      "loss": 0.5164,
      "step": 97
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.5957068076822012,
      "learning_rate": 6.985990145825233e-06,
      "loss": 0.4151,
      "step": 98
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.7661104640217256,
      "learning_rate": 6.984555192800216e-06,
      "loss": 0.5416,
      "step": 99
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.5981307636582884,
      "learning_rate": 6.983050453158471e-06,
      "loss": 0.4675,
      "step": 100
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.6430144323111846,
      "learning_rate": 6.981475957036039e-06,
      "loss": 0.4333,
      "step": 101
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.7167975371990496,
      "learning_rate": 6.979831735965997e-06,
      "loss": 0.3996,
      "step": 102
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.8627029109499222,
      "learning_rate": 6.9781178228778385e-06,
      "loss": 0.4818,
      "step": 103
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.7395258209832791,
      "learning_rate": 6.9763342520968e-06,
      "loss": 0.5318,
      "step": 104
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.7354412977551585,
      "learning_rate": 6.974481059343188e-06,
      "loss": 0.4553,
      "step": 105
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.7554448227091846,
      "learning_rate": 6.972558281731655e-06,
      "loss": 0.473,
      "step": 106
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.8267052663331127,
      "learning_rate": 6.970565957770456e-06,
      "loss": 0.4606,
      "step": 107
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.4809121128025398,
      "learning_rate": 6.96850412736068e-06,
      "loss": 0.4078,
      "step": 108
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.7118223458061004,
      "learning_rate": 6.9663728317954505e-06,
      "loss": 0.4936,
      "step": 109
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.7727261373643637,
      "learning_rate": 6.9641721137591e-06,
      "loss": 0.5248,
      "step": 110
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.6530960350112813,
      "learning_rate": 6.961902017326311e-06,
      "loss": 0.4673,
      "step": 111
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.621743811161392,
      "learning_rate": 6.959562587961235e-06,
      "loss": 0.4532,
      "step": 112
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.7097624964244162,
      "learning_rate": 6.9571538725165855e-06,
      "loss": 0.4599,
      "step": 113
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.6251091941091769,
      "learning_rate": 6.9546759192326944e-06,
      "loss": 0.4622,
      "step": 114
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.600988114121893,
      "learning_rate": 6.95212877773655e-06,
      "loss": 0.3924,
      "step": 115
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.7438877798521037,
      "learning_rate": 6.949512499040799e-06,
      "loss": 0.4436,
      "step": 116
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.7329586181529388,
      "learning_rate": 6.946827135542729e-06,
      "loss": 0.4049,
      "step": 117
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.4988532720543135,
      "learning_rate": 6.944072741023215e-06,
      "loss": 0.3812,
      "step": 118
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.767535618217951,
      "learning_rate": 6.941249370645649e-06,
      "loss": 0.4399,
      "step": 119
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.6724445288790142,
      "learning_rate": 6.938357080954826e-06,
      "loss": 0.4523,
      "step": 120
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.666547625101208,
      "learning_rate": 6.935395929875821e-06,
      "loss": 0.4782,
      "step": 121
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.85593796476523,
      "learning_rate": 6.93236597671282e-06,
      "loss": 0.5285,
      "step": 122
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.5583837042326505,
      "learning_rate": 6.929267282147936e-06,
      "loss": 0.4111,
      "step": 123
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.5489104990791143,
      "learning_rate": 6.9260999082400014e-06,
      "loss": 0.4232,
      "step": 124
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.7270840700085746,
      "learning_rate": 6.922863918423311e-06,
      "loss": 0.4385,
      "step": 125
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.7851688586632903,
      "learning_rate": 6.91955937750636e-06,
      "loss": 0.503,
      "step": 126
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.6825892167200216,
      "learning_rate": 6.916186351670546e-06,
      "loss": 0.4429,
      "step": 127
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.6005946819762804,
      "learning_rate": 6.912744908468841e-06,
      "loss": 0.428,
      "step": 128
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5740880543799294,
      "learning_rate": 6.909235116824441e-06,
      "loss": 0.4859,
      "step": 129
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5701248349607777,
      "learning_rate": 6.905657047029383e-06,
      "loss": 0.4114,
      "step": 130
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.8117183546435858,
      "learning_rate": 6.90201077074314e-06,
      "loss": 0.5386,
      "step": 131
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.7176967426159182,
      "learning_rate": 6.898296360991182e-06,
      "loss": 0.4988,
      "step": 132
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.6119557103460176,
      "learning_rate": 6.894513892163519e-06,
      "loss": 0.4353,
      "step": 133
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.8163909694175877,
      "learning_rate": 6.890663440013204e-06,
      "loss": 0.4624,
      "step": 134
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.5906374611348832,
      "learning_rate": 6.886745081654823e-06,
      "loss": 0.4413,
      "step": 135
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.7083476998308251,
      "learning_rate": 6.882758895562948e-06,
      "loss": 0.4798,
      "step": 136
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.716310498038562,
      "learning_rate": 6.8787049615705635e-06,
      "loss": 0.4478,
      "step": 137
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.7408667095266788,
      "learning_rate": 6.8745833608674685e-06,
      "loss": 0.5123,
      "step": 138
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.4736521035752084,
      "learning_rate": 6.870394175998651e-06,
      "loss": 0.4126,
      "step": 139
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.6728053416467819,
      "learning_rate": 6.866137490862636e-06,
      "loss": 0.479,
      "step": 140
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.6162869139636344,
      "learning_rate": 6.861813390709803e-06,
      "loss": 0.4,
      "step": 141
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.7100413135119708,
      "learning_rate": 6.857421962140681e-06,
      "loss": 0.4366,
      "step": 142
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.5997870652975197,
      "learning_rate": 6.852963293104211e-06,
      "loss": 0.4237,
      "step": 143
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.661897923842565,
      "learning_rate": 6.848437472895989e-06,
      "loss": 0.3599,
      "step": 144
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.5437806414367865,
      "learning_rate": 6.84384459215647e-06,
      "loss": 0.382,
      "step": 145
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.8494864927398265,
      "learning_rate": 6.839184742869166e-06,
      "loss": 0.4811,
      "step": 146
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.7136464365889568,
      "learning_rate": 6.8344580183587866e-06,
      "loss": 0.4596,
      "step": 147
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.5679341769016597,
      "learning_rate": 6.829664513289387e-06,
      "loss": 0.4479,
      "step": 148
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.5364949608026977,
      "learning_rate": 6.824804323662456e-06,
      "loss": 0.4242,
      "step": 149
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.7792768326461645,
      "learning_rate": 6.8198775468150085e-06,
      "loss": 0.5057,
      "step": 150
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.64880328624309,
      "learning_rate": 6.814884281417627e-06,
      "loss": 0.4682,
      "step": 151
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.6611210694948921,
      "learning_rate": 6.8098246274724835e-06,
      "loss": 0.4179,
      "step": 152
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.5595830417210474,
      "learning_rate": 6.8046986863113455e-06,
      "loss": 0.3934,
      "step": 153
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.5319462691982524,
      "learning_rate": 6.7995065605935405e-06,
      "loss": 0.433,
      "step": 154
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.5929306277662778,
      "learning_rate": 6.7942483543039e-06,
      "loss": 0.4027,
      "step": 155
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.6960544276623803,
      "learning_rate": 6.788924172750679e-06,
      "loss": 0.4456,
      "step": 156
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.583925185372818,
      "learning_rate": 6.783534122563447e-06,
      "loss": 0.3919,
      "step": 157
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.5898722296830252,
      "learning_rate": 6.7780783116909495e-06,
      "loss": 0.4287,
      "step": 158
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.4205967061693483,
      "learning_rate": 6.772556849398952e-06,
      "loss": 0.414,
      "step": 159
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.5791464079003765,
      "learning_rate": 6.7669698462680434e-06,
      "loss": 0.4554,
      "step": 160
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.655295470225546,
      "learning_rate": 6.761317414191428e-06,
      "loss": 0.4268,
      "step": 161
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.5529231674584578,
      "learning_rate": 6.755599666372685e-06,
      "loss": 0.3546,
      "step": 162
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.6006425620300517,
      "learning_rate": 6.749816717323493e-06,
      "loss": 0.3596,
      "step": 163
    },
    {
      "epoch": 1.05,
      "grad_norm": 1.5607773236397822,
      "learning_rate": 6.743968682861346e-06,
      "loss": 0.4296,
      "step": 164
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.5655082531240243,
      "learning_rate": 6.738055680107233e-06,
      "loss": 0.3895,
      "step": 165
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.7066313168254195,
      "learning_rate": 6.7320778274832836e-06,
      "loss": 0.4153,
      "step": 166
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.659306747460643,
      "learning_rate": 6.726035244710406e-06,
      "loss": 0.4053,
      "step": 167
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.6891499067659266,
      "learning_rate": 6.7199280528058844e-06,
      "loss": 0.3971,
      "step": 168
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.5705522381169368,
      "learning_rate": 6.713756374080959e-06,
      "loss": 0.3442,
      "step": 169
    },
    {
      "epoch": 1.09,
      "grad_norm": 1.560896872295485,
      "learning_rate": 6.70752033213837e-06,
      "loss": 0.3426,
      "step": 170
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.698691051754592,
      "learning_rate": 6.7012200518698904e-06,
      "loss": 0.3358,
      "step": 171
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.583062054049607,
      "learning_rate": 6.6948556594538185e-06,
      "loss": 0.3902,
      "step": 172
    },
    {
      "epoch": 1.11,
      "grad_norm": 1.5865690802506587,
      "learning_rate": 6.688427282352449e-06,
      "loss": 0.3747,
      "step": 173
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.6540144652784194,
      "learning_rate": 6.681935049309533e-06,
      "loss": 0.358,
      "step": 174
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.63701712650459,
      "learning_rate": 6.6753790903476814e-06,
      "loss": 0.3225,
      "step": 175
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.7170307214130864,
      "learning_rate": 6.668759536765778e-06,
      "loss": 0.3457,
      "step": 176
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.862784857769338,
      "learning_rate": 6.6620765211363376e-06,
      "loss": 0.4721,
      "step": 177
    },
    {
      "epoch": 1.14,
      "grad_norm": 2.0372982900949275,
      "learning_rate": 6.655330177302857e-06,
      "loss": 0.4379,
      "step": 178
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.6108247505727127,
      "learning_rate": 6.64852064037713e-06,
      "loss": 0.3245,
      "step": 179
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.5709194699574685,
      "learning_rate": 6.6416480467365494e-06,
      "loss": 0.3274,
      "step": 180
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.4120453761226166,
      "learning_rate": 6.634712534021367e-06,
      "loss": 0.3123,
      "step": 181
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.5271738412739237,
      "learning_rate": 6.627714241131943e-06,
      "loss": 0.2981,
      "step": 182
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.657027378920605,
      "learning_rate": 6.62065330822596e-06,
      "loss": 0.3302,
      "step": 183
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.6455506118501428,
      "learning_rate": 6.613529876715619e-06,
      "loss": 0.3508,
      "step": 184
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.728702644768987,
      "learning_rate": 6.606344089264805e-06,
      "loss": 0.3717,
      "step": 185
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.66605902530422,
      "learning_rate": 6.599096089786234e-06,
      "loss": 0.3268,
      "step": 186
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.7301577213147818,
      "learning_rate": 6.591786023438565e-06,
      "loss": 0.3184,
      "step": 187
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.5156585831764764,
      "learning_rate": 6.5844140366234956e-06,
      "loss": 0.3003,
      "step": 188
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.5266516234302516,
      "learning_rate": 6.576980276982832e-06,
      "loss": 0.2672,
      "step": 189
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.551348526011579,
      "learning_rate": 6.569484893395527e-06,
      "loss": 0.3095,
      "step": 190
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.8181092851281593,
      "learning_rate": 6.5619280359747045e-06,
      "loss": 0.3643,
      "step": 191
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.7302846932168594,
      "learning_rate": 6.55430985606465e-06,
      "loss": 0.3177,
      "step": 192
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.587436238285827,
      "learning_rate": 6.546630506237778e-06,
      "loss": 0.273,
      "step": 193
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.5878702336844899,
      "learning_rate": 6.538890140291578e-06,
      "loss": 0.2945,
      "step": 194
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.6055049355849913,
      "learning_rate": 6.531088913245536e-06,
      "loss": 0.2917,
      "step": 195
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.6136330635697707,
      "learning_rate": 6.5232269813380254e-06,
      "loss": 0.3031,
      "step": 196
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.6772420948466469,
      "learning_rate": 6.5153045020231855e-06,
      "loss": 0.3063,
      "step": 197
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.6946657606437059,
      "learning_rate": 6.507321633967758e-06,
      "loss": 0.2792,
      "step": 198
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.6400336393596016,
      "learning_rate": 6.499278537047919e-06,
      "loss": 0.2533,
      "step": 199
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.663092604458537,
      "learning_rate": 6.49117537234607e-06,
      "loss": 0.2457,
      "step": 200
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.6920287652097301,
      "learning_rate": 6.483012302147617e-06,
      "loss": 0.2654,
      "step": 201
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.5534173050131097,
      "learning_rate": 6.474789489937715e-06,
      "loss": 0.2534,
      "step": 202
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.5689308391296257,
      "learning_rate": 6.4665071003979985e-06,
      "loss": 0.2247,
      "step": 203
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.6586248558217869,
      "learning_rate": 6.4581652994032816e-06,
      "loss": 0.223,
      "step": 204
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.5984104105693349,
      "learning_rate": 6.449764254018236e-06,
      "loss": 0.2672,
      "step": 205
    },
    {
      "epoch": 1.32,
      "grad_norm": 1.5608644731307773,
      "learning_rate": 6.441304132494045e-06,
      "loss": 0.2067,
      "step": 206
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.6111918566862335,
      "learning_rate": 6.432785104265034e-06,
      "loss": 0.2327,
      "step": 207
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.820458743704155,
      "learning_rate": 6.424207339945278e-06,
      "loss": 0.3055,
      "step": 208
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.6194035399467694,
      "learning_rate": 6.415571011325181e-06,
      "loss": 0.2638,
      "step": 209
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.8508729271576532,
      "learning_rate": 6.406876291368041e-06,
      "loss": 0.2854,
      "step": 210
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.621939945199255,
      "learning_rate": 6.3981233542065824e-06,
      "loss": 0.2559,
      "step": 211
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.6253283893715365,
      "learning_rate": 6.3893123751394695e-06,
      "loss": 0.2087,
      "step": 212
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.6425706873157386,
      "learning_rate": 6.380443530627797e-06,
      "loss": 0.2439,
      "step": 213
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.6238804132316198,
      "learning_rate": 6.371516998291552e-06,
      "loss": 0.245,
      "step": 214
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.6717735651678982,
      "learning_rate": 6.3625329569060595e-06,
      "loss": 0.2439,
      "step": 215
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.542780390527117,
      "learning_rate": 6.3534915863984045e-06,
      "loss": 0.2473,
      "step": 216
    },
    {
      "epoch": 1.39,
      "grad_norm": 1.4913673007510586,
      "learning_rate": 6.344393067843825e-06,
      "loss": 0.192,
      "step": 217
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.5417375030490306,
      "learning_rate": 6.335237583462083e-06,
      "loss": 0.2459,
      "step": 218
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.435171944665851,
      "learning_rate": 6.326025316613824e-06,
      "loss": 0.1888,
      "step": 219
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.6884981117835218,
      "learning_rate": 6.3167564517968944e-06,
      "loss": 0.2381,
      "step": 220
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.5331309342695914,
      "learning_rate": 6.307431174642653e-06,
      "loss": 0.2019,
      "step": 221
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.6107084406380612,
      "learning_rate": 6.2980496719122544e-06,
      "loss": 0.2217,
      "step": 222
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.5873058124403931,
      "learning_rate": 6.288612131492901e-06,
      "loss": 0.2413,
      "step": 223
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.6440866109246133,
      "learning_rate": 6.279118742394089e-06,
      "loss": 0.2265,
      "step": 224
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4675158233520256,
      "learning_rate": 6.2695696947438165e-06,
      "loss": 0.1992,
      "step": 225
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.8369618646827899,
      "learning_rate": 6.25996517978478e-06,
      "loss": 0.2351,
      "step": 226
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.6374154520403525,
      "learning_rate": 6.2503053898705416e-06,
      "loss": 0.2224,
      "step": 227
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.5927118853075677,
      "learning_rate": 6.2405905184616776e-06,
      "loss": 0.2155,
      "step": 228
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.8012434043943897,
      "learning_rate": 6.230820760121904e-06,
      "loss": 0.2047,
      "step": 229
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.7040064897533143,
      "learning_rate": 6.220996310514181e-06,
      "loss": 0.2268,
      "step": 230
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.5756797325887457,
      "learning_rate": 6.21111736639679e-06,
      "loss": 0.2067,
      "step": 231
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.5807736828548324,
      "learning_rate": 6.201184125619403e-06,
      "loss": 0.1972,
      "step": 232
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.5794194589821984,
      "learning_rate": 6.191196787119104e-06,
      "loss": 0.1881,
      "step": 233
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.5803383370350332,
      "learning_rate": 6.181155550916423e-06,
      "loss": 0.2186,
      "step": 234
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.5109465274494256,
      "learning_rate": 6.171060618111317e-06,
      "loss": 0.203,
      "step": 235
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.4013551973113128,
      "learning_rate": 6.160912190879146e-06,
      "loss": 0.1516,
      "step": 236
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.6401705227130554,
      "learning_rate": 6.15071047246663e-06,
      "loss": 0.2405,
      "step": 237
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.532406183260015,
      "learning_rate": 6.140455667187765e-06,
      "loss": 0.1584,
      "step": 238
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.4242864943638422,
      "learning_rate": 6.13014798041975e-06,
      "loss": 0.1605,
      "step": 239
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.658783600906005,
      "learning_rate": 6.119787618598854e-06,
      "loss": 0.251,
      "step": 240
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.5412515372331743,
      "learning_rate": 6.109374789216296e-06,
      "loss": 0.1843,
      "step": 241
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.5415956887491478,
      "learning_rate": 6.098909700814082e-06,
      "loss": 0.2413,
      "step": 242
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.642788987391343,
      "learning_rate": 6.08839256298083e-06,
      "loss": 0.2066,
      "step": 243
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.5497708571437974,
      "learning_rate": 6.077823586347579e-06,
      "loss": 0.1812,
      "step": 244
    },
    {
      "epoch": 1.57,
      "grad_norm": 1.5028500577138046,
      "learning_rate": 6.06720298258356e-06,
      "loss": 0.1936,
      "step": 245
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.6048175825890045,
      "learning_rate": 6.056530964391961e-06,
      "loss": 0.2167,
      "step": 246
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.5024506404579518,
      "learning_rate": 6.0458077455056704e-06,
      "loss": 0.2027,
      "step": 247
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.563040859063659,
      "learning_rate": 6.035033540682993e-06,
      "loss": 0.1914,
      "step": 248
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5792427326316212,
      "learning_rate": 6.024208565703351e-06,
      "loss": 0.2111,
      "step": 249
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5265351502978457,
      "learning_rate": 6.013333037362959e-06,
      "loss": 0.1771,
      "step": 250
    },
    {
      "epoch": 1.61,
      "grad_norm": 1.6864551041061244,
      "learning_rate": 6.002407173470486e-06,
      "loss": 0.2253,
      "step": 251
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.648662118810667,
      "learning_rate": 5.991431192842692e-06,
      "loss": 0.1919,
      "step": 252
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.650471958543779,
      "learning_rate": 5.980405315300045e-06,
      "loss": 0.235,
      "step": 253
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.6364021616200621,
      "learning_rate": 5.969329761662319e-06,
      "loss": 0.1889,
      "step": 254
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.669326431103446,
      "learning_rate": 5.9582047537441716e-06,
      "loss": 0.2336,
      "step": 255
    },
    {
      "epoch": 1.64,
      "grad_norm": 1.5745390091314653,
      "learning_rate": 5.9470305143507e-06,
      "loss": 0.2231,
      "step": 256
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.5373757070684195,
      "learning_rate": 5.9358072672729845e-06,
      "loss": 0.2089,
      "step": 257
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.538852347237115,
      "learning_rate": 5.924535237283598e-06,
      "loss": 0.1658,
      "step": 258
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.6284277440436594,
      "learning_rate": 5.913214650132112e-06,
      "loss": 0.1896,
      "step": 259
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.6636487154066815,
      "learning_rate": 5.901845732540568e-06,
      "loss": 0.2591,
      "step": 260
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.5490784239861026,
      "learning_rate": 5.8904287121989455e-06,
      "loss": 0.1826,
      "step": 261
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.6410593482616802,
      "learning_rate": 5.878963817760597e-06,
      "loss": 0.2031,
      "step": 262
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.6496652774449363,
      "learning_rate": 5.867451278837666e-06,
      "loss": 0.179,
      "step": 263
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.4140543483810792,
      "learning_rate": 5.855891325996495e-06,
      "loss": 0.1951,
      "step": 264
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.6277068884971182,
      "learning_rate": 5.8442841907530035e-06,
      "loss": 0.2323,
      "step": 265
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.9911845501519134,
      "learning_rate": 5.83263010556805e-06,
      "loss": 0.2472,
      "step": 266
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.681859260330041,
      "learning_rate": 5.820929303842783e-06,
      "loss": 0.2256,
      "step": 267
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.6655852843392644,
      "learning_rate": 5.809182019913959e-06,
      "loss": 0.2081,
      "step": 268
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.6025126221307116,
      "learning_rate": 5.797388489049253e-06,
      "loss": 0.2025,
      "step": 269
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.554404375701696,
      "learning_rate": 5.785548947442547e-06,
      "loss": 0.2131,
      "step": 270
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.4734916428150344,
      "learning_rate": 5.7736636322092016e-06,
      "loss": 0.1751,
      "step": 271
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.6900522728772658,
      "learning_rate": 5.7617327813813e-06,
      "loss": 0.1833,
      "step": 272
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.5488301168933416,
      "learning_rate": 5.749756633902887e-06,
      "loss": 0.1581,
      "step": 273
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.537366544574601,
      "learning_rate": 5.7377354296251855e-06,
      "loss": 0.1873,
      "step": 274
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.639976903485692,
      "learning_rate": 5.725669409301782e-06,
      "loss": 0.1645,
      "step": 275
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.6628953838068126,
      "learning_rate": 5.71355881458382e-06,
      "loss": 0.2042,
      "step": 276
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.6468632976145132,
      "learning_rate": 5.701403888015149e-06,
      "loss": 0.2166,
      "step": 277
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.7952592417746278,
      "learning_rate": 5.689204873027471e-06,
      "loss": 0.2341,
      "step": 278
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.5636852158640757,
      "learning_rate": 5.676962013935464e-06,
      "loss": 0.1987,
      "step": 279
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.6527989174526452,
      "learning_rate": 5.664675555931892e-06,
      "loss": 0.1985,
      "step": 280
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.7788761906790338,
      "learning_rate": 5.652345745082691e-06,
      "loss": 0.1919,
      "step": 281
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.8065167705575866,
      "learning_rate": 5.639972828322043e-06,
      "loss": 0.2279,
      "step": 282
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.5716803825130072,
      "learning_rate": 5.627557053447427e-06,
      "loss": 0.1928,
      "step": 283
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.5390117314749268,
      "learning_rate": 5.615098669114664e-06,
      "loss": 0.1967,
      "step": 284
    },
    {
      "epoch": 1.83,
      "grad_norm": 1.5711837517647724,
      "learning_rate": 5.6025979248329265e-06,
      "loss": 0.2509,
      "step": 285
    },
    {
      "epoch": 1.83,
      "grad_norm": 1.4610065215464945,
      "learning_rate": 5.590055070959752e-06,
      "loss": 0.1823,
      "step": 286
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.6474450019294298,
      "learning_rate": 5.577470358696021e-06,
      "loss": 0.2568,
      "step": 287
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.593759652911112,
      "learning_rate": 5.564844040080931e-06,
      "loss": 0.226,
      "step": 288
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.4905383734267907,
      "learning_rate": 5.5521763679869445e-06,
      "loss": 0.2009,
      "step": 289
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.857438854432097,
      "learning_rate": 5.53946759611473e-06,
      "loss": 0.1911,
      "step": 290
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.4583746703530132,
      "learning_rate": 5.526717978988076e-06,
      "loss": 0.2056,
      "step": 291
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.5753247070423766,
      "learning_rate": 5.513927771948798e-06,
      "loss": 0.2117,
      "step": 292
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.5953163221997346,
      "learning_rate": 5.5010972311516184e-06,
      "loss": 0.1946,
      "step": 293
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.6835676204377235,
      "learning_rate": 5.488226613559045e-06,
      "loss": 0.239,
      "step": 294
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.46235220992734,
      "learning_rate": 5.475316176936217e-06,
      "loss": 0.2106,
      "step": 295
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.5871461769010167,
      "learning_rate": 5.462366179845746e-06,
      "loss": 0.2103,
      "step": 296
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.6639931565192387,
      "learning_rate": 5.449376881642537e-06,
      "loss": 0.182,
      "step": 297
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.57603760113916,
      "learning_rate": 5.436348542468598e-06,
      "loss": 0.186,
      "step": 298
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.5868679006153603,
      "learning_rate": 5.423281423247821e-06,
      "loss": 0.1849,
      "step": 299
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.5360967555172775,
      "learning_rate": 5.4101757856807655e-06,
      "loss": 0.1389,
      "step": 300
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.4790770040597105,
      "learning_rate": 5.397031892239414e-06,
      "loss": 0.1666,
      "step": 301
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.6761503047677448,
      "learning_rate": 5.383850006161913e-06,
      "loss": 0.1932,
      "step": 302
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.6271280533728048,
      "learning_rate": 5.370630391447303e-06,
      "loss": 0.1959,
      "step": 303
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.73188421274081,
      "learning_rate": 5.357373312850236e-06,
      "loss": 0.2206,
      "step": 304
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.532708439171332,
      "learning_rate": 5.3440790358756615e-06,
      "loss": 0.2077,
      "step": 305
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.7867777204429343,
      "learning_rate": 5.330747826773522e-06,
      "loss": 0.2253,
      "step": 306
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.6343254051389262,
      "learning_rate": 5.317379952533411e-06,
      "loss": 0.2139,
      "step": 307
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.6108378458429666,
      "learning_rate": 5.303975680879232e-06,
      "loss": 0.189,
      "step": 308
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.5104269336113685,
      "learning_rate": 5.290535280263835e-06,
      "loss": 0.179,
      "step": 309
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.5594047944760525,
      "learning_rate": 5.277059019863637e-06,
      "loss": 0.1933,
      "step": 310
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.480689209423561,
      "learning_rate": 5.263547169573235e-06,
      "loss": 0.1817,
      "step": 311
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5361954565514202,
      "learning_rate": 5.25e-06,
      "loss": 0.1898,
      "step": 312
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.524359102459848,
      "learning_rate": 5.236417782458656e-06,
      "loss": 0.1665,
      "step": 313
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.5142652247408335,
      "learning_rate": 5.222800788965847e-06,
      "loss": 0.1968,
      "step": 314
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.4018217314302317,
      "learning_rate": 5.2091492922346894e-06,
      "loss": 0.201,
      "step": 315
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.4467159018792048,
      "learning_rate": 5.195463565669309e-06,
      "loss": 0.2011,
      "step": 316
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.4366910804420947,
      "learning_rate": 5.18174388335937e-06,
      "loss": 0.1713,
      "step": 317
    },
    {
      "epoch": 2.04,
      "grad_norm": 1.4023140768213624,
      "learning_rate": 5.167990520074577e-06,
      "loss": 0.1413,
      "step": 318
    },
    {
      "epoch": 2.04,
      "grad_norm": 1.4524819834529294,
      "learning_rate": 5.154203751259183e-06,
      "loss": 0.1481,
      "step": 319
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.8373309015708064,
      "learning_rate": 5.140383853026463e-06,
      "loss": 0.2036,
      "step": 320
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.4579603843340663,
      "learning_rate": 5.12653110215319e-06,
      "loss": 0.1637,
      "step": 321
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.4630253394454602,
      "learning_rate": 5.11264577607409e-06,
      "loss": 0.1679,
      "step": 322
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.5010582913270236,
      "learning_rate": 5.098728152876287e-06,
      "loss": 0.1776,
      "step": 323
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.6137474108058125,
      "learning_rate": 5.084778511293731e-06,
      "loss": 0.1446,
      "step": 324
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.3840161878955395,
      "learning_rate": 5.070797130701618e-06,
      "loss": 0.1277,
      "step": 325
    },
    {
      "epoch": 2.09,
      "grad_norm": 1.5250613932316681,
      "learning_rate": 5.056784291110794e-06,
      "loss": 0.127,
      "step": 326
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.4249393893437852,
      "learning_rate": 5.04274027316215e-06,
      "loss": 0.1077,
      "step": 327
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.6057528976372746,
      "learning_rate": 5.028665358120995e-06,
      "loss": 0.1525,
      "step": 328
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.5619085988004227,
      "learning_rate": 5.014559827871426e-06,
      "loss": 0.1328,
      "step": 329
    },
    {
      "epoch": 2.12,
      "grad_norm": 1.5710261957830611,
      "learning_rate": 5.00042396491069e-06,
      "loss": 0.1256,
      "step": 330
    },
    {
      "epoch": 2.12,
      "grad_norm": 1.559080830559306,
      "learning_rate": 4.9862580523435116e-06,
      "loss": 0.0978,
      "step": 331
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.55462651273927,
      "learning_rate": 4.972062373876435e-06,
      "loss": 0.0938,
      "step": 332
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.8403497241572058,
      "learning_rate": 4.95783721381214e-06,
      "loss": 0.157,
      "step": 333
    },
    {
      "epoch": 2.14,
      "grad_norm": 1.871192614146256,
      "learning_rate": 4.943582857043742e-06,
      "loss": 0.1324,
      "step": 334
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.6434827968562997,
      "learning_rate": 4.9292995890490945e-06,
      "loss": 0.1022,
      "step": 335
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.5886538225877374,
      "learning_rate": 4.914987695885067e-06,
      "loss": 0.1105,
      "step": 336
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.50849954963733,
      "learning_rate": 4.900647464181817e-06,
      "loss": 0.1248,
      "step": 337
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.4878345169368261,
      "learning_rate": 4.886279181137049e-06,
      "loss": 0.0967,
      "step": 338
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.5621938707921055,
      "learning_rate": 4.871883134510263e-06,
      "loss": 0.1021,
      "step": 339
    },
    {
      "epoch": 2.18,
      "grad_norm": 1.6173058805664053,
      "learning_rate": 4.8574596126169925e-06,
      "loss": 0.126,
      "step": 340
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.5633693529236932,
      "learning_rate": 4.843008904323029e-06,
      "loss": 0.1243,
      "step": 341
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.4883031561898439,
      "learning_rate": 4.828531299038638e-06,
      "loss": 0.1101,
      "step": 342
    },
    {
      "epoch": 2.2,
      "grad_norm": 1.6093361309083989,
      "learning_rate": 4.81402708671276e-06,
      "loss": 0.0982,
      "step": 343
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.5365242697494994,
      "learning_rate": 4.799496557827208e-06,
      "loss": 0.1082,
      "step": 344
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.3677800601638932,
      "learning_rate": 4.7849400033908465e-06,
      "loss": 0.0984,
      "step": 345
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.4697749904794721,
      "learning_rate": 4.770357714933765e-06,
      "loss": 0.1106,
      "step": 346
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.515659406646426,
      "learning_rate": 4.755749984501437e-06,
      "loss": 0.1083,
      "step": 347
    },
    {
      "epoch": 2.23,
      "grad_norm": 1.5033210621791604,
      "learning_rate": 4.741117104648874e-06,
      "loss": 0.1004,
      "step": 348
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.29511877140105,
      "learning_rate": 4.726459368434768e-06,
      "loss": 0.0826,
      "step": 349
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.3129120468041393,
      "learning_rate": 4.711777069415615e-06,
      "loss": 0.0948,
      "step": 350
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.4547928582961123,
      "learning_rate": 4.697070501639841e-06,
      "loss": 0.0891,
      "step": 351
    },
    {
      "epoch": 2.26,
      "grad_norm": 1.444163293133138,
      "learning_rate": 4.682339959641915e-06,
      "loss": 0.0902,
      "step": 352
    },
    {
      "epoch": 2.26,
      "grad_norm": 1.5803225389429953,
      "learning_rate": 4.667585738436448e-06,
      "loss": 0.0966,
      "step": 353
    },
    {
      "epoch": 2.27,
      "grad_norm": 1.6247464395630296,
      "learning_rate": 4.652808133512279e-06,
      "loss": 0.0854,
      "step": 354
    },
    {
      "epoch": 2.28,
      "grad_norm": 1.684325097074753,
      "learning_rate": 4.638007440826568e-06,
      "loss": 0.0821,
      "step": 355
    },
    {
      "epoch": 2.28,
      "grad_norm": 1.5373768221128852,
      "learning_rate": 4.62318395679886e-06,
      "loss": 0.0715,
      "step": 356
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.689616257480933,
      "learning_rate": 4.6083379783051545e-06,
      "loss": 0.0877,
      "step": 357
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.4679120367508793,
      "learning_rate": 4.593469802671951e-06,
      "loss": 0.0775,
      "step": 358
    },
    {
      "epoch": 2.3,
      "grad_norm": 1.3313559666874426,
      "learning_rate": 4.5785797276703075e-06,
      "loss": 0.0589,
      "step": 359
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.5399713230048182,
      "learning_rate": 4.563668051509864e-06,
      "loss": 0.058,
      "step": 360
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.6540628385359142,
      "learning_rate": 4.548735072832879e-06,
      "loss": 0.0824,
      "step": 361
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.4382670193400522,
      "learning_rate": 4.533781090708244e-06,
      "loss": 0.0651,
      "step": 362
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.3614359116885117,
      "learning_rate": 4.518806404625495e-06,
      "loss": 0.0702,
      "step": 363
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.6935996818522705,
      "learning_rate": 4.503811314488816e-06,
      "loss": 0.0981,
      "step": 364
    },
    {
      "epoch": 2.34,
      "grad_norm": 1.5512615145482254,
      "learning_rate": 4.48879612061103e-06,
      "loss": 0.0805,
      "step": 365
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.50599871994829,
      "learning_rate": 4.473761123707584e-06,
      "loss": 0.0937,
      "step": 366
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.4324990501767214,
      "learning_rate": 4.458706624890534e-06,
      "loss": 0.0785,
      "step": 367
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.4491214242293755,
      "learning_rate": 4.443632925662504e-06,
      "loss": 0.0634,
      "step": 368
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.4290846682285696,
      "learning_rate": 4.428540327910652e-06,
      "loss": 0.0715,
      "step": 369
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.5712380885133443,
      "learning_rate": 4.41342913390063e-06,
      "loss": 0.0809,
      "step": 370
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.673544546540396,
      "learning_rate": 4.398299646270518e-06,
      "loss": 0.0767,
      "step": 371
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.4244151960543419,
      "learning_rate": 4.3831521680247765e-06,
      "loss": 0.0843,
      "step": 372
    },
    {
      "epoch": 2.39,
      "grad_norm": 1.4326265228498654,
      "learning_rate": 4.3679870025281644e-06,
      "loss": 0.0607,
      "step": 373
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.4882885049005354,
      "learning_rate": 4.352804453499677e-06,
      "loss": 0.0776,
      "step": 374
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.4736757455586167,
      "learning_rate": 4.3376048250064525e-06,
      "loss": 0.0657,
      "step": 375
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.574256990342458,
      "learning_rate": 4.322388421457687e-06,
      "loss": 0.082,
      "step": 376
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.611911060390727,
      "learning_rate": 4.30715554759854e-06,
      "loss": 0.0695,
      "step": 377
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.5323569330355356,
      "learning_rate": 4.2919065085040285e-06,
      "loss": 0.0736,
      "step": 378
    },
    {
      "epoch": 2.43,
      "grad_norm": 1.4158459967105763,
      "learning_rate": 4.276641609572911e-06,
      "loss": 0.0774,
      "step": 379
    },
    {
      "epoch": 2.44,
      "grad_norm": 1.411415928220124,
      "learning_rate": 4.261361156521586e-06,
      "loss": 0.0609,
      "step": 380
    },
    {
      "epoch": 2.44,
      "grad_norm": 1.3129410201180565,
      "learning_rate": 4.246065455377956e-06,
      "loss": 0.066,
      "step": 381
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.7101784619118496,
      "learning_rate": 4.230754812475306e-06,
      "loss": 0.0773,
      "step": 382
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.386968561190813,
      "learning_rate": 4.215429534446161e-06,
      "loss": 0.0663,
      "step": 383
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.402321547197101,
      "learning_rate": 4.200089928216156e-06,
      "loss": 0.0703,
      "step": 384
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.5024096069853687,
      "learning_rate": 4.1847363009978776e-06,
      "loss": 0.0666,
      "step": 385
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.5771691557161291,
      "learning_rate": 4.169368960284718e-06,
      "loss": 0.0752,
      "step": 386
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.5004612326192561,
      "learning_rate": 4.153988213844717e-06,
      "loss": 0.0659,
      "step": 387
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.5934299696891325,
      "learning_rate": 4.138594369714394e-06,
      "loss": 0.0649,
      "step": 388
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.531764702409795,
      "learning_rate": 4.123187736192583e-06,
      "loss": 0.0598,
      "step": 389
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.5143115245379815,
      "learning_rate": 4.107768621834257e-06,
      "loss": 0.0645,
      "step": 390
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.4197919164126784,
      "learning_rate": 4.092337335444343e-06,
      "loss": 0.0579,
      "step": 391
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.2419987125261145,
      "learning_rate": 4.076894186071548e-06,
      "loss": 0.0485,
      "step": 392
    },
    {
      "epoch": 2.52,
      "grad_norm": 1.6270989771595925,
      "learning_rate": 4.061439483002161e-06,
      "loss": 0.0752,
      "step": 393
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.6668781562334298,
      "learning_rate": 4.045973535753863e-06,
      "loss": 0.0506,
      "step": 394
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.4111176343177239,
      "learning_rate": 4.030496654069524e-06,
      "loss": 0.0531,
      "step": 395
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.551403757100443,
      "learning_rate": 4.015009147911007e-06,
      "loss": 0.0829,
      "step": 396
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.5266748331664532,
      "learning_rate": 3.9995113274529506e-06,
      "loss": 0.0579,
      "step": 397
    },
    {
      "epoch": 2.55,
      "grad_norm": 1.2952363610150155,
      "learning_rate": 3.984003503076566e-06,
      "loss": 0.075,
      "step": 398
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.4737121699978741,
      "learning_rate": 3.968485985363416e-06,
      "loss": 0.0667,
      "step": 399
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.3510918382261339,
      "learning_rate": 3.952959085089193e-06,
      "loss": 0.0506,
      "step": 400
    },
    {
      "epoch": 2.57,
      "grad_norm": 1.2593798377497472,
      "learning_rate": 3.937423113217505e-06,
      "loss": 0.0586,
      "step": 401
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.3649343014206317,
      "learning_rate": 3.92187838089363e-06,
      "loss": 0.0722,
      "step": 402
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.3123387897850538,
      "learning_rate": 3.9063251994383055e-06,
      "loss": 0.0681,
      "step": 403
    },
    {
      "epoch": 2.59,
      "grad_norm": 1.3958865510693308,
      "learning_rate": 3.8907638803414774e-06,
      "loss": 0.0641,
      "step": 404
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.3955726113701625,
      "learning_rate": 3.875194735256067e-06,
      "loss": 0.0692,
      "step": 405
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.4482062071185025,
      "learning_rate": 3.859618075991735e-06,
      "loss": 0.0612,
      "step": 406
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 1.599652437336555, |
|
"learning_rate": 3.844034214508625e-06, |
|
"loss": 0.0819, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.4310645914637838, |
|
"learning_rate": 3.828443462911128e-06, |
|
"loss": 0.063, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.4893479185732388, |
|
"learning_rate": 3.8128461334416223e-06, |
|
"loss": 0.0762, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.3002764419943207, |
|
"learning_rate": 3.7972425384742264e-06, |
|
"loss": 0.0595, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.5160157250102182, |
|
"learning_rate": 3.781632990508541e-06, |
|
"loss": 0.0762, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.3825833601713984, |
|
"learning_rate": 3.766017802163386e-06, |
|
"loss": 0.0672, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.3919309630976415, |
|
"learning_rate": 3.7503972861705478e-06, |
|
"loss": 0.0711, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.3171909579170673, |
|
"learning_rate": 3.7347717553685084e-06, |
|
"loss": 0.0492, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.4882892958051253, |
|
"learning_rate": 3.7191415226961867e-06, |
|
"loss": 0.0578, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.5208283595273544, |
|
"learning_rate": 3.703506901186665e-06, |
|
"loss": 0.0858, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.255493106442096, |
|
"learning_rate": 3.6878682039609253e-06, |
|
"loss": 0.0467, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.3608611259131593, |
|
"learning_rate": 3.6722257442215736e-06, |
|
"loss": 0.0601, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.2991974178801189, |
|
"learning_rate": 3.6565798352465697e-06, |
|
"loss": 0.0471, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.2001791330016007, |
|
"learning_rate": 3.640930790382953e-06, |
|
"loss": 0.0601, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.4083800701711358, |
|
"learning_rate": 3.625278923040567e-06, |
|
"loss": 0.0764, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.4598166249307498, |
|
"learning_rate": 3.6096245466857808e-06, |
|
"loss": 0.0825, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.5075568056733875, |
|
"learning_rate": 3.5939679748352146e-06, |
|
"loss": 0.0773, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.6813722670975189, |
|
"learning_rate": 3.578309521049456e-06, |
|
"loss": 0.0678, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.4325596151868998, |
|
"learning_rate": 3.562649498926785e-06, |
|
"loss": 0.0628, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 1.485710743290764, |
|
"learning_rate": 3.546988222096891e-06, |
|
"loss": 0.0682, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.4464307460259824, |
|
"learning_rate": 3.531326004214592e-06, |
|
"loss": 0.0546, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.607327721588889, |
|
"learning_rate": 3.515663158953552e-06, |
|
"loss": 0.0631, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.5697673191081656, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.0522, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.3868779096239099, |
|
"learning_rate": 3.484336841046448e-06, |
|
"loss": 0.0651, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.3231395905165282, |
|
"learning_rate": 3.468673995785409e-06, |
|
"loss": 0.0481, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.3593013701575807, |
|
"learning_rate": 3.4530117779031096e-06, |
|
"loss": 0.0619, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.3228492850140474, |
|
"learning_rate": 3.4373505010732152e-06, |
|
"loss": 0.0675, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.5367657186057377, |
|
"learning_rate": 3.4216904789505444e-06, |
|
"loss": 0.0648, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.3753764939376347, |
|
"learning_rate": 3.4060320251647866e-06, |
|
"loss": 0.0623, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.2778004930662803, |
|
"learning_rate": 3.3903754533142195e-06, |
|
"loss": 0.0641, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.4635871262219515, |
|
"learning_rate": 3.374721076959433e-06, |
|
"loss": 0.0563, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.437055332759444, |
|
"learning_rate": 3.359069209617048e-06, |
|
"loss": 0.0745, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.339007375027239, |
|
"learning_rate": 3.3434201647534306e-06, |
|
"loss": 0.055, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.301365126421519, |
|
"learning_rate": 3.3277742557784263e-06, |
|
"loss": 0.0598, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.4760103555439992, |
|
"learning_rate": 3.312131796039074e-06, |
|
"loss": 0.0892, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.2237525507670013, |
|
"learning_rate": 3.296493098813335e-06, |
|
"loss": 0.0542, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.5222822001085714, |
|
"learning_rate": 3.280858477303813e-06, |
|
"loss": 0.0868, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.39320313785298, |
|
"learning_rate": 3.265228244631491e-06, |
|
"loss": 0.0736, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.3573613063742243, |
|
"learning_rate": 3.2496027138294534e-06, |
|
"loss": 0.0628, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.4654821857892797, |
|
"learning_rate": 3.2339821978366144e-06, |
|
"loss": 0.0563, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.4095736477135694, |
|
"learning_rate": 3.2183670094914596e-06, |
|
"loss": 0.079, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.2720604821604127, |
|
"learning_rate": 3.2027574615257726e-06, |
|
"loss": 0.0593, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.4181511906965734, |
|
"learning_rate": 3.1871538665583784e-06, |
|
"loss": 0.0657, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.4715832293023445, |
|
"learning_rate": 3.171556537088873e-06, |
|
"loss": 0.0841, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.3651848095300287, |
|
"learning_rate": 3.155965785491375e-06, |
|
"loss": 0.0763, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.2733406993933158, |
|
"learning_rate": 3.140381924008266e-06, |
|
"loss": 0.0642, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.4649815014884033, |
|
"learning_rate": 3.1248052647439327e-06, |
|
"loss": 0.0571, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.2811384885805435, |
|
"learning_rate": 3.109236119658523e-06, |
|
"loss": 0.0546, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.2820286884512915, |
|
"learning_rate": 3.0936748005616936e-06, |
|
"loss": 0.0572, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.2499223607002325, |
|
"learning_rate": 3.0781216191063695e-06, |
|
"loss": 0.0413, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.2767469475357223, |
|
"learning_rate": 3.0625768867824957e-06, |
|
"loss": 0.0509, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.4259647549315762, |
|
"learning_rate": 3.047040914910806e-06, |
|
"loss": 0.0586, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.3134194512796191, |
|
"learning_rate": 3.0315140146365854e-06, |
|
"loss": 0.0538, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.8805869988982469, |
|
"learning_rate": 3.015996496923435e-06, |
|
"loss": 0.1031, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.3905086869403662, |
|
"learning_rate": 3.00048867254705e-06, |
|
"loss": 0.072, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.5495009008005376, |
|
"learning_rate": 2.9849908520889936e-06, |
|
"loss": 0.0724, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.4887450497609702, |
|
"learning_rate": 2.9695033459304766e-06, |
|
"loss": 0.072, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.4706761990324224, |
|
"learning_rate": 2.954026464246138e-06, |
|
"loss": 0.0586, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.3362836710115724, |
|
"learning_rate": 2.9385605169978387e-06, |
|
"loss": 0.0537, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.3505739518905349, |
|
"learning_rate": 2.923105813928453e-06, |
|
"loss": 0.0538, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.2122588192580181, |
|
"learning_rate": 2.907662664555658e-06, |
|
"loss": 0.0487, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.4238620533550892, |
|
"learning_rate": 2.8922313781657437e-06, |
|
"loss": 0.0552, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.1892063414140128, |
|
"learning_rate": 2.876812263807417e-06, |
|
"loss": 0.0444, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.476004528679644, |
|
"learning_rate": 2.861405630285606e-06, |
|
"loss": 0.0661, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 1.2283441956799137, |
|
"learning_rate": 2.8460117861552833e-06, |
|
"loss": 0.0685, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.334586988917645, |
|
"learning_rate": 2.8306310397152817e-06, |
|
"loss": 0.067, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.2195524245988179, |
|
"learning_rate": 2.815263699002124e-06, |
|
"loss": 0.0474, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.2153155028002154, |
|
"learning_rate": 2.799910071783845e-06, |
|
"loss": 0.0411, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.2126762221994973, |
|
"learning_rate": 2.7845704655538383e-06, |
|
"loss": 0.0449, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.9487314192826695, |
|
"learning_rate": 2.7692451875246956e-06, |
|
"loss": 0.0699, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 1.2517341101809802, |
|
"learning_rate": 2.7539345446220444e-06, |
|
"loss": 0.0501, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 1.1687876365657441, |
|
"learning_rate": 2.7386388434784143e-06, |
|
"loss": 0.0542, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.2867517360302896, |
|
"learning_rate": 2.723358390427089e-06, |
|
"loss": 0.0594, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.2340318913389399, |
|
"learning_rate": 2.708093491495973e-06, |
|
"loss": 0.0448, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.027195991965926, |
|
"learning_rate": 2.6928444524014595e-06, |
|
"loss": 0.0369, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.1529153587540384, |
|
"learning_rate": 2.6776115785423123e-06, |
|
"loss": 0.0366, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.0936389016778483, |
|
"learning_rate": 2.6623951749935487e-06, |
|
"loss": 0.0318, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.2450006020964501, |
|
"learning_rate": 2.6471955465003237e-06, |
|
"loss": 0.0495, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 1.2226035789766154, |
|
"learning_rate": 2.6320129974718355e-06, |
|
"loss": 0.0445, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.25075066205537, |
|
"learning_rate": 2.616847831975224e-06, |
|
"loss": 0.0384, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.2463934852518022, |
|
"learning_rate": 2.601700353729481e-06, |
|
"loss": 0.03, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.2325826640793016, |
|
"learning_rate": 2.58657086609937e-06, |
|
"loss": 0.0301, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.3592627844900937, |
|
"learning_rate": 2.5714596720893473e-06, |
|
"loss": 0.0556, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.2447561831293539, |
|
"learning_rate": 2.5563670743374973e-06, |
|
"loss": 0.0458, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.2857400436948982, |
|
"learning_rate": 2.5412933751094662e-06, |
|
"loss": 0.0334, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.2532170542444407, |
|
"learning_rate": 2.5262388762924157e-06, |
|
"loss": 0.0344, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.203530717357317, |
|
"learning_rate": 2.5112038793889706e-06, |
|
"loss": 0.0421, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.3294042133808428, |
|
"learning_rate": 2.496188685511185e-06, |
|
"loss": 0.0321, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.2370090074937525, |
|
"learning_rate": 2.481193595374505e-06, |
|
"loss": 0.0303, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.2854698199833365, |
|
"learning_rate": 2.4662189092917563e-06, |
|
"loss": 0.0437, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.258966876492369, |
|
"learning_rate": 2.4512649271671214e-06, |
|
"loss": 0.0348, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.3543020541420905, |
|
"learning_rate": 2.436331948490136e-06, |
|
"loss": 0.0381, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.524191201432137, |
|
"learning_rate": 2.4214202723296924e-06, |
|
"loss": 0.0349, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.2112359861077326, |
|
"learning_rate": 2.4065301973280486e-06, |
|
"loss": 0.0383, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.1519148621388768, |
|
"learning_rate": 2.391662021694847e-06, |
|
"loss": 0.0321, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.3479943611058076, |
|
"learning_rate": 2.3768160432011395e-06, |
|
"loss": 0.0437, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.5161099076368707, |
|
"learning_rate": 2.3619925591734323e-06, |
|
"loss": 0.0319, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.2636677740958409, |
|
"learning_rate": 2.3471918664877217e-06, |
|
"loss": 0.0332, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.144672254581016, |
|
"learning_rate": 2.332414261563553e-06, |
|
"loss": 0.0321, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.9299371323757989, |
|
"learning_rate": 2.317660040358085e-06, |
|
"loss": 0.032, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.1313131663449125, |
|
"learning_rate": 2.3029294983601598e-06, |
|
"loss": 0.0313, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.0138929678425856, |
|
"learning_rate": 2.2882229305843866e-06, |
|
"loss": 0.0307, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.1199838222245246, |
|
"learning_rate": 2.2735406315652323e-06, |
|
"loss": 0.0334, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.1253274246781595, |
|
"learning_rate": 2.258882895351125e-06, |
|
"loss": 0.0291, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.0864211988064494, |
|
"learning_rate": 2.2442500154985643e-06, |
|
"loss": 0.0283, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.154223930393599, |
|
"learning_rate": 2.229642285066236e-06, |
|
"loss": 0.0286, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 1.1685223712205286, |
|
"learning_rate": 2.215059996609154e-06, |
|
"loss": 0.0391, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 1.075832341378613, |
|
"learning_rate": 2.200503442172792e-06, |
|
"loss": 0.0304, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.9059326924484261, |
|
"learning_rate": 2.185972913287241e-06, |
|
"loss": 0.0189, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.0918674140346547, |
|
"learning_rate": 2.1714687009613628e-06, |
|
"loss": 0.0191, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.1821702256912072, |
|
"learning_rate": 2.156991095676971e-06, |
|
"loss": 0.0362, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.97509557211284, |
|
"learning_rate": 2.1425403873830083e-06, |
|
"loss": 0.0225, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.9490638193518203, |
|
"learning_rate": 2.1281168654897376e-06, |
|
"loss": 0.0246, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.2487827485548177, |
|
"learning_rate": 2.113720818862951e-06, |
|
"loss": 0.0355, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.1256405423581581, |
|
"learning_rate": 2.099352535818182e-06, |
|
"loss": 0.0285, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.9095020425799777, |
|
"learning_rate": 2.085012304114933e-06, |
|
"loss": 0.029, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.1211507222049146, |
|
"learning_rate": 2.070700410950906e-06, |
|
"loss": 0.0295, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.3066574168869387, |
|
"learning_rate": 2.0564171429562587e-06, |
|
"loss": 0.0253, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.9756346110567761, |
|
"learning_rate": 2.042162786187862e-06, |
|
"loss": 0.0247, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.1674473433223178, |
|
"learning_rate": 2.027937626123565e-06, |
|
"loss": 0.0309, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.125722578814156, |
|
"learning_rate": 2.0137419476564896e-06, |
|
"loss": 0.0257, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.9997980698465747, |
|
"learning_rate": 1.9995760350893098e-06, |
|
"loss": 0.0279, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.9080372864182613, |
|
"learning_rate": 1.985440172128573e-06, |
|
"loss": 0.0197, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.0431027439667329, |
|
"learning_rate": 1.9713346418790058e-06, |
|
"loss": 0.0277, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.0520465883120045, |
|
"learning_rate": 1.957259726837849e-06, |
|
"loss": 0.0221, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 1.1131126853324889, |
|
"learning_rate": 1.9432157088892064e-06, |
|
"loss": 0.0263, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.0287024420083004, |
|
"learning_rate": 1.9292028692983824e-06, |
|
"loss": 0.025, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.0138678830635144, |
|
"learning_rate": 1.91522148870627e-06, |
|
"loss": 0.0222, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.9566591334023489, |
|
"learning_rate": 1.9012718471237144e-06, |
|
"loss": 0.0283, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.0842644448648473, |
|
"learning_rate": 1.887354223925911e-06, |
|
"loss": 0.0214, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.975574990683092, |
|
"learning_rate": 1.87346889784681e-06, |
|
"loss": 0.0246, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 1.1478718825084286, |
|
"learning_rate": 1.8596161469735374e-06, |
|
"loss": 0.028, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.0607801924078737, |
|
"learning_rate": 1.8457962487408175e-06, |
|
"loss": 0.0225, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.9032740986095086, |
|
"learning_rate": 1.8320094799254222e-06, |
|
"loss": 0.0241, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.9873082880576691, |
|
"learning_rate": 1.8182561166406308e-06, |
|
"loss": 0.0233, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 1.036096577241665, |
|
"learning_rate": 1.8045364343306915e-06, |
|
"loss": 0.0225, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.011020516763835, |
|
"learning_rate": 1.7908507077653124e-06, |
|
"loss": 0.0231, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.9671254615577292, |
|
"learning_rate": 1.7771992110341533e-06, |
|
"loss": 0.0224, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.9590953302318, |
|
"learning_rate": 1.7635822175413446e-06, |
|
"loss": 0.0202, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.9693202672140285, |
|
"learning_rate": 1.7500000000000008e-06, |
|
"loss": 0.0207, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.1439987999618473, |
|
"learning_rate": 1.7364528304267646e-06, |
|
"loss": 0.0255, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.9251345782373769, |
|
"learning_rate": 1.7229409801363635e-06, |
|
"loss": 0.0179, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.9771154935646014, |
|
"learning_rate": 1.7094647197361656e-06, |
|
"loss": 0.0255, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.2798180991649053, |
|
"learning_rate": 1.6960243191207686e-06, |
|
"loss": 0.0232, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.0317246192313414, |
|
"learning_rate": 1.6826200474665891e-06, |
|
"loss": 0.0221, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.088333248492394, |
|
"learning_rate": 1.669252173226479e-06, |
|
"loss": 0.0282, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.0599165259760654, |
|
"learning_rate": 1.6559209641243388e-06, |
|
"loss": 0.0213, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.8840390529269968, |
|
"learning_rate": 1.642626687149765e-06, |
|
"loss": 0.0266, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.022733278705135, |
|
"learning_rate": 1.629369608552696e-06, |
|
"loss": 0.0189, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.9370338460268343, |
|
"learning_rate": 1.6161499938380873e-06, |
|
"loss": 0.0165, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.8403429260662558, |
|
"learning_rate": 1.6029681077605864e-06, |
|
"loss": 0.0194, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.0755284457395846, |
|
"learning_rate": 1.5898242143192336e-06, |
|
"loss": 0.0237, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.9907763333997255, |
|
"learning_rate": 1.576718576752179e-06, |
|
"loss": 0.0211, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.888869735944081, |
|
"learning_rate": 1.5636514575314024e-06, |
|
"loss": 0.0179, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.885718782686732, |
|
"learning_rate": 1.550623118357463e-06, |
|
"loss": 0.0201, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.0264516969794926, |
|
"learning_rate": 1.5376338201542538e-06, |
|
"loss": 0.0194, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.9766407203376387, |
|
"learning_rate": 1.5246838230637831e-06, |
|
"loss": 0.0221, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.725003276513177, |
|
"learning_rate": 1.511773386440955e-06, |
|
"loss": 0.0149, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.8499489911772146, |
|
"learning_rate": 1.4989027688483808e-06, |
|
"loss": 0.0224, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.8758345450457246, |
|
"learning_rate": 1.4860722280512022e-06, |
|
"loss": 0.0186, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.0827335434130656, |
|
"learning_rate": 1.473282021011924e-06, |
|
"loss": 0.0259, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.997864768606809, |
|
"learning_rate": 1.4605324038852707e-06, |
|
"loss": 0.0196, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.906378937725106, |
|
"learning_rate": 1.4478236320130554e-06, |
|
"loss": 0.0209, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.7169173279534029, |
|
"learning_rate": 1.4351559599190708e-06, |
|
"loss": 0.0116, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.8890755643373717, |
|
"learning_rate": 1.4225296413039794e-06, |
|
"loss": 0.0154, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.0180803274467207, |
|
"learning_rate": 1.4099449290402492e-06, |
|
"loss": 0.0239, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.8050132864758199, |
|
"learning_rate": 1.3974020751670734e-06, |
|
"loss": 0.0142, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.9413472638967014, |
|
"learning_rate": 1.3849013308853369e-06, |
|
"loss": 0.0244, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.7466941890622708, |
|
"learning_rate": 1.3724429465525733e-06, |
|
"loss": 0.012, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.7962567303062643, |
|
"learning_rate": 1.360027171677957e-06, |
|
"loss": 0.0171, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.8736628823165199, |
|
"learning_rate": 1.3476542549173097e-06, |
|
"loss": 0.0228, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.8479619989881332, |
|
"learning_rate": 1.335324444068108e-06, |
|
"loss": 0.0237, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.016855000511962, |
|
"learning_rate": 1.3230379860645363e-06, |
|
"loss": 0.0239, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.9313677958108982, |
|
"learning_rate": 1.3107951269725286e-06, |
|
"loss": 0.0183, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.769512727084196, |
|
"learning_rate": 1.2985961119848508e-06, |
|
"loss": 0.0151, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.8107437247743629, |
|
"learning_rate": 1.28644118541618e-06, |
|
"loss": 0.0214, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.7390308078541298, |
|
"learning_rate": 1.2743305906982184e-06, |
|
"loss": 0.014, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.8815024064424067, |
|
"learning_rate": 1.2622645703748163e-06, |
|
"loss": 0.0176, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.0125087727077078, |
|
"learning_rate": 1.2502433660971122e-06, |
|
"loss": 0.0161, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7991134263859656, |
|
"learning_rate": 1.2382672186187003e-06, |
|
"loss": 0.0181, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7695977276266822, |
|
"learning_rate": 1.2263363677907975e-06, |
|
"loss": 0.0144, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.7707016914429152, |
|
"learning_rate": 1.214451052557453e-06, |
|
"loss": 0.0152, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.7456989741080493, |
|
"learning_rate": 1.202611510950747e-06, |
|
"loss": 0.0171, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.9961563272393459, |
|
"learning_rate": 1.1908179800860415e-06, |
|
"loss": 0.0185, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.8118922886096522, |
|
"learning_rate": 1.1790706961572176e-06, |
|
"loss": 0.0148, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.8151702390898807, |
|
"learning_rate": 1.167369894431949e-06, |
|
"loss": 0.021, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.9901756508859597, |
|
"learning_rate": 1.1557158092469968e-06, |
|
"loss": 0.0163, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.768365224491104, |
|
"learning_rate": 1.1441086740035036e-06, |
|
"loss": 0.0166, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.8630559070642096, |
|
"learning_rate": 1.1325487211623343e-06, |
|
"loss": 0.0159, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.6603172130754364, |
|
"learning_rate": 1.121036182239403e-06, |
|
"loss": 0.0131, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.9272515645659455, |
|
"learning_rate": 1.1095712878010542e-06, |
|
"loss": 0.0265, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.7321005503092809, |
|
"learning_rate": 1.0981542674594327e-06, |
|
"loss": 0.0171, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.044787276822999, |
|
"learning_rate": 1.08678534986789e-06, |
|
"loss": 0.0236, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.8453699406585213, |
|
"learning_rate": 1.0754647627164022e-06, |
|
"loss": 0.0214, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.7651768008769598, |
|
"learning_rate": 1.064192732727016e-06, |
|
"loss": 0.0152, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.8101285929402818, |
|
"learning_rate": 1.0529694856493002e-06, |
|
"loss": 0.0156, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.8055953151019704, |
|
"learning_rate": 1.0417952462558286e-06, |
|
"loss": 0.02, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.7870823941792655, |
|
"learning_rate": 1.0306702383376813e-06, |
|
"loss": 0.0171, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.7771789691810701, |
|
"learning_rate": 1.0195946846999551e-06, |
|
"loss": 0.0173, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.8068664036381833, |
|
"learning_rate": 1.0085688071573086e-06, |
|
"loss": 0.0233, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.8738920300362064, |
|
"learning_rate": 9.97592826529514e-07, |
|
"loss": 0.0237, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.841806738949766, |
|
"learning_rate": 9.866669626370412e-07, |
|
"loss": 0.0192, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.8875635219003588, |
|
"learning_rate": 9.757914342966495e-07, |
|
"loss": 0.0154, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.757847160378628, |
|
"learning_rate": 9.649664593170062e-07, |
|
"loss": 0.0136, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.8909717857074549, |
|
"learning_rate": 9.541922544943295e-07, |
|
"loss": 0.0192, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.8943576006561003, |
|
"learning_rate": 9.434690356080394e-07, |
|
"loss": 0.0139, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.6645490438288841, |
|
"learning_rate": 9.327970174164409e-07, |
|
"loss": 0.0142, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.67608989131137, |
|
"learning_rate": 9.221764136524202e-07, |
|
"loss": 0.0157, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.7343067591040218, |
|
"learning_rate": 9.116074370191705e-07, |
|
"loss": 0.0142, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.9063118920210492, |
|
"learning_rate": 9.010902991859196e-07, |
|
"loss": 0.0352, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.7302861209580851, |
|
"learning_rate": 8.906252107837054e-07, |
|
"loss": 0.021, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.951381903299599, |
|
"learning_rate": 8.802123814011458e-07, |
|
"loss": 0.0219, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.8401460617775475, |
|
"learning_rate": 8.698520195802499e-07, |
|
"loss": 0.0181, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.9542163084486763, |
|
"learning_rate": 8.595443328122345e-07, |
|
"loss": 0.0149, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.7339711918314168, |
|
"learning_rate": 8.492895275333705e-07, |
|
"loss": 0.0135, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 1.0384660655095288, |
|
"learning_rate": 8.390878091208544e-07, |
|
"loss": 0.0126, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.882866454798963, |
|
"learning_rate": 8.289393818886837e-07, |
|
"loss": 0.0145, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.0887387218063342, |
|
"learning_rate": 8.188444490835774e-07, |
|
"loss": 0.0163, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.8850634568075848, |
|
"learning_rate": 8.088032128808952e-07, |
|
"loss": 0.0129, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.8517304003704518, |
|
"learning_rate": 7.988158743805973e-07, |
|
"loss": 0.0199, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.7431241826485437, |
|
"learning_rate": 7.888826336032093e-07, |
|
"loss": 0.0201, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.6695280139807532, |
|
"learning_rate": 7.790036894858198e-07, |
|
"loss": 0.0201, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.6069369320073023, |
|
"learning_rate": 7.691792398780962e-07, |
|
"loss": 0.0118, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.7630660606526203, |
|
"learning_rate": 7.594094815383223e-07, |
|
"loss": 0.0109, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.7085481933951672, |
|
"learning_rate": 7.496946101294585e-07, |
|
"loss": 0.0123, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.7837964413939096, |
|
"learning_rate": 7.400348202152192e-07, |
|
"loss": 0.0196, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.6565704481331924, |
|
"learning_rate": 7.304303052561841e-07, |
|
"loss": 0.0122, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.6955396545607133, |
|
"learning_rate": 7.208812576059113e-07, |
|
"loss": 0.0156, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.5956060246770807, |
|
"learning_rate": 7.113878685070994e-07, |
|
"loss": 0.015, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.5935447259636408, |
|
"learning_rate": 7.019503280877466e-07, |
|
"loss": 0.0105, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.676191247655706, |
|
"learning_rate": 6.925688253573465e-07, |
|
"loss": 0.0128, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.5351296056518017, |
|
"learning_rate": 6.832435482031064e-07, |
|
"loss": 0.0094, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.5478275824393115, |
|
"learning_rate": 6.73974683386176e-07, |
|
"loss": 0.0103, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.5637592397358228, |
|
"learning_rate": 6.647624165379173e-07, |
|
"loss": 0.0131, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.5844732480577218, |
|
"learning_rate": 6.55606932156175e-07, |
|
"loss": 0.012, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.5560435069211428, |
|
"learning_rate": 6.465084136015951e-07, |
|
"loss": 0.0084, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.4885438511561934, |
|
"learning_rate": 6.374670430939404e-07, |
|
"loss": 0.0072, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.5653559793263204, |
|
"learning_rate": 6.284830017084488e-07, |
|
"loss": 0.0068, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.6135164676196407, |
|
"learning_rate": 6.195564693722027e-07, |
|
"loss": 0.0142, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.4518891844084337, |
|
"learning_rate": 6.106876248605299e-07, |
|
"loss": 0.0102, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.4974300377898031, |
|
"learning_rate": 6.018766457934177e-07, |
|
"loss": 0.0054, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.5411109097379481, |
|
"learning_rate": 5.931237086319592e-07, |
|
"loss": 0.009, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.5359394089968081, |
|
"learning_rate": 5.844289886748196e-07, |
|
"loss": 0.0134, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.80432664191503, |
|
"learning_rate": 5.757926600547231e-07, |
|
"loss": 0.0075, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.7830393837005951, |
|
"learning_rate": 5.672148957349661e-07, |
|
"loss": 0.0106, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.6788613866964288, |
|
"learning_rate": 5.586958675059548e-07, |
|
"loss": 0.013, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.47264907469733186, |
|
"learning_rate": 5.502357459817639e-07, |
|
"loss": 0.0095, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.48175069610687854, |
|
"learning_rate": 5.418347005967189e-07, |
|
"loss": 0.0071, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.6790920380461669, |
|
"learning_rate": 5.334928996020013e-07, |
|
"loss": 0.0115, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.4090145392612429, |
|
"learning_rate": 5.252105100622848e-07, |
|
"loss": 0.0073, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.49819569886726583, |
|
"learning_rate": 5.169876978523828e-07, |
|
"loss": 0.0091, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.6045179104834907, |
|
"learning_rate": 5.088246276539292e-07, |
|
"loss": 0.0129, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.40815948732981716, |
|
"learning_rate": 5.0072146295208e-07, |
|
"loss": 0.0067, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.5355302694809556, |
|
"learning_rate": 4.926783660322411e-07, |
|
"loss": 0.0091, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.4672977933370541, |
|
"learning_rate": 4.846954979768149e-07, |
|
"loss": 0.0081, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.6156756390110313, |
|
"learning_rate": 4.7677301866197455e-07, |
|
"loss": 0.0103, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.4919976018539734, |
|
"learning_rate": 4.6891108675446453e-07, |
|
"loss": 0.0075, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.3845755495693945, |
|
"learning_rate": 4.611098597084226e-07, |
|
"loss": 0.0075, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.5879587628160349, |
|
"learning_rate": 4.533694937622227e-07, |
|
"loss": 0.0096, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.4812494262734211, |
|
"learning_rate": 4.456901439353499e-07, |
|
"loss": 0.0066, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.5332547275950345, |
|
"learning_rate": 4.3807196402529535e-07, |
|
"loss": 0.0077, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.6890481785285228, |
|
"learning_rate": 4.3051510660447336e-07, |
|
"loss": 0.0057, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.47168726099727676, |
|
"learning_rate": 4.2301972301716934e-07, |
|
"loss": 0.0086, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.5049838939089353, |
|
"learning_rate": 4.155859633765044e-07, |
|
"loss": 0.0078, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.34008519374005064, |
|
"learning_rate": 4.0821397656143503e-07, |
|
"loss": 0.0048, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.607517447320986, |
|
"learning_rate": 4.009039102137657e-07, |
|
"loss": 0.0051, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.5920287455206938, |
|
"learning_rate": 3.9365591073519387e-07, |
|
"loss": 0.0089, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.5406111609382083, |
|
"learning_rate": 3.8647012328438085e-07, |
|
"loss": 0.0063, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.35958261003969805, |
|
"learning_rate": 3.793466917740402e-07, |
|
"loss": 0.0061, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.5049810947725443, |
|
"learning_rate": 3.7228575886805744e-07, |
|
"loss": 0.0091, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.641243030116075, |
|
"learning_rate": 3.6528746597863283e-07, |
|
"loss": 0.0091, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.5551219906137611, |
|
"learning_rate": 3.583519532634516e-07, |
|
"loss": 0.0093, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.5639034541755932, |
|
"learning_rate": 3.514793596228702e-07, |
|
"loss": 0.0078, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.6129837509479856, |
|
"learning_rate": 3.44669822697144e-07, |
|
"loss": 0.0072, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.4367986472936705, |
|
"learning_rate": 3.3792347886366265e-07, |
|
"loss": 0.0088, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.46298536553833297, |
|
"learning_rate": 3.31240463234221e-07, |
|
"loss": 0.0077, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.47004028200875053, |
|
"learning_rate": 3.2462090965231767e-07, |
|
"loss": 0.0057, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.4778435373643932, |
|
"learning_rate": 3.180649506904667e-07, |
|
"loss": 0.0084, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.3789331021336801, |
|
"learning_rate": 3.1157271764755085e-07, |
|
"loss": 0.0059, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.5304877562885468, |
|
"learning_rate": 3.0514434054618216e-07, |
|
"loss": 0.0071, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.453404842137191, |
|
"learning_rate": 2.987799481301091e-07, |
|
"loss": 0.005, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.4946215142129279, |
|
"learning_rate": 2.924796678616297e-07, |
|
"loss": 0.0064, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.43531588103388913, |
|
"learning_rate": 2.862436259190414e-07, |
|
"loss": 0.0074, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.42256615584272145, |
|
"learning_rate": 2.800719471941152e-07, |
|
"loss": 0.0052, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.5292189519846633, |
|
"learning_rate": 2.739647552895949e-07, |
|
"loss": 0.0098, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.2916439347063657, |
|
"learning_rate": 2.6792217251671744e-07, |
|
"loss": 0.0042, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.42877425290392307, |
|
"learning_rate": 2.619443198927677e-07, |
|
"loss": 0.0075, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.35484337293926443, |
|
"learning_rate": 2.5603131713865374e-07, |
|
"loss": 0.0078, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.4543586667955343, |
|
"learning_rate": 2.50183282676508e-07, |
|
"loss": 0.0059, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.48615348530445246, |
|
"learning_rate": 2.444003336273163e-07, |
|
"loss": 0.0075, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.3946759106084988, |
|
"learning_rate": 2.3868258580857164e-07, |
|
"loss": 0.0067, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.3682990107394935, |
|
"learning_rate": 2.3303015373195713e-07, |
|
"loss": 0.0065, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.3469846297857803, |
|
"learning_rate": 2.2744315060104846e-07, |
|
"loss": 0.0057, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.3055870761212841, |
|
"learning_rate": 2.2192168830904963e-07, |
|
"loss": 0.0046, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.2684039161842392, |
|
"learning_rate": 2.1646587743655287e-07, |
|
"loss": 0.0044, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.551851617769783, |
|
"learning_rate": 2.1107582724932088e-07, |
|
"loss": 0.0076, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.3773077489097412, |
|
"learning_rate": 2.0575164569610016e-07, |
|
"loss": 0.0058, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.30157672847907924, |
|
"learning_rate": 2.0049343940645935e-07, |
|
"loss": 0.005, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.37102460832798273, |
|
"learning_rate": 1.953013136886541e-07, |
|
"loss": 0.0072, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.5134825511016379, |
|
"learning_rate": 1.901753725275166e-07, |
|
"loss": 0.0054, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.3418486411468999, |
|
"learning_rate": 1.8511571858237357e-07, |
|
"loss": 0.0049, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.37310826466217933, |
|
"learning_rate": 1.801224531849908e-07, |
|
"loss": 0.0068, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.5358213340322415, |
|
"learning_rate": 1.7519567633754352e-07, |
|
"loss": 0.0055, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.43174093605907204, |
|
"learning_rate": 1.70335486710614e-07, |
|
"loss": 0.0104, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.3429914758954129, |
|
"learning_rate": 1.6554198164121265e-07, |
|
"loss": 0.0052, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.24916354059181808, |
|
"learning_rate": 1.6081525713083428e-07, |
|
"loss": 0.0038, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.4382399839925513, |
|
"learning_rate": 1.561554078435296e-07, |
|
"loss": 0.0057, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.4974839410031333, |
|
"learning_rate": 1.5156252710401207e-07, |
|
"loss": 0.0091, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.3839389633196022, |
|
"learning_rate": 1.4703670689578884e-07, |
|
"loss": 0.0064, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.3399047282423716, |
|
"learning_rate": 1.4257803785931926e-07, |
|
"loss": 0.006, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.26059500112902745, |
|
"learning_rate": 1.3818660929019717e-07, |
|
"loss": 0.0054, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.324557199088587, |
|
"learning_rate": 1.3386250913736408e-07, |
|
"loss": 0.0053, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.31523371331282024, |
|
"learning_rate": 1.296058240013491e-07, |
|
"loss": 0.0069, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.2736291729957554, |
|
"learning_rate": 1.2541663913253191e-07, |
|
"loss": 0.0055, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.3431360054012468, |
|
"learning_rate": 1.2129503842943645e-07, |
|
"loss": 0.0072, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.30255537482868156, |
|
"learning_rate": 1.1724110443705115e-07, |
|
"loss": 0.0055, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.35751371687669753, |
|
"learning_rate": 1.1325491834517676e-07, |
|
"loss": 0.007, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.37341244248321687, |
|
"learning_rate": 1.0933655998679653e-07, |
|
"loss": 0.0049, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.36294551103333006, |
|
"learning_rate": 1.0548610783648199e-07, |
|
"loss": 0.0075, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.23289924927760813, |
|
"learning_rate": 1.0170363900881795e-07, |
|
"loss": 0.003, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.23590969846667792, |
|
"learning_rate": 9.798922925685994e-08, |
|
"loss": 0.0037, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.371740523030214, |
|
"learning_rate": 9.434295297061668e-08, |
|
"loss": 0.0081, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.35348995399607075, |
|
"learning_rate": 9.076488317555886e-08, |
|
"loss": 0.0053, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.5059722433604245, |
|
"learning_rate": 8.725509153115918e-08, |
|
"loss": 0.0092, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.29220233975264737, |
|
"learning_rate": 8.38136483294546e-08, |
|
"loss": 0.0033, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.28309868206988914, |
|
"learning_rate": 8.044062249364048e-08, |
|
"loss": 0.0052, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.4083770685160588, |
|
"learning_rate": 7.713608157668921e-08, |
|
"loss": 0.0082, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.3242480770397174, |
|
"learning_rate": 7.390009175999835e-08, |
|
"loss": 0.0087, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.3427949444072894, |
|
"learning_rate": 7.073271785206314e-08, |
|
"loss": 0.0072, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.4125538286496571, |
|
"learning_rate": 6.763402328718116e-08, |
|
"loss": 0.0065, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.266510337541702, |
|
"learning_rate": 6.460407012417918e-08, |
|
"loss": 0.0042, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.3107043699376071, |
|
"learning_rate": 6.164291904517333e-08, |
|
"loss": 0.0083, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.2613236810681545, |
|
"learning_rate": 5.875062935435121e-08, |
|
"loss": 0.0051, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.2851078758987879, |
|
"learning_rate": 5.592725897678446e-08, |
|
"loss": 0.0057, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.21054304193906503, |
|
"learning_rate": 5.3172864457271926e-08, |
|
"loss": 0.0031, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.3265715406591838, |
|
"learning_rate": 5.048750095920151e-08, |
|
"loss": 0.0067, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.45665464799625405, |
|
"learning_rate": 4.787122226345014e-08, |
|
"loss": 0.0055, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.2843677568177273, |
|
"learning_rate": 4.532408076730504e-08, |
|
"loss": 0.0057, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.28779532309845957, |
|
"learning_rate": 4.2846127483414206e-08, |
|
"loss": 0.0059, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.3003525952411292, |
|
"learning_rate": 4.043741203876483e-08, |
|
"loss": 0.0051, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.26104706018588686, |
|
"learning_rate": 3.80979826736893e-08, |
|
"loss": 0.0049, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.39096885913150037, |
|
"learning_rate": 3.58278862409e-08, |
|
"loss": 0.0074, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.3088992283183089, |
|
"learning_rate": 3.3627168204549306e-08, |
|
"loss": 0.0041, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.31604556262271916, |
|
"learning_rate": 3.1495872639320357e-08, |
|
"loss": 0.0062, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.3121260022084396, |
|
"learning_rate": 2.9434042229544543e-08, |
|
"loss": 0.0064, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.2529095358959997, |
|
"learning_rate": 2.7441718268344737e-08, |
|
"loss": 0.0054, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.3758371548507174, |
|
"learning_rate": 2.5518940656811095e-08, |
|
"loss": 0.0097, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.3434048070755047, |
|
"learning_rate": 2.3665747903199418e-08, |
|
"loss": 0.0075, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.3932959717890068, |
|
"learning_rate": 2.1882177122162173e-08, |
|
"loss": 0.0087, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.3941078588894348, |
|
"learning_rate": 2.0168264034002404e-08, |
|
"loss": 0.0105, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.2780367397788764, |
|
"learning_rate": 1.8524042963961095e-08, |
|
"loss": 0.0052, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.3463950374581184, |
|
"learning_rate": 1.6949546841528607e-08, |
|
"loss": 0.0057, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.36448273851642904, |
|
"learning_rate": 1.544480719978447e-08, |
|
"loss": 0.0085, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.2802121622475311, |
|
"learning_rate": 1.4009854174767521e-08, |
|
"loss": 0.008, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.313518910941428, |
|
"learning_rate": 1.2644716504870091e-08, |
|
"loss": 0.007, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.3661778282505006, |
|
"learning_rate": 1.1349421530265246e-08, |
|
"loss": 0.0093, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.39469666067358067, |
|
"learning_rate": 1.0123995192356183e-08, |
|
"loss": 0.0102, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.35114903900718974, |
|
"learning_rate": 8.968462033259405e-09, |
|
"loss": 0.0084, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.3674950171762429, |
|
"learning_rate": 7.882845195312016e-09, |
|
"loss": 0.0054, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.2775079654458955, |
|
"learning_rate": 6.8671664206073625e-09, |
|
"loss": 0.0049, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.3258792327477565, |
|
"learning_rate": 5.921446050561386e-09, |
|
"loss": 0.0085, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.3545502145110547, |
|
"learning_rate": 5.0457030255038334e-09, |
|
"loss": 0.0044, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.3203502791392638, |
|
"learning_rate": 4.239954884299401e-09, |
|
"loss": 0.0066, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.322043236377632, |
|
"learning_rate": 3.5042177639972304e-09, |
|
"loss": 0.0081, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.3262082117453807, |
|
"learning_rate": 2.838506399506446e-09, |
|
"loss": 0.0068, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.46623582650195866, |
|
"learning_rate": 2.2428341233012294e-09, |
|
"loss": 0.0197, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.41216660386641557, |
|
"learning_rate": 1.7172128651554152e-09, |
|
"loss": 0.0102, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.41233357005851606, |
|
"learning_rate": 1.2616531519011874e-09, |
|
"loss": 0.0081, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.2876638035649477, |
|
"learning_rate": 8.761641072196346e-10, |
|
"loss": 0.0071, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.3046501833564902, |
|
"learning_rate": 5.607534514585066e-10, |
|
"loss": 0.0053, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.2748887054069138, |
|
"learning_rate": 3.1542750147639517e-10, |
|
"loss": 0.0055, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.324688595353322, |
|
"learning_rate": 1.401911705168346e-10, |
|
"loss": 0.0057, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.2944466244233151, |
|
"learning_rate": 3.5047968109214176e-11, |
|
"loss": 0.0059, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.31412002348727536, |
|
"learning_rate": 0.0, |
|
"loss": 0.0066, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 780, |
|
"total_flos": 0.0, |
|
"train_loss": 0.17025724985541252, |
|
"train_runtime": 1577.4481, |
|
"train_samples_per_second": 15.848, |
|
"train_steps_per_second": 0.494 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|