|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3333613374779467, |
|
"eval_steps": 248, |
|
"global_step": 248, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013441989414433337, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.0378, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0013441989414433337, |
|
"eval_loss": 3.0436718463897705, |
|
"eval_runtime": 5734.1832, |
|
"eval_samples_per_second": 3.861, |
|
"eval_steps_per_second": 0.483, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0026883978828866673, |
|
"grad_norm": 10.3125, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1143, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.004032596824330001, |
|
"grad_norm": 8.75, |
|
"learning_rate": 3e-06, |
|
"loss": 1.0933, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.005376795765773335, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.1014, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.006720994707216668, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0504, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008065193648660002, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 6e-06, |
|
"loss": 1.0031, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.009409392590103335, |
|
"grad_norm": 3.0, |
|
"learning_rate": 7e-06, |
|
"loss": 1.0001, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01075359153154667, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.0196, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.012097790472990002, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9e-06, |
|
"loss": 0.9207, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.013441989414433336, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9502, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01478618835587667, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.999954076906038e-06, |
|
"loss": 0.9674, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.016130387297320005, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.99981630846772e-06, |
|
"loss": 0.8617, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.017474586238763336, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.999586697215748e-06, |
|
"loss": 0.9222, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.01881878518020667, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.999265247367909e-06, |
|
"loss": 0.884, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.020162984121650004, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 9.998851964828987e-06, |
|
"loss": 0.9086, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02150718306309334, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.99834685719067e-06, |
|
"loss": 0.8723, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.022851382004536673, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 9.997749933731397e-06, |
|
"loss": 0.8678, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.024195580945980004, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.997061205416203e-06, |
|
"loss": 0.8497, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.025539779887423338, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 9.996280684896496e-06, |
|
"loss": 0.8475, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.026883978828866673, |
|
"grad_norm": 1.625, |
|
"learning_rate": 9.995408386509846e-06, |
|
"loss": 0.8673, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.028228177770310007, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.99444432627971e-06, |
|
"loss": 0.8704, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02957237671175334, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.993388521915134e-06, |
|
"loss": 0.813, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.030916575653196672, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.992240992810445e-06, |
|
"loss": 0.8497, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03226077459464001, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.991001760044877e-06, |
|
"loss": 0.8027, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03360497353608334, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.989670846382189e-06, |
|
"loss": 0.8099, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03494917247752667, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 9.98824827627025e-06, |
|
"loss": 0.7818, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03629337141897001, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.986734075840591e-06, |
|
"loss": 0.8078, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03763757036041334, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 9.985128272907917e-06, |
|
"loss": 0.819, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03898176930185668, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 9.983430896969606e-06, |
|
"loss": 0.7949, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.04032596824330001, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.981641979205158e-06, |
|
"loss": 0.8044, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04167016718474334, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.97976155247563e-06, |
|
"loss": 0.8025, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04301436612618668, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.977789651323025e-06, |
|
"loss": 0.7461, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04435856506763001, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 9.975726311969664e-06, |
|
"loss": 0.787, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.045702764009073346, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.973571572317519e-06, |
|
"loss": 0.7837, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.04704696295051668, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.971325471947518e-06, |
|
"loss": 0.8101, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04839116189196001, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.968988052118804e-06, |
|
"loss": 0.783, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.049735360833403346, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.966559355768005e-06, |
|
"loss": 0.8141, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.051079559774846676, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.964039427508418e-06, |
|
"loss": 0.7753, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.052423758716290014, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.961428313629203e-06, |
|
"loss": 0.773, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.053767957657733345, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.958726062094533e-06, |
|
"loss": 0.7557, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.055112156599176676, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.955932722542709e-06, |
|
"loss": 0.7241, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.056456355540620014, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.953048346285245e-06, |
|
"loss": 0.781, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.057800554482063345, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.950072986305938e-06, |
|
"loss": 0.7644, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05914475342350668, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.947006697259881e-06, |
|
"loss": 0.7406, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.06048895236495001, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.943849535472468e-06, |
|
"loss": 0.762, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.061833151306393344, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.940601558938348e-06, |
|
"loss": 0.7448, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06317735024783667, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.93726282732038e-06, |
|
"loss": 0.7658, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06452154918928002, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.933833401948514e-06, |
|
"loss": 0.7876, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06586574813072335, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.930313345818683e-06, |
|
"loss": 0.8166, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.06720994707216668, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 9.92670272359163e-06, |
|
"loss": 0.7927, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06855414601361001, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.923001601591738e-06, |
|
"loss": 0.7906, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06989834495505334, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 9.919210047805791e-06, |
|
"loss": 0.7576, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.07124254389649669, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.915328131881745e-06, |
|
"loss": 0.7847, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.07258674283794002, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.911355925127433e-06, |
|
"loss": 0.7728, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07393094177938335, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.907293500509268e-06, |
|
"loss": 0.7947, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07527514072082668, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.903140932650891e-06, |
|
"loss": 0.7503, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07661933966227001, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 9.898898297831808e-06, |
|
"loss": 0.7519, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07796353860371336, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.894565673985986e-06, |
|
"loss": 0.7438, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07930773754515669, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.890143140700419e-06, |
|
"loss": 0.7635, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.08065193648660002, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.885630779213678e-06, |
|
"loss": 0.7176, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08199613542804335, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.881028672414397e-06, |
|
"loss": 0.7536, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.08334033436948668, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 9.876336904839772e-06, |
|
"loss": 0.7572, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.08468453331093002, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.871555562673996e-06, |
|
"loss": 0.7767, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08602873225237335, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 9.86668473374668e-06, |
|
"loss": 0.763, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.08737293119381669, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.861724507531234e-06, |
|
"loss": 0.7576, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08871713013526002, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 9.856674975143237e-06, |
|
"loss": 0.7342, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.09006132907670335, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.851536229338747e-06, |
|
"loss": 0.7487, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.09140552801814669, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.846308364512607e-06, |
|
"loss": 0.7604, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.09274972695959002, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 9.840991476696707e-06, |
|
"loss": 0.7537, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.09409392590103335, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.835585663558221e-06, |
|
"loss": 0.7608, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09543812484247668, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 9.830091024397818e-06, |
|
"loss": 0.7298, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.09678232378392002, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 9.824507660147831e-06, |
|
"loss": 0.7646, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09812652272536336, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.8188356733704e-06, |
|
"loss": 0.7679, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.09947072166680669, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.813075168255601e-06, |
|
"loss": 0.7263, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.10081492060825002, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.807226250619522e-06, |
|
"loss": 0.7589, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10215911954969335, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 9.801289027902316e-06, |
|
"loss": 0.7216, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.10350331849113668, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.795263609166243e-06, |
|
"loss": 0.7779, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.10484751743258003, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 9.789150105093647e-06, |
|
"loss": 0.6941, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.10619171637402336, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.78294862798494e-06, |
|
"loss": 0.7362, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.10753591531546669, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 9.776659291756528e-06, |
|
"loss": 0.7103, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10888011425691002, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.77028221193872e-06, |
|
"loss": 0.6866, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.11022431319835335, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.763817505673614e-06, |
|
"loss": 0.7517, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1115685121397967, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 9.75726529171293e-06, |
|
"loss": 0.7508, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.11291271108124003, |
|
"grad_norm": 1.5, |
|
"learning_rate": 9.750625690415848e-06, |
|
"loss": 0.7275, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.11425691002268336, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.74389882374678e-06, |
|
"loss": 0.6994, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.11560110896412669, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 9.737084815273137e-06, |
|
"loss": 0.7365, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.11694530790557002, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 9.730183790163061e-06, |
|
"loss": 0.7294, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.11828950684701336, |
|
"grad_norm": 1.375, |
|
"learning_rate": 9.72319587518312e-06, |
|
"loss": 0.7226, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.1196337057884567, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.716121198695987e-06, |
|
"loss": 0.7126, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.12097790472990003, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 9.708959890658074e-06, |
|
"loss": 0.7118, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12232210367134336, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.70171208261715e-06, |
|
"loss": 0.7164, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.12366630261278669, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 9.69437790770992e-06, |
|
"loss": 0.7015, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.12501050155423002, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.68695750065959e-06, |
|
"loss": 0.7212, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.12635470049567335, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 9.679450997773378e-06, |
|
"loss": 0.7301, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.12769889943711668, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 9.67185853694002e-06, |
|
"loss": 0.7434, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.12904309837856004, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 9.664180257627231e-06, |
|
"loss": 0.7503, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.13038729732000337, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 9.656416300879147e-06, |
|
"loss": 0.704, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.1317314962614467, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.648566809313738e-06, |
|
"loss": 0.7091, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.13307569520289003, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.640631927120177e-06, |
|
"loss": 0.6939, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.13441989414433336, |
|
"grad_norm": 1.25, |
|
"learning_rate": 9.632611800056202e-06, |
|
"loss": 0.6645, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1357640930857767, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 9.62450657544543e-06, |
|
"loss": 0.7314, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.13710829202722002, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.616316402174657e-06, |
|
"loss": 0.7022, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.13845249096866336, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 9.608041430691126e-06, |
|
"loss": 0.7015, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.1397966899101067, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.59968181299975e-06, |
|
"loss": 0.6831, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.14114088885155002, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 9.591237702660335e-06, |
|
"loss": 0.6903, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.14248508779299338, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.58270925478475e-06, |
|
"loss": 0.7236, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.1438292867344367, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.574096626034077e-06, |
|
"loss": 0.7375, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.14517348567588004, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 9.565399974615744e-06, |
|
"loss": 0.7051, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.14651768461732337, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 9.556619460280605e-06, |
|
"loss": 0.6961, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1478618835587667, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 9.547755244320013e-06, |
|
"loss": 0.731, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14920608250021003, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.53880748956286e-06, |
|
"loss": 0.6992, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.15055028144165336, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 9.529776360372576e-06, |
|
"loss": 0.6954, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.1518944803830967, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.52066202264412e-06, |
|
"loss": 0.6932, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.15323867932454002, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 9.511464643800926e-06, |
|
"loss": 0.7117, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.15458287826598335, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 9.502184392791834e-06, |
|
"loss": 0.6992, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1559270772074267, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.492821440087978e-06, |
|
"loss": 0.6744, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.15727127614887004, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 9.48337595767966e-06, |
|
"loss": 0.6553, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.15861547509031337, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 9.473848119073188e-06, |
|
"loss": 0.7206, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1599596740317567, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.4642380992877e-06, |
|
"loss": 0.7014, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.16130387297320004, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 9.454546074851927e-06, |
|
"loss": 0.7249, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.16264807191464337, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.444772223800972e-06, |
|
"loss": 0.7142, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.1639922708560867, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.434916725673023e-06, |
|
"loss": 0.6778, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.16533646979753003, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.42497976150607e-06, |
|
"loss": 0.6831, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.16668066873897336, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.414961513834569e-06, |
|
"loss": 0.6744, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.1680248676804167, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 9.404862166686089e-06, |
|
"loss": 0.7059, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.16936906662186005, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.394681905577938e-06, |
|
"loss": 0.7245, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.17071326556330338, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.384420917513752e-06, |
|
"loss": 0.7064, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.1720574645047467, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.374079390980058e-06, |
|
"loss": 0.7177, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.17340166344619004, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.363657515942814e-06, |
|
"loss": 0.7031, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.17474586238763337, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 9.35315548384392e-06, |
|
"loss": 0.6752, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1760900613290767, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.342573487597696e-06, |
|
"loss": 0.7189, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.17743426027052003, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.331911721587345e-06, |
|
"loss": 0.6873, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.17877845921196336, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.321170381661383e-06, |
|
"loss": 0.7056, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.1801226581534067, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 9.310349665130035e-06, |
|
"loss": 0.7024, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.18146685709485003, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.299449770761612e-06, |
|
"loss": 0.708, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.18281105603629338, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.288470898778863e-06, |
|
"loss": 0.6624, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.18415525497773672, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 9.277413250855296e-06, |
|
"loss": 0.6622, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.18549945391918005, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.266277030111474e-06, |
|
"loss": 0.7074, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.18684365286062338, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.25506244111128e-06, |
|
"loss": 0.6745, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.1881878518020667, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 9.243769689858167e-06, |
|
"loss": 0.6627, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.18953205074351004, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.232398983791363e-06, |
|
"loss": 0.6563, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.19087624968495337, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.220950531782069e-06, |
|
"loss": 0.688, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.1922204486263967, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.209424544129621e-06, |
|
"loss": 0.7021, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.19356464756784003, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 9.197821232557625e-06, |
|
"loss": 0.6826, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.19490884650928336, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.186140810210066e-06, |
|
"loss": 0.6904, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.19625304545072672, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.1743834916474e-06, |
|
"loss": 0.6747, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.19759724439217005, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 9.162549492842603e-06, |
|
"loss": 0.7073, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.19894144333361338, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 9.150639031177211e-06, |
|
"loss": 0.6773, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.2002856422750567, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.138652325437326e-06, |
|
"loss": 0.689, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.20162984121650004, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.12658959580959e-06, |
|
"loss": 0.6874, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.20297404015794338, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.114451063877152e-06, |
|
"loss": 0.6617, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.2043182390993867, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.102236952615588e-06, |
|
"loss": 0.6882, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.20566243804083004, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.08994748638881e-06, |
|
"loss": 0.6919, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.20700663698227337, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.077582890944945e-06, |
|
"loss": 0.6713, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2083508359237167, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.065143393412179e-06, |
|
"loss": 0.6504, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.20969503486516006, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 9.052629222294605e-06, |
|
"loss": 0.6674, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2110392338066034, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.040040607467999e-06, |
|
"loss": 0.7111, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.21238343274804672, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.02737778017562e-06, |
|
"loss": 0.6596, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.21372763168949005, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 9.014640973023951e-06, |
|
"loss": 0.6846, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.21507183063093338, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 9.00183041997843e-06, |
|
"loss": 0.6788, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2164160295723767, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 8.988946356359147e-06, |
|
"loss": 0.6888, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.21776022851382004, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.97598901883653e-06, |
|
"loss": 0.6854, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.21910442745526337, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.962958645426989e-06, |
|
"loss": 0.7137, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.2204486263967067, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.949855475488549e-06, |
|
"loss": 0.7041, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.22179282533815003, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 8.936679749716452e-06, |
|
"loss": 0.6573, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2231370242795934, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.923431710138735e-06, |
|
"loss": 0.6694, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.22448122322103672, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.910111600111786e-06, |
|
"loss": 0.6904, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.22582542216248006, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.896719664315866e-06, |
|
"loss": 0.679, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.22716962110392339, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.883256148750634e-06, |
|
"loss": 0.6631, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.22851382004536672, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 8.869721300730596e-06, |
|
"loss": 0.6576, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.22985801898681005, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.856115368880598e-06, |
|
"loss": 0.6714, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.23120221792825338, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.842438603131232e-06, |
|
"loss": 0.6572, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.2325464168696967, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.828691254714259e-06, |
|
"loss": 0.6798, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.23389061581114004, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 8.814873576157988e-06, |
|
"loss": 0.6934, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.23523481475258337, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 8.800985821282637e-06, |
|
"loss": 0.6648, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.23657901369402673, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 8.787028245195676e-06, |
|
"loss": 0.6705, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.23792321263547006, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.773001104287137e-06, |
|
"loss": 0.6661, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.2392674115769134, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.758904656224904e-06, |
|
"loss": 0.6968, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.24061161051835672, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.744739159949981e-06, |
|
"loss": 0.6817, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.24195580945980005, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.730504875671732e-06, |
|
"loss": 0.6704, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.24330000840124338, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 8.71620206486311e-06, |
|
"loss": 0.6484, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.24464420734268671, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 8.701830990255843e-06, |
|
"loss": 0.6711, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.24598840628413005, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 8.687391915835617e-06, |
|
"loss": 0.6526, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.24733260522557338, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 8.672885106837216e-06, |
|
"loss": 0.6907, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.2486768041670167, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.658310829739666e-06, |
|
"loss": 0.7003, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.25002100310846004, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.643669352261321e-06, |
|
"loss": 0.6353, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.25136520204990337, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 8.628960943354965e-06, |
|
"loss": 0.6566, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.2527094009913467, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.614185873202852e-06, |
|
"loss": 0.6676, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.25405359993279003, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.599344413211755e-06, |
|
"loss": 0.6647, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.25539779887423336, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.58443683600798e-06, |
|
"loss": 0.6771, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2567419978156767, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 8.569463415432356e-06, |
|
"loss": 0.6629, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.2580861967571201, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.554424426535202e-06, |
|
"loss": 0.7327, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.2594303956985634, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 8.539320145571277e-06, |
|
"loss": 0.6809, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.26077459464000674, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.524150849994708e-06, |
|
"loss": 0.6895, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.26211879358145007, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.50891681845389e-06, |
|
"loss": 0.6739, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2634629925228934, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 8.493618330786365e-06, |
|
"loss": 0.7063, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.26480719146433673, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.47825566801369e-06, |
|
"loss": 0.6676, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.26615139040578006, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.462829112336266e-06, |
|
"loss": 0.6842, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.2674955893472234, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 8.44733894712816e-06, |
|
"loss": 0.7045, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.2688397882886667, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 8.431785456931898e-06, |
|
"loss": 0.6569, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27018398723011006, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.416168927453237e-06, |
|
"loss": 0.679, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.2715281861715534, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 8.400489645555914e-06, |
|
"loss": 0.6452, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.2728723851129967, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.384747899256386e-06, |
|
"loss": 0.6744, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.27421658405444005, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.368943977718528e-06, |
|
"loss": 0.6501, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2755607829958834, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 8.353078171248335e-06, |
|
"loss": 0.666, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.2769049819373267, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 8.337150771288571e-06, |
|
"loss": 0.6663, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.27824918087877004, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.32116207041343e-06, |
|
"loss": 0.6481, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.2795933798202134, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 8.30511236232316e-06, |
|
"loss": 0.6884, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.2809375787616567, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 8.289001941838659e-06, |
|
"loss": 0.6562, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.28228177770310003, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.27283110489607e-06, |
|
"loss": 0.7106, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.28362597664454336, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 8.256600148541339e-06, |
|
"loss": 0.689, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.28497017558598675, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.240309370924758e-06, |
|
"loss": 0.6683, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.2863143745274301, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 8.223959071295492e-06, |
|
"loss": 0.6866, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.2876585734688734, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.207549549996083e-06, |
|
"loss": 0.6688, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.28900277241031674, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.191081108456922e-06, |
|
"loss": 0.6721, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2903469713517601, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 8.174554049190726e-06, |
|
"loss": 0.6579, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.2916911702932034, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.157968675786971e-06, |
|
"loss": 0.6841, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.29303536923464674, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 8.141325292906325e-06, |
|
"loss": 0.6739, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.29437956817609007, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.124624206275041e-06, |
|
"loss": 0.6544, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.2957237671175334, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.107865722679347e-06, |
|
"loss": 0.6525, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.29706796605897673, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 8.091050149959808e-06, |
|
"loss": 0.6394, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.29841216500042006, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 8.074177797005677e-06, |
|
"loss": 0.6674, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.2997563639418634, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 8.057248973749216e-06, |
|
"loss": 0.6749, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.3011005628833067, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 8.040263991159996e-06, |
|
"loss": 0.7223, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.30244476182475005, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.0232231612392e-06, |
|
"loss": 0.6861, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3037889607661934, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.006126797013884e-06, |
|
"loss": 0.6883, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.3051331597076367, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 7.98897521253122e-06, |
|
"loss": 0.655, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.30647735864908004, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 7.971768722852741e-06, |
|
"loss": 0.6696, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.3078215575905234, |
|
"grad_norm": 0.875, |
|
"learning_rate": 7.954507644048544e-06, |
|
"loss": 0.6676, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.3091657565319667, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 7.937192293191485e-06, |
|
"loss": 0.6508, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.31050995547341004, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.919822988351359e-06, |
|
"loss": 0.6676, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.3118541544148534, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.902400048589051e-06, |
|
"loss": 0.6512, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.31319835335629675, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 7.884923793950684e-06, |
|
"loss": 0.6846, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.3145425522977401, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.86739454546173e-06, |
|
"loss": 0.6644, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.3158867512391834, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 7.849812625121122e-06, |
|
"loss": 0.6701, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.31723095018062675, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 7.832178355895327e-06, |
|
"loss": 0.6749, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.3185751491220701, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.81449206171243e-06, |
|
"loss": 0.6584, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.3199193480635134, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 7.796754067456168e-06, |
|
"loss": 0.6685, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.32126354700495674, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 7.778964698959973e-06, |
|
"loss": 0.6659, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.32260774594640007, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 7.761124283000982e-06, |
|
"loss": 0.6647, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3239519448878434, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.743233147294036e-06, |
|
"loss": 0.6793, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.32529614382928673, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.725291620485653e-06, |
|
"loss": 0.6633, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.32664034277073006, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 7.707300032148004e-06, |
|
"loss": 0.6812, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.3279845417121734, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 7.689258712772851e-06, |
|
"loss": 0.6734, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.3293287406536167, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.671167993765474e-06, |
|
"loss": 0.6775, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.33067293959506006, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.653028207438588e-06, |
|
"loss": 0.6723, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.3320171385365034, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 7.634839687006242e-06, |
|
"loss": 0.6814, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.3333613374779467, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 7.616602766577683e-06, |
|
"loss": 0.6816, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.3333613374779467, |
|
"eval_loss": 2.7341208457946777, |
|
"eval_runtime": 5734.9058, |
|
"eval_samples_per_second": 3.861, |
|
"eval_steps_per_second": 0.483, |
|
"step": 248 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 743, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 248, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.854896016293102e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|