{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 100.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 2.4236936569213867,
      "learning_rate": 0.0001,
      "loss": 1.1597,
      "step": 1
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.4236936569213867,
      "learning_rate": 0.0002,
      "loss": 1.1597,
      "step": 2
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.3299813270568848,
      "learning_rate": 0.00019795918367346938,
      "loss": 0.9809,
      "step": 3
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.62615966796875,
      "learning_rate": 0.0001959183673469388,
      "loss": 0.7416,
      "step": 4
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.3189771175384521,
      "learning_rate": 0.00019387755102040816,
      "loss": 0.5219,
      "step": 5
    },
    {
      "epoch": 6.0,
      "grad_norm": 1.062267780303955,
      "learning_rate": 0.00019183673469387756,
      "loss": 0.3376,
      "step": 6
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.7606938481330872,
      "learning_rate": 0.00018979591836734697,
      "loss": 0.1999,
      "step": 7
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.6657623648643494,
      "learning_rate": 0.00018775510204081634,
      "loss": 0.1314,
      "step": 8
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.4108690917491913,
      "learning_rate": 0.00018571428571428572,
      "loss": 0.087,
      "step": 9
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.25315290689468384,
      "learning_rate": 0.00018367346938775512,
      "loss": 0.0777,
      "step": 10
    },
    {
      "epoch": 11.0,
      "grad_norm": 4.897407054901123,
      "learning_rate": 0.0001816326530612245,
      "loss": 0.087,
      "step": 11
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.3001738488674164,
      "learning_rate": 0.0001795918367346939,
      "loss": 0.0707,
      "step": 12
    },
    {
      "epoch": 13.0,
      "grad_norm": 5.1508378982543945,
      "learning_rate": 0.00017755102040816327,
      "loss": 0.088,
      "step": 13
    },
    {
      "epoch": 14.0,
      "grad_norm": 9.67592716217041,
      "learning_rate": 0.00017551020408163265,
      "loss": 0.0799,
      "step": 14
    },
    {
      "epoch": 15.0,
      "grad_norm": 2.7876245975494385,
      "learning_rate": 0.00017346938775510205,
      "loss": 0.0967,
      "step": 15
    },
    {
      "epoch": 16.0,
      "grad_norm": 1.8201392889022827,
      "learning_rate": 0.00017142857142857143,
      "loss": 0.0601,
      "step": 16
    },
    {
      "epoch": 17.0,
      "grad_norm": 1.6338967084884644,
      "learning_rate": 0.00016938775510204083,
      "loss": 0.0512,
      "step": 17
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.4770618975162506,
      "learning_rate": 0.00016734693877551023,
      "loss": 0.0373,
      "step": 18
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.458904504776001,
      "learning_rate": 0.0001653061224489796,
      "loss": 0.0279,
      "step": 19
    },
    {
      "epoch": 20.0,
      "grad_norm": 1.6127618551254272,
      "learning_rate": 0.00016326530612244898,
      "loss": 0.023,
      "step": 20
    },
    {
      "epoch": 21.0,
      "grad_norm": 1.090635895729065,
      "learning_rate": 0.00016122448979591838,
      "loss": 0.0105,
      "step": 21
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.08737627416849136,
      "learning_rate": 0.00015918367346938776,
      "loss": 0.0061,
      "step": 22
    },
    {
      "epoch": 23.0,
      "grad_norm": 0.08030153065919876,
      "learning_rate": 0.00015714285714285716,
      "loss": 0.0051,
      "step": 23
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.04834992438554764,
      "learning_rate": 0.00015510204081632654,
      "loss": 0.0048,
      "step": 24
    },
    {
      "epoch": 25.0,
      "grad_norm": 0.09552427381277084,
      "learning_rate": 0.0001530612244897959,
      "loss": 0.0048,
      "step": 25
    },
    {
      "epoch": 26.0,
      "grad_norm": 0.0460357591509819,
      "learning_rate": 0.0001510204081632653,
      "loss": 0.0046,
      "step": 26
    },
    {
      "epoch": 27.0,
      "grad_norm": 0.0992983803153038,
      "learning_rate": 0.00014897959183673472,
      "loss": 0.0047,
      "step": 27
    },
    {
      "epoch": 28.0,
      "grad_norm": 0.1185307577252388,
      "learning_rate": 0.0001469387755102041,
      "loss": 0.0048,
      "step": 28
    },
    {
      "epoch": 29.0,
      "grad_norm": 0.14356577396392822,
      "learning_rate": 0.0001448979591836735,
      "loss": 0.0049,
      "step": 29
    },
    {
      "epoch": 30.0,
      "grad_norm": 0.0876593366265297,
      "learning_rate": 0.00014285714285714287,
      "loss": 0.0046,
      "step": 30
    },
    {
      "epoch": 31.0,
      "grad_norm": 0.08368655294179916,
      "learning_rate": 0.00014081632653061224,
      "loss": 0.0047,
      "step": 31
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.042614325881004333,
      "learning_rate": 0.00013877551020408165,
      "loss": 0.0045,
      "step": 32
    },
    {
      "epoch": 33.0,
      "grad_norm": 0.00863147247582674,
      "learning_rate": 0.00013673469387755102,
      "loss": 0.0044,
      "step": 33
    },
    {
      "epoch": 34.0,
      "grad_norm": 0.0089035639539361,
      "learning_rate": 0.0001346938775510204,
      "loss": 0.0044,
      "step": 34
    },
    {
      "epoch": 35.0,
      "grad_norm": 0.03151445463299751,
      "learning_rate": 0.0001326530612244898,
      "loss": 0.0044,
      "step": 35
    },
    {
      "epoch": 36.0,
      "grad_norm": 0.0310219656676054,
      "learning_rate": 0.00013061224489795917,
      "loss": 0.0044,
      "step": 36
    },
    {
      "epoch": 37.0,
      "grad_norm": 0.030537990853190422,
      "learning_rate": 0.00012857142857142858,
      "loss": 0.0044,
      "step": 37
    },
    {
      "epoch": 38.0,
      "grad_norm": 0.03915323689579964,
      "learning_rate": 0.00012653061224489798,
      "loss": 0.0044,
      "step": 38
    },
    {
      "epoch": 39.0,
      "grad_norm": 0.038823168724775314,
      "learning_rate": 0.00012448979591836735,
      "loss": 0.0044,
      "step": 39
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.005813094321638346,
      "learning_rate": 0.00012244897959183676,
      "loss": 0.0044,
      "step": 40
    },
    {
      "epoch": 41.0,
      "grad_norm": 0.02931728959083557,
      "learning_rate": 0.00012040816326530613,
      "loss": 0.0044,
      "step": 41
    },
    {
      "epoch": 42.0,
      "grad_norm": 0.02919776178896427,
      "learning_rate": 0.00011836734693877552,
      "loss": 0.0044,
      "step": 42
    },
    {
      "epoch": 43.0,
      "grad_norm": 0.005404920782893896,
      "learning_rate": 0.0001163265306122449,
      "loss": 0.0044,
      "step": 43
    },
    {
      "epoch": 44.0,
      "grad_norm": 0.03844524547457695,
      "learning_rate": 0.00011428571428571428,
      "loss": 0.0044,
      "step": 44
    },
    {
      "epoch": 45.0,
      "grad_norm": 0.005866413004696369,
      "learning_rate": 0.00011224489795918367,
      "loss": 0.0043,
      "step": 45
    },
    {
      "epoch": 46.0,
      "grad_norm": 0.028769973665475845,
      "learning_rate": 0.00011020408163265306,
      "loss": 0.0044,
      "step": 46
    },
    {
      "epoch": 47.0,
      "grad_norm": 0.028779389336705208,
      "learning_rate": 0.00010816326530612246,
      "loss": 0.0044,
      "step": 47
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.006406720262020826,
      "learning_rate": 0.00010612244897959185,
      "loss": 0.0043,
      "step": 48
    },
    {
      "epoch": 49.0,
      "grad_norm": 0.03870103508234024,
      "learning_rate": 0.00010408163265306123,
      "loss": 0.0044,
      "step": 49
    },
    {
      "epoch": 50.0,
      "grad_norm": 0.038732465356588364,
      "learning_rate": 0.00010204081632653062,
      "loss": 0.0044,
      "step": 50
    },
    {
      "epoch": 51.0,
      "grad_norm": 0.00536749605089426,
      "learning_rate": 0.0001,
      "loss": 0.0043,
      "step": 51
    },
    {
      "epoch": 52.0,
      "grad_norm": 0.028795704245567322,
      "learning_rate": 9.79591836734694e-05,
      "loss": 0.0044,
      "step": 52
    },
    {
      "epoch": 53.0,
      "grad_norm": 0.02882838249206543,
      "learning_rate": 9.591836734693878e-05,
      "loss": 0.0044,
      "step": 53
    },
    {
      "epoch": 54.0,
      "grad_norm": 0.02882484346628189,
      "learning_rate": 9.387755102040817e-05,
      "loss": 0.0044,
      "step": 54
    },
    {
      "epoch": 55.0,
      "grad_norm": 0.00502988463267684,
      "learning_rate": 9.183673469387756e-05,
      "loss": 0.0043,
      "step": 55
    },
    {
      "epoch": 56.0,
      "grad_norm": 0.005017813295125961,
      "learning_rate": 8.979591836734695e-05,
      "loss": 0.0043,
      "step": 56
    },
    {
      "epoch": 57.0,
      "grad_norm": 0.005218982230871916,
      "learning_rate": 8.775510204081632e-05,
      "loss": 0.0043,
      "step": 57
    },
    {
      "epoch": 58.0,
      "grad_norm": 0.038150474429130554,
      "learning_rate": 8.571428571428571e-05,
      "loss": 0.0044,
      "step": 58
    },
    {
      "epoch": 59.0,
      "grad_norm": 0.03801121190190315,
      "learning_rate": 8.367346938775511e-05,
      "loss": 0.0044,
      "step": 59
    },
    {
      "epoch": 60.0,
      "grad_norm": 0.004977047443389893,
      "learning_rate": 8.163265306122449e-05,
      "loss": 0.0043,
      "step": 60
    },
    {
      "epoch": 61.0,
      "grad_norm": 0.0602005235850811,
      "learning_rate": 7.959183673469388e-05,
      "loss": 0.0045,
      "step": 61
    },
    {
      "epoch": 62.0,
      "grad_norm": 0.0284713264554739,
      "learning_rate": 7.755102040816327e-05,
      "loss": 0.0044,
      "step": 62
    },
    {
      "epoch": 63.0,
      "grad_norm": 0.004900462459772825,
      "learning_rate": 7.551020408163266e-05,
      "loss": 0.0043,
      "step": 63
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.0048695337027311325,
      "learning_rate": 7.346938775510205e-05,
      "loss": 0.0043,
      "step": 64
    },
    {
      "epoch": 65.0,
      "grad_norm": 0.004841310903429985,
      "learning_rate": 7.142857142857143e-05,
      "loss": 0.0043,
      "step": 65
    },
    {
      "epoch": 66.0,
      "grad_norm": 0.004835947882384062,
      "learning_rate": 6.938775510204082e-05,
      "loss": 0.0043,
      "step": 66
    },
    {
      "epoch": 67.0,
      "grad_norm": 0.03715803846716881,
      "learning_rate": 6.73469387755102e-05,
      "loss": 0.0044,
      "step": 67
    },
    {
      "epoch": 68.0,
      "grad_norm": 0.004820631351321936,
      "learning_rate": 6.530612244897959e-05,
      "loss": 0.0043,
      "step": 68
    },
    {
      "epoch": 69.0,
      "grad_norm": 0.004830862861126661,
      "learning_rate": 6.326530612244899e-05,
      "loss": 0.0043,
      "step": 69
    },
    {
      "epoch": 70.0,
      "grad_norm": 0.02773592434823513,
      "learning_rate": 6.122448979591838e-05,
      "loss": 0.0044,
      "step": 70
    },
    {
      "epoch": 71.0,
      "grad_norm": 0.02766617201268673,
      "learning_rate": 5.918367346938776e-05,
      "loss": 0.0044,
      "step": 71
    },
    {
      "epoch": 72.0,
      "grad_norm": 0.027592068538069725,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.0044,
      "step": 72
    },
    {
      "epoch": 73.0,
      "grad_norm": 0.004762730095535517,
      "learning_rate": 5.510204081632653e-05,
      "loss": 0.0043,
      "step": 73
    },
    {
      "epoch": 74.0,
      "grad_norm": 0.004771250765770674,
      "learning_rate": 5.3061224489795926e-05,
      "loss": 0.0043,
      "step": 74
    },
    {
      "epoch": 75.0,
      "grad_norm": 0.004748388193547726,
      "learning_rate": 5.102040816326531e-05,
      "loss": 0.0043,
      "step": 75
    },
    {
      "epoch": 76.0,
      "grad_norm": 0.03664989769458771,
      "learning_rate": 4.89795918367347e-05,
      "loss": 0.0044,
      "step": 76
    },
    {
      "epoch": 77.0,
      "grad_norm": 0.004758178256452084,
      "learning_rate": 4.6938775510204086e-05,
      "loss": 0.0043,
      "step": 77
    },
    {
      "epoch": 78.0,
      "grad_norm": 0.03650437667965889,
      "learning_rate": 4.4897959183673474e-05,
      "loss": 0.0044,
      "step": 78
    },
    {
      "epoch": 79.0,
      "grad_norm": 0.004723742604255676,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.0043,
      "step": 79
    },
    {
      "epoch": 80.0,
      "grad_norm": 0.027179496362805367,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 0.0044,
      "step": 80
    },
    {
      "epoch": 81.0,
      "grad_norm": 0.027180835604667664,
      "learning_rate": 3.8775510204081634e-05,
      "loss": 0.0044,
      "step": 81
    },
    {
      "epoch": 82.0,
      "grad_norm": 0.027134299278259277,
      "learning_rate": 3.673469387755102e-05,
      "loss": 0.0044,
      "step": 82
    },
    {
      "epoch": 83.0,
      "grad_norm": 0.027070023119449615,
      "learning_rate": 3.469387755102041e-05,
      "loss": 0.0044,
      "step": 83
    },
    {
      "epoch": 84.0,
      "grad_norm": 0.004842973779886961,
      "learning_rate": 3.265306122448979e-05,
      "loss": 0.0043,
      "step": 84
    },
    {
      "epoch": 85.0,
      "grad_norm": 0.004742528777569532,
      "learning_rate": 3.061224489795919e-05,
      "loss": 0.0043,
      "step": 85
    },
    {
      "epoch": 86.0,
      "grad_norm": 0.004724172409623861,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.0043,
      "step": 86
    },
    {
      "epoch": 87.0,
      "grad_norm": 0.03628339245915413,
      "learning_rate": 2.6530612244897963e-05,
      "loss": 0.0044,
      "step": 87
    },
    {
      "epoch": 88.0,
      "grad_norm": 0.036234479397535324,
      "learning_rate": 2.448979591836735e-05,
      "loss": 0.0044,
      "step": 88
    },
    {
      "epoch": 89.0,
      "grad_norm": 0.004697797354310751,
      "learning_rate": 2.2448979591836737e-05,
      "loss": 0.0043,
      "step": 89
    },
    {
      "epoch": 90.0,
      "grad_norm": 0.004719586111605167,
      "learning_rate": 2.0408163265306123e-05,
      "loss": 0.0043,
      "step": 90
    },
    {
      "epoch": 91.0,
      "grad_norm": 0.004701649770140648,
      "learning_rate": 1.836734693877551e-05,
      "loss": 0.0043,
      "step": 91
    },
    {
      "epoch": 92.0,
      "grad_norm": 0.026852702721953392,
      "learning_rate": 1.6326530612244897e-05,
      "loss": 0.0044,
      "step": 92
    },
    {
      "epoch": 93.0,
      "grad_norm": 0.02685590460896492,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0044,
      "step": 93
    },
    {
      "epoch": 94.0,
      "grad_norm": 0.0047478387132287025,
      "learning_rate": 1.2244897959183674e-05,
      "loss": 0.0043,
      "step": 94
    },
    {
      "epoch": 95.0,
      "grad_norm": 0.026850329712033272,
      "learning_rate": 1.0204081632653061e-05,
      "loss": 0.0044,
      "step": 95
    },
    {
      "epoch": 96.0,
      "grad_norm": 0.00475778803229332,
      "learning_rate": 8.163265306122448e-06,
      "loss": 0.0043,
      "step": 96
    },
    {
      "epoch": 97.0,
      "grad_norm": 0.004756512586027384,
      "learning_rate": 6.122448979591837e-06,
      "loss": 0.0043,
      "step": 97
    },
    {
      "epoch": 98.0,
      "grad_norm": 0.004749584477394819,
      "learning_rate": 4.081632653061224e-06,
      "loss": 0.0043,
      "step": 98
    },
    {
      "epoch": 99.0,
      "grad_norm": 0.004759279545396566,
      "learning_rate": 2.040816326530612e-06,
      "loss": 0.0043,
      "step": 99
    },
    {
      "epoch": 100.0,
      "grad_norm": 0.004726547747850418,
      "learning_rate": 0.0,
      "loss": 0.0043,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 500,
  "total_flos": 760517336064000.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}