{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 100.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 3.3439955711364746,
      "learning_rate": 0.0001,
      "loss": 5.4154,
      "step": 1
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.3439955711364746,
      "learning_rate": 0.0002,
      "loss": 5.4154,
      "step": 2
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.64585542678833,
      "learning_rate": 0.00019795918367346938,
      "loss": 5.1115,
      "step": 3
    },
    {
      "epoch": 4.0,
      "grad_norm": 7.641846656799316,
      "learning_rate": 0.0001959183673469388,
      "loss": 4.5815,
      "step": 4
    },
    {
      "epoch": 5.0,
      "grad_norm": 10.534232139587402,
      "learning_rate": 0.00019387755102040816,
      "loss": 4.1388,
      "step": 5
    },
    {
      "epoch": 6.0,
      "grad_norm": Infinity,
      "learning_rate": 0.00019387755102040816,
      "loss": 3.7196,
      "step": 6
    },
    {
      "epoch": 7.0,
      "grad_norm": 13.133545875549316,
      "learning_rate": 0.00019183673469387756,
      "loss": 3.7196,
      "step": 7
    },
    {
      "epoch": 8.0,
      "grad_norm": 15.314064025878906,
      "learning_rate": 0.00018979591836734697,
      "loss": 3.2758,
      "step": 8
    },
    {
      "epoch": 9.0,
      "grad_norm": 16.992698669433594,
      "learning_rate": 0.00018775510204081634,
      "loss": 2.8057,
      "step": 9
    },
    {
      "epoch": 10.0,
      "grad_norm": 18.105308532714844,
      "learning_rate": 0.00018571428571428572,
      "loss": 2.3037,
      "step": 10
    },
    {
      "epoch": 11.0,
      "grad_norm": 18.624324798583984,
      "learning_rate": 0.00018367346938775512,
      "loss": 1.7698,
      "step": 11
    },
    {
      "epoch": 12.0,
      "grad_norm": 18.554916381835938,
      "learning_rate": 0.0001816326530612245,
      "loss": 1.2219,
      "step": 12
    },
    {
      "epoch": 13.0,
      "grad_norm": 17.93878746032715,
      "learning_rate": 0.0001795918367346939,
      "loss": 0.6767,
      "step": 13
    },
    {
      "epoch": 14.0,
      "grad_norm": 2.917646884918213,
      "learning_rate": 0.00017755102040816327,
      "loss": 0.1873,
      "step": 14
    },
    {
      "epoch": 15.0,
      "grad_norm": 3.2019948959350586,
      "learning_rate": 0.00017551020408163265,
      "loss": 0.1488,
      "step": 15
    },
    {
      "epoch": 16.0,
      "grad_norm": 3.1802170276641846,
      "learning_rate": 0.00017346938775510205,
      "loss": 0.1134,
      "step": 16
    },
    {
      "epoch": 17.0,
      "grad_norm": 1.5858300924301147,
      "learning_rate": 0.00017142857142857143,
      "loss": 0.0832,
      "step": 17
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.5080211758613586,
      "learning_rate": 0.00016938775510204083,
      "loss": 0.0701,
      "step": 18
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.5199990272521973,
      "learning_rate": 0.00016734693877551023,
      "loss": 0.0623,
      "step": 19
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.5190378427505493,
      "learning_rate": 0.0001653061224489796,
      "loss": 0.0552,
      "step": 20
    },
    {
      "epoch": 21.0,
      "grad_norm": 0.5189256072044373,
      "learning_rate": 0.00016326530612244898,
      "loss": 0.0456,
      "step": 21
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.5494657754898071,
      "learning_rate": 0.00016122448979591838,
      "loss": 0.0356,
      "step": 22
    },
    {
      "epoch": 23.0,
      "grad_norm": 0.5577398538589478,
      "learning_rate": 0.00015918367346938776,
      "loss": 0.024,
      "step": 23
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.39652666449546814,
      "learning_rate": 0.00015714285714285716,
      "loss": 0.0128,
      "step": 24
    },
    {
      "epoch": 25.0,
      "grad_norm": 0.12145466357469559,
      "learning_rate": 0.00015510204081632654,
      "loss": 0.0064,
      "step": 25
    },
    {
      "epoch": 26.0,
      "grad_norm": 0.16227224469184875,
      "learning_rate": 0.0001530612244897959,
      "loss": 0.0058,
      "step": 26
    },
    {
      "epoch": 27.0,
      "grad_norm": 0.010005966760218143,
      "learning_rate": 0.0001510204081632653,
      "loss": 0.0046,
      "step": 27
    },
    {
      "epoch": 28.0,
      "grad_norm": 0.11974961310625076,
      "learning_rate": 0.00014897959183673472,
      "loss": 0.0053,
      "step": 28
    },
    {
      "epoch": 29.0,
      "grad_norm": 0.026845891028642654,
      "learning_rate": 0.0001469387755102041,
      "loss": 0.0046,
      "step": 29
    },
    {
      "epoch": 30.0,
      "grad_norm": 0.09766196459531784,
      "learning_rate": 0.0001448979591836735,
      "loss": 0.0053,
      "step": 30
    },
    {
      "epoch": 31.0,
      "grad_norm": 0.05061652511358261,
      "learning_rate": 0.00014285714285714287,
      "loss": 0.0046,
      "step": 31
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.0490519218146801,
      "learning_rate": 0.00014081632653061224,
      "loss": 0.0048,
      "step": 32
    },
    {
      "epoch": 33.0,
      "grad_norm": 0.06446336209774017,
      "learning_rate": 0.00013877551020408165,
      "loss": 0.0049,
      "step": 33
    },
    {
      "epoch": 34.0,
      "grad_norm": 0.007198956795036793,
      "learning_rate": 0.00013673469387755102,
      "loss": 0.0045,
      "step": 34
    },
    {
      "epoch": 35.0,
      "grad_norm": 0.06024309992790222,
      "learning_rate": 0.0001346938775510204,
      "loss": 0.0048,
      "step": 35
    },
    {
      "epoch": 36.0,
      "grad_norm": 0.04099668189883232,
      "learning_rate": 0.0001326530612244898,
      "loss": 0.0048,
      "step": 36
    },
    {
      "epoch": 37.0,
      "grad_norm": 0.030607614666223526,
      "learning_rate": 0.00013061224489795917,
      "loss": 0.0047,
      "step": 37
    },
    {
      "epoch": 38.0,
      "grad_norm": 0.04647354409098625,
      "learning_rate": 0.00012857142857142858,
      "loss": 0.0048,
      "step": 38
    },
    {
      "epoch": 39.0,
      "grad_norm": 0.013468354940414429,
      "learning_rate": 0.00012653061224489798,
      "loss": 0.0047,
      "step": 39
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.02858574502170086,
      "learning_rate": 0.00012448979591836735,
      "loss": 0.0045,
      "step": 40
    },
    {
      "epoch": 41.0,
      "grad_norm": 0.039958544075489044,
      "learning_rate": 0.00012244897959183676,
      "loss": 0.0048,
      "step": 41
    },
    {
      "epoch": 42.0,
      "grad_norm": 0.003950281068682671,
      "learning_rate": 0.00012040816326530613,
      "loss": 0.0047,
      "step": 42
    },
    {
      "epoch": 43.0,
      "grad_norm": 0.029744163155555725,
      "learning_rate": 0.00011836734693877552,
      "loss": 0.0047,
      "step": 43
    },
    {
      "epoch": 44.0,
      "grad_norm": 0.029589517042040825,
      "learning_rate": 0.0001163265306122449,
      "loss": 0.0047,
      "step": 44
    },
    {
      "epoch": 45.0,
      "grad_norm": 0.0025897289160639048,
      "learning_rate": 0.00011428571428571428,
      "loss": 0.0049,
      "step": 45
    },
    {
      "epoch": 46.0,
      "grad_norm": 0.03239087387919426,
      "learning_rate": 0.00011224489795918367,
      "loss": 0.005,
      "step": 46
    },
    {
      "epoch": 47.0,
      "grad_norm": 0.021254172548651695,
      "learning_rate": 0.00011020408163265306,
      "loss": 0.0047,
      "step": 47
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.01296665333211422,
      "learning_rate": 0.00010816326530612246,
      "loss": 0.0047,
      "step": 48
    },
    {
      "epoch": 49.0,
      "grad_norm": 0.012932351790368557,
      "learning_rate": 0.00010612244897959185,
      "loss": 0.0047,
      "step": 49
    },
    {
      "epoch": 50.0,
      "grad_norm": 0.0024836526717990637,
      "learning_rate": 0.00010408163265306123,
      "loss": 0.0049,
      "step": 50
    },
    {
      "epoch": 51.0,
      "grad_norm": 0.012770796194672585,
      "learning_rate": 0.00010204081632653062,
      "loss": 0.0047,
      "step": 51
    },
    {
      "epoch": 52.0,
      "grad_norm": 0.010142244398593903,
      "learning_rate": 0.0001,
      "loss": 0.0045,
      "step": 52
    },
    {
      "epoch": 53.0,
      "grad_norm": 0.020766526460647583,
      "learning_rate": 9.79591836734694e-05,
      "loss": 0.0047,
      "step": 53
    },
    {
      "epoch": 54.0,
      "grad_norm": 0.0039475164376199245,
      "learning_rate": 9.591836734693878e-05,
      "loss": 0.0047,
      "step": 54
    },
    {
      "epoch": 55.0,
      "grad_norm": 0.012760567478835583,
      "learning_rate": 9.387755102040817e-05,
      "loss": 0.0047,
      "step": 55
    },
    {
      "epoch": 56.0,
      "grad_norm": 0.028441807255148888,
      "learning_rate": 9.183673469387756e-05,
      "loss": 0.0047,
      "step": 56
    },
    {
      "epoch": 57.0,
      "grad_norm": 0.004045899026095867,
      "learning_rate": 8.979591836734695e-05,
      "loss": 0.0047,
      "step": 57
    },
    {
      "epoch": 58.0,
      "grad_norm": 0.020622774958610535,
      "learning_rate": 8.775510204081632e-05,
      "loss": 0.0047,
      "step": 58
    },
    {
      "epoch": 59.0,
      "grad_norm": 0.010014132596552372,
      "learning_rate": 8.571428571428571e-05,
      "loss": 0.0045,
      "step": 59
    },
    {
      "epoch": 60.0,
      "grad_norm": 0.0038325833156704903,
      "learning_rate": 8.367346938775511e-05,
      "loss": 0.0047,
      "step": 60
    },
    {
      "epoch": 61.0,
      "grad_norm": 0.01254805363714695,
      "learning_rate": 8.163265306122449e-05,
      "loss": 0.0047,
      "step": 61
    },
    {
      "epoch": 62.0,
      "grad_norm": 0.012542281299829483,
      "learning_rate": 7.959183673469388e-05,
      "loss": 0.0047,
      "step": 62
    },
    {
      "epoch": 63.0,
      "grad_norm": 0.006473184563219547,
      "learning_rate": 7.755102040816327e-05,
      "loss": 0.0045,
      "step": 63
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.014362330548465252,
      "learning_rate": 7.551020408163266e-05,
      "loss": 0.0049,
      "step": 64
    },
    {
      "epoch": 65.0,
      "grad_norm": 0.0204475000500679,
      "learning_rate": 7.346938775510205e-05,
      "loss": 0.0047,
      "step": 65
    },
    {
      "epoch": 66.0,
      "grad_norm": 0.0023198979906737804,
      "learning_rate": 7.142857142857143e-05,
      "loss": 0.0049,
      "step": 66
    },
    {
      "epoch": 67.0,
      "grad_norm": 0.018235381692647934,
      "learning_rate": 6.938775510204082e-05,
      "loss": 0.0049,
      "step": 67
    },
    {
      "epoch": 68.0,
      "grad_norm": 0.012397863902151585,
      "learning_rate": 6.73469387755102e-05,
      "loss": 0.0047,
      "step": 68
    },
    {
      "epoch": 69.0,
      "grad_norm": 0.003842939855530858,
      "learning_rate": 6.530612244897959e-05,
      "loss": 0.0047,
      "step": 69
    },
    {
      "epoch": 70.0,
      "grad_norm": 0.003821918275207281,
      "learning_rate": 6.326530612244899e-05,
      "loss": 0.0047,
      "step": 70
    },
    {
      "epoch": 71.0,
      "grad_norm": 0.0038236272521317005,
      "learning_rate": 6.122448979591838e-05,
      "loss": 0.0047,
      "step": 71
    },
    {
      "epoch": 72.0,
      "grad_norm": 0.009961229749023914,
      "learning_rate": 5.918367346938776e-05,
      "loss": 0.0045,
      "step": 72
    },
    {
      "epoch": 73.0,
      "grad_norm": 0.0037804129533469677,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.0047,
      "step": 73
    },
    {
      "epoch": 74.0,
      "grad_norm": 0.003801505547016859,
      "learning_rate": 5.510204081632653e-05,
      "loss": 0.0047,
      "step": 74
    },
    {
      "epoch": 75.0,
      "grad_norm": 0.012337015941739082,
      "learning_rate": 5.3061224489795926e-05,
      "loss": 0.0047,
      "step": 75
    },
    {
      "epoch": 76.0,
      "grad_norm": 0.012310854159295559,
      "learning_rate": 5.102040816326531e-05,
      "loss": 0.0047,
      "step": 76
    },
    {
      "epoch": 77.0,
      "grad_norm": 0.0063883536495268345,
      "learning_rate": 4.89795918367347e-05,
      "loss": 0.0045,
      "step": 77
    },
    {
      "epoch": 78.0,
      "grad_norm": 0.0036704791709780693,
      "learning_rate": 4.6938775510204086e-05,
      "loss": 0.0047,
      "step": 78
    },
    {
      "epoch": 79.0,
      "grad_norm": 0.0038380566984415054,
      "learning_rate": 4.4897959183673474e-05,
      "loss": 0.0047,
      "step": 79
    },
    {
      "epoch": 80.0,
      "grad_norm": 0.013979257084429264,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.0049,
      "step": 80
    },
    {
      "epoch": 81.0,
      "grad_norm": 0.003754157805815339,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 0.0047,
      "step": 81
    },
    {
      "epoch": 82.0,
      "grad_norm": 0.003731819801032543,
      "learning_rate": 3.8775510204081634e-05,
      "loss": 0.0047,
      "step": 82
    },
    {
      "epoch": 83.0,
      "grad_norm": 0.003740091575309634,
      "learning_rate": 3.673469387755102e-05,
      "loss": 0.0047,
      "step": 83
    },
    {
      "epoch": 84.0,
      "grad_norm": 0.002331367926672101,
      "learning_rate": 3.469387755102041e-05,
      "loss": 0.0049,
      "step": 84
    },
    {
      "epoch": 85.0,
      "grad_norm": 0.006460592150688171,
      "learning_rate": 3.265306122448979e-05,
      "loss": 0.0045,
      "step": 85
    },
    {
      "epoch": 86.0,
      "grad_norm": 0.01226158905774355,
      "learning_rate": 3.061224489795919e-05,
      "loss": 0.0047,
      "step": 86
    },
    {
      "epoch": 87.0,
      "grad_norm": 0.012233145534992218,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.0047,
      "step": 87
    },
    {
      "epoch": 88.0,
      "grad_norm": 0.002268604002892971,
      "learning_rate": 2.6530612244897963e-05,
      "loss": 0.0049,
      "step": 88
    },
    {
      "epoch": 89.0,
      "grad_norm": 0.009719762951135635,
      "learning_rate": 2.448979591836735e-05,
      "loss": 0.0045,
      "step": 89
    },
    {
      "epoch": 90.0,
      "grad_norm": 0.003743007080629468,
      "learning_rate": 2.2448979591836737e-05,
      "loss": 0.0047,
      "step": 90
    },
    {
      "epoch": 91.0,
      "grad_norm": 0.00371920526959002,
      "learning_rate": 2.0408163265306123e-05,
      "loss": 0.0047,
      "step": 91
    },
    {
      "epoch": 92.0,
      "grad_norm": 0.0036711846478283405,
      "learning_rate": 1.836734693877551e-05,
      "loss": 0.0047,
      "step": 92
    },
    {
      "epoch": 93.0,
      "grad_norm": 0.020141731947660446,
      "learning_rate": 1.6326530612244897e-05,
      "loss": 0.0047,
      "step": 93
    },
    {
      "epoch": 94.0,
      "grad_norm": 0.0037132452707737684,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0047,
      "step": 94
    },
    {
      "epoch": 95.0,
      "grad_norm": 0.0063994950614869595,
      "learning_rate": 1.2244897959183674e-05,
      "loss": 0.0045,
      "step": 95
    },
    {
      "epoch": 96.0,
      "grad_norm": 0.0037392491940408945,
      "learning_rate": 1.0204081632653061e-05,
      "loss": 0.0047,
      "step": 96
    },
    {
      "epoch": 97.0,
      "grad_norm": 0.003673990024253726,
      "learning_rate": 8.163265306122448e-06,
      "loss": 0.0047,
      "step": 97
    },
    {
      "epoch": 98.0,
      "grad_norm": 0.012367010116577148,
      "learning_rate": 6.122448979591837e-06,
      "loss": 0.0047,
      "step": 98
    },
    {
      "epoch": 99.0,
      "grad_norm": 0.002525837393477559,
      "learning_rate": 4.081632653061224e-06,
      "loss": 0.0049,
      "step": 99
    },
    {
      "epoch": 100.0,
      "grad_norm": 0.012382498942315578,
      "learning_rate": 2.040816326530612e-06,
      "loss": 0.0047,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 500,
  "total_flos": 713859830784000.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}