{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 100,
  "global_step": 372,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008064516129032258,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 59.544,
      "step": 1
    },
    {
      "epoch": 0.016129032258064516,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 59.5229,
      "step": 2
    },
    {
      "epoch": 0.024193548387096774,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 60.4618,
      "step": 3
    },
    {
      "epoch": 0.03225806451612903,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 60.0097,
      "step": 4
    },
    {
      "epoch": 0.04032258064516129,
      "grad_norm": 114.08177947998047,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 60.0951,
      "step": 5
    },
    {
      "epoch": 0.04838709677419355,
      "grad_norm": NaN,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 60.5873,
      "step": 6
    },
    {
      "epoch": 0.056451612903225805,
      "grad_norm": NaN,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 60.1032,
      "step": 7
    },
    {
      "epoch": 0.06451612903225806,
      "grad_norm": 113.5850601196289,
      "learning_rate": 2.105263157894737e-05,
      "loss": 59.0658,
      "step": 8
    },
    {
      "epoch": 0.07258064516129033,
      "grad_norm": NaN,
      "learning_rate": 2.105263157894737e-05,
      "loss": 59.7311,
      "step": 9
    },
    {
      "epoch": 0.08064516129032258,
      "grad_norm": 106.35302734375,
      "learning_rate": 3.157894736842105e-05,
      "loss": 59.241,
      "step": 10
    },
    {
      "epoch": 0.08870967741935484,
      "grad_norm": 104.4482650756836,
      "learning_rate": 4.210526315789474e-05,
      "loss": 56.92,
      "step": 11
    },
    {
      "epoch": 0.0967741935483871,
      "grad_norm": 110.20195770263672,
      "learning_rate": 5.2631578947368424e-05,
      "loss": 53.5878,
      "step": 12
    },
    {
      "epoch": 0.10483870967741936,
      "grad_norm": 111.98957061767578,
      "learning_rate": 6.31578947368421e-05,
      "loss": 49.6114,
      "step": 13
    },
    {
      "epoch": 0.11290322580645161,
      "grad_norm": 113.57233428955078,
      "learning_rate": 7.368421052631579e-05,
      "loss": 44.8496,
      "step": 14
    },
    {
      "epoch": 0.12096774193548387,
      "grad_norm": 134.04367065429688,
      "learning_rate": 8.421052631578948e-05,
      "loss": 40.4934,
      "step": 15
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": NaN,
      "learning_rate": 8.421052631578948e-05,
      "loss": 35.9714,
      "step": 16
    },
    {
      "epoch": 0.13709677419354838,
      "grad_norm": 171.48928833007812,
      "learning_rate": 9.473684210526316e-05,
      "loss": 34.6631,
      "step": 17
    },
    {
      "epoch": 0.14516129032258066,
      "grad_norm": 158.4920196533203,
      "learning_rate": 0.00010526315789473685,
      "loss": 27.2936,
      "step": 18
    },
    {
      "epoch": 0.1532258064516129,
      "grad_norm": 128.47268676757812,
      "learning_rate": 0.00011578947368421053,
      "loss": 20.4796,
      "step": 19
    },
    {
      "epoch": 0.16129032258064516,
      "grad_norm": 106.93326568603516,
      "learning_rate": 0.0001263157894736842,
      "loss": 16.9849,
      "step": 20
    },
    {
      "epoch": 0.1693548387096774,
      "grad_norm": 115.46312713623047,
      "learning_rate": 0.0001368421052631579,
      "loss": 11.7068,
      "step": 21
    },
    {
      "epoch": 0.1774193548387097,
      "grad_norm": 100.3222427368164,
      "learning_rate": 0.00014736842105263158,
      "loss": 8.8996,
      "step": 22
    },
    {
      "epoch": 0.18548387096774194,
      "grad_norm": 83.08586120605469,
      "learning_rate": 0.00015789473684210527,
      "loss": 7.3062,
      "step": 23
    },
    {
      "epoch": 0.1935483870967742,
      "grad_norm": 162.99462890625,
      "learning_rate": 0.00016842105263157895,
      "loss": 7.0037,
      "step": 24
    },
    {
      "epoch": 0.20161290322580644,
      "grad_norm": 83.60359191894531,
      "learning_rate": 0.00017894736842105264,
      "loss": 5.3992,
      "step": 25
    },
    {
      "epoch": 0.20967741935483872,
      "grad_norm": 62.76321029663086,
      "learning_rate": 0.00018947368421052632,
      "loss": 4.2922,
      "step": 26
    },
    {
      "epoch": 0.21774193548387097,
      "grad_norm": 46.04645538330078,
      "learning_rate": 0.0002,
      "loss": 3.3493,
      "step": 27
    },
    {
      "epoch": 0.22580645161290322,
      "grad_norm": 16.34684944152832,
      "learning_rate": 0.0001999960397967811,
      "loss": 2.2553,
      "step": 28
    },
    {
      "epoch": 0.23387096774193547,
      "grad_norm": 40.65755081176758,
      "learning_rate": 0.00019998415950078858,
      "loss": 2.2114,
      "step": 29
    },
    {
      "epoch": 0.24193548387096775,
      "grad_norm": 25.546449661254883,
      "learning_rate": 0.00019996436005299012,
      "loss": 2.0519,
      "step": 30
    },
    {
      "epoch": 0.25,
      "grad_norm": 12.374382019042969,
      "learning_rate": 0.00019993664302158255,
      "loss": 1.9395,
      "step": 31
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 21.663803100585938,
      "learning_rate": 0.00019990101060186733,
      "loss": 1.9796,
      "step": 32
    },
    {
      "epoch": 0.2661290322580645,
      "grad_norm": 11.590826988220215,
      "learning_rate": 0.00019985746561607698,
      "loss": 1.8735,
      "step": 33
    },
    {
      "epoch": 0.27419354838709675,
      "grad_norm": 12.179080963134766,
      "learning_rate": 0.0001998060115131513,
      "loss": 1.9194,
      "step": 34
    },
    {
      "epoch": 0.28225806451612906,
      "grad_norm": 7.7178874015808105,
      "learning_rate": 0.00019974665236846442,
      "loss": 1.8216,
      "step": 35
    },
    {
      "epoch": 0.2903225806451613,
      "grad_norm": 9.172876358032227,
      "learning_rate": 0.00019967939288350182,
      "loss": 1.8924,
      "step": 36
    },
    {
      "epoch": 0.29838709677419356,
      "grad_norm": 9.154807090759277,
      "learning_rate": 0.00019960423838548814,
      "loss": 1.7641,
      "step": 37
    },
    {
      "epoch": 0.3064516129032258,
      "grad_norm": 9.5409574508667,
      "learning_rate": 0.00019952119482696503,
      "loss": 1.8135,
      "step": 38
    },
    {
      "epoch": 0.31451612903225806,
      "grad_norm": 8.261414527893066,
      "learning_rate": 0.00019943026878531983,
      "loss": 1.7606,
      "step": 39
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 17.037521362304688,
      "learning_rate": 0.0001993314674622646,
      "loss": 1.8723,
      "step": 40
    },
    {
      "epoch": 0.33064516129032256,
      "grad_norm": 19.483617782592773,
      "learning_rate": 0.00019922479868326578,
      "loss": 1.8385,
      "step": 41
    },
    {
      "epoch": 0.3387096774193548,
      "grad_norm": 7.89741849899292,
      "learning_rate": 0.0001991102708969241,
      "loss": 1.6889,
      "step": 42
    },
    {
      "epoch": 0.3467741935483871,
      "grad_norm": 10.712681770324707,
      "learning_rate": 0.00019898789317430575,
      "loss": 1.6362,
      "step": 43
    },
    {
      "epoch": 0.3548387096774194,
      "grad_norm": 19.9344425201416,
      "learning_rate": 0.00019885767520822376,
      "loss": 1.7803,
      "step": 44
    },
    {
      "epoch": 0.3629032258064516,
      "grad_norm": 16.474592208862305,
      "learning_rate": 0.0001987196273124703,
      "loss": 1.6135,
      "step": 45
    },
    {
      "epoch": 0.3709677419354839,
      "grad_norm": 25.3697452545166,
      "learning_rate": 0.00019857376042099983,
      "loss": 1.807,
      "step": 46
    },
    {
      "epoch": 0.3790322580645161,
      "grad_norm": 18.723379135131836,
      "learning_rate": 0.00019842008608706295,
      "loss": 1.6252,
      "step": 47
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 9.34337329864502,
      "learning_rate": 0.00019825861648229152,
      "loss": 1.5023,
      "step": 48
    },
    {
      "epoch": 0.3951612903225806,
      "grad_norm": 8.793817520141602,
      "learning_rate": 0.00019808936439573454,
      "loss": 1.5323,
      "step": 49
    },
    {
      "epoch": 0.4032258064516129,
      "grad_norm": 15.310140609741211,
      "learning_rate": 0.00019791234323284513,
      "loss": 1.5089,
      "step": 50
    },
    {
      "epoch": 0.4112903225806452,
      "grad_norm": 7.69551944732666,
      "learning_rate": 0.00019772756701441887,
      "loss": 1.4234,
      "step": 51
    },
    {
      "epoch": 0.41935483870967744,
      "grad_norm": 8.724205017089844,
      "learning_rate": 0.0001975350503754833,
      "loss": 1.5713,
      "step": 52
    },
    {
      "epoch": 0.4274193548387097,
      "grad_norm": 9.61841869354248,
      "learning_rate": 0.00019733480856413868,
      "loss": 1.4177,
      "step": 53
    },
    {
      "epoch": 0.43548387096774194,
      "grad_norm": 4.887195110321045,
      "learning_rate": 0.0001971268574403503,
      "loss": 1.3632,
      "step": 54
    },
    {
      "epoch": 0.4435483870967742,
      "grad_norm": 14.628969192504883,
      "learning_rate": 0.00019691121347469235,
      "loss": 1.3166,
      "step": 55
    },
    {
      "epoch": 0.45161290322580644,
      "grad_norm": 16.508726119995117,
      "learning_rate": 0.00019668789374704338,
      "loss": 1.4231,
      "step": 56
    },
    {
      "epoch": 0.4596774193548387,
      "grad_norm": 12.207494735717773,
      "learning_rate": 0.0001964569159452335,
      "loss": 1.4128,
      "step": 57
    },
    {
      "epoch": 0.46774193548387094,
      "grad_norm": 12.919827461242676,
      "learning_rate": 0.00019621829836364337,
      "loss": 1.2599,
      "step": 58
    },
    {
      "epoch": 0.47580645161290325,
      "grad_norm": 11.627687454223633,
      "learning_rate": 0.00019597205990175525,
      "loss": 1.314,
      "step": 59
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 16.89763641357422,
      "learning_rate": 0.00019571822006265622,
      "loss": 1.3663,
      "step": 60
    },
    {
      "epoch": 0.49193548387096775,
      "grad_norm": 11.0904541015625,
      "learning_rate": 0.00019545679895149315,
      "loss": 1.1658,
      "step": 61
    },
    {
      "epoch": 0.5,
      "grad_norm": 10.36643123626709,
      "learning_rate": 0.0001951878172738806,
      "loss": 1.1038,
      "step": 62
    },
    {
      "epoch": 0.5080645161290323,
      "grad_norm": 12.922411918640137,
      "learning_rate": 0.00019491129633426068,
      "loss": 1.2568,
      "step": 63
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 10.987399101257324,
      "learning_rate": 0.00019462725803421566,
      "loss": 1.1732,
      "step": 64
    },
    {
      "epoch": 0.5241935483870968,
      "grad_norm": 8.95875072479248,
      "learning_rate": 0.0001943357248707334,
      "loss": 1.1885,
      "step": 65
    },
    {
      "epoch": 0.532258064516129,
      "grad_norm": 12.103666305541992,
      "learning_rate": 0.0001940367199344253,
      "loss": 1.2598,
      "step": 66
    },
    {
      "epoch": 0.5403225806451613,
      "grad_norm": 13.01240062713623,
      "learning_rate": 0.00019373026690769763,
      "loss": 1.1869,
      "step": 67
    },
    {
      "epoch": 0.5483870967741935,
      "grad_norm": 7.299654960632324,
      "learning_rate": 0.0001934163900628756,
      "loss": 1.1157,
      "step": 68
    },
    {
      "epoch": 0.5564516129032258,
      "grad_norm": 20.50505256652832,
      "learning_rate": 0.00019309511426028104,
      "loss": 1.3221,
      "step": 69
    },
    {
      "epoch": 0.5645161290322581,
      "grad_norm": 12.676935195922852,
      "learning_rate": 0.00019276646494626332,
      "loss": 1.1948,
      "step": 70
    },
    {
      "epoch": 0.5725806451612904,
      "grad_norm": 13.313145637512207,
      "learning_rate": 0.00019243046815118386,
      "loss": 1.387,
      "step": 71
    },
    {
      "epoch": 0.5806451612903226,
      "grad_norm": 17.019075393676758,
      "learning_rate": 0.00019208715048735445,
      "loss": 1.3289,
      "step": 72
    },
    {
      "epoch": 0.5887096774193549,
      "grad_norm": 9.776533126831055,
      "learning_rate": 0.00019173653914692946,
      "loss": 1.1864,
      "step": 73
    },
    {
      "epoch": 0.5967741935483871,
      "grad_norm": 11.344463348388672,
      "learning_rate": 0.00019137866189975202,
      "loss": 1.1421,
      "step": 74
    },
    {
      "epoch": 0.6048387096774194,
      "grad_norm": 15.751832962036133,
      "learning_rate": 0.00019101354709115468,
      "loss": 1.2573,
      "step": 75
    },
    {
      "epoch": 0.6129032258064516,
      "grad_norm": 6.035460472106934,
      "learning_rate": 0.00019064122363971427,
      "loss": 1.1191,
      "step": 76
    },
    {
      "epoch": 0.6209677419354839,
      "grad_norm": 10.531025886535645,
      "learning_rate": 0.00019026172103496137,
      "loss": 1.1309,
      "step": 77
    },
    {
      "epoch": 0.6290322580645161,
      "grad_norm": 9.877840042114258,
      "learning_rate": 0.0001898750693350447,
      "loss": 1.068,
      "step": 78
    },
    {
      "epoch": 0.6370967741935484,
      "grad_norm": 5.4536662101745605,
      "learning_rate": 0.00018948129916435046,
      "loss": 0.9681,
      "step": 79
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 7.712090015411377,
      "learning_rate": 0.00018908044171107657,
      "loss": 1.0911,
      "step": 80
    },
    {
      "epoch": 0.6532258064516129,
      "grad_norm": 7.649374485015869,
      "learning_rate": 0.00018867252872476257,
      "loss": 1.0275,
      "step": 81
    },
    {
      "epoch": 0.6612903225806451,
      "grad_norm": 6.879230976104736,
      "learning_rate": 0.00018825759251377483,
      "loss": 1.0487,
      "step": 82
    },
    {
      "epoch": 0.6693548387096774,
      "grad_norm": 8.820428848266602,
      "learning_rate": 0.00018783566594274783,
      "loss": 0.9653,
      "step": 83
    },
    {
      "epoch": 0.6774193548387096,
      "grad_norm": 25.527233123779297,
      "learning_rate": 0.00018740678242998077,
      "loss": 1.0172,
      "step": 84
    },
    {
      "epoch": 0.6854838709677419,
      "grad_norm": 8.481152534484863,
      "learning_rate": 0.00018697097594479103,
      "loss": 1.1271,
      "step": 85
    },
    {
      "epoch": 0.6935483870967742,
      "grad_norm": 3.5627005100250244,
      "learning_rate": 0.0001865282810048235,
      "loss": 0.9596,
      "step": 86
    },
    {
      "epoch": 0.7016129032258065,
      "grad_norm": 4.959561824798584,
      "learning_rate": 0.0001860787326733168,
      "loss": 1.0527,
      "step": 87
    },
    {
      "epoch": 0.7096774193548387,
      "grad_norm": 8.863293647766113,
      "learning_rate": 0.0001856223665563258,
      "loss": 0.9017,
      "step": 88
    },
    {
      "epoch": 0.717741935483871,
      "grad_norm": 14.67612361907959,
      "learning_rate": 0.00018515921879990187,
      "loss": 1.1214,
      "step": 89
    },
    {
      "epoch": 0.7258064516129032,
      "grad_norm": 8.897265434265137,
      "learning_rate": 0.00018468932608722973,
      "loss": 1.0049,
      "step": 90
    },
    {
      "epoch": 0.7338709677419355,
      "grad_norm": 8.269657135009766,
      "learning_rate": 0.000184212725635722,
      "loss": 0.9546,
      "step": 91
    },
    {
      "epoch": 0.7419354838709677,
      "grad_norm": 10.737746238708496,
      "learning_rate": 0.00018372945519407158,
      "loss": 0.9053,
      "step": 92
    },
    {
      "epoch": 0.75,
      "grad_norm": 6.052773952484131,
      "learning_rate": 0.00018323955303926163,
      "loss": 0.8894,
      "step": 93
    },
    {
      "epoch": 0.7580645161290323,
      "grad_norm": 7.422825336456299,
      "learning_rate": 0.00018274305797353395,
      "loss": 0.9973,
      "step": 94
    },
    {
      "epoch": 0.7661290322580645,
      "grad_norm": 8.900811195373535,
      "learning_rate": 0.00018224000932131568,
      "loss": 0.9815,
      "step": 95
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 14.36154556274414,
      "learning_rate": 0.00018173044692610467,
      "loss": 1.0694,
      "step": 96
    },
    {
      "epoch": 0.782258064516129,
      "grad_norm": 10.11827278137207,
      "learning_rate": 0.00018121441114731367,
      "loss": 0.9346,
      "step": 97
    },
    {
      "epoch": 0.7903225806451613,
      "grad_norm": 3.4843525886535645,
      "learning_rate": 0.0001806919428570737,
      "loss": 0.9354,
      "step": 98
    },
    {
      "epoch": 0.7983870967741935,
      "grad_norm": 8.487842559814453,
      "learning_rate": 0.00018016308343699687,
      "loss": 0.9226,
      "step": 99
    },
    {
      "epoch": 0.8064516129032258,
      "grad_norm": 7.202905178070068,
      "learning_rate": 0.00017962787477489878,
      "loss": 0.8934,
      "step": 100
    },
    {
      "epoch": 0.8064516129032258,
      "eval_loss": 0.24680930376052856,
      "eval_runtime": 10.7775,
      "eval_samples_per_second": 18.557,
      "eval_steps_per_second": 0.464,
      "step": 100
    },
    {
      "epoch": 0.8145161290322581,
      "grad_norm": 4.741288661956787,
      "learning_rate": 0.00017908635926148069,
      "loss": 0.8684,
      "step": 101
    },
    {
      "epoch": 0.8225806451612904,
      "grad_norm": 12.127788543701172,
      "learning_rate": 0.00017853857978697223,
      "loss": 1.1226,
      "step": 102
    },
    {
      "epoch": 0.8306451612903226,
      "grad_norm": 7.634512901306152,
      "learning_rate": 0.00017798457973773417,
      "loss": 0.8809,
      "step": 103
    },
    {
      "epoch": 0.8387096774193549,
      "grad_norm": 12.988190650939941,
      "learning_rate": 0.00017742440299282203,
      "loss": 0.958,
      "step": 104
    },
    {
      "epoch": 0.8467741935483871,
      "grad_norm": 11.392962455749512,
      "learning_rate": 0.00017685809392051083,
      "loss": 1.0527,
      "step": 105
    },
    {
      "epoch": 0.8548387096774194,
      "grad_norm": 4.548227787017822,
      "learning_rate": 0.00017628569737478076,
      "loss": 0.9724,
      "step": 106
    },
    {
      "epoch": 0.8629032258064516,
      "grad_norm": 7.020524501800537,
      "learning_rate": 0.00017570725869176467,
      "loss": 0.8896,
      "step": 107
    },
    {
      "epoch": 0.8709677419354839,
      "grad_norm": 7.219239711761475,
      "learning_rate": 0.00017512282368615728,
      "loss": 0.8158,
      "step": 108
    },
    {
      "epoch": 0.8790322580645161,
      "grad_norm": 10.470832824707031,
      "learning_rate": 0.00017453243864758638,
      "loss": 0.9682,
      "step": 109
    },
    {
      "epoch": 0.8870967741935484,
      "grad_norm": 7.766493797302246,
      "learning_rate": 0.00017393615033694656,
      "loss": 0.8107,
      "step": 110
    },
    {
      "epoch": 0.8951612903225806,
      "grad_norm": 7.319194793701172,
      "learning_rate": 0.0001733340059826956,
      "loss": 0.8932,
      "step": 111
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 5.918209075927734,
      "learning_rate": 0.00017272605327711365,
      "loss": 0.7721,
      "step": 112
    },
    {
      "epoch": 0.9112903225806451,
      "grad_norm": 5.775798320770264,
      "learning_rate": 0.000172112340372526,
      "loss": 0.9207,
      "step": 113
    },
    {
      "epoch": 0.9193548387096774,
      "grad_norm": 7.500247001647949,
      "learning_rate": 0.00017149291587748898,
      "loss": 0.8275,
      "step": 114
    },
    {
      "epoch": 0.9274193548387096,
      "grad_norm": 9.166544914245605,
      "learning_rate": 0.00017086782885294025,
      "loss": 0.9284,
      "step": 115
    },
    {
      "epoch": 0.9354838709677419,
      "grad_norm": 6.179811000823975,
      "learning_rate": 0.0001702371288083127,
      "loss": 0.7416,
      "step": 116
    },
    {
      "epoch": 0.9435483870967742,
      "grad_norm": 8.84774112701416,
      "learning_rate": 0.00016960086569761332,
      "loss": 0.8153,
      "step": 117
    },
    {
      "epoch": 0.9516129032258065,
      "grad_norm": 4.8080644607543945,
      "learning_rate": 0.0001689590899154664,
      "loss": 0.7648,
      "step": 118
    },
    {
      "epoch": 0.9596774193548387,
      "grad_norm": 9.035804748535156,
      "learning_rate": 0.00016831185229312237,
      "loss": 0.8812,
      "step": 119
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 9.008280754089355,
      "learning_rate": 0.0001676592040944315,
      "loss": 0.8751,
      "step": 120
    },
    {
      "epoch": 0.9758064516129032,
      "grad_norm": 4.9784016609191895,
      "learning_rate": 0.0001670011970117838,
      "loss": 0.9186,
      "step": 121
    },
    {
      "epoch": 0.9838709677419355,
      "grad_norm": 4.657833576202393,
      "learning_rate": 0.00016633788316201454,
      "loss": 0.8455,
      "step": 122
    },
    {
      "epoch": 0.9919354838709677,
      "grad_norm": 6.071223735809326,
      "learning_rate": 0.0001656693150822766,
      "loss": 0.8577,
      "step": 123
    },
    {
      "epoch": 1.0,
      "grad_norm": 9.496427536010742,
      "learning_rate": 0.0001649955457258792,
      "loss": 0.8261,
      "step": 124
    },
    {
      "epoch": 1.0080645161290323,
      "grad_norm": 8.591066360473633,
      "learning_rate": 0.00016431662845809388,
      "loss": 0.8075,
      "step": 125
    },
    {
      "epoch": 1.0161290322580645,
      "grad_norm": 4.906324863433838,
      "learning_rate": 0.00016363261705192757,
      "loss": 0.7008,
      "step": 126
    },
    {
      "epoch": 1.0241935483870968,
      "grad_norm": 8.301801681518555,
      "learning_rate": 0.00016294356568386369,
      "loss": 0.749,
      "step": 127
    },
    {
      "epoch": 1.032258064516129,
      "grad_norm": 7.614622116088867,
      "learning_rate": 0.00016224952892957123,
      "loss": 0.8998,
      "step": 128
    },
    {
      "epoch": 1.0403225806451613,
      "grad_norm": 5.074779510498047,
      "learning_rate": 0.0001615505617595819,
      "loss": 0.703,
      "step": 129
    },
    {
      "epoch": 1.0483870967741935,
      "grad_norm": 6.552790641784668,
      "learning_rate": 0.00016084671953493643,
      "loss": 0.7045,
      "step": 130
    },
    {
      "epoch": 1.0564516129032258,
      "grad_norm": 10.763309478759766,
      "learning_rate": 0.00016013805800279976,
      "loss": 0.8401,
      "step": 131
    },
    {
      "epoch": 1.064516129032258,
      "grad_norm": 11.931623458862305,
      "learning_rate": 0.00015942463329204546,
      "loss": 0.8917,
      "step": 132
    },
    {
      "epoch": 1.0725806451612903,
      "grad_norm": 4.281033515930176,
      "learning_rate": 0.00015870650190881022,
      "loss": 0.7959,
      "step": 133
    },
    {
      "epoch": 1.0806451612903225,
      "grad_norm": 12.4117431640625,
      "learning_rate": 0.00015798372073201836,
      "loss": 0.9254,
      "step": 134
    },
    {
      "epoch": 1.0887096774193548,
      "grad_norm": 11.218693733215332,
      "learning_rate": 0.00015725634700887678,
      "loss": 1.0555,
      "step": 135
    },
    {
      "epoch": 1.096774193548387,
      "grad_norm": 7.100437641143799,
      "learning_rate": 0.00015652443835034068,
      "loss": 0.7427,
      "step": 136
    },
    {
      "epoch": 1.1048387096774193,
      "grad_norm": 12.360247611999512,
      "learning_rate": 0.0001557880527265505,
      "loss": 0.8966,
      "step": 137
    },
    {
      "epoch": 1.1129032258064515,
      "grad_norm": 13.585692405700684,
      "learning_rate": 0.00015504724846224064,
      "loss": 0.822,
      "step": 138
    },
    {
      "epoch": 1.120967741935484,
      "grad_norm": 5.876780986785889,
      "learning_rate": 0.00015430208423211975,
      "loss": 0.7431,
      "step": 139
    },
    {
      "epoch": 1.129032258064516,
      "grad_norm": 7.8429856300354,
      "learning_rate": 0.00015355261905622343,
      "loss": 0.7748,
      "step": 140
    },
    {
      "epoch": 1.1370967741935485,
      "grad_norm": 5.8134074211120605,
      "learning_rate": 0.0001527989122952398,
      "loss": 0.6663,
      "step": 141
    },
    {
      "epoch": 1.1451612903225807,
      "grad_norm": 4.2216291427612305,
      "learning_rate": 0.00015204102364580765,
      "loss": 0.7218,
      "step": 142
    },
    {
      "epoch": 1.153225806451613,
      "grad_norm": 5.702169895172119,
      "learning_rate": 0.00015127901313578831,
      "loss": 0.8223,
      "step": 143
    },
    {
      "epoch": 1.1612903225806452,
      "grad_norm": 9.340275764465332,
      "learning_rate": 0.00015051294111951134,
      "loss": 0.7887,
      "step": 144
    },
    {
      "epoch": 1.1693548387096775,
      "grad_norm": 6.679466247558594,
      "learning_rate": 0.000149742868272994,
      "loss": 0.7098,
      "step": 145
    },
    {
      "epoch": 1.1774193548387097,
      "grad_norm": 9.664475440979004,
      "learning_rate": 0.00014896885558913562,
      "loss": 0.6828,
      "step": 146
    },
    {
      "epoch": 1.185483870967742,
      "grad_norm": 7.441429615020752,
      "learning_rate": 0.00014819096437288664,
      "loss": 0.6728,
      "step": 147
    },
    {
      "epoch": 1.1935483870967742,
      "grad_norm": 5.626153945922852,
      "learning_rate": 0.000147409256236393,
      "loss": 0.7832,
      "step": 148
    },
    {
      "epoch": 1.2016129032258065,
      "grad_norm": 8.72919750213623,
      "learning_rate": 0.0001466237930941163,
      "loss": 0.7496,
      "step": 149
    },
    {
      "epoch": 1.2096774193548387,
      "grad_norm": 11.019682884216309,
      "learning_rate": 0.00014583463715792984,
      "loss": 0.8167,
      "step": 150
    },
    {
      "epoch": 1.217741935483871,
      "grad_norm": 6.4603729248046875,
      "learning_rate": 0.00014504185093219116,
      "loss": 0.6298,
      "step": 151
    },
    {
      "epoch": 1.2258064516129032,
      "grad_norm": 3.6375272274017334,
      "learning_rate": 0.0001442454972087915,
      "loss": 0.7418,
      "step": 152
    },
    {
      "epoch": 1.2338709677419355,
      "grad_norm": 9.633164405822754,
      "learning_rate": 0.00014344563906218256,
      "loss": 0.6086,
      "step": 153
    },
    {
      "epoch": 1.2419354838709677,
      "grad_norm": 14.596175193786621,
      "learning_rate": 0.0001426423398443803,
      "loss": 0.941,
      "step": 154
    },
    {
      "epoch": 1.25,
      "grad_norm": 12.69194507598877,
      "learning_rate": 0.0001418356631799478,
      "loss": 0.8116,
      "step": 155
    },
    {
      "epoch": 1.2580645161290323,
      "grad_norm": 10.628227233886719,
      "learning_rate": 0.00014102567296095551,
      "loss": 0.8101,
      "step": 156
    },
    {
      "epoch": 1.2661290322580645,
      "grad_norm": 8.566390037536621,
      "learning_rate": 0.00014021243334192082,
      "loss": 0.844,
      "step": 157
    },
    {
      "epoch": 1.2741935483870968,
      "grad_norm": 7.864006519317627,
      "learning_rate": 0.00013939600873472694,
      "loss": 0.7901,
      "step": 158
    },
    {
      "epoch": 1.282258064516129,
      "grad_norm": 6.181084156036377,
      "learning_rate": 0.00013857646380352102,
      "loss": 0.6985,
      "step": 159
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 7.077906131744385,
      "learning_rate": 0.00013775386345959246,
      "loss": 0.7662,
      "step": 160
    },
    {
      "epoch": 1.2983870967741935,
      "grad_norm": 6.237743854522705,
      "learning_rate": 0.00013692827285623197,
      "loss": 0.7911,
      "step": 161
    },
    {
      "epoch": 1.3064516129032258,
      "grad_norm": 5.7443389892578125,
      "learning_rate": 0.0001360997573835708,
      "loss": 0.7973,
      "step": 162
    },
    {
      "epoch": 1.314516129032258,
      "grad_norm": 4.632808685302734,
      "learning_rate": 0.00013526838266340177,
      "loss": 0.7027,
      "step": 163
    },
    {
      "epoch": 1.3225806451612903,
      "grad_norm": 7.703153133392334,
      "learning_rate": 0.00013443421454398174,
      "loss": 0.7742,
      "step": 164
    },
    {
      "epoch": 1.3306451612903225,
      "grad_norm": 5.662054538726807,
      "learning_rate": 0.00013359731909481616,
      "loss": 0.7441,
      "step": 165
    },
    {
      "epoch": 1.3387096774193548,
      "grad_norm": 6.306896686553955,
      "learning_rate": 0.00013275776260142608,
      "loss": 0.7104,
      "step": 166
    },
    {
      "epoch": 1.346774193548387,
      "grad_norm": 7.059073448181152,
      "learning_rate": 0.00013191561156009803,
      "loss": 0.7174,
      "step": 167
    },
    {
      "epoch": 1.3548387096774195,
      "grad_norm": 7.520137310028076,
      "learning_rate": 0.0001310709326726173,
      "loss": 0.7067,
      "step": 168
    },
    {
      "epoch": 1.3629032258064515,
      "grad_norm": 5.496311187744141,
      "learning_rate": 0.00013022379284098487,
      "loss": 0.6174,
      "step": 169
    },
    {
      "epoch": 1.370967741935484,
      "grad_norm": 8.635679244995117,
      "learning_rate": 0.00012937425916211852,
      "loss": 0.7132,
      "step": 170
    },
    {
      "epoch": 1.379032258064516,
      "grad_norm": 9.62961196899414,
      "learning_rate": 0.00012852239892253842,
      "loss": 0.6957,
      "step": 171
    },
    {
      "epoch": 1.3870967741935485,
      "grad_norm": 4.993870735168457,
      "learning_rate": 0.00012766827959303787,
      "loss": 0.5696,
      "step": 172
    },
    {
      "epoch": 1.3951612903225805,
      "grad_norm": 7.96873664855957,
      "learning_rate": 0.00012681196882333916,
      "loss": 0.7912,
      "step": 173
    },
    {
      "epoch": 1.403225806451613,
      "grad_norm": 4.273744106292725,
      "learning_rate": 0.0001259535344367357,
      "loss": 0.6703,
      "step": 174
    },
    {
      "epoch": 1.4112903225806452,
      "grad_norm": 4.046467304229736,
      "learning_rate": 0.00012509304442471985,
      "loss": 0.7389,
      "step": 175
    },
    {
      "epoch": 1.4193548387096775,
      "grad_norm": 6.154615879058838,
      "learning_rate": 0.0001242305669415979,
      "loss": 0.6684,
      "step": 176
    },
    {
      "epoch": 1.4274193548387097,
      "grad_norm": 3.555600166320801,
      "learning_rate": 0.00012336617029909205,
      "loss": 0.7995,
      "step": 177
    },
    {
      "epoch": 1.435483870967742,
      "grad_norm": 5.557123184204102,
      "learning_rate": 0.00012249992296092956,
      "loss": 0.6675,
      "step": 178
    },
    {
      "epoch": 1.4435483870967742,
      "grad_norm": 2.9730565547943115,
      "learning_rate": 0.00012163189353742035,
      "loss": 0.6415,
      "step": 179
    },
    {
      "epoch": 1.4516129032258065,
      "grad_norm": 4.130847930908203,
      "learning_rate": 0.00012076215078002278,
      "loss": 0.6912,
      "step": 180
    },
    {
      "epoch": 1.4596774193548387,
      "grad_norm": 7.072065830230713,
      "learning_rate": 0.0001198907635758982,
      "loss": 0.767,
      "step": 181
    },
    {
      "epoch": 1.467741935483871,
      "grad_norm": 4.085969924926758,
      "learning_rate": 0.00011901780094245483,
      "loss": 0.6037,
      "step": 182
    },
    {
      "epoch": 1.4758064516129032,
      "grad_norm": 3.051870346069336,
      "learning_rate": 0.00011814333202188126,
      "loss": 0.6929,
      "step": 183
    },
    {
      "epoch": 1.4838709677419355,
      "grad_norm": 3.356917142868042,
      "learning_rate": 0.0001172674260756702,
      "loss": 0.7698,
      "step": 184
    },
    {
      "epoch": 1.4919354838709677,
      "grad_norm": 7.061250686645508,
      "learning_rate": 0.00011639015247913261,
      "loss": 0.733,
      "step": 185
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.7595629692077637,
      "learning_rate": 0.0001155115807159029,
      "loss": 0.7255,
      "step": 186
    },
    {
      "epoch": 1.5080645161290323,
      "grad_norm": 3.9517953395843506,
      "learning_rate": 0.00011463178037243554,
      "loss": 0.6812,
      "step": 187
    },
    {
      "epoch": 1.5161290322580645,
      "grad_norm": 5.799162864685059,
      "learning_rate": 0.0001137508211324936,
      "loss": 0.5861,
      "step": 188
    },
    {
      "epoch": 1.5241935483870968,
      "grad_norm": 3.553206443786621,
      "learning_rate": 0.00011286877277162943,
      "loss": 0.5584,
      "step": 189
    },
    {
      "epoch": 1.532258064516129,
      "grad_norm": 5.5843048095703125,
      "learning_rate": 0.00011198570515165822,
      "loss": 0.7282,
      "step": 190
    },
    {
      "epoch": 1.5403225806451613,
      "grad_norm": 4.415789604187012,
      "learning_rate": 0.00011110168821512452,
      "loss": 0.6123,
      "step": 191
    },
    {
      "epoch": 1.5483870967741935,
      "grad_norm": 3.1573801040649414,
      "learning_rate": 0.00011021679197976274,
      "loss": 0.6823,
      "step": 192
    },
    {
      "epoch": 1.5564516129032258,
      "grad_norm": 3.488051652908325,
      "learning_rate": 0.00010933108653295128,
      "loss": 0.604,
      "step": 193
    },
    {
      "epoch": 1.564516129032258,
      "grad_norm": 7.8006086349487305,
      "learning_rate": 0.00010844464202616127,
      "loss": 0.6616,
      "step": 194
    },
    {
      "epoch": 1.5725806451612905,
      "grad_norm": 4.5592498779296875,
      "learning_rate": 0.00010755752866940062,
      "loss": 0.5993,
      "step": 195
    },
    {
      "epoch": 1.5806451612903225,
      "grad_norm": 3.743605613708496,
      "learning_rate": 0.0001066698167256527,
      "loss": 0.6507,
      "step": 196
    },
    {
      "epoch": 1.588709677419355,
      "grad_norm": 5.72109317779541,
      "learning_rate": 0.00010578157650531146,
      "loss": 0.6712,
      "step": 197
    },
    {
      "epoch": 1.596774193548387,
      "grad_norm": 5.807249069213867,
      "learning_rate": 0.00010489287836061246,
      "loss": 0.7235,
      "step": 198
    },
    {
      "epoch": 1.6048387096774195,
      "grad_norm": 4.517931938171387,
      "learning_rate": 0.00010400379268006082,
      "loss": 0.656,
      "step": 199
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 4.48443603515625,
      "learning_rate": 0.00010311438988285598,
      "loss": 0.5739,
      "step": 200
    },
    {
      "epoch": 1.6129032258064515,
      "eval_loss": 0.21073505282402039,
      "eval_runtime": 10.7806,
      "eval_samples_per_second": 18.552,
      "eval_steps_per_second": 0.464,
      "step": 200
    },
    {
      "epoch": 1.620967741935484,
      "grad_norm": 9.732367515563965,
      "learning_rate": 0.00010222474041331436,
      "loss": 0.7626,
      "step": 201
    },
    {
      "epoch": 1.629032258064516,
      "grad_norm": 4.349972248077393,
      "learning_rate": 0.0001013349147352898,
      "loss": 0.4905,
      "step": 202
    },
    {
      "epoch": 1.6370967741935485,
      "grad_norm": 4.75486946105957,
      "learning_rate": 0.00010044498332659264,
      "loss": 0.6628,
      "step": 203
    },
    {
      "epoch": 1.6451612903225805,
      "grad_norm": 7.873876094818115,
      "learning_rate": 9.955501667340741e-05,
      "loss": 0.6695,
      "step": 204
    },
    {
      "epoch": 1.653225806451613,
      "grad_norm": 7.211569786071777,
      "learning_rate": 9.866508526471023e-05,
      "loss": 0.6948,
      "step": 205
    },
    {
      "epoch": 1.661290322580645,
      "grad_norm": 4.801213264465332,
      "learning_rate": 9.77752595866857e-05,
      "loss": 0.6499,
      "step": 206
    },
    {
      "epoch": 1.6693548387096775,
      "grad_norm": 3.598294496536255,
      "learning_rate": 9.688561011714404e-05,
      "loss": 0.5904,
      "step": 207
    },
    {
      "epoch": 1.6774193548387095,
      "grad_norm": 5.483266353607178,
      "learning_rate": 9.599620731993922e-05,
      "loss": 0.7218,
      "step": 208
    },
    {
      "epoch": 1.685483870967742,
      "grad_norm": 6.151427268981934,
      "learning_rate": 9.510712163938755e-05,
      "loss": 0.546,
      "step": 209
    },
    {
      "epoch": 1.6935483870967742,
      "grad_norm": 5.117896556854248,
      "learning_rate": 9.421842349468855e-05,
      "loss": 0.8246,
      "step": 210
    },
    {
      "epoch": 1.7016129032258065,
      "grad_norm": 2.2528913021087646,
      "learning_rate": 9.333018327434731e-05,
      "loss": 0.5807,
      "step": 211
    },
    {
      "epoch": 1.7096774193548387,
      "grad_norm": 6.890749931335449,
      "learning_rate": 9.244247133059938e-05,
      "loss": 0.6589,
      "step": 212
    },
    {
      "epoch": 1.717741935483871,
      "grad_norm": 5.586848258972168,
      "learning_rate": 9.155535797383874e-05,
      "loss": 0.6099,
      "step": 213
    },
    {
      "epoch": 1.7258064516129032,
      "grad_norm": 4.517979145050049,
      "learning_rate": 9.066891346704875e-05,
      "loss": 0.7462,
      "step": 214
    },
    {
      "epoch": 1.7338709677419355,
      "grad_norm": 4.036890029907227,
      "learning_rate": 8.978320802023731e-05,
      "loss": 0.5783,
      "step": 215
    },
    {
      "epoch": 1.7419354838709677,
      "grad_norm": 3.646713972091675,
      "learning_rate": 8.88983117848755e-05,
      "loss": 0.6252,
      "step": 216
    },
    {
      "epoch": 1.75,
      "grad_norm": 3.6490514278411865,
      "learning_rate": 8.801429484834183e-05,
      "loss": 0.5002,
      "step": 217
    },
    {
      "epoch": 1.7580645161290323,
      "grad_norm": 3.775611162185669,
      "learning_rate": 8.713122722837058e-05,
      "loss": 0.7321,
      "step": 218
    },
    {
      "epoch": 1.7661290322580645,
      "grad_norm": 3.230431318283081,
      "learning_rate": 8.624917886750638e-05,
      "loss": 0.6099,
      "step": 219
    },
    {
      "epoch": 1.7741935483870968,
      "grad_norm": 4.476761817932129,
      "learning_rate": 8.536821962756447e-05,
      "loss": 0.6728,
      "step": 220
    },
    {
      "epoch": 1.782258064516129,
      "grad_norm": 4.384328365325928,
      "learning_rate": 8.448841928409711e-05,
      "loss": 0.673,
      "step": 221
    },
    {
      "epoch": 1.7903225806451613,
      "grad_norm": 4.286776065826416,
      "learning_rate": 8.360984752086743e-05,
      "loss": 0.6254,
      "step": 222
    },
    {
      "epoch": 1.7983870967741935,
      "grad_norm": 3.730931043624878,
      "learning_rate": 8.273257392432981e-05,
      "loss": 0.5843,
      "step": 223
    },
    {
      "epoch": 1.8064516129032258,
      "grad_norm": 6.454822063446045,
      "learning_rate": 8.185666797811878e-05,
      "loss": 0.5164,
      "step": 224
    },
    {
      "epoch": 1.814516129032258,
      "grad_norm": 2.4952480792999268,
      "learning_rate": 8.09821990575452e-05,
      "loss": 0.579,
      "step": 225
    },
    {
      "epoch": 1.8225806451612905,
      "grad_norm": 4.976474285125732,
      "learning_rate": 8.010923642410184e-05,
      "loss": 0.6086,
      "step": 226
    },
    {
      "epoch": 1.8306451612903225,
      "grad_norm": 4.4986138343811035,
      "learning_rate": 7.923784921997726e-05,
      "loss": 0.69,
      "step": 227
    },
    {
      "epoch": 1.838709677419355,
      "grad_norm": 3.605375289916992,
      "learning_rate": 7.836810646257971e-05,
      "loss": 0.6618,
      "step": 228
    },
    {
      "epoch": 1.846774193548387,
      "grad_norm": 5.670890808105469,
      "learning_rate": 7.750007703907046e-05,
      "loss": 0.7193,
      "step": 229
    },
    {
      "epoch": 1.8548387096774195,
      "grad_norm": 3.677788019180298,
      "learning_rate": 7.663382970090795e-05,
      "loss": 0.6999,
      "step": 230
    },
    {
      "epoch": 1.8629032258064515,
      "grad_norm": 3.3995673656463623,
      "learning_rate": 7.57694330584021e-05,
      "loss": 0.7301,
      "step": 231
    },
    {
      "epoch": 1.870967741935484,
      "grad_norm": 4.24800968170166,
      "learning_rate": 7.490695557528016e-05,
      "loss": 0.6244,
      "step": 232
    },
    {
      "epoch": 1.879032258064516,
      "grad_norm": 5.893555641174316,
      "learning_rate": 7.404646556326433e-05,
      "loss": 0.6922,
      "step": 233
    },
    {
      "epoch": 1.8870967741935485,
      "grad_norm": 2.421617269515991,
      "learning_rate": 7.318803117666084e-05,
      "loss": 0.6497,
      "step": 234
    },
    {
      "epoch": 1.8951612903225805,
      "grad_norm": 4.8660569190979,
      "learning_rate": 7.233172040696216e-05,
      "loss": 0.5576,
      "step": 235
    },
    {
      "epoch": 1.903225806451613,
      "grad_norm": 3.6708362102508545,
      "learning_rate": 7.14776010774616e-05,
      "loss": 0.5174,
      "step": 236
    },
    {
      "epoch": 1.911290322580645,
      "grad_norm": 3.4802322387695312,
      "learning_rate": 7.062574083788152e-05,
      "loss": 0.5394,
      "step": 237
    },
    {
      "epoch": 1.9193548387096775,
      "grad_norm": 5.204659461975098,
      "learning_rate": 6.977620715901514e-05,
      "loss": 0.712,
      "step": 238
    },
    {
      "epoch": 1.9274193548387095,
      "grad_norm": 4.288372039794922,
      "learning_rate": 6.892906732738271e-05,
      "loss": 0.5996,
      "step": 239
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 4.846992492675781,
      "learning_rate": 6.8084388439902e-05,
      "loss": 0.5977,
      "step": 240
    },
    {
      "epoch": 1.9435483870967742,
      "grad_norm": 4.843072891235352,
      "learning_rate": 6.724223739857392e-05,
      "loss": 0.6791,
      "step": 241
    },
    {
      "epoch": 1.9516129032258065,
      "grad_norm": 3.891608953475952,
      "learning_rate": 6.640268090518385e-05,
      "loss": 0.6179,
      "step": 242
    },
    {
      "epoch": 1.9596774193548387,
      "grad_norm": 3.6982712745666504,
      "learning_rate": 6.556578545601829e-05,
      "loss": 0.5845,
      "step": 243
    },
    {
      "epoch": 1.967741935483871,
      "grad_norm": 5.458211421966553,
      "learning_rate": 6.473161733659828e-05,
      "loss": 0.5844,
      "step": 244
    },
    {
      "epoch": 1.9758064516129032,
      "grad_norm": 3.0514776706695557,
      "learning_rate": 6.390024261642922e-05,
      "loss": 0.6009,
      "step": 245
    },
    {
      "epoch": 1.9838709677419355,
      "grad_norm": 3.7049574851989746,
      "learning_rate": 6.307172714376808e-05,
      "loss": 0.5825,
      "step": 246
    },
    {
      "epoch": 1.9919354838709677,
      "grad_norm": 6.664052963256836,
      "learning_rate": 6.224613654040753e-05,
      "loss": 0.6885,
      "step": 247
    },
    {
      "epoch": 2.0,
      "grad_norm": 4.781983375549316,
      "learning_rate": 6.142353619647903e-05,
      "loss": 0.6091,
      "step": 248
    },
    {
      "epoch": 2.0080645161290325,
      "grad_norm": 4.3253631591796875,
      "learning_rate": 6.0603991265273074e-05,
      "loss": 0.4975,
      "step": 249
    },
    {
      "epoch": 2.0161290322580645,
      "grad_norm": 3.887197494506836,
      "learning_rate": 5.978756665807917e-05,
      "loss": 0.4938,
      "step": 250
    },
    {
      "epoch": 2.024193548387097,
      "grad_norm": 3.1507112979888916,
      "learning_rate": 5.897432703904453e-05,
      "loss": 0.5248,
      "step": 251
    },
    {
      "epoch": 2.032258064516129,
      "grad_norm": 4.295828819274902,
      "learning_rate": 5.8164336820052203e-05,
      "loss": 0.5077,
      "step": 252
    },
    {
      "epoch": 2.0403225806451615,
      "grad_norm": 2.8793046474456787,
      "learning_rate": 5.735766015561971e-05,
      "loss": 0.4977,
      "step": 253
    },
    {
      "epoch": 2.0483870967741935,
      "grad_norm": 2.919048547744751,
      "learning_rate": 5.65543609378175e-05,
      "loss": 0.445,
      "step": 254
    },
    {
      "epoch": 2.056451612903226,
      "grad_norm": 5.547980308532715,
      "learning_rate": 5.5754502791208504e-05,
      "loss": 0.5433,
      "step": 255
    },
    {
      "epoch": 2.064516129032258,
      "grad_norm": 4.959382057189941,
      "learning_rate": 5.495814906780886e-05,
      "loss": 0.45,
      "step": 256
    },
    {
      "epoch": 2.0725806451612905,
      "grad_norm": 3.9832897186279297,
      "learning_rate": 5.4165362842070185e-05,
      "loss": 0.4753,
      "step": 257
    },
    {
      "epoch": 2.0806451612903225,
      "grad_norm": 4.0972466468811035,
      "learning_rate": 5.3376206905883694e-05,
      "loss": 0.4636,
      "step": 258
    },
    {
      "epoch": 2.088709677419355,
      "grad_norm": 4.405402183532715,
      "learning_rate": 5.259074376360701e-05,
      "loss": 0.4874,
      "step": 259
    },
    {
      "epoch": 2.096774193548387,
      "grad_norm": 4.802483081817627,
      "learning_rate": 5.18090356271134e-05,
      "loss": 0.5898,
      "step": 260
    },
    {
      "epoch": 2.1048387096774195,
      "grad_norm": 3.756276845932007,
      "learning_rate": 5.1031144410864384e-05,
      "loss": 0.4523,
      "step": 261
    },
    {
      "epoch": 2.1129032258064515,
      "grad_norm": 6.373031139373779,
      "learning_rate": 5.0257131727006016e-05,
      "loss": 0.5572,
      "step": 262
    },
    {
      "epoch": 2.120967741935484,
      "grad_norm": 4.426804065704346,
      "learning_rate": 4.9487058880488656e-05,
      "loss": 0.5488,
      "step": 263
    },
    {
      "epoch": 2.129032258064516,
      "grad_norm": 3.5255842208862305,
      "learning_rate": 4.87209868642117e-05,
      "loss": 0.5899,
      "step": 264
    },
    {
      "epoch": 2.1370967741935485,
      "grad_norm": 4.310513973236084,
      "learning_rate": 4.795897635419235e-05,
      "loss": 0.5834,
      "step": 265
    },
    {
      "epoch": 2.1451612903225805,
      "grad_norm": 7.741870403289795,
      "learning_rate": 4.720108770476024e-05,
      "loss": 0.3979,
      "step": 266
    },
    {
      "epoch": 2.153225806451613,
      "grad_norm": 4.049933433532715,
      "learning_rate": 4.6447380943776575e-05,
      "loss": 0.479,
      "step": 267
    },
    {
      "epoch": 2.161290322580645,
      "grad_norm": 5.062816143035889,
      "learning_rate": 4.56979157678803e-05,
      "loss": 0.4285,
      "step": 268
    },
    {
      "epoch": 2.1693548387096775,
      "grad_norm": 5.887932777404785,
      "learning_rate": 4.495275153775937e-05,
      "loss": 0.5338,
      "step": 269
    },
    {
      "epoch": 2.1774193548387095,
      "grad_norm": 3.471012592315674,
      "learning_rate": 4.4211947273449494e-05,
      "loss": 0.4488,
      "step": 270
    },
    {
      "epoch": 2.185483870967742,
      "grad_norm": 5.384176731109619,
      "learning_rate": 4.347556164965934e-05,
      "loss": 0.5661,
      "step": 271
    },
    {
      "epoch": 2.193548387096774,
      "grad_norm": 3.071850061416626,
      "learning_rate": 4.274365299112323e-05,
      "loss": 0.5275,
      "step": 272
    },
    {
      "epoch": 2.2016129032258065,
      "grad_norm": 4.12220573425293,
      "learning_rate": 4.2016279267981664e-05,
      "loss": 0.5014,
      "step": 273
    },
    {
      "epoch": 2.2096774193548385,
      "grad_norm": 4.061901569366455,
      "learning_rate": 4.129349809118981e-05,
      "loss": 0.575,
      "step": 274
    },
    {
      "epoch": 2.217741935483871,
      "grad_norm": 4.510647296905518,
      "learning_rate": 4.057536670795459e-05,
      "loss": 0.578,
      "step": 275
    },
    {
      "epoch": 2.225806451612903,
      "grad_norm": 4.456209659576416,
      "learning_rate": 3.9861941997200245e-05,
      "loss": 0.4577,
      "step": 276
    },
    {
      "epoch": 2.2338709677419355,
      "grad_norm": 3.253465175628662,
      "learning_rate": 3.915328046506357e-05,
      "loss": 0.4912,
      "step": 277
    },
    {
      "epoch": 2.241935483870968,
      "grad_norm": 4.893517017364502,
      "learning_rate": 3.8449438240418134e-05,
      "loss": 0.5295,
      "step": 278
    },
    {
      "epoch": 2.25,
      "grad_norm": 5.378866195678711,
      "learning_rate": 3.775047107042883e-05,
      "loss": 0.5919,
      "step": 279
    },
    {
      "epoch": 2.258064516129032,
      "grad_norm": 4.689233303070068,
      "learning_rate": 3.705643431613634e-05,
      "loss": 0.446,
      "step": 280
    },
    {
      "epoch": 2.2661290322580645,
      "grad_norm": 4.477736473083496,
      "learning_rate": 3.636738294807245e-05,
      "loss": 0.5107,
      "step": 281
    },
    {
      "epoch": 2.274193548387097,
      "grad_norm": 5.829265594482422,
      "learning_rate": 3.568337154190614e-05,
      "loss": 0.4562,
      "step": 282
    },
    {
      "epoch": 2.282258064516129,
      "grad_norm": 3.0017786026000977,
      "learning_rate": 3.500445427412077e-05,
      "loss": 0.43,
      "step": 283
    },
    {
      "epoch": 2.2903225806451615,
      "grad_norm": 3.627903461456299,
      "learning_rate": 3.433068491772341e-05,
      "loss": 0.3979,
      "step": 284
    },
    {
      "epoch": 2.2983870967741935,
      "grad_norm": 4.549633026123047,
      "learning_rate": 3.366211683798549e-05,
      "loss": 0.4884,
      "step": 285
    },
    {
      "epoch": 2.306451612903226,
      "grad_norm": 4.661441326141357,
      "learning_rate": 3.299880298821625e-05,
      "loss": 0.5318,
      "step": 286
    },
    {
      "epoch": 2.314516129032258,
      "grad_norm": 3.091770887374878,
      "learning_rate": 3.23407959055685e-05,
      "loss": 0.4053,
      "step": 287
    },
    {
      "epoch": 2.3225806451612905,
      "grad_norm": 2.9879705905914307,
      "learning_rate": 3.1688147706877666e-05,
      "loss": 0.4134,
      "step": 288
    },
    {
      "epoch": 2.3306451612903225,
      "grad_norm": 3.624067544937134,
      "learning_rate": 3.1040910084533614e-05,
      "loss": 0.3921,
      "step": 289
    },
    {
      "epoch": 2.338709677419355,
      "grad_norm": 4.817528247833252,
      "learning_rate": 3.039913430238672e-05,
      "loss": 0.5302,
      "step": 290
    },
    {
      "epoch": 2.346774193548387,
      "grad_norm": 4.517585277557373,
      "learning_rate": 2.9762871191687313e-05,
      "loss": 0.4715,
      "step": 291
    },
    {
      "epoch": 2.3548387096774195,
      "grad_norm": 5.826594829559326,
      "learning_rate": 2.913217114705975e-05,
      "loss": 0.5499,
      "step": 292
    },
    {
      "epoch": 2.3629032258064515,
      "grad_norm": 6.314401626586914,
      "learning_rate": 2.850708412251103e-05,
      "loss": 0.4083,
      "step": 293
    },
    {
      "epoch": 2.370967741935484,
      "grad_norm": 5.107626438140869,
      "learning_rate": 2.7887659627474017e-05,
      "loss": 0.5045,
      "step": 294
    },
    {
      "epoch": 2.379032258064516,
      "grad_norm": 3.4228501319885254,
      "learning_rate": 2.7273946722886366e-05,
      "loss": 0.4933,
      "step": 295
    },
    {
      "epoch": 2.3870967741935485,
      "grad_norm": 5.124197006225586,
      "learning_rate": 2.6665994017304407e-05,
      "loss": 0.4542,
      "step": 296
    },
    {
      "epoch": 2.3951612903225805,
      "grad_norm": 3.9016506671905518,
      "learning_rate": 2.6063849663053475e-05,
      "loss": 0.5037,
      "step": 297
    },
    {
      "epoch": 2.403225806451613,
      "grad_norm": 3.828733205795288,
      "learning_rate": 2.5467561352413648e-05,
      "loss": 0.5173,
      "step": 298
    },
    {
      "epoch": 2.411290322580645,
      "grad_norm": 4.956728458404541,
      "learning_rate": 2.4877176313842753e-05,
      "loss": 0.4014,
      "step": 299
    },
    {
      "epoch": 2.4193548387096775,
      "grad_norm": 5.5596818923950195,
      "learning_rate": 2.4292741308235345e-05,
      "loss": 0.5035,
      "step": 300
    },
    {
      "epoch": 2.4193548387096775,
      "eval_loss": 0.18692229688167572,
      "eval_runtime": 10.7892,
      "eval_samples_per_second": 18.537,
      "eval_steps_per_second": 0.463,
      "step": 300
    },
    {
      "epoch": 2.4274193548387095,
      "grad_norm": 4.4349212646484375,
      "learning_rate": 2.3714302625219243e-05,
      "loss": 0.468,
      "step": 301
    },
    {
      "epoch": 2.435483870967742,
      "grad_norm": 5.491191387176514,
      "learning_rate": 2.3141906079489183e-05,
      "loss": 0.5072,
      "step": 302
    },
    {
      "epoch": 2.443548387096774,
      "grad_norm": 6.340859889984131,
      "learning_rate": 2.2575597007177984e-05,
      "loss": 0.5273,
      "step": 303
    },
    {
      "epoch": 2.4516129032258065,
      "grad_norm": 4.324528217315674,
      "learning_rate": 2.2015420262265863e-05,
      "loss": 0.4456,
      "step": 304
    },
    {
      "epoch": 2.4596774193548385,
      "grad_norm": 3.401756763458252,
      "learning_rate": 2.1461420213027772e-05,
      "loss": 0.4184,
      "step": 305
    },
    {
      "epoch": 2.467741935483871,
      "grad_norm": 5.022490501403809,
      "learning_rate": 2.0913640738519335e-05,
      "loss": 0.473,
      "step": 306
    },
    {
      "epoch": 2.475806451612903,
      "grad_norm": 4.941099166870117,
      "learning_rate": 2.0372125225101234e-05,
      "loss": 0.4454,
      "step": 307
    },
    {
      "epoch": 2.4838709677419355,
      "grad_norm": 5.535488128662109,
      "learning_rate": 1.983691656300314e-05,
      "loss": 0.5843,
      "step": 308
    },
    {
      "epoch": 2.491935483870968,
      "grad_norm": 3.0146915912628174,
      "learning_rate": 1.930805714292634e-05,
      "loss": 0.5081,
      "step": 309
    },
    {
      "epoch": 2.5,
      "grad_norm": 3.5309910774230957,
      "learning_rate": 1.8785588852686376e-05,
      "loss": 0.369,
      "step": 310
    },
    {
      "epoch": 2.508064516129032,
      "grad_norm": 3.969219207763672,
      "learning_rate": 1.8269553073895375e-05,
      "loss": 0.4837,
      "step": 311
    },
    {
      "epoch": 2.5161290322580645,
      "grad_norm": 3.845773935317993,
      "learning_rate": 1.7759990678684335e-05,
      "loss": 0.4391,
      "step": 312
    },
    {
      "epoch": 2.524193548387097,
      "grad_norm": 4.459768295288086,
      "learning_rate": 1.7256942026466072e-05,
      "loss": 0.4718,
      "step": 313
    },
    {
      "epoch": 2.532258064516129,
      "grad_norm": 4.034666061401367,
      "learning_rate": 1.6760446960738364e-05,
      "loss": 0.5073,
      "step": 314
    },
    {
      "epoch": 2.540322580645161,
      "grad_norm": 3.4610517024993896,
      "learning_rate": 1.6270544805928424e-05,
      "loss": 0.4778,
      "step": 315
    },
    {
      "epoch": 2.5483870967741935,
      "grad_norm": 4.8552350997924805,
      "learning_rate": 1.5787274364278004e-05,
      "loss": 0.4314,
      "step": 316
    },
    {
      "epoch": 2.556451612903226,
      "grad_norm": 3.211488723754883,
      "learning_rate": 1.5310673912770312e-05,
      "loss": 0.4073,
      "step": 317
    },
    {
      "epoch": 2.564516129032258,
      "grad_norm": 3.884469509124756,
      "learning_rate": 1.4840781200098152e-05,
      "loss": 0.43,
      "step": 318
    },
    {
      "epoch": 2.5725806451612905,
      "grad_norm": 4.035294055938721,
      "learning_rate": 1.4377633443674233e-05,
      "loss": 0.3948,
      "step": 319
    },
    {
      "epoch": 2.5806451612903225,
      "grad_norm": 4.55308723449707,
      "learning_rate": 1.392126732668323e-05,
      "loss": 0.398,
      "step": 320
    },
    {
      "epoch": 2.588709677419355,
      "grad_norm": 4.714926242828369,
      "learning_rate": 1.3471718995176507e-05,
      "loss": 0.4145,
      "step": 321
    },
    {
      "epoch": 2.596774193548387,
      "grad_norm": 3.8654613494873047,
      "learning_rate": 1.3029024055209015e-05,
      "loss": 0.533,
      "step": 322
    },
    {
      "epoch": 2.6048387096774195,
      "grad_norm": 3.3056280612945557,
      "learning_rate": 1.2593217570019267e-05,
      "loss": 0.3909,
      "step": 323
    },
    {
      "epoch": 2.6129032258064515,
      "grad_norm": 4.276644706726074,
      "learning_rate": 1.2164334057252203e-05,
      "loss": 0.4548,
      "step": 324
    },
    {
      "epoch": 2.620967741935484,
      "grad_norm": 3.661343574523926,
      "learning_rate": 1.174240748622516e-05,
      "loss": 0.4126,
      "step": 325
    },
    {
      "epoch": 2.629032258064516,
      "grad_norm": 4.099079132080078,
      "learning_rate": 1.1327471275237456e-05,
      "loss": 0.5763,
      "step": 326
    },
    {
      "epoch": 2.6370967741935485,
      "grad_norm": 4.0881524085998535,
      "learning_rate": 1.0919558288923426e-05,
      "loss": 0.4333,
      "step": 327
    },
    {
      "epoch": 2.6451612903225805,
      "grad_norm": 3.3521313667297363,
      "learning_rate": 1.0518700835649553e-05,
      "loss": 0.2746,
      "step": 328
    },
    {
      "epoch": 2.653225806451613,
      "grad_norm": 4.324705600738525,
      "learning_rate": 1.0124930664955301e-05,
      "loss": 0.5431,
      "step": 329
    },
    {
      "epoch": 2.661290322580645,
      "grad_norm": 4.372384071350098,
      "learning_rate": 9.73827896503865e-06,
      "loss": 0.4704,
      "step": 330
    },
    {
      "epoch": 2.6693548387096775,
      "grad_norm": 4.0811357498168945,
      "learning_rate": 9.358776360285759e-06,
      "loss": 0.3613,
      "step": 331
    },
    {
      "epoch": 2.6774193548387095,
      "grad_norm": 3.294893503189087,
      "learning_rate": 8.986452908845322e-06,
      "loss": 0.5004,
      "step": 332
    },
    {
      "epoch": 2.685483870967742,
      "grad_norm": 4.411680698394775,
      "learning_rate": 8.621338100247988e-06,
      "loss": 0.4558,
      "step": 333
    },
    {
      "epoch": 2.693548387096774,
      "grad_norm": 4.227653980255127,
      "learning_rate": 8.26346085307057e-06,
      "loss": 0.578,
      "step": 334
    },
    {
      "epoch": 2.7016129032258065,
      "grad_norm": 5.313823699951172,
      "learning_rate": 7.91284951264557e-06,
      "loss": 0.4111,
      "step": 335
    },
    {
      "epoch": 2.709677419354839,
      "grad_norm": 4.425273418426514,
      "learning_rate": 7.569531848816147e-06,
      "loss": 0.5326,
      "step": 336
    },
    {
      "epoch": 2.717741935483871,
      "grad_norm": 4.490087032318115,
      "learning_rate": 7.233535053736706e-06,
      "loss": 0.4831,
      "step": 337
    },
    {
      "epoch": 2.725806451612903,
      "grad_norm": 4.207247257232666,
      "learning_rate": 6.90488573971898e-06,
      "loss": 0.3913,
      "step": 338
    },
    {
      "epoch": 2.7338709677419355,
      "grad_norm": 4.731769561767578,
      "learning_rate": 6.583609937124435e-06,
      "loss": 0.5326,
      "step": 339
    },
    {
      "epoch": 2.741935483870968,
      "grad_norm": 4.942644119262695,
      "learning_rate": 6.269733092302399e-06,
      "loss": 0.448,
      "step": 340
    },
    {
      "epoch": 2.75,
      "grad_norm": 3.1469435691833496,
      "learning_rate": 5.963280065574694e-06,
      "loss": 0.4406,
      "step": 341
    },
    {
      "epoch": 2.758064516129032,
      "grad_norm": 3.505293846130371,
      "learning_rate": 5.664275129266605e-06,
      "loss": 0.434,
      "step": 342
    },
    {
      "epoch": 2.7661290322580645,
      "grad_norm": 3.7461435794830322,
      "learning_rate": 5.372741965784323e-06,
      "loss": 0.5278,
      "step": 343
    },
    {
      "epoch": 2.774193548387097,
      "grad_norm": 4.557045936584473,
      "learning_rate": 5.088703665739336e-06,
      "loss": 0.4197,
      "step": 344
    },
    {
      "epoch": 2.782258064516129,
      "grad_norm": 4.964477062225342,
      "learning_rate": 4.812182726119397e-06,
      "loss": 0.4443,
      "step": 345
    },
    {
      "epoch": 2.790322580645161,
      "grad_norm": 3.138605833053589,
      "learning_rate": 4.543201048506851e-06,
      "loss": 0.4778,
      "step": 346
    },
    {
      "epoch": 2.7983870967741935,
      "grad_norm": 5.386980056762695,
      "learning_rate": 4.2817799373437994e-06,
      "loss": 0.5819,
      "step": 347
    },
    {
      "epoch": 2.806451612903226,
      "grad_norm": 6.033466339111328,
      "learning_rate": 4.027940098244753e-06,
      "loss": 0.48,
      "step": 348
    },
    {
      "epoch": 2.814516129032258,
      "grad_norm": 5.2147111892700195,
      "learning_rate": 3.7817016363566493e-06,
      "loss": 0.3988,
      "step": 349
    },
    {
      "epoch": 2.8225806451612905,
      "grad_norm": 4.05673360824585,
      "learning_rate": 3.54308405476651e-06,
      "loss": 0.5636,
      "step": 350
    },
    {
      "epoch": 2.8306451612903225,
      "grad_norm": 4.368668079376221,
      "learning_rate": 3.312106252956626e-06,
      "loss": 0.4855,
      "step": 351
    },
    {
      "epoch": 2.838709677419355,
      "grad_norm": 4.68013858795166,
      "learning_rate": 3.0887865253076632e-06,
      "loss": 0.3255,
      "step": 352
    },
    {
      "epoch": 2.846774193548387,
      "grad_norm": 4.889599323272705,
      "learning_rate": 2.873142559649722e-06,
      "loss": 0.3917,
      "step": 353
    },
    {
      "epoch": 2.8548387096774195,
      "grad_norm": 4.104279041290283,
      "learning_rate": 2.6651914358613252e-06,
      "loss": 0.4507,
      "step": 354
    },
    {
      "epoch": 2.8629032258064515,
      "grad_norm": 2.5767111778259277,
      "learning_rate": 2.464949624516688e-06,
      "loss": 0.3908,
      "step": 355
    },
    {
      "epoch": 2.870967741935484,
      "grad_norm": 4.618126392364502,
      "learning_rate": 2.272432985581119e-06,
      "loss": 0.5756,
      "step": 356
    },
    {
      "epoch": 2.879032258064516,
      "grad_norm": 5.131494998931885,
      "learning_rate": 2.0876567671548773e-06,
      "loss": 0.5951,
      "step": 357
    },
    {
      "epoch": 2.8870967741935485,
      "grad_norm": 5.027335166931152,
      "learning_rate": 1.910635604265465e-06,
      "loss": 0.3785,
      "step": 358
    },
    {
      "epoch": 2.8951612903225805,
      "grad_norm": 4.581331253051758,
      "learning_rate": 1.7413835177084835e-06,
      "loss": 0.484,
      "step": 359
    },
    {
      "epoch": 2.903225806451613,
      "grad_norm": 5.248763084411621,
      "learning_rate": 1.5799139129370588e-06,
      "loss": 0.4451,
      "step": 360
    },
    {
      "epoch": 2.911290322580645,
      "grad_norm": 3.955052137374878,
      "learning_rate": 1.4262395790001881e-06,
      "loss": 0.4345,
      "step": 361
    },
    {
      "epoch": 2.9193548387096775,
      "grad_norm": 4.28586483001709,
      "learning_rate": 1.2803726875296963e-06,
      "loss": 0.5227,
      "step": 362
    },
    {
      "epoch": 2.9274193548387095,
      "grad_norm": 2.89931321144104,
      "learning_rate": 1.142324791776239e-06,
      "loss": 0.3766,
      "step": 363
    },
    {
      "epoch": 2.935483870967742,
      "grad_norm": 4.354786396026611,
      "learning_rate": 1.01210682569427e-06,
      "loss": 0.3991,
      "step": 364
    },
    {
      "epoch": 2.943548387096774,
      "grad_norm": 3.9716200828552246,
      "learning_rate": 8.897291030759314e-07,
      "loss": 0.4012,
      "step": 365
    },
    {
      "epoch": 2.9516129032258065,
      "grad_norm": 5.057220935821533,
      "learning_rate": 7.752013167342531e-07,
      "loss": 0.4659,
      "step": 366
    },
    {
      "epoch": 2.959677419354839,
      "grad_norm": 3.645305633544922,
      "learning_rate": 6.68532537735389e-07,
      "loss": 0.41,
      "step": 367
    },
    {
      "epoch": 2.967741935483871,
      "grad_norm": 6.602363586425781,
      "learning_rate": 5.697312146801915e-07,
      "loss": 0.5342,
      "step": 368
    },
    {
      "epoch": 2.975806451612903,
      "grad_norm": 5.502582550048828,
      "learning_rate": 4.788051730349907e-07,
      "loss": 0.4802,
      "step": 369
    },
    {
      "epoch": 2.9838709677419355,
      "grad_norm": 3.0015265941619873,
      "learning_rate": 3.9576161451186923e-07,
      "loss": 0.2839,
      "step": 370
    },
    {
      "epoch": 2.991935483870968,
      "grad_norm": 3.8008387088775635,
      "learning_rate": 3.2060711649817277e-07,
      "loss": 0.5152,
      "step": 371
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.2317986488342285,
      "learning_rate": 2.5334763153559424e-07,
      "loss": 0.4034,
      "step": 372
    }
  ],
  "logging_steps": 1,
  "max_steps": 372,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6747080732408545e+18,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}