|
{ |
|
"best_metric": 0.34852299094200134, |
|
"best_model_checkpoint": "./convnext-base-3e-4/checkpoint-10990", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 10990, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 17.357587814331055, |
|
"learning_rate": 0.00029993871755982685, |
|
"loss": 2.1854, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 28.103832244873047, |
|
"learning_rate": 0.0002997549203131404, |
|
"loss": 1.4323, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 15.243412017822266, |
|
"learning_rate": 0.0002994487584405243, |
|
"loss": 1.3262, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 13.571104049682617, |
|
"learning_rate": 0.00029902048210660057, |
|
"loss": 1.164, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 11.755253791809082, |
|
"learning_rate": 0.00029847044125561983, |
|
"loss": 1.1175, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 8.938959121704102, |
|
"learning_rate": 0.00029779908532552276, |
|
"loss": 1.0117, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 8.597779273986816, |
|
"learning_rate": 0.00029700696288070426, |
|
"loss": 1.0719, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 12.803329467773438, |
|
"learning_rate": 0.0002960947211637822, |
|
"loss": 1.0533, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 12.126448631286621, |
|
"learning_rate": 0.00029506310556673567, |
|
"loss": 0.9138, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 8.313648223876953, |
|
"learning_rate": 0.0002939129590218462, |
|
"loss": 0.8947, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7884691848906561, |
|
"eval_loss": 0.7363528609275818, |
|
"eval_runtime": 103.2916, |
|
"eval_samples_per_second": 24.349, |
|
"eval_steps_per_second": 1.53, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 8.587052345275879, |
|
"learning_rate": 0.00029264522131293815, |
|
"loss": 0.9902, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 8.60452938079834, |
|
"learning_rate": 0.00029126092830748215, |
|
"loss": 0.8517, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 14.598617553710938, |
|
"learning_rate": 0.00028976121111018877, |
|
"loss": 0.802, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 7.155284881591797, |
|
"learning_rate": 0.00028814729513878363, |
|
"loss": 0.7962, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 13.24478816986084, |
|
"learning_rate": 0.00028642049912271946, |
|
"loss": 0.7782, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 10.02603530883789, |
|
"learning_rate": 0.0002845822340256436, |
|
"loss": 0.7813, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 10.052382469177246, |
|
"learning_rate": 0.00028263400189250057, |
|
"loss": 0.8079, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 8.431123733520508, |
|
"learning_rate": 0.0002805773946222121, |
|
"loss": 0.8041, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 7.3062944412231445, |
|
"learning_rate": 0.00027841409266693835, |
|
"loss": 0.8019, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 7.146261215209961, |
|
"learning_rate": 0.0002761458636589813, |
|
"loss": 0.679, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 12.160822868347168, |
|
"learning_rate": 0.0002737745609664539, |
|
"loss": 0.7643, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8170974155069582, |
|
"eval_loss": 0.628582775592804, |
|
"eval_runtime": 103.2407, |
|
"eval_samples_per_second": 24.361, |
|
"eval_steps_per_second": 1.53, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 14.596354484558105, |
|
"learning_rate": 0.00027130212217889483, |
|
"loss": 0.7511, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 11.497203826904297, |
|
"learning_rate": 0.000268730567524065, |
|
"loss": 0.6527, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 6.976894855499268, |
|
"learning_rate": 0.00026606199821722166, |
|
"loss": 0.6289, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 9.718855857849121, |
|
"learning_rate": 0.0002632985947442167, |
|
"loss": 0.6755, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 6.231022357940674, |
|
"learning_rate": 0.00026044261507982355, |
|
"loss": 0.6377, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 15.673833847045898, |
|
"learning_rate": 0.0002574963928427478, |
|
"loss": 0.626, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 5.745795726776123, |
|
"learning_rate": 0.00025446233538882923, |
|
"loss": 0.6276, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 11.555608749389648, |
|
"learning_rate": 0.00025134292184399317, |
|
"loss": 0.695, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 7.4662089347839355, |
|
"learning_rate": 0.00024814070107855875, |
|
"loss": 0.6095, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 7.660247802734375, |
|
"learning_rate": 0.00024485828962455907, |
|
"loss": 0.631, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 9.731380462646484, |
|
"learning_rate": 0.00024149836953777485, |
|
"loss": 0.6036, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8481113320079523, |
|
"eval_loss": 0.5258452892303467, |
|
"eval_runtime": 104.1346, |
|
"eval_samples_per_second": 24.151, |
|
"eval_steps_per_second": 1.517, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 5.469219207763672, |
|
"learning_rate": 0.00023806368620622872, |
|
"loss": 0.5889, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 8.50100040435791, |
|
"learning_rate": 0.0002345570461069312, |
|
"loss": 0.5034, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 12.361306190490723, |
|
"learning_rate": 0.00023098131451271015, |
|
"loss": 0.5181, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 7.584137439727783, |
|
"learning_rate": 0.0002273394131509988, |
|
"loss": 0.5336, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 10.401506423950195, |
|
"learning_rate": 0.0002236343178164948, |
|
"loss": 0.5216, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 5.352778434753418, |
|
"learning_rate": 0.00021986905593964046, |
|
"loss": 0.4939, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 7.9993109703063965, |
|
"learning_rate": 0.0002160467041129117, |
|
"loss": 0.521, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 9.176218032836914, |
|
"learning_rate": 0.00021217038557693726, |
|
"loss": 0.5288, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 9.322657585144043, |
|
"learning_rate": 0.0002082432676685007, |
|
"loss": 0.5168, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 5.861387252807617, |
|
"learning_rate": 0.00020426855923251228, |
|
"loss": 0.5081, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 4.290045261383057, |
|
"learning_rate": 0.00020024950800006462, |
|
"loss": 0.5012, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.869582504970179, |
|
"eval_loss": 0.49109867215156555, |
|
"eval_runtime": 102.6443, |
|
"eval_samples_per_second": 24.502, |
|
"eval_steps_per_second": 1.539, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 4.063934803009033, |
|
"learning_rate": 0.0001961893979347137, |
|
"loss": 0.4754, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 5.163869380950928, |
|
"learning_rate": 0.00019209154654915522, |
|
"loss": 0.4471, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 3.733952522277832, |
|
"learning_rate": 0.0001879593021944875, |
|
"loss": 0.4004, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 7.615368366241455, |
|
"learning_rate": 0.00018379604132427648, |
|
"loss": 0.4076, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 9.600367546081543, |
|
"learning_rate": 0.0001796051657356582, |
|
"loss": 0.4035, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 9.41919231414795, |
|
"learning_rate": 0.0001753900997897331, |
|
"loss": 0.4281, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 13.647310256958008, |
|
"learning_rate": 0.00017115428761352327, |
|
"loss": 0.3674, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 2.1058132648468018, |
|
"learning_rate": 0.00016690119028577906, |
|
"loss": 0.3917, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 4.259520053863525, |
|
"learning_rate": 0.0001626342830089342, |
|
"loss": 0.3899, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 4.13034200668335, |
|
"learning_rate": 0.0001583570522695211, |
|
"loss": 0.4178, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 13.085577011108398, |
|
"learning_rate": 0.00015407299298936486, |
|
"loss": 0.3926, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.8930417495029821, |
|
"eval_loss": 0.38039031624794006, |
|
"eval_runtime": 103.7463, |
|
"eval_samples_per_second": 24.242, |
|
"eval_steps_per_second": 1.523, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 3.522939443588257, |
|
"learning_rate": 0.000149785605669886, |
|
"loss": 0.3671, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 5.17425537109375, |
|
"learning_rate": 0.00014549839353184327, |
|
"loss": 0.3017, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 6.327219009399414, |
|
"learning_rate": 0.00014121485965285484, |
|
"loss": 0.2922, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 0.7092263102531433, |
|
"learning_rate": 0.00013693850410503614, |
|
"loss": 0.314, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 0.38373059034347534, |
|
"learning_rate": 0.0001326728210950942, |
|
"loss": 0.3141, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 5.553852558135986, |
|
"learning_rate": 0.00012842129610921376, |
|
"loss": 0.2821, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 3.678790330886841, |
|
"learning_rate": 0.00012418740306506922, |
|
"loss": 0.3359, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 4.428023338317871, |
|
"learning_rate": 0.00011997460147328983, |
|
"loss": 0.2825, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 3.3043198585510254, |
|
"learning_rate": 0.00011578633361069557, |
|
"loss": 0.3317, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 1.317456603050232, |
|
"learning_rate": 0.0001116260217076161, |
|
"loss": 0.2983, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 8.99087142944336, |
|
"learning_rate": 0.00010749706515158862, |
|
"loss": 0.3348, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.8970178926441352, |
|
"eval_loss": 0.41324833035469055, |
|
"eval_runtime": 103.3001, |
|
"eval_samples_per_second": 24.347, |
|
"eval_steps_per_second": 1.53, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 4.097568988800049, |
|
"learning_rate": 0.00010340283770972167, |
|
"loss": 0.3045, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 10.98578929901123, |
|
"learning_rate": 9.93466847719919e-05, |
|
"loss": 0.2327, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 13.919866561889648, |
|
"learning_rate": 9.533192061772917e-05, |
|
"loss": 0.2696, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 6.072042942047119, |
|
"learning_rate": 9.136182570752152e-05, |
|
"loss": 0.2258, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.17451171576976776, |
|
"learning_rate": 8.743964400275302e-05, |
|
"loss": 0.2406, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.33122172951698303, |
|
"learning_rate": 8.356858031496595e-05, |
|
"loss": 0.2505, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 1.151172161102295, |
|
"learning_rate": 7.975179768721186e-05, |
|
"loss": 0.1903, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 0.7031873464584351, |
|
"learning_rate": 7.59924148095311e-05, |
|
"loss": 0.2085, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 6.131903171539307, |
|
"learning_rate": 7.229350347067424e-05, |
|
"loss": 0.2471, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 6.110349178314209, |
|
"learning_rate": 6.865808604814564e-05, |
|
"loss": 0.2085, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 4.413149833679199, |
|
"learning_rate": 6.508913303862143e-05, |
|
"loss": 0.2594, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9153081510934393, |
|
"eval_loss": 0.3626956641674042, |
|
"eval_runtime": 103.1814, |
|
"eval_samples_per_second": 24.375, |
|
"eval_steps_per_second": 1.531, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 3.3153018951416016, |
|
"learning_rate": 6.158956063075865e-05, |
|
"loss": 0.1743, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 2.5595600605010986, |
|
"learning_rate": 5.816222832238015e-05, |
|
"loss": 0.1699, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 3.9605636596679688, |
|
"learning_rate": 5.4809936583981286e-05, |
|
"loss": 0.2036, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 0.7597993612289429, |
|
"learning_rate": 5.1535424570467366e-05, |
|
"loss": 0.1829, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 5.694293022155762, |
|
"learning_rate": 4.834136788299248e-05, |
|
"loss": 0.2039, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 0.5163713097572327, |
|
"learning_rate": 4.523037638272821e-05, |
|
"loss": 0.1764, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 4.396867275238037, |
|
"learning_rate": 4.220499205834782e-05, |
|
"loss": 0.1862, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.054451316595077515, |
|
"learning_rate": 3.926768694896931e-05, |
|
"loss": 0.1773, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 0.23744052648544312, |
|
"learning_rate": 3.64208611242546e-05, |
|
"loss": 0.1648, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 3.540268659591675, |
|
"learning_rate": 3.366684072331414e-05, |
|
"loss": 0.1541, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 0.33744722604751587, |
|
"learning_rate": 3.100787605402072e-05, |
|
"loss": 0.1751, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9308151093439364, |
|
"eval_loss": 0.3506681025028229, |
|
"eval_runtime": 103.8384, |
|
"eval_samples_per_second": 24.22, |
|
"eval_steps_per_second": 1.522, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.13831892609596252, |
|
"learning_rate": 2.844613975428448e-05, |
|
"loss": 0.1472, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 2.3098576068878174, |
|
"learning_rate": 2.5983725016792572e-05, |
|
"loss": 0.1772, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 0.10428429394960403, |
|
"learning_rate": 2.3622643878662696e-05, |
|
"loss": 0.1524, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 9.646160125732422, |
|
"learning_rate": 2.1364825577409422e-05, |
|
"loss": 0.1023, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.3879956305027008, |
|
"learning_rate": 1.9212114974565664e-05, |
|
"loss": 0.1421, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.022449787706136703, |
|
"learning_rate": 1.7166271048247792e-05, |
|
"loss": 0.1101, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 4.682805061340332, |
|
"learning_rate": 1.5228965455896053e-05, |
|
"loss": 0.1355, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 4.212618350982666, |
|
"learning_rate": 1.3401781168364589e-05, |
|
"loss": 0.1465, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.17709462344646454, |
|
"learning_rate": 1.1686211176477206e-05, |
|
"loss": 0.1375, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 7.981707572937012, |
|
"learning_rate": 1.0083657271105799e-05, |
|
"loss": 0.1498, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 7.477297306060791, |
|
"learning_rate": 8.59542889776807e-06, |
|
"loss": 0.1613, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9300198807157057, |
|
"eval_loss": 0.34880414605140686, |
|
"eval_runtime": 103.3381, |
|
"eval_samples_per_second": 24.338, |
|
"eval_steps_per_second": 1.529, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 1.4414594173431396, |
|
"learning_rate": 7.222742086680755e-06, |
|
"loss": 0.1335, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 4.2091474533081055, |
|
"learning_rate": 5.966718459142195e-06, |
|
"loss": 0.1066, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 18.196033477783203, |
|
"learning_rate": 4.828384311056549e-06, |
|
"loss": 0.125, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 2.323212146759033, |
|
"learning_rate": 3.8086697743481664e-06, |
|
"loss": 0.1239, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 0.2004556953907013, |
|
"learning_rate": 2.9084080569515775e-06, |
|
"loss": 0.1076, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 12.655696868896484, |
|
"learning_rate": 2.128334761997924e-06, |
|
"loss": 0.1054, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 0.03721316158771515, |
|
"learning_rate": 1.469087286754289e-06, |
|
"loss": 0.125, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 3.7458605766296387, |
|
"learning_rate": 9.31204301806776e-07, |
|
"loss": 0.1161, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 0.63627690076828, |
|
"learning_rate": 5.151253109133391e-07, |
|
"loss": 0.1342, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 7.162803649902344, |
|
"learning_rate": 2.211902918855313e-07, |
|
"loss": 0.1365, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 1.5799212455749512, |
|
"learning_rate": 4.9639418792951634e-08, |
|
"loss": 0.1102, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9280318091451292, |
|
"eval_loss": 0.34852299094200134, |
|
"eval_runtime": 103.4235, |
|
"eval_samples_per_second": 24.317, |
|
"eval_steps_per_second": 1.528, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 10990, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_loss": 0.45005689912540897, |
|
"train_runtime": 16363.7477, |
|
"train_samples_per_second": 10.744, |
|
"train_steps_per_second": 0.672 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|