diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2861 +1,1613 @@ { - "best_metric": 0.10869565217391304, - "best_model_checkpoint": "vit-base-patch16-224-ve-U13b-80RX1\\checkpoint-103", - "epoch": 40.0, + "best_metric": 0.8043478260869565, + "best_model_checkpoint": "vit-base-patch16-224-ve-U13b-80RX1\\checkpoint-360", + "epoch": 39.61165048543689, "eval_steps": 500, - "global_step": 4120, + "global_step": 2040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.1, - "learning_rate": 2.6699029126213593e-06, - "loss": 2.0319380055747437e+25, + "epoch": 0.19, + "learning_rate": 5.392156862745098e-06, + "loss": 6.687978618025639e+28, "step": 10 }, { - "epoch": 0.19, - "learning_rate": 5.3398058252427185e-06, - "loss": 2.1407918734188223e+25, + "epoch": 0.39, + "learning_rate": 1.0784313725490196e-05, + "loss": 5.796248488225585e+28, "step": 20 }, { - "epoch": 0.29, - "learning_rate": 8.009708737864077e-06, - "loss": 2.0319380055747437e+25, + "epoch": 0.58, + "learning_rate": 1.6176470588235296e-05, + "loss": 6.539356300076766e+28, "step": 30 }, { - "epoch": 0.39, - "learning_rate": 1.0679611650485437e-05, - "loss": 2.3584994246395388e+25, + "epoch": 0.78, + "learning_rate": 2.156862745098039e-05, + "loss": 5.053138787427811e+28, "step": 40 }, { - "epoch": 0.49, - "learning_rate": 1.3349514563106797e-05, - "loss": 1.9956536289166384e+25, + "epoch": 0.97, + "learning_rate": 2.696078431372549e-05, + "loss": 6.242113175336294e+28, "step": 50 }, { - "epoch": 0.58, - "learning_rate": 1.6019417475728155e-05, - "loss": 2.213361180137355e+25, + "epoch": 0.99, + "eval_accuracy": 0.3695652173913043, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8432, + "eval_samples_per_second": 54.555, + "eval_steps_per_second": 7.116, + "step": 51 + }, + { + "epoch": 1.17, + "learning_rate": 3.235294117647059e-05, + "loss": 5.300842021027069e+28, "step": 60 }, { - "epoch": 0.68, - "learning_rate": 1.8689320388349518e-05, - "loss": 2.213360995669914e+25, + "epoch": 1.36, + "learning_rate": 3.774509803921569e-05, + "loss": 6.093491612966058e+28, "step": 70 }, { - "epoch": 0.78, - "learning_rate": 2.1359223300970874e-05, - "loss": 2.3584996091069793e+25, + "epoch": 1.55, + "learning_rate": 4.313725490196078e-05, + "loss": 4.90451835842553e+28, "step": 80 }, { - "epoch": 0.87, - "learning_rate": 2.4029126213592234e-05, - "loss": 2.1407916889513814e+25, + "epoch": 1.75, + "learning_rate": 4.8529411764705885e-05, + "loss": 7.133844060714986e+28, "step": 90 }, { - "epoch": 0.97, - "learning_rate": 2.6699029126213593e-05, - "loss": 2.1407918734188223e+25, + "epoch": 1.94, + "learning_rate": 5.392156862745098e-05, + "loss": 6.836600935974513e+28, "step": 100 }, { - "epoch": 1.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7942, - "eval_samples_per_second": 57.921, - "eval_steps_per_second": 7.555, + "epoch": 2.0, + "eval_accuracy": 0.5434782608695652, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7792, + "eval_samples_per_second": 59.035, + "eval_steps_per_second": 7.7, "step": 103 }, { - "epoch": 1.07, - "learning_rate": 2.9368932038834953e-05, - "loss": 2.2254559723567234e+25, + "epoch": 2.14, + "learning_rate": 5.477296181630547e-05, + "loss": 5.152220836446151e+28, "step": 110 }, { - "epoch": 1.17, - "learning_rate": 3.203883495145631e-05, - "loss": 2.2859304868558873e+25, + "epoch": 2.33, + "learning_rate": 5.448916408668731e-05, + "loss": 6.985221742766112e+28, "step": 120 }, { - "epoch": 1.26, - "learning_rate": 3.470873786407767e-05, - "loss": 2.104507312293276e+25, + "epoch": 2.52, + "learning_rate": 5.4205366357069146e-05, + "loss": 6.3907354932851674e+28, "step": 130 }, { - "epoch": 1.36, - "learning_rate": 3.7378640776699036e-05, - "loss": 2.213360995669914e+25, + "epoch": 2.72, + "learning_rate": 5.392156862745098e-05, + "loss": 5.350383045536239e+28, "step": 140 }, { - "epoch": 1.46, - "learning_rate": 4.004854368932039e-05, - "loss": 2.3584996091069793e+25, + "epoch": 2.91, + "learning_rate": 5.363777089783282e-05, + "loss": 5.64762654806603e+28, "step": 150 }, { - "epoch": 1.55, - "learning_rate": 4.271844660194175e-05, - "loss": 2.2496457412629006e+25, + "epoch": 2.99, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8532, + "eval_samples_per_second": 53.915, + "eval_steps_per_second": 7.032, + "step": 154 + }, + { + "epoch": 3.11, + "learning_rate": 5.3353973168214655e-05, + "loss": 5.944870050595821e+28, "step": 160 }, { - "epoch": 1.65, - "learning_rate": 4.5388349514563104e-05, - "loss": 2.177076434544368e+25, + "epoch": 3.3, + "learning_rate": 5.3070175438596496e-05, + "loss": 6.985223253923386e+28, "step": 170 }, { - "epoch": 1.75, - "learning_rate": 4.805825242718447e-05, - "loss": 1.8867993921376783e+25, + "epoch": 3.5, + "learning_rate": 5.278637770897833e-05, + "loss": 6.242113175336294e+28, "step": 180 }, { - "epoch": 1.84, - "learning_rate": 5.072815533980583e-05, - "loss": 2.213361180137355e+25, + "epoch": 3.69, + "learning_rate": 5.2502579979360165e-05, + "loss": 5.499004230117157e+28, "step": 190 }, { - "epoch": 1.94, - "learning_rate": 5.339805825242719e-05, - "loss": 1.923084137730665e+25, + "epoch": 3.88, + "learning_rate": 5.2218782249742006e-05, + "loss": 5.944870050595821e+28, "step": 200 }, { - "epoch": 2.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7862, - "eval_samples_per_second": 58.511, - "eval_steps_per_second": 7.632, + "epoch": 4.0, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8142, + "eval_samples_per_second": 56.498, + "eval_steps_per_second": 7.369, "step": 206 }, { - "epoch": 2.04, - "learning_rate": 5.4943791517629026e-05, - "loss": 2.298025279075256e+25, + "epoch": 4.08, + "learning_rate": 5.193498452012384e-05, + "loss": 5.746707463716415e+28, "step": 210 }, { - "epoch": 2.14, - "learning_rate": 5.480327031170158e-05, - "loss": 2.2496457412629006e+25, + "epoch": 4.27, + "learning_rate": 5.165118679050568e-05, + "loss": 6.3907354932851674e+28, "step": 220 }, { - "epoch": 2.23, - "learning_rate": 5.466274910577415e-05, - "loss": 1.9956534444491974e+25, + "epoch": 4.47, + "learning_rate": 5.1367389060887515e-05, + "loss": 6.539355544498129e+28, "step": 230 }, { - "epoch": 2.33, - "learning_rate": 5.452222789984671e-05, - "loss": 2.104507312293276e+25, + "epoch": 4.66, + "learning_rate": 5.108359133126935e-05, + "loss": 6.3907354932851674e+28, "step": 240 }, { - "epoch": 2.43, - "learning_rate": 5.438170669391927e-05, - "loss": 2.213361180137355e+25, + "epoch": 4.85, + "learning_rate": 5.079979360165119e-05, + "loss": 5.796248110436267e+28, "step": 250 }, { - "epoch": 2.52, - "learning_rate": 5.4241185487991826e-05, - "loss": 2.0319380055747437e+25, + "epoch": 4.99, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8212, + "eval_samples_per_second": 56.017, + "eval_steps_per_second": 7.307, + "step": 257 + }, + { + "epoch": 5.05, + "learning_rate": 5.0515995872033025e-05, + "loss": 4.161409035417075e+28, "step": 260 }, { - "epoch": 2.62, - "learning_rate": 5.410066428206439e-05, - "loss": 2.1045071278258356e+25, + "epoch": 5.24, + "learning_rate": 5.023219814241486e-05, + "loss": 6.539357811234041e+28, "step": 270 }, { - "epoch": 2.72, - "learning_rate": 5.396014307613695e-05, - "loss": 2.3947839857650846e+25, + "epoch": 5.44, + "learning_rate": 4.99484004127967e-05, + "loss": 6.836600935974513e+28, "step": 280 }, { - "epoch": 2.82, - "learning_rate": 5.381962187020951e-05, - "loss": 2.1407918734188223e+25, + "epoch": 5.63, + "learning_rate": 4.9664602683178534e-05, + "loss": 6.539356300076766e+28, "step": 290 }, { - "epoch": 2.91, - "learning_rate": 5.3679100664282064e-05, - "loss": 2.285930117921006e+25, + "epoch": 5.83, + "learning_rate": 4.9380804953560375e-05, + "loss": 5.647626925855348e+28, "step": 300 }, { - "epoch": 3.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8107, - "eval_samples_per_second": 56.742, - "eval_steps_per_second": 7.401, + "epoch": 6.0, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7827, + "eval_samples_per_second": 58.772, + "eval_steps_per_second": 7.666, "step": 309 }, { - "epoch": 3.01, - "learning_rate": 5.3538579458354626e-05, - "loss": 2.1407916889513814e+25, + "epoch": 6.02, + "learning_rate": 4.909700722394221e-05, + "loss": 4.5577334535972505e+28, "step": 310 }, { - "epoch": 3.11, - "learning_rate": 5.339805825242719e-05, - "loss": 2.2133613646047957e+25, + "epoch": 6.21, + "learning_rate": 4.8813209494324044e-05, + "loss": 5.647626170276711e+28, "step": 320 }, { - "epoch": 3.2, - "learning_rate": 5.325753704649975e-05, - "loss": 1.9956536289166384e+25, + "epoch": 6.41, + "learning_rate": 4.8529411764705885e-05, + "loss": 5.944870050595821e+28, "step": 330 }, { - "epoch": 3.3, - "learning_rate": 5.311701584057231e-05, - "loss": 2.1045071278258356e+25, + "epoch": 6.6, + "learning_rate": 4.824561403508772e-05, + "loss": 6.687978618025639e+28, "step": 340 }, { - "epoch": 3.4, - "learning_rate": 5.2976494634644864e-05, - "loss": 2.177076619011809e+25, + "epoch": 6.8, + "learning_rate": 4.796181630546956e-05, + "loss": 5.944870050595821e+28, "step": 350 }, { - "epoch": 3.5, - "learning_rate": 5.2835973428717425e-05, - "loss": 2.1045071278258356e+25, + "epoch": 6.99, + "learning_rate": 4.7678018575851394e-05, + "loss": 6.093491612966058e+28, + "step": 360 + }, + { + "epoch": 6.99, + "eval_accuracy": 0.8043478260869565, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7872, + "eval_samples_per_second": 58.435, + "eval_steps_per_second": 7.622, "step": 360 }, { - "epoch": 3.59, - "learning_rate": 5.2695452222789986e-05, - "loss": 2.1045071278258356e+25, + "epoch": 7.18, + "learning_rate": 4.739422084623323e-05, + "loss": 6.48981678672487e+28, "step": 370 }, { - "epoch": 3.69, - "learning_rate": 5.255493101686255e-05, - "loss": 2.3584996091069793e+25, + "epoch": 7.38, + "learning_rate": 4.711042311661507e-05, + "loss": 5.053139920795767e+28, "step": 380 }, { - "epoch": 3.79, - "learning_rate": 5.24144098109351e-05, - "loss": 2.285930302388447e+25, + "epoch": 7.57, + "learning_rate": 4.6826625386996904e-05, + "loss": 6.093492368544695e+28, "step": 390 }, { - "epoch": 3.88, - "learning_rate": 5.2273888605007663e-05, - "loss": 2.0682227511677303e+25, + "epoch": 7.77, + "learning_rate": 4.6542827657378745e-05, + "loss": 5.64762654806603e+28, "step": 400 }, { - "epoch": 3.98, - "learning_rate": 5.2133367399080225e-05, - "loss": 2.358499240172098e+25, + "epoch": 7.96, + "learning_rate": 4.625902992776058e-05, + "loss": 6.985221742766112e+28, "step": 410 }, { - "epoch": 4.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7522, - "eval_samples_per_second": 61.156, - "eval_steps_per_second": 7.977, + "epoch": 8.0, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8117, + "eval_samples_per_second": 56.672, + "eval_steps_per_second": 7.392, "step": 412 }, { - "epoch": 4.08, - "learning_rate": 5.1992846193152786e-05, - "loss": 2.044032797794112e+25, + "epoch": 8.16, + "learning_rate": 4.597523219814241e-05, + "loss": 6.3907347377065295e+28, "step": 420 }, { - "epoch": 4.17, - "learning_rate": 5.185232498722535e-05, - "loss": 2.031938190042184e+25, + "epoch": 8.35, + "learning_rate": 4.5691434468524254e-05, + "loss": 8.025574946093677e+28, "step": 430 }, { - "epoch": 4.27, - "learning_rate": 5.17118037812979e-05, - "loss": 2.213360995669914e+25, + "epoch": 8.54, + "learning_rate": 4.540763673890609e-05, + "loss": 4.0127867174682015e+28, "step": 440 }, { - "epoch": 4.37, - "learning_rate": 5.157128257537046e-05, - "loss": 1.9956534444491974e+25, + "epoch": 8.74, + "learning_rate": 4.512383900928793e-05, + "loss": 5.499005363485113e+28, "step": 450 }, { - "epoch": 4.47, - "learning_rate": 5.1430761369443024e-05, - "loss": 2.213361180137355e+25, + "epoch": 8.93, + "learning_rate": 4.4840041279669764e-05, + "loss": 5.94487042838514e+28, "step": 460 }, { - "epoch": 4.56, - "learning_rate": 5.129024016351559e-05, - "loss": 2.068222382232849e+25, + "epoch": 8.99, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7912, + "eval_samples_per_second": 58.141, + "eval_steps_per_second": 7.584, + "step": 463 + }, + { + "epoch": 9.13, + "learning_rate": 4.45562435500516e-05, + "loss": 5.300842776605706e+28, "step": 470 }, { - "epoch": 4.66, - "learning_rate": 5.114971895758815e-05, - "loss": 2.177076619011809e+25, + "epoch": 9.32, + "learning_rate": 4.427244582043344e-05, + "loss": 5.647626925855348e+28, "step": 480 }, { - "epoch": 4.76, - "learning_rate": 5.100919775166071e-05, - "loss": 2.0682225667002894e+25, + "epoch": 9.51, + "learning_rate": 4.398864809081527e-05, + "loss": 6.539357811234041e+28, "step": 490 }, { - "epoch": 4.85, - "learning_rate": 5.086867654573327e-05, - "loss": 2.3222148635139926e+25, + "epoch": 9.71, + "learning_rate": 4.3704850361197114e-05, + "loss": 5.053139920795767e+28, "step": 500 }, { - "epoch": 4.95, - "learning_rate": 5.072815533980583e-05, - "loss": 2.4310687313580712e+25, + "epoch": 9.9, + "learning_rate": 4.342105263157895e-05, + "loss": 8.174196508463913e+28, "step": 510 }, { - "epoch": 5.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7562, - "eval_samples_per_second": 60.833, - "eval_steps_per_second": 7.935, + "epoch": 10.0, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8382, + "eval_samples_per_second": 54.88, + "eval_steps_per_second": 7.158, "step": 515 }, { - "epoch": 5.05, - "learning_rate": 5.058763413387839e-05, - "loss": 2.3584996091069793e+25, + "epoch": 10.1, + "learning_rate": 4.313725490196078e-05, + "loss": 5.449463961186624e+28, "step": 520 }, { - "epoch": 5.15, - "learning_rate": 5.0447112927950946e-05, - "loss": 2.0682227511677303e+25, + "epoch": 10.29, + "learning_rate": 4.2853457172342624e-05, + "loss": 5.944870050595821e+28, "step": 530 }, { - "epoch": 5.24, - "learning_rate": 5.030659172202351e-05, - "loss": 2.1407918734188223e+25, + "epoch": 10.49, + "learning_rate": 4.2569659442724465e-05, + "loss": 6.687978618025639e+28, "step": 540 }, { - "epoch": 5.34, - "learning_rate": 5.016607051609607e-05, - "loss": 1.9593688833236517e+25, + "epoch": 10.68, + "learning_rate": 4.22858617131063e-05, + "loss": 6.3907347377065295e+28, "step": 550 }, { - "epoch": 5.44, - "learning_rate": 5.002554931016863e-05, - "loss": 2.177076619011809e+25, + "epoch": 10.87, + "learning_rate": 4.200206398348813e-05, + "loss": 5.796248488225585e+28, "step": 560 }, { - "epoch": 5.53, - "learning_rate": 4.9885028104241185e-05, - "loss": 1.9956534444491974e+25, + "epoch": 10.99, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7817, + "eval_samples_per_second": 58.847, + "eval_steps_per_second": 7.676, + "step": 566 + }, + { + "epoch": 11.07, + "learning_rate": 4.171826625386997e-05, + "loss": 5.5980859013461785e+28, "step": 570 }, { - "epoch": 5.63, - "learning_rate": 4.9744506898313746e-05, - "loss": 2.2133613646047957e+25, + "epoch": 11.26, + "learning_rate": 4.143446852425181e-05, + "loss": 6.242113553125612e+28, "step": 580 }, { - "epoch": 5.73, - "learning_rate": 4.960398569238631e-05, - "loss": 2.3222148635139926e+25, + "epoch": 11.46, + "learning_rate": 4.115067079463365e-05, + "loss": 7.4310871854554575e+28, "step": 590 }, { - "epoch": 5.83, - "learning_rate": 4.946346448645887e-05, - "loss": 2.104507312293276e+25, + "epoch": 11.65, + "learning_rate": 4.0866873065015484e-05, + "loss": 5.350383045536239e+28, "step": 600 }, { - "epoch": 5.92, - "learning_rate": 4.932294328053143e-05, - "loss": 2.4310687313580712e+25, + "epoch": 11.84, + "learning_rate": 4.058307533539732e-05, + "loss": 5.796247354857629e+28, "step": 610 }, { - "epoch": 6.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7787, - "eval_samples_per_second": 59.074, - "eval_steps_per_second": 7.705, + "epoch": 12.0, + "eval_accuracy": 0.8043478260869565, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7862, + "eval_samples_per_second": 58.51, + "eval_steps_per_second": 7.632, "step": 618 }, { - "epoch": 6.02, - "learning_rate": 4.9182422074603984e-05, - "loss": 2.26174071794971e+25, + "epoch": 12.04, + "learning_rate": 4.029927760577915e-05, + "loss": 4.90451835842553e+28, "step": 620 }, { - "epoch": 6.12, - "learning_rate": 4.9041900868676545e-05, - "loss": 2.0319380055747437e+25, + "epoch": 12.23, + "learning_rate": 4.001547987616099e-05, + "loss": 5.053139920795767e+28, "step": 630 }, { - "epoch": 6.21, - "learning_rate": 4.890137966274911e-05, - "loss": 2.3947841702325255e+25, + "epoch": 12.43, + "learning_rate": 3.9731682146542834e-05, + "loss": 5.64762654806603e+28, "step": 640 }, { - "epoch": 6.31, - "learning_rate": 4.876085845682167e-05, - "loss": 2.1045071278258356e+25, + "epoch": 12.62, + "learning_rate": 3.944788441692467e-05, + "loss": 7.282465623085222e+28, "step": 650 }, { - "epoch": 6.41, - "learning_rate": 4.862033725089422e-05, - "loss": 2.2496459257303415e+25, + "epoch": 12.82, + "learning_rate": 3.91640866873065e-05, + "loss": 5.796248488225585e+28, "step": 660 }, { - "epoch": 6.5, - "learning_rate": 4.8479816044966784e-05, - "loss": 2.031938190042184e+25, + "epoch": 12.99, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7922, + "eval_samples_per_second": 58.067, + "eval_steps_per_second": 7.574, + "step": 669 + }, + { + "epoch": 13.01, + "learning_rate": 3.888028895768834e-05, + "loss": 5.647626170276711e+28, "step": 670 }, { - "epoch": 6.6, - "learning_rate": 4.8339294839039345e-05, - "loss": 2.1407918734188223e+25, + "epoch": 13.2, + "learning_rate": 3.859649122807018e-05, + "loss": 6.687979373604276e+28, "step": 680 }, { - "epoch": 6.7, - "learning_rate": 4.8198773633111906e-05, - "loss": 2.140792057886263e+25, + "epoch": 13.4, + "learning_rate": 3.831269349845202e-05, + "loss": 6.985221742766112e+28, "step": 690 }, { - "epoch": 6.8, - "learning_rate": 4.805825242718447e-05, - "loss": 2.213360995669914e+25, + "epoch": 13.59, + "learning_rate": 3.802889576883385e-05, + "loss": 4.458652537946866e+28, "step": 700 }, { - "epoch": 6.89, - "learning_rate": 4.791773122125703e-05, - "loss": 2.0682225667002894e+25, + "epoch": 13.79, + "learning_rate": 3.774509803921569e-05, + "loss": 6.093492368544695e+28, "step": 710 }, { - "epoch": 6.99, - "learning_rate": 4.777721001532959e-05, - "loss": 2.2496457412629006e+25, + "epoch": 13.98, + "learning_rate": 3.746130030959752e-05, + "loss": 6.093491612966058e+28, "step": 720 }, { - "epoch": 7.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7757, - "eval_samples_per_second": 59.302, - "eval_steps_per_second": 7.735, + "epoch": 14.0, + "eval_accuracy": 0.8043478260869565, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7842, + "eval_samples_per_second": 58.659, + "eval_steps_per_second": 7.651, "step": 721 }, { - "epoch": 7.09, - "learning_rate": 4.763668880940215e-05, - "loss": 2.044032797794112e+25, + "epoch": 14.17, + "learning_rate": 3.7177502579979356e-05, + "loss": 5.449464338975942e+28, "step": 730 }, { - "epoch": 7.18, - "learning_rate": 4.749616760347471e-05, - "loss": 2.177076434544368e+25, + "epoch": 14.37, + "learning_rate": 3.6893704850361204e-05, + "loss": 6.242113175336294e+28, "step": 740 }, { - "epoch": 7.28, - "learning_rate": 4.735564639754727e-05, - "loss": 2.177076434544368e+25, + "epoch": 14.56, + "learning_rate": 3.660990712074304e-05, + "loss": 6.242113930914931e+28, "step": 750 }, { - "epoch": 7.38, - "learning_rate": 4.721512519161983e-05, - "loss": 2.3947841702325255e+25, + "epoch": 14.76, + "learning_rate": 3.632610939112487e-05, + "loss": 5.944870050595821e+28, "step": 760 }, { - "epoch": 7.48, - "learning_rate": 4.707460398569239e-05, - "loss": 2.3947841702325255e+25, + "epoch": 14.95, + "learning_rate": 3.6042311661506706e-05, + "loss": 5.796247732646948e+28, "step": 770 }, { - "epoch": 7.57, - "learning_rate": 4.693408277976495e-05, - "loss": 1.923084137730665e+25, + "epoch": 14.99, + "eval_accuracy": 0.782608695652174, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8192, + "eval_samples_per_second": 56.153, + "eval_steps_per_second": 7.324, + "step": 772 + }, + { + "epoch": 15.15, + "learning_rate": 3.575851393188854e-05, + "loss": 7.183384329645518e+28, "step": 780 }, { - "epoch": 7.67, - "learning_rate": 4.679356157383751e-05, - "loss": 2.3222148635139926e+25, + "epoch": 15.34, + "learning_rate": 3.547471620227039e-05, + "loss": 5.944870806174458e+28, "step": 790 }, { - "epoch": 7.77, - "learning_rate": 4.6653040367910067e-05, - "loss": 2.104507496760717e+25, + "epoch": 15.53, + "learning_rate": 3.519091847265222e-05, + "loss": 6.093491612966058e+28, "step": 800 }, { - "epoch": 7.86, - "learning_rate": 4.651251916198263e-05, - "loss": 1.995653259981757e+25, + "epoch": 15.73, + "learning_rate": 3.490712074303406e-05, + "loss": 5.64762654806603e+28, "step": 810 }, { - "epoch": 7.96, - "learning_rate": 4.637199795605519e-05, - "loss": 2.1045071278258356e+25, + "epoch": 15.92, + "learning_rate": 3.462332301341589e-05, + "loss": 5.0531391652171296e+28, "step": 820 }, { - "epoch": 8.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8062, - "eval_samples_per_second": 57.059, - "eval_steps_per_second": 7.443, + "epoch": 16.0, + "eval_accuracy": 0.782608695652174, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7942, + "eval_samples_per_second": 57.921, + "eval_steps_per_second": 7.555, "step": 824 }, { - "epoch": 8.06, - "learning_rate": 4.623147675012775e-05, - "loss": 2.0319380055747437e+25, + "epoch": 16.12, + "learning_rate": 3.4339525283797725e-05, + "loss": 7.4310871854554575e+28, "step": 830 }, { - "epoch": 8.16, - "learning_rate": 4.6090955544200305e-05, - "loss": 2.213361180137355e+25, + "epoch": 16.31, + "learning_rate": 3.405572755417957e-05, + "loss": 5.64762654806603e+28, "step": 840 }, { - "epoch": 8.25, - "learning_rate": 4.5950434338272866e-05, - "loss": 1.995653259981757e+25, + "epoch": 16.5, + "learning_rate": 3.377192982456141e-05, + "loss": 6.242113930914931e+28, "step": 850 }, { - "epoch": 8.35, - "learning_rate": 4.580991313234543e-05, - "loss": 1.850515015479573e+25, + "epoch": 16.7, + "learning_rate": 3.348813209494324e-05, + "loss": 5.944870050595821e+28, "step": 860 }, { - "epoch": 8.45, - "learning_rate": 4.566939192641799e-05, - "loss": 2.431068915825512e+25, + "epoch": 16.89, + "learning_rate": 3.3204334365325076e-05, + "loss": 5.796248110436267e+28, "step": 870 }, { - "epoch": 8.54, - "learning_rate": 4.552887072049055e-05, - "loss": 2.394784354699966e+25, + "epoch": 16.99, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7732, + "eval_samples_per_second": 59.495, + "eval_steps_per_second": 7.76, + "step": 875 + }, + { + "epoch": 17.09, + "learning_rate": 3.292053663570691e-05, + "loss": 5.300842776605706e+28, "step": 880 }, { - "epoch": 8.64, - "learning_rate": 4.5388349514563104e-05, - "loss": 2.2133613646047957e+25, + "epoch": 17.28, + "learning_rate": 3.263673890608876e-05, + "loss": 5.944870050595821e+28, "step": 890 }, { - "epoch": 8.74, - "learning_rate": 4.5247828308635666e-05, - "loss": 2.24964555679546e+25, + "epoch": 17.48, + "learning_rate": 3.235294117647059e-05, + "loss": 7.4310879410340955e+28, "step": 900 }, { - "epoch": 8.83, - "learning_rate": 4.510730710270823e-05, - "loss": 2.2496457412629006e+25, + "epoch": 17.67, + "learning_rate": 3.2069143446852426e-05, + "loss": 5.499004607906475e+28, "step": 910 }, { - "epoch": 8.93, - "learning_rate": 4.496678589678079e-05, - "loss": 2.1045071278258356e+25, + "epoch": 17.86, + "learning_rate": 3.178534571723426e-05, + "loss": 4.755896040476657e+28, "step": 920 }, { - "epoch": 9.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7902, - "eval_samples_per_second": 58.215, - "eval_steps_per_second": 7.593, + "epoch": 18.0, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8047, + "eval_samples_per_second": 57.165, + "eval_steps_per_second": 7.456, "step": 927 }, { - "epoch": 9.03, - "learning_rate": 4.482626469085334e-05, - "loss": 2.2254559723567234e+25, + "epoch": 18.06, + "learning_rate": 3.1501547987616095e-05, + "loss": 6.935681473835579e+28, "step": 930 }, { - "epoch": 9.13, - "learning_rate": 4.468574348492591e-05, - "loss": 2.285930117921006e+25, + "epoch": 18.25, + "learning_rate": 3.121775025799794e-05, + "loss": 5.94487042838514e+28, "step": 940 }, { - "epoch": 9.22, - "learning_rate": 4.454522227899847e-05, - "loss": 2.177076619011809e+25, + "epoch": 18.45, + "learning_rate": 3.093395252837978e-05, + "loss": 5.796247732646948e+28, "step": 950 }, { - "epoch": 9.32, - "learning_rate": 4.440470107307103e-05, - "loss": 2.2496459257303415e+25, + "epoch": 18.64, + "learning_rate": 3.065015479876161e-05, + "loss": 5.944869672806502e+28, "step": 960 }, { - "epoch": 9.42, - "learning_rate": 4.426417986714359e-05, - "loss": 1.9593688833236517e+25, + "epoch": 18.83, + "learning_rate": 3.0366357069143445e-05, + "loss": 6.836600180395876e+28, "step": 970 }, { - "epoch": 9.51, - "learning_rate": 4.412365866121615e-05, - "loss": 2.24964555679546e+25, + "epoch": 18.99, + "eval_accuracy": 0.782608695652174, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8292, + "eval_samples_per_second": 55.476, + "eval_steps_per_second": 7.236, + "step": 978 + }, + { + "epoch": 19.03, + "learning_rate": 3.0082559339525283e-05, + "loss": 5.746708219295052e+28, "step": 980 }, { - "epoch": 9.61, - "learning_rate": 4.398313745528871e-05, - "loss": 2.285930117921006e+25, + "epoch": 19.22, + "learning_rate": 2.9798761609907124e-05, + "loss": 5.3503834233255575e+28, "step": 990 }, { - "epoch": 9.71, - "learning_rate": 4.384261624936127e-05, - "loss": 2.285930117921006e+25, + "epoch": 19.42, + "learning_rate": 2.9514963880288958e-05, + "loss": 5.64762654806603e+28, "step": 1000 }, { - "epoch": 9.81, - "learning_rate": 4.370209504343383e-05, - "loss": 1.8505148310121323e+25, + "epoch": 19.61, + "learning_rate": 2.9231166150670796e-05, + "loss": 6.687979373604276e+28, "step": 1010 }, { - "epoch": 9.9, - "learning_rate": 4.356157383750639e-05, - "loss": 1.9593688833236517e+25, + "epoch": 19.81, + "learning_rate": 2.894736842105263e-05, + "loss": 6.093491612966058e+28, "step": 1020 }, { - "epoch": 10.0, - "learning_rate": 4.342105263157895e-05, - "loss": 2.3343098402008016e+25, + "epoch": 20.0, + "learning_rate": 2.8663570691434468e-05, + "loss": 6.043951721824843e+28, "step": 1030 }, { - "epoch": 10.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7707, - "eval_samples_per_second": 59.688, - "eval_steps_per_second": 7.785, + "epoch": 20.0, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7862, + "eval_samples_per_second": 58.511, + "eval_steps_per_second": 7.632, "step": 1030 }, { - "epoch": 10.1, - "learning_rate": 4.328053142565151e-05, - "loss": 2.1407918734188223e+25, + "epoch": 20.19, + "learning_rate": 2.837977296181631e-05, + "loss": 5.796248488225585e+28, "step": 1040 }, { - "epoch": 10.19, - "learning_rate": 4.314001021972407e-05, - "loss": 2.4310687313580712e+25, + "epoch": 20.39, + "learning_rate": 2.8095975232198143e-05, + "loss": 6.093491990755376e+28, "step": 1050 }, { - "epoch": 10.29, - "learning_rate": 4.2999489013796626e-05, - "loss": 1.9230843221981057e+25, + "epoch": 20.58, + "learning_rate": 2.781217750257998e-05, + "loss": 5.3503834233255575e+28, "step": 1060 }, { - "epoch": 10.39, - "learning_rate": 4.285896780786919e-05, - "loss": 2.285930117921006e+25, + "epoch": 20.78, + "learning_rate": 2.7528379772961815e-05, + "loss": 5.647626925855348e+28, "step": 1070 }, { - "epoch": 10.49, - "learning_rate": 4.271844660194175e-05, - "loss": 1.886799576605119e+25, + "epoch": 20.97, + "learning_rate": 2.7244582043343656e-05, + "loss": 7.579709503404331e+28, "step": 1080 }, { - "epoch": 10.58, - "learning_rate": 4.257792539601431e-05, - "loss": 2.177076250076927e+25, + "epoch": 20.99, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8627, + "eval_samples_per_second": 53.321, + "eval_steps_per_second": 6.955, + "step": 1081 + }, + { + "epoch": 21.17, + "learning_rate": 2.696078431372549e-05, + "loss": 5.944870050595821e+28, "step": 1090 }, { - "epoch": 10.68, - "learning_rate": 4.243740419008687e-05, - "loss": 2.0682227511677303e+25, + "epoch": 21.36, + "learning_rate": 2.6676986584107328e-05, + "loss": 4.904517602846893e+28, "step": 1100 }, { - "epoch": 10.78, - "learning_rate": 4.2296882984159425e-05, - "loss": 2.1407918734188223e+25, + "epoch": 21.55, + "learning_rate": 2.6393188854489165e-05, + "loss": 6.242113175336294e+28, "step": 1110 }, { - "epoch": 10.87, - "learning_rate": 4.2156361778231986e-05, - "loss": 2.2496457412629006e+25, + "epoch": 21.75, + "learning_rate": 2.6109391124871003e-05, + "loss": 6.242113930914931e+28, "step": 1120 }, { - "epoch": 10.97, - "learning_rate": 4.201584057230455e-05, - "loss": 2.3222148635139926e+25, + "epoch": 21.94, + "learning_rate": 2.582559339525284e-05, + "loss": 5.944870050595821e+28, "step": 1130 }, { - "epoch": 11.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7762, - "eval_samples_per_second": 59.265, - "eval_steps_per_second": 7.73, + "epoch": 22.0, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7952, + "eval_samples_per_second": 57.847, + "eval_steps_per_second": 7.545, "step": 1133 }, { - "epoch": 11.07, - "learning_rate": 4.187531936637711e-05, - "loss": 2.1166021045126447e+25, + "epoch": 22.14, + "learning_rate": 2.5541795665634675e-05, + "loss": 6.687978618025639e+28, "step": 1140 }, { - "epoch": 11.17, - "learning_rate": 4.173479816044967e-05, - "loss": 2.177076250076927e+25, + "epoch": 22.33, + "learning_rate": 2.5257997936016512e-05, + "loss": 5.796248866014904e+28, "step": 1150 }, { - "epoch": 11.26, - "learning_rate": 4.1594276954522225e-05, - "loss": 2.1045071278258356e+25, + "epoch": 22.52, + "learning_rate": 2.497420020639835e-05, + "loss": 5.0531391652171296e+28, "step": 1160 }, { - "epoch": 11.36, - "learning_rate": 4.1453755748594786e-05, - "loss": 2.068222382232849e+25, + "epoch": 22.72, + "learning_rate": 2.4690402476780188e-05, + "loss": 5.796248488225585e+28, "step": 1170 }, { - "epoch": 11.46, - "learning_rate": 4.1313234542667354e-05, - "loss": 1.923084137730665e+25, + "epoch": 22.91, + "learning_rate": 2.4406604747162022e-05, + "loss": 6.3907354932851674e+28, "step": 1180 }, { - "epoch": 11.55, - "learning_rate": 4.1172713336739915e-05, - "loss": 2.213361180137355e+25, + "epoch": 22.99, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7682, + "eval_samples_per_second": 59.882, + "eval_steps_per_second": 7.811, + "step": 1184 + }, + { + "epoch": 23.11, + "learning_rate": 2.412280701754386e-05, + "loss": 5.449464338975942e+28, "step": 1190 }, { - "epoch": 11.65, - "learning_rate": 4.103219213081247e-05, - "loss": 2.285930117921006e+25, + "epoch": 23.3, + "learning_rate": 2.3839009287925697e-05, + "loss": 5.944869672806502e+28, "step": 1200 }, { - "epoch": 11.75, - "learning_rate": 4.089167092488503e-05, - "loss": 2.104507312293276e+25, + "epoch": 23.5, + "learning_rate": 2.3555211558307535e-05, + "loss": 5.796247732646948e+28, "step": 1210 }, { - "epoch": 11.84, - "learning_rate": 4.075114971895759e-05, - "loss": 2.2859304868558873e+25, + "epoch": 23.69, + "learning_rate": 2.3271413828689372e-05, + "loss": 5.350383045536239e+28, "step": 1220 }, { - "epoch": 11.94, - "learning_rate": 4.0610628513030154e-05, - "loss": 2.3222150479814335e+25, + "epoch": 23.88, + "learning_rate": 2.2987616099071207e-05, + "loss": 7.876953383723441e+28, "step": 1230 }, - { - "epoch": 12.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7912, - "eval_samples_per_second": 58.14, - "eval_steps_per_second": 7.583, - "step": 1236 - }, - { - "epoch": 12.04, - "learning_rate": 4.047010730710271e-05, - "loss": 2.285930117921006e+25, - "step": 1240 - }, - { - "epoch": 12.14, - "learning_rate": 4.032958610117527e-05, - "loss": 2.2496457412629006e+25, - "step": 1250 - }, - { - "epoch": 12.23, - "learning_rate": 4.018906489524783e-05, - "loss": 2.3222148635139926e+25, - "step": 1260 - }, - { - "epoch": 12.33, - "learning_rate": 4.004854368932039e-05, - "loss": 2.3222148635139926e+25, - "step": 1270 - }, - { - "epoch": 12.43, - "learning_rate": 3.990802248339295e-05, - "loss": 2.1045071278258356e+25, - "step": 1280 - }, - { - "epoch": 12.52, - "learning_rate": 3.976750127746551e-05, - "loss": 1.850515015479573e+25, - "step": 1290 - }, - { - "epoch": 12.62, - "learning_rate": 3.962698007153807e-05, - "loss": 2.177076250076927e+25, - "step": 1300 - }, - { - "epoch": 12.72, - "learning_rate": 3.948645886561063e-05, - "loss": 2.2496459257303415e+25, - "step": 1310 - }, - { - "epoch": 12.82, - "learning_rate": 3.934593765968319e-05, - "loss": 2.1407918734188223e+25, - "step": 1320 - }, - { - "epoch": 12.91, - "learning_rate": 3.9205416453755746e-05, - "loss": 2.1407918734188223e+25, - "step": 1330 - }, - { - "epoch": 13.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7662, - "eval_samples_per_second": 60.039, - "eval_steps_per_second": 7.831, - "step": 1339 - }, - { - "epoch": 13.01, - "learning_rate": 3.906489524782831e-05, - "loss": 2.2859304868558873e+25, - "step": 1340 - }, - { - "epoch": 13.11, - "learning_rate": 3.892437404190087e-05, - "loss": 2.104507312293276e+25, - "step": 1350 - }, - { - "epoch": 13.2, - "learning_rate": 3.878385283597343e-05, - "loss": 2.0682227511677303e+25, - "step": 1360 - }, - { - "epoch": 13.3, - "learning_rate": 3.864333163004599e-05, - "loss": 2.0682227511677303e+25, - "step": 1370 - }, - { - "epoch": 13.4, - "learning_rate": 3.8502810424118545e-05, - "loss": 2.0319378211073027e+25, - "step": 1380 - }, - { - "epoch": 13.5, - "learning_rate": 3.836228921819111e-05, - "loss": 2.3222148635139926e+25, - "step": 1390 - }, - { - "epoch": 13.59, - "learning_rate": 3.822176801226367e-05, - "loss": 2.3947839857650846e+25, - "step": 1400 - }, - { - "epoch": 13.69, - "learning_rate": 3.8081246806336236e-05, - "loss": 2.213361180137355e+25, - "step": 1410 - }, - { - "epoch": 13.79, - "learning_rate": 3.794072560040879e-05, - "loss": 2.104507312293276e+25, - "step": 1420 - }, - { - "epoch": 13.88, - "learning_rate": 3.780020439448135e-05, - "loss": 2.177076619011809e+25, - "step": 1430 - }, - { - "epoch": 13.98, - "learning_rate": 3.765968318855391e-05, - "loss": 2.1407916889513814e+25, - "step": 1440 - }, - { - "epoch": 14.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8082, - "eval_samples_per_second": 56.918, - "eval_steps_per_second": 7.424, - "step": 1442 - }, - { - "epoch": 14.08, - "learning_rate": 3.7519161982626474e-05, - "loss": 2.2617409024171505e+25, - "step": 1450 - }, - { - "epoch": 14.17, - "learning_rate": 3.7378640776699036e-05, - "loss": 2.213360995669914e+25, - "step": 1460 - }, - { - "epoch": 14.27, - "learning_rate": 3.723811957077159e-05, - "loss": 2.177076619011809e+25, - "step": 1470 - }, - { - "epoch": 14.37, - "learning_rate": 3.709759836484415e-05, - "loss": 2.104507312293276e+25, - "step": 1480 - }, - { - "epoch": 14.47, - "learning_rate": 3.695707715891671e-05, - "loss": 2.140792057886263e+25, - "step": 1490 - }, - { - "epoch": 14.56, - "learning_rate": 3.6816555952989274e-05, - "loss": 2.1407918734188223e+25, - "step": 1500 - }, - { - "epoch": 14.66, - "learning_rate": 3.667603474706183e-05, - "loss": 2.2859304868558873e+25, - "step": 1510 - }, - { - "epoch": 14.76, - "learning_rate": 3.653551354113439e-05, - "loss": 2.0682225667002894e+25, - "step": 1520 - }, - { - "epoch": 14.85, - "learning_rate": 3.639499233520695e-05, - "loss": 2.2133608112024734e+25, - "step": 1530 - }, - { - "epoch": 14.95, - "learning_rate": 3.625447112927951e-05, - "loss": 2.177076434544368e+25, - "step": 1540 - }, - { - "epoch": 15.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7906, - "eval_samples_per_second": 58.183, - "eval_steps_per_second": 7.589, - "step": 1545 - }, - { - "epoch": 15.05, - "learning_rate": 3.611394992335207e-05, - "loss": 1.9109895299787373e+25, - "step": 1550 - }, - { - "epoch": 15.15, - "learning_rate": 3.597342871742463e-05, - "loss": 2.1407918734188223e+25, - "step": 1560 - }, - { - "epoch": 15.24, - "learning_rate": 3.583290751149719e-05, - "loss": 2.104507312293276e+25, - "step": 1570 - }, - { - "epoch": 15.34, - "learning_rate": 3.569238630556975e-05, - "loss": 2.2496457412629006e+25, - "step": 1580 - }, - { - "epoch": 15.44, - "learning_rate": 3.555186509964231e-05, - "loss": 2.1407918734188223e+25, - "step": 1590 - }, - { - "epoch": 15.53, - "learning_rate": 3.5411343893714866e-05, - "loss": 2.177076434544368e+25, - "step": 1600 - }, - { - "epoch": 15.63, - "learning_rate": 3.527082268778743e-05, - "loss": 2.1407918734188223e+25, - "step": 1610 - }, - { - "epoch": 15.73, - "learning_rate": 3.513030148185999e-05, - "loss": 2.285930117921006e+25, - "step": 1620 - }, - { - "epoch": 15.83, - "learning_rate": 3.498978027593255e-05, - "loss": 2.3222148635139926e+25, - "step": 1630 - }, - { - "epoch": 15.92, - "learning_rate": 3.484925907000511e-05, - "loss": 2.24964555679546e+25, - "step": 1640 - }, - { - "epoch": 16.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7962, - "eval_samples_per_second": 57.774, - "eval_steps_per_second": 7.536, - "step": 1648 - }, - { - "epoch": 16.02, - "learning_rate": 3.470873786407767e-05, - "loss": 2.0682227511677303e+25, - "step": 1650 - }, - { - "epoch": 16.12, - "learning_rate": 3.4568216658150234e-05, - "loss": 1.923084137730665e+25, - "step": 1660 - }, - { - "epoch": 16.21, - "learning_rate": 3.4427695452222795e-05, - "loss": 2.2496457412629006e+25, - "step": 1670 - }, - { - "epoch": 16.31, - "learning_rate": 3.4287174246295356e-05, - "loss": 2.177076619011809e+25, - "step": 1680 - }, - { - "epoch": 16.41, - "learning_rate": 3.414665304036791e-05, - "loss": 2.1407918734188223e+25, - "step": 1690 - }, - { - "epoch": 16.5, - "learning_rate": 3.400613183444047e-05, - "loss": 2.1407916889513814e+25, - "step": 1700 - }, - { - "epoch": 16.6, - "learning_rate": 3.386561062851303e-05, - "loss": 2.0682227511677303e+25, - "step": 1710 - }, - { - "epoch": 16.7, - "learning_rate": 3.3725089422585595e-05, - "loss": 2.285930302388447e+25, - "step": 1720 - }, - { - "epoch": 16.8, - "learning_rate": 3.358456821665815e-05, - "loss": 2.104507496760717e+25, - "step": 1730 - }, - { - "epoch": 16.89, - "learning_rate": 3.344404701073071e-05, - "loss": 2.2859304868558873e+25, - "step": 1740 - }, - { - "epoch": 16.99, - "learning_rate": 3.330352580480327e-05, - "loss": 2.285930302388447e+25, - "step": 1750 - }, - { - "epoch": 17.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7727, - "eval_samples_per_second": 59.533, - "eval_steps_per_second": 7.765, - "step": 1751 - }, - { - "epoch": 17.09, - "learning_rate": 3.316300459887583e-05, - "loss": 2.2254557878892825e+25, - "step": 1760 - }, - { - "epoch": 17.18, - "learning_rate": 3.3022483392948394e-05, - "loss": 1.9956534444491974e+25, - "step": 1770 - }, - { - "epoch": 17.28, - "learning_rate": 3.288196218702095e-05, - "loss": 2.3584994246395388e+25, - "step": 1780 - }, - { - "epoch": 17.38, - "learning_rate": 3.274144098109351e-05, - "loss": 2.1045071278258356e+25, - "step": 1790 - }, - { - "epoch": 17.48, - "learning_rate": 3.260091977516607e-05, - "loss": 1.886799576605119e+25, - "step": 1800 - }, - { - "epoch": 17.57, - "learning_rate": 3.246039856923863e-05, - "loss": 2.3584994246395388e+25, - "step": 1810 - }, - { - "epoch": 17.67, - "learning_rate": 3.2319877363311194e-05, - "loss": 2.104507312293276e+25, - "step": 1820 - }, - { - "epoch": 17.77, - "learning_rate": 3.217935615738375e-05, - "loss": 2.285930302388447e+25, - "step": 1830 - }, - { - "epoch": 17.86, - "learning_rate": 3.203883495145631e-05, - "loss": 2.3584994246395388e+25, - "step": 1840 - }, - { - "epoch": 17.96, - "learning_rate": 3.189831374552887e-05, - "loss": 1.9593686988562108e+25, - "step": 1850 - }, - { - "epoch": 18.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8012, - "eval_samples_per_second": 57.414, - "eval_steps_per_second": 7.489, - "step": 1854 - }, - { - "epoch": 18.06, - "learning_rate": 3.175779253960143e-05, - "loss": 2.1528866656381905e+25, - "step": 1860 - }, - { - "epoch": 18.16, - "learning_rate": 3.1617271333673986e-05, - "loss": 2.0682225667002894e+25, - "step": 1870 - }, - { - "epoch": 18.25, - "learning_rate": 3.147675012774655e-05, - "loss": 2.285930117921006e+25, - "step": 1880 - }, - { - "epoch": 18.35, - "learning_rate": 3.1336228921819116e-05, - "loss": 2.285930302388447e+25, - "step": 1890 - }, - { - "epoch": 18.45, - "learning_rate": 3.119570771589168e-05, - "loss": 2.104507312293276e+25, - "step": 1900 - }, - { - "epoch": 18.54, - "learning_rate": 3.105518650996423e-05, - "loss": 2.140792057886263e+25, - "step": 1910 - }, - { - "epoch": 18.64, - "learning_rate": 3.091466530403679e-05, - "loss": 2.213360995669914e+25, - "step": 1920 - }, - { - "epoch": 18.74, - "learning_rate": 3.0774144098109354e-05, - "loss": 2.177076250076927e+25, - "step": 1930 - }, - { - "epoch": 18.83, - "learning_rate": 3.0633622892181915e-05, - "loss": 1.9593688833236517e+25, - "step": 1940 - }, - { - "epoch": 18.93, - "learning_rate": 3.0493101686254473e-05, - "loss": 2.2859304868558873e+25, - "step": 1950 - }, - { - "epoch": 19.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7922, - "eval_samples_per_second": 58.068, - "eval_steps_per_second": 7.574, - "step": 1957 - }, - { - "epoch": 19.03, - "learning_rate": 3.0352580480327034e-05, - "loss": 2.116602288980085e+25, - "step": 1960 - }, - { - "epoch": 19.13, - "learning_rate": 3.0212059274399592e-05, - "loss": 2.3222148635139926e+25, - "step": 1970 - }, - { - "epoch": 19.22, - "learning_rate": 3.0071538068472154e-05, - "loss": 2.177076619011809e+25, - "step": 1980 - }, - { - "epoch": 19.32, - "learning_rate": 2.993101686254471e-05, - "loss": 2.3222148635139926e+25, - "step": 1990 - }, - { - "epoch": 19.42, - "learning_rate": 2.9790495656617273e-05, - "loss": 2.1045071278258356e+25, - "step": 2000 - }, - { - "epoch": 19.51, - "learning_rate": 2.964997445068983e-05, - "loss": 1.9593685143887703e+25, - "step": 2010 - }, - { - "epoch": 19.61, - "learning_rate": 2.9509453244762392e-05, - "loss": 2.213360995669914e+25, - "step": 2020 - }, - { - "epoch": 19.71, - "learning_rate": 2.9368932038834953e-05, - "loss": 2.2496457412629006e+25, - "step": 2030 - }, - { - "epoch": 19.81, - "learning_rate": 2.922841083290751e-05, - "loss": 2.0682225667002894e+25, - "step": 2040 - }, - { - "epoch": 19.9, - "learning_rate": 2.9087889626980072e-05, - "loss": 2.177076434544368e+25, - "step": 2050 - }, - { - "epoch": 20.0, - "learning_rate": 2.894736842105263e-05, - "loss": 2.1528866656381905e+25, - "step": 2060 - }, - { - "epoch": 20.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7897, - "eval_samples_per_second": 58.251, - "eval_steps_per_second": 7.598, - "step": 2060 - }, - { - "epoch": 20.1, - "learning_rate": 2.880684721512519e-05, - "loss": 2.177076250076927e+25, - "step": 2070 - }, - { - "epoch": 20.19, - "learning_rate": 2.866632600919775e-05, - "loss": 2.213360995669914e+25, - "step": 2080 - }, - { - "epoch": 20.29, - "learning_rate": 2.852580480327031e-05, - "loss": 1.9593686988562108e+25, - "step": 2090 - }, - { - "epoch": 20.39, - "learning_rate": 2.8385283597342872e-05, - "loss": 2.358499240172098e+25, - "step": 2100 - }, - { - "epoch": 20.49, - "learning_rate": 2.824476239141543e-05, - "loss": 2.213360995669914e+25, - "step": 2110 - }, - { - "epoch": 20.58, - "learning_rate": 2.8104241185487994e-05, - "loss": 2.285930117921006e+25, - "step": 2120 - }, - { - "epoch": 20.68, - "learning_rate": 2.7963719979560556e-05, - "loss": 2.3222148635139926e+25, - "step": 2130 - }, - { - "epoch": 20.78, - "learning_rate": 2.7823198773633117e-05, - "loss": 2.1045071278258356e+25, - "step": 2140 - }, - { - "epoch": 20.87, - "learning_rate": 2.7682677567705675e-05, - "loss": 2.2496457412629006e+25, - "step": 2150 - }, - { - "epoch": 20.97, - "learning_rate": 2.7542156361778236e-05, - "loss": 1.7053764020425078e+25, - "step": 2160 - }, - { - "epoch": 21.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7782, - "eval_samples_per_second": 59.113, - "eval_steps_per_second": 7.71, - "step": 2163 - }, - { - "epoch": 21.07, - "learning_rate": 2.740163515585079e-05, - "loss": 2.213360995669914e+25, - "step": 2170 - }, - { - "epoch": 21.17, - "learning_rate": 2.7261113949923355e-05, - "loss": 2.1407918734188223e+25, - "step": 2180 - }, - { - "epoch": 21.26, - "learning_rate": 2.7120592743995913e-05, - "loss": 2.104507496760717e+25, - "step": 2190 - }, - { - "epoch": 21.36, - "learning_rate": 2.6980071538068474e-05, - "loss": 2.5036378536091632e+25, - "step": 2200 - }, - { - "epoch": 21.46, - "learning_rate": 2.6839550332141032e-05, - "loss": 2.0682227511677303e+25, - "step": 2210 - }, - { - "epoch": 21.55, - "learning_rate": 2.6699029126213593e-05, - "loss": 2.213360995669914e+25, - "step": 2220 - }, - { - "epoch": 21.65, - "learning_rate": 2.6558507920286155e-05, - "loss": 2.1407918734188223e+25, - "step": 2230 - }, - { - "epoch": 21.75, - "learning_rate": 2.6417986714358713e-05, - "loss": 2.1407918734188223e+25, - "step": 2240 - }, - { - "epoch": 21.84, - "learning_rate": 2.6277465508431274e-05, - "loss": 2.1407916889513814e+25, - "step": 2250 - }, - { - "epoch": 21.94, - "learning_rate": 2.6136944302503832e-05, - "loss": 2.213360995669914e+25, - "step": 2260 - }, - { - "epoch": 22.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7812, - "eval_samples_per_second": 58.884, - "eval_steps_per_second": 7.681, - "step": 2266 - }, - { - "epoch": 22.04, - "learning_rate": 2.5996423096576393e-05, - "loss": 2.1045071278258356e+25, - "step": 2270 - }, - { - "epoch": 22.14, - "learning_rate": 2.585590189064895e-05, - "loss": 2.0682227511677303e+25, - "step": 2280 - }, - { - "epoch": 22.23, - "learning_rate": 2.5715380684721512e-05, - "loss": 2.213360995669914e+25, - "step": 2290 - }, - { - "epoch": 22.33, - "learning_rate": 2.5574859478794073e-05, - "loss": 2.177076250076927e+25, - "step": 2300 - }, - { - "epoch": 22.43, - "learning_rate": 2.5434338272866635e-05, - "loss": 2.3584996091069793e+25, - "step": 2310 - }, - { - "epoch": 22.52, - "learning_rate": 2.5293817066939196e-05, - "loss": 2.213361180137355e+25, - "step": 2320 - }, - { - "epoch": 22.62, - "learning_rate": 2.5153295861011754e-05, - "loss": 2.177076434544368e+25, - "step": 2330 - }, - { - "epoch": 22.72, - "learning_rate": 2.5012774655084315e-05, - "loss": 2.213360995669914e+25, - "step": 2340 - }, - { - "epoch": 22.82, - "learning_rate": 2.4872253449156873e-05, - "loss": 2.1407918734188223e+25, - "step": 2350 - }, - { - "epoch": 22.91, - "learning_rate": 2.4731732243229434e-05, - "loss": 2.104507496760717e+25, - "step": 2360 - }, - { - "epoch": 23.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7982, - "eval_samples_per_second": 57.631, - "eval_steps_per_second": 7.517, - "step": 2369 - }, - { - "epoch": 23.01, - "learning_rate": 2.4591211037301992e-05, - "loss": 2.116602288980085e+25, - "step": 2370 - }, - { - "epoch": 23.11, - "learning_rate": 2.4450689831374553e-05, - "loss": 2.3584994246395388e+25, - "step": 2380 - }, - { - "epoch": 23.2, - "learning_rate": 2.431016862544711e-05, - "loss": 2.177076619011809e+25, - "step": 2390 - }, - { - "epoch": 23.3, - "learning_rate": 2.4169647419519672e-05, - "loss": 2.177076619011809e+25, - "step": 2400 - }, - { - "epoch": 23.4, - "learning_rate": 2.4029126213592234e-05, - "loss": 2.104507312293276e+25, - "step": 2410 - }, - { - "epoch": 23.5, - "learning_rate": 2.3888605007664795e-05, - "loss": 2.2859304868558873e+25, - "step": 2420 - }, - { - "epoch": 23.59, - "learning_rate": 2.3748083801737356e-05, - "loss": 2.213361180137355e+25, - "step": 2430 - }, - { - "epoch": 23.69, - "learning_rate": 2.3607562595809914e-05, - "loss": 2.285930302388447e+25, - "step": 2440 - }, - { - "epoch": 23.79, - "learning_rate": 2.3467041389882475e-05, - "loss": 1.9593688833236517e+25, - "step": 2450 - }, - { - "epoch": 23.88, - "learning_rate": 2.3326520183955033e-05, - "loss": 1.9230843221981057e+25, - "step": 2460 - }, - { - "epoch": 23.98, - "learning_rate": 2.3185998978027595e-05, - "loss": 2.1407915044839405e+25, - "step": 2470 - }, { "epoch": 24.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7787, - "eval_samples_per_second": 59.074, - "eval_steps_per_second": 7.705, - "step": 2472 + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7822, + "eval_samples_per_second": 58.81, + "eval_steps_per_second": 7.671, + "step": 1236 }, { "epoch": 24.08, - "learning_rate": 2.3045477772100152e-05, - "loss": 2.0440329822615527e+25, - "step": 2480 - }, - { - "epoch": 24.17, - "learning_rate": 2.2904956566172714e-05, - "loss": 2.3222148635139926e+25, - "step": 2490 + "learning_rate": 2.2703818369453044e-05, + "loss": 6.6384375935164695e+28, + "step": 1240 }, { "epoch": 24.27, - "learning_rate": 2.2764435360245275e-05, - "loss": 2.322215232448874e+25, - "step": 2500 - }, - { - "epoch": 24.37, - "learning_rate": 2.2623914154317833e-05, - "loss": 2.177076434544368e+25, - "step": 2510 + "learning_rate": 2.2420020639834882e-05, + "loss": 4.755895662687339e+28, + "step": 1250 }, { "epoch": 24.47, - "learning_rate": 2.2483392948390394e-05, - "loss": 2.2859304868558873e+25, - "step": 2520 - }, - { - "epoch": 24.56, - "learning_rate": 2.2342871742462955e-05, - "loss": 2.177076434544368e+25, - "step": 2530 + "learning_rate": 2.213622291021672e-05, + "loss": 5.499004985695794e+28, + "step": 1260 }, { "epoch": 24.66, - "learning_rate": 2.2202350536535517e-05, - "loss": 2.177076434544368e+25, - "step": 2540 - }, - { - "epoch": 24.76, - "learning_rate": 2.2061829330608075e-05, - "loss": 1.8142304543540272e+25, - "step": 2550 + "learning_rate": 2.1852425180598557e-05, + "loss": 5.944869295017184e+28, + "step": 1270 }, { "epoch": 24.85, - "learning_rate": 2.1921308124680636e-05, - "loss": 2.3584996091069793e+25, - "step": 2560 - }, - { - "epoch": 24.95, - "learning_rate": 2.1780786918753194e-05, - "loss": 2.177076250076927e+25, - "step": 2570 + "learning_rate": 2.156862745098039e-05, + "loss": 6.687978618025639e+28, + "step": 1280 }, { - "epoch": 25.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8352, - "eval_samples_per_second": 55.076, - "eval_steps_per_second": 7.184, - "step": 2575 + "epoch": 24.99, + "eval_accuracy": 0.8043478260869565, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7867, + "eval_samples_per_second": 58.473, + "eval_steps_per_second": 7.627, + "step": 1287 }, { "epoch": 25.05, - "learning_rate": 2.1640265712825755e-05, - "loss": 1.947274091104283e+25, - "step": 2580 - }, - { - "epoch": 25.15, - "learning_rate": 2.1499744506898313e-05, - "loss": 2.1407918734188223e+25, - "step": 2590 + "learning_rate": 2.1284829721362232e-05, + "loss": 6.8861404493264086e+28, + "step": 1290 }, { "epoch": 25.24, - "learning_rate": 2.1359223300970874e-05, - "loss": 2.104507312293276e+25, - "step": 2600 - }, - { - "epoch": 25.34, - "learning_rate": 2.1218702095043435e-05, - "loss": 2.0682227511677303e+25, - "step": 2610 + "learning_rate": 2.1001031991744067e-05, + "loss": 6.3907354932851674e+28, + "step": 1300 }, { "epoch": 25.44, - "learning_rate": 2.1078180889115993e-05, - "loss": 2.394784354699966e+25, - "step": 2620 - }, - { - "epoch": 25.53, - "learning_rate": 2.0937659683188554e-05, - "loss": 2.0682225667002894e+25, - "step": 2630 + "learning_rate": 2.0717234262125904e-05, + "loss": 5.499004607906475e+28, + "step": 1310 }, { "epoch": 25.63, - "learning_rate": 2.0797138477261112e-05, - "loss": 2.0319380055747437e+25, - "step": 2640 - }, - { - "epoch": 25.73, - "learning_rate": 2.0656617271333677e-05, - "loss": 2.0682227511677303e+25, - "step": 2650 + "learning_rate": 2.0433436532507742e-05, + "loss": 6.985221742766112e+28, + "step": 1320 }, { "epoch": 25.83, - "learning_rate": 2.0516096065406235e-05, - "loss": 2.3584996091069793e+25, - "step": 2660 - }, - { - "epoch": 25.92, - "learning_rate": 2.0375574859478796e-05, - "loss": 2.3947841702325255e+25, - "step": 2670 + "learning_rate": 2.0149638802889576e-05, + "loss": 5.647626925855348e+28, + "step": 1330 }, { "epoch": 26.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8067, - "eval_samples_per_second": 57.023, - "eval_steps_per_second": 7.438, - "step": 2678 + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7822, + "eval_samples_per_second": 58.81, + "eval_steps_per_second": 7.671, + "step": 1339 }, { "epoch": 26.02, - "learning_rate": 2.0235053653551354e-05, - "loss": 2.1528866656381905e+25, - "step": 2680 - }, - { - "epoch": 26.12, - "learning_rate": 2.0094532447623915e-05, - "loss": 2.3222148635139926e+25, - "step": 2690 + "learning_rate": 1.9865841073271417e-05, + "loss": 5.15222121423547e+28, + "step": 1340 }, { "epoch": 26.21, - "learning_rate": 1.9954011241696477e-05, - "loss": 2.285930302388447e+25, - "step": 2700 - }, - { - "epoch": 26.31, - "learning_rate": 1.9813490035769034e-05, - "loss": 1.923084137730665e+25, - "step": 2710 + "learning_rate": 1.958204334365325e-05, + "loss": 4.904517602846893e+28, + "step": 1350 }, { "epoch": 26.41, - "learning_rate": 1.9672968829841596e-05, - "loss": 2.1407918734188223e+25, - "step": 2720 - }, - { - "epoch": 26.5, - "learning_rate": 1.9532447623914154e-05, - "loss": 2.3222148635139926e+25, - "step": 2730 + "learning_rate": 1.929824561403509e-05, + "loss": 7.133844060714986e+28, + "step": 1360 }, { "epoch": 26.6, - "learning_rate": 1.9391926417986715e-05, - "loss": 2.0319380055747437e+25, - "step": 2740 - }, - { - "epoch": 26.7, - "learning_rate": 1.9251405212059273e-05, - "loss": 2.177076250076927e+25, - "step": 2750 + "learning_rate": 1.9014447884416927e-05, + "loss": 5.944870050595821e+28, + "step": 1370 }, { "epoch": 26.8, - "learning_rate": 1.9110884006131834e-05, - "loss": 2.213360995669914e+25, - "step": 2760 - }, - { - "epoch": 26.89, - "learning_rate": 1.8970362800204395e-05, - "loss": 2.0682225667002894e+25, - "step": 2770 + "learning_rate": 1.873065015479876e-05, + "loss": 5.796248110436267e+28, + "step": 1380 }, { "epoch": 26.99, - "learning_rate": 1.8829841594276956e-05, - "loss": 2.1045071278258356e+25, - "step": 2780 - }, - { - "epoch": 27.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.826, - "eval_samples_per_second": 55.688, - "eval_steps_per_second": 7.264, - "step": 2781 + "learning_rate": 1.8446852425180602e-05, + "loss": 6.687978618025639e+28, + "step": 1390 }, { - "epoch": 27.09, - "learning_rate": 1.8689320388349518e-05, - "loss": 2.0803175433870985e+25, - "step": 2790 + "epoch": 26.99, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8347, + "eval_samples_per_second": 55.11, + "eval_steps_per_second": 7.188, + "step": 1390 }, { "epoch": 27.18, - "learning_rate": 1.8548799182422076e-05, - "loss": 2.068222382232849e+25, - "step": 2800 - }, - { - "epoch": 27.28, - "learning_rate": 1.8408277976494637e-05, - "loss": 2.2496457412629006e+25, - "step": 2810 + "learning_rate": 1.8163054695562436e-05, + "loss": 6.78706066704398e+28, + "step": 1400 }, { "epoch": 27.38, - "learning_rate": 1.8267756770567195e-05, - "loss": 2.2496457412629006e+25, - "step": 2820 - }, - { - "epoch": 27.48, - "learning_rate": 1.8127235564639756e-05, - "loss": 2.177076619011809e+25, - "step": 2830 + "learning_rate": 1.787925696594427e-05, + "loss": 5.350383045536239e+28, + "step": 1410 }, { "epoch": 27.57, - "learning_rate": 1.7986714358712314e-05, - "loss": 2.177076434544368e+25, - "step": 2840 - }, - { - "epoch": 27.67, - "learning_rate": 1.7846193152784875e-05, - "loss": 2.177076619011809e+25, - "step": 2850 + "learning_rate": 1.759545923632611e-05, + "loss": 5.944870050595821e+28, + "step": 1420 }, { "epoch": 27.77, - "learning_rate": 1.7705671946857433e-05, - "loss": 2.2496457412629006e+25, - "step": 2860 - }, - { - "epoch": 27.86, - "learning_rate": 1.7565150740929994e-05, - "loss": 2.1407918734188223e+25, - "step": 2870 + "learning_rate": 1.7311661506707946e-05, + "loss": 5.647626170276711e+28, + "step": 1430 }, { "epoch": 27.96, - "learning_rate": 1.7424629535002556e-05, - "loss": 2.177076250076927e+25, - "step": 2880 + "learning_rate": 1.7027863777089787e-05, + "loss": 6.093491990755376e+28, + "step": 1440 }, { "epoch": 28.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8323, - "eval_samples_per_second": 55.266, - "eval_steps_per_second": 7.209, - "step": 2884 - }, - { - "epoch": 28.06, - "learning_rate": 1.7284108329075117e-05, - "loss": 1.9714636755430202e+25, - "step": 2890 + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7802, + "eval_samples_per_second": 58.961, + "eval_steps_per_second": 7.691, + "step": 1442 }, { "epoch": 28.16, - "learning_rate": 1.7143587123147678e-05, - "loss": 2.104507312293276e+25, - "step": 2900 - }, - { - "epoch": 28.25, - "learning_rate": 1.7003065917220236e-05, - "loss": 2.1407918734188223e+25, - "step": 2910 + "learning_rate": 1.674406604747162e-05, + "loss": 7.084303036205815e+28, + "step": 1450 }, { "epoch": 28.35, - "learning_rate": 1.6862544711292797e-05, - "loss": 1.9593686988562108e+25, - "step": 2920 - }, - { - "epoch": 28.45, - "learning_rate": 1.6722023505365355e-05, - "loss": 2.467353292483617e+25, - "step": 2930 + "learning_rate": 1.6460268317853455e-05, + "loss": 6.985222498344749e+28, + "step": 1460 }, { "epoch": 28.54, - "learning_rate": 1.6581502299437916e-05, - "loss": 2.2496457412629006e+25, - "step": 2940 - }, - { - "epoch": 28.64, - "learning_rate": 1.6440981093510474e-05, - "loss": 2.3222148635139926e+25, - "step": 2950 + "learning_rate": 1.6176470588235296e-05, + "loss": 4.458652160157547e+28, + "step": 1470 }, { "epoch": 28.74, - "learning_rate": 1.6300459887583036e-05, - "loss": 1.9956534444491974e+25, - "step": 2960 - }, - { - "epoch": 28.83, - "learning_rate": 1.6159938681655597e-05, - "loss": 2.3222148635139926e+25, - "step": 2970 + "learning_rate": 1.589267285861713e-05, + "loss": 6.093491612966058e+28, + "step": 1480 }, { "epoch": 28.93, - "learning_rate": 1.6019417475728155e-05, - "loss": 2.1407918734188223e+25, - "step": 2980 - }, - { - "epoch": 29.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8133, - "eval_samples_per_second": 56.562, - "eval_steps_per_second": 7.378, - "step": 2987 + "learning_rate": 1.560887512899897e-05, + "loss": 5.499004607906475e+28, + "step": 1490 }, { - "epoch": 29.03, - "learning_rate": 1.5878896269800716e-05, - "loss": 2.1891712267637367e+25, - "step": 2990 + "epoch": 28.99, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7762, + "eval_samples_per_second": 59.264, + "eval_steps_per_second": 7.73, + "step": 1493 }, { "epoch": 29.13, - "learning_rate": 1.5738375063873274e-05, - "loss": 2.177076250076927e+25, - "step": 3000 - }, - { - "epoch": 29.22, - "learning_rate": 1.559785385794584e-05, - "loss": 2.177076434544368e+25, - "step": 3010 + "learning_rate": 1.5325077399380806e-05, + "loss": 5.895329403875969e+28, + "step": 1500 }, { "epoch": 29.32, - "learning_rate": 1.5457332652018396e-05, - "loss": 2.177076619011809e+25, - "step": 3020 - }, - { - "epoch": 29.42, - "learning_rate": 1.5316811446090958e-05, - "loss": 2.3584996091069793e+25, - "step": 3030 + "learning_rate": 1.5041279669762642e-05, + "loss": 5.944869672806502e+28, + "step": 1510 }, { "epoch": 29.51, - "learning_rate": 1.5176290240163517e-05, - "loss": 1.923084137730665e+25, - "step": 3040 - }, - { - "epoch": 29.61, - "learning_rate": 1.5035769034236077e-05, - "loss": 2.2496457412629006e+25, - "step": 3050 + "learning_rate": 1.4757481940144479e-05, + "loss": 6.242113930914931e+28, + "step": 1520 }, { "epoch": 29.71, - "learning_rate": 1.4895247828308636e-05, - "loss": 2.24964555679546e+25, - "step": 3060 - }, - { - "epoch": 29.81, - "learning_rate": 1.4754726622381196e-05, - "loss": 1.923084137730665e+25, - "step": 3070 + "learning_rate": 1.4473684210526315e-05, + "loss": 5.3503838011148765e+28, + "step": 1530 }, { "epoch": 29.9, - "learning_rate": 1.4614205416453755e-05, - "loss": 2.177076250076927e+25, - "step": 3080 - }, - { - "epoch": 30.0, - "learning_rate": 1.4473684210526315e-05, - "loss": 2.1528866656381905e+25, - "step": 3090 + "learning_rate": 1.4189886480908154e-05, + "loss": 6.985223253923386e+28, + "step": 1540 }, { "epoch": 30.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8077, - "eval_samples_per_second": 56.953, - "eval_steps_per_second": 7.429, - "step": 3090 + "eval_accuracy": 0.782608695652174, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7882, + "eval_samples_per_second": 58.363, + "eval_steps_per_second": 7.613, + "step": 1545 }, { "epoch": 30.1, - "learning_rate": 1.4333163004598875e-05, - "loss": 2.3222148635139926e+25, - "step": 3100 - }, - { - "epoch": 30.19, - "learning_rate": 1.4192641798671436e-05, - "loss": 2.3222148635139926e+25, - "step": 3110 + "learning_rate": 1.390608875128999e-05, + "loss": 5.449463961186624e+28, + "step": 1550 }, { "epoch": 30.29, - "learning_rate": 1.4052120592743997e-05, - "loss": 2.3222150479814335e+25, - "step": 3120 - }, - { - "epoch": 30.39, - "learning_rate": 1.3911599386816558e-05, - "loss": 2.213360995669914e+25, - "step": 3130 + "learning_rate": 1.3622291021671828e-05, + "loss": 4.755896040476657e+28, + "step": 1560 }, { "epoch": 30.49, - "learning_rate": 1.3771078180889118e-05, - "loss": 2.177076619011809e+25, - "step": 3140 - }, - { - "epoch": 30.58, - "learning_rate": 1.3630556974961678e-05, - "loss": 2.1407918734188223e+25, - "step": 3150 + "learning_rate": 1.3338493292053664e-05, + "loss": 5.796248488225585e+28, + "step": 1570 }, { "epoch": 30.68, - "learning_rate": 1.3490035769034237e-05, - "loss": 1.995653259981757e+25, - "step": 3160 - }, - { - "epoch": 30.78, - "learning_rate": 1.3349514563106797e-05, - "loss": 1.923084137730665e+25, - "step": 3170 + "learning_rate": 1.3054695562435501e-05, + "loss": 6.836600935974513e+28, + "step": 1580 }, { "epoch": 30.87, - "learning_rate": 1.3208993357179356e-05, - "loss": 2.1407918734188223e+25, - "step": 3180 - }, - { - "epoch": 30.97, - "learning_rate": 1.3068472151251916e-05, - "loss": 2.104507312293276e+25, - "step": 3190 + "learning_rate": 1.2770897832817337e-05, + "loss": 7.133844060714986e+28, + "step": 1590 }, { - "epoch": 31.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8172, - "eval_samples_per_second": 56.29, - "eval_steps_per_second": 7.342, - "step": 3193 + "epoch": 30.99, + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8897, + "eval_samples_per_second": 51.701, + "eval_steps_per_second": 6.744, + "step": 1596 }, { "epoch": 31.07, - "learning_rate": 1.2927950945324475e-05, - "loss": 2.3584994246395388e+25, - "step": 3200 - }, - { - "epoch": 31.17, - "learning_rate": 1.2787429739397037e-05, - "loss": 2.1045071278258356e+25, - "step": 3210 + "learning_rate": 1.2487100103199175e-05, + "loss": 5.499004607906475e+28, + "step": 1600 }, { "epoch": 31.26, - "learning_rate": 1.2646908533469598e-05, - "loss": 2.1407918734188223e+25, - "step": 3220 - }, - { - "epoch": 31.36, - "learning_rate": 1.2506387327542158e-05, - "loss": 2.140792057886263e+25, - "step": 3230 + "learning_rate": 1.2203302373581011e-05, + "loss": 6.3907347377065295e+28, + "step": 1610 }, { "epoch": 31.46, - "learning_rate": 1.2365866121614717e-05, - "loss": 2.213361180137355e+25, - "step": 3240 - }, - { - "epoch": 31.55, - "learning_rate": 1.2225344915687277e-05, - "loss": 2.0319380055747437e+25, - "step": 3250 + "learning_rate": 1.1919504643962849e-05, + "loss": 5.944870050595821e+28, + "step": 1620 }, { "epoch": 31.65, - "learning_rate": 1.2084823709759836e-05, - "loss": 2.2496457412629006e+25, - "step": 3260 - }, - { - "epoch": 31.75, - "learning_rate": 1.1944302503832397e-05, - "loss": 2.177076619011809e+25, - "step": 3270 + "learning_rate": 1.1635706914344686e-05, + "loss": 6.242113930914931e+28, + "step": 1630 }, { "epoch": 31.84, - "learning_rate": 1.1803781297904957e-05, - "loss": 1.9956534444491974e+25, - "step": 3280 - }, - { - "epoch": 31.94, - "learning_rate": 1.1663260091977517e-05, - "loss": 2.3584994246395388e+25, - "step": 3290 + "learning_rate": 1.1351909184726522e-05, + "loss": 6.687979373604276e+28, + "step": 1640 }, { "epoch": 32.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7732, - "eval_samples_per_second": 59.495, - "eval_steps_per_second": 7.76, - "step": 3296 + "eval_accuracy": 0.7608695652173914, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8212, + "eval_samples_per_second": 56.017, + "eval_steps_per_second": 7.307, + "step": 1648 }, { "epoch": 32.04, - "learning_rate": 1.1522738886050076e-05, - "loss": 2.116601920045204e+25, - "step": 3300 - }, - { - "epoch": 32.14, - "learning_rate": 1.1382217680122637e-05, - "loss": 2.177076434544368e+25, - "step": 3310 + "learning_rate": 1.106811145510836e-05, + "loss": 5.449463961186624e+28, + "step": 1650 }, { "epoch": 32.23, - "learning_rate": 1.1241696474195197e-05, - "loss": 2.2859304868558873e+25, - "step": 3320 - }, - { - "epoch": 32.33, - "learning_rate": 1.1101175268267758e-05, - "loss": 2.24964555679546e+25, - "step": 3330 + "learning_rate": 1.0784313725490196e-05, + "loss": 5.499004607906475e+28, + "step": 1660 }, { "epoch": 32.43, - "learning_rate": 1.0960654062340318e-05, - "loss": 2.104507312293276e+25, - "step": 3340 - }, - { - "epoch": 32.52, - "learning_rate": 1.0820132856412877e-05, - "loss": 1.923084137730665e+25, - "step": 3350 + "learning_rate": 1.0500515995872033e-05, + "loss": 5.944870050595821e+28, + "step": 1670 }, { "epoch": 32.62, - "learning_rate": 1.0679611650485437e-05, - "loss": 1.995653259981757e+25, - "step": 3360 - }, - { - "epoch": 32.72, - "learning_rate": 1.0539090444557997e-05, - "loss": 2.394784354699966e+25, - "step": 3370 + "learning_rate": 1.0216718266253871e-05, + "loss": 7.7283318213532045e+28, + "step": 1680 }, { "epoch": 32.82, - "learning_rate": 1.0398569238630556e-05, - "loss": 2.3222148635139926e+25, - "step": 3380 - }, - { - "epoch": 32.91, - "learning_rate": 1.0258048032703117e-05, - "loss": 2.1045069433583947e+25, - "step": 3390 + "learning_rate": 9.932920536635709e-06, + "loss": 4.458652537946866e+28, + "step": 1690 }, { - "epoch": 33.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8002, - "eval_samples_per_second": 57.486, - "eval_steps_per_second": 7.498, - "step": 3399 + "epoch": 32.99, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8102, + "eval_samples_per_second": 56.775, + "eval_steps_per_second": 7.405, + "step": 1699 }, { "epoch": 33.01, - "learning_rate": 1.0117526826775677e-05, - "loss": 2.24964555679546e+25, - "step": 3400 - }, - { - "epoch": 33.11, - "learning_rate": 9.977005620848238e-06, - "loss": 1.9593686988562108e+25, - "step": 3410 + "learning_rate": 9.649122807017545e-06, + "loss": 5.944870050595821e+28, + "step": 1700 }, { "epoch": 33.2, - "learning_rate": 9.836484414920798e-06, - "loss": 2.394784354699966e+25, - "step": 3420 - }, - { - "epoch": 33.3, - "learning_rate": 9.695963208993357e-06, - "loss": 2.2496457412629006e+25, - "step": 3430 + "learning_rate": 9.36532507739938e-06, + "loss": 5.944869295017184e+28, + "step": 1710 }, { "epoch": 33.4, - "learning_rate": 9.555442003065917e-06, - "loss": 1.9593688833236517e+25, - "step": 3440 - }, - { - "epoch": 33.5, - "learning_rate": 9.414920797138478e-06, - "loss": 2.0682225667002894e+25, - "step": 3450 + "learning_rate": 9.081527347781218e-06, + "loss": 6.539356300076766e+28, + "step": 1720 }, { "epoch": 33.59, - "learning_rate": 9.274399591211038e-06, - "loss": 2.2496457412629006e+25, - "step": 3460 - }, - { - "epoch": 33.69, - "learning_rate": 9.133878385283597e-06, - "loss": 2.0682225667002894e+25, - "step": 3470 + "learning_rate": 8.797729618163056e-06, + "loss": 6.093491990755376e+28, + "step": 1730 }, { "epoch": 33.79, - "learning_rate": 8.993357179356157e-06, - "loss": 2.322215232448874e+25, - "step": 3480 - }, - { - "epoch": 33.88, - "learning_rate": 8.852835973428717e-06, - "loss": 2.0682227511677303e+25, - "step": 3490 + "learning_rate": 8.513931888544893e-06, + "loss": 5.796247732646948e+28, + "step": 1740 }, { "epoch": 33.98, - "learning_rate": 8.712314767501278e-06, - "loss": 2.2859304868558873e+25, - "step": 3500 + "learning_rate": 8.230134158926728e-06, + "loss": 5.944870050595821e+28, + "step": 1750 }, { "epoch": 34.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7812, - "eval_samples_per_second": 58.886, - "eval_steps_per_second": 7.681, - "step": 3502 - }, - { - "epoch": 34.08, - "learning_rate": 8.571793561573839e-06, - "loss": 2.189171595698618e+25, - "step": 3510 + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8142, + "eval_samples_per_second": 56.495, + "eval_steps_per_second": 7.369, + "step": 1751 }, { "epoch": 34.17, - "learning_rate": 8.431272355646399e-06, - "loss": 2.285930117921006e+25, - "step": 3520 - }, - { - "epoch": 34.27, - "learning_rate": 8.290751149718958e-06, - "loss": 1.886799576605119e+25, - "step": 3530 + "learning_rate": 7.946336429308565e-06, + "loss": 5.449464338975942e+28, + "step": 1760 }, { "epoch": 34.37, - "learning_rate": 8.150229943791518e-06, - "loss": 2.213361180137355e+25, - "step": 3540 - }, - { - "epoch": 34.47, - "learning_rate": 8.009708737864077e-06, - "loss": 2.177076434544368e+25, - "step": 3550 + "learning_rate": 7.662538699690403e-06, + "loss": 6.985221742766112e+28, + "step": 1770 }, { "epoch": 34.56, - "learning_rate": 7.869187531936637e-06, - "loss": 2.431068915825512e+25, - "step": 3560 - }, - { - "epoch": 34.66, - "learning_rate": 7.728666326009198e-06, - "loss": 1.9956534444491974e+25, - "step": 3570 + "learning_rate": 7.3787409700722396e-06, + "loss": 4.904517225057574e+28, + "step": 1780 }, { "epoch": 34.76, - "learning_rate": 7.588145120081759e-06, - "loss": 2.213360995669914e+25, - "step": 3580 - }, - { - "epoch": 34.85, - "learning_rate": 7.447623914154318e-06, - "loss": 2.1045071278258356e+25, - "step": 3590 + "learning_rate": 7.094943240454077e-06, + "loss": 6.539356300076766e+28, + "step": 1790 }, { "epoch": 34.95, - "learning_rate": 7.307102708226878e-06, - "loss": 2.140792057886263e+25, - "step": 3600 - }, - { - "epoch": 35.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7977, - "eval_samples_per_second": 57.667, - "eval_steps_per_second": 7.522, - "step": 3605 + "learning_rate": 6.811145510835914e-06, + "loss": 6.3907347377065295e+28, + "step": 1800 }, { - "epoch": 35.05, - "learning_rate": 7.166581502299437e-06, - "loss": 2.298025279075256e+25, - "step": 3610 + "epoch": 34.99, + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7832, + "eval_samples_per_second": 58.735, + "eval_steps_per_second": 7.661, + "step": 1802 }, { "epoch": 35.15, - "learning_rate": 7.026060296371999e-06, - "loss": 2.213360995669914e+25, - "step": 3620 - }, - { - "epoch": 35.24, - "learning_rate": 6.885539090444559e-06, - "loss": 2.3584996091069793e+25, - "step": 3630 + "learning_rate": 6.527347781217751e-06, + "loss": 5.300842021027069e+28, + "step": 1810 }, { "epoch": 35.34, - "learning_rate": 6.7450178845171186e-06, - "loss": 2.104507496760717e+25, - "step": 3640 - }, - { - "epoch": 35.44, - "learning_rate": 6.604496678589678e-06, - "loss": 2.2496457412629006e+25, - "step": 3650 + "learning_rate": 6.2435500515995875e-06, + "loss": 5.499004607906475e+28, + "step": 1820 }, { "epoch": 35.53, - "learning_rate": 6.463975472662238e-06, - "loss": 1.9593688833236517e+25, - "step": 3660 - }, - { - "epoch": 35.63, - "learning_rate": 6.323454266734799e-06, - "loss": 1.995653259981757e+25, - "step": 3670 + "learning_rate": 5.959752321981424e-06, + "loss": 6.539357055655403e+28, + "step": 1830 }, { "epoch": 35.73, - "learning_rate": 6.1829330608073585e-06, - "loss": 2.3584996091069793e+25, - "step": 3680 - }, - { - "epoch": 35.83, - "learning_rate": 6.042411854879918e-06, - "loss": 1.8505148310121323e+25, - "step": 3690 + "learning_rate": 5.675954592363261e-06, + "loss": 5.94487042838514e+28, + "step": 1840 }, { "epoch": 35.92, - "learning_rate": 5.9018906489524785e-06, - "loss": 2.104507496760717e+25, - "step": 3700 + "learning_rate": 5.392156862745098e-06, + "loss": 7.579709503404331e+28, + "step": 1850 }, { "epoch": 36.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7792, - "eval_samples_per_second": 59.035, - "eval_steps_per_second": 7.7, - "step": 3708 - }, - { - "epoch": 36.02, - "learning_rate": 5.761369443025038e-06, - "loss": 2.3584994246395388e+25, - "step": 3710 + "eval_accuracy": 0.7391304347826086, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8372, + "eval_samples_per_second": 54.945, + "eval_steps_per_second": 7.167, + "step": 1854 }, { "epoch": 36.12, - "learning_rate": 5.6208482370975985e-06, - "loss": 2.24964555679546e+25, - "step": 3720 - }, - { - "epoch": 36.21, - "learning_rate": 5.480327031170159e-06, - "loss": 2.213361180137355e+25, - "step": 3730 + "learning_rate": 5.1083591331269355e-06, + "loss": 4.90451835842553e+28, + "step": 1860 }, { "epoch": 36.31, - "learning_rate": 5.3398058252427185e-06, - "loss": 2.3584994246395388e+25, - "step": 3740 - }, - { - "epoch": 36.41, - "learning_rate": 5.199284619315278e-06, - "loss": 1.9956536289166384e+25, - "step": 3750 + "learning_rate": 4.824561403508772e-06, + "loss": 5.053139920795767e+28, + "step": 1870 }, { "epoch": 36.5, - "learning_rate": 5.0587634133878385e-06, - "loss": 1.923084137730665e+25, - "step": 3760 - }, - { - "epoch": 36.6, - "learning_rate": 4.918242207460399e-06, - "loss": 2.285930117921006e+25, - "step": 3770 + "learning_rate": 4.540763673890609e-06, + "loss": 7.72833031019593e+28, + "step": 1880 }, { "epoch": 36.7, - "learning_rate": 4.7777210015329585e-06, - "loss": 2.3222148635139926e+25, - "step": 3780 - }, - { - "epoch": 36.8, - "learning_rate": 4.637199795605519e-06, - "loss": 2.177076619011809e+25, - "step": 3790 + "learning_rate": 4.256965944272447e-06, + "loss": 4.90451835842553e+28, + "step": 1890 }, { "epoch": 36.89, - "learning_rate": 4.4966785896780785e-06, - "loss": 2.104507496760717e+25, - "step": 3800 + "learning_rate": 3.973168214654283e-06, + "loss": 6.242113553125612e+28, + "step": 1900 }, { "epoch": 36.99, - "learning_rate": 4.356157383750639e-06, - "loss": 2.177076434544368e+25, - "step": 3810 - }, - { - "epoch": 37.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7992, - "eval_samples_per_second": 57.558, - "eval_steps_per_second": 7.508, - "step": 3811 + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8212, + "eval_samples_per_second": 56.017, + "eval_steps_per_second": 7.307, + "step": 1905 }, { "epoch": 37.09, - "learning_rate": 4.215636177823199e-06, - "loss": 1.8626098076989416e+25, - "step": 3820 - }, - { - "epoch": 37.18, - "learning_rate": 4.075114971895759e-06, - "loss": 1.923084137730665e+25, - "step": 3830 + "learning_rate": 3.6893704850361198e-06, + "loss": 7.232925354154688e+28, + "step": 1910 }, { "epoch": 37.28, - "learning_rate": 3.9345937659683185e-06, - "loss": 2.1045071278258356e+25, - "step": 3840 - }, - { - "epoch": 37.38, - "learning_rate": 3.7940725600408793e-06, - "loss": 2.1045071278258356e+25, - "step": 3850 + "learning_rate": 3.405572755417957e-06, + "loss": 7.282466378663859e+28, + "step": 1920 }, { "epoch": 37.48, - "learning_rate": 3.653551354113439e-06, - "loss": 2.2496457412629006e+25, - "step": 3860 - }, - { - "epoch": 37.57, - "learning_rate": 3.5130301481859993e-06, - "loss": 2.3584994246395388e+25, - "step": 3870 + "learning_rate": 3.1217750257997938e-06, + "loss": 5.944870050595821e+28, + "step": 1930 }, { "epoch": 37.67, - "learning_rate": 3.3725089422585593e-06, - "loss": 2.177076619011809e+25, - "step": 3880 - }, - { - "epoch": 37.77, - "learning_rate": 3.231987736331119e-06, - "loss": 2.3584994246395388e+25, - "step": 3890 + "learning_rate": 2.8379772961816305e-06, + "loss": 5.201761105376684e+28, + "step": 1940 }, { "epoch": 37.86, - "learning_rate": 3.0914665304036793e-06, - "loss": 2.1407918734188223e+25, - "step": 3900 - }, - { - "epoch": 37.96, - "learning_rate": 2.9509453244762393e-06, - "loss": 2.2496457412629006e+25, - "step": 3910 + "learning_rate": 2.5541795665634677e-06, + "loss": 5.350383045536239e+28, + "step": 1950 }, { "epoch": 38.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.8064, - "eval_samples_per_second": 57.045, - "eval_steps_per_second": 7.441, - "step": 3914 + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.8017, + "eval_samples_per_second": 57.379, + "eval_steps_per_second": 7.484, + "step": 1957 }, { "epoch": 38.06, - "learning_rate": 2.8104241185487993e-06, - "loss": 2.3947839857650846e+25, - "step": 3920 - }, - { - "epoch": 38.16, - "learning_rate": 2.6699029126213593e-06, - "loss": 2.1407918734188223e+25, - "step": 3930 + "learning_rate": 2.2703818369453045e-06, + "loss": 4.755896040476657e+28, + "step": 1960 }, { "epoch": 38.25, - "learning_rate": 2.5293817066939193e-06, - "loss": 1.9593686988562108e+25, - "step": 3940 - }, - { - "epoch": 38.35, - "learning_rate": 2.3888605007664792e-06, - "loss": 2.3222148635139926e+25, - "step": 3950 + "learning_rate": 1.9865841073271413e-06, + "loss": 6.985222498344749e+28, + "step": 1970 }, { "epoch": 38.45, - "learning_rate": 2.2483392948390392e-06, - "loss": 2.1407918734188223e+25, - "step": 3960 - }, - { - "epoch": 38.54, - "learning_rate": 2.1078180889115997e-06, - "loss": 2.213361180137355e+25, - "step": 3970 + "learning_rate": 1.7027863777089785e-06, + "loss": 5.499004607906475e+28, + "step": 1980 }, { "epoch": 38.64, - "learning_rate": 1.9672968829841592e-06, - "loss": 2.068222382232849e+25, - "step": 3980 - }, - { - "epoch": 38.74, - "learning_rate": 1.8267756770567194e-06, - "loss": 2.177076434544368e+25, - "step": 3990 + "learning_rate": 1.4189886480908153e-06, + "loss": 6.242113930914931e+28, + "step": 1990 }, { "epoch": 38.83, - "learning_rate": 1.6862544711292796e-06, - "loss": 2.24964555679546e+25, - "step": 4000 - }, - { - "epoch": 38.93, - "learning_rate": 1.5457332652018396e-06, - "loss": 2.1407918734188223e+25, - "step": 4010 + "learning_rate": 1.1351909184726523e-06, + "loss": 5.64762654806603e+28, + "step": 2000 }, { - "epoch": 39.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7932, - "eval_samples_per_second": 57.993, - "eval_steps_per_second": 7.564, - "step": 4017 + "epoch": 38.99, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7817, + "eval_samples_per_second": 58.848, + "eval_steps_per_second": 7.676, + "step": 2008 }, { "epoch": 39.03, - "learning_rate": 1.4052120592743996e-06, - "loss": 2.2254557878892825e+25, - "step": 4020 - }, - { - "epoch": 39.13, - "learning_rate": 1.2646908533469596e-06, - "loss": 2.1407918734188223e+25, - "step": 4030 + "learning_rate": 8.513931888544892e-07, + "loss": 5.895329403875969e+28, + "step": 2010 }, { "epoch": 39.22, - "learning_rate": 1.1241696474195196e-06, - "loss": 2.0319380055747437e+25, - "step": 4040 - }, - { - "epoch": 39.32, - "learning_rate": 9.836484414920796e-07, - "loss": 2.35849979357442e+25, - "step": 4050 + "learning_rate": 5.675954592363261e-07, + "loss": 6.687978618025639e+28, + "step": 2020 }, { "epoch": 39.42, - "learning_rate": 8.431272355646398e-07, - "loss": 1.9593688833236517e+25, - "step": 4060 - }, - { - "epoch": 39.51, - "learning_rate": 7.026060296371998e-07, - "loss": 2.2133613646047957e+25, - "step": 4070 + "learning_rate": 2.8379772961816306e-07, + "loss": 6.093491612966058e+28, + "step": 2030 }, { "epoch": 39.61, - "learning_rate": 5.620848237097598e-07, - "loss": 2.2859304868558873e+25, - "step": 4080 - }, - { - "epoch": 39.71, - "learning_rate": 4.215636177823199e-07, - "loss": 1.9593688833236517e+25, - "step": 4090 - }, - { - "epoch": 39.81, - "learning_rate": 2.810424118548799e-07, - "loss": 2.3222148635139926e+25, - "step": 4100 + "learning_rate": 0.0, + "loss": 5.3503826677469204e+28, + "step": 2040 }, { - "epoch": 39.9, - "learning_rate": 1.4052120592743995e-07, - "loss": 2.1407918734188223e+25, - "step": 4110 + "epoch": 39.61, + "eval_accuracy": 0.717391304347826, + "eval_loss": 2.584726139752747e+28, + "eval_runtime": 0.7909, + "eval_samples_per_second": 58.159, + "eval_steps_per_second": 7.586, + "step": 2040 }, { - "epoch": 40.0, - "learning_rate": 0.0, - "loss": 2.3222148635139926e+25, - "step": 4120 - }, - { - "epoch": 40.0, - "eval_accuracy": 0.10869565217391304, - "eval_loss": 2.5872499347325405e+25, - "eval_runtime": 0.7917, - "eval_samples_per_second": 58.104, - "eval_steps_per_second": 7.579, - "step": 4120 - }, - { - "epoch": 40.0, - "step": 4120, - "total_flos": 2.538683085785334e+18, - "train_loss": 2.1692970037867085e+25, - "train_runtime": 690.2509, - "train_samples_per_second": 47.461, - "train_steps_per_second": 5.969 + "epoch": 39.61, + "step": 2040, + "total_flos": 2.5142726714989363e+18, + "train_loss": 6.009952755433709e+28, + "train_runtime": 674.9098, + "train_samples_per_second": 48.54, + "train_steps_per_second": 3.023 } ], "logging_steps": 10, - "max_steps": 4120, + "max_steps": 2040, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, - "total_flos": 2.538683085785334e+18, + "total_flos": 2.5142726714989363e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null