diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25081 @@ +{ + "best_metric": 1.599829912185669, + "best_model_checkpoint": "runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_20240420-031057/checkpoint-10000_20240420-061900/checkpoint-10000_20240420-141714/checkpoint-30000_20240421-001954/checkpoint-20000_20240421-063809/checkpoint-27000", + "epoch": 2.9166666666666665, + "eval_steps": 500, + "global_step": 35000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.2827765941619873, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2577, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.5021369457244873, + "learning_rate": 8.000000000000001e-07, + "loss": 2.392, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.0452425479888916, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.3256, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.0641889572143555, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.3476, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 4.357707977294922, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.2706, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 5.017151355743408, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.125, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.5771751403808594, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.2445, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.8890554904937744, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.2334, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 3.2498245239257812, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.2226, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 4.084829807281494, + "learning_rate": 4.000000000000001e-06, + "loss": 2.0497, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 3.884516716003418, + "learning_rate": 4.4e-06, + "loss": 2.0363, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.7535431385040283, + "learning_rate": 4.800000000000001e-06, + "loss": 2.1011, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 3.637084722518921, + "learning_rate": 5.2e-06, + "loss": 2.0225, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.6097259521484375, + "learning_rate": 5.600000000000001e-06, + "loss": 1.9933, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 3.4520087242126465, + "learning_rate": 6e-06, + "loss": 2.0717, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 2.878129005432129, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.078, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 2.6473491191864014, + "learning_rate": 6.800000000000001e-06, + "loss": 1.8528, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 2.8662400245666504, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.9476, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 2.6106956005096436, + "learning_rate": 7.600000000000001e-06, + "loss": 1.9378, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 3.1151645183563232, + "learning_rate": 8.000000000000001e-06, + "loss": 1.8459, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 2.933845281600952, + "learning_rate": 8.400000000000001e-06, + "loss": 1.9992, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 2.989990472793579, + "learning_rate": 8.8e-06, + "loss": 1.9832, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 5.849925994873047, + "learning_rate": 9.200000000000002e-06, + "loss": 1.9316, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 10.09212875366211, + "learning_rate": 9.600000000000001e-06, + "loss": 1.9109, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 3.965531587600708, + "learning_rate": 1e-05, + "loss": 1.9034, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 3.259747266769409, + "learning_rate": 1.04e-05, + "loss": 1.9923, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 4.087090015411377, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.9494, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 2.418077230453491, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.8784, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 3.9205658435821533, + "learning_rate": 1.16e-05, + "loss": 1.7895, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 7.845510959625244, + "learning_rate": 1.2e-05, + "loss": 1.8195, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 6.274329662322998, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.9255, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 3.376899242401123, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.9822, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 4.367288589477539, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.8333, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 2.786440849304199, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.8435, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 2.883002758026123, + "learning_rate": 1.4e-05, + "loss": 1.8751, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 2.210261583328247, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.8073, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 2.7850048542022705, + "learning_rate": 1.48e-05, + "loss": 1.8612, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 2.726701259613037, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.9211, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 3.326230764389038, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.9087, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 1.577868938446045, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0082, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 2.0567996501922607, + "learning_rate": 1.64e-05, + "loss": 1.9232, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 6.262115478515625, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.9156, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 1.7286055088043213, + "learning_rate": 1.72e-05, + "loss": 1.7268, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 2.0500264167785645, + "learning_rate": 1.76e-05, + "loss": 1.9313, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 1.8140387535095215, + "learning_rate": 1.8e-05, + "loss": 1.8847, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 2.911093235015869, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.6759, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 4.010791778564453, + "learning_rate": 1.88e-05, + "loss": 1.9112, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 2.5784366130828857, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.9196, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 1.904146671295166, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.9137, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 2.3381693363189697, + "learning_rate": 2e-05, + "loss": 1.8686, + "step": 500 + }, + { + "epoch": 0.04, + "eval_loss": 1.8578028678894043, + "eval_runtime": 107.4833, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.6459054946899414, + "learning_rate": 1.9994202898550726e-05, + "loss": 1.8223, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 5.068203926086426, + "learning_rate": 1.998840579710145e-05, + "loss": 1.7466, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 2.4843411445617676, + "learning_rate": 1.9982608695652174e-05, + "loss": 1.9179, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 1.6055413484573364, + "learning_rate": 1.99768115942029e-05, + "loss": 1.9317, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 3.9912831783294678, + "learning_rate": 1.9971014492753625e-05, + "loss": 1.9632, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 1.7726389169692993, + "learning_rate": 1.996521739130435e-05, + "loss": 1.8542, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 3.2357091903686523, + "learning_rate": 1.9959420289855073e-05, + "loss": 1.8574, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 1.8849786520004272, + "learning_rate": 1.99536231884058e-05, + "loss": 1.8169, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 1.756426215171814, + "learning_rate": 1.9947826086956524e-05, + "loss": 1.8116, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 1.3438060283660889, + "learning_rate": 1.994202898550725e-05, + "loss": 1.9238, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.7150946855545044, + "learning_rate": 1.9936231884057972e-05, + "loss": 1.9171, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 2.1640572547912598, + "learning_rate": 1.9930434782608696e-05, + "loss": 1.8279, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 2.2766189575195312, + "learning_rate": 1.9924637681159424e-05, + "loss": 1.8898, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 5.365070819854736, + "learning_rate": 1.9918840579710144e-05, + "loss": 1.862, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 2.1916489601135254, + "learning_rate": 1.9913043478260872e-05, + "loss": 1.8582, + "step": 650 + }, + { + "epoch": 0.06, + "grad_norm": 2.0066256523132324, + "learning_rate": 1.9907246376811596e-05, + "loss": 1.7583, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 2.382798671722412, + "learning_rate": 1.990144927536232e-05, + "loss": 1.8526, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 2.918565273284912, + "learning_rate": 1.9895652173913044e-05, + "loss": 1.8112, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 1.8301721811294556, + "learning_rate": 1.988985507246377e-05, + "loss": 1.8805, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 2.482556104660034, + "learning_rate": 1.9884057971014495e-05, + "loss": 1.778, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 2.739922046661377, + "learning_rate": 1.987826086956522e-05, + "loss": 1.8817, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 1.8400835990905762, + "learning_rate": 1.9872463768115943e-05, + "loss": 1.9506, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 1.7623933553695679, + "learning_rate": 1.9866666666666667e-05, + "loss": 1.8342, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 4.191768169403076, + "learning_rate": 1.9860869565217395e-05, + "loss": 1.8549, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 3.0716779232025146, + "learning_rate": 1.9855072463768115e-05, + "loss": 1.7888, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 2.673297643661499, + "learning_rate": 1.9849275362318843e-05, + "loss": 1.8287, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 3.45849609375, + "learning_rate": 1.9843478260869567e-05, + "loss": 1.8583, + "step": 770 + }, + { + "epoch": 0.07, + "grad_norm": 1.941355586051941, + "learning_rate": 1.983768115942029e-05, + "loss": 1.7381, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 2.047844886779785, + "learning_rate": 1.9831884057971015e-05, + "loss": 1.8842, + "step": 790 + }, + { + "epoch": 0.07, + "grad_norm": 1.3743892908096313, + "learning_rate": 1.9826086956521742e-05, + "loss": 1.9104, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 1.7510887384414673, + "learning_rate": 1.9820289855072466e-05, + "loss": 1.8223, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 1.888203740119934, + "learning_rate": 1.981449275362319e-05, + "loss": 1.7711, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 3.0050196647644043, + "learning_rate": 1.9808695652173914e-05, + "loss": 1.756, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 1.3720544576644897, + "learning_rate": 1.9802898550724638e-05, + "loss": 1.8173, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 2.654989242553711, + "learning_rate": 1.9797101449275366e-05, + "loss": 1.7568, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 3.6716177463531494, + "learning_rate": 1.979130434782609e-05, + "loss": 1.9341, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 2.005023241043091, + "learning_rate": 1.9785507246376814e-05, + "loss": 1.9117, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 1.9368737936019897, + "learning_rate": 1.9779710144927538e-05, + "loss": 1.6635, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 2.550541400909424, + "learning_rate": 1.9773913043478265e-05, + "loss": 1.8452, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 1.289937138557434, + "learning_rate": 1.9768115942028986e-05, + "loss": 1.8579, + "step": 900 + }, + { + "epoch": 0.08, + "grad_norm": 2.2402184009552, + "learning_rate": 1.9762318840579713e-05, + "loss": 1.8988, + "step": 910 + }, + { + "epoch": 0.08, + "grad_norm": 1.5080454349517822, + "learning_rate": 1.9756521739130437e-05, + "loss": 1.9026, + "step": 920 + }, + { + "epoch": 0.08, + "grad_norm": 1.0446062088012695, + "learning_rate": 1.975072463768116e-05, + "loss": 1.8181, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 2.787261724472046, + "learning_rate": 1.9744927536231885e-05, + "loss": 1.8197, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 2.6586248874664307, + "learning_rate": 1.973913043478261e-05, + "loss": 1.8774, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 2.1193392276763916, + "learning_rate": 1.9733333333333336e-05, + "loss": 1.9113, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 3.299938917160034, + "learning_rate": 1.972753623188406e-05, + "loss": 1.8613, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 2.443349838256836, + "learning_rate": 1.9721739130434784e-05, + "loss": 1.8193, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.3777451515197754, + "learning_rate": 1.971594202898551e-05, + "loss": 1.7872, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 2.1168971061706543, + "learning_rate": 1.9710144927536236e-05, + "loss": 1.7879, + "step": 1000 + }, + { + "epoch": 0.08, + "eval_loss": 1.8089497089385986, + "eval_runtime": 107.4884, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 1.817137360572815, + "learning_rate": 1.9704347826086956e-05, + "loss": 1.9, + "step": 1010 + }, + { + "epoch": 0.09, + "grad_norm": 2.249089479446411, + "learning_rate": 1.969855072463768e-05, + "loss": 1.9053, + "step": 1020 + }, + { + "epoch": 0.09, + "grad_norm": 2.998011589050293, + "learning_rate": 1.9692753623188408e-05, + "loss": 1.8222, + "step": 1030 + }, + { + "epoch": 0.09, + "grad_norm": 4.248562335968018, + "learning_rate": 1.9686956521739132e-05, + "loss": 1.7655, + "step": 1040 + }, + { + "epoch": 0.09, + "grad_norm": 4.401443004608154, + "learning_rate": 1.9681159420289856e-05, + "loss": 1.9682, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 3.3086190223693848, + "learning_rate": 1.967536231884058e-05, + "loss": 1.7689, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 3.4858791828155518, + "learning_rate": 1.9669565217391307e-05, + "loss": 1.6351, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 0.9979552030563354, + "learning_rate": 1.966376811594203e-05, + "loss": 1.8494, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 1.989320158958435, + "learning_rate": 1.9657971014492755e-05, + "loss": 1.7798, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 4.888331890106201, + "learning_rate": 1.965217391304348e-05, + "loss": 1.8133, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.666988730430603, + "learning_rate": 1.9646376811594207e-05, + "loss": 1.8949, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 1.6761118173599243, + "learning_rate": 1.964057971014493e-05, + "loss": 1.9065, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.6665714979171753, + "learning_rate": 1.963478260869565e-05, + "loss": 1.8475, + "step": 1130 + }, + { + "epoch": 0.1, + "grad_norm": 3.513988971710205, + "learning_rate": 1.962898550724638e-05, + "loss": 1.6495, + "step": 1140 + }, + { + "epoch": 0.1, + "grad_norm": 1.7759839296340942, + "learning_rate": 1.9623188405797103e-05, + "loss": 1.7775, + "step": 1150 + }, + { + "epoch": 0.1, + "grad_norm": 1.4246402978897095, + "learning_rate": 1.9617391304347827e-05, + "loss": 1.765, + "step": 1160 + }, + { + "epoch": 0.1, + "grad_norm": 1.7224540710449219, + "learning_rate": 1.961159420289855e-05, + "loss": 1.7599, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 2.1602578163146973, + "learning_rate": 1.9605797101449278e-05, + "loss": 1.63, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 1.389148473739624, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.771, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 1.9033640623092651, + "learning_rate": 1.9594202898550726e-05, + "loss": 1.8217, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 1.8465853929519653, + "learning_rate": 1.958840579710145e-05, + "loss": 1.8974, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 1.0622667074203491, + "learning_rate": 1.9582608695652177e-05, + "loss": 1.7574, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 2.802722692489624, + "learning_rate": 1.95768115942029e-05, + "loss": 1.7193, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 1.6129810810089111, + "learning_rate": 1.9571014492753625e-05, + "loss": 1.8253, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 2.9575114250183105, + "learning_rate": 1.956521739130435e-05, + "loss": 1.8882, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 2.4238998889923096, + "learning_rate": 1.9559420289855074e-05, + "loss": 1.7379, + "step": 1260 + }, + { + "epoch": 0.11, + "grad_norm": 3.9338934421539307, + "learning_rate": 1.9553623188405798e-05, + "loss": 1.7347, + "step": 1270 + }, + { + "epoch": 0.11, + "grad_norm": 1.6793437004089355, + "learning_rate": 1.954782608695652e-05, + "loss": 1.9141, + "step": 1280 + }, + { + "epoch": 0.11, + "grad_norm": 4.266021728515625, + "learning_rate": 1.954202898550725e-05, + "loss": 1.8423, + "step": 1290 + }, + { + "epoch": 0.11, + "grad_norm": 1.8466399908065796, + "learning_rate": 1.9536231884057973e-05, + "loss": 1.8266, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 1.8755276203155518, + "learning_rate": 1.9530434782608697e-05, + "loss": 1.9117, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 1.3602133989334106, + "learning_rate": 1.952463768115942e-05, + "loss": 1.7265, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 1.353096842765808, + "learning_rate": 1.9518840579710145e-05, + "loss": 1.8355, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 1.312878966331482, + "learning_rate": 1.9513043478260872e-05, + "loss": 1.7331, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 2.431149959564209, + "learning_rate": 1.9507246376811596e-05, + "loss": 1.7316, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 1.3108808994293213, + "learning_rate": 1.950144927536232e-05, + "loss": 1.8539, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 3.86793851852417, + "learning_rate": 1.9495652173913044e-05, + "loss": 1.8277, + "step": 1370 + }, + { + "epoch": 0.12, + "grad_norm": 1.5965479612350464, + "learning_rate": 1.9489855072463772e-05, + "loss": 1.7548, + "step": 1380 + }, + { + "epoch": 0.12, + "grad_norm": 3.187406063079834, + "learning_rate": 1.9484057971014492e-05, + "loss": 1.8396, + "step": 1390 + }, + { + "epoch": 0.12, + "grad_norm": 1.5422654151916504, + "learning_rate": 1.947826086956522e-05, + "loss": 1.7876, + "step": 1400 + }, + { + "epoch": 0.12, + "grad_norm": 2.088440418243408, + "learning_rate": 1.9472463768115944e-05, + "loss": 1.8558, + "step": 1410 + }, + { + "epoch": 0.12, + "grad_norm": 1.8927093744277954, + "learning_rate": 1.9466666666666668e-05, + "loss": 1.8849, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 3.945380687713623, + "learning_rate": 1.9460869565217392e-05, + "loss": 1.7351, + "step": 1430 + }, + { + "epoch": 0.12, + "grad_norm": 4.608578681945801, + "learning_rate": 1.9455072463768116e-05, + "loss": 1.7438, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 3.4089462757110596, + "learning_rate": 1.9449275362318843e-05, + "loss": 1.8227, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 4.93408203125, + "learning_rate": 1.9443478260869567e-05, + "loss": 1.8358, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 2.577270269393921, + "learning_rate": 1.943768115942029e-05, + "loss": 1.8707, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 2.3741328716278076, + "learning_rate": 1.9431884057971015e-05, + "loss": 1.764, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 2.4061601161956787, + "learning_rate": 1.9426086956521743e-05, + "loss": 1.8593, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 3.7175543308258057, + "learning_rate": 1.9420289855072467e-05, + "loss": 1.9067, + "step": 1500 + }, + { + "epoch": 0.12, + "eval_loss": 1.7743674516677856, + "eval_runtime": 107.4624, + "eval_samples_per_second": 9.306, + "eval_steps_per_second": 2.326, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 1.384735107421875, + "learning_rate": 1.941449275362319e-05, + "loss": 1.6986, + "step": 1510 + }, + { + "epoch": 0.13, + "grad_norm": 2.8683741092681885, + "learning_rate": 1.9408695652173915e-05, + "loss": 1.8999, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 2.08429217338562, + "learning_rate": 1.9402898550724642e-05, + "loss": 1.8664, + "step": 1530 + }, + { + "epoch": 0.13, + "grad_norm": 1.030617117881775, + "learning_rate": 1.9397101449275363e-05, + "loss": 1.5804, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 2.734713315963745, + "learning_rate": 1.9391304347826087e-05, + "loss": 1.8053, + "step": 1550 + }, + { + "epoch": 0.13, + "grad_norm": 1.2831270694732666, + "learning_rate": 1.9385507246376814e-05, + "loss": 1.7894, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 4.505608558654785, + "learning_rate": 1.9379710144927538e-05, + "loss": 1.6539, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 1.0595582723617554, + "learning_rate": 1.9373913043478262e-05, + "loss": 1.8655, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 2.5555408000946045, + "learning_rate": 1.9368115942028986e-05, + "loss": 1.763, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 3.7931909561157227, + "learning_rate": 1.9362318840579713e-05, + "loss": 1.8121, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 4.358292579650879, + "learning_rate": 1.9356521739130437e-05, + "loss": 1.8876, + "step": 1610 + }, + { + "epoch": 0.14, + "grad_norm": 4.060112953186035, + "learning_rate": 1.935072463768116e-05, + "loss": 1.7779, + "step": 1620 + }, + { + "epoch": 0.14, + "grad_norm": 1.94615638256073, + "learning_rate": 1.9344927536231885e-05, + "loss": 1.7276, + "step": 1630 + }, + { + "epoch": 0.14, + "grad_norm": 2.230520009994507, + "learning_rate": 1.933913043478261e-05, + "loss": 1.7617, + "step": 1640 + }, + { + "epoch": 0.14, + "grad_norm": 2.058115243911743, + "learning_rate": 1.9333333333333333e-05, + "loss": 1.8117, + "step": 1650 + }, + { + "epoch": 0.14, + "grad_norm": 1.2918510437011719, + "learning_rate": 1.9327536231884057e-05, + "loss": 1.6933, + "step": 1660 + }, + { + "epoch": 0.14, + "grad_norm": 1.8761500120162964, + "learning_rate": 1.9321739130434785e-05, + "loss": 1.8075, + "step": 1670 + }, + { + "epoch": 0.14, + "grad_norm": 3.5313031673431396, + "learning_rate": 1.931594202898551e-05, + "loss": 1.6154, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 1.8059382438659668, + "learning_rate": 1.9310144927536233e-05, + "loss": 1.7508, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 3.9243950843811035, + "learning_rate": 1.9304347826086957e-05, + "loss": 1.832, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 5.17168664932251, + "learning_rate": 1.9298550724637684e-05, + "loss": 1.8438, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 2.7666871547698975, + "learning_rate": 1.9292753623188408e-05, + "loss": 1.8314, + "step": 1720 + }, + { + "epoch": 0.14, + "grad_norm": 4.743995666503906, + "learning_rate": 1.9286956521739132e-05, + "loss": 1.7588, + "step": 1730 + }, + { + "epoch": 0.14, + "grad_norm": 2.5909578800201416, + "learning_rate": 1.9281159420289856e-05, + "loss": 1.839, + "step": 1740 + }, + { + "epoch": 0.15, + "grad_norm": 1.395579218864441, + "learning_rate": 1.927536231884058e-05, + "loss": 1.6756, + "step": 1750 + }, + { + "epoch": 0.15, + "grad_norm": 2.7446272373199463, + "learning_rate": 1.9269565217391308e-05, + "loss": 1.8533, + "step": 1760 + }, + { + "epoch": 0.15, + "grad_norm": 2.249009132385254, + "learning_rate": 1.9263768115942028e-05, + "loss": 1.8051, + "step": 1770 + }, + { + "epoch": 0.15, + "grad_norm": 0.8617609143257141, + "learning_rate": 1.9257971014492756e-05, + "loss": 1.7813, + "step": 1780 + }, + { + "epoch": 0.15, + "grad_norm": 1.3332220315933228, + "learning_rate": 1.925217391304348e-05, + "loss": 1.7937, + "step": 1790 + }, + { + "epoch": 0.15, + "grad_norm": 3.5308120250701904, + "learning_rate": 1.9246376811594204e-05, + "loss": 1.7395, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 3.629775047302246, + "learning_rate": 1.9240579710144928e-05, + "loss": 1.7794, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 3.5039477348327637, + "learning_rate": 1.9234782608695655e-05, + "loss": 1.7868, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 2.4312188625335693, + "learning_rate": 1.922898550724638e-05, + "loss": 1.7787, + "step": 1830 + }, + { + "epoch": 0.15, + "grad_norm": 2.299351930618286, + "learning_rate": 1.9223188405797103e-05, + "loss": 1.5264, + "step": 1840 + }, + { + "epoch": 0.15, + "grad_norm": 2.888294219970703, + "learning_rate": 1.9217391304347827e-05, + "loss": 1.7851, + "step": 1850 + }, + { + "epoch": 0.15, + "grad_norm": 2.4942209720611572, + "learning_rate": 1.921159420289855e-05, + "loss": 1.8019, + "step": 1860 + }, + { + "epoch": 0.16, + "grad_norm": 1.9073644876480103, + "learning_rate": 1.920579710144928e-05, + "loss": 1.8111, + "step": 1870 + }, + { + "epoch": 0.16, + "grad_norm": 1.672957420349121, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.8331, + "step": 1880 + }, + { + "epoch": 0.16, + "grad_norm": 1.514133095741272, + "learning_rate": 1.9194202898550727e-05, + "loss": 1.7383, + "step": 1890 + }, + { + "epoch": 0.16, + "grad_norm": 3.5273349285125732, + "learning_rate": 1.918840579710145e-05, + "loss": 1.7974, + "step": 1900 + }, + { + "epoch": 0.16, + "grad_norm": 3.798083782196045, + "learning_rate": 1.9182608695652175e-05, + "loss": 1.7457, + "step": 1910 + }, + { + "epoch": 0.16, + "grad_norm": 5.434926509857178, + "learning_rate": 1.91768115942029e-05, + "loss": 1.7104, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 1.5063635110855103, + "learning_rate": 1.9171014492753626e-05, + "loss": 1.7753, + "step": 1930 + }, + { + "epoch": 0.16, + "grad_norm": 3.0606720447540283, + "learning_rate": 1.916521739130435e-05, + "loss": 1.7012, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 0.9884568452835083, + "learning_rate": 1.9159420289855074e-05, + "loss": 1.7646, + "step": 1950 + }, + { + "epoch": 0.16, + "grad_norm": 1.3289304971694946, + "learning_rate": 1.9153623188405798e-05, + "loss": 1.8317, + "step": 1960 + }, + { + "epoch": 0.16, + "grad_norm": 2.8612945079803467, + "learning_rate": 1.9147826086956522e-05, + "loss": 1.8826, + "step": 1970 + }, + { + "epoch": 0.17, + "grad_norm": 2.970140218734741, + "learning_rate": 1.914202898550725e-05, + "loss": 1.9036, + "step": 1980 + }, + { + "epoch": 0.17, + "grad_norm": 2.007293939590454, + "learning_rate": 1.9136231884057973e-05, + "loss": 1.7557, + "step": 1990 + }, + { + "epoch": 0.17, + "grad_norm": 1.1491663455963135, + "learning_rate": 1.9130434782608697e-05, + "loss": 1.7295, + "step": 2000 + }, + { + "epoch": 0.17, + "eval_loss": 1.7665390968322754, + "eval_runtime": 107.4574, + "eval_samples_per_second": 9.306, + "eval_steps_per_second": 2.327, + "step": 2000 + }, + { + "epoch": 0.17, + "grad_norm": 3.889491319656372, + "learning_rate": 1.912463768115942e-05, + "loss": 1.7628, + "step": 2010 + }, + { + "epoch": 0.17, + "grad_norm": 1.507429838180542, + "learning_rate": 1.911884057971015e-05, + "loss": 1.7585, + "step": 2020 + }, + { + "epoch": 0.17, + "grad_norm": 2.2127914428710938, + "learning_rate": 1.911304347826087e-05, + "loss": 1.8121, + "step": 2030 + }, + { + "epoch": 0.17, + "grad_norm": 2.9668962955474854, + "learning_rate": 1.9107246376811597e-05, + "loss": 1.8563, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 0.9941543936729431, + "learning_rate": 1.910144927536232e-05, + "loss": 1.8258, + "step": 2050 + }, + { + "epoch": 0.17, + "grad_norm": 3.4788460731506348, + "learning_rate": 1.9095652173913045e-05, + "loss": 1.7617, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 2.523179769515991, + "learning_rate": 1.908985507246377e-05, + "loss": 1.6908, + "step": 2070 + }, + { + "epoch": 0.17, + "grad_norm": 2.2159104347229004, + "learning_rate": 1.9084057971014493e-05, + "loss": 1.6652, + "step": 2080 + }, + { + "epoch": 0.17, + "grad_norm": 2.2359981536865234, + "learning_rate": 1.907826086956522e-05, + "loss": 1.8826, + "step": 2090 + }, + { + "epoch": 0.17, + "grad_norm": 3.3493943214416504, + "learning_rate": 1.9072463768115944e-05, + "loss": 1.8261, + "step": 2100 + }, + { + "epoch": 0.18, + "grad_norm": 1.5862370729446411, + "learning_rate": 1.9066666666666668e-05, + "loss": 1.6941, + "step": 2110 + }, + { + "epoch": 0.18, + "grad_norm": 1.1425403356552124, + "learning_rate": 1.9060869565217392e-05, + "loss": 1.8313, + "step": 2120 + }, + { + "epoch": 0.18, + "grad_norm": 4.16150426864624, + "learning_rate": 1.905507246376812e-05, + "loss": 1.7649, + "step": 2130 + }, + { + "epoch": 0.18, + "grad_norm": 2.25124192237854, + "learning_rate": 1.9049275362318844e-05, + "loss": 1.8382, + "step": 2140 + }, + { + "epoch": 0.18, + "grad_norm": 3.4652185440063477, + "learning_rate": 1.9043478260869568e-05, + "loss": 1.8486, + "step": 2150 + }, + { + "epoch": 0.18, + "grad_norm": 3.7186965942382812, + "learning_rate": 1.903768115942029e-05, + "loss": 1.8783, + "step": 2160 + }, + { + "epoch": 0.18, + "grad_norm": 2.2005832195281982, + "learning_rate": 1.9031884057971016e-05, + "loss": 1.8612, + "step": 2170 + }, + { + "epoch": 0.18, + "grad_norm": 2.344748020172119, + "learning_rate": 1.902608695652174e-05, + "loss": 1.6858, + "step": 2180 + }, + { + "epoch": 0.18, + "grad_norm": 1.9262315034866333, + "learning_rate": 1.9020289855072464e-05, + "loss": 1.7719, + "step": 2190 + }, + { + "epoch": 0.18, + "grad_norm": 2.292480945587158, + "learning_rate": 1.901449275362319e-05, + "loss": 1.883, + "step": 2200 + }, + { + "epoch": 0.18, + "grad_norm": 3.0080437660217285, + "learning_rate": 1.9008695652173915e-05, + "loss": 1.7567, + "step": 2210 + }, + { + "epoch": 0.18, + "grad_norm": 1.655610203742981, + "learning_rate": 1.900289855072464e-05, + "loss": 1.7931, + "step": 2220 + }, + { + "epoch": 0.19, + "grad_norm": 1.8952635526657104, + "learning_rate": 1.8997101449275363e-05, + "loss": 1.7877, + "step": 2230 + }, + { + "epoch": 0.19, + "grad_norm": 3.0049967765808105, + "learning_rate": 1.899130434782609e-05, + "loss": 1.8179, + "step": 2240 + }, + { + "epoch": 0.19, + "grad_norm": 1.809584379196167, + "learning_rate": 1.8985507246376814e-05, + "loss": 1.7893, + "step": 2250 + }, + { + "epoch": 0.19, + "grad_norm": 3.3408210277557373, + "learning_rate": 1.8979710144927535e-05, + "loss": 1.76, + "step": 2260 + }, + { + "epoch": 0.19, + "grad_norm": 2.576713800430298, + "learning_rate": 1.8973913043478262e-05, + "loss": 1.6123, + "step": 2270 + }, + { + "epoch": 0.19, + "grad_norm": 2.999994993209839, + "learning_rate": 1.8968115942028986e-05, + "loss": 1.748, + "step": 2280 + }, + { + "epoch": 0.19, + "grad_norm": 1.8222105503082275, + "learning_rate": 1.896231884057971e-05, + "loss": 1.6274, + "step": 2290 + }, + { + "epoch": 0.19, + "grad_norm": 1.565905213356018, + "learning_rate": 1.8956521739130434e-05, + "loss": 1.7498, + "step": 2300 + }, + { + "epoch": 0.19, + "grad_norm": 1.2533594369888306, + "learning_rate": 1.8950724637681162e-05, + "loss": 1.8822, + "step": 2310 + }, + { + "epoch": 0.19, + "grad_norm": 2.8874733448028564, + "learning_rate": 1.8944927536231886e-05, + "loss": 1.7679, + "step": 2320 + }, + { + "epoch": 0.19, + "grad_norm": 1.1580071449279785, + "learning_rate": 1.893913043478261e-05, + "loss": 1.829, + "step": 2330 + }, + { + "epoch": 0.2, + "grad_norm": 1.6889914274215698, + "learning_rate": 1.8933333333333334e-05, + "loss": 1.7765, + "step": 2340 + }, + { + "epoch": 0.2, + "grad_norm": 1.708866000175476, + "learning_rate": 1.892753623188406e-05, + "loss": 1.7612, + "step": 2350 + }, + { + "epoch": 0.2, + "grad_norm": 2.0311877727508545, + "learning_rate": 1.8921739130434785e-05, + "loss": 1.8271, + "step": 2360 + }, + { + "epoch": 0.2, + "grad_norm": 1.4897469282150269, + "learning_rate": 1.891594202898551e-05, + "loss": 1.67, + "step": 2370 + }, + { + "epoch": 0.2, + "grad_norm": 1.0771639347076416, + "learning_rate": 1.8910144927536233e-05, + "loss": 1.8175, + "step": 2380 + }, + { + "epoch": 0.2, + "grad_norm": 3.115084171295166, + "learning_rate": 1.8904347826086957e-05, + "loss": 1.8247, + "step": 2390 + }, + { + "epoch": 0.2, + "grad_norm": 2.106081008911133, + "learning_rate": 1.8898550724637685e-05, + "loss": 1.833, + "step": 2400 + }, + { + "epoch": 0.2, + "grad_norm": 1.4720683097839355, + "learning_rate": 1.8892753623188405e-05, + "loss": 1.8494, + "step": 2410 + }, + { + "epoch": 0.2, + "grad_norm": 1.4541406631469727, + "learning_rate": 1.8886956521739133e-05, + "loss": 1.8464, + "step": 2420 + }, + { + "epoch": 0.2, + "grad_norm": 1.226954698562622, + "learning_rate": 1.8881159420289857e-05, + "loss": 1.8729, + "step": 2430 + }, + { + "epoch": 0.2, + "grad_norm": 1.2664247751235962, + "learning_rate": 1.887536231884058e-05, + "loss": 1.8722, + "step": 2440 + }, + { + "epoch": 0.2, + "grad_norm": 2.0264010429382324, + "learning_rate": 1.8869565217391305e-05, + "loss": 1.7936, + "step": 2450 + }, + { + "epoch": 0.2, + "grad_norm": 1.336003303527832, + "learning_rate": 1.8863768115942032e-05, + "loss": 1.9073, + "step": 2460 + }, + { + "epoch": 0.21, + "grad_norm": 2.730409622192383, + "learning_rate": 1.8857971014492756e-05, + "loss": 1.9041, + "step": 2470 + }, + { + "epoch": 0.21, + "grad_norm": 2.9845330715179443, + "learning_rate": 1.885217391304348e-05, + "loss": 1.7753, + "step": 2480 + }, + { + "epoch": 0.21, + "grad_norm": 1.5443974733352661, + "learning_rate": 1.8846376811594204e-05, + "loss": 1.7274, + "step": 2490 + }, + { + "epoch": 0.21, + "grad_norm": 1.1884684562683105, + "learning_rate": 1.8840579710144928e-05, + "loss": 1.8284, + "step": 2500 + }, + { + "epoch": 0.21, + "eval_loss": 1.7748754024505615, + "eval_runtime": 107.4612, + "eval_samples_per_second": 9.306, + "eval_steps_per_second": 2.326, + "step": 2500 + }, + { + "epoch": 0.21, + "grad_norm": 2.1604175567626953, + "learning_rate": 1.8834782608695656e-05, + "loss": 1.7344, + "step": 2510 + }, + { + "epoch": 0.21, + "grad_norm": 1.7605400085449219, + "learning_rate": 1.882898550724638e-05, + "loss": 1.7241, + "step": 2520 + }, + { + "epoch": 0.21, + "grad_norm": 3.8347537517547607, + "learning_rate": 1.8823188405797104e-05, + "loss": 1.7196, + "step": 2530 + }, + { + "epoch": 0.21, + "grad_norm": 1.828438639640808, + "learning_rate": 1.8817391304347828e-05, + "loss": 1.8529, + "step": 2540 + }, + { + "epoch": 0.21, + "grad_norm": 1.69232976436615, + "learning_rate": 1.881159420289855e-05, + "loss": 1.7768, + "step": 2550 + }, + { + "epoch": 0.21, + "grad_norm": 3.592120885848999, + "learning_rate": 1.8805797101449276e-05, + "loss": 1.7074, + "step": 2560 + }, + { + "epoch": 0.21, + "grad_norm": 1.56288743019104, + "learning_rate": 1.88e-05, + "loss": 1.8631, + "step": 2570 + }, + { + "epoch": 0.21, + "grad_norm": 0.6752883791923523, + "learning_rate": 1.8794202898550727e-05, + "loss": 1.8049, + "step": 2580 + }, + { + "epoch": 0.22, + "grad_norm": 2.010446071624756, + "learning_rate": 1.878840579710145e-05, + "loss": 1.7486, + "step": 2590 + }, + { + "epoch": 0.22, + "grad_norm": 1.9133752584457397, + "learning_rate": 1.8782608695652175e-05, + "loss": 1.7831, + "step": 2600 + }, + { + "epoch": 0.22, + "grad_norm": 1.1954097747802734, + "learning_rate": 1.87768115942029e-05, + "loss": 1.7329, + "step": 2610 + }, + { + "epoch": 0.22, + "grad_norm": 2.0870425701141357, + "learning_rate": 1.8771014492753626e-05, + "loss": 1.7143, + "step": 2620 + }, + { + "epoch": 0.22, + "grad_norm": 2.162560224533081, + "learning_rate": 1.876521739130435e-05, + "loss": 1.8418, + "step": 2630 + }, + { + "epoch": 0.22, + "grad_norm": 1.2718247175216675, + "learning_rate": 1.8759420289855074e-05, + "loss": 1.7022, + "step": 2640 + }, + { + "epoch": 0.22, + "grad_norm": 2.4909746646881104, + "learning_rate": 1.87536231884058e-05, + "loss": 1.8087, + "step": 2650 + }, + { + "epoch": 0.22, + "grad_norm": 5.101371765136719, + "learning_rate": 1.8747826086956526e-05, + "loss": 1.8178, + "step": 2660 + }, + { + "epoch": 0.22, + "grad_norm": 3.9989445209503174, + "learning_rate": 1.8742028985507246e-05, + "loss": 1.7954, + "step": 2670 + }, + { + "epoch": 0.22, + "grad_norm": 3.0736329555511475, + "learning_rate": 1.873623188405797e-05, + "loss": 1.8488, + "step": 2680 + }, + { + "epoch": 0.22, + "grad_norm": 2.652923822402954, + "learning_rate": 1.8730434782608698e-05, + "loss": 1.7438, + "step": 2690 + }, + { + "epoch": 0.23, + "grad_norm": 4.146462917327881, + "learning_rate": 1.8724637681159422e-05, + "loss": 1.8169, + "step": 2700 + }, + { + "epoch": 0.23, + "grad_norm": 1.5568904876708984, + "learning_rate": 1.8718840579710146e-05, + "loss": 1.7904, + "step": 2710 + }, + { + "epoch": 0.23, + "grad_norm": 2.2244021892547607, + "learning_rate": 1.871304347826087e-05, + "loss": 1.8384, + "step": 2720 + }, + { + "epoch": 0.23, + "grad_norm": 1.3431702852249146, + "learning_rate": 1.8707246376811597e-05, + "loss": 1.8376, + "step": 2730 + }, + { + "epoch": 0.23, + "grad_norm": 3.871310234069824, + "learning_rate": 1.870144927536232e-05, + "loss": 1.6912, + "step": 2740 + }, + { + "epoch": 0.23, + "grad_norm": 2.3337674140930176, + "learning_rate": 1.8695652173913045e-05, + "loss": 1.7164, + "step": 2750 + }, + { + "epoch": 0.23, + "grad_norm": 1.844233751296997, + "learning_rate": 1.868985507246377e-05, + "loss": 1.7873, + "step": 2760 + }, + { + "epoch": 0.23, + "grad_norm": 1.0465248823165894, + "learning_rate": 1.8684057971014497e-05, + "loss": 1.5893, + "step": 2770 + }, + { + "epoch": 0.23, + "grad_norm": 2.0744643211364746, + "learning_rate": 1.867826086956522e-05, + "loss": 1.8043, + "step": 2780 + }, + { + "epoch": 0.23, + "grad_norm": 1.2488594055175781, + "learning_rate": 1.867246376811594e-05, + "loss": 1.7215, + "step": 2790 + }, + { + "epoch": 0.23, + "grad_norm": 3.04681658744812, + "learning_rate": 1.866666666666667e-05, + "loss": 1.901, + "step": 2800 + }, + { + "epoch": 0.23, + "grad_norm": 2.6009609699249268, + "learning_rate": 1.8660869565217393e-05, + "loss": 1.7446, + "step": 2810 + }, + { + "epoch": 0.23, + "grad_norm": 3.019435167312622, + "learning_rate": 1.8655072463768117e-05, + "loss": 1.8343, + "step": 2820 + }, + { + "epoch": 0.24, + "grad_norm": 2.385256290435791, + "learning_rate": 1.864927536231884e-05, + "loss": 1.6675, + "step": 2830 + }, + { + "epoch": 0.24, + "grad_norm": 3.222172737121582, + "learning_rate": 1.8643478260869568e-05, + "loss": 1.6239, + "step": 2840 + }, + { + "epoch": 0.24, + "grad_norm": 4.963768005371094, + "learning_rate": 1.8637681159420292e-05, + "loss": 1.711, + "step": 2850 + }, + { + "epoch": 0.24, + "grad_norm": 3.1189117431640625, + "learning_rate": 1.8631884057971016e-05, + "loss": 1.886, + "step": 2860 + }, + { + "epoch": 0.24, + "grad_norm": 1.5243103504180908, + "learning_rate": 1.862608695652174e-05, + "loss": 1.8398, + "step": 2870 + }, + { + "epoch": 0.24, + "grad_norm": 1.8455665111541748, + "learning_rate": 1.8620289855072464e-05, + "loss": 1.8868, + "step": 2880 + }, + { + "epoch": 0.24, + "grad_norm": 1.1385709047317505, + "learning_rate": 1.861449275362319e-05, + "loss": 1.8302, + "step": 2890 + }, + { + "epoch": 0.24, + "grad_norm": 3.1462037563323975, + "learning_rate": 1.8608695652173912e-05, + "loss": 1.8159, + "step": 2900 + }, + { + "epoch": 0.24, + "grad_norm": 1.0207372903823853, + "learning_rate": 1.860289855072464e-05, + "loss": 1.8887, + "step": 2910 + }, + { + "epoch": 0.24, + "grad_norm": 1.9228794574737549, + "learning_rate": 1.8597101449275363e-05, + "loss": 1.801, + "step": 2920 + }, + { + "epoch": 0.24, + "grad_norm": 2.8184621334075928, + "learning_rate": 1.8591304347826087e-05, + "loss": 1.7667, + "step": 2930 + }, + { + "epoch": 0.24, + "grad_norm": 2.6740689277648926, + "learning_rate": 1.858550724637681e-05, + "loss": 1.8734, + "step": 2940 + }, + { + "epoch": 0.25, + "grad_norm": 2.6178441047668457, + "learning_rate": 1.857971014492754e-05, + "loss": 1.7861, + "step": 2950 + }, + { + "epoch": 0.25, + "grad_norm": 3.4509289264678955, + "learning_rate": 1.8573913043478263e-05, + "loss": 1.7024, + "step": 2960 + }, + { + "epoch": 0.25, + "grad_norm": 7.4058709144592285, + "learning_rate": 1.8568115942028987e-05, + "loss": 1.7561, + "step": 2970 + }, + { + "epoch": 0.25, + "grad_norm": 1.0747387409210205, + "learning_rate": 1.856231884057971e-05, + "loss": 1.8715, + "step": 2980 + }, + { + "epoch": 0.25, + "grad_norm": 4.675367832183838, + "learning_rate": 1.8556521739130435e-05, + "loss": 1.6404, + "step": 2990 + }, + { + "epoch": 0.25, + "grad_norm": 2.9367339611053467, + "learning_rate": 1.8550724637681162e-05, + "loss": 1.7286, + "step": 3000 + }, + { + "epoch": 0.25, + "eval_loss": 1.7500901222229004, + "eval_runtime": 107.4483, + "eval_samples_per_second": 9.307, + "eval_steps_per_second": 2.327, + "step": 3000 + }, + { + "epoch": 0.25, + "grad_norm": 2.8943095207214355, + "learning_rate": 1.8544927536231886e-05, + "loss": 1.6788, + "step": 3010 + }, + { + "epoch": 0.25, + "grad_norm": 3.3797285556793213, + "learning_rate": 1.853913043478261e-05, + "loss": 1.8931, + "step": 3020 + }, + { + "epoch": 0.25, + "grad_norm": 3.7746102809906006, + "learning_rate": 1.8533333333333334e-05, + "loss": 1.7836, + "step": 3030 + }, + { + "epoch": 0.25, + "grad_norm": 1.9604554176330566, + "learning_rate": 1.8527536231884062e-05, + "loss": 1.704, + "step": 3040 + }, + { + "epoch": 0.25, + "grad_norm": 1.5160584449768066, + "learning_rate": 1.8521739130434782e-05, + "loss": 1.7964, + "step": 3050 + }, + { + "epoch": 0.26, + "grad_norm": 1.3620455265045166, + "learning_rate": 1.851594202898551e-05, + "loss": 1.7291, + "step": 3060 + }, + { + "epoch": 0.26, + "grad_norm": 2.6815402507781982, + "learning_rate": 1.8510144927536234e-05, + "loss": 1.744, + "step": 3070 + }, + { + "epoch": 0.26, + "grad_norm": 1.7348963022232056, + "learning_rate": 1.8504347826086958e-05, + "loss": 1.6285, + "step": 3080 + }, + { + "epoch": 0.26, + "grad_norm": 1.5644665956497192, + "learning_rate": 1.8498550724637682e-05, + "loss": 1.6034, + "step": 3090 + }, + { + "epoch": 0.26, + "grad_norm": 2.5588579177856445, + "learning_rate": 1.8492753623188406e-05, + "loss": 1.6455, + "step": 3100 + }, + { + "epoch": 0.26, + "grad_norm": 2.486201763153076, + "learning_rate": 1.8486956521739133e-05, + "loss": 1.6324, + "step": 3110 + }, + { + "epoch": 0.26, + "grad_norm": 4.734580039978027, + "learning_rate": 1.8481159420289857e-05, + "loss": 1.7564, + "step": 3120 + }, + { + "epoch": 0.26, + "grad_norm": 2.060638427734375, + "learning_rate": 1.847536231884058e-05, + "loss": 1.8093, + "step": 3130 + }, + { + "epoch": 0.26, + "grad_norm": 1.396253228187561, + "learning_rate": 1.8469565217391305e-05, + "loss": 1.8422, + "step": 3140 + }, + { + "epoch": 0.26, + "grad_norm": 3.647871494293213, + "learning_rate": 1.8463768115942033e-05, + "loss": 1.692, + "step": 3150 + }, + { + "epoch": 0.26, + "grad_norm": 4.792811870574951, + "learning_rate": 1.8457971014492753e-05, + "loss": 1.8721, + "step": 3160 + }, + { + "epoch": 0.26, + "grad_norm": 1.6648412942886353, + "learning_rate": 1.845217391304348e-05, + "loss": 1.8176, + "step": 3170 + }, + { + "epoch": 0.27, + "grad_norm": 2.2956972122192383, + "learning_rate": 1.8446376811594205e-05, + "loss": 1.7783, + "step": 3180 + }, + { + "epoch": 0.27, + "grad_norm": 1.968624472618103, + "learning_rate": 1.844057971014493e-05, + "loss": 1.75, + "step": 3190 + }, + { + "epoch": 0.27, + "grad_norm": 2.384540557861328, + "learning_rate": 1.8434782608695653e-05, + "loss": 1.8703, + "step": 3200 + }, + { + "epoch": 0.27, + "grad_norm": 0.963712751865387, + "learning_rate": 1.8428985507246377e-05, + "loss": 1.8838, + "step": 3210 + }, + { + "epoch": 0.27, + "grad_norm": 3.5194199085235596, + "learning_rate": 1.8423188405797104e-05, + "loss": 1.7351, + "step": 3220 + }, + { + "epoch": 0.27, + "grad_norm": 3.223130226135254, + "learning_rate": 1.8417971014492754e-05, + "loss": 1.7526, + "step": 3230 + }, + { + "epoch": 0.27, + "grad_norm": 1.9289439916610718, + "learning_rate": 1.841217391304348e-05, + "loss": 1.8648, + "step": 3240 + }, + { + "epoch": 0.27, + "grad_norm": 1.942191481590271, + "learning_rate": 1.8406376811594205e-05, + "loss": 1.7607, + "step": 3250 + }, + { + "epoch": 0.27, + "grad_norm": 1.61818265914917, + "learning_rate": 1.840057971014493e-05, + "loss": 1.6653, + "step": 3260 + }, + { + "epoch": 0.27, + "grad_norm": 2.776350736618042, + "learning_rate": 1.8394782608695653e-05, + "loss": 1.7829, + "step": 3270 + }, + { + "epoch": 0.27, + "grad_norm": 1.71359384059906, + "learning_rate": 1.8388985507246377e-05, + "loss": 1.8434, + "step": 3280 + }, + { + "epoch": 0.27, + "grad_norm": 1.9720557928085327, + "learning_rate": 1.8383188405797104e-05, + "loss": 1.749, + "step": 3290 + }, + { + "epoch": 0.28, + "grad_norm": 2.1935908794403076, + "learning_rate": 1.837739130434783e-05, + "loss": 1.8992, + "step": 3300 + }, + { + "epoch": 0.28, + "grad_norm": 4.199913024902344, + "learning_rate": 1.8371594202898552e-05, + "loss": 1.8549, + "step": 3310 + }, + { + "epoch": 0.28, + "grad_norm": 3.7523651123046875, + "learning_rate": 1.8365797101449276e-05, + "loss": 1.8814, + "step": 3320 + }, + { + "epoch": 0.28, + "grad_norm": 2.2951200008392334, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.8184, + "step": 3330 + }, + { + "epoch": 0.28, + "grad_norm": 2.8267765045166016, + "learning_rate": 1.8354202898550724e-05, + "loss": 1.8589, + "step": 3340 + }, + { + "epoch": 0.28, + "grad_norm": 1.0535434484481812, + "learning_rate": 1.8348405797101452e-05, + "loss": 1.609, + "step": 3350 + }, + { + "epoch": 0.28, + "grad_norm": 5.798702716827393, + "learning_rate": 1.8342608695652176e-05, + "loss": 1.7119, + "step": 3360 + }, + { + "epoch": 0.28, + "grad_norm": 4.116357326507568, + "learning_rate": 1.83368115942029e-05, + "loss": 1.7238, + "step": 3370 + }, + { + "epoch": 0.28, + "grad_norm": 1.4448415040969849, + "learning_rate": 1.8331014492753624e-05, + "loss": 1.6308, + "step": 3380 + }, + { + "epoch": 0.28, + "grad_norm": 2.775078296661377, + "learning_rate": 1.8325217391304348e-05, + "loss": 1.8911, + "step": 3390 + }, + { + "epoch": 0.28, + "grad_norm": 2.1946187019348145, + "learning_rate": 1.8319420289855075e-05, + "loss": 1.7165, + "step": 3400 + }, + { + "epoch": 0.28, + "grad_norm": 2.4208180904388428, + "learning_rate": 1.83136231884058e-05, + "loss": 1.851, + "step": 3410 + }, + { + "epoch": 0.28, + "grad_norm": 2.1975769996643066, + "learning_rate": 1.8307826086956523e-05, + "loss": 1.7533, + "step": 3420 + }, + { + "epoch": 0.29, + "grad_norm": 5.09836483001709, + "learning_rate": 1.8302028985507247e-05, + "loss": 1.7399, + "step": 3430 + }, + { + "epoch": 0.29, + "grad_norm": 2.047780990600586, + "learning_rate": 1.8296231884057975e-05, + "loss": 1.6752, + "step": 3440 + }, + { + "epoch": 0.29, + "grad_norm": 2.692073106765747, + "learning_rate": 1.82904347826087e-05, + "loss": 1.649, + "step": 3450 + }, + { + "epoch": 0.29, + "grad_norm": 2.907693862915039, + "learning_rate": 1.8284637681159423e-05, + "loss": 1.7774, + "step": 3460 + }, + { + "epoch": 0.29, + "grad_norm": 2.013425350189209, + "learning_rate": 1.8278840579710147e-05, + "loss": 1.8186, + "step": 3470 + }, + { + "epoch": 0.29, + "grad_norm": 1.129022479057312, + "learning_rate": 1.827304347826087e-05, + "loss": 1.827, + "step": 3480 + }, + { + "epoch": 0.29, + "grad_norm": 2.979015350341797, + "learning_rate": 1.8267246376811595e-05, + "loss": 1.6826, + "step": 3490 + }, + { + "epoch": 0.29, + "grad_norm": 2.317847967147827, + "learning_rate": 1.826144927536232e-05, + "loss": 1.8459, + "step": 3500 + }, + { + "epoch": 0.29, + "eval_loss": 1.7489032745361328, + "eval_runtime": 107.4616, + "eval_samples_per_second": 9.306, + "eval_steps_per_second": 2.326, + "step": 3500 + }, + { + "epoch": 0.29, + "grad_norm": 1.7150217294692993, + "learning_rate": 1.8255652173913046e-05, + "loss": 1.734, + "step": 3510 + }, + { + "epoch": 0.29, + "grad_norm": 1.6586185693740845, + "learning_rate": 1.824985507246377e-05, + "loss": 1.7426, + "step": 3520 + }, + { + "epoch": 0.29, + "grad_norm": 1.2894079685211182, + "learning_rate": 1.8244057971014494e-05, + "loss": 1.7626, + "step": 3530 + }, + { + "epoch": 0.29, + "grad_norm": 2.0136561393737793, + "learning_rate": 1.8238260869565218e-05, + "loss": 1.7033, + "step": 3540 + }, + { + "epoch": 0.3, + "grad_norm": 2.7215077877044678, + "learning_rate": 1.8232463768115945e-05, + "loss": 1.8255, + "step": 3550 + }, + { + "epoch": 0.3, + "grad_norm": 3.4265096187591553, + "learning_rate": 1.822666666666667e-05, + "loss": 1.7646, + "step": 3560 + }, + { + "epoch": 0.3, + "grad_norm": 2.2528915405273438, + "learning_rate": 1.822086956521739e-05, + "loss": 1.809, + "step": 3570 + }, + { + "epoch": 0.3, + "grad_norm": 1.1465145349502563, + "learning_rate": 1.8215072463768117e-05, + "loss": 1.7062, + "step": 3580 + }, + { + "epoch": 0.3, + "grad_norm": 1.697144865989685, + "learning_rate": 1.820927536231884e-05, + "loss": 1.7734, + "step": 3590 + }, + { + "epoch": 0.3, + "grad_norm": 1.9666852951049805, + "learning_rate": 1.8203478260869565e-05, + "loss": 1.751, + "step": 3600 + }, + { + "epoch": 0.3, + "grad_norm": 0.8137166500091553, + "learning_rate": 1.819768115942029e-05, + "loss": 1.7801, + "step": 3610 + }, + { + "epoch": 0.3, + "grad_norm": 2.7446413040161133, + "learning_rate": 1.8191884057971017e-05, + "loss": 1.7313, + "step": 3620 + }, + { + "epoch": 0.3, + "grad_norm": 3.2775075435638428, + "learning_rate": 1.818608695652174e-05, + "loss": 1.7378, + "step": 3630 + }, + { + "epoch": 0.3, + "grad_norm": 1.90742027759552, + "learning_rate": 1.8180289855072465e-05, + "loss": 1.8367, + "step": 3640 + }, + { + "epoch": 0.3, + "grad_norm": 1.5931909084320068, + "learning_rate": 1.817449275362319e-05, + "loss": 1.7352, + "step": 3650 + }, + { + "epoch": 0.3, + "grad_norm": 2.411388397216797, + "learning_rate": 1.8168695652173916e-05, + "loss": 1.8184, + "step": 3660 + }, + { + "epoch": 0.31, + "grad_norm": 2.2610573768615723, + "learning_rate": 1.816289855072464e-05, + "loss": 1.8392, + "step": 3670 + }, + { + "epoch": 0.31, + "grad_norm": 1.2999207973480225, + "learning_rate": 1.8157101449275364e-05, + "loss": 1.7333, + "step": 3680 + }, + { + "epoch": 0.31, + "grad_norm": 1.7404083013534546, + "learning_rate": 1.815130434782609e-05, + "loss": 1.734, + "step": 3690 + }, + { + "epoch": 0.31, + "grad_norm": 1.5067249536514282, + "learning_rate": 1.8145507246376812e-05, + "loss": 1.7091, + "step": 3700 + }, + { + "epoch": 0.31, + "grad_norm": 1.4666568040847778, + "learning_rate": 1.813971014492754e-05, + "loss": 1.6971, + "step": 3710 + }, + { + "epoch": 0.31, + "grad_norm": 1.8228498697280884, + "learning_rate": 1.813391304347826e-05, + "loss": 1.6056, + "step": 3720 + }, + { + "epoch": 0.31, + "grad_norm": 6.056992530822754, + "learning_rate": 1.8128115942028988e-05, + "loss": 1.7473, + "step": 3730 + }, + { + "epoch": 0.31, + "grad_norm": 3.1190054416656494, + "learning_rate": 1.8122318840579712e-05, + "loss": 1.7543, + "step": 3740 + }, + { + "epoch": 0.31, + "grad_norm": 2.7520599365234375, + "learning_rate": 1.8116521739130436e-05, + "loss": 1.8603, + "step": 3750 + }, + { + "epoch": 0.31, + "grad_norm": 3.523573637008667, + "learning_rate": 1.811072463768116e-05, + "loss": 1.8091, + "step": 3760 + }, + { + "epoch": 0.31, + "grad_norm": 1.5726646184921265, + "learning_rate": 1.8104927536231887e-05, + "loss": 1.728, + "step": 3770 + }, + { + "epoch": 0.32, + "grad_norm": 2.465932846069336, + "learning_rate": 1.809913043478261e-05, + "loss": 1.5628, + "step": 3780 + }, + { + "epoch": 0.32, + "grad_norm": 2.4633328914642334, + "learning_rate": 1.8093333333333335e-05, + "loss": 1.6955, + "step": 3790 + }, + { + "epoch": 0.32, + "grad_norm": 1.3645209074020386, + "learning_rate": 1.808753623188406e-05, + "loss": 1.8479, + "step": 3800 + }, + { + "epoch": 0.32, + "grad_norm": 1.8331495523452759, + "learning_rate": 1.8081739130434783e-05, + "loss": 1.7356, + "step": 3810 + }, + { + "epoch": 0.32, + "grad_norm": 1.2319082021713257, + "learning_rate": 1.807594202898551e-05, + "loss": 1.6966, + "step": 3820 + }, + { + "epoch": 0.32, + "grad_norm": 2.204197645187378, + "learning_rate": 1.807014492753623e-05, + "loss": 1.8275, + "step": 3830 + }, + { + "epoch": 0.32, + "grad_norm": 2.6139984130859375, + "learning_rate": 1.806434782608696e-05, + "loss": 1.73, + "step": 3840 + }, + { + "epoch": 0.32, + "grad_norm": 3.1955862045288086, + "learning_rate": 1.8058550724637683e-05, + "loss": 1.7662, + "step": 3850 + }, + { + "epoch": 0.32, + "grad_norm": 0.9615164399147034, + "learning_rate": 1.8052753623188407e-05, + "loss": 1.8208, + "step": 3860 + }, + { + "epoch": 0.32, + "grad_norm": 1.718482494354248, + "learning_rate": 1.804695652173913e-05, + "loss": 1.7241, + "step": 3870 + }, + { + "epoch": 0.32, + "grad_norm": 3.189242124557495, + "learning_rate": 1.8041159420289855e-05, + "loss": 1.8602, + "step": 3880 + }, + { + "epoch": 0.32, + "grad_norm": 3.950349807739258, + "learning_rate": 1.8035362318840582e-05, + "loss": 1.866, + "step": 3890 + }, + { + "epoch": 0.33, + "grad_norm": 1.566131591796875, + "learning_rate": 1.8029565217391306e-05, + "loss": 1.528, + "step": 3900 + }, + { + "epoch": 0.33, + "grad_norm": 1.409193515777588, + "learning_rate": 1.802376811594203e-05, + "loss": 1.7897, + "step": 3910 + }, + { + "epoch": 0.33, + "grad_norm": 2.5493619441986084, + "learning_rate": 1.8017971014492754e-05, + "loss": 1.8192, + "step": 3920 + }, + { + "epoch": 0.33, + "grad_norm": 3.087979793548584, + "learning_rate": 1.801217391304348e-05, + "loss": 1.7661, + "step": 3930 + }, + { + "epoch": 0.33, + "grad_norm": 2.98085355758667, + "learning_rate": 1.8006376811594205e-05, + "loss": 1.7836, + "step": 3940 + }, + { + "epoch": 0.33, + "grad_norm": 1.6179476976394653, + "learning_rate": 1.800057971014493e-05, + "loss": 1.8128, + "step": 3950 + }, + { + "epoch": 0.33, + "grad_norm": 1.0330349206924438, + "learning_rate": 1.7994782608695653e-05, + "loss": 1.8616, + "step": 3960 + }, + { + "epoch": 0.33, + "grad_norm": 1.4651768207550049, + "learning_rate": 1.798898550724638e-05, + "loss": 1.8311, + "step": 3970 + }, + { + "epoch": 0.33, + "grad_norm": 0.7031122446060181, + "learning_rate": 1.79831884057971e-05, + "loss": 1.6809, + "step": 3980 + }, + { + "epoch": 0.33, + "grad_norm": 1.734285831451416, + "learning_rate": 1.7977391304347825e-05, + "loss": 1.7419, + "step": 3990 + }, + { + "epoch": 0.33, + "grad_norm": 2.079700469970703, + "learning_rate": 1.7971594202898553e-05, + "loss": 1.7631, + "step": 4000 + }, + { + "epoch": 0.33, + "eval_loss": 1.7669097185134888, + "eval_runtime": 107.4754, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 4000 + }, + { + "epoch": 0.33, + "grad_norm": 1.9647717475891113, + "learning_rate": 1.7965797101449277e-05, + "loss": 1.723, + "step": 4010 + }, + { + "epoch": 0.34, + "grad_norm": 2.0198094844818115, + "learning_rate": 1.796e-05, + "loss": 1.5237, + "step": 4020 + }, + { + "epoch": 0.34, + "grad_norm": 1.165492296218872, + "learning_rate": 1.7954202898550725e-05, + "loss": 1.7683, + "step": 4030 + }, + { + "epoch": 0.34, + "grad_norm": 2.6819276809692383, + "learning_rate": 1.7948405797101452e-05, + "loss": 1.7101, + "step": 4040 + }, + { + "epoch": 0.34, + "grad_norm": 1.1148452758789062, + "learning_rate": 1.7942608695652176e-05, + "loss": 1.6593, + "step": 4050 + }, + { + "epoch": 0.34, + "grad_norm": 1.5147534608840942, + "learning_rate": 1.79368115942029e-05, + "loss": 1.599, + "step": 4060 + }, + { + "epoch": 0.34, + "grad_norm": 1.868163824081421, + "learning_rate": 1.7931014492753624e-05, + "loss": 1.7865, + "step": 4070 + }, + { + "epoch": 0.34, + "grad_norm": 2.7418978214263916, + "learning_rate": 1.792521739130435e-05, + "loss": 1.7432, + "step": 4080 + }, + { + "epoch": 0.34, + "grad_norm": 1.6911894083023071, + "learning_rate": 1.7919420289855076e-05, + "loss": 1.8255, + "step": 4090 + }, + { + "epoch": 0.34, + "grad_norm": 2.916471004486084, + "learning_rate": 1.7913623188405796e-05, + "loss": 1.8335, + "step": 4100 + }, + { + "epoch": 0.34, + "grad_norm": 1.2466706037521362, + "learning_rate": 1.7907826086956524e-05, + "loss": 1.8451, + "step": 4110 + }, + { + "epoch": 0.34, + "grad_norm": 1.5247670412063599, + "learning_rate": 1.7902028985507248e-05, + "loss": 1.7443, + "step": 4120 + }, + { + "epoch": 0.34, + "grad_norm": 1.580941081047058, + "learning_rate": 1.789623188405797e-05, + "loss": 1.8384, + "step": 4130 + }, + { + "epoch": 0.34, + "grad_norm": 4.8174824714660645, + "learning_rate": 1.7890434782608696e-05, + "loss": 1.6805, + "step": 4140 + }, + { + "epoch": 0.35, + "grad_norm": 1.682310938835144, + "learning_rate": 1.7884637681159423e-05, + "loss": 1.7765, + "step": 4150 + }, + { + "epoch": 0.35, + "grad_norm": 3.392796039581299, + "learning_rate": 1.7878840579710147e-05, + "loss": 1.6769, + "step": 4160 + }, + { + "epoch": 0.35, + "grad_norm": 1.4888694286346436, + "learning_rate": 1.787304347826087e-05, + "loss": 1.7414, + "step": 4170 + }, + { + "epoch": 0.35, + "grad_norm": 3.9246952533721924, + "learning_rate": 1.7867246376811595e-05, + "loss": 1.7297, + "step": 4180 + }, + { + "epoch": 0.35, + "grad_norm": 1.4315427541732788, + "learning_rate": 1.786144927536232e-05, + "loss": 1.7288, + "step": 4190 + }, + { + "epoch": 0.35, + "grad_norm": 1.2288168668746948, + "learning_rate": 1.7855652173913046e-05, + "loss": 1.9408, + "step": 4200 + }, + { + "epoch": 0.35, + "grad_norm": 1.8101353645324707, + "learning_rate": 1.7849855072463767e-05, + "loss": 1.7743, + "step": 4210 + }, + { + "epoch": 0.35, + "grad_norm": 1.800456166267395, + "learning_rate": 1.7844057971014495e-05, + "loss": 1.6975, + "step": 4220 + }, + { + "epoch": 0.35, + "grad_norm": 1.9426621198654175, + "learning_rate": 1.783826086956522e-05, + "loss": 1.9247, + "step": 4230 + }, + { + "epoch": 0.35, + "grad_norm": 1.1264923810958862, + "learning_rate": 1.7832463768115943e-05, + "loss": 1.8713, + "step": 4240 + }, + { + "epoch": 0.35, + "grad_norm": 3.0976786613464355, + "learning_rate": 1.7826666666666667e-05, + "loss": 1.8226, + "step": 4250 + }, + { + "epoch": 0.35, + "grad_norm": 3.625213146209717, + "learning_rate": 1.7820869565217394e-05, + "loss": 1.7695, + "step": 4260 + }, + { + "epoch": 0.36, + "grad_norm": 1.8756457567214966, + "learning_rate": 1.7815072463768118e-05, + "loss": 1.6924, + "step": 4270 + }, + { + "epoch": 0.36, + "grad_norm": 1.0314109325408936, + "learning_rate": 1.7809275362318842e-05, + "loss": 1.8747, + "step": 4280 + }, + { + "epoch": 0.36, + "grad_norm": 2.296934127807617, + "learning_rate": 1.7803478260869566e-05, + "loss": 1.7441, + "step": 4290 + }, + { + "epoch": 0.36, + "grad_norm": 3.1548571586608887, + "learning_rate": 1.779768115942029e-05, + "loss": 1.7059, + "step": 4300 + }, + { + "epoch": 0.36, + "grad_norm": 4.312896728515625, + "learning_rate": 1.7791884057971017e-05, + "loss": 1.8683, + "step": 4310 + }, + { + "epoch": 0.36, + "grad_norm": 3.2402548789978027, + "learning_rate": 1.778608695652174e-05, + "loss": 1.8141, + "step": 4320 + }, + { + "epoch": 0.36, + "grad_norm": 3.2827465534210205, + "learning_rate": 1.7780289855072465e-05, + "loss": 1.8275, + "step": 4330 + }, + { + "epoch": 0.36, + "grad_norm": 2.1937191486358643, + "learning_rate": 1.777449275362319e-05, + "loss": 1.8033, + "step": 4340 + }, + { + "epoch": 0.36, + "grad_norm": 2.4238903522491455, + "learning_rate": 1.7768695652173917e-05, + "loss": 1.8152, + "step": 4350 + }, + { + "epoch": 0.36, + "grad_norm": 3.212376117706299, + "learning_rate": 1.7762898550724637e-05, + "loss": 1.699, + "step": 4360 + }, + { + "epoch": 0.36, + "grad_norm": 3.7652318477630615, + "learning_rate": 1.7757101449275365e-05, + "loss": 1.6854, + "step": 4370 + }, + { + "epoch": 0.36, + "grad_norm": 1.9386508464813232, + "learning_rate": 1.775130434782609e-05, + "loss": 1.7143, + "step": 4380 + }, + { + "epoch": 0.37, + "grad_norm": 3.230910062789917, + "learning_rate": 1.7745507246376813e-05, + "loss": 1.7512, + "step": 4390 + }, + { + "epoch": 0.37, + "grad_norm": 4.213496685028076, + "learning_rate": 1.7739710144927537e-05, + "loss": 1.7486, + "step": 4400 + }, + { + "epoch": 0.37, + "grad_norm": 3.4668760299682617, + "learning_rate": 1.773391304347826e-05, + "loss": 1.687, + "step": 4410 + }, + { + "epoch": 0.37, + "grad_norm": 1.9463025331497192, + "learning_rate": 1.7728115942028988e-05, + "loss": 1.7829, + "step": 4420 + }, + { + "epoch": 0.37, + "grad_norm": 2.4058449268341064, + "learning_rate": 1.7722318840579712e-05, + "loss": 1.6866, + "step": 4430 + }, + { + "epoch": 0.37, + "grad_norm": 1.426607370376587, + "learning_rate": 1.7716521739130436e-05, + "loss": 1.7626, + "step": 4440 + }, + { + "epoch": 0.37, + "grad_norm": 2.6386098861694336, + "learning_rate": 1.771072463768116e-05, + "loss": 1.7471, + "step": 4450 + }, + { + "epoch": 0.37, + "grad_norm": 3.66159987449646, + "learning_rate": 1.7704927536231888e-05, + "loss": 1.6963, + "step": 4460 + }, + { + "epoch": 0.37, + "grad_norm": 1.790877103805542, + "learning_rate": 1.7699130434782608e-05, + "loss": 1.8013, + "step": 4470 + }, + { + "epoch": 0.37, + "grad_norm": 1.4159287214279175, + "learning_rate": 1.7693333333333336e-05, + "loss": 1.6159, + "step": 4480 + }, + { + "epoch": 0.37, + "grad_norm": 4.394288539886475, + "learning_rate": 1.768753623188406e-05, + "loss": 1.7185, + "step": 4490 + }, + { + "epoch": 0.38, + "grad_norm": 2.0887296199798584, + "learning_rate": 1.7681739130434784e-05, + "loss": 1.833, + "step": 4500 + }, + { + "epoch": 0.38, + "eval_loss": 1.7682534456253052, + "eval_runtime": 107.4724, + "eval_samples_per_second": 9.305, + "eval_steps_per_second": 2.326, + "step": 4500 + }, + { + "epoch": 0.38, + "grad_norm": 4.085282325744629, + "learning_rate": 1.7675942028985508e-05, + "loss": 1.7992, + "step": 4510 + }, + { + "epoch": 0.38, + "grad_norm": 2.182236671447754, + "learning_rate": 1.767014492753623e-05, + "loss": 1.8982, + "step": 4520 + }, + { + "epoch": 0.38, + "grad_norm": 2.9894583225250244, + "learning_rate": 1.766434782608696e-05, + "loss": 1.6623, + "step": 4530 + }, + { + "epoch": 0.38, + "grad_norm": 2.1609718799591064, + "learning_rate": 1.7658550724637683e-05, + "loss": 1.7441, + "step": 4540 + }, + { + "epoch": 0.38, + "grad_norm": 1.7761262655258179, + "learning_rate": 1.7652753623188407e-05, + "loss": 1.7824, + "step": 4550 + }, + { + "epoch": 0.38, + "grad_norm": 3.564272880554199, + "learning_rate": 1.764695652173913e-05, + "loss": 1.7627, + "step": 4560 + }, + { + "epoch": 0.38, + "grad_norm": 1.6390734910964966, + "learning_rate": 1.764115942028986e-05, + "loss": 1.7783, + "step": 4570 + }, + { + "epoch": 0.38, + "grad_norm": 1.8177413940429688, + "learning_rate": 1.7635362318840582e-05, + "loss": 1.6956, + "step": 4580 + }, + { + "epoch": 0.38, + "grad_norm": 1.2485848665237427, + "learning_rate": 1.7629565217391306e-05, + "loss": 1.7598, + "step": 4590 + }, + { + "epoch": 0.38, + "grad_norm": 2.094052791595459, + "learning_rate": 1.762376811594203e-05, + "loss": 1.8477, + "step": 4600 + }, + { + "epoch": 0.38, + "grad_norm": 2.854745864868164, + "learning_rate": 1.7617971014492754e-05, + "loss": 1.6767, + "step": 4610 + }, + { + "epoch": 0.39, + "grad_norm": 2.275739908218384, + "learning_rate": 1.761217391304348e-05, + "loss": 1.7992, + "step": 4620 + }, + { + "epoch": 0.39, + "grad_norm": 4.860130786895752, + "learning_rate": 1.7606376811594202e-05, + "loss": 1.6274, + "step": 4630 + }, + { + "epoch": 0.39, + "grad_norm": 1.5182762145996094, + "learning_rate": 1.760057971014493e-05, + "loss": 1.8214, + "step": 4640 + }, + { + "epoch": 0.39, + "grad_norm": 1.3440704345703125, + "learning_rate": 1.7594782608695654e-05, + "loss": 1.778, + "step": 4650 + }, + { + "epoch": 0.39, + "grad_norm": 1.2678601741790771, + "learning_rate": 1.7588985507246378e-05, + "loss": 1.6729, + "step": 4660 + }, + { + "epoch": 0.39, + "grad_norm": 2.45300030708313, + "learning_rate": 1.7583188405797102e-05, + "loss": 1.7167, + "step": 4670 + }, + { + "epoch": 0.39, + "grad_norm": 1.4436949491500854, + "learning_rate": 1.757739130434783e-05, + "loss": 1.7222, + "step": 4680 + }, + { + "epoch": 0.39, + "grad_norm": 1.70391047000885, + "learning_rate": 1.7571594202898553e-05, + "loss": 1.802, + "step": 4690 + }, + { + "epoch": 0.39, + "grad_norm": 4.7926859855651855, + "learning_rate": 1.7565797101449277e-05, + "loss": 1.7951, + "step": 4700 + }, + { + "epoch": 0.39, + "grad_norm": 1.777199387550354, + "learning_rate": 1.756e-05, + "loss": 1.7408, + "step": 4710 + }, + { + "epoch": 0.39, + "grad_norm": 0.9820401072502136, + "learning_rate": 1.7554202898550725e-05, + "loss": 1.8236, + "step": 4720 + }, + { + "epoch": 0.39, + "grad_norm": 2.505861759185791, + "learning_rate": 1.754840579710145e-05, + "loss": 1.6859, + "step": 4730 + }, + { + "epoch": 0.4, + "grad_norm": 3.0576391220092773, + "learning_rate": 1.7542608695652173e-05, + "loss": 1.7222, + "step": 4740 + }, + { + "epoch": 0.4, + "grad_norm": 3.96050763130188, + "learning_rate": 1.75368115942029e-05, + "loss": 1.8761, + "step": 4750 + }, + { + "epoch": 0.4, + "grad_norm": 4.387930870056152, + "learning_rate": 1.7531014492753625e-05, + "loss": 1.6781, + "step": 4760 + }, + { + "epoch": 0.4, + "grad_norm": 6.898324012756348, + "learning_rate": 1.752521739130435e-05, + "loss": 1.783, + "step": 4770 + }, + { + "epoch": 0.4, + "grad_norm": 4.5490803718566895, + "learning_rate": 1.7519420289855073e-05, + "loss": 1.7627, + "step": 4780 + }, + { + "epoch": 0.4, + "grad_norm": 5.1455864906311035, + "learning_rate": 1.75136231884058e-05, + "loss": 1.7222, + "step": 4790 + }, + { + "epoch": 0.4, + "grad_norm": 2.632255792617798, + "learning_rate": 1.7507826086956524e-05, + "loss": 1.8319, + "step": 4800 + }, + { + "epoch": 0.4, + "grad_norm": 1.351136565208435, + "learning_rate": 1.7502028985507248e-05, + "loss": 1.7507, + "step": 4810 + }, + { + "epoch": 0.4, + "grad_norm": 1.6380447149276733, + "learning_rate": 1.7496231884057972e-05, + "loss": 1.6645, + "step": 4820 + }, + { + "epoch": 0.4, + "grad_norm": 1.601467490196228, + "learning_rate": 1.7490434782608696e-05, + "loss": 1.8176, + "step": 4830 + }, + { + "epoch": 0.4, + "grad_norm": 1.0333625078201294, + "learning_rate": 1.7484637681159424e-05, + "loss": 1.6752, + "step": 4840 + }, + { + "epoch": 0.4, + "grad_norm": 2.7758312225341797, + "learning_rate": 1.7478840579710144e-05, + "loss": 1.7522, + "step": 4850 + }, + { + "epoch": 0.41, + "grad_norm": 3.394230842590332, + "learning_rate": 1.747304347826087e-05, + "loss": 1.6703, + "step": 4860 + }, + { + "epoch": 0.41, + "grad_norm": 1.6446850299835205, + "learning_rate": 1.7467246376811596e-05, + "loss": 1.6016, + "step": 4870 + }, + { + "epoch": 0.41, + "grad_norm": 3.261119842529297, + "learning_rate": 1.746144927536232e-05, + "loss": 1.598, + "step": 4880 + }, + { + "epoch": 0.41, + "grad_norm": 3.5779495239257812, + "learning_rate": 1.7455652173913044e-05, + "loss": 1.7129, + "step": 4890 + }, + { + "epoch": 0.41, + "grad_norm": 2.2901227474212646, + "learning_rate": 1.744985507246377e-05, + "loss": 1.6935, + "step": 4900 + }, + { + "epoch": 0.41, + "grad_norm": 2.433979034423828, + "learning_rate": 1.7444057971014495e-05, + "loss": 1.6954, + "step": 4910 + }, + { + "epoch": 0.41, + "grad_norm": 0.8954328894615173, + "learning_rate": 1.743826086956522e-05, + "loss": 1.7721, + "step": 4920 + }, + { + "epoch": 0.41, + "grad_norm": 2.513370990753174, + "learning_rate": 1.7432463768115943e-05, + "loss": 1.6607, + "step": 4930 + }, + { + "epoch": 0.41, + "grad_norm": 2.7021384239196777, + "learning_rate": 1.7426666666666667e-05, + "loss": 1.7342, + "step": 4940 + }, + { + "epoch": 0.41, + "grad_norm": 3.947293519973755, + "learning_rate": 1.7420869565217394e-05, + "loss": 1.6413, + "step": 4950 + }, + { + "epoch": 0.41, + "grad_norm": 1.9602371454238892, + "learning_rate": 1.741507246376812e-05, + "loss": 1.6927, + "step": 4960 + }, + { + "epoch": 0.41, + "grad_norm": 2.6332249641418457, + "learning_rate": 1.7409275362318842e-05, + "loss": 1.5976, + "step": 4970 + }, + { + "epoch": 0.41, + "grad_norm": 2.293816328048706, + "learning_rate": 1.7403478260869566e-05, + "loss": 1.7207, + "step": 4980 + }, + { + "epoch": 0.42, + "grad_norm": 6.466865539550781, + "learning_rate": 1.7397681159420294e-05, + "loss": 1.7308, + "step": 4990 + }, + { + "epoch": 0.42, + "grad_norm": 4.4341559410095215, + "learning_rate": 1.7391884057971014e-05, + "loss": 1.7434, + "step": 5000 + }, + { + "epoch": 0.42, + "eval_loss": 1.7550283670425415, + "eval_runtime": 107.4768, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 5000 + }, + { + "epoch": 0.42, + "grad_norm": 1.294936180114746, + "learning_rate": 1.7386086956521742e-05, + "loss": 1.746, + "step": 5010 + }, + { + "epoch": 0.42, + "grad_norm": 3.660215139389038, + "learning_rate": 1.7380289855072466e-05, + "loss": 1.7183, + "step": 5020 + }, + { + "epoch": 0.42, + "grad_norm": 1.8590058088302612, + "learning_rate": 1.737449275362319e-05, + "loss": 1.8525, + "step": 5030 + }, + { + "epoch": 0.42, + "grad_norm": 1.5797920227050781, + "learning_rate": 1.7368695652173914e-05, + "loss": 1.7257, + "step": 5040 + }, + { + "epoch": 0.42, + "grad_norm": 1.3411496877670288, + "learning_rate": 1.7362898550724638e-05, + "loss": 1.8922, + "step": 5050 + }, + { + "epoch": 0.42, + "grad_norm": 3.5491766929626465, + "learning_rate": 1.7357101449275365e-05, + "loss": 1.7402, + "step": 5060 + }, + { + "epoch": 0.42, + "grad_norm": 2.72056245803833, + "learning_rate": 1.735130434782609e-05, + "loss": 1.7225, + "step": 5070 + }, + { + "epoch": 0.42, + "grad_norm": 2.4798800945281982, + "learning_rate": 1.7345507246376813e-05, + "loss": 1.7624, + "step": 5080 + }, + { + "epoch": 0.42, + "grad_norm": 2.2017393112182617, + "learning_rate": 1.7339710144927537e-05, + "loss": 1.7921, + "step": 5090 + }, + { + "epoch": 0.42, + "grad_norm": 3.185555934906006, + "learning_rate": 1.7333913043478265e-05, + "loss": 1.615, + "step": 5100 + }, + { + "epoch": 0.43, + "grad_norm": 1.8740787506103516, + "learning_rate": 1.7328115942028985e-05, + "loss": 1.778, + "step": 5110 + }, + { + "epoch": 0.43, + "grad_norm": 2.6810436248779297, + "learning_rate": 1.732231884057971e-05, + "loss": 1.7957, + "step": 5120 + }, + { + "epoch": 0.43, + "grad_norm": 2.130495309829712, + "learning_rate": 1.7316521739130437e-05, + "loss": 1.8177, + "step": 5130 + }, + { + "epoch": 0.43, + "grad_norm": 4.660665988922119, + "learning_rate": 1.731072463768116e-05, + "loss": 1.8078, + "step": 5140 + }, + { + "epoch": 0.43, + "grad_norm": 1.669716238975525, + "learning_rate": 1.7304927536231885e-05, + "loss": 1.7621, + "step": 5150 + }, + { + "epoch": 0.43, + "grad_norm": 1.6844958066940308, + "learning_rate": 1.729913043478261e-05, + "loss": 1.6396, + "step": 5160 + }, + { + "epoch": 0.43, + "grad_norm": 2.097205877304077, + "learning_rate": 1.7293333333333336e-05, + "loss": 1.7409, + "step": 5170 + }, + { + "epoch": 0.43, + "grad_norm": 0.8433555960655212, + "learning_rate": 1.728753623188406e-05, + "loss": 1.8663, + "step": 5180 + }, + { + "epoch": 0.43, + "grad_norm": 3.1566624641418457, + "learning_rate": 1.7281739130434784e-05, + "loss": 1.5808, + "step": 5190 + }, + { + "epoch": 0.43, + "grad_norm": 2.130633592605591, + "learning_rate": 1.7275942028985508e-05, + "loss": 1.6708, + "step": 5200 + }, + { + "epoch": 0.43, + "grad_norm": 2.3637614250183105, + "learning_rate": 1.7270144927536235e-05, + "loss": 1.7204, + "step": 5210 + }, + { + "epoch": 0.43, + "grad_norm": 6.3107686042785645, + "learning_rate": 1.726434782608696e-05, + "loss": 1.7913, + "step": 5220 + }, + { + "epoch": 0.44, + "grad_norm": 1.3103054761886597, + "learning_rate": 1.725855072463768e-05, + "loss": 1.7843, + "step": 5230 + }, + { + "epoch": 0.44, + "grad_norm": 4.409878730773926, + "learning_rate": 1.7252753623188407e-05, + "loss": 1.5763, + "step": 5240 + }, + { + "epoch": 0.44, + "grad_norm": 1.2016843557357788, + "learning_rate": 1.724695652173913e-05, + "loss": 1.7474, + "step": 5250 + }, + { + "epoch": 0.44, + "grad_norm": 2.6358120441436768, + "learning_rate": 1.7241159420289855e-05, + "loss": 1.6569, + "step": 5260 + }, + { + "epoch": 0.44, + "grad_norm": 2.816072702407837, + "learning_rate": 1.723536231884058e-05, + "loss": 1.7288, + "step": 5270 + }, + { + "epoch": 0.44, + "grad_norm": 5.151131629943848, + "learning_rate": 1.7229565217391307e-05, + "loss": 1.7776, + "step": 5280 + }, + { + "epoch": 0.44, + "grad_norm": 1.895945429801941, + "learning_rate": 1.722376811594203e-05, + "loss": 1.7824, + "step": 5290 + }, + { + "epoch": 0.44, + "grad_norm": 2.5758895874023438, + "learning_rate": 1.7217971014492755e-05, + "loss": 1.6245, + "step": 5300 + }, + { + "epoch": 0.44, + "grad_norm": 1.6517353057861328, + "learning_rate": 1.721217391304348e-05, + "loss": 1.5666, + "step": 5310 + }, + { + "epoch": 0.44, + "grad_norm": 3.3738932609558105, + "learning_rate": 1.7206376811594206e-05, + "loss": 1.6949, + "step": 5320 + }, + { + "epoch": 0.44, + "grad_norm": 4.818183898925781, + "learning_rate": 1.720057971014493e-05, + "loss": 1.7589, + "step": 5330 + }, + { + "epoch": 0.45, + "grad_norm": 3.0807089805603027, + "learning_rate": 1.7194782608695654e-05, + "loss": 1.8133, + "step": 5340 + }, + { + "epoch": 0.45, + "grad_norm": 0.9534027576446533, + "learning_rate": 1.7188985507246378e-05, + "loss": 1.7675, + "step": 5350 + }, + { + "epoch": 0.45, + "grad_norm": 2.3471546173095703, + "learning_rate": 1.7183188405797102e-05, + "loss": 1.4739, + "step": 5360 + }, + { + "epoch": 0.45, + "grad_norm": 2.1540451049804688, + "learning_rate": 1.7177391304347826e-05, + "loss": 1.7602, + "step": 5370 + }, + { + "epoch": 0.45, + "grad_norm": 2.6220521926879883, + "learning_rate": 1.717159420289855e-05, + "loss": 1.8812, + "step": 5380 + }, + { + "epoch": 0.45, + "grad_norm": 2.6644699573516846, + "learning_rate": 1.7165797101449278e-05, + "loss": 1.8191, + "step": 5390 + }, + { + "epoch": 0.45, + "grad_norm": 4.378482818603516, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.7765, + "step": 5400 + }, + { + "epoch": 0.45, + "grad_norm": 3.0857391357421875, + "learning_rate": 1.7154202898550726e-05, + "loss": 1.7762, + "step": 5410 + }, + { + "epoch": 0.45, + "grad_norm": 0.7650538086891174, + "learning_rate": 1.714840579710145e-05, + "loss": 1.7468, + "step": 5420 + }, + { + "epoch": 0.45, + "grad_norm": 1.3827682733535767, + "learning_rate": 1.7142608695652174e-05, + "loss": 1.8392, + "step": 5430 + }, + { + "epoch": 0.45, + "grad_norm": 7.6188459396362305, + "learning_rate": 1.71368115942029e-05, + "loss": 1.765, + "step": 5440 + }, + { + "epoch": 0.45, + "grad_norm": 1.591964602470398, + "learning_rate": 1.7131014492753625e-05, + "loss": 1.6735, + "step": 5450 + }, + { + "epoch": 0.46, + "grad_norm": 2.4797956943511963, + "learning_rate": 1.712521739130435e-05, + "loss": 1.8693, + "step": 5460 + }, + { + "epoch": 0.46, + "grad_norm": 1.759324312210083, + "learning_rate": 1.7119420289855073e-05, + "loss": 1.691, + "step": 5470 + }, + { + "epoch": 0.46, + "grad_norm": 2.8067173957824707, + "learning_rate": 1.71136231884058e-05, + "loss": 1.8131, + "step": 5480 + }, + { + "epoch": 0.46, + "grad_norm": 1.9415967464447021, + "learning_rate": 1.710782608695652e-05, + "loss": 1.8269, + "step": 5490 + }, + { + "epoch": 0.46, + "grad_norm": 3.027404308319092, + "learning_rate": 1.710202898550725e-05, + "loss": 1.8457, + "step": 5500 + }, + { + "epoch": 0.46, + "eval_loss": 1.7384296655654907, + "eval_runtime": 107.4854, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 5500 + }, + { + "epoch": 0.46, + "grad_norm": 2.0170934200286865, + "learning_rate": 1.7096231884057973e-05, + "loss": 1.7555, + "step": 5510 + }, + { + "epoch": 0.46, + "grad_norm": 1.8536728620529175, + "learning_rate": 1.7090434782608697e-05, + "loss": 1.6274, + "step": 5520 + }, + { + "epoch": 0.46, + "grad_norm": 0.8969680070877075, + "learning_rate": 1.708463768115942e-05, + "loss": 1.7977, + "step": 5530 + }, + { + "epoch": 0.46, + "grad_norm": 1.3134697675704956, + "learning_rate": 1.7078840579710145e-05, + "loss": 1.7819, + "step": 5540 + }, + { + "epoch": 0.46, + "grad_norm": 2.6828033924102783, + "learning_rate": 1.7073043478260872e-05, + "loss": 1.924, + "step": 5550 + }, + { + "epoch": 0.46, + "grad_norm": 1.980905294418335, + "learning_rate": 1.7067246376811596e-05, + "loss": 1.7677, + "step": 5560 + }, + { + "epoch": 0.46, + "grad_norm": 4.521270751953125, + "learning_rate": 1.706144927536232e-05, + "loss": 1.7319, + "step": 5570 + }, + { + "epoch": 0.47, + "grad_norm": 1.6345350742340088, + "learning_rate": 1.7055652173913044e-05, + "loss": 1.8736, + "step": 5580 + }, + { + "epoch": 0.47, + "grad_norm": 1.6569340229034424, + "learning_rate": 1.704985507246377e-05, + "loss": 1.7549, + "step": 5590 + }, + { + "epoch": 0.47, + "grad_norm": 2.3906219005584717, + "learning_rate": 1.7044057971014495e-05, + "loss": 1.8223, + "step": 5600 + }, + { + "epoch": 0.47, + "grad_norm": 1.6184358596801758, + "learning_rate": 1.703826086956522e-05, + "loss": 1.7187, + "step": 5610 + }, + { + "epoch": 0.47, + "grad_norm": 1.9662846326828003, + "learning_rate": 1.7032463768115943e-05, + "loss": 1.6826, + "step": 5620 + }, + { + "epoch": 0.47, + "grad_norm": 1.2513630390167236, + "learning_rate": 1.702666666666667e-05, + "loss": 1.8264, + "step": 5630 + }, + { + "epoch": 0.47, + "grad_norm": 2.757554769515991, + "learning_rate": 1.702086956521739e-05, + "loss": 1.7352, + "step": 5640 + }, + { + "epoch": 0.47, + "grad_norm": 0.9750083088874817, + "learning_rate": 1.7015072463768115e-05, + "loss": 1.68, + "step": 5650 + }, + { + "epoch": 0.47, + "grad_norm": 2.763793706893921, + "learning_rate": 1.7009275362318843e-05, + "loss": 1.7913, + "step": 5660 + }, + { + "epoch": 0.47, + "grad_norm": 2.929349899291992, + "learning_rate": 1.7003478260869567e-05, + "loss": 1.7713, + "step": 5670 + }, + { + "epoch": 0.47, + "grad_norm": 4.068824768066406, + "learning_rate": 1.699768115942029e-05, + "loss": 1.6546, + "step": 5680 + }, + { + "epoch": 0.47, + "grad_norm": 3.953929901123047, + "learning_rate": 1.6991884057971015e-05, + "loss": 1.8524, + "step": 5690 + }, + { + "epoch": 0.47, + "grad_norm": 1.38455069065094, + "learning_rate": 1.6986086956521742e-05, + "loss": 1.8452, + "step": 5700 + }, + { + "epoch": 0.48, + "grad_norm": 1.4616096019744873, + "learning_rate": 1.6980289855072466e-05, + "loss": 1.7802, + "step": 5710 + }, + { + "epoch": 0.48, + "grad_norm": 3.079864263534546, + "learning_rate": 1.697449275362319e-05, + "loss": 1.6293, + "step": 5720 + }, + { + "epoch": 0.48, + "grad_norm": 4.769149303436279, + "learning_rate": 1.6968695652173914e-05, + "loss": 1.7868, + "step": 5730 + }, + { + "epoch": 0.48, + "grad_norm": 3.6357669830322266, + "learning_rate": 1.6962898550724638e-05, + "loss": 1.8268, + "step": 5740 + }, + { + "epoch": 0.48, + "grad_norm": 4.242451190948486, + "learning_rate": 1.6957101449275362e-05, + "loss": 1.8289, + "step": 5750 + }, + { + "epoch": 0.48, + "grad_norm": 1.4025174379348755, + "learning_rate": 1.6951304347826086e-05, + "loss": 1.7262, + "step": 5760 + }, + { + "epoch": 0.48, + "grad_norm": 2.201425790786743, + "learning_rate": 1.6945507246376814e-05, + "loss": 1.7995, + "step": 5770 + }, + { + "epoch": 0.48, + "grad_norm": 3.1672089099884033, + "learning_rate": 1.6939710144927538e-05, + "loss": 1.7804, + "step": 5780 + }, + { + "epoch": 0.48, + "grad_norm": 1.394217848777771, + "learning_rate": 1.693391304347826e-05, + "loss": 1.6865, + "step": 5790 + }, + { + "epoch": 0.48, + "grad_norm": 4.455097198486328, + "learning_rate": 1.6928115942028986e-05, + "loss": 1.6132, + "step": 5800 + }, + { + "epoch": 0.48, + "grad_norm": 1.6794978380203247, + "learning_rate": 1.6922318840579713e-05, + "loss": 1.7146, + "step": 5810 + }, + { + "epoch": 0.48, + "grad_norm": 4.268734455108643, + "learning_rate": 1.6916521739130437e-05, + "loss": 1.8458, + "step": 5820 + }, + { + "epoch": 0.49, + "grad_norm": 0.9144909381866455, + "learning_rate": 1.691072463768116e-05, + "loss": 1.7298, + "step": 5830 + }, + { + "epoch": 0.49, + "grad_norm": 1.2349727153778076, + "learning_rate": 1.6904927536231885e-05, + "loss": 1.6982, + "step": 5840 + }, + { + "epoch": 0.49, + "grad_norm": 2.7180557250976562, + "learning_rate": 1.689913043478261e-05, + "loss": 1.8027, + "step": 5850 + }, + { + "epoch": 0.49, + "grad_norm": 3.6468281745910645, + "learning_rate": 1.6893913043478262e-05, + "loss": 1.8093, + "step": 5860 + }, + { + "epoch": 0.49, + "grad_norm": 2.979691982269287, + "learning_rate": 1.6888115942028986e-05, + "loss": 1.5826, + "step": 5870 + }, + { + "epoch": 0.49, + "grad_norm": 5.700094699859619, + "learning_rate": 1.6882318840579713e-05, + "loss": 1.6295, + "step": 5880 + }, + { + "epoch": 0.49, + "grad_norm": 5.222003936767578, + "learning_rate": 1.6876521739130437e-05, + "loss": 1.655, + "step": 5890 + }, + { + "epoch": 0.49, + "grad_norm": 1.3589376211166382, + "learning_rate": 1.687072463768116e-05, + "loss": 1.7627, + "step": 5900 + }, + { + "epoch": 0.49, + "grad_norm": 2.710470676422119, + "learning_rate": 1.6864927536231885e-05, + "loss": 1.7624, + "step": 5910 + }, + { + "epoch": 0.49, + "grad_norm": 1.3515254259109497, + "learning_rate": 1.685913043478261e-05, + "loss": 1.754, + "step": 5920 + }, + { + "epoch": 0.49, + "grad_norm": 1.581727385520935, + "learning_rate": 1.6853333333333333e-05, + "loss": 1.7327, + "step": 5930 + }, + { + "epoch": 0.49, + "grad_norm": 2.4707586765289307, + "learning_rate": 1.6847536231884057e-05, + "loss": 1.796, + "step": 5940 + }, + { + "epoch": 0.5, + "grad_norm": 2.6579630374908447, + "learning_rate": 1.6841739130434785e-05, + "loss": 1.664, + "step": 5950 + }, + { + "epoch": 0.5, + "grad_norm": 2.639225959777832, + "learning_rate": 1.683594202898551e-05, + "loss": 1.7041, + "step": 5960 + }, + { + "epoch": 0.5, + "grad_norm": 2.8888185024261475, + "learning_rate": 1.6830144927536233e-05, + "loss": 1.7493, + "step": 5970 + }, + { + "epoch": 0.5, + "grad_norm": 3.3176138401031494, + "learning_rate": 1.6824347826086957e-05, + "loss": 1.5773, + "step": 5980 + }, + { + "epoch": 0.5, + "grad_norm": 1.5189319849014282, + "learning_rate": 1.6818550724637684e-05, + "loss": 1.6852, + "step": 5990 + }, + { + "epoch": 0.5, + "grad_norm": 1.535059928894043, + "learning_rate": 1.6812753623188408e-05, + "loss": 1.7166, + "step": 6000 + }, + { + "epoch": 0.5, + "eval_loss": 1.727913498878479, + "eval_runtime": 107.4778, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 6000 + }, + { + "epoch": 0.5, + "grad_norm": 2.6467275619506836, + "learning_rate": 1.6806956521739132e-05, + "loss": 1.8415, + "step": 6010 + }, + { + "epoch": 0.5, + "grad_norm": 1.4421344995498657, + "learning_rate": 1.6801159420289856e-05, + "loss": 1.6914, + "step": 6020 + }, + { + "epoch": 0.5, + "grad_norm": 1.6661970615386963, + "learning_rate": 1.679536231884058e-05, + "loss": 1.8147, + "step": 6030 + }, + { + "epoch": 0.5, + "grad_norm": 1.5970772504806519, + "learning_rate": 1.6789565217391304e-05, + "loss": 1.6534, + "step": 6040 + }, + { + "epoch": 0.5, + "grad_norm": 3.1191587448120117, + "learning_rate": 1.678376811594203e-05, + "loss": 1.7521, + "step": 6050 + }, + { + "epoch": 0.51, + "grad_norm": 1.529994249343872, + "learning_rate": 1.6777971014492756e-05, + "loss": 1.6383, + "step": 6060 + }, + { + "epoch": 0.51, + "grad_norm": 1.4743109941482544, + "learning_rate": 1.677217391304348e-05, + "loss": 1.6432, + "step": 6070 + }, + { + "epoch": 0.51, + "grad_norm": 2.2322564125061035, + "learning_rate": 1.6766376811594204e-05, + "loss": 1.6845, + "step": 6080 + }, + { + "epoch": 0.51, + "grad_norm": 5.006726264953613, + "learning_rate": 1.6760579710144928e-05, + "loss": 1.8527, + "step": 6090 + }, + { + "epoch": 0.51, + "grad_norm": 3.6824820041656494, + "learning_rate": 1.6754782608695655e-05, + "loss": 1.7306, + "step": 6100 + }, + { + "epoch": 0.51, + "grad_norm": 1.2515430450439453, + "learning_rate": 1.674898550724638e-05, + "loss": 1.6419, + "step": 6110 + }, + { + "epoch": 0.51, + "grad_norm": 1.548213005065918, + "learning_rate": 1.6743188405797103e-05, + "loss": 1.6744, + "step": 6120 + }, + { + "epoch": 0.51, + "grad_norm": 2.4482362270355225, + "learning_rate": 1.6737391304347827e-05, + "loss": 1.8767, + "step": 6130 + }, + { + "epoch": 0.51, + "grad_norm": 2.4814817905426025, + "learning_rate": 1.673159420289855e-05, + "loss": 1.71, + "step": 6140 + }, + { + "epoch": 0.51, + "grad_norm": 1.6053385734558105, + "learning_rate": 1.672579710144928e-05, + "loss": 1.557, + "step": 6150 + }, + { + "epoch": 0.51, + "grad_norm": 2.154989004135132, + "learning_rate": 1.672e-05, + "loss": 1.7413, + "step": 6160 + }, + { + "epoch": 0.51, + "grad_norm": 3.2812576293945312, + "learning_rate": 1.6714202898550727e-05, + "loss": 1.7674, + "step": 6170 + }, + { + "epoch": 0.52, + "grad_norm": 1.1822229623794556, + "learning_rate": 1.670840579710145e-05, + "loss": 1.7635, + "step": 6180 + }, + { + "epoch": 0.52, + "grad_norm": 3.022081136703491, + "learning_rate": 1.6702608695652175e-05, + "loss": 1.9249, + "step": 6190 + }, + { + "epoch": 0.52, + "grad_norm": 1.4836503267288208, + "learning_rate": 1.66968115942029e-05, + "loss": 1.7538, + "step": 6200 + }, + { + "epoch": 0.52, + "grad_norm": 2.186819314956665, + "learning_rate": 1.6691014492753626e-05, + "loss": 1.7048, + "step": 6210 + }, + { + "epoch": 0.52, + "grad_norm": 2.98888897895813, + "learning_rate": 1.668521739130435e-05, + "loss": 1.7195, + "step": 6220 + }, + { + "epoch": 0.52, + "grad_norm": 1.2272439002990723, + "learning_rate": 1.6679420289855074e-05, + "loss": 1.7088, + "step": 6230 + }, + { + "epoch": 0.52, + "grad_norm": 5.195132255554199, + "learning_rate": 1.6673623188405798e-05, + "loss": 1.9109, + "step": 6240 + }, + { + "epoch": 0.52, + "grad_norm": 1.4767738580703735, + "learning_rate": 1.6667826086956522e-05, + "loss": 1.7119, + "step": 6250 + }, + { + "epoch": 0.52, + "grad_norm": 4.870778560638428, + "learning_rate": 1.666202898550725e-05, + "loss": 1.6744, + "step": 6260 + }, + { + "epoch": 0.52, + "grad_norm": 2.495901584625244, + "learning_rate": 1.6656231884057973e-05, + "loss": 1.7129, + "step": 6270 + }, + { + "epoch": 0.52, + "grad_norm": 3.2989490032196045, + "learning_rate": 1.6650434782608697e-05, + "loss": 1.7543, + "step": 6280 + }, + { + "epoch": 0.52, + "grad_norm": 1.2560532093048096, + "learning_rate": 1.664463768115942e-05, + "loss": 1.6504, + "step": 6290 + }, + { + "epoch": 0.53, + "grad_norm": 1.1828808784484863, + "learning_rate": 1.663884057971015e-05, + "loss": 1.8215, + "step": 6300 + }, + { + "epoch": 0.53, + "grad_norm": 1.3935143947601318, + "learning_rate": 1.663304347826087e-05, + "loss": 1.7343, + "step": 6310 + }, + { + "epoch": 0.53, + "grad_norm": 2.2186152935028076, + "learning_rate": 1.6627246376811597e-05, + "loss": 1.8291, + "step": 6320 + }, + { + "epoch": 0.53, + "grad_norm": 2.0307185649871826, + "learning_rate": 1.662144927536232e-05, + "loss": 1.7496, + "step": 6330 + }, + { + "epoch": 0.53, + "grad_norm": 1.4176080226898193, + "learning_rate": 1.6615652173913045e-05, + "loss": 1.7847, + "step": 6340 + }, + { + "epoch": 0.53, + "grad_norm": 2.357802391052246, + "learning_rate": 1.660985507246377e-05, + "loss": 1.7378, + "step": 6350 + }, + { + "epoch": 0.53, + "grad_norm": 2.5888750553131104, + "learning_rate": 1.6604057971014493e-05, + "loss": 1.7028, + "step": 6360 + }, + { + "epoch": 0.53, + "grad_norm": 1.2639836072921753, + "learning_rate": 1.659826086956522e-05, + "loss": 1.7333, + "step": 6370 + }, + { + "epoch": 0.53, + "grad_norm": 2.8823935985565186, + "learning_rate": 1.6592463768115944e-05, + "loss": 1.7696, + "step": 6380 + }, + { + "epoch": 0.53, + "grad_norm": 1.7665212154388428, + "learning_rate": 1.6586666666666668e-05, + "loss": 1.8186, + "step": 6390 + }, + { + "epoch": 0.53, + "grad_norm": 3.463416814804077, + "learning_rate": 1.6580869565217392e-05, + "loss": 1.8244, + "step": 6400 + }, + { + "epoch": 0.53, + "grad_norm": 1.7725675106048584, + "learning_rate": 1.657507246376812e-05, + "loss": 1.7399, + "step": 6410 + }, + { + "epoch": 0.54, + "grad_norm": 2.1055688858032227, + "learning_rate": 1.656927536231884e-05, + "loss": 1.6752, + "step": 6420 + }, + { + "epoch": 0.54, + "grad_norm": 3.611109972000122, + "learning_rate": 1.6563478260869568e-05, + "loss": 1.5842, + "step": 6430 + }, + { + "epoch": 0.54, + "grad_norm": 2.2504234313964844, + "learning_rate": 1.655768115942029e-05, + "loss": 1.6229, + "step": 6440 + }, + { + "epoch": 0.54, + "grad_norm": 2.377387046813965, + "learning_rate": 1.6551884057971016e-05, + "loss": 1.7606, + "step": 6450 + }, + { + "epoch": 0.54, + "grad_norm": 1.7958356142044067, + "learning_rate": 1.654608695652174e-05, + "loss": 1.7271, + "step": 6460 + }, + { + "epoch": 0.54, + "grad_norm": 1.7075881958007812, + "learning_rate": 1.6540289855072464e-05, + "loss": 1.7918, + "step": 6470 + }, + { + "epoch": 0.54, + "grad_norm": 4.526883602142334, + "learning_rate": 1.653449275362319e-05, + "loss": 1.721, + "step": 6480 + }, + { + "epoch": 0.54, + "grad_norm": 3.219193458557129, + "learning_rate": 1.6528695652173915e-05, + "loss": 1.8068, + "step": 6490 + }, + { + "epoch": 0.54, + "grad_norm": 1.7453542947769165, + "learning_rate": 1.652289855072464e-05, + "loss": 1.7639, + "step": 6500 + }, + { + "epoch": 0.54, + "eval_loss": 1.727725625038147, + "eval_runtime": 107.4666, + "eval_samples_per_second": 9.305, + "eval_steps_per_second": 2.326, + "step": 6500 + }, + { + "epoch": 0.54, + "grad_norm": 0.9024394750595093, + "learning_rate": 1.6517101449275363e-05, + "loss": 1.7855, + "step": 6510 + }, + { + "epoch": 0.54, + "grad_norm": 2.008007287979126, + "learning_rate": 1.651130434782609e-05, + "loss": 1.7057, + "step": 6520 + }, + { + "epoch": 0.54, + "grad_norm": 1.9877010583877563, + "learning_rate": 1.6505507246376814e-05, + "loss": 1.7263, + "step": 6530 + }, + { + "epoch": 0.55, + "grad_norm": 4.027751445770264, + "learning_rate": 1.6499710144927535e-05, + "loss": 1.6714, + "step": 6540 + }, + { + "epoch": 0.55, + "grad_norm": 5.0006256103515625, + "learning_rate": 1.6493913043478262e-05, + "loss": 1.6872, + "step": 6550 + }, + { + "epoch": 0.55, + "grad_norm": 3.7696921825408936, + "learning_rate": 1.6488115942028986e-05, + "loss": 1.7761, + "step": 6560 + }, + { + "epoch": 0.55, + "grad_norm": 4.299485683441162, + "learning_rate": 1.648231884057971e-05, + "loss": 1.6696, + "step": 6570 + }, + { + "epoch": 0.55, + "grad_norm": 2.5291144847869873, + "learning_rate": 1.6476521739130435e-05, + "loss": 1.6788, + "step": 6580 + }, + { + "epoch": 0.55, + "grad_norm": 4.1017231941223145, + "learning_rate": 1.6470724637681162e-05, + "loss": 1.6339, + "step": 6590 + }, + { + "epoch": 0.55, + "grad_norm": 2.975684881210327, + "learning_rate": 1.6464927536231886e-05, + "loss": 1.63, + "step": 6600 + }, + { + "epoch": 0.55, + "grad_norm": 1.5306545495986938, + "learning_rate": 1.645913043478261e-05, + "loss": 1.6622, + "step": 6610 + }, + { + "epoch": 0.55, + "grad_norm": 1.816344976425171, + "learning_rate": 1.6453333333333334e-05, + "loss": 1.8507, + "step": 6620 + }, + { + "epoch": 0.55, + "grad_norm": 3.180396556854248, + "learning_rate": 1.644753623188406e-05, + "loss": 1.6871, + "step": 6630 + }, + { + "epoch": 0.55, + "grad_norm": 1.9468295574188232, + "learning_rate": 1.6441739130434785e-05, + "loss": 1.7409, + "step": 6640 + }, + { + "epoch": 0.55, + "grad_norm": 1.6706266403198242, + "learning_rate": 1.6435942028985506e-05, + "loss": 1.6801, + "step": 6650 + }, + { + "epoch": 0.56, + "grad_norm": 1.4290724992752075, + "learning_rate": 1.6430144927536233e-05, + "loss": 1.6932, + "step": 6660 + }, + { + "epoch": 0.56, + "grad_norm": 2.8305716514587402, + "learning_rate": 1.6424347826086957e-05, + "loss": 1.8525, + "step": 6670 + }, + { + "epoch": 0.56, + "grad_norm": 1.744149088859558, + "learning_rate": 1.641855072463768e-05, + "loss": 1.6997, + "step": 6680 + }, + { + "epoch": 0.56, + "grad_norm": 3.0578601360321045, + "learning_rate": 1.6412753623188405e-05, + "loss": 1.706, + "step": 6690 + }, + { + "epoch": 0.56, + "grad_norm": 2.352912425994873, + "learning_rate": 1.6406956521739133e-05, + "loss": 1.7743, + "step": 6700 + }, + { + "epoch": 0.56, + "grad_norm": 1.978705883026123, + "learning_rate": 1.6401159420289857e-05, + "loss": 1.7195, + "step": 6710 + }, + { + "epoch": 0.56, + "grad_norm": 1.2666630744934082, + "learning_rate": 1.639536231884058e-05, + "loss": 1.6889, + "step": 6720 + }, + { + "epoch": 0.56, + "grad_norm": 1.681015133857727, + "learning_rate": 1.6389565217391305e-05, + "loss": 1.6464, + "step": 6730 + }, + { + "epoch": 0.56, + "grad_norm": 1.7917625904083252, + "learning_rate": 1.6383768115942032e-05, + "loss": 1.7322, + "step": 6740 + }, + { + "epoch": 0.56, + "grad_norm": 2.438145875930786, + "learning_rate": 1.6377971014492756e-05, + "loss": 1.6975, + "step": 6750 + }, + { + "epoch": 0.56, + "grad_norm": 2.554532766342163, + "learning_rate": 1.637217391304348e-05, + "loss": 1.7592, + "step": 6760 + }, + { + "epoch": 0.56, + "grad_norm": 3.3768386840820312, + "learning_rate": 1.6366376811594204e-05, + "loss": 1.6267, + "step": 6770 + }, + { + "epoch": 0.56, + "grad_norm": 3.807661533355713, + "learning_rate": 1.6360579710144928e-05, + "loss": 1.6814, + "step": 6780 + }, + { + "epoch": 0.57, + "grad_norm": 5.202757835388184, + "learning_rate": 1.6354782608695656e-05, + "loss": 1.7636, + "step": 6790 + }, + { + "epoch": 0.57, + "grad_norm": 4.915995121002197, + "learning_rate": 1.6348985507246376e-05, + "loss": 1.6552, + "step": 6800 + }, + { + "epoch": 0.57, + "grad_norm": 3.9319915771484375, + "learning_rate": 1.6343188405797104e-05, + "loss": 1.6542, + "step": 6810 + }, + { + "epoch": 0.57, + "grad_norm": 2.227419137954712, + "learning_rate": 1.6337391304347828e-05, + "loss": 1.771, + "step": 6820 + }, + { + "epoch": 0.57, + "grad_norm": 6.618062973022461, + "learning_rate": 1.633159420289855e-05, + "loss": 1.7122, + "step": 6830 + }, + { + "epoch": 0.57, + "grad_norm": 1.2178608179092407, + "learning_rate": 1.6325797101449276e-05, + "loss": 1.6398, + "step": 6840 + }, + { + "epoch": 0.57, + "grad_norm": 2.5877292156219482, + "learning_rate": 1.632e-05, + "loss": 1.8347, + "step": 6850 + }, + { + "epoch": 0.57, + "grad_norm": 3.831631898880005, + "learning_rate": 1.6314202898550727e-05, + "loss": 1.6153, + "step": 6860 + }, + { + "epoch": 0.57, + "grad_norm": 2.031569242477417, + "learning_rate": 1.630840579710145e-05, + "loss": 1.5382, + "step": 6870 + }, + { + "epoch": 0.57, + "grad_norm": 5.036290645599365, + "learning_rate": 1.6302608695652175e-05, + "loss": 1.7124, + "step": 6880 + }, + { + "epoch": 0.57, + "grad_norm": 3.04699444770813, + "learning_rate": 1.62968115942029e-05, + "loss": 1.8437, + "step": 6890 + }, + { + "epoch": 0.57, + "grad_norm": 1.757121205329895, + "learning_rate": 1.6291014492753626e-05, + "loss": 1.7331, + "step": 6900 + }, + { + "epoch": 0.58, + "grad_norm": 2.107645273208618, + "learning_rate": 1.628521739130435e-05, + "loss": 1.7234, + "step": 6910 + }, + { + "epoch": 0.58, + "grad_norm": 2.109172821044922, + "learning_rate": 1.6279420289855074e-05, + "loss": 1.8055, + "step": 6920 + }, + { + "epoch": 0.58, + "grad_norm": 2.344881772994995, + "learning_rate": 1.62736231884058e-05, + "loss": 1.7585, + "step": 6930 + }, + { + "epoch": 0.58, + "grad_norm": 2.4652295112609863, + "learning_rate": 1.6267826086956522e-05, + "loss": 1.7634, + "step": 6940 + }, + { + "epoch": 0.58, + "grad_norm": 3.3896074295043945, + "learning_rate": 1.6262028985507246e-05, + "loss": 1.8021, + "step": 6950 + }, + { + "epoch": 0.58, + "grad_norm": 1.5588115453720093, + "learning_rate": 1.625623188405797e-05, + "loss": 1.7534, + "step": 6960 + }, + { + "epoch": 0.58, + "grad_norm": 3.033330202102661, + "learning_rate": 1.6250434782608698e-05, + "loss": 1.6904, + "step": 6970 + }, + { + "epoch": 0.58, + "grad_norm": 1.9910866022109985, + "learning_rate": 1.6244637681159422e-05, + "loss": 1.708, + "step": 6980 + }, + { + "epoch": 0.58, + "grad_norm": 3.4834866523742676, + "learning_rate": 1.6238840579710146e-05, + "loss": 1.717, + "step": 6990 + }, + { + "epoch": 0.58, + "grad_norm": 1.1782163381576538, + "learning_rate": 1.623304347826087e-05, + "loss": 1.5833, + "step": 7000 + }, + { + "epoch": 0.58, + "eval_loss": 1.7107688188552856, + "eval_runtime": 107.4755, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 7000 + }, + { + "epoch": 0.58, + "grad_norm": 3.3755738735198975, + "learning_rate": 1.6227246376811597e-05, + "loss": 1.845, + "step": 7010 + }, + { + "epoch": 0.58, + "grad_norm": 1.7433241605758667, + "learning_rate": 1.622144927536232e-05, + "loss": 1.8534, + "step": 7020 + }, + { + "epoch": 0.59, + "grad_norm": 6.202053070068359, + "learning_rate": 1.6215652173913045e-05, + "loss": 1.6611, + "step": 7030 + }, + { + "epoch": 0.59, + "grad_norm": 1.540313720703125, + "learning_rate": 1.620985507246377e-05, + "loss": 1.7816, + "step": 7040 + }, + { + "epoch": 0.59, + "grad_norm": 2.980997085571289, + "learning_rate": 1.6204057971014497e-05, + "loss": 1.8204, + "step": 7050 + }, + { + "epoch": 0.59, + "grad_norm": 3.4950547218322754, + "learning_rate": 1.6198260869565217e-05, + "loss": 1.779, + "step": 7060 + }, + { + "epoch": 0.59, + "grad_norm": 1.1530729532241821, + "learning_rate": 1.619246376811594e-05, + "loss": 1.7435, + "step": 7070 + }, + { + "epoch": 0.59, + "grad_norm": 1.9771350622177124, + "learning_rate": 1.618666666666667e-05, + "loss": 1.65, + "step": 7080 + }, + { + "epoch": 0.59, + "grad_norm": 3.06072735786438, + "learning_rate": 1.6180869565217393e-05, + "loss": 1.741, + "step": 7090 + }, + { + "epoch": 0.59, + "grad_norm": 1.4323557615280151, + "learning_rate": 1.6175072463768117e-05, + "loss": 1.807, + "step": 7100 + }, + { + "epoch": 0.59, + "grad_norm": 1.6237205266952515, + "learning_rate": 1.616927536231884e-05, + "loss": 1.742, + "step": 7110 + }, + { + "epoch": 0.59, + "grad_norm": 2.375201463699341, + "learning_rate": 1.6163478260869568e-05, + "loss": 1.6201, + "step": 7120 + }, + { + "epoch": 0.59, + "grad_norm": 3.091128349304199, + "learning_rate": 1.6157681159420292e-05, + "loss": 1.7904, + "step": 7130 + }, + { + "epoch": 0.59, + "grad_norm": 1.4652012586593628, + "learning_rate": 1.6151884057971016e-05, + "loss": 1.6906, + "step": 7140 + }, + { + "epoch": 0.6, + "grad_norm": 2.8465373516082764, + "learning_rate": 1.614608695652174e-05, + "loss": 1.8106, + "step": 7150 + }, + { + "epoch": 0.6, + "grad_norm": 2.7708303928375244, + "learning_rate": 1.6140289855072464e-05, + "loss": 1.7154, + "step": 7160 + }, + { + "epoch": 0.6, + "grad_norm": 2.694922924041748, + "learning_rate": 1.613449275362319e-05, + "loss": 1.738, + "step": 7170 + }, + { + "epoch": 0.6, + "grad_norm": 2.2050352096557617, + "learning_rate": 1.6128695652173912e-05, + "loss": 1.7701, + "step": 7180 + }, + { + "epoch": 0.6, + "grad_norm": 3.498955249786377, + "learning_rate": 1.612289855072464e-05, + "loss": 1.7794, + "step": 7190 + }, + { + "epoch": 0.6, + "grad_norm": 1.483665108680725, + "learning_rate": 1.6117101449275364e-05, + "loss": 1.7243, + "step": 7200 + }, + { + "epoch": 0.6, + "grad_norm": 2.6146090030670166, + "learning_rate": 1.6111304347826088e-05, + "loss": 1.7547, + "step": 7210 + }, + { + "epoch": 0.6, + "grad_norm": 0.9853881597518921, + "learning_rate": 1.610550724637681e-05, + "loss": 1.6421, + "step": 7220 + }, + { + "epoch": 0.6, + "grad_norm": 1.687666654586792, + "learning_rate": 1.609971014492754e-05, + "loss": 1.6646, + "step": 7230 + }, + { + "epoch": 0.6, + "grad_norm": 1.8231513500213623, + "learning_rate": 1.6093913043478263e-05, + "loss": 1.8263, + "step": 7240 + }, + { + "epoch": 0.6, + "grad_norm": 1.8006614446640015, + "learning_rate": 1.6088115942028987e-05, + "loss": 1.6252, + "step": 7250 + }, + { + "epoch": 0.6, + "grad_norm": 1.6685547828674316, + "learning_rate": 1.608231884057971e-05, + "loss": 1.6308, + "step": 7260 + }, + { + "epoch": 0.61, + "grad_norm": 4.637762069702148, + "learning_rate": 1.6076521739130435e-05, + "loss": 1.6501, + "step": 7270 + }, + { + "epoch": 0.61, + "grad_norm": 2.1397578716278076, + "learning_rate": 1.6070724637681162e-05, + "loss": 1.7779, + "step": 7280 + }, + { + "epoch": 0.61, + "grad_norm": 2.391406536102295, + "learning_rate": 1.6064927536231883e-05, + "loss": 1.7377, + "step": 7290 + }, + { + "epoch": 0.61, + "grad_norm": 4.646698474884033, + "learning_rate": 1.605913043478261e-05, + "loss": 1.7138, + "step": 7300 + }, + { + "epoch": 0.61, + "grad_norm": 1.9890764951705933, + "learning_rate": 1.6053333333333334e-05, + "loss": 1.636, + "step": 7310 + }, + { + "epoch": 0.61, + "grad_norm": 1.770165205001831, + "learning_rate": 1.604753623188406e-05, + "loss": 1.8327, + "step": 7320 + }, + { + "epoch": 0.61, + "grad_norm": 1.544312834739685, + "learning_rate": 1.6041739130434782e-05, + "loss": 1.7628, + "step": 7330 + }, + { + "epoch": 0.61, + "grad_norm": 3.551856517791748, + "learning_rate": 1.603594202898551e-05, + "loss": 1.7112, + "step": 7340 + }, + { + "epoch": 0.61, + "grad_norm": 4.357561111450195, + "learning_rate": 1.6030144927536234e-05, + "loss": 1.8364, + "step": 7350 + }, + { + "epoch": 0.61, + "grad_norm": 8.165691375732422, + "learning_rate": 1.6024347826086958e-05, + "loss": 1.6699, + "step": 7360 + }, + { + "epoch": 0.61, + "grad_norm": 1.3608006238937378, + "learning_rate": 1.6018550724637682e-05, + "loss": 1.8786, + "step": 7370 + }, + { + "epoch": 0.61, + "grad_norm": 1.6346604824066162, + "learning_rate": 1.6012753623188406e-05, + "loss": 1.6912, + "step": 7380 + }, + { + "epoch": 0.62, + "grad_norm": 2.323948383331299, + "learning_rate": 1.6006956521739133e-05, + "loss": 1.7305, + "step": 7390 + }, + { + "epoch": 0.62, + "grad_norm": 1.645804762840271, + "learning_rate": 1.6001159420289857e-05, + "loss": 1.6702, + "step": 7400 + }, + { + "epoch": 0.62, + "grad_norm": 1.5827347040176392, + "learning_rate": 1.599536231884058e-05, + "loss": 1.6253, + "step": 7410 + }, + { + "epoch": 0.62, + "grad_norm": 2.5143661499023438, + "learning_rate": 1.5989565217391305e-05, + "loss": 1.6295, + "step": 7420 + }, + { + "epoch": 0.62, + "grad_norm": 3.025846242904663, + "learning_rate": 1.5983768115942033e-05, + "loss": 1.6297, + "step": 7430 + }, + { + "epoch": 0.62, + "grad_norm": 1.31910240650177, + "learning_rate": 1.5977971014492753e-05, + "loss": 1.66, + "step": 7440 + }, + { + "epoch": 0.62, + "grad_norm": 3.5227108001708984, + "learning_rate": 1.597217391304348e-05, + "loss": 1.5631, + "step": 7450 + }, + { + "epoch": 0.62, + "grad_norm": 2.1496291160583496, + "learning_rate": 1.5966376811594205e-05, + "loss": 1.7575, + "step": 7460 + }, + { + "epoch": 0.62, + "grad_norm": 1.5027258396148682, + "learning_rate": 1.596057971014493e-05, + "loss": 1.7693, + "step": 7470 + }, + { + "epoch": 0.62, + "grad_norm": 2.8915343284606934, + "learning_rate": 1.5954782608695653e-05, + "loss": 1.5916, + "step": 7480 + }, + { + "epoch": 0.62, + "grad_norm": 1.5063656568527222, + "learning_rate": 1.5948985507246377e-05, + "loss": 1.8137, + "step": 7490 + }, + { + "epoch": 0.62, + "grad_norm": 2.6433496475219727, + "learning_rate": 1.5943188405797104e-05, + "loss": 1.7322, + "step": 7500 + }, + { + "epoch": 0.62, + "eval_loss": 1.705659031867981, + "eval_runtime": 107.4816, + "eval_samples_per_second": 9.304, + "eval_steps_per_second": 2.326, + "step": 7500 + }, + { + "epoch": 0.63, + "grad_norm": 4.358209609985352, + "learning_rate": 1.5937391304347828e-05, + "loss": 1.7619, + "step": 7510 + }, + { + "epoch": 0.63, + "grad_norm": 4.150690078735352, + "learning_rate": 1.5931594202898552e-05, + "loss": 1.6083, + "step": 7520 + }, + { + "epoch": 0.63, + "grad_norm": 5.220200061798096, + "learning_rate": 1.5925797101449276e-05, + "loss": 1.7934, + "step": 7530 + }, + { + "epoch": 0.63, + "grad_norm": 1.5552024841308594, + "learning_rate": 1.5920000000000003e-05, + "loss": 1.7875, + "step": 7540 + }, + { + "epoch": 0.63, + "grad_norm": 2.2470431327819824, + "learning_rate": 1.5914202898550727e-05, + "loss": 1.8069, + "step": 7550 + }, + { + "epoch": 0.63, + "grad_norm": 2.775547981262207, + "learning_rate": 1.590840579710145e-05, + "loss": 1.7393, + "step": 7560 + }, + { + "epoch": 0.63, + "grad_norm": 3.7677903175354004, + "learning_rate": 1.5902608695652175e-05, + "loss": 1.7661, + "step": 7570 + }, + { + "epoch": 0.63, + "grad_norm": 1.0370768308639526, + "learning_rate": 1.58968115942029e-05, + "loss": 1.7814, + "step": 7580 + }, + { + "epoch": 0.63, + "grad_norm": 1.4217703342437744, + "learning_rate": 1.5891014492753623e-05, + "loss": 1.7731, + "step": 7590 + }, + { + "epoch": 0.63, + "grad_norm": 3.532466173171997, + "learning_rate": 1.5885217391304347e-05, + "loss": 1.68, + "step": 7600 + }, + { + "epoch": 0.63, + "grad_norm": 3.8123104572296143, + "learning_rate": 1.5879420289855075e-05, + "loss": 1.6958, + "step": 7610 + }, + { + "epoch": 0.64, + "grad_norm": 5.266079425811768, + "learning_rate": 1.58736231884058e-05, + "loss": 1.8838, + "step": 7620 + }, + { + "epoch": 0.64, + "grad_norm": 4.8042216300964355, + "learning_rate": 1.5867826086956523e-05, + "loss": 1.7128, + "step": 7630 + }, + { + "epoch": 0.64, + "grad_norm": 0.8047385215759277, + "learning_rate": 1.5862028985507247e-05, + "loss": 1.7068, + "step": 7640 + }, + { + "epoch": 0.64, + "grad_norm": 2.157292604446411, + "learning_rate": 1.5856231884057974e-05, + "loss": 1.8066, + "step": 7650 + }, + { + "epoch": 0.64, + "grad_norm": 1.3345887660980225, + "learning_rate": 1.5850434782608698e-05, + "loss": 1.6732, + "step": 7660 + }, + { + "epoch": 0.64, + "grad_norm": 1.439162254333496, + "learning_rate": 1.5844637681159422e-05, + "loss": 1.698, + "step": 7670 + }, + { + "epoch": 0.64, + "grad_norm": 4.751528263092041, + "learning_rate": 1.5838840579710146e-05, + "loss": 1.7296, + "step": 7680 + }, + { + "epoch": 0.64, + "grad_norm": 3.4270384311676025, + "learning_rate": 1.583304347826087e-05, + "loss": 1.84, + "step": 7690 + }, + { + "epoch": 0.64, + "grad_norm": 2.0358874797821045, + "learning_rate": 1.5827246376811594e-05, + "loss": 1.7292, + "step": 7700 + }, + { + "epoch": 0.64, + "grad_norm": 1.3515872955322266, + "learning_rate": 1.5821449275362318e-05, + "loss": 1.6988, + "step": 7710 + }, + { + "epoch": 0.64, + "grad_norm": 2.5109755992889404, + "learning_rate": 1.5815652173913046e-05, + "loss": 1.6356, + "step": 7720 + }, + { + "epoch": 0.64, + "grad_norm": 2.2800748348236084, + "learning_rate": 1.580985507246377e-05, + "loss": 1.6617, + "step": 7730 + }, + { + "epoch": 0.65, + "grad_norm": 7.169689178466797, + "learning_rate": 1.5804057971014494e-05, + "loss": 1.8323, + "step": 7740 + }, + { + "epoch": 0.65, + "grad_norm": 1.6964682340621948, + "learning_rate": 1.5798260869565218e-05, + "loss": 1.7671, + "step": 7750 + }, + { + "epoch": 0.65, + "grad_norm": 2.151012420654297, + "learning_rate": 1.5792463768115945e-05, + "loss": 1.7451, + "step": 7760 + }, + { + "epoch": 0.65, + "grad_norm": 1.739316463470459, + "learning_rate": 1.578666666666667e-05, + "loss": 1.7101, + "step": 7770 + }, + { + "epoch": 0.65, + "grad_norm": 1.8832248449325562, + "learning_rate": 1.5780869565217393e-05, + "loss": 1.7384, + "step": 7780 + }, + { + "epoch": 0.65, + "grad_norm": 3.235473155975342, + "learning_rate": 1.5775072463768117e-05, + "loss": 1.6944, + "step": 7790 + }, + { + "epoch": 0.65, + "grad_norm": 2.9296913146972656, + "learning_rate": 1.576927536231884e-05, + "loss": 1.7979, + "step": 7800 + }, + { + "epoch": 0.65, + "grad_norm": 3.7439193725585938, + "learning_rate": 1.576347826086957e-05, + "loss": 1.6823, + "step": 7810 + }, + { + "epoch": 0.65, + "grad_norm": 2.104919195175171, + "learning_rate": 1.575768115942029e-05, + "loss": 1.9278, + "step": 7820 + }, + { + "epoch": 0.65, + "grad_norm": 2.0970284938812256, + "learning_rate": 1.5751884057971017e-05, + "loss": 1.6475, + "step": 7830 + }, + { + "epoch": 0.65, + "grad_norm": 2.109387159347534, + "learning_rate": 1.574608695652174e-05, + "loss": 1.7482, + "step": 7840 + }, + { + "epoch": 0.65, + "grad_norm": 2.093923807144165, + "learning_rate": 1.5740289855072465e-05, + "loss": 1.708, + "step": 7850 + }, + { + "epoch": 0.66, + "grad_norm": 1.431930422782898, + "learning_rate": 1.573449275362319e-05, + "loss": 1.7585, + "step": 7860 + }, + { + "epoch": 0.66, + "grad_norm": 3.5843312740325928, + "learning_rate": 1.5728695652173916e-05, + "loss": 1.7017, + "step": 7870 + }, + { + "epoch": 0.66, + "grad_norm": 2.592013120651245, + "learning_rate": 1.572289855072464e-05, + "loss": 1.6862, + "step": 7880 + }, + { + "epoch": 0.66, + "grad_norm": 3.1581757068634033, + "learning_rate": 1.5717101449275364e-05, + "loss": 1.8041, + "step": 7890 + }, + { + "epoch": 0.66, + "grad_norm": 1.719002604484558, + "learning_rate": 1.5711304347826088e-05, + "loss": 1.7885, + "step": 7900 + }, + { + "epoch": 0.66, + "grad_norm": 1.8887274265289307, + "learning_rate": 1.5705507246376812e-05, + "loss": 1.741, + "step": 7910 + }, + { + "epoch": 0.66, + "grad_norm": 1.5597342252731323, + "learning_rate": 1.569971014492754e-05, + "loss": 1.7687, + "step": 7920 + }, + { + "epoch": 0.66, + "grad_norm": 3.3231523036956787, + "learning_rate": 1.569391304347826e-05, + "loss": 1.7038, + "step": 7930 + }, + { + "epoch": 0.66, + "grad_norm": 2.540226936340332, + "learning_rate": 1.5688115942028987e-05, + "loss": 1.8191, + "step": 7940 + }, + { + "epoch": 0.66, + "grad_norm": 2.614171266555786, + "learning_rate": 1.568231884057971e-05, + "loss": 1.5657, + "step": 7950 + }, + { + "epoch": 0.66, + "grad_norm": 3.2686009407043457, + "learning_rate": 1.5676521739130435e-05, + "loss": 1.6592, + "step": 7960 + }, + { + "epoch": 0.66, + "grad_norm": 2.4677610397338867, + "learning_rate": 1.567072463768116e-05, + "loss": 1.702, + "step": 7970 + }, + { + "epoch": 0.67, + "grad_norm": 2.712357521057129, + "learning_rate": 1.5664927536231887e-05, + "loss": 1.7096, + "step": 7980 + }, + { + "epoch": 0.67, + "grad_norm": 3.5932676792144775, + "learning_rate": 1.565913043478261e-05, + "loss": 1.6477, + "step": 7990 + }, + { + "epoch": 0.67, + "grad_norm": 2.4786651134490967, + "learning_rate": 1.5653333333333335e-05, + "loss": 1.7013, + "step": 8000 + }, + { + "epoch": 0.67, + "eval_loss": 1.751250982284546, + "eval_runtime": 107.5079, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 8000 + }, + { + "epoch": 0.67, + "grad_norm": 2.696633815765381, + "learning_rate": 1.564753623188406e-05, + "loss": 1.7176, + "step": 8010 + }, + { + "epoch": 0.67, + "grad_norm": 2.963792324066162, + "learning_rate": 1.5641739130434783e-05, + "loss": 1.7605, + "step": 8020 + }, + { + "epoch": 0.67, + "grad_norm": 2.107274055480957, + "learning_rate": 1.563594202898551e-05, + "loss": 1.5939, + "step": 8030 + }, + { + "epoch": 0.67, + "grad_norm": 3.7392399311065674, + "learning_rate": 1.5630144927536234e-05, + "loss": 1.7734, + "step": 8040 + }, + { + "epoch": 0.67, + "grad_norm": 2.943000078201294, + "learning_rate": 1.5624347826086958e-05, + "loss": 1.6703, + "step": 8050 + }, + { + "epoch": 0.67, + "grad_norm": 2.6716084480285645, + "learning_rate": 1.5618550724637682e-05, + "loss": 1.7177, + "step": 8060 + }, + { + "epoch": 0.67, + "grad_norm": 3.429185390472412, + "learning_rate": 1.561275362318841e-05, + "loss": 1.7043, + "step": 8070 + }, + { + "epoch": 0.67, + "grad_norm": 3.558910608291626, + "learning_rate": 1.560695652173913e-05, + "loss": 1.7066, + "step": 8080 + }, + { + "epoch": 0.67, + "grad_norm": 3.654998302459717, + "learning_rate": 1.5601159420289854e-05, + "loss": 1.7063, + "step": 8090 + }, + { + "epoch": 0.68, + "grad_norm": 3.6076571941375732, + "learning_rate": 1.559536231884058e-05, + "loss": 1.6818, + "step": 8100 + }, + { + "epoch": 0.68, + "grad_norm": 1.5298787355422974, + "learning_rate": 1.5589565217391306e-05, + "loss": 1.6751, + "step": 8110 + }, + { + "epoch": 0.68, + "grad_norm": 3.7017557621002197, + "learning_rate": 1.558376811594203e-05, + "loss": 1.7395, + "step": 8120 + }, + { + "epoch": 0.68, + "grad_norm": 2.02172589302063, + "learning_rate": 1.5577971014492754e-05, + "loss": 1.777, + "step": 8130 + }, + { + "epoch": 0.68, + "grad_norm": 4.633596897125244, + "learning_rate": 1.557217391304348e-05, + "loss": 1.7152, + "step": 8140 + }, + { + "epoch": 0.68, + "grad_norm": 1.3255687952041626, + "learning_rate": 1.5566376811594205e-05, + "loss": 1.6739, + "step": 8150 + }, + { + "epoch": 0.68, + "grad_norm": 2.6454014778137207, + "learning_rate": 1.556057971014493e-05, + "loss": 1.7551, + "step": 8160 + }, + { + "epoch": 0.68, + "grad_norm": 2.2562685012817383, + "learning_rate": 1.5554782608695653e-05, + "loss": 1.7421, + "step": 8170 + }, + { + "epoch": 0.68, + "grad_norm": 3.259690761566162, + "learning_rate": 1.554898550724638e-05, + "loss": 1.6761, + "step": 8180 + }, + { + "epoch": 0.68, + "grad_norm": 1.7266426086425781, + "learning_rate": 1.55431884057971e-05, + "loss": 1.8321, + "step": 8190 + }, + { + "epoch": 0.68, + "grad_norm": 1.9167896509170532, + "learning_rate": 1.5537391304347825e-05, + "loss": 1.7285, + "step": 8200 + }, + { + "epoch": 0.68, + "grad_norm": 4.338323593139648, + "learning_rate": 1.5531594202898552e-05, + "loss": 1.7145, + "step": 8210 + }, + { + "epoch": 0.69, + "grad_norm": 3.5180749893188477, + "learning_rate": 1.5525797101449276e-05, + "loss": 1.7482, + "step": 8220 + }, + { + "epoch": 0.69, + "grad_norm": 5.429786682128906, + "learning_rate": 1.552e-05, + "loss": 1.7808, + "step": 8230 + }, + { + "epoch": 0.69, + "grad_norm": 4.005053997039795, + "learning_rate": 1.5514202898550724e-05, + "loss": 1.7514, + "step": 8240 + }, + { + "epoch": 0.69, + "grad_norm": 3.007596969604492, + "learning_rate": 1.5508405797101452e-05, + "loss": 1.739, + "step": 8250 + }, + { + "epoch": 0.69, + "grad_norm": 2.8556976318359375, + "learning_rate": 1.5502608695652176e-05, + "loss": 1.6269, + "step": 8260 + }, + { + "epoch": 0.69, + "grad_norm": 3.3058815002441406, + "learning_rate": 1.54968115942029e-05, + "loss": 1.8055, + "step": 8270 + }, + { + "epoch": 0.69, + "grad_norm": 2.482654094696045, + "learning_rate": 1.5491014492753624e-05, + "loss": 1.7335, + "step": 8280 + }, + { + "epoch": 0.69, + "grad_norm": 2.104722499847412, + "learning_rate": 1.548521739130435e-05, + "loss": 1.6697, + "step": 8290 + }, + { + "epoch": 0.69, + "grad_norm": 1.4763132333755493, + "learning_rate": 1.5479420289855075e-05, + "loss": 1.7573, + "step": 8300 + }, + { + "epoch": 0.69, + "grad_norm": 2.5669054985046387, + "learning_rate": 1.5473623188405796e-05, + "loss": 1.737, + "step": 8310 + }, + { + "epoch": 0.69, + "grad_norm": 1.9973231554031372, + "learning_rate": 1.5467826086956523e-05, + "loss": 1.828, + "step": 8320 + }, + { + "epoch": 0.69, + "grad_norm": 2.6836585998535156, + "learning_rate": 1.5462028985507247e-05, + "loss": 1.6509, + "step": 8330 + }, + { + "epoch": 0.69, + "grad_norm": 1.6782662868499756, + "learning_rate": 1.545623188405797e-05, + "loss": 1.7843, + "step": 8340 + }, + { + "epoch": 0.7, + "grad_norm": 4.171507835388184, + "learning_rate": 1.5450434782608695e-05, + "loss": 1.697, + "step": 8350 + }, + { + "epoch": 0.7, + "grad_norm": 5.005525588989258, + "learning_rate": 1.5444637681159423e-05, + "loss": 1.6807, + "step": 8360 + }, + { + "epoch": 0.7, + "grad_norm": 2.1129069328308105, + "learning_rate": 1.5438840579710147e-05, + "loss": 1.6926, + "step": 8370 + }, + { + "epoch": 0.7, + "grad_norm": 2.524050235748291, + "learning_rate": 1.543304347826087e-05, + "loss": 1.7259, + "step": 8380 + }, + { + "epoch": 0.7, + "grad_norm": 3.451939105987549, + "learning_rate": 1.5427246376811595e-05, + "loss": 1.8682, + "step": 8390 + }, + { + "epoch": 0.7, + "grad_norm": 2.816455841064453, + "learning_rate": 1.542144927536232e-05, + "loss": 1.6865, + "step": 8400 + }, + { + "epoch": 0.7, + "grad_norm": 1.2041363716125488, + "learning_rate": 1.5415652173913046e-05, + "loss": 1.7065, + "step": 8410 + }, + { + "epoch": 0.7, + "grad_norm": 2.183321475982666, + "learning_rate": 1.540985507246377e-05, + "loss": 1.7404, + "step": 8420 + }, + { + "epoch": 0.7, + "grad_norm": 1.7429885864257812, + "learning_rate": 1.5404057971014494e-05, + "loss": 1.7694, + "step": 8430 + }, + { + "epoch": 0.7, + "grad_norm": 3.275554656982422, + "learning_rate": 1.5398260869565218e-05, + "loss": 1.6809, + "step": 8440 + }, + { + "epoch": 0.7, + "grad_norm": 2.737149953842163, + "learning_rate": 1.5392463768115946e-05, + "loss": 1.8422, + "step": 8450 + }, + { + "epoch": 0.7, + "grad_norm": 1.7504504919052124, + "learning_rate": 1.5386666666666666e-05, + "loss": 1.7135, + "step": 8460 + }, + { + "epoch": 0.71, + "grad_norm": 2.513226270675659, + "learning_rate": 1.5380869565217394e-05, + "loss": 1.7704, + "step": 8470 + }, + { + "epoch": 0.71, + "grad_norm": 3.151536226272583, + "learning_rate": 1.5375072463768118e-05, + "loss": 1.6754, + "step": 8480 + }, + { + "epoch": 0.71, + "grad_norm": 4.003200054168701, + "learning_rate": 1.536927536231884e-05, + "loss": 1.7605, + "step": 8490 + }, + { + "epoch": 0.71, + "grad_norm": 2.3642451763153076, + "learning_rate": 1.5363478260869566e-05, + "loss": 1.7791, + "step": 8500 + }, + { + "epoch": 0.71, + "eval_loss": 1.728246808052063, + "eval_runtime": 107.4982, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 8500 + }, + { + "epoch": 0.71, + "grad_norm": 1.0081084966659546, + "learning_rate": 1.535768115942029e-05, + "loss": 1.5997, + "step": 8510 + }, + { + "epoch": 0.71, + "grad_norm": 4.5596394538879395, + "learning_rate": 1.5351884057971017e-05, + "loss": 1.5862, + "step": 8520 + }, + { + "epoch": 0.71, + "grad_norm": 3.325129508972168, + "learning_rate": 1.534608695652174e-05, + "loss": 1.6014, + "step": 8530 + }, + { + "epoch": 0.71, + "grad_norm": 3.541696071624756, + "learning_rate": 1.5340289855072465e-05, + "loss": 1.7766, + "step": 8540 + }, + { + "epoch": 0.71, + "grad_norm": 1.9516916275024414, + "learning_rate": 1.533449275362319e-05, + "loss": 1.7535, + "step": 8550 + }, + { + "epoch": 0.71, + "grad_norm": 2.0460009574890137, + "learning_rate": 1.5328695652173916e-05, + "loss": 1.6929, + "step": 8560 + }, + { + "epoch": 0.71, + "grad_norm": 2.826079845428467, + "learning_rate": 1.5322898550724637e-05, + "loss": 1.7532, + "step": 8570 + }, + { + "epoch": 0.71, + "grad_norm": 2.370847702026367, + "learning_rate": 1.5317101449275364e-05, + "loss": 1.8819, + "step": 8580 + }, + { + "epoch": 0.72, + "grad_norm": 4.229450702667236, + "learning_rate": 1.531130434782609e-05, + "loss": 1.7454, + "step": 8590 + }, + { + "epoch": 0.72, + "grad_norm": 1.1140810251235962, + "learning_rate": 1.5305507246376812e-05, + "loss": 1.6904, + "step": 8600 + }, + { + "epoch": 0.72, + "grad_norm": 2.1529414653778076, + "learning_rate": 1.5299710144927536e-05, + "loss": 1.8351, + "step": 8610 + }, + { + "epoch": 0.72, + "grad_norm": 2.1377217769622803, + "learning_rate": 1.529391304347826e-05, + "loss": 1.7735, + "step": 8620 + }, + { + "epoch": 0.72, + "grad_norm": 2.921389102935791, + "learning_rate": 1.5288115942028988e-05, + "loss": 1.7565, + "step": 8630 + }, + { + "epoch": 0.72, + "grad_norm": 4.921605110168457, + "learning_rate": 1.5282318840579712e-05, + "loss": 1.8189, + "step": 8640 + }, + { + "epoch": 0.72, + "grad_norm": 1.4307273626327515, + "learning_rate": 1.5276521739130436e-05, + "loss": 1.8747, + "step": 8650 + }, + { + "epoch": 0.72, + "grad_norm": 3.513711452484131, + "learning_rate": 1.527072463768116e-05, + "loss": 1.7317, + "step": 8660 + }, + { + "epoch": 0.72, + "grad_norm": 1.3792582750320435, + "learning_rate": 1.5264927536231887e-05, + "loss": 1.8341, + "step": 8670 + }, + { + "epoch": 0.72, + "grad_norm": 2.2375993728637695, + "learning_rate": 1.525913043478261e-05, + "loss": 1.665, + "step": 8680 + }, + { + "epoch": 0.72, + "grad_norm": 1.7629612684249878, + "learning_rate": 1.5253333333333335e-05, + "loss": 1.7347, + "step": 8690 + }, + { + "epoch": 0.72, + "grad_norm": 1.4616599082946777, + "learning_rate": 1.524753623188406e-05, + "loss": 1.6149, + "step": 8700 + }, + { + "epoch": 0.73, + "grad_norm": 2.200507164001465, + "learning_rate": 1.5241739130434783e-05, + "loss": 1.6675, + "step": 8710 + }, + { + "epoch": 0.73, + "grad_norm": 7.254258632659912, + "learning_rate": 1.5235942028985509e-05, + "loss": 1.7182, + "step": 8720 + }, + { + "epoch": 0.73, + "grad_norm": 1.435335636138916, + "learning_rate": 1.5230144927536233e-05, + "loss": 1.738, + "step": 8730 + }, + { + "epoch": 0.73, + "grad_norm": 2.216738224029541, + "learning_rate": 1.5224347826086959e-05, + "loss": 1.6982, + "step": 8740 + }, + { + "epoch": 0.73, + "grad_norm": 2.1532115936279297, + "learning_rate": 1.5218550724637681e-05, + "loss": 1.6887, + "step": 8750 + }, + { + "epoch": 0.73, + "grad_norm": 2.216334581375122, + "learning_rate": 1.5212753623188408e-05, + "loss": 1.7531, + "step": 8760 + }, + { + "epoch": 0.73, + "grad_norm": 1.3834766149520874, + "learning_rate": 1.520695652173913e-05, + "loss": 1.7209, + "step": 8770 + }, + { + "epoch": 0.73, + "grad_norm": 3.171159267425537, + "learning_rate": 1.5201159420289856e-05, + "loss": 1.7489, + "step": 8780 + }, + { + "epoch": 0.73, + "grad_norm": 2.972515344619751, + "learning_rate": 1.519536231884058e-05, + "loss": 1.641, + "step": 8790 + }, + { + "epoch": 0.73, + "grad_norm": 1.618605136871338, + "learning_rate": 1.5189565217391306e-05, + "loss": 1.7289, + "step": 8800 + }, + { + "epoch": 0.73, + "grad_norm": 1.3899489641189575, + "learning_rate": 1.518376811594203e-05, + "loss": 1.7706, + "step": 8810 + }, + { + "epoch": 0.73, + "grad_norm": 3.194946527481079, + "learning_rate": 1.5177971014492754e-05, + "loss": 1.8283, + "step": 8820 + }, + { + "epoch": 0.74, + "grad_norm": 3.15311598777771, + "learning_rate": 1.517217391304348e-05, + "loss": 1.704, + "step": 8830 + }, + { + "epoch": 0.74, + "grad_norm": 1.4149380922317505, + "learning_rate": 1.5166376811594204e-05, + "loss": 1.6738, + "step": 8840 + }, + { + "epoch": 0.74, + "grad_norm": 3.4329111576080322, + "learning_rate": 1.516057971014493e-05, + "loss": 1.7411, + "step": 8850 + }, + { + "epoch": 0.74, + "grad_norm": 3.406054973602295, + "learning_rate": 1.5154782608695654e-05, + "loss": 1.8027, + "step": 8860 + }, + { + "epoch": 0.74, + "grad_norm": 4.0179643630981445, + "learning_rate": 1.514898550724638e-05, + "loss": 1.7083, + "step": 8870 + }, + { + "epoch": 0.74, + "grad_norm": 2.3201992511749268, + "learning_rate": 1.5143188405797103e-05, + "loss": 1.6307, + "step": 8880 + }, + { + "epoch": 0.74, + "grad_norm": 4.243584632873535, + "learning_rate": 1.5137391304347829e-05, + "loss": 1.7504, + "step": 8890 + }, + { + "epoch": 0.74, + "grad_norm": 1.8807439804077148, + "learning_rate": 1.5131594202898551e-05, + "loss": 1.7607, + "step": 8900 + }, + { + "epoch": 0.74, + "grad_norm": 3.392615556716919, + "learning_rate": 1.5125797101449277e-05, + "loss": 1.6733, + "step": 8910 + }, + { + "epoch": 0.74, + "grad_norm": 2.388437032699585, + "learning_rate": 1.5120000000000001e-05, + "loss": 1.6187, + "step": 8920 + }, + { + "epoch": 0.74, + "grad_norm": 7.407155990600586, + "learning_rate": 1.5114202898550725e-05, + "loss": 1.6473, + "step": 8930 + }, + { + "epoch": 0.74, + "grad_norm": 3.9519150257110596, + "learning_rate": 1.510840579710145e-05, + "loss": 1.7244, + "step": 8940 + }, + { + "epoch": 0.75, + "grad_norm": 3.5977227687835693, + "learning_rate": 1.5102608695652175e-05, + "loss": 1.7157, + "step": 8950 + }, + { + "epoch": 0.75, + "grad_norm": 5.951735019683838, + "learning_rate": 1.50968115942029e-05, + "loss": 1.7765, + "step": 8960 + }, + { + "epoch": 0.75, + "grad_norm": 2.0863535404205322, + "learning_rate": 1.5091014492753624e-05, + "loss": 1.5254, + "step": 8970 + }, + { + "epoch": 0.75, + "grad_norm": 1.2637622356414795, + "learning_rate": 1.508521739130435e-05, + "loss": 1.808, + "step": 8980 + }, + { + "epoch": 0.75, + "grad_norm": 2.3032875061035156, + "learning_rate": 1.5079420289855074e-05, + "loss": 1.7458, + "step": 8990 + }, + { + "epoch": 0.75, + "grad_norm": 3.13789701461792, + "learning_rate": 1.50736231884058e-05, + "loss": 1.6349, + "step": 9000 + }, + { + "epoch": 0.75, + "eval_loss": 1.7237039804458618, + "eval_runtime": 107.5052, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 9000 + }, + { + "epoch": 0.75, + "grad_norm": 1.9929821491241455, + "learning_rate": 1.5067826086956524e-05, + "loss": 1.7925, + "step": 9010 + }, + { + "epoch": 0.75, + "grad_norm": 5.004290580749512, + "learning_rate": 1.5062028985507246e-05, + "loss": 1.6467, + "step": 9020 + }, + { + "epoch": 0.75, + "grad_norm": 1.387837529182434, + "learning_rate": 1.5056231884057972e-05, + "loss": 1.6188, + "step": 9030 + }, + { + "epoch": 0.75, + "grad_norm": 2.9127049446105957, + "learning_rate": 1.5050434782608696e-05, + "loss": 1.6898, + "step": 9040 + }, + { + "epoch": 0.75, + "grad_norm": 2.460002899169922, + "learning_rate": 1.5044637681159421e-05, + "loss": 1.7306, + "step": 9050 + }, + { + "epoch": 0.76, + "grad_norm": 6.018058776855469, + "learning_rate": 1.5038840579710145e-05, + "loss": 1.6259, + "step": 9060 + }, + { + "epoch": 0.76, + "grad_norm": 1.8971534967422485, + "learning_rate": 1.5033043478260871e-05, + "loss": 1.6011, + "step": 9070 + }, + { + "epoch": 0.76, + "grad_norm": 4.7335357666015625, + "learning_rate": 1.5027246376811595e-05, + "loss": 1.7402, + "step": 9080 + }, + { + "epoch": 0.76, + "grad_norm": 2.477769613265991, + "learning_rate": 1.5021449275362321e-05, + "loss": 1.7765, + "step": 9090 + }, + { + "epoch": 0.76, + "grad_norm": 2.670304775238037, + "learning_rate": 1.5015652173913045e-05, + "loss": 1.8922, + "step": 9100 + }, + { + "epoch": 0.76, + "grad_norm": 3.747793197631836, + "learning_rate": 1.500985507246377e-05, + "loss": 1.6625, + "step": 9110 + }, + { + "epoch": 0.76, + "grad_norm": 3.719919204711914, + "learning_rate": 1.5004057971014495e-05, + "loss": 1.7383, + "step": 9120 + }, + { + "epoch": 0.76, + "grad_norm": 2.1044466495513916, + "learning_rate": 1.4998260869565217e-05, + "loss": 1.8098, + "step": 9130 + }, + { + "epoch": 0.76, + "grad_norm": 2.7055985927581787, + "learning_rate": 1.4992463768115944e-05, + "loss": 1.4545, + "step": 9140 + }, + { + "epoch": 0.76, + "grad_norm": 0.7693852186203003, + "learning_rate": 1.4986666666666667e-05, + "loss": 1.6745, + "step": 9150 + }, + { + "epoch": 0.76, + "grad_norm": 2.0475828647613525, + "learning_rate": 1.4980869565217392e-05, + "loss": 1.5843, + "step": 9160 + }, + { + "epoch": 0.76, + "grad_norm": 3.5320639610290527, + "learning_rate": 1.4975072463768116e-05, + "loss": 1.6735, + "step": 9170 + }, + { + "epoch": 0.77, + "grad_norm": 3.41857647895813, + "learning_rate": 1.4969275362318842e-05, + "loss": 1.6797, + "step": 9180 + }, + { + "epoch": 0.77, + "grad_norm": 2.8437910079956055, + "learning_rate": 1.4963478260869566e-05, + "loss": 1.7409, + "step": 9190 + }, + { + "epoch": 0.77, + "grad_norm": 3.104484796524048, + "learning_rate": 1.4957681159420292e-05, + "loss": 1.5646, + "step": 9200 + }, + { + "epoch": 0.77, + "grad_norm": 3.2742903232574463, + "learning_rate": 1.4951884057971016e-05, + "loss": 1.8273, + "step": 9210 + }, + { + "epoch": 0.77, + "grad_norm": 3.759800434112549, + "learning_rate": 1.494608695652174e-05, + "loss": 1.5638, + "step": 9220 + }, + { + "epoch": 0.77, + "grad_norm": 1.4003862142562866, + "learning_rate": 1.4940289855072465e-05, + "loss": 1.8205, + "step": 9230 + }, + { + "epoch": 0.77, + "grad_norm": 3.187140703201294, + "learning_rate": 1.493449275362319e-05, + "loss": 1.6775, + "step": 9240 + }, + { + "epoch": 0.77, + "grad_norm": 3.953629493713379, + "learning_rate": 1.4928695652173915e-05, + "loss": 1.7948, + "step": 9250 + }, + { + "epoch": 0.77, + "grad_norm": 3.153158664703369, + "learning_rate": 1.4922898550724637e-05, + "loss": 1.7641, + "step": 9260 + }, + { + "epoch": 0.77, + "grad_norm": 3.8816325664520264, + "learning_rate": 1.4917101449275365e-05, + "loss": 1.7967, + "step": 9270 + }, + { + "epoch": 0.77, + "grad_norm": 2.1909523010253906, + "learning_rate": 1.4911304347826087e-05, + "loss": 1.7295, + "step": 9280 + }, + { + "epoch": 0.77, + "grad_norm": 3.3025991916656494, + "learning_rate": 1.4905507246376813e-05, + "loss": 1.7594, + "step": 9290 + }, + { + "epoch": 0.78, + "grad_norm": 4.461934566497803, + "learning_rate": 1.4899710144927537e-05, + "loss": 1.8171, + "step": 9300 + }, + { + "epoch": 0.78, + "grad_norm": 4.337406158447266, + "learning_rate": 1.4893913043478263e-05, + "loss": 1.6085, + "step": 9310 + }, + { + "epoch": 0.78, + "grad_norm": 1.752158284187317, + "learning_rate": 1.4888115942028987e-05, + "loss": 1.6972, + "step": 9320 + }, + { + "epoch": 0.78, + "grad_norm": 2.966576099395752, + "learning_rate": 1.488231884057971e-05, + "loss": 1.6919, + "step": 9330 + }, + { + "epoch": 0.78, + "grad_norm": 2.894684076309204, + "learning_rate": 1.4876521739130436e-05, + "loss": 1.4976, + "step": 9340 + }, + { + "epoch": 0.78, + "grad_norm": 1.2293604612350464, + "learning_rate": 1.487072463768116e-05, + "loss": 1.6831, + "step": 9350 + }, + { + "epoch": 0.78, + "grad_norm": 1.8790899515151978, + "learning_rate": 1.4864927536231886e-05, + "loss": 1.7635, + "step": 9360 + }, + { + "epoch": 0.78, + "grad_norm": 2.168088436126709, + "learning_rate": 1.485913043478261e-05, + "loss": 1.6389, + "step": 9370 + }, + { + "epoch": 0.78, + "grad_norm": 1.0390825271606445, + "learning_rate": 1.4853333333333336e-05, + "loss": 1.6824, + "step": 9380 + }, + { + "epoch": 0.78, + "grad_norm": 2.15370512008667, + "learning_rate": 1.4847536231884058e-05, + "loss": 1.6074, + "step": 9390 + }, + { + "epoch": 0.78, + "grad_norm": 2.2792294025421143, + "learning_rate": 1.4841739130434785e-05, + "loss": 1.6977, + "step": 9400 + }, + { + "epoch": 0.78, + "grad_norm": 3.7052159309387207, + "learning_rate": 1.4835942028985508e-05, + "loss": 1.6734, + "step": 9410 + }, + { + "epoch": 0.79, + "grad_norm": 0.8690690994262695, + "learning_rate": 1.4830144927536233e-05, + "loss": 1.7114, + "step": 9420 + }, + { + "epoch": 0.79, + "grad_norm": 1.2574138641357422, + "learning_rate": 1.4824347826086957e-05, + "loss": 1.4424, + "step": 9430 + }, + { + "epoch": 0.79, + "grad_norm": 0.904484748840332, + "learning_rate": 1.4818550724637681e-05, + "loss": 1.7067, + "step": 9440 + }, + { + "epoch": 0.79, + "grad_norm": 2.294973611831665, + "learning_rate": 1.4812753623188407e-05, + "loss": 1.8665, + "step": 9450 + }, + { + "epoch": 0.79, + "grad_norm": 4.394903182983398, + "learning_rate": 1.4806956521739131e-05, + "loss": 1.5927, + "step": 9460 + }, + { + "epoch": 0.79, + "grad_norm": 2.6690807342529297, + "learning_rate": 1.4801159420289857e-05, + "loss": 1.6617, + "step": 9470 + }, + { + "epoch": 0.79, + "grad_norm": 1.6983529329299927, + "learning_rate": 1.479536231884058e-05, + "loss": 1.7189, + "step": 9480 + }, + { + "epoch": 0.79, + "grad_norm": 2.5802037715911865, + "learning_rate": 1.4789565217391307e-05, + "loss": 1.4436, + "step": 9490 + }, + { + "epoch": 0.79, + "grad_norm": 3.2202868461608887, + "learning_rate": 1.478376811594203e-05, + "loss": 1.8035, + "step": 9500 + }, + { + "epoch": 0.79, + "eval_loss": 1.7105869054794312, + "eval_runtime": 107.5846, + "eval_samples_per_second": 9.295, + "eval_steps_per_second": 2.324, + "step": 9500 + }, + { + "epoch": 0.79, + "grad_norm": 3.4393227100372314, + "learning_rate": 1.4777971014492756e-05, + "loss": 1.5999, + "step": 9510 + }, + { + "epoch": 0.79, + "grad_norm": 2.443208694458008, + "learning_rate": 1.4772173913043479e-05, + "loss": 1.6505, + "step": 9520 + }, + { + "epoch": 0.79, + "grad_norm": 5.468209743499756, + "learning_rate": 1.4766376811594203e-05, + "loss": 1.6897, + "step": 9530 + }, + { + "epoch": 0.8, + "grad_norm": 1.3215961456298828, + "learning_rate": 1.4760579710144928e-05, + "loss": 1.8336, + "step": 9540 + }, + { + "epoch": 0.8, + "grad_norm": 3.739179849624634, + "learning_rate": 1.4754782608695652e-05, + "loss": 1.8085, + "step": 9550 + }, + { + "epoch": 0.8, + "grad_norm": 1.8474160432815552, + "learning_rate": 1.4748985507246378e-05, + "loss": 1.8288, + "step": 9560 + }, + { + "epoch": 0.8, + "grad_norm": 2.9103896617889404, + "learning_rate": 1.4743188405797102e-05, + "loss": 1.7058, + "step": 9570 + }, + { + "epoch": 0.8, + "grad_norm": 1.432541847229004, + "learning_rate": 1.4737391304347828e-05, + "loss": 1.7497, + "step": 9580 + }, + { + "epoch": 0.8, + "grad_norm": 2.380267381668091, + "learning_rate": 1.4731594202898552e-05, + "loss": 1.6908, + "step": 9590 + }, + { + "epoch": 0.8, + "grad_norm": 1.8983180522918701, + "learning_rate": 1.4725797101449277e-05, + "loss": 1.7913, + "step": 9600 + }, + { + "epoch": 0.8, + "grad_norm": 1.5178720951080322, + "learning_rate": 1.4720000000000001e-05, + "loss": 1.6481, + "step": 9610 + }, + { + "epoch": 0.8, + "grad_norm": 4.430023670196533, + "learning_rate": 1.4714202898550727e-05, + "loss": 1.7796, + "step": 9620 + }, + { + "epoch": 0.8, + "grad_norm": 1.9804303646087646, + "learning_rate": 1.4708405797101451e-05, + "loss": 1.7676, + "step": 9630 + }, + { + "epoch": 0.8, + "grad_norm": 4.044782638549805, + "learning_rate": 1.4702608695652173e-05, + "loss": 1.6331, + "step": 9640 + }, + { + "epoch": 0.8, + "grad_norm": 3.2377848625183105, + "learning_rate": 1.46968115942029e-05, + "loss": 1.6992, + "step": 9650 + }, + { + "epoch": 0.81, + "grad_norm": 2.13010835647583, + "learning_rate": 1.4691014492753623e-05, + "loss": 1.7363, + "step": 9660 + }, + { + "epoch": 0.81, + "grad_norm": 2.638706684112549, + "learning_rate": 1.4685217391304349e-05, + "loss": 1.6856, + "step": 9670 + }, + { + "epoch": 0.81, + "grad_norm": 1.5270848274230957, + "learning_rate": 1.4679420289855073e-05, + "loss": 1.694, + "step": 9680 + }, + { + "epoch": 0.81, + "grad_norm": 2.1668355464935303, + "learning_rate": 1.4673623188405798e-05, + "loss": 1.5912, + "step": 9690 + }, + { + "epoch": 0.81, + "grad_norm": 1.8836257457733154, + "learning_rate": 1.466840579710145e-05, + "loss": 1.7058, + "step": 9700 + }, + { + "epoch": 0.81, + "grad_norm": 1.6809861660003662, + "learning_rate": 1.4662608695652174e-05, + "loss": 1.6679, + "step": 9710 + }, + { + "epoch": 0.81, + "grad_norm": 3.5727224349975586, + "learning_rate": 1.46568115942029e-05, + "loss": 1.7444, + "step": 9720 + }, + { + "epoch": 0.81, + "grad_norm": 5.230935573577881, + "learning_rate": 1.4651014492753623e-05, + "loss": 1.649, + "step": 9730 + }, + { + "epoch": 0.81, + "grad_norm": 2.9964046478271484, + "learning_rate": 1.464521739130435e-05, + "loss": 1.6816, + "step": 9740 + }, + { + "epoch": 0.81, + "grad_norm": 2.299757480621338, + "learning_rate": 1.4639420289855073e-05, + "loss": 1.7576, + "step": 9750 + }, + { + "epoch": 0.81, + "grad_norm": 2.057912588119507, + "learning_rate": 1.4633623188405799e-05, + "loss": 1.7915, + "step": 9760 + }, + { + "epoch": 0.81, + "grad_norm": 2.693455934524536, + "learning_rate": 1.4627826086956523e-05, + "loss": 1.7852, + "step": 9770 + }, + { + "epoch": 0.81, + "grad_norm": 1.2792394161224365, + "learning_rate": 1.4622028985507249e-05, + "loss": 1.7702, + "step": 9780 + }, + { + "epoch": 0.82, + "grad_norm": 1.9166826009750366, + "learning_rate": 1.4616231884057973e-05, + "loss": 1.5197, + "step": 9790 + }, + { + "epoch": 0.82, + "grad_norm": 4.76140832901001, + "learning_rate": 1.4610434782608698e-05, + "loss": 1.7217, + "step": 9800 + }, + { + "epoch": 0.82, + "grad_norm": 1.99151611328125, + "learning_rate": 1.4604637681159422e-05, + "loss": 1.8153, + "step": 9810 + }, + { + "epoch": 0.82, + "grad_norm": 1.7407947778701782, + "learning_rate": 1.4598840579710145e-05, + "loss": 1.7537, + "step": 9820 + }, + { + "epoch": 0.82, + "grad_norm": 4.095398426055908, + "learning_rate": 1.459304347826087e-05, + "loss": 1.7267, + "step": 9830 + }, + { + "epoch": 0.82, + "grad_norm": 2.100191593170166, + "learning_rate": 1.4587246376811594e-05, + "loss": 1.6822, + "step": 9840 + }, + { + "epoch": 0.82, + "grad_norm": 1.774409294128418, + "learning_rate": 1.458144927536232e-05, + "loss": 1.7082, + "step": 9850 + }, + { + "epoch": 0.82, + "grad_norm": 1.7792176008224487, + "learning_rate": 1.4575652173913044e-05, + "loss": 1.6193, + "step": 9860 + }, + { + "epoch": 0.82, + "grad_norm": 2.539458751678467, + "learning_rate": 1.456985507246377e-05, + "loss": 1.6587, + "step": 9870 + }, + { + "epoch": 0.82, + "grad_norm": 2.321563243865967, + "learning_rate": 1.4564057971014494e-05, + "loss": 1.7235, + "step": 9880 + }, + { + "epoch": 0.82, + "grad_norm": 1.7294243574142456, + "learning_rate": 1.455826086956522e-05, + "loss": 1.7623, + "step": 9890 + }, + { + "epoch": 0.82, + "grad_norm": 3.641298532485962, + "learning_rate": 1.4552463768115943e-05, + "loss": 1.7657, + "step": 9900 + }, + { + "epoch": 0.83, + "grad_norm": 1.3333618640899658, + "learning_rate": 1.4546666666666669e-05, + "loss": 1.6669, + "step": 9910 + }, + { + "epoch": 0.83, + "grad_norm": 3.8383522033691406, + "learning_rate": 1.4540869565217393e-05, + "loss": 1.7376, + "step": 9920 + }, + { + "epoch": 0.83, + "grad_norm": 13.071063995361328, + "learning_rate": 1.4535072463768115e-05, + "loss": 1.6751, + "step": 9930 + }, + { + "epoch": 0.83, + "grad_norm": 2.008157730102539, + "learning_rate": 1.4529275362318843e-05, + "loss": 1.6904, + "step": 9940 + }, + { + "epoch": 0.83, + "grad_norm": 6.280172824859619, + "learning_rate": 1.4523478260869565e-05, + "loss": 1.5522, + "step": 9950 + }, + { + "epoch": 0.83, + "grad_norm": 4.36051607131958, + "learning_rate": 1.4517681159420291e-05, + "loss": 1.6922, + "step": 9960 + }, + { + "epoch": 0.83, + "grad_norm": 4.0582804679870605, + "learning_rate": 1.4511884057971015e-05, + "loss": 1.7, + "step": 9970 + }, + { + "epoch": 0.83, + "grad_norm": 2.5071215629577637, + "learning_rate": 1.450608695652174e-05, + "loss": 1.6065, + "step": 9980 + }, + { + "epoch": 0.83, + "grad_norm": 4.5151567459106445, + "learning_rate": 1.4500289855072465e-05, + "loss": 1.646, + "step": 9990 + }, + { + "epoch": 0.83, + "grad_norm": 4.011928081512451, + "learning_rate": 1.449449275362319e-05, + "loss": 1.8767, + "step": 10000 + }, + { + "epoch": 0.83, + "eval_loss": 1.7204252481460571, + "eval_runtime": 107.4981, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 10000 + }, + { + "epoch": 0.83, + "grad_norm": 0.8452903032302856, + "learning_rate": 1.4488695652173914e-05, + "loss": 1.6305, + "step": 10010 + }, + { + "epoch": 0.83, + "grad_norm": 1.8112026453018188, + "learning_rate": 1.4482898550724638e-05, + "loss": 1.6268, + "step": 10020 + }, + { + "epoch": 0.84, + "grad_norm": 0.9594013094902039, + "learning_rate": 1.4477101449275364e-05, + "loss": 1.4332, + "step": 10030 + }, + { + "epoch": 0.84, + "grad_norm": 3.1587038040161133, + "learning_rate": 1.4471304347826088e-05, + "loss": 1.8313, + "step": 10040 + }, + { + "epoch": 0.84, + "grad_norm": 2.3806848526000977, + "learning_rate": 1.4465507246376814e-05, + "loss": 1.6507, + "step": 10050 + }, + { + "epoch": 0.84, + "grad_norm": 3.119511127471924, + "learning_rate": 1.4459710144927536e-05, + "loss": 1.8907, + "step": 10060 + }, + { + "epoch": 0.84, + "grad_norm": 2.085169553756714, + "learning_rate": 1.4453913043478263e-05, + "loss": 1.7827, + "step": 10070 + }, + { + "epoch": 0.84, + "grad_norm": 2.273642063140869, + "learning_rate": 1.4448115942028986e-05, + "loss": 1.8521, + "step": 10080 + }, + { + "epoch": 0.84, + "grad_norm": 2.0806262493133545, + "learning_rate": 1.4442318840579711e-05, + "loss": 1.7231, + "step": 10090 + }, + { + "epoch": 0.84, + "grad_norm": 1.6042985916137695, + "learning_rate": 1.4436521739130435e-05, + "loss": 1.6173, + "step": 10100 + }, + { + "epoch": 0.84, + "grad_norm": 2.670179843902588, + "learning_rate": 1.4430724637681161e-05, + "loss": 1.8075, + "step": 10110 + }, + { + "epoch": 0.84, + "grad_norm": 4.045022010803223, + "learning_rate": 1.4424927536231885e-05, + "loss": 1.7247, + "step": 10120 + }, + { + "epoch": 0.84, + "grad_norm": 1.5309295654296875, + "learning_rate": 1.4419130434782609e-05, + "loss": 1.7216, + "step": 10130 + }, + { + "epoch": 0.84, + "grad_norm": 2.8289453983306885, + "learning_rate": 1.4413333333333335e-05, + "loss": 1.7299, + "step": 10140 + }, + { + "epoch": 0.85, + "grad_norm": 2.870276927947998, + "learning_rate": 1.4407536231884059e-05, + "loss": 1.6632, + "step": 10150 + }, + { + "epoch": 0.85, + "grad_norm": 1.4293287992477417, + "learning_rate": 1.4401739130434785e-05, + "loss": 1.7234, + "step": 10160 + }, + { + "epoch": 0.85, + "grad_norm": 2.1435534954071045, + "learning_rate": 1.4395942028985509e-05, + "loss": 1.7715, + "step": 10170 + }, + { + "epoch": 0.85, + "grad_norm": 1.4607239961624146, + "learning_rate": 1.4390144927536234e-05, + "loss": 1.728, + "step": 10180 + }, + { + "epoch": 0.85, + "grad_norm": 2.555734872817993, + "learning_rate": 1.4384347826086957e-05, + "loss": 1.8154, + "step": 10190 + }, + { + "epoch": 0.85, + "grad_norm": 2.3336338996887207, + "learning_rate": 1.4378550724637684e-05, + "loss": 1.6378, + "step": 10200 + }, + { + "epoch": 0.85, + "grad_norm": 1.9876394271850586, + "learning_rate": 1.4372753623188406e-05, + "loss": 1.6636, + "step": 10210 + }, + { + "epoch": 0.85, + "grad_norm": 1.9591132402420044, + "learning_rate": 1.4366956521739132e-05, + "loss": 1.7456, + "step": 10220 + }, + { + "epoch": 0.85, + "grad_norm": 2.272648572921753, + "learning_rate": 1.4361159420289856e-05, + "loss": 1.8266, + "step": 10230 + }, + { + "epoch": 0.85, + "grad_norm": 6.858809947967529, + "learning_rate": 1.435536231884058e-05, + "loss": 1.6315, + "step": 10240 + }, + { + "epoch": 0.85, + "grad_norm": 0.7406672835350037, + "learning_rate": 1.4349565217391306e-05, + "loss": 1.4722, + "step": 10250 + }, + { + "epoch": 0.85, + "grad_norm": 1.0749115943908691, + "learning_rate": 1.434376811594203e-05, + "loss": 1.8539, + "step": 10260 + }, + { + "epoch": 0.86, + "grad_norm": 2.3175392150878906, + "learning_rate": 1.4337971014492755e-05, + "loss": 1.7049, + "step": 10270 + }, + { + "epoch": 0.86, + "grad_norm": 1.374159812927246, + "learning_rate": 1.433217391304348e-05, + "loss": 1.7605, + "step": 10280 + }, + { + "epoch": 0.86, + "grad_norm": 1.9364595413208008, + "learning_rate": 1.4326376811594205e-05, + "loss": 1.6288, + "step": 10290 + }, + { + "epoch": 0.86, + "grad_norm": 5.389453887939453, + "learning_rate": 1.4320579710144929e-05, + "loss": 1.6014, + "step": 10300 + }, + { + "epoch": 0.86, + "grad_norm": 3.7157788276672363, + "learning_rate": 1.4314782608695655e-05, + "loss": 1.738, + "step": 10310 + }, + { + "epoch": 0.86, + "grad_norm": 2.8316123485565186, + "learning_rate": 1.4308985507246377e-05, + "loss": 1.727, + "step": 10320 + }, + { + "epoch": 0.86, + "grad_norm": 3.3950917720794678, + "learning_rate": 1.4303188405797101e-05, + "loss": 1.6789, + "step": 10330 + }, + { + "epoch": 0.86, + "grad_norm": 4.218912601470947, + "learning_rate": 1.4297391304347827e-05, + "loss": 1.8176, + "step": 10340 + }, + { + "epoch": 0.86, + "grad_norm": 1.7928733825683594, + "learning_rate": 1.429159420289855e-05, + "loss": 1.6744, + "step": 10350 + }, + { + "epoch": 0.86, + "grad_norm": 1.1876684427261353, + "learning_rate": 1.4285797101449276e-05, + "loss": 1.6813, + "step": 10360 + }, + { + "epoch": 0.86, + "grad_norm": 1.4012842178344727, + "learning_rate": 1.428e-05, + "loss": 1.7983, + "step": 10370 + }, + { + "epoch": 0.86, + "grad_norm": 6.704860210418701, + "learning_rate": 1.4274202898550726e-05, + "loss": 1.6289, + "step": 10380 + }, + { + "epoch": 0.87, + "grad_norm": 1.7611334323883057, + "learning_rate": 1.426840579710145e-05, + "loss": 1.6068, + "step": 10390 + }, + { + "epoch": 0.87, + "grad_norm": 5.28679084777832, + "learning_rate": 1.4262608695652176e-05, + "loss": 1.644, + "step": 10400 + }, + { + "epoch": 0.87, + "grad_norm": 3.1040380001068115, + "learning_rate": 1.42568115942029e-05, + "loss": 1.8251, + "step": 10410 + }, + { + "epoch": 0.87, + "grad_norm": 2.5401835441589355, + "learning_rate": 1.4251014492753626e-05, + "loss": 1.6375, + "step": 10420 + }, + { + "epoch": 0.87, + "grad_norm": 3.4024248123168945, + "learning_rate": 1.424521739130435e-05, + "loss": 1.6935, + "step": 10430 + }, + { + "epoch": 0.87, + "grad_norm": 4.5380167961120605, + "learning_rate": 1.4239420289855072e-05, + "loss": 1.8278, + "step": 10440 + }, + { + "epoch": 0.87, + "grad_norm": 2.322249412536621, + "learning_rate": 1.42336231884058e-05, + "loss": 1.6748, + "step": 10450 + }, + { + "epoch": 0.87, + "grad_norm": 2.3199710845947266, + "learning_rate": 1.4227826086956522e-05, + "loss": 1.7673, + "step": 10460 + }, + { + "epoch": 0.87, + "grad_norm": 1.2281827926635742, + "learning_rate": 1.4222028985507247e-05, + "loss": 1.7763, + "step": 10470 + }, + { + "epoch": 0.87, + "grad_norm": 1.571286678314209, + "learning_rate": 1.4216231884057971e-05, + "loss": 1.5969, + "step": 10480 + }, + { + "epoch": 0.87, + "grad_norm": 4.81419038772583, + "learning_rate": 1.4210434782608697e-05, + "loss": 1.7263, + "step": 10490 + }, + { + "epoch": 0.88, + "grad_norm": 2.55985951423645, + "learning_rate": 1.4204637681159421e-05, + "loss": 1.7246, + "step": 10500 + }, + { + "epoch": 0.88, + "eval_loss": 1.7039064168930054, + "eval_runtime": 107.5109, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 10500 + }, + { + "epoch": 0.88, + "grad_norm": 2.4727416038513184, + "learning_rate": 1.4198840579710147e-05, + "loss": 1.7325, + "step": 10510 + }, + { + "epoch": 0.88, + "grad_norm": 2.254564046859741, + "learning_rate": 1.419304347826087e-05, + "loss": 1.6793, + "step": 10520 + }, + { + "epoch": 0.88, + "grad_norm": 2.339053153991699, + "learning_rate": 1.4187246376811596e-05, + "loss": 1.8367, + "step": 10530 + }, + { + "epoch": 0.88, + "grad_norm": 2.8006272315979004, + "learning_rate": 1.418144927536232e-05, + "loss": 1.5286, + "step": 10540 + }, + { + "epoch": 0.88, + "grad_norm": 1.7751678228378296, + "learning_rate": 1.4175652173913044e-05, + "loss": 1.7536, + "step": 10550 + }, + { + "epoch": 0.88, + "grad_norm": 7.587690830230713, + "learning_rate": 1.416985507246377e-05, + "loss": 1.8107, + "step": 10560 + }, + { + "epoch": 0.88, + "grad_norm": 1.8632152080535889, + "learning_rate": 1.4164057971014492e-05, + "loss": 1.6903, + "step": 10570 + }, + { + "epoch": 0.88, + "grad_norm": 3.762665033340454, + "learning_rate": 1.415826086956522e-05, + "loss": 1.7828, + "step": 10580 + }, + { + "epoch": 0.88, + "grad_norm": 2.1041312217712402, + "learning_rate": 1.4152463768115942e-05, + "loss": 1.8221, + "step": 10590 + }, + { + "epoch": 0.88, + "grad_norm": 1.7822948694229126, + "learning_rate": 1.4146666666666668e-05, + "loss": 1.7623, + "step": 10600 + }, + { + "epoch": 0.88, + "grad_norm": 3.2196130752563477, + "learning_rate": 1.4140869565217392e-05, + "loss": 1.8126, + "step": 10610 + }, + { + "epoch": 0.89, + "grad_norm": 2.2900569438934326, + "learning_rate": 1.4135072463768118e-05, + "loss": 1.7031, + "step": 10620 + }, + { + "epoch": 0.89, + "grad_norm": 4.033328056335449, + "learning_rate": 1.4129275362318842e-05, + "loss": 1.7968, + "step": 10630 + }, + { + "epoch": 0.89, + "grad_norm": 3.2577106952667236, + "learning_rate": 1.4123478260869566e-05, + "loss": 1.69, + "step": 10640 + }, + { + "epoch": 0.89, + "grad_norm": 6.332272052764893, + "learning_rate": 1.4117681159420291e-05, + "loss": 1.6143, + "step": 10650 + }, + { + "epoch": 0.89, + "grad_norm": 2.976055383682251, + "learning_rate": 1.4111884057971015e-05, + "loss": 1.7698, + "step": 10660 + }, + { + "epoch": 0.89, + "grad_norm": 1.821053385734558, + "learning_rate": 1.4106086956521741e-05, + "loss": 1.8864, + "step": 10670 + }, + { + "epoch": 0.89, + "grad_norm": 2.310410261154175, + "learning_rate": 1.4100289855072465e-05, + "loss": 1.583, + "step": 10680 + }, + { + "epoch": 0.89, + "grad_norm": 3.4083127975463867, + "learning_rate": 1.409449275362319e-05, + "loss": 1.7432, + "step": 10690 + }, + { + "epoch": 0.89, + "grad_norm": 1.7858573198318481, + "learning_rate": 1.4088695652173913e-05, + "loss": 1.566, + "step": 10700 + }, + { + "epoch": 0.89, + "grad_norm": 1.7986173629760742, + "learning_rate": 1.408289855072464e-05, + "loss": 1.7651, + "step": 10710 + }, + { + "epoch": 0.89, + "grad_norm": 1.9657052755355835, + "learning_rate": 1.4077101449275363e-05, + "loss": 1.7637, + "step": 10720 + }, + { + "epoch": 0.89, + "grad_norm": 3.002399206161499, + "learning_rate": 1.4071304347826088e-05, + "loss": 1.8534, + "step": 10730 + }, + { + "epoch": 0.9, + "grad_norm": 3.677050828933716, + "learning_rate": 1.4065507246376812e-05, + "loss": 1.8067, + "step": 10740 + }, + { + "epoch": 0.9, + "grad_norm": 2.9738922119140625, + "learning_rate": 1.4059710144927536e-05, + "loss": 1.7928, + "step": 10750 + }, + { + "epoch": 0.9, + "grad_norm": 2.2202324867248535, + "learning_rate": 1.4053913043478262e-05, + "loss": 1.7436, + "step": 10760 + }, + { + "epoch": 0.9, + "grad_norm": 2.8412764072418213, + "learning_rate": 1.4048115942028986e-05, + "loss": 1.7523, + "step": 10770 + }, + { + "epoch": 0.9, + "grad_norm": 0.8097102046012878, + "learning_rate": 1.4042318840579712e-05, + "loss": 1.7362, + "step": 10780 + }, + { + "epoch": 0.9, + "grad_norm": 2.471013069152832, + "learning_rate": 1.4036521739130436e-05, + "loss": 1.7287, + "step": 10790 + }, + { + "epoch": 0.9, + "grad_norm": 1.12705659866333, + "learning_rate": 1.4030724637681162e-05, + "loss": 1.7023, + "step": 10800 + }, + { + "epoch": 0.9, + "grad_norm": 2.3543355464935303, + "learning_rate": 1.4024927536231886e-05, + "loss": 1.7299, + "step": 10810 + }, + { + "epoch": 0.9, + "grad_norm": 6.779575824737549, + "learning_rate": 1.4019130434782611e-05, + "loss": 1.668, + "step": 10820 + }, + { + "epoch": 0.9, + "grad_norm": 3.1106367111206055, + "learning_rate": 1.4013333333333334e-05, + "loss": 1.713, + "step": 10830 + }, + { + "epoch": 0.9, + "grad_norm": 1.9769501686096191, + "learning_rate": 1.4007536231884061e-05, + "loss": 1.6006, + "step": 10840 + }, + { + "epoch": 0.9, + "grad_norm": 3.193175792694092, + "learning_rate": 1.4001739130434783e-05, + "loss": 1.734, + "step": 10850 + }, + { + "epoch": 0.91, + "grad_norm": 1.645627737045288, + "learning_rate": 1.3995942028985507e-05, + "loss": 1.5706, + "step": 10860 + }, + { + "epoch": 0.91, + "grad_norm": 6.674108982086182, + "learning_rate": 1.3990144927536233e-05, + "loss": 1.702, + "step": 10870 + }, + { + "epoch": 0.91, + "grad_norm": 1.0062819719314575, + "learning_rate": 1.3984347826086957e-05, + "loss": 1.7267, + "step": 10880 + }, + { + "epoch": 0.91, + "grad_norm": 4.037877559661865, + "learning_rate": 1.3978550724637683e-05, + "loss": 1.7847, + "step": 10890 + }, + { + "epoch": 0.91, + "grad_norm": 2.889549493789673, + "learning_rate": 1.3972753623188407e-05, + "loss": 1.6711, + "step": 10900 + }, + { + "epoch": 0.91, + "grad_norm": 3.1683433055877686, + "learning_rate": 1.3966956521739132e-05, + "loss": 1.7289, + "step": 10910 + }, + { + "epoch": 0.91, + "grad_norm": 3.776911973953247, + "learning_rate": 1.3961159420289856e-05, + "loss": 1.6323, + "step": 10920 + }, + { + "epoch": 0.91, + "grad_norm": 4.037374973297119, + "learning_rate": 1.3955942028985508e-05, + "loss": 1.7624, + "step": 10930 + }, + { + "epoch": 0.91, + "grad_norm": 6.559633255004883, + "learning_rate": 1.3950144927536233e-05, + "loss": 1.6858, + "step": 10940 + }, + { + "epoch": 0.91, + "grad_norm": 1.2170414924621582, + "learning_rate": 1.3944347826086957e-05, + "loss": 1.7849, + "step": 10950 + }, + { + "epoch": 0.91, + "grad_norm": 3.384916305541992, + "learning_rate": 1.3938550724637683e-05, + "loss": 1.6399, + "step": 10960 + }, + { + "epoch": 0.91, + "grad_norm": 2.4139840602874756, + "learning_rate": 1.3932753623188407e-05, + "loss": 1.7363, + "step": 10970 + }, + { + "epoch": 0.92, + "grad_norm": 3.0569369792938232, + "learning_rate": 1.3926956521739133e-05, + "loss": 1.8243, + "step": 10980 + }, + { + "epoch": 0.92, + "grad_norm": 3.9499967098236084, + "learning_rate": 1.3921159420289855e-05, + "loss": 1.8001, + "step": 10990 + }, + { + "epoch": 0.92, + "grad_norm": 2.538534164428711, + "learning_rate": 1.3915362318840582e-05, + "loss": 1.5845, + "step": 11000 + }, + { + "epoch": 0.92, + "eval_loss": 1.7137649059295654, + "eval_runtime": 107.4981, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 11000 + }, + { + "epoch": 0.92, + "grad_norm": 1.2500182390213013, + "learning_rate": 1.3909565217391305e-05, + "loss": 1.7198, + "step": 11010 + }, + { + "epoch": 0.92, + "grad_norm": 1.7554583549499512, + "learning_rate": 1.3903768115942029e-05, + "loss": 1.678, + "step": 11020 + }, + { + "epoch": 0.92, + "grad_norm": 2.5758776664733887, + "learning_rate": 1.3897971014492754e-05, + "loss": 1.7378, + "step": 11030 + }, + { + "epoch": 0.92, + "grad_norm": 6.143527984619141, + "learning_rate": 1.3892173913043478e-05, + "loss": 1.7865, + "step": 11040 + }, + { + "epoch": 0.92, + "grad_norm": 2.270503282546997, + "learning_rate": 1.3886376811594204e-05, + "loss": 1.7676, + "step": 11050 + }, + { + "epoch": 0.92, + "grad_norm": 3.5987863540649414, + "learning_rate": 1.3880579710144928e-05, + "loss": 1.7073, + "step": 11060 + }, + { + "epoch": 0.92, + "grad_norm": 3.498621702194214, + "learning_rate": 1.3874782608695654e-05, + "loss": 1.7672, + "step": 11070 + }, + { + "epoch": 0.92, + "grad_norm": 1.1236436367034912, + "learning_rate": 1.3868985507246378e-05, + "loss": 1.8432, + "step": 11080 + }, + { + "epoch": 0.92, + "grad_norm": 8.279869079589844, + "learning_rate": 1.3863188405797104e-05, + "loss": 1.6825, + "step": 11090 + }, + { + "epoch": 0.93, + "grad_norm": 7.814366817474365, + "learning_rate": 1.3857391304347828e-05, + "loss": 1.6321, + "step": 11100 + }, + { + "epoch": 0.93, + "grad_norm": 1.510978102684021, + "learning_rate": 1.3851594202898553e-05, + "loss": 1.6527, + "step": 11110 + }, + { + "epoch": 0.93, + "grad_norm": 2.76588773727417, + "learning_rate": 1.3845797101449276e-05, + "loss": 1.7147, + "step": 11120 + }, + { + "epoch": 0.93, + "grad_norm": 4.148089408874512, + "learning_rate": 1.384e-05, + "loss": 1.7523, + "step": 11130 + }, + { + "epoch": 0.93, + "grad_norm": 2.291975736618042, + "learning_rate": 1.3834202898550725e-05, + "loss": 1.6122, + "step": 11140 + }, + { + "epoch": 0.93, + "grad_norm": 3.4009017944335938, + "learning_rate": 1.382840579710145e-05, + "loss": 1.7563, + "step": 11150 + }, + { + "epoch": 0.93, + "grad_norm": 1.2660974264144897, + "learning_rate": 1.3822608695652175e-05, + "loss": 1.8469, + "step": 11160 + }, + { + "epoch": 0.93, + "grad_norm": 1.1221381425857544, + "learning_rate": 1.3816811594202899e-05, + "loss": 1.8096, + "step": 11170 + }, + { + "epoch": 0.93, + "grad_norm": 1.3825074434280396, + "learning_rate": 1.3811014492753625e-05, + "loss": 1.7913, + "step": 11180 + }, + { + "epoch": 0.93, + "grad_norm": 8.367438316345215, + "learning_rate": 1.3805217391304349e-05, + "loss": 1.7117, + "step": 11190 + }, + { + "epoch": 0.93, + "grad_norm": 3.15596866607666, + "learning_rate": 1.3799420289855074e-05, + "loss": 1.7031, + "step": 11200 + }, + { + "epoch": 0.93, + "grad_norm": 6.4687042236328125, + "learning_rate": 1.3793623188405798e-05, + "loss": 1.6699, + "step": 11210 + }, + { + "epoch": 0.94, + "grad_norm": 3.699357509613037, + "learning_rate": 1.3787826086956524e-05, + "loss": 1.829, + "step": 11220 + }, + { + "epoch": 0.94, + "grad_norm": 1.6832666397094727, + "learning_rate": 1.3782028985507248e-05, + "loss": 1.5892, + "step": 11230 + }, + { + "epoch": 0.94, + "grad_norm": 3.3249785900115967, + "learning_rate": 1.377623188405797e-05, + "loss": 1.8073, + "step": 11240 + }, + { + "epoch": 0.94, + "grad_norm": 3.6117970943450928, + "learning_rate": 1.3770434782608698e-05, + "loss": 1.684, + "step": 11250 + }, + { + "epoch": 0.94, + "grad_norm": 2.9994568824768066, + "learning_rate": 1.376463768115942e-05, + "loss": 1.6926, + "step": 11260 + }, + { + "epoch": 0.94, + "grad_norm": 2.465999126434326, + "learning_rate": 1.3758840579710146e-05, + "loss": 1.6104, + "step": 11270 + }, + { + "epoch": 0.94, + "grad_norm": 4.657724380493164, + "learning_rate": 1.375304347826087e-05, + "loss": 1.8288, + "step": 11280 + }, + { + "epoch": 0.94, + "grad_norm": 1.94265615940094, + "learning_rate": 1.3747246376811596e-05, + "loss": 1.7035, + "step": 11290 + }, + { + "epoch": 0.94, + "grad_norm": 1.6312084197998047, + "learning_rate": 1.374144927536232e-05, + "loss": 1.7352, + "step": 11300 + }, + { + "epoch": 0.94, + "grad_norm": 2.722726583480835, + "learning_rate": 1.3735652173913045e-05, + "loss": 1.7667, + "step": 11310 + }, + { + "epoch": 0.94, + "grad_norm": 6.727392196655273, + "learning_rate": 1.372985507246377e-05, + "loss": 1.5222, + "step": 11320 + }, + { + "epoch": 0.94, + "grad_norm": 5.453555583953857, + "learning_rate": 1.3724057971014493e-05, + "loss": 1.8861, + "step": 11330 + }, + { + "epoch": 0.94, + "grad_norm": 4.511663913726807, + "learning_rate": 1.3718260869565219e-05, + "loss": 1.836, + "step": 11340 + }, + { + "epoch": 0.95, + "grad_norm": 2.072493553161621, + "learning_rate": 1.3712463768115943e-05, + "loss": 1.5072, + "step": 11350 + }, + { + "epoch": 0.95, + "grad_norm": 1.9541881084442139, + "learning_rate": 1.3706666666666669e-05, + "loss": 1.6182, + "step": 11360 + }, + { + "epoch": 0.95, + "grad_norm": 4.463501453399658, + "learning_rate": 1.3700869565217391e-05, + "loss": 1.8108, + "step": 11370 + }, + { + "epoch": 0.95, + "grad_norm": 2.4841718673706055, + "learning_rate": 1.3695072463768118e-05, + "loss": 1.8205, + "step": 11380 + }, + { + "epoch": 0.95, + "grad_norm": 3.357886791229248, + "learning_rate": 1.368927536231884e-05, + "loss": 1.6904, + "step": 11390 + }, + { + "epoch": 0.95, + "grad_norm": 2.65535306930542, + "learning_rate": 1.3683478260869566e-05, + "loss": 1.7598, + "step": 11400 + }, + { + "epoch": 0.95, + "grad_norm": 6.2453694343566895, + "learning_rate": 1.367768115942029e-05, + "loss": 1.6009, + "step": 11410 + }, + { + "epoch": 0.95, + "grad_norm": 5.134276866912842, + "learning_rate": 1.3671884057971016e-05, + "loss": 1.7633, + "step": 11420 + }, + { + "epoch": 0.95, + "grad_norm": 6.813711643218994, + "learning_rate": 1.366608695652174e-05, + "loss": 1.5795, + "step": 11430 + }, + { + "epoch": 0.95, + "grad_norm": 1.364342451095581, + "learning_rate": 1.3660289855072464e-05, + "loss": 1.6675, + "step": 11440 + }, + { + "epoch": 0.95, + "grad_norm": 1.627918004989624, + "learning_rate": 1.365449275362319e-05, + "loss": 1.7405, + "step": 11450 + }, + { + "epoch": 0.95, + "grad_norm": 3.7319889068603516, + "learning_rate": 1.3648695652173914e-05, + "loss": 1.6259, + "step": 11460 + }, + { + "epoch": 0.96, + "grad_norm": 0.867110013961792, + "learning_rate": 1.364289855072464e-05, + "loss": 1.5038, + "step": 11470 + }, + { + "epoch": 0.96, + "grad_norm": 3.0498149394989014, + "learning_rate": 1.3637101449275364e-05, + "loss": 1.557, + "step": 11480 + }, + { + "epoch": 0.96, + "grad_norm": 3.804266929626465, + "learning_rate": 1.363130434782609e-05, + "loss": 1.4778, + "step": 11490 + }, + { + "epoch": 0.96, + "grad_norm": 4.833094120025635, + "learning_rate": 1.3625507246376812e-05, + "loss": 1.7432, + "step": 11500 + }, + { + "epoch": 0.96, + "eval_loss": 1.6946756839752197, + "eval_runtime": 107.5258, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 11500 + }, + { + "epoch": 0.96, + "grad_norm": 1.91051185131073, + "learning_rate": 1.3619710144927539e-05, + "loss": 1.6593, + "step": 11510 + }, + { + "epoch": 0.96, + "grad_norm": 1.2992429733276367, + "learning_rate": 1.3613913043478261e-05, + "loss": 1.6403, + "step": 11520 + }, + { + "epoch": 0.96, + "grad_norm": 1.9277349710464478, + "learning_rate": 1.3608115942028987e-05, + "loss": 1.7522, + "step": 11530 + }, + { + "epoch": 0.96, + "grad_norm": 3.0008699893951416, + "learning_rate": 1.3602318840579711e-05, + "loss": 1.6957, + "step": 11540 + }, + { + "epoch": 0.96, + "grad_norm": 1.4071532487869263, + "learning_rate": 1.3596521739130435e-05, + "loss": 1.6937, + "step": 11550 + }, + { + "epoch": 0.96, + "grad_norm": 2.890350103378296, + "learning_rate": 1.359072463768116e-05, + "loss": 1.6852, + "step": 11560 + }, + { + "epoch": 0.96, + "grad_norm": 5.77126932144165, + "learning_rate": 1.3584927536231885e-05, + "loss": 1.6281, + "step": 11570 + }, + { + "epoch": 0.96, + "grad_norm": 1.499045729637146, + "learning_rate": 1.357913043478261e-05, + "loss": 1.6476, + "step": 11580 + }, + { + "epoch": 0.97, + "grad_norm": 1.360612154006958, + "learning_rate": 1.3573333333333334e-05, + "loss": 1.7523, + "step": 11590 + }, + { + "epoch": 0.97, + "grad_norm": 3.634352922439575, + "learning_rate": 1.356753623188406e-05, + "loss": 1.7197, + "step": 11600 + }, + { + "epoch": 0.97, + "grad_norm": 2.7271361351013184, + "learning_rate": 1.3561739130434784e-05, + "loss": 1.7322, + "step": 11610 + }, + { + "epoch": 0.97, + "grad_norm": 4.323874473571777, + "learning_rate": 1.355594202898551e-05, + "loss": 1.6905, + "step": 11620 + }, + { + "epoch": 0.97, + "grad_norm": 4.2105393409729, + "learning_rate": 1.3550144927536232e-05, + "loss": 1.7726, + "step": 11630 + }, + { + "epoch": 0.97, + "grad_norm": 4.96962833404541, + "learning_rate": 1.3544347826086956e-05, + "loss": 1.6487, + "step": 11640 + }, + { + "epoch": 0.97, + "grad_norm": 3.641913652420044, + "learning_rate": 1.3538550724637682e-05, + "loss": 1.6343, + "step": 11650 + }, + { + "epoch": 0.97, + "grad_norm": 6.166054725646973, + "learning_rate": 1.3532753623188406e-05, + "loss": 1.6923, + "step": 11660 + }, + { + "epoch": 0.97, + "grad_norm": 2.171774387359619, + "learning_rate": 1.3526956521739132e-05, + "loss": 1.8464, + "step": 11670 + }, + { + "epoch": 0.97, + "grad_norm": 2.8756415843963623, + "learning_rate": 1.3521159420289856e-05, + "loss": 1.6128, + "step": 11680 + }, + { + "epoch": 0.97, + "grad_norm": 4.395898818969727, + "learning_rate": 1.3515362318840581e-05, + "loss": 1.7561, + "step": 11690 + }, + { + "epoch": 0.97, + "grad_norm": 3.581686019897461, + "learning_rate": 1.3509565217391305e-05, + "loss": 1.6437, + "step": 11700 + }, + { + "epoch": 0.98, + "grad_norm": 2.5616443157196045, + "learning_rate": 1.3503768115942031e-05, + "loss": 1.7065, + "step": 11710 + }, + { + "epoch": 0.98, + "grad_norm": 3.503169536590576, + "learning_rate": 1.3497971014492755e-05, + "loss": 1.7289, + "step": 11720 + }, + { + "epoch": 0.98, + "grad_norm": 2.8909196853637695, + "learning_rate": 1.349217391304348e-05, + "loss": 1.708, + "step": 11730 + }, + { + "epoch": 0.98, + "grad_norm": 2.47155499458313, + "learning_rate": 1.3486376811594205e-05, + "loss": 1.6653, + "step": 11740 + }, + { + "epoch": 0.98, + "grad_norm": 3.7819576263427734, + "learning_rate": 1.3480579710144927e-05, + "loss": 1.765, + "step": 11750 + }, + { + "epoch": 0.98, + "grad_norm": 4.265803337097168, + "learning_rate": 1.3474782608695653e-05, + "loss": 1.8526, + "step": 11760 + }, + { + "epoch": 0.98, + "grad_norm": 1.643122673034668, + "learning_rate": 1.3468985507246377e-05, + "loss": 1.7633, + "step": 11770 + }, + { + "epoch": 0.98, + "grad_norm": 8.063491821289062, + "learning_rate": 1.3463188405797102e-05, + "loss": 1.7809, + "step": 11780 + }, + { + "epoch": 0.98, + "grad_norm": 1.299911618232727, + "learning_rate": 1.3457391304347826e-05, + "loss": 1.7441, + "step": 11790 + }, + { + "epoch": 0.98, + "grad_norm": 2.312831401824951, + "learning_rate": 1.3451594202898552e-05, + "loss": 1.7987, + "step": 11800 + }, + { + "epoch": 0.98, + "grad_norm": 3.44124698638916, + "learning_rate": 1.3445797101449276e-05, + "loss": 1.6447, + "step": 11810 + }, + { + "epoch": 0.98, + "grad_norm": 3.1065027713775635, + "learning_rate": 1.3440000000000002e-05, + "loss": 1.8101, + "step": 11820 + }, + { + "epoch": 0.99, + "grad_norm": 4.687283039093018, + "learning_rate": 1.3434202898550726e-05, + "loss": 1.7435, + "step": 11830 + }, + { + "epoch": 0.99, + "grad_norm": 4.883983135223389, + "learning_rate": 1.3428405797101451e-05, + "loss": 1.6305, + "step": 11840 + }, + { + "epoch": 0.99, + "grad_norm": 1.454877257347107, + "learning_rate": 1.3422608695652175e-05, + "loss": 1.7328, + "step": 11850 + }, + { + "epoch": 0.99, + "grad_norm": 5.217712879180908, + "learning_rate": 1.34168115942029e-05, + "loss": 1.8219, + "step": 11860 + }, + { + "epoch": 0.99, + "grad_norm": 3.0228896141052246, + "learning_rate": 1.3411014492753625e-05, + "loss": 1.6406, + "step": 11870 + }, + { + "epoch": 0.99, + "grad_norm": 1.98235285282135, + "learning_rate": 1.3405217391304347e-05, + "loss": 1.814, + "step": 11880 + }, + { + "epoch": 0.99, + "grad_norm": 4.972719192504883, + "learning_rate": 1.3399420289855073e-05, + "loss": 1.7487, + "step": 11890 + }, + { + "epoch": 0.99, + "grad_norm": 2.730912446975708, + "learning_rate": 1.3393623188405797e-05, + "loss": 1.6201, + "step": 11900 + }, + { + "epoch": 0.99, + "grad_norm": 3.768197774887085, + "learning_rate": 1.3387826086956523e-05, + "loss": 1.5983, + "step": 11910 + }, + { + "epoch": 0.99, + "grad_norm": 1.7705687284469604, + "learning_rate": 1.3382028985507247e-05, + "loss": 1.8149, + "step": 11920 + }, + { + "epoch": 0.99, + "grad_norm": 3.303154230117798, + "learning_rate": 1.3376231884057973e-05, + "loss": 1.5987, + "step": 11930 + }, + { + "epoch": 0.99, + "grad_norm": 3.428690195083618, + "learning_rate": 1.3370434782608697e-05, + "loss": 1.7096, + "step": 11940 + }, + { + "epoch": 1.0, + "grad_norm": 1.814437985420227, + "learning_rate": 1.336463768115942e-05, + "loss": 1.5782, + "step": 11950 + }, + { + "epoch": 1.0, + "grad_norm": 3.096653938293457, + "learning_rate": 1.3358840579710146e-05, + "loss": 1.7079, + "step": 11960 + }, + { + "epoch": 1.0, + "grad_norm": 2.4044532775878906, + "learning_rate": 1.335304347826087e-05, + "loss": 1.5858, + "step": 11970 + }, + { + "epoch": 1.0, + "grad_norm": 4.206218719482422, + "learning_rate": 1.3347246376811596e-05, + "loss": 1.7468, + "step": 11980 + }, + { + "epoch": 1.0, + "grad_norm": 4.609010219573975, + "learning_rate": 1.334144927536232e-05, + "loss": 1.7859, + "step": 11990 + }, + { + "epoch": 1.0, + "grad_norm": 2.646367311477661, + "learning_rate": 1.3335652173913046e-05, + "loss": 1.4624, + "step": 12000 + }, + { + "epoch": 1.0, + "eval_loss": 1.7011674642562866, + "eval_runtime": 107.5473, + "eval_samples_per_second": 9.298, + "eval_steps_per_second": 2.325, + "step": 12000 + }, + { + "epoch": 1.0, + "grad_norm": 8.173443794250488, + "learning_rate": 1.3329855072463768e-05, + "loss": 1.6229, + "step": 12010 + }, + { + "epoch": 1.0, + "grad_norm": 1.7814215421676636, + "learning_rate": 1.3324057971014495e-05, + "loss": 1.4929, + "step": 12020 + }, + { + "epoch": 1.0, + "grad_norm": 1.4045521020889282, + "learning_rate": 1.3318260869565218e-05, + "loss": 1.6919, + "step": 12030 + }, + { + "epoch": 1.0, + "grad_norm": 1.0634101629257202, + "learning_rate": 1.3312463768115943e-05, + "loss": 1.6938, + "step": 12040 + }, + { + "epoch": 1.0, + "grad_norm": 1.312534213066101, + "learning_rate": 1.3306666666666667e-05, + "loss": 1.718, + "step": 12050 + }, + { + "epoch": 1.0, + "grad_norm": 6.818319320678711, + "learning_rate": 1.3300869565217391e-05, + "loss": 1.7151, + "step": 12060 + }, + { + "epoch": 1.01, + "grad_norm": 3.955862045288086, + "learning_rate": 1.3295072463768117e-05, + "loss": 1.7341, + "step": 12070 + }, + { + "epoch": 1.01, + "grad_norm": 1.5620914697647095, + "learning_rate": 1.3289275362318841e-05, + "loss": 1.8083, + "step": 12080 + }, + { + "epoch": 1.01, + "grad_norm": 3.745596170425415, + "learning_rate": 1.3283478260869567e-05, + "loss": 1.6492, + "step": 12090 + }, + { + "epoch": 1.01, + "grad_norm": 6.607650279998779, + "learning_rate": 1.3277681159420291e-05, + "loss": 1.6268, + "step": 12100 + }, + { + "epoch": 1.01, + "grad_norm": 1.8814033269882202, + "learning_rate": 1.3271884057971017e-05, + "loss": 1.6127, + "step": 12110 + }, + { + "epoch": 1.01, + "grad_norm": 1.7189909219741821, + "learning_rate": 1.326608695652174e-05, + "loss": 1.7186, + "step": 12120 + }, + { + "epoch": 1.01, + "grad_norm": 3.2475483417510986, + "learning_rate": 1.3260289855072466e-05, + "loss": 1.7642, + "step": 12130 + }, + { + "epoch": 1.01, + "grad_norm": 5.229576587677002, + "learning_rate": 1.3254492753623189e-05, + "loss": 1.7376, + "step": 12140 + }, + { + "epoch": 1.01, + "grad_norm": 5.324844837188721, + "learning_rate": 1.3248695652173916e-05, + "loss": 1.753, + "step": 12150 + }, + { + "epoch": 1.01, + "grad_norm": 1.727643370628357, + "learning_rate": 1.3242898550724638e-05, + "loss": 1.7612, + "step": 12160 + }, + { + "epoch": 1.01, + "grad_norm": 2.785902976989746, + "learning_rate": 1.3237101449275362e-05, + "loss": 1.8341, + "step": 12170 + }, + { + "epoch": 1.01, + "grad_norm": 1.65829598903656, + "learning_rate": 1.3231304347826088e-05, + "loss": 1.6544, + "step": 12180 + }, + { + "epoch": 1.02, + "grad_norm": 5.451395511627197, + "learning_rate": 1.3225507246376812e-05, + "loss": 1.7604, + "step": 12190 + }, + { + "epoch": 1.02, + "grad_norm": 11.722532272338867, + "learning_rate": 1.3219710144927538e-05, + "loss": 1.6377, + "step": 12200 + }, + { + "epoch": 1.02, + "grad_norm": 3.6431243419647217, + "learning_rate": 1.3213913043478262e-05, + "loss": 1.6285, + "step": 12210 + }, + { + "epoch": 1.02, + "grad_norm": 3.2399184703826904, + "learning_rate": 1.3208115942028987e-05, + "loss": 1.6122, + "step": 12220 + }, + { + "epoch": 1.02, + "grad_norm": 1.8696668148040771, + "learning_rate": 1.3202318840579711e-05, + "loss": 1.7179, + "step": 12230 + }, + { + "epoch": 1.02, + "grad_norm": 3.199878692626953, + "learning_rate": 1.3196521739130437e-05, + "loss": 1.6551, + "step": 12240 + }, + { + "epoch": 1.02, + "grad_norm": 9.326812744140625, + "learning_rate": 1.3190724637681161e-05, + "loss": 1.6373, + "step": 12250 + }, + { + "epoch": 1.02, + "grad_norm": 1.3237980604171753, + "learning_rate": 1.3184927536231883e-05, + "loss": 1.6698, + "step": 12260 + }, + { + "epoch": 1.02, + "grad_norm": 1.768131971359253, + "learning_rate": 1.3179130434782609e-05, + "loss": 1.7116, + "step": 12270 + }, + { + "epoch": 1.02, + "grad_norm": 4.411503791809082, + "learning_rate": 1.3173333333333333e-05, + "loss": 1.6156, + "step": 12280 + }, + { + "epoch": 1.02, + "grad_norm": 4.180882930755615, + "learning_rate": 1.3167536231884059e-05, + "loss": 1.6427, + "step": 12290 + }, + { + "epoch": 1.02, + "grad_norm": 4.48996639251709, + "learning_rate": 1.3161739130434783e-05, + "loss": 1.8268, + "step": 12300 + }, + { + "epoch": 1.03, + "grad_norm": 3.8137004375457764, + "learning_rate": 1.3155942028985509e-05, + "loss": 1.6113, + "step": 12310 + }, + { + "epoch": 1.03, + "grad_norm": 1.2274035215377808, + "learning_rate": 1.3150144927536233e-05, + "loss": 1.5674, + "step": 12320 + }, + { + "epoch": 1.03, + "grad_norm": 1.9615777730941772, + "learning_rate": 1.3144347826086958e-05, + "loss": 1.7014, + "step": 12330 + }, + { + "epoch": 1.03, + "grad_norm": 2.100159168243408, + "learning_rate": 1.3138550724637682e-05, + "loss": 1.6939, + "step": 12340 + }, + { + "epoch": 1.03, + "grad_norm": 1.2069636583328247, + "learning_rate": 1.3132753623188408e-05, + "loss": 1.8285, + "step": 12350 + }, + { + "epoch": 1.03, + "grad_norm": 4.2240705490112305, + "learning_rate": 1.3126956521739132e-05, + "loss": 1.7967, + "step": 12360 + }, + { + "epoch": 1.03, + "grad_norm": 4.67422342300415, + "learning_rate": 1.3121159420289856e-05, + "loss": 1.6853, + "step": 12370 + }, + { + "epoch": 1.03, + "grad_norm": 2.0639190673828125, + "learning_rate": 1.3115362318840582e-05, + "loss": 1.8051, + "step": 12380 + }, + { + "epoch": 1.03, + "grad_norm": 1.48922598361969, + "learning_rate": 1.3109565217391304e-05, + "loss": 1.744, + "step": 12390 + }, + { + "epoch": 1.03, + "grad_norm": 0.7941759824752808, + "learning_rate": 1.310376811594203e-05, + "loss": 1.6758, + "step": 12400 + }, + { + "epoch": 1.03, + "grad_norm": 1.099454402923584, + "learning_rate": 1.3097971014492754e-05, + "loss": 1.6345, + "step": 12410 + }, + { + "epoch": 1.03, + "grad_norm": 1.9024593830108643, + "learning_rate": 1.309217391304348e-05, + "loss": 1.5965, + "step": 12420 + }, + { + "epoch": 1.04, + "grad_norm": 3.2126779556274414, + "learning_rate": 1.3086376811594203e-05, + "loss": 1.6138, + "step": 12430 + }, + { + "epoch": 1.04, + "grad_norm": 3.7725675106048584, + "learning_rate": 1.3080579710144929e-05, + "loss": 1.7117, + "step": 12440 + }, + { + "epoch": 1.04, + "grad_norm": 3.924130439758301, + "learning_rate": 1.3074782608695653e-05, + "loss": 1.7234, + "step": 12450 + }, + { + "epoch": 1.04, + "grad_norm": 2.2998852729797363, + "learning_rate": 1.3068985507246379e-05, + "loss": 1.6755, + "step": 12460 + }, + { + "epoch": 1.04, + "grad_norm": 4.43101167678833, + "learning_rate": 1.3063188405797103e-05, + "loss": 1.6946, + "step": 12470 + }, + { + "epoch": 1.04, + "grad_norm": 5.988603115081787, + "learning_rate": 1.3057391304347827e-05, + "loss": 1.6585, + "step": 12480 + }, + { + "epoch": 1.04, + "grad_norm": 3.184678316116333, + "learning_rate": 1.3051594202898552e-05, + "loss": 1.6695, + "step": 12490 + }, + { + "epoch": 1.04, + "grad_norm": 6.5645880699157715, + "learning_rate": 1.3045797101449277e-05, + "loss": 1.7914, + "step": 12500 + }, + { + "epoch": 1.04, + "eval_loss": 1.7145652770996094, + "eval_runtime": 107.5024, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 12500 + }, + { + "epoch": 1.04, + "grad_norm": 1.6192374229431152, + "learning_rate": 1.3040000000000002e-05, + "loss": 1.7134, + "step": 12510 + }, + { + "epoch": 1.04, + "grad_norm": 1.7455005645751953, + "learning_rate": 1.3034202898550725e-05, + "loss": 1.7623, + "step": 12520 + }, + { + "epoch": 1.04, + "grad_norm": 3.3086254596710205, + "learning_rate": 1.302840579710145e-05, + "loss": 1.6877, + "step": 12530 + }, + { + "epoch": 1.04, + "grad_norm": 1.5355310440063477, + "learning_rate": 1.3022608695652174e-05, + "loss": 1.7792, + "step": 12540 + }, + { + "epoch": 1.05, + "grad_norm": 1.7353932857513428, + "learning_rate": 1.30168115942029e-05, + "loss": 1.7148, + "step": 12550 + }, + { + "epoch": 1.05, + "grad_norm": 6.468092918395996, + "learning_rate": 1.3011014492753624e-05, + "loss": 1.7166, + "step": 12560 + }, + { + "epoch": 1.05, + "grad_norm": 4.603166580200195, + "learning_rate": 1.3005217391304348e-05, + "loss": 1.7689, + "step": 12570 + }, + { + "epoch": 1.05, + "grad_norm": 3.521178960800171, + "learning_rate": 1.2999420289855074e-05, + "loss": 1.6698, + "step": 12580 + }, + { + "epoch": 1.05, + "grad_norm": 2.06740665435791, + "learning_rate": 1.2993623188405798e-05, + "loss": 1.6722, + "step": 12590 + }, + { + "epoch": 1.05, + "grad_norm": 2.7508833408355713, + "learning_rate": 1.2987826086956523e-05, + "loss": 1.6326, + "step": 12600 + }, + { + "epoch": 1.05, + "grad_norm": 2.8367888927459717, + "learning_rate": 1.2982028985507247e-05, + "loss": 1.7824, + "step": 12610 + }, + { + "epoch": 1.05, + "grad_norm": 2.0329577922821045, + "learning_rate": 1.2976231884057973e-05, + "loss": 1.6271, + "step": 12620 + }, + { + "epoch": 1.05, + "grad_norm": 1.7697292566299438, + "learning_rate": 1.2970434782608697e-05, + "loss": 1.5485, + "step": 12630 + }, + { + "epoch": 1.05, + "grad_norm": 2.019354820251465, + "learning_rate": 1.2964637681159423e-05, + "loss": 1.764, + "step": 12640 + }, + { + "epoch": 1.05, + "grad_norm": 3.5531795024871826, + "learning_rate": 1.2958840579710145e-05, + "loss": 1.5918, + "step": 12650 + }, + { + "epoch": 1.05, + "grad_norm": 4.986388206481934, + "learning_rate": 1.295304347826087e-05, + "loss": 1.7121, + "step": 12660 + }, + { + "epoch": 1.06, + "grad_norm": 1.05776047706604, + "learning_rate": 1.2947246376811595e-05, + "loss": 1.8649, + "step": 12670 + }, + { + "epoch": 1.06, + "grad_norm": 8.3735990524292, + "learning_rate": 1.2941449275362319e-05, + "loss": 1.7337, + "step": 12680 + }, + { + "epoch": 1.06, + "grad_norm": 1.643716812133789, + "learning_rate": 1.2935652173913044e-05, + "loss": 1.8637, + "step": 12690 + }, + { + "epoch": 1.06, + "grad_norm": 6.21591854095459, + "learning_rate": 1.2929855072463768e-05, + "loss": 1.6824, + "step": 12700 + }, + { + "epoch": 1.06, + "grad_norm": 1.3029552698135376, + "learning_rate": 1.2924057971014494e-05, + "loss": 1.7475, + "step": 12710 + }, + { + "epoch": 1.06, + "grad_norm": 1.8332914113998413, + "learning_rate": 1.2918260869565218e-05, + "loss": 1.6892, + "step": 12720 + }, + { + "epoch": 1.06, + "grad_norm": 3.6949455738067627, + "learning_rate": 1.2912463768115944e-05, + "loss": 1.6439, + "step": 12730 + }, + { + "epoch": 1.06, + "grad_norm": 2.436410427093506, + "learning_rate": 1.2906666666666668e-05, + "loss": 1.7154, + "step": 12740 + }, + { + "epoch": 1.06, + "grad_norm": 2.0431625843048096, + "learning_rate": 1.2900869565217394e-05, + "loss": 1.6907, + "step": 12750 + }, + { + "epoch": 1.06, + "grad_norm": 6.340989112854004, + "learning_rate": 1.2895072463768118e-05, + "loss": 1.6155, + "step": 12760 + }, + { + "epoch": 1.06, + "grad_norm": 2.4236230850219727, + "learning_rate": 1.2889275362318843e-05, + "loss": 1.7229, + "step": 12770 + }, + { + "epoch": 1.06, + "grad_norm": 2.5978729724884033, + "learning_rate": 1.2883478260869566e-05, + "loss": 1.7474, + "step": 12780 + }, + { + "epoch": 1.07, + "grad_norm": 1.729132056236267, + "learning_rate": 1.287768115942029e-05, + "loss": 1.6459, + "step": 12790 + }, + { + "epoch": 1.07, + "grad_norm": 2.0741076469421387, + "learning_rate": 1.2871884057971015e-05, + "loss": 1.7362, + "step": 12800 + }, + { + "epoch": 1.07, + "grad_norm": 2.056138038635254, + "learning_rate": 1.286608695652174e-05, + "loss": 1.6319, + "step": 12810 + }, + { + "epoch": 1.07, + "grad_norm": 3.7434098720550537, + "learning_rate": 1.2860289855072465e-05, + "loss": 1.6161, + "step": 12820 + }, + { + "epoch": 1.07, + "grad_norm": 1.8511974811553955, + "learning_rate": 1.2854492753623189e-05, + "loss": 1.5639, + "step": 12830 + }, + { + "epoch": 1.07, + "grad_norm": 4.42405891418457, + "learning_rate": 1.2848695652173915e-05, + "loss": 1.7232, + "step": 12840 + }, + { + "epoch": 1.07, + "grad_norm": 6.329336166381836, + "learning_rate": 1.2842898550724639e-05, + "loss": 1.4743, + "step": 12850 + }, + { + "epoch": 1.07, + "grad_norm": 2.943577527999878, + "learning_rate": 1.2837101449275364e-05, + "loss": 1.724, + "step": 12860 + }, + { + "epoch": 1.07, + "grad_norm": 2.932284116744995, + "learning_rate": 1.2831304347826088e-05, + "loss": 1.7264, + "step": 12870 + }, + { + "epoch": 1.07, + "grad_norm": 5.513105392456055, + "learning_rate": 1.282550724637681e-05, + "loss": 1.7304, + "step": 12880 + }, + { + "epoch": 1.07, + "grad_norm": 0.8422374129295349, + "learning_rate": 1.2819710144927538e-05, + "loss": 1.7715, + "step": 12890 + }, + { + "epoch": 1.07, + "grad_norm": 2.7153878211975098, + "learning_rate": 1.281391304347826e-05, + "loss": 1.6772, + "step": 12900 + }, + { + "epoch": 1.08, + "grad_norm": 4.201610088348389, + "learning_rate": 1.2808115942028986e-05, + "loss": 1.6741, + "step": 12910 + }, + { + "epoch": 1.08, + "grad_norm": 2.2562031745910645, + "learning_rate": 1.280231884057971e-05, + "loss": 1.7259, + "step": 12920 + }, + { + "epoch": 1.08, + "grad_norm": 3.5883278846740723, + "learning_rate": 1.2797101449275365e-05, + "loss": 1.6095, + "step": 12930 + }, + { + "epoch": 1.08, + "grad_norm": 2.405609130859375, + "learning_rate": 1.2791304347826087e-05, + "loss": 1.6715, + "step": 12940 + }, + { + "epoch": 1.08, + "grad_norm": 4.966475486755371, + "learning_rate": 1.2785507246376815e-05, + "loss": 1.7278, + "step": 12950 + }, + { + "epoch": 1.08, + "grad_norm": 2.7825913429260254, + "learning_rate": 1.2779710144927537e-05, + "loss": 1.7728, + "step": 12960 + }, + { + "epoch": 1.08, + "grad_norm": 1.6676304340362549, + "learning_rate": 1.277391304347826e-05, + "loss": 1.604, + "step": 12970 + }, + { + "epoch": 1.08, + "grad_norm": 8.661015510559082, + "learning_rate": 1.2768115942028987e-05, + "loss": 1.6988, + "step": 12980 + }, + { + "epoch": 1.08, + "grad_norm": 1.4278024435043335, + "learning_rate": 1.276231884057971e-05, + "loss": 1.6335, + "step": 12990 + }, + { + "epoch": 1.08, + "grad_norm": 1.0306954383850098, + "learning_rate": 1.2756521739130436e-05, + "loss": 1.7252, + "step": 13000 + }, + { + "epoch": 1.08, + "eval_loss": 1.6973094940185547, + "eval_runtime": 107.5084, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 13000 + }, + { + "epoch": 1.08, + "grad_norm": 4.311023712158203, + "learning_rate": 1.275072463768116e-05, + "loss": 1.6827, + "step": 13010 + }, + { + "epoch": 1.08, + "grad_norm": 5.270779132843018, + "learning_rate": 1.2744927536231886e-05, + "loss": 1.6404, + "step": 13020 + }, + { + "epoch": 1.09, + "grad_norm": 1.767482876777649, + "learning_rate": 1.273913043478261e-05, + "loss": 1.6437, + "step": 13030 + }, + { + "epoch": 1.09, + "grad_norm": 4.334309101104736, + "learning_rate": 1.2733333333333336e-05, + "loss": 1.8447, + "step": 13040 + }, + { + "epoch": 1.09, + "grad_norm": 8.177881240844727, + "learning_rate": 1.272753623188406e-05, + "loss": 1.7184, + "step": 13050 + }, + { + "epoch": 1.09, + "grad_norm": 0.994053304195404, + "learning_rate": 1.2721739130434782e-05, + "loss": 1.5713, + "step": 13060 + }, + { + "epoch": 1.09, + "grad_norm": 2.124035120010376, + "learning_rate": 1.2715942028985508e-05, + "loss": 1.7455, + "step": 13070 + }, + { + "epoch": 1.09, + "grad_norm": 4.642811298370361, + "learning_rate": 1.2710144927536232e-05, + "loss": 1.6266, + "step": 13080 + }, + { + "epoch": 1.09, + "grad_norm": 5.6701507568359375, + "learning_rate": 1.2704347826086957e-05, + "loss": 1.5264, + "step": 13090 + }, + { + "epoch": 1.09, + "grad_norm": 3.8725638389587402, + "learning_rate": 1.2698550724637681e-05, + "loss": 1.6485, + "step": 13100 + }, + { + "epoch": 1.09, + "grad_norm": 4.132758140563965, + "learning_rate": 1.2692753623188407e-05, + "loss": 1.7384, + "step": 13110 + }, + { + "epoch": 1.09, + "grad_norm": 1.472156047821045, + "learning_rate": 1.2686956521739131e-05, + "loss": 1.7509, + "step": 13120 + }, + { + "epoch": 1.09, + "grad_norm": 2.3745663166046143, + "learning_rate": 1.2681159420289857e-05, + "loss": 1.8063, + "step": 13130 + }, + { + "epoch": 1.09, + "grad_norm": 3.400909423828125, + "learning_rate": 1.267536231884058e-05, + "loss": 1.502, + "step": 13140 + }, + { + "epoch": 1.1, + "grad_norm": 2.341827154159546, + "learning_rate": 1.2669565217391306e-05, + "loss": 1.8114, + "step": 13150 + }, + { + "epoch": 1.1, + "grad_norm": 4.864140033721924, + "learning_rate": 1.266376811594203e-05, + "loss": 1.6826, + "step": 13160 + }, + { + "epoch": 1.1, + "grad_norm": 1.464040756225586, + "learning_rate": 1.2657971014492755e-05, + "loss": 1.8461, + "step": 13170 + }, + { + "epoch": 1.1, + "grad_norm": 1.3931642770767212, + "learning_rate": 1.265217391304348e-05, + "loss": 1.5932, + "step": 13180 + }, + { + "epoch": 1.1, + "grad_norm": 1.9136666059494019, + "learning_rate": 1.2646376811594203e-05, + "loss": 1.655, + "step": 13190 + }, + { + "epoch": 1.1, + "grad_norm": 0.6585626006126404, + "learning_rate": 1.2640579710144928e-05, + "loss": 1.6953, + "step": 13200 + }, + { + "epoch": 1.1, + "grad_norm": 1.4719610214233398, + "learning_rate": 1.2634782608695652e-05, + "loss": 1.7122, + "step": 13210 + }, + { + "epoch": 1.1, + "grad_norm": 2.93149995803833, + "learning_rate": 1.2628985507246378e-05, + "loss": 1.7336, + "step": 13220 + }, + { + "epoch": 1.1, + "grad_norm": 2.2544167041778564, + "learning_rate": 1.2623188405797102e-05, + "loss": 1.5404, + "step": 13230 + }, + { + "epoch": 1.1, + "grad_norm": 2.97430682182312, + "learning_rate": 1.2617391304347828e-05, + "loss": 1.7408, + "step": 13240 + }, + { + "epoch": 1.1, + "grad_norm": 1.8780460357666016, + "learning_rate": 1.2611594202898552e-05, + "loss": 1.5946, + "step": 13250 + }, + { + "epoch": 1.1, + "grad_norm": 3.108811140060425, + "learning_rate": 1.2605797101449277e-05, + "loss": 1.7989, + "step": 13260 + }, + { + "epoch": 1.11, + "grad_norm": 1.7901716232299805, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.715, + "step": 13270 + }, + { + "epoch": 1.11, + "grad_norm": 2.103832244873047, + "learning_rate": 1.2594202898550725e-05, + "loss": 1.5802, + "step": 13280 + }, + { + "epoch": 1.11, + "grad_norm": 3.0903170108795166, + "learning_rate": 1.2588405797101451e-05, + "loss": 1.7531, + "step": 13290 + }, + { + "epoch": 1.11, + "grad_norm": 2.20255184173584, + "learning_rate": 1.2582608695652175e-05, + "loss": 1.7651, + "step": 13300 + }, + { + "epoch": 1.11, + "grad_norm": 1.1130000352859497, + "learning_rate": 1.25768115942029e-05, + "loss": 1.6944, + "step": 13310 + }, + { + "epoch": 1.11, + "grad_norm": 2.6338019371032715, + "learning_rate": 1.2571014492753623e-05, + "loss": 1.6242, + "step": 13320 + }, + { + "epoch": 1.11, + "grad_norm": 3.7077834606170654, + "learning_rate": 1.2565217391304349e-05, + "loss": 1.5732, + "step": 13330 + }, + { + "epoch": 1.11, + "grad_norm": 2.3346569538116455, + "learning_rate": 1.2559420289855073e-05, + "loss": 1.8445, + "step": 13340 + }, + { + "epoch": 1.11, + "grad_norm": 9.315794944763184, + "learning_rate": 1.2553623188405798e-05, + "loss": 1.6678, + "step": 13350 + }, + { + "epoch": 1.11, + "grad_norm": 3.115358591079712, + "learning_rate": 1.2547826086956522e-05, + "loss": 1.719, + "step": 13360 + }, + { + "epoch": 1.11, + "grad_norm": 7.239322662353516, + "learning_rate": 1.2542028985507246e-05, + "loss": 1.6076, + "step": 13370 + }, + { + "epoch": 1.11, + "grad_norm": 5.326850891113281, + "learning_rate": 1.2536231884057972e-05, + "loss": 1.722, + "step": 13380 + }, + { + "epoch": 1.12, + "grad_norm": 5.592775344848633, + "learning_rate": 1.2530434782608696e-05, + "loss": 1.7467, + "step": 13390 + }, + { + "epoch": 1.12, + "grad_norm": 7.9691667556762695, + "learning_rate": 1.2524637681159422e-05, + "loss": 1.5619, + "step": 13400 + }, + { + "epoch": 1.12, + "grad_norm": 1.5476248264312744, + "learning_rate": 1.2518840579710146e-05, + "loss": 1.7403, + "step": 13410 + }, + { + "epoch": 1.12, + "grad_norm": 2.466348886489868, + "learning_rate": 1.2513043478260872e-05, + "loss": 1.7568, + "step": 13420 + }, + { + "epoch": 1.12, + "grad_norm": 1.7985725402832031, + "learning_rate": 1.2507246376811596e-05, + "loss": 1.6505, + "step": 13430 + }, + { + "epoch": 1.12, + "grad_norm": 2.469550848007202, + "learning_rate": 1.2501449275362321e-05, + "loss": 1.686, + "step": 13440 + }, + { + "epoch": 1.12, + "grad_norm": 2.672849178314209, + "learning_rate": 1.2495652173913044e-05, + "loss": 1.6282, + "step": 13450 + }, + { + "epoch": 1.12, + "grad_norm": 2.2976021766662598, + "learning_rate": 1.2489855072463771e-05, + "loss": 1.7232, + "step": 13460 + }, + { + "epoch": 1.12, + "grad_norm": 4.909353256225586, + "learning_rate": 1.2484057971014493e-05, + "loss": 1.5368, + "step": 13470 + }, + { + "epoch": 1.12, + "grad_norm": 4.343336582183838, + "learning_rate": 1.2478260869565217e-05, + "loss": 1.8103, + "step": 13480 + }, + { + "epoch": 1.12, + "grad_norm": 4.372796058654785, + "learning_rate": 1.2472463768115943e-05, + "loss": 1.6098, + "step": 13490 + }, + { + "epoch": 1.12, + "grad_norm": 1.7971751689910889, + "learning_rate": 1.2466666666666667e-05, + "loss": 1.6064, + "step": 13500 + }, + { + "epoch": 1.12, + "eval_loss": 1.6684225797653198, + "eval_runtime": 107.5063, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 13500 + }, + { + "epoch": 1.13, + "grad_norm": 3.6352314949035645, + "learning_rate": 1.2460869565217393e-05, + "loss": 1.7123, + "step": 13510 + }, + { + "epoch": 1.13, + "grad_norm": 3.39823317527771, + "learning_rate": 1.2455072463768117e-05, + "loss": 1.7258, + "step": 13520 + }, + { + "epoch": 1.13, + "grad_norm": 0.8198342323303223, + "learning_rate": 1.2449275362318842e-05, + "loss": 1.6146, + "step": 13530 + }, + { + "epoch": 1.13, + "grad_norm": 2.5022692680358887, + "learning_rate": 1.2443478260869566e-05, + "loss": 1.6835, + "step": 13540 + }, + { + "epoch": 1.13, + "grad_norm": 3.4644007682800293, + "learning_rate": 1.2437681159420292e-05, + "loss": 1.7186, + "step": 13550 + }, + { + "epoch": 1.13, + "grad_norm": 2.09822416305542, + "learning_rate": 1.2431884057971016e-05, + "loss": 1.7713, + "step": 13560 + }, + { + "epoch": 1.13, + "grad_norm": 3.5740549564361572, + "learning_rate": 1.2426086956521742e-05, + "loss": 1.7463, + "step": 13570 + }, + { + "epoch": 1.13, + "grad_norm": 2.3453164100646973, + "learning_rate": 1.2420289855072464e-05, + "loss": 1.6706, + "step": 13580 + }, + { + "epoch": 1.13, + "grad_norm": 4.465335369110107, + "learning_rate": 1.2414492753623188e-05, + "loss": 1.5353, + "step": 13590 + }, + { + "epoch": 1.13, + "grad_norm": 3.2128164768218994, + "learning_rate": 1.2408695652173914e-05, + "loss": 1.7146, + "step": 13600 + }, + { + "epoch": 1.13, + "grad_norm": 1.6610260009765625, + "learning_rate": 1.2402898550724638e-05, + "loss": 1.6992, + "step": 13610 + }, + { + "epoch": 1.14, + "grad_norm": 2.8867626190185547, + "learning_rate": 1.2397101449275364e-05, + "loss": 1.7159, + "step": 13620 + }, + { + "epoch": 1.14, + "grad_norm": 1.6658381223678589, + "learning_rate": 1.2391304347826088e-05, + "loss": 1.6678, + "step": 13630 + }, + { + "epoch": 1.14, + "grad_norm": 3.239511728286743, + "learning_rate": 1.2385507246376813e-05, + "loss": 1.6895, + "step": 13640 + }, + { + "epoch": 1.14, + "grad_norm": 3.459529399871826, + "learning_rate": 1.2379710144927537e-05, + "loss": 1.7424, + "step": 13650 + }, + { + "epoch": 1.14, + "grad_norm": 4.84796142578125, + "learning_rate": 1.2373913043478263e-05, + "loss": 1.6314, + "step": 13660 + }, + { + "epoch": 1.14, + "grad_norm": 3.1975581645965576, + "learning_rate": 1.2368115942028987e-05, + "loss": 1.7106, + "step": 13670 + }, + { + "epoch": 1.14, + "grad_norm": 2.918569564819336, + "learning_rate": 1.236231884057971e-05, + "loss": 1.6561, + "step": 13680 + }, + { + "epoch": 1.14, + "grad_norm": 2.9456100463867188, + "learning_rate": 1.2356521739130437e-05, + "loss": 1.5517, + "step": 13690 + }, + { + "epoch": 1.14, + "grad_norm": 2.2089602947235107, + "learning_rate": 1.2350724637681159e-05, + "loss": 1.6579, + "step": 13700 + }, + { + "epoch": 1.14, + "grad_norm": 4.878429889678955, + "learning_rate": 1.2344927536231885e-05, + "loss": 1.6001, + "step": 13710 + }, + { + "epoch": 1.14, + "grad_norm": 1.9600847959518433, + "learning_rate": 1.2339130434782609e-05, + "loss": 1.6324, + "step": 13720 + }, + { + "epoch": 1.14, + "grad_norm": 4.215534687042236, + "learning_rate": 1.2333333333333334e-05, + "loss": 1.76, + "step": 13730 + }, + { + "epoch": 1.15, + "grad_norm": 1.4697761535644531, + "learning_rate": 1.2327536231884058e-05, + "loss": 1.6336, + "step": 13740 + }, + { + "epoch": 1.15, + "grad_norm": 2.4715230464935303, + "learning_rate": 1.2321739130434784e-05, + "loss": 1.7028, + "step": 13750 + }, + { + "epoch": 1.15, + "grad_norm": 2.2741434574127197, + "learning_rate": 1.2315942028985508e-05, + "loss": 1.713, + "step": 13760 + }, + { + "epoch": 1.15, + "grad_norm": 1.9270362854003906, + "learning_rate": 1.2310144927536234e-05, + "loss": 1.6415, + "step": 13770 + }, + { + "epoch": 1.15, + "grad_norm": 4.036669731140137, + "learning_rate": 1.2304347826086958e-05, + "loss": 1.7856, + "step": 13780 + }, + { + "epoch": 1.15, + "grad_norm": 3.498189926147461, + "learning_rate": 1.2298550724637682e-05, + "loss": 1.575, + "step": 13790 + }, + { + "epoch": 1.15, + "grad_norm": 2.0799221992492676, + "learning_rate": 1.2292753623188408e-05, + "loss": 1.4947, + "step": 13800 + }, + { + "epoch": 1.15, + "grad_norm": 2.5803940296173096, + "learning_rate": 1.228695652173913e-05, + "loss": 1.7288, + "step": 13810 + }, + { + "epoch": 1.15, + "grad_norm": 4.215254306793213, + "learning_rate": 1.2281159420289857e-05, + "loss": 1.76, + "step": 13820 + }, + { + "epoch": 1.15, + "grad_norm": 4.722815990447998, + "learning_rate": 1.227536231884058e-05, + "loss": 1.6812, + "step": 13830 + }, + { + "epoch": 1.15, + "grad_norm": 3.2459473609924316, + "learning_rate": 1.2269565217391305e-05, + "loss": 1.6047, + "step": 13840 + }, + { + "epoch": 1.15, + "grad_norm": 1.8425486087799072, + "learning_rate": 1.226376811594203e-05, + "loss": 1.6592, + "step": 13850 + }, + { + "epoch": 1.16, + "grad_norm": 3.949090003967285, + "learning_rate": 1.2257971014492755e-05, + "loss": 1.644, + "step": 13860 + }, + { + "epoch": 1.16, + "grad_norm": 2.5830769538879395, + "learning_rate": 1.2252173913043479e-05, + "loss": 1.7824, + "step": 13870 + }, + { + "epoch": 1.16, + "grad_norm": 1.8728455305099487, + "learning_rate": 1.2246376811594205e-05, + "loss": 1.7766, + "step": 13880 + }, + { + "epoch": 1.16, + "grad_norm": 2.3987927436828613, + "learning_rate": 1.2240579710144929e-05, + "loss": 1.6303, + "step": 13890 + }, + { + "epoch": 1.16, + "grad_norm": 6.897881031036377, + "learning_rate": 1.2234782608695653e-05, + "loss": 1.6986, + "step": 13900 + }, + { + "epoch": 1.16, + "grad_norm": 1.8493584394454956, + "learning_rate": 1.2228985507246378e-05, + "loss": 1.6711, + "step": 13910 + }, + { + "epoch": 1.16, + "grad_norm": 2.1658079624176025, + "learning_rate": 1.2223188405797102e-05, + "loss": 1.5714, + "step": 13920 + }, + { + "epoch": 1.16, + "grad_norm": 7.026312828063965, + "learning_rate": 1.2217391304347828e-05, + "loss": 1.6648, + "step": 13930 + }, + { + "epoch": 1.16, + "grad_norm": 4.64369010925293, + "learning_rate": 1.2211594202898552e-05, + "loss": 1.622, + "step": 13940 + }, + { + "epoch": 1.16, + "grad_norm": 1.4812980890274048, + "learning_rate": 1.2205797101449278e-05, + "loss": 1.6563, + "step": 13950 + }, + { + "epoch": 1.16, + "grad_norm": 3.3375606536865234, + "learning_rate": 1.22e-05, + "loss": 1.7177, + "step": 13960 + }, + { + "epoch": 1.16, + "grad_norm": 3.0398337841033936, + "learning_rate": 1.2194202898550726e-05, + "loss": 1.8483, + "step": 13970 + }, + { + "epoch": 1.17, + "grad_norm": 2.477541446685791, + "learning_rate": 1.218840579710145e-05, + "loss": 1.7636, + "step": 13980 + }, + { + "epoch": 1.17, + "grad_norm": 2.2809600830078125, + "learning_rate": 1.2182608695652174e-05, + "loss": 1.6272, + "step": 13990 + }, + { + "epoch": 1.17, + "grad_norm": 3.5135672092437744, + "learning_rate": 1.21768115942029e-05, + "loss": 1.6053, + "step": 14000 + }, + { + "epoch": 1.17, + "eval_loss": 1.655966877937317, + "eval_runtime": 107.5028, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 14000 + }, + { + "epoch": 1.17, + "grad_norm": 6.65315580368042, + "learning_rate": 1.2171014492753624e-05, + "loss": 1.6857, + "step": 14010 + }, + { + "epoch": 1.17, + "grad_norm": 3.5551207065582275, + "learning_rate": 1.216521739130435e-05, + "loss": 1.6957, + "step": 14020 + }, + { + "epoch": 1.17, + "grad_norm": 3.662346839904785, + "learning_rate": 1.2159420289855073e-05, + "loss": 1.709, + "step": 14030 + }, + { + "epoch": 1.17, + "grad_norm": 3.7407190799713135, + "learning_rate": 1.2153623188405799e-05, + "loss": 1.7064, + "step": 14040 + }, + { + "epoch": 1.17, + "grad_norm": 3.3801932334899902, + "learning_rate": 1.2147826086956523e-05, + "loss": 1.5048, + "step": 14050 + }, + { + "epoch": 1.17, + "grad_norm": 1.1840766668319702, + "learning_rate": 1.2142028985507249e-05, + "loss": 1.6463, + "step": 14060 + }, + { + "epoch": 1.17, + "grad_norm": 5.610103130340576, + "learning_rate": 1.2136231884057973e-05, + "loss": 1.6947, + "step": 14070 + }, + { + "epoch": 1.17, + "grad_norm": 3.217325210571289, + "learning_rate": 1.2130434782608698e-05, + "loss": 1.6183, + "step": 14080 + }, + { + "epoch": 1.17, + "grad_norm": 3.316359281539917, + "learning_rate": 1.212463768115942e-05, + "loss": 1.6692, + "step": 14090 + }, + { + "epoch": 1.18, + "grad_norm": 4.7778801918029785, + "learning_rate": 1.2118840579710145e-05, + "loss": 1.7233, + "step": 14100 + }, + { + "epoch": 1.18, + "grad_norm": 4.980316162109375, + "learning_rate": 1.211304347826087e-05, + "loss": 1.5753, + "step": 14110 + }, + { + "epoch": 1.18, + "grad_norm": 2.6318607330322266, + "learning_rate": 1.2107246376811594e-05, + "loss": 1.5216, + "step": 14120 + }, + { + "epoch": 1.18, + "grad_norm": 4.447054386138916, + "learning_rate": 1.210144927536232e-05, + "loss": 1.82, + "step": 14130 + }, + { + "epoch": 1.18, + "grad_norm": 1.368971586227417, + "learning_rate": 1.2095652173913044e-05, + "loss": 1.7372, + "step": 14140 + }, + { + "epoch": 1.18, + "grad_norm": 3.2000670433044434, + "learning_rate": 1.208985507246377e-05, + "loss": 1.7553, + "step": 14150 + }, + { + "epoch": 1.18, + "grad_norm": 3.1410953998565674, + "learning_rate": 1.2084057971014494e-05, + "loss": 1.778, + "step": 14160 + }, + { + "epoch": 1.18, + "grad_norm": 1.6340982913970947, + "learning_rate": 1.207826086956522e-05, + "loss": 1.5523, + "step": 14170 + }, + { + "epoch": 1.18, + "grad_norm": 2.945784568786621, + "learning_rate": 1.2072463768115943e-05, + "loss": 1.6813, + "step": 14180 + }, + { + "epoch": 1.18, + "grad_norm": 1.7603634595870972, + "learning_rate": 1.206666666666667e-05, + "loss": 1.7787, + "step": 14190 + }, + { + "epoch": 1.18, + "grad_norm": 3.8819363117218018, + "learning_rate": 1.2060869565217393e-05, + "loss": 1.7003, + "step": 14200 + }, + { + "epoch": 1.18, + "grad_norm": 3.882131576538086, + "learning_rate": 1.2055072463768115e-05, + "loss": 1.7552, + "step": 14210 + }, + { + "epoch": 1.19, + "grad_norm": 2.339003324508667, + "learning_rate": 1.2049275362318841e-05, + "loss": 1.6867, + "step": 14220 + }, + { + "epoch": 1.19, + "grad_norm": 5.375877857208252, + "learning_rate": 1.2043478260869565e-05, + "loss": 1.7049, + "step": 14230 + }, + { + "epoch": 1.19, + "grad_norm": 3.6439826488494873, + "learning_rate": 1.2037681159420291e-05, + "loss": 1.7307, + "step": 14240 + }, + { + "epoch": 1.19, + "grad_norm": 2.504507541656494, + "learning_rate": 1.2031884057971015e-05, + "loss": 1.7403, + "step": 14250 + }, + { + "epoch": 1.19, + "grad_norm": 2.381837844848633, + "learning_rate": 1.202608695652174e-05, + "loss": 1.6733, + "step": 14260 + }, + { + "epoch": 1.19, + "grad_norm": 5.466141700744629, + "learning_rate": 1.2020289855072465e-05, + "loss": 1.7054, + "step": 14270 + }, + { + "epoch": 1.19, + "grad_norm": 2.095202922821045, + "learning_rate": 1.201449275362319e-05, + "loss": 1.6852, + "step": 14280 + }, + { + "epoch": 1.19, + "grad_norm": 2.33658766746521, + "learning_rate": 1.2008695652173914e-05, + "loss": 1.7658, + "step": 14290 + }, + { + "epoch": 1.19, + "grad_norm": 3.439746379852295, + "learning_rate": 1.2002898550724638e-05, + "loss": 1.5291, + "step": 14300 + }, + { + "epoch": 1.19, + "grad_norm": 6.367286682128906, + "learning_rate": 1.1997101449275364e-05, + "loss": 1.6469, + "step": 14310 + }, + { + "epoch": 1.19, + "grad_norm": 4.05848503112793, + "learning_rate": 1.1991304347826086e-05, + "loss": 1.8017, + "step": 14320 + }, + { + "epoch": 1.19, + "grad_norm": 4.569128036499023, + "learning_rate": 1.1985507246376814e-05, + "loss": 1.5023, + "step": 14330 + }, + { + "epoch": 1.2, + "grad_norm": 3.5229172706604004, + "learning_rate": 1.1979710144927536e-05, + "loss": 1.7475, + "step": 14340 + }, + { + "epoch": 1.2, + "grad_norm": 4.319218635559082, + "learning_rate": 1.1973913043478262e-05, + "loss": 1.6949, + "step": 14350 + }, + { + "epoch": 1.2, + "grad_norm": 5.897936820983887, + "learning_rate": 1.1968115942028986e-05, + "loss": 1.6718, + "step": 14360 + }, + { + "epoch": 1.2, + "grad_norm": 2.435079574584961, + "learning_rate": 1.1962318840579711e-05, + "loss": 1.6979, + "step": 14370 + }, + { + "epoch": 1.2, + "grad_norm": 3.011115312576294, + "learning_rate": 1.1956521739130435e-05, + "loss": 1.4714, + "step": 14380 + }, + { + "epoch": 1.2, + "grad_norm": 1.5290638208389282, + "learning_rate": 1.1950724637681161e-05, + "loss": 1.5394, + "step": 14390 + }, + { + "epoch": 1.2, + "grad_norm": 5.277037620544434, + "learning_rate": 1.1944927536231885e-05, + "loss": 1.6627, + "step": 14400 + }, + { + "epoch": 1.2, + "grad_norm": 5.786652088165283, + "learning_rate": 1.1939130434782609e-05, + "loss": 1.7029, + "step": 14410 + }, + { + "epoch": 1.2, + "grad_norm": 1.4963030815124512, + "learning_rate": 1.1933333333333335e-05, + "loss": 1.6999, + "step": 14420 + }, + { + "epoch": 1.2, + "grad_norm": 2.0223753452301025, + "learning_rate": 1.1927536231884059e-05, + "loss": 1.6998, + "step": 14430 + }, + { + "epoch": 1.2, + "grad_norm": 3.4138219356536865, + "learning_rate": 1.1921739130434785e-05, + "loss": 1.5385, + "step": 14440 + }, + { + "epoch": 1.2, + "grad_norm": 5.297712802886963, + "learning_rate": 1.1915942028985507e-05, + "loss": 1.6142, + "step": 14450 + }, + { + "epoch": 1.21, + "grad_norm": 3.5567455291748047, + "learning_rate": 1.1910144927536234e-05, + "loss": 1.7254, + "step": 14460 + }, + { + "epoch": 1.21, + "grad_norm": 4.329671382904053, + "learning_rate": 1.1904347826086957e-05, + "loss": 1.7098, + "step": 14470 + }, + { + "epoch": 1.21, + "grad_norm": 1.1015174388885498, + "learning_rate": 1.1898550724637682e-05, + "loss": 1.7312, + "step": 14480 + }, + { + "epoch": 1.21, + "grad_norm": 7.632756233215332, + "learning_rate": 1.1892753623188406e-05, + "loss": 1.5845, + "step": 14490 + }, + { + "epoch": 1.21, + "grad_norm": 4.536245822906494, + "learning_rate": 1.1886956521739132e-05, + "loss": 1.7143, + "step": 14500 + }, + { + "epoch": 1.21, + "eval_loss": 1.686023235321045, + "eval_runtime": 107.504, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 14500 + }, + { + "epoch": 1.21, + "grad_norm": 2.551739454269409, + "learning_rate": 1.1881159420289856e-05, + "loss": 1.7145, + "step": 14510 + }, + { + "epoch": 1.21, + "grad_norm": 2.037818431854248, + "learning_rate": 1.187536231884058e-05, + "loss": 1.4981, + "step": 14520 + }, + { + "epoch": 1.21, + "grad_norm": 2.1799204349517822, + "learning_rate": 1.1869565217391306e-05, + "loss": 1.634, + "step": 14530 + }, + { + "epoch": 1.21, + "grad_norm": 2.238110065460205, + "learning_rate": 1.186376811594203e-05, + "loss": 1.5684, + "step": 14540 + }, + { + "epoch": 1.21, + "grad_norm": 1.419713020324707, + "learning_rate": 1.1857971014492755e-05, + "loss": 1.6688, + "step": 14550 + }, + { + "epoch": 1.21, + "grad_norm": 2.3431668281555176, + "learning_rate": 1.185217391304348e-05, + "loss": 1.7666, + "step": 14560 + }, + { + "epoch": 1.21, + "grad_norm": 5.332759857177734, + "learning_rate": 1.1846376811594205e-05, + "loss": 1.7597, + "step": 14570 + }, + { + "epoch": 1.22, + "grad_norm": 1.4301503896713257, + "learning_rate": 1.1840579710144927e-05, + "loss": 1.742, + "step": 14580 + }, + { + "epoch": 1.22, + "grad_norm": 5.501166343688965, + "learning_rate": 1.1834782608695655e-05, + "loss": 1.6943, + "step": 14590 + }, + { + "epoch": 1.22, + "grad_norm": 1.4921094179153442, + "learning_rate": 1.1828985507246377e-05, + "loss": 1.7038, + "step": 14600 + }, + { + "epoch": 1.22, + "grad_norm": 2.763652801513672, + "learning_rate": 1.1823188405797101e-05, + "loss": 1.683, + "step": 14610 + }, + { + "epoch": 1.22, + "grad_norm": 7.096171855926514, + "learning_rate": 1.1817391304347827e-05, + "loss": 1.5952, + "step": 14620 + }, + { + "epoch": 1.22, + "grad_norm": 3.0550990104675293, + "learning_rate": 1.181159420289855e-05, + "loss": 1.6841, + "step": 14630 + }, + { + "epoch": 1.22, + "grad_norm": 6.7404937744140625, + "learning_rate": 1.1805797101449277e-05, + "loss": 1.3851, + "step": 14640 + }, + { + "epoch": 1.22, + "grad_norm": 2.006895065307617, + "learning_rate": 1.18e-05, + "loss": 1.6884, + "step": 14650 + }, + { + "epoch": 1.22, + "grad_norm": 4.734574317932129, + "learning_rate": 1.1794202898550726e-05, + "loss": 1.6566, + "step": 14660 + }, + { + "epoch": 1.22, + "grad_norm": 2.7309789657592773, + "learning_rate": 1.178840579710145e-05, + "loss": 1.7182, + "step": 14670 + }, + { + "epoch": 1.22, + "grad_norm": 2.692793846130371, + "learning_rate": 1.1782608695652176e-05, + "loss": 1.6907, + "step": 14680 + }, + { + "epoch": 1.22, + "grad_norm": 2.2033607959747314, + "learning_rate": 1.17768115942029e-05, + "loss": 1.6187, + "step": 14690 + }, + { + "epoch": 1.23, + "grad_norm": 1.37400221824646, + "learning_rate": 1.1771014492753626e-05, + "loss": 1.7607, + "step": 14700 + }, + { + "epoch": 1.23, + "grad_norm": 1.4525043964385986, + "learning_rate": 1.176521739130435e-05, + "loss": 1.5481, + "step": 14710 + }, + { + "epoch": 1.23, + "grad_norm": 4.075309753417969, + "learning_rate": 1.1759420289855072e-05, + "loss": 1.6437, + "step": 14720 + }, + { + "epoch": 1.23, + "grad_norm": 1.6363898515701294, + "learning_rate": 1.1753623188405798e-05, + "loss": 1.6288, + "step": 14730 + }, + { + "epoch": 1.23, + "grad_norm": 2.210326671600342, + "learning_rate": 1.1747826086956522e-05, + "loss": 1.6164, + "step": 14740 + }, + { + "epoch": 1.23, + "grad_norm": 2.808457851409912, + "learning_rate": 1.1742028985507247e-05, + "loss": 1.7345, + "step": 14750 + }, + { + "epoch": 1.23, + "grad_norm": 4.7805867195129395, + "learning_rate": 1.1736231884057971e-05, + "loss": 1.666, + "step": 14760 + }, + { + "epoch": 1.23, + "grad_norm": 1.179888129234314, + "learning_rate": 1.1730434782608697e-05, + "loss": 1.66, + "step": 14770 + }, + { + "epoch": 1.23, + "grad_norm": 3.300703287124634, + "learning_rate": 1.1724637681159421e-05, + "loss": 1.5873, + "step": 14780 + }, + { + "epoch": 1.23, + "grad_norm": 4.175435543060303, + "learning_rate": 1.1718840579710147e-05, + "loss": 1.7415, + "step": 14790 + }, + { + "epoch": 1.23, + "grad_norm": 3.4557175636291504, + "learning_rate": 1.171304347826087e-05, + "loss": 1.7369, + "step": 14800 + }, + { + "epoch": 1.23, + "grad_norm": 2.879883289337158, + "learning_rate": 1.1707246376811596e-05, + "loss": 1.6883, + "step": 14810 + }, + { + "epoch": 1.23, + "grad_norm": 7.861069679260254, + "learning_rate": 1.170144927536232e-05, + "loss": 1.6678, + "step": 14820 + }, + { + "epoch": 1.24, + "grad_norm": 2.97892427444458, + "learning_rate": 1.1695652173913043e-05, + "loss": 1.5186, + "step": 14830 + }, + { + "epoch": 1.24, + "grad_norm": 1.9701839685440063, + "learning_rate": 1.168985507246377e-05, + "loss": 1.6847, + "step": 14840 + }, + { + "epoch": 1.24, + "grad_norm": 2.1920979022979736, + "learning_rate": 1.1684057971014492e-05, + "loss": 1.6583, + "step": 14850 + }, + { + "epoch": 1.24, + "grad_norm": 4.745516777038574, + "learning_rate": 1.1678260869565218e-05, + "loss": 1.6884, + "step": 14860 + }, + { + "epoch": 1.24, + "grad_norm": 7.854526042938232, + "learning_rate": 1.1672463768115942e-05, + "loss": 1.7469, + "step": 14870 + }, + { + "epoch": 1.24, + "grad_norm": 2.85132098197937, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.5597, + "step": 14880 + }, + { + "epoch": 1.24, + "grad_norm": 1.9505503177642822, + "learning_rate": 1.1660869565217392e-05, + "loss": 1.701, + "step": 14890 + }, + { + "epoch": 1.24, + "grad_norm": 4.181471824645996, + "learning_rate": 1.1655072463768118e-05, + "loss": 1.7843, + "step": 14900 + }, + { + "epoch": 1.24, + "grad_norm": 3.77048659324646, + "learning_rate": 1.1649275362318842e-05, + "loss": 1.7381, + "step": 14910 + }, + { + "epoch": 1.24, + "grad_norm": 7.85846471786499, + "learning_rate": 1.1643478260869566e-05, + "loss": 1.7673, + "step": 14920 + }, + { + "epoch": 1.24, + "grad_norm": 7.585514545440674, + "learning_rate": 1.1637681159420291e-05, + "loss": 1.6633, + "step": 14930 + }, + { + "epoch": 1.25, + "grad_norm": 10.48277759552002, + "learning_rate": 1.1631884057971015e-05, + "loss": 1.6939, + "step": 14940 + }, + { + "epoch": 1.25, + "grad_norm": 3.1543049812316895, + "learning_rate": 1.1626086956521741e-05, + "loss": 1.6358, + "step": 14950 + }, + { + "epoch": 1.25, + "grad_norm": 2.4650087356567383, + "learning_rate": 1.1620289855072463e-05, + "loss": 1.7219, + "step": 14960 + }, + { + "epoch": 1.25, + "grad_norm": 1.0995783805847168, + "learning_rate": 1.161449275362319e-05, + "loss": 1.68, + "step": 14970 + }, + { + "epoch": 1.25, + "grad_norm": 5.351716995239258, + "learning_rate": 1.1608695652173913e-05, + "loss": 1.7076, + "step": 14980 + }, + { + "epoch": 1.25, + "grad_norm": 1.2592127323150635, + "learning_rate": 1.1602898550724639e-05, + "loss": 1.6001, + "step": 14990 + }, + { + "epoch": 1.25, + "grad_norm": 1.4201385974884033, + "learning_rate": 1.1597101449275363e-05, + "loss": 1.753, + "step": 15000 + }, + { + "epoch": 1.25, + "eval_loss": 1.7132278680801392, + "eval_runtime": 107.4961, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 15000 + }, + { + "epoch": 1.25, + "grad_norm": 2.306769609451294, + "learning_rate": 1.1591304347826088e-05, + "loss": 1.6939, + "step": 15010 + }, + { + "epoch": 1.25, + "grad_norm": 1.9689505100250244, + "learning_rate": 1.1585507246376812e-05, + "loss": 1.6635, + "step": 15020 + }, + { + "epoch": 1.25, + "grad_norm": 4.207001209259033, + "learning_rate": 1.1579710144927536e-05, + "loss": 1.7515, + "step": 15030 + }, + { + "epoch": 1.25, + "grad_norm": 3.0428285598754883, + "learning_rate": 1.1573913043478262e-05, + "loss": 1.5935, + "step": 15040 + }, + { + "epoch": 1.25, + "grad_norm": 4.358133792877197, + "learning_rate": 1.1568115942028986e-05, + "loss": 1.6732, + "step": 15050 + }, + { + "epoch": 1.25, + "grad_norm": 4.284139633178711, + "learning_rate": 1.1562318840579712e-05, + "loss": 1.7963, + "step": 15060 + }, + { + "epoch": 1.26, + "grad_norm": 2.9040706157684326, + "learning_rate": 1.1556521739130436e-05, + "loss": 1.7937, + "step": 15070 + }, + { + "epoch": 1.26, + "grad_norm": 4.678374290466309, + "learning_rate": 1.1550724637681162e-05, + "loss": 1.5987, + "step": 15080 + }, + { + "epoch": 1.26, + "grad_norm": 1.1551108360290527, + "learning_rate": 1.1544927536231884e-05, + "loss": 1.7292, + "step": 15090 + }, + { + "epoch": 1.26, + "grad_norm": 3.153188943862915, + "learning_rate": 1.1539130434782611e-05, + "loss": 1.5968, + "step": 15100 + }, + { + "epoch": 1.26, + "grad_norm": 7.321458339691162, + "learning_rate": 1.1533333333333334e-05, + "loss": 1.7164, + "step": 15110 + }, + { + "epoch": 1.26, + "grad_norm": 1.9099916219711304, + "learning_rate": 1.152753623188406e-05, + "loss": 1.7368, + "step": 15120 + }, + { + "epoch": 1.26, + "grad_norm": 4.101009368896484, + "learning_rate": 1.1521739130434783e-05, + "loss": 1.7376, + "step": 15130 + }, + { + "epoch": 1.26, + "grad_norm": 10.590901374816895, + "learning_rate": 1.1515942028985507e-05, + "loss": 1.7985, + "step": 15140 + }, + { + "epoch": 1.26, + "grad_norm": 4.799911022186279, + "learning_rate": 1.1510144927536233e-05, + "loss": 1.7259, + "step": 15150 + }, + { + "epoch": 1.26, + "grad_norm": 1.911226511001587, + "learning_rate": 1.1504347826086957e-05, + "loss": 1.7439, + "step": 15160 + }, + { + "epoch": 1.26, + "grad_norm": 1.5930674076080322, + "learning_rate": 1.1498550724637683e-05, + "loss": 1.7223, + "step": 15170 + }, + { + "epoch": 1.27, + "grad_norm": 3.125276565551758, + "learning_rate": 1.1492753623188407e-05, + "loss": 1.645, + "step": 15180 + }, + { + "epoch": 1.27, + "grad_norm": 1.448198914527893, + "learning_rate": 1.1486956521739132e-05, + "loss": 1.6675, + "step": 15190 + }, + { + "epoch": 1.27, + "grad_norm": 2.842217445373535, + "learning_rate": 1.1481159420289856e-05, + "loss": 1.761, + "step": 15200 + }, + { + "epoch": 1.27, + "grad_norm": 1.6608208417892456, + "learning_rate": 1.1475362318840582e-05, + "loss": 1.6394, + "step": 15210 + }, + { + "epoch": 1.27, + "grad_norm": 2.386859178543091, + "learning_rate": 1.1469565217391304e-05, + "loss": 1.6165, + "step": 15220 + }, + { + "epoch": 1.27, + "grad_norm": 1.5258991718292236, + "learning_rate": 1.1463768115942028e-05, + "loss": 1.7252, + "step": 15230 + }, + { + "epoch": 1.27, + "grad_norm": 2.106661081314087, + "learning_rate": 1.1457971014492754e-05, + "loss": 1.6343, + "step": 15240 + }, + { + "epoch": 1.27, + "grad_norm": 1.260819435119629, + "learning_rate": 1.1452173913043478e-05, + "loss": 1.7971, + "step": 15250 + }, + { + "epoch": 1.27, + "grad_norm": 3.5960564613342285, + "learning_rate": 1.1446376811594204e-05, + "loss": 1.6154, + "step": 15260 + }, + { + "epoch": 1.27, + "grad_norm": 3.4843623638153076, + "learning_rate": 1.1440579710144928e-05, + "loss": 1.8285, + "step": 15270 + }, + { + "epoch": 1.27, + "grad_norm": 2.6920154094696045, + "learning_rate": 1.1434782608695654e-05, + "loss": 1.7324, + "step": 15280 + }, + { + "epoch": 1.27, + "grad_norm": 3.802539110183716, + "learning_rate": 1.1428985507246378e-05, + "loss": 1.6399, + "step": 15290 + }, + { + "epoch": 1.27, + "grad_norm": 3.4101133346557617, + "learning_rate": 1.1423188405797103e-05, + "loss": 1.7186, + "step": 15300 + }, + { + "epoch": 1.28, + "grad_norm": 1.7906736135482788, + "learning_rate": 1.1417391304347827e-05, + "loss": 1.7397, + "step": 15310 + }, + { + "epoch": 1.28, + "grad_norm": 2.0107662677764893, + "learning_rate": 1.1411594202898553e-05, + "loss": 1.6297, + "step": 15320 + }, + { + "epoch": 1.28, + "grad_norm": 2.830557107925415, + "learning_rate": 1.1405797101449277e-05, + "loss": 1.6362, + "step": 15330 + }, + { + "epoch": 1.28, + "grad_norm": 1.7907980680465698, + "learning_rate": 1.14e-05, + "loss": 1.6913, + "step": 15340 + }, + { + "epoch": 1.28, + "grad_norm": 1.5224626064300537, + "learning_rate": 1.1394202898550725e-05, + "loss": 1.7274, + "step": 15350 + }, + { + "epoch": 1.28, + "grad_norm": 7.044021129608154, + "learning_rate": 1.1388405797101449e-05, + "loss": 1.6653, + "step": 15360 + }, + { + "epoch": 1.28, + "grad_norm": 2.0197882652282715, + "learning_rate": 1.1382608695652175e-05, + "loss": 1.7462, + "step": 15370 + }, + { + "epoch": 1.28, + "grad_norm": 3.1112966537475586, + "learning_rate": 1.1376811594202899e-05, + "loss": 1.7024, + "step": 15380 + }, + { + "epoch": 1.28, + "grad_norm": 2.9264869689941406, + "learning_rate": 1.1371014492753624e-05, + "loss": 1.6285, + "step": 15390 + }, + { + "epoch": 1.28, + "grad_norm": 2.849034309387207, + "learning_rate": 1.1365217391304348e-05, + "loss": 1.7542, + "step": 15400 + }, + { + "epoch": 1.28, + "grad_norm": 5.498842239379883, + "learning_rate": 1.1359420289855074e-05, + "loss": 1.6733, + "step": 15410 + }, + { + "epoch": 1.28, + "grad_norm": 3.047708034515381, + "learning_rate": 1.1353623188405798e-05, + "loss": 1.6027, + "step": 15420 + }, + { + "epoch": 1.29, + "grad_norm": 4.122775077819824, + "learning_rate": 1.1347826086956524e-05, + "loss": 1.7346, + "step": 15430 + }, + { + "epoch": 1.29, + "grad_norm": 2.988992929458618, + "learning_rate": 1.1342028985507248e-05, + "loss": 1.5555, + "step": 15440 + }, + { + "epoch": 1.29, + "grad_norm": 1.4278367757797241, + "learning_rate": 1.1336231884057972e-05, + "loss": 1.6886, + "step": 15450 + }, + { + "epoch": 1.29, + "grad_norm": 1.543633222579956, + "learning_rate": 1.1330434782608698e-05, + "loss": 1.5712, + "step": 15460 + }, + { + "epoch": 1.29, + "grad_norm": 2.563520908355713, + "learning_rate": 1.132463768115942e-05, + "loss": 1.6082, + "step": 15470 + }, + { + "epoch": 1.29, + "grad_norm": 3.311037540435791, + "learning_rate": 1.1318840579710147e-05, + "loss": 1.56, + "step": 15480 + }, + { + "epoch": 1.29, + "grad_norm": 1.979817271232605, + "learning_rate": 1.131304347826087e-05, + "loss": 1.7833, + "step": 15490 + }, + { + "epoch": 1.29, + "grad_norm": 2.7615084648132324, + "learning_rate": 1.1307246376811595e-05, + "loss": 1.6641, + "step": 15500 + }, + { + "epoch": 1.29, + "eval_loss": 1.6843485832214355, + "eval_runtime": 107.5057, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 15500 + }, + { + "epoch": 1.29, + "grad_norm": 2.724392890930176, + "learning_rate": 1.130144927536232e-05, + "loss": 1.5888, + "step": 15510 + }, + { + "epoch": 1.29, + "grad_norm": 4.149465560913086, + "learning_rate": 1.1295652173913045e-05, + "loss": 1.7467, + "step": 15520 + }, + { + "epoch": 1.29, + "grad_norm": 1.9117258787155151, + "learning_rate": 1.1289855072463769e-05, + "loss": 1.6763, + "step": 15530 + }, + { + "epoch": 1.29, + "grad_norm": 2.0557730197906494, + "learning_rate": 1.1284057971014493e-05, + "loss": 1.7202, + "step": 15540 + }, + { + "epoch": 1.3, + "grad_norm": 2.089738130569458, + "learning_rate": 1.1278260869565219e-05, + "loss": 1.7192, + "step": 15550 + }, + { + "epoch": 1.3, + "grad_norm": 4.925050735473633, + "learning_rate": 1.1272463768115943e-05, + "loss": 1.7204, + "step": 15560 + }, + { + "epoch": 1.3, + "grad_norm": 7.140872001647949, + "learning_rate": 1.1266666666666668e-05, + "loss": 1.5706, + "step": 15570 + }, + { + "epoch": 1.3, + "grad_norm": 6.26693058013916, + "learning_rate": 1.1260869565217392e-05, + "loss": 1.796, + "step": 15580 + }, + { + "epoch": 1.3, + "grad_norm": 1.269399881362915, + "learning_rate": 1.1255072463768118e-05, + "loss": 1.5746, + "step": 15590 + }, + { + "epoch": 1.3, + "grad_norm": 1.2779074907302856, + "learning_rate": 1.124927536231884e-05, + "loss": 1.5743, + "step": 15600 + }, + { + "epoch": 1.3, + "grad_norm": 4.808041572570801, + "learning_rate": 1.1243478260869568e-05, + "loss": 1.7358, + "step": 15610 + }, + { + "epoch": 1.3, + "grad_norm": 2.192412853240967, + "learning_rate": 1.123768115942029e-05, + "loss": 1.7665, + "step": 15620 + }, + { + "epoch": 1.3, + "grad_norm": 0.8821000456809998, + "learning_rate": 1.1231884057971016e-05, + "loss": 1.6442, + "step": 15630 + }, + { + "epoch": 1.3, + "grad_norm": 2.8440635204315186, + "learning_rate": 1.122608695652174e-05, + "loss": 1.6318, + "step": 15640 + }, + { + "epoch": 1.3, + "grad_norm": 3.6958911418914795, + "learning_rate": 1.1220289855072464e-05, + "loss": 1.7744, + "step": 15650 + }, + { + "epoch": 1.3, + "grad_norm": 12.584587097167969, + "learning_rate": 1.121449275362319e-05, + "loss": 1.7242, + "step": 15660 + }, + { + "epoch": 1.31, + "grad_norm": 4.3364410400390625, + "learning_rate": 1.1208695652173913e-05, + "loss": 1.7048, + "step": 15670 + }, + { + "epoch": 1.31, + "grad_norm": 3.9475982189178467, + "learning_rate": 1.120289855072464e-05, + "loss": 1.6612, + "step": 15680 + }, + { + "epoch": 1.31, + "grad_norm": 5.1554036140441895, + "learning_rate": 1.1197101449275363e-05, + "loss": 1.6869, + "step": 15690 + }, + { + "epoch": 1.31, + "grad_norm": 2.2033803462982178, + "learning_rate": 1.1191304347826089e-05, + "loss": 1.6598, + "step": 15700 + }, + { + "epoch": 1.31, + "grad_norm": 3.7358901500701904, + "learning_rate": 1.1185507246376813e-05, + "loss": 1.7445, + "step": 15710 + }, + { + "epoch": 1.31, + "grad_norm": 1.531160593032837, + "learning_rate": 1.1179710144927539e-05, + "loss": 1.6378, + "step": 15720 + }, + { + "epoch": 1.31, + "grad_norm": 4.818709373474121, + "learning_rate": 1.1173913043478261e-05, + "loss": 1.6497, + "step": 15730 + }, + { + "epoch": 1.31, + "grad_norm": 4.811004161834717, + "learning_rate": 1.1168115942028988e-05, + "loss": 1.7243, + "step": 15740 + }, + { + "epoch": 1.31, + "grad_norm": 4.756585597991943, + "learning_rate": 1.116231884057971e-05, + "loss": 1.7284, + "step": 15750 + }, + { + "epoch": 1.31, + "grad_norm": 1.713197946548462, + "learning_rate": 1.1156521739130435e-05, + "loss": 1.7706, + "step": 15760 + }, + { + "epoch": 1.31, + "grad_norm": 5.140644073486328, + "learning_rate": 1.115072463768116e-05, + "loss": 1.6415, + "step": 15770 + }, + { + "epoch": 1.31, + "grad_norm": 1.6640381813049316, + "learning_rate": 1.1144927536231884e-05, + "loss": 1.7129, + "step": 15780 + }, + { + "epoch": 1.32, + "grad_norm": 2.814182996749878, + "learning_rate": 1.113913043478261e-05, + "loss": 1.5169, + "step": 15790 + }, + { + "epoch": 1.32, + "grad_norm": 4.703028678894043, + "learning_rate": 1.1133333333333334e-05, + "loss": 1.6786, + "step": 15800 + }, + { + "epoch": 1.32, + "grad_norm": 5.120509147644043, + "learning_rate": 1.112753623188406e-05, + "loss": 1.7163, + "step": 15810 + }, + { + "epoch": 1.32, + "grad_norm": 1.0847809314727783, + "learning_rate": 1.1121739130434784e-05, + "loss": 1.6943, + "step": 15820 + }, + { + "epoch": 1.32, + "grad_norm": 10.130949974060059, + "learning_rate": 1.111594202898551e-05, + "loss": 1.729, + "step": 15830 + }, + { + "epoch": 1.32, + "grad_norm": 2.5162465572357178, + "learning_rate": 1.1110144927536233e-05, + "loss": 1.7251, + "step": 15840 + }, + { + "epoch": 1.32, + "grad_norm": 4.79127311706543, + "learning_rate": 1.1104347826086956e-05, + "loss": 1.7665, + "step": 15850 + }, + { + "epoch": 1.32, + "grad_norm": 1.3719213008880615, + "learning_rate": 1.1098550724637681e-05, + "loss": 1.6865, + "step": 15860 + }, + { + "epoch": 1.32, + "grad_norm": 3.254711151123047, + "learning_rate": 1.1092753623188405e-05, + "loss": 1.7149, + "step": 15870 + }, + { + "epoch": 1.32, + "grad_norm": 11.944002151489258, + "learning_rate": 1.1086956521739131e-05, + "loss": 1.6525, + "step": 15880 + }, + { + "epoch": 1.32, + "grad_norm": 4.941390037536621, + "learning_rate": 1.1081159420289855e-05, + "loss": 1.5516, + "step": 15890 + }, + { + "epoch": 1.32, + "grad_norm": 7.373305797576904, + "learning_rate": 1.1075362318840581e-05, + "loss": 1.473, + "step": 15900 + }, + { + "epoch": 1.33, + "grad_norm": 5.447077751159668, + "learning_rate": 1.1069565217391305e-05, + "loss": 1.6378, + "step": 15910 + }, + { + "epoch": 1.33, + "grad_norm": 4.908672332763672, + "learning_rate": 1.106376811594203e-05, + "loss": 1.7512, + "step": 15920 + }, + { + "epoch": 1.33, + "grad_norm": 2.0174648761749268, + "learning_rate": 1.1057971014492755e-05, + "loss": 1.6252, + "step": 15930 + }, + { + "epoch": 1.33, + "grad_norm": 7.651984214782715, + "learning_rate": 1.105217391304348e-05, + "loss": 1.5333, + "step": 15940 + }, + { + "epoch": 1.33, + "grad_norm": 4.4102983474731445, + "learning_rate": 1.1046376811594204e-05, + "loss": 1.5242, + "step": 15950 + }, + { + "epoch": 1.33, + "grad_norm": 1.826643943786621, + "learning_rate": 1.1040579710144928e-05, + "loss": 1.6655, + "step": 15960 + }, + { + "epoch": 1.33, + "grad_norm": 2.4821903705596924, + "learning_rate": 1.1034782608695654e-05, + "loss": 1.6926, + "step": 15970 + }, + { + "epoch": 1.33, + "grad_norm": 1.6242338418960571, + "learning_rate": 1.1028985507246376e-05, + "loss": 1.7808, + "step": 15980 + }, + { + "epoch": 1.33, + "grad_norm": 8.218697547912598, + "learning_rate": 1.1023188405797102e-05, + "loss": 1.648, + "step": 15990 + }, + { + "epoch": 1.33, + "grad_norm": 2.398993730545044, + "learning_rate": 1.1017391304347826e-05, + "loss": 1.624, + "step": 16000 + }, + { + "epoch": 1.33, + "eval_loss": 1.6754653453826904, + "eval_runtime": 107.521, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 16000 + }, + { + "epoch": 1.33, + "grad_norm": 4.014492511749268, + "learning_rate": 1.1011594202898552e-05, + "loss": 1.5021, + "step": 16010 + }, + { + "epoch": 1.33, + "grad_norm": 3.1234304904937744, + "learning_rate": 1.1006376811594203e-05, + "loss": 1.6802, + "step": 16020 + }, + { + "epoch": 1.34, + "grad_norm": 4.326152801513672, + "learning_rate": 1.1000579710144927e-05, + "loss": 1.7546, + "step": 16030 + }, + { + "epoch": 1.34, + "grad_norm": 1.9750605821609497, + "learning_rate": 1.0994782608695653e-05, + "loss": 1.6459, + "step": 16040 + }, + { + "epoch": 1.34, + "grad_norm": 2.2335305213928223, + "learning_rate": 1.0988985507246377e-05, + "loss": 1.6248, + "step": 16050 + }, + { + "epoch": 1.34, + "grad_norm": 2.1429505348205566, + "learning_rate": 1.0983188405797102e-05, + "loss": 1.7079, + "step": 16060 + }, + { + "epoch": 1.34, + "grad_norm": 3.822464942932129, + "learning_rate": 1.0977391304347826e-05, + "loss": 1.7218, + "step": 16070 + }, + { + "epoch": 1.34, + "grad_norm": 4.145028114318848, + "learning_rate": 1.0971594202898552e-05, + "loss": 1.7098, + "step": 16080 + }, + { + "epoch": 1.34, + "grad_norm": 5.912876129150391, + "learning_rate": 1.0965797101449276e-05, + "loss": 1.7075, + "step": 16090 + }, + { + "epoch": 1.34, + "grad_norm": 6.154576778411865, + "learning_rate": 1.0960000000000002e-05, + "loss": 1.6158, + "step": 16100 + }, + { + "epoch": 1.34, + "grad_norm": 1.7767752408981323, + "learning_rate": 1.0954202898550726e-05, + "loss": 1.5413, + "step": 16110 + }, + { + "epoch": 1.34, + "grad_norm": 3.51847243309021, + "learning_rate": 1.0948405797101451e-05, + "loss": 1.6848, + "step": 16120 + }, + { + "epoch": 1.34, + "grad_norm": 0.7433416843414307, + "learning_rate": 1.0942608695652176e-05, + "loss": 1.6288, + "step": 16130 + }, + { + "epoch": 1.34, + "grad_norm": 3.0296177864074707, + "learning_rate": 1.0936811594202898e-05, + "loss": 1.5401, + "step": 16140 + }, + { + "epoch": 1.35, + "grad_norm": 4.524282932281494, + "learning_rate": 1.0931014492753625e-05, + "loss": 1.5648, + "step": 16150 + }, + { + "epoch": 1.35, + "grad_norm": 7.122012138366699, + "learning_rate": 1.0925217391304348e-05, + "loss": 1.7171, + "step": 16160 + }, + { + "epoch": 1.35, + "grad_norm": 2.0002593994140625, + "learning_rate": 1.0919420289855073e-05, + "loss": 1.7173, + "step": 16170 + }, + { + "epoch": 1.35, + "grad_norm": 4.547085762023926, + "learning_rate": 1.0913623188405797e-05, + "loss": 1.7737, + "step": 16180 + }, + { + "epoch": 1.35, + "grad_norm": 2.6324098110198975, + "learning_rate": 1.0907826086956523e-05, + "loss": 1.7222, + "step": 16190 + }, + { + "epoch": 1.35, + "grad_norm": 4.268777847290039, + "learning_rate": 1.0902028985507247e-05, + "loss": 1.7525, + "step": 16200 + }, + { + "epoch": 1.35, + "grad_norm": 2.096090793609619, + "learning_rate": 1.0896231884057973e-05, + "loss": 1.5166, + "step": 16210 + }, + { + "epoch": 1.35, + "grad_norm": 3.225011110305786, + "learning_rate": 1.0890434782608697e-05, + "loss": 1.7065, + "step": 16220 + }, + { + "epoch": 1.35, + "grad_norm": 2.468010425567627, + "learning_rate": 1.088463768115942e-05, + "loss": 1.7441, + "step": 16230 + }, + { + "epoch": 1.35, + "grad_norm": 3.6063239574432373, + "learning_rate": 1.0878840579710146e-05, + "loss": 1.7374, + "step": 16240 + }, + { + "epoch": 1.35, + "grad_norm": 3.4506964683532715, + "learning_rate": 1.087304347826087e-05, + "loss": 1.8498, + "step": 16250 + }, + { + "epoch": 1.35, + "grad_norm": 2.358245611190796, + "learning_rate": 1.0867246376811596e-05, + "loss": 1.7641, + "step": 16260 + }, + { + "epoch": 1.36, + "grad_norm": 1.0673184394836426, + "learning_rate": 1.0861449275362318e-05, + "loss": 1.7407, + "step": 16270 + }, + { + "epoch": 1.36, + "grad_norm": 1.860028624534607, + "learning_rate": 1.0855652173913046e-05, + "loss": 1.7653, + "step": 16280 + }, + { + "epoch": 1.36, + "grad_norm": 4.493389129638672, + "learning_rate": 1.0849855072463768e-05, + "loss": 1.5432, + "step": 16290 + }, + { + "epoch": 1.36, + "grad_norm": 3.1333208084106445, + "learning_rate": 1.0844057971014494e-05, + "loss": 1.5558, + "step": 16300 + }, + { + "epoch": 1.36, + "grad_norm": 2.722856283187866, + "learning_rate": 1.0838260869565218e-05, + "loss": 1.7136, + "step": 16310 + }, + { + "epoch": 1.36, + "grad_norm": 6.029232501983643, + "learning_rate": 1.0832463768115943e-05, + "loss": 1.7611, + "step": 16320 + }, + { + "epoch": 1.36, + "grad_norm": 3.4627902507781982, + "learning_rate": 1.0826666666666667e-05, + "loss": 1.6839, + "step": 16330 + }, + { + "epoch": 1.36, + "grad_norm": 1.860472559928894, + "learning_rate": 1.0820869565217391e-05, + "loss": 1.6267, + "step": 16340 + }, + { + "epoch": 1.36, + "grad_norm": 1.3834178447723389, + "learning_rate": 1.0815072463768117e-05, + "loss": 1.6913, + "step": 16350 + }, + { + "epoch": 1.36, + "grad_norm": 2.249950885772705, + "learning_rate": 1.0809275362318841e-05, + "loss": 1.6362, + "step": 16360 + }, + { + "epoch": 1.36, + "grad_norm": 4.343896865844727, + "learning_rate": 1.0803478260869567e-05, + "loss": 1.7563, + "step": 16370 + }, + { + "epoch": 1.36, + "grad_norm": 2.7473175525665283, + "learning_rate": 1.0797681159420291e-05, + "loss": 1.6708, + "step": 16380 + }, + { + "epoch": 1.37, + "grad_norm": 1.320664882659912, + "learning_rate": 1.0791884057971017e-05, + "loss": 1.8427, + "step": 16390 + }, + { + "epoch": 1.37, + "grad_norm": 6.2452778816223145, + "learning_rate": 1.0786086956521739e-05, + "loss": 1.6387, + "step": 16400 + }, + { + "epoch": 1.37, + "grad_norm": 4.248746395111084, + "learning_rate": 1.0780289855072466e-05, + "loss": 1.5971, + "step": 16410 + }, + { + "epoch": 1.37, + "grad_norm": 4.274169921875, + "learning_rate": 1.0774492753623189e-05, + "loss": 1.6122, + "step": 16420 + }, + { + "epoch": 1.37, + "grad_norm": 1.0745168924331665, + "learning_rate": 1.0768695652173914e-05, + "loss": 1.6966, + "step": 16430 + }, + { + "epoch": 1.37, + "grad_norm": 1.370829463005066, + "learning_rate": 1.0762898550724638e-05, + "loss": 1.5828, + "step": 16440 + }, + { + "epoch": 1.37, + "grad_norm": 4.591113567352295, + "learning_rate": 1.0757101449275362e-05, + "loss": 1.6895, + "step": 16450 + }, + { + "epoch": 1.37, + "grad_norm": 2.2173213958740234, + "learning_rate": 1.0751304347826088e-05, + "loss": 1.734, + "step": 16460 + }, + { + "epoch": 1.37, + "grad_norm": 2.926522970199585, + "learning_rate": 1.0745507246376812e-05, + "loss": 1.6714, + "step": 16470 + }, + { + "epoch": 1.37, + "grad_norm": 1.1437100172042847, + "learning_rate": 1.0739710144927538e-05, + "loss": 1.7062, + "step": 16480 + }, + { + "epoch": 1.37, + "grad_norm": 3.2627272605895996, + "learning_rate": 1.0733913043478262e-05, + "loss": 1.5612, + "step": 16490 + }, + { + "epoch": 1.38, + "grad_norm": 2.8069777488708496, + "learning_rate": 1.0728115942028987e-05, + "loss": 1.7116, + "step": 16500 + }, + { + "epoch": 1.38, + "eval_loss": 1.7130539417266846, + "eval_runtime": 107.5132, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 16500 + }, + { + "epoch": 1.38, + "grad_norm": 5.705499172210693, + "learning_rate": 1.0722318840579711e-05, + "loss": 1.6609, + "step": 16510 + }, + { + "epoch": 1.38, + "grad_norm": 2.6209776401519775, + "learning_rate": 1.0716521739130437e-05, + "loss": 1.683, + "step": 16520 + }, + { + "epoch": 1.38, + "grad_norm": 2.3433544635772705, + "learning_rate": 1.071072463768116e-05, + "loss": 1.7747, + "step": 16530 + }, + { + "epoch": 1.38, + "grad_norm": 4.4690423011779785, + "learning_rate": 1.0704927536231883e-05, + "loss": 1.6671, + "step": 16540 + }, + { + "epoch": 1.38, + "grad_norm": 4.0458269119262695, + "learning_rate": 1.069913043478261e-05, + "loss": 1.7513, + "step": 16550 + }, + { + "epoch": 1.38, + "grad_norm": 7.192028522491455, + "learning_rate": 1.0693333333333333e-05, + "loss": 1.7646, + "step": 16560 + }, + { + "epoch": 1.38, + "grad_norm": 4.764258861541748, + "learning_rate": 1.0687536231884059e-05, + "loss": 1.6782, + "step": 16570 + }, + { + "epoch": 1.38, + "grad_norm": 3.255431890487671, + "learning_rate": 1.0681739130434783e-05, + "loss": 1.6417, + "step": 16580 + }, + { + "epoch": 1.38, + "grad_norm": 0.955453634262085, + "learning_rate": 1.0675942028985509e-05, + "loss": 1.7354, + "step": 16590 + }, + { + "epoch": 1.38, + "grad_norm": 2.478759765625, + "learning_rate": 1.0670144927536233e-05, + "loss": 1.7076, + "step": 16600 + }, + { + "epoch": 1.38, + "grad_norm": 10.808612823486328, + "learning_rate": 1.0664347826086958e-05, + "loss": 1.7509, + "step": 16610 + }, + { + "epoch": 1.39, + "grad_norm": 3.3443713188171387, + "learning_rate": 1.0658550724637682e-05, + "loss": 1.6233, + "step": 16620 + }, + { + "epoch": 1.39, + "grad_norm": 6.435446262359619, + "learning_rate": 1.0652753623188408e-05, + "loss": 1.7493, + "step": 16630 + }, + { + "epoch": 1.39, + "grad_norm": 2.0918169021606445, + "learning_rate": 1.0646956521739132e-05, + "loss": 1.7184, + "step": 16640 + }, + { + "epoch": 1.39, + "grad_norm": 0.8072077631950378, + "learning_rate": 1.0641159420289854e-05, + "loss": 1.6724, + "step": 16650 + }, + { + "epoch": 1.39, + "grad_norm": 2.0545530319213867, + "learning_rate": 1.063536231884058e-05, + "loss": 1.7191, + "step": 16660 + }, + { + "epoch": 1.39, + "grad_norm": 4.794948101043701, + "learning_rate": 1.0629565217391304e-05, + "loss": 1.6553, + "step": 16670 + }, + { + "epoch": 1.39, + "grad_norm": 6.329624176025391, + "learning_rate": 1.062376811594203e-05, + "loss": 1.7232, + "step": 16680 + }, + { + "epoch": 1.39, + "grad_norm": 2.5596625804901123, + "learning_rate": 1.0617971014492754e-05, + "loss": 1.77, + "step": 16690 + }, + { + "epoch": 1.39, + "grad_norm": 2.4740288257598877, + "learning_rate": 1.061217391304348e-05, + "loss": 1.6394, + "step": 16700 + }, + { + "epoch": 1.39, + "grad_norm": 0.8927739858627319, + "learning_rate": 1.0606376811594203e-05, + "loss": 1.7045, + "step": 16710 + }, + { + "epoch": 1.39, + "grad_norm": 2.0956900119781494, + "learning_rate": 1.0600579710144929e-05, + "loss": 1.7058, + "step": 16720 + }, + { + "epoch": 1.39, + "grad_norm": 3.9234437942504883, + "learning_rate": 1.0594782608695653e-05, + "loss": 1.6324, + "step": 16730 + }, + { + "epoch": 1.4, + "grad_norm": 4.277024745941162, + "learning_rate": 1.0588985507246379e-05, + "loss": 1.7152, + "step": 16740 + }, + { + "epoch": 1.4, + "grad_norm": 3.355858564376831, + "learning_rate": 1.0583188405797103e-05, + "loss": 1.7431, + "step": 16750 + }, + { + "epoch": 1.4, + "grad_norm": 2.8377106189727783, + "learning_rate": 1.0577391304347827e-05, + "loss": 1.6996, + "step": 16760 + }, + { + "epoch": 1.4, + "grad_norm": 3.7551558017730713, + "learning_rate": 1.0571594202898553e-05, + "loss": 1.597, + "step": 16770 + }, + { + "epoch": 1.4, + "grad_norm": 4.006781578063965, + "learning_rate": 1.0565797101449275e-05, + "loss": 1.7018, + "step": 16780 + }, + { + "epoch": 1.4, + "grad_norm": 2.4323341846466064, + "learning_rate": 1.056e-05, + "loss": 1.5812, + "step": 16790 + }, + { + "epoch": 1.4, + "grad_norm": 3.4357855319976807, + "learning_rate": 1.0554202898550725e-05, + "loss": 1.7026, + "step": 16800 + }, + { + "epoch": 1.4, + "grad_norm": 2.7937941551208496, + "learning_rate": 1.054840579710145e-05, + "loss": 1.7299, + "step": 16810 + }, + { + "epoch": 1.4, + "grad_norm": 2.6717631816864014, + "learning_rate": 1.0542608695652174e-05, + "loss": 1.759, + "step": 16820 + }, + { + "epoch": 1.4, + "grad_norm": 2.9999706745147705, + "learning_rate": 1.05368115942029e-05, + "loss": 1.736, + "step": 16830 + }, + { + "epoch": 1.4, + "grad_norm": 2.3674368858337402, + "learning_rate": 1.0531014492753624e-05, + "loss": 1.7221, + "step": 16840 + }, + { + "epoch": 1.4, + "grad_norm": 1.2170531749725342, + "learning_rate": 1.0525217391304348e-05, + "loss": 1.7482, + "step": 16850 + }, + { + "epoch": 1.41, + "grad_norm": 1.1290643215179443, + "learning_rate": 1.0519420289855074e-05, + "loss": 1.5598, + "step": 16860 + }, + { + "epoch": 1.41, + "grad_norm": 1.8863322734832764, + "learning_rate": 1.0513623188405798e-05, + "loss": 1.5603, + "step": 16870 + }, + { + "epoch": 1.41, + "grad_norm": 5.208227634429932, + "learning_rate": 1.0507826086956523e-05, + "loss": 1.7284, + "step": 16880 + }, + { + "epoch": 1.41, + "grad_norm": 6.908130168914795, + "learning_rate": 1.0502028985507247e-05, + "loss": 1.6787, + "step": 16890 + }, + { + "epoch": 1.41, + "grad_norm": 7.316353797912598, + "learning_rate": 1.0496231884057973e-05, + "loss": 1.8262, + "step": 16900 + }, + { + "epoch": 1.41, + "grad_norm": 8.9257173538208, + "learning_rate": 1.0490434782608695e-05, + "loss": 1.7177, + "step": 16910 + }, + { + "epoch": 1.41, + "grad_norm": 8.56701946258545, + "learning_rate": 1.0484637681159423e-05, + "loss": 1.6473, + "step": 16920 + }, + { + "epoch": 1.41, + "grad_norm": 2.9926609992980957, + "learning_rate": 1.0478840579710145e-05, + "loss": 1.7083, + "step": 16930 + }, + { + "epoch": 1.41, + "grad_norm": 3.859083414077759, + "learning_rate": 1.047304347826087e-05, + "loss": 1.7799, + "step": 16940 + }, + { + "epoch": 1.41, + "grad_norm": 1.5566657781600952, + "learning_rate": 1.0467246376811595e-05, + "loss": 1.7282, + "step": 16950 + }, + { + "epoch": 1.41, + "grad_norm": 8.074483871459961, + "learning_rate": 1.0461449275362319e-05, + "loss": 1.6706, + "step": 16960 + }, + { + "epoch": 1.41, + "grad_norm": 3.9916458129882812, + "learning_rate": 1.0455652173913045e-05, + "loss": 1.6624, + "step": 16970 + }, + { + "epoch": 1.42, + "grad_norm": 4.076849937438965, + "learning_rate": 1.0449855072463769e-05, + "loss": 1.7047, + "step": 16980 + }, + { + "epoch": 1.42, + "grad_norm": 2.859311819076538, + "learning_rate": 1.0444057971014494e-05, + "loss": 1.6748, + "step": 16990 + }, + { + "epoch": 1.42, + "grad_norm": 2.4924099445343018, + "learning_rate": 1.0438260869565218e-05, + "loss": 1.7947, + "step": 17000 + }, + { + "epoch": 1.42, + "eval_loss": 1.6634085178375244, + "eval_runtime": 107.5078, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 17000 + }, + { + "epoch": 1.42, + "grad_norm": 1.4697929620742798, + "learning_rate": 1.0432463768115944e-05, + "loss": 1.652, + "step": 17010 + }, + { + "epoch": 1.42, + "grad_norm": 2.3207297325134277, + "learning_rate": 1.0426666666666668e-05, + "loss": 1.6632, + "step": 17020 + }, + { + "epoch": 1.42, + "grad_norm": 2.104315757751465, + "learning_rate": 1.0420869565217394e-05, + "loss": 1.6028, + "step": 17030 + }, + { + "epoch": 1.42, + "grad_norm": 1.517398715019226, + "learning_rate": 1.0415072463768116e-05, + "loss": 1.7276, + "step": 17040 + }, + { + "epoch": 1.42, + "grad_norm": 3.3302900791168213, + "learning_rate": 1.0409275362318843e-05, + "loss": 1.5883, + "step": 17050 + }, + { + "epoch": 1.42, + "grad_norm": 3.5856385231018066, + "learning_rate": 1.0403478260869566e-05, + "loss": 1.6082, + "step": 17060 + }, + { + "epoch": 1.42, + "grad_norm": 7.210336685180664, + "learning_rate": 1.039768115942029e-05, + "loss": 1.7695, + "step": 17070 + }, + { + "epoch": 1.42, + "grad_norm": 2.3868658542633057, + "learning_rate": 1.0391884057971015e-05, + "loss": 1.6391, + "step": 17080 + }, + { + "epoch": 1.42, + "grad_norm": 7.809682369232178, + "learning_rate": 1.038608695652174e-05, + "loss": 1.7408, + "step": 17090 + }, + { + "epoch": 1.43, + "grad_norm": 3.6190929412841797, + "learning_rate": 1.0380289855072465e-05, + "loss": 1.5544, + "step": 17100 + }, + { + "epoch": 1.43, + "grad_norm": 1.5290706157684326, + "learning_rate": 1.0374492753623189e-05, + "loss": 1.6121, + "step": 17110 + }, + { + "epoch": 1.43, + "grad_norm": 0.6158972382545471, + "learning_rate": 1.0368695652173915e-05, + "loss": 1.6292, + "step": 17120 + }, + { + "epoch": 1.43, + "grad_norm": 2.539661407470703, + "learning_rate": 1.0362898550724639e-05, + "loss": 1.6785, + "step": 17130 + }, + { + "epoch": 1.43, + "grad_norm": 2.0767712593078613, + "learning_rate": 1.0357101449275364e-05, + "loss": 1.7784, + "step": 17140 + }, + { + "epoch": 1.43, + "grad_norm": 3.4671566486358643, + "learning_rate": 1.0351304347826088e-05, + "loss": 1.6613, + "step": 17150 + }, + { + "epoch": 1.43, + "grad_norm": 3.9954442977905273, + "learning_rate": 1.034550724637681e-05, + "loss": 1.6233, + "step": 17160 + }, + { + "epoch": 1.43, + "grad_norm": 1.9763154983520508, + "learning_rate": 1.0339710144927536e-05, + "loss": 1.6937, + "step": 17170 + }, + { + "epoch": 1.43, + "grad_norm": 1.507520079612732, + "learning_rate": 1.033391304347826e-05, + "loss": 1.765, + "step": 17180 + }, + { + "epoch": 1.43, + "grad_norm": 3.03114652633667, + "learning_rate": 1.0328115942028986e-05, + "loss": 1.6805, + "step": 17190 + }, + { + "epoch": 1.43, + "grad_norm": 3.9964370727539062, + "learning_rate": 1.032231884057971e-05, + "loss": 1.7913, + "step": 17200 + }, + { + "epoch": 1.43, + "grad_norm": 1.9785213470458984, + "learning_rate": 1.0316521739130436e-05, + "loss": 1.6369, + "step": 17210 + }, + { + "epoch": 1.44, + "grad_norm": 5.543067455291748, + "learning_rate": 1.031072463768116e-05, + "loss": 1.6102, + "step": 17220 + }, + { + "epoch": 1.44, + "grad_norm": 4.464430332183838, + "learning_rate": 1.0304927536231886e-05, + "loss": 1.6151, + "step": 17230 + }, + { + "epoch": 1.44, + "grad_norm": 2.594529390335083, + "learning_rate": 1.029913043478261e-05, + "loss": 1.68, + "step": 17240 + }, + { + "epoch": 1.44, + "grad_norm": 5.4555535316467285, + "learning_rate": 1.0293333333333335e-05, + "loss": 1.6307, + "step": 17250 + }, + { + "epoch": 1.44, + "grad_norm": 8.041542053222656, + "learning_rate": 1.028753623188406e-05, + "loss": 1.7092, + "step": 17260 + }, + { + "epoch": 1.44, + "grad_norm": 2.2063286304473877, + "learning_rate": 1.0281739130434782e-05, + "loss": 1.7111, + "step": 17270 + }, + { + "epoch": 1.44, + "grad_norm": 5.018704891204834, + "learning_rate": 1.0275942028985509e-05, + "loss": 1.6344, + "step": 17280 + }, + { + "epoch": 1.44, + "grad_norm": 6.327304840087891, + "learning_rate": 1.0270144927536231e-05, + "loss": 1.6644, + "step": 17290 + }, + { + "epoch": 1.44, + "grad_norm": 2.130178213119507, + "learning_rate": 1.0264347826086957e-05, + "loss": 1.7562, + "step": 17300 + }, + { + "epoch": 1.44, + "grad_norm": 3.026088237762451, + "learning_rate": 1.0258550724637681e-05, + "loss": 1.6024, + "step": 17310 + }, + { + "epoch": 1.44, + "grad_norm": 1.9865357875823975, + "learning_rate": 1.0252753623188407e-05, + "loss": 1.6829, + "step": 17320 + }, + { + "epoch": 1.44, + "grad_norm": 1.843345046043396, + "learning_rate": 1.024695652173913e-05, + "loss": 1.7297, + "step": 17330 + }, + { + "epoch": 1.45, + "grad_norm": 2.475865125656128, + "learning_rate": 1.0241159420289856e-05, + "loss": 1.6453, + "step": 17340 + }, + { + "epoch": 1.45, + "grad_norm": 5.109663963317871, + "learning_rate": 1.023536231884058e-05, + "loss": 1.6039, + "step": 17350 + }, + { + "epoch": 1.45, + "grad_norm": 4.058063507080078, + "learning_rate": 1.0229565217391306e-05, + "loss": 1.7368, + "step": 17360 + }, + { + "epoch": 1.45, + "grad_norm": 7.7936859130859375, + "learning_rate": 1.022376811594203e-05, + "loss": 1.6527, + "step": 17370 + }, + { + "epoch": 1.45, + "grad_norm": 1.9645744562149048, + "learning_rate": 1.0217971014492754e-05, + "loss": 1.6359, + "step": 17380 + }, + { + "epoch": 1.45, + "grad_norm": 1.9280155897140503, + "learning_rate": 1.021217391304348e-05, + "loss": 1.6755, + "step": 17390 + }, + { + "epoch": 1.45, + "grad_norm": 11.76801586151123, + "learning_rate": 1.0206376811594204e-05, + "loss": 1.5459, + "step": 17400 + }, + { + "epoch": 1.45, + "grad_norm": 2.5407767295837402, + "learning_rate": 1.020057971014493e-05, + "loss": 1.6835, + "step": 17410 + }, + { + "epoch": 1.45, + "grad_norm": 3.4772510528564453, + "learning_rate": 1.0194782608695652e-05, + "loss": 1.7516, + "step": 17420 + }, + { + "epoch": 1.45, + "grad_norm": 1.1580195426940918, + "learning_rate": 1.0188985507246378e-05, + "loss": 1.7234, + "step": 17430 + }, + { + "epoch": 1.45, + "grad_norm": 1.6305845975875854, + "learning_rate": 1.0183188405797102e-05, + "loss": 1.5738, + "step": 17440 + }, + { + "epoch": 1.45, + "grad_norm": 3.306994676589966, + "learning_rate": 1.0177391304347827e-05, + "loss": 1.6702, + "step": 17450 + }, + { + "epoch": 1.46, + "grad_norm": 4.651767253875732, + "learning_rate": 1.0171594202898551e-05, + "loss": 1.6585, + "step": 17460 + }, + { + "epoch": 1.46, + "grad_norm": 8.080167770385742, + "learning_rate": 1.0165797101449275e-05, + "loss": 1.8123, + "step": 17470 + }, + { + "epoch": 1.46, + "grad_norm": 3.453287363052368, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.601, + "step": 17480 + }, + { + "epoch": 1.46, + "grad_norm": 2.5909740924835205, + "learning_rate": 1.0154202898550725e-05, + "loss": 1.7291, + "step": 17490 + }, + { + "epoch": 1.46, + "grad_norm": 4.380848407745361, + "learning_rate": 1.014840579710145e-05, + "loss": 1.6845, + "step": 17500 + }, + { + "epoch": 1.46, + "eval_loss": 1.6659685373306274, + "eval_runtime": 107.5088, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 17500 + }, + { + "epoch": 1.46, + "grad_norm": 1.1240538358688354, + "learning_rate": 1.0142608695652175e-05, + "loss": 1.6778, + "step": 17510 + }, + { + "epoch": 1.46, + "grad_norm": 4.835226535797119, + "learning_rate": 1.01368115942029e-05, + "loss": 1.7074, + "step": 17520 + }, + { + "epoch": 1.46, + "grad_norm": 4.039511680603027, + "learning_rate": 1.0131014492753624e-05, + "loss": 1.6447, + "step": 17530 + }, + { + "epoch": 1.46, + "grad_norm": 11.439852714538574, + "learning_rate": 1.012521739130435e-05, + "loss": 1.5716, + "step": 17540 + }, + { + "epoch": 1.46, + "grad_norm": 7.208695411682129, + "learning_rate": 1.0119420289855072e-05, + "loss": 1.5687, + "step": 17550 + }, + { + "epoch": 1.46, + "grad_norm": 12.316939353942871, + "learning_rate": 1.0113623188405798e-05, + "loss": 1.6564, + "step": 17560 + }, + { + "epoch": 1.46, + "grad_norm": 4.0833916664123535, + "learning_rate": 1.0107826086956522e-05, + "loss": 1.5983, + "step": 17570 + }, + { + "epoch": 1.47, + "grad_norm": 6.681445121765137, + "learning_rate": 1.0102028985507246e-05, + "loss": 1.6755, + "step": 17580 + }, + { + "epoch": 1.47, + "grad_norm": 5.685708045959473, + "learning_rate": 1.0096231884057972e-05, + "loss": 1.7461, + "step": 17590 + }, + { + "epoch": 1.47, + "grad_norm": 0.8818423748016357, + "learning_rate": 1.0090434782608696e-05, + "loss": 1.6818, + "step": 17600 + }, + { + "epoch": 1.47, + "grad_norm": 3.1815342903137207, + "learning_rate": 1.0084637681159422e-05, + "loss": 1.7476, + "step": 17610 + }, + { + "epoch": 1.47, + "grad_norm": 4.193720817565918, + "learning_rate": 1.0078840579710146e-05, + "loss": 1.8282, + "step": 17620 + }, + { + "epoch": 1.47, + "grad_norm": 2.380415439605713, + "learning_rate": 1.0073043478260871e-05, + "loss": 1.5551, + "step": 17630 + }, + { + "epoch": 1.47, + "grad_norm": 2.54988956451416, + "learning_rate": 1.0067246376811595e-05, + "loss": 1.6923, + "step": 17640 + }, + { + "epoch": 1.47, + "grad_norm": 3.875814914703369, + "learning_rate": 1.0061449275362321e-05, + "loss": 1.6452, + "step": 17650 + }, + { + "epoch": 1.47, + "grad_norm": 1.4553890228271484, + "learning_rate": 1.0055652173913045e-05, + "loss": 1.734, + "step": 17660 + }, + { + "epoch": 1.47, + "grad_norm": 3.171555280685425, + "learning_rate": 1.004985507246377e-05, + "loss": 1.4397, + "step": 17670 + }, + { + "epoch": 1.47, + "grad_norm": 3.2089016437530518, + "learning_rate": 1.0044057971014493e-05, + "loss": 1.5919, + "step": 17680 + }, + { + "epoch": 1.47, + "grad_norm": 6.439237117767334, + "learning_rate": 1.0038260869565217e-05, + "loss": 1.7071, + "step": 17690 + }, + { + "epoch": 1.48, + "grad_norm": 3.091012954711914, + "learning_rate": 1.0032463768115943e-05, + "loss": 1.6278, + "step": 17700 + }, + { + "epoch": 1.48, + "grad_norm": 1.4860005378723145, + "learning_rate": 1.0026666666666667e-05, + "loss": 1.6406, + "step": 17710 + }, + { + "epoch": 1.48, + "grad_norm": 8.64132308959961, + "learning_rate": 1.0020869565217392e-05, + "loss": 1.6317, + "step": 17720 + }, + { + "epoch": 1.48, + "grad_norm": 2.587737798690796, + "learning_rate": 1.0015072463768116e-05, + "loss": 1.5913, + "step": 17730 + }, + { + "epoch": 1.48, + "grad_norm": 5.59721040725708, + "learning_rate": 1.0009275362318842e-05, + "loss": 1.4954, + "step": 17740 + }, + { + "epoch": 1.48, + "grad_norm": 7.057238578796387, + "learning_rate": 1.0003478260869566e-05, + "loss": 1.7462, + "step": 17750 + }, + { + "epoch": 1.48, + "grad_norm": 4.500436782836914, + "learning_rate": 9.99768115942029e-06, + "loss": 1.6778, + "step": 17760 + }, + { + "epoch": 1.48, + "grad_norm": 1.1724953651428223, + "learning_rate": 9.991884057971016e-06, + "loss": 1.6173, + "step": 17770 + }, + { + "epoch": 1.48, + "grad_norm": 3.133427619934082, + "learning_rate": 9.98608695652174e-06, + "loss": 1.6226, + "step": 17780 + }, + { + "epoch": 1.48, + "grad_norm": 4.147580623626709, + "learning_rate": 9.980289855072465e-06, + "loss": 1.6144, + "step": 17790 + }, + { + "epoch": 1.48, + "grad_norm": 4.114604949951172, + "learning_rate": 9.97449275362319e-06, + "loss": 1.7495, + "step": 17800 + }, + { + "epoch": 1.48, + "grad_norm": 2.3972349166870117, + "learning_rate": 9.968695652173913e-06, + "loss": 1.686, + "step": 17810 + }, + { + "epoch": 1.48, + "grad_norm": 4.335039138793945, + "learning_rate": 9.96289855072464e-06, + "loss": 1.6874, + "step": 17820 + }, + { + "epoch": 1.49, + "grad_norm": 2.4044220447540283, + "learning_rate": 9.957101449275363e-06, + "loss": 1.6847, + "step": 17830 + }, + { + "epoch": 1.49, + "grad_norm": 7.1201982498168945, + "learning_rate": 9.951304347826087e-06, + "loss": 1.6875, + "step": 17840 + }, + { + "epoch": 1.49, + "grad_norm": 1.6048938035964966, + "learning_rate": 9.945507246376813e-06, + "loss": 1.6163, + "step": 17850 + }, + { + "epoch": 1.49, + "grad_norm": 1.1629419326782227, + "learning_rate": 9.939710144927537e-06, + "loss": 1.7531, + "step": 17860 + }, + { + "epoch": 1.49, + "grad_norm": 9.67171573638916, + "learning_rate": 9.933913043478261e-06, + "loss": 1.804, + "step": 17870 + }, + { + "epoch": 1.49, + "grad_norm": 1.4759604930877686, + "learning_rate": 9.928115942028987e-06, + "loss": 1.762, + "step": 17880 + }, + { + "epoch": 1.49, + "grad_norm": 3.9004738330841064, + "learning_rate": 9.92231884057971e-06, + "loss": 1.6533, + "step": 17890 + }, + { + "epoch": 1.49, + "grad_norm": 4.209875106811523, + "learning_rate": 9.916521739130436e-06, + "loss": 1.6523, + "step": 17900 + }, + { + "epoch": 1.49, + "grad_norm": 3.8896372318267822, + "learning_rate": 9.91072463768116e-06, + "loss": 1.6475, + "step": 17910 + }, + { + "epoch": 1.49, + "grad_norm": 1.572081208229065, + "learning_rate": 9.904927536231886e-06, + "loss": 1.5881, + "step": 17920 + }, + { + "epoch": 1.49, + "grad_norm": 3.5894155502319336, + "learning_rate": 9.89913043478261e-06, + "loss": 1.6233, + "step": 17930 + }, + { + "epoch": 1.5, + "grad_norm": 4.091580867767334, + "learning_rate": 9.893333333333334e-06, + "loss": 1.7261, + "step": 17940 + }, + { + "epoch": 1.5, + "grad_norm": 2.4239258766174316, + "learning_rate": 9.887536231884058e-06, + "loss": 1.6872, + "step": 17950 + }, + { + "epoch": 1.5, + "grad_norm": 2.6650853157043457, + "learning_rate": 9.881739130434784e-06, + "loss": 1.5972, + "step": 17960 + }, + { + "epoch": 1.5, + "grad_norm": 2.779630661010742, + "learning_rate": 9.875942028985508e-06, + "loss": 1.5442, + "step": 17970 + }, + { + "epoch": 1.5, + "grad_norm": 0.8651297092437744, + "learning_rate": 9.870144927536233e-06, + "loss": 1.6394, + "step": 17980 + }, + { + "epoch": 1.5, + "grad_norm": 5.072394371032715, + "learning_rate": 9.864347826086957e-06, + "loss": 1.6355, + "step": 17990 + }, + { + "epoch": 1.5, + "grad_norm": 4.223601341247559, + "learning_rate": 9.858550724637681e-06, + "loss": 1.7034, + "step": 18000 + }, + { + "epoch": 1.5, + "eval_loss": 1.6817396879196167, + "eval_runtime": 107.5133, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 18000 + }, + { + "epoch": 1.5, + "grad_norm": 5.345555782318115, + "learning_rate": 9.852753623188407e-06, + "loss": 1.6598, + "step": 18010 + }, + { + "epoch": 1.5, + "grad_norm": 4.217700958251953, + "learning_rate": 9.846956521739131e-06, + "loss": 1.7581, + "step": 18020 + }, + { + "epoch": 1.5, + "grad_norm": 5.60097074508667, + "learning_rate": 9.841159420289857e-06, + "loss": 1.4824, + "step": 18030 + }, + { + "epoch": 1.5, + "grad_norm": 2.772341012954712, + "learning_rate": 9.83536231884058e-06, + "loss": 1.6643, + "step": 18040 + }, + { + "epoch": 1.5, + "grad_norm": 1.193320631980896, + "learning_rate": 9.829565217391305e-06, + "loss": 1.8004, + "step": 18050 + }, + { + "epoch": 1.5, + "grad_norm": 5.566644191741943, + "learning_rate": 9.823768115942029e-06, + "loss": 1.7329, + "step": 18060 + }, + { + "epoch": 1.51, + "grad_norm": 5.771097183227539, + "learning_rate": 9.817971014492755e-06, + "loss": 1.5573, + "step": 18070 + }, + { + "epoch": 1.51, + "grad_norm": 5.004515647888184, + "learning_rate": 9.812173913043479e-06, + "loss": 1.5583, + "step": 18080 + }, + { + "epoch": 1.51, + "grad_norm": 3.670802116394043, + "learning_rate": 9.806376811594204e-06, + "loss": 1.6425, + "step": 18090 + }, + { + "epoch": 1.51, + "grad_norm": 4.591397762298584, + "learning_rate": 9.800579710144928e-06, + "loss": 1.7255, + "step": 18100 + }, + { + "epoch": 1.51, + "grad_norm": 4.184049606323242, + "learning_rate": 9.794782608695654e-06, + "loss": 1.6388, + "step": 18110 + }, + { + "epoch": 1.51, + "grad_norm": 5.081575870513916, + "learning_rate": 9.788985507246378e-06, + "loss": 1.741, + "step": 18120 + }, + { + "epoch": 1.51, + "grad_norm": 2.1452643871307373, + "learning_rate": 9.783188405797102e-06, + "loss": 1.5816, + "step": 18130 + }, + { + "epoch": 1.51, + "grad_norm": 2.788238048553467, + "learning_rate": 9.777391304347826e-06, + "loss": 1.6184, + "step": 18140 + }, + { + "epoch": 1.51, + "grad_norm": 5.260552883148193, + "learning_rate": 9.771594202898552e-06, + "loss": 1.6836, + "step": 18150 + }, + { + "epoch": 1.51, + "grad_norm": 4.803924083709717, + "learning_rate": 9.765797101449276e-06, + "loss": 1.5638, + "step": 18160 + }, + { + "epoch": 1.51, + "grad_norm": 1.803575873374939, + "learning_rate": 9.760000000000001e-06, + "loss": 1.7536, + "step": 18170 + }, + { + "epoch": 1.52, + "grad_norm": 2.703744888305664, + "learning_rate": 9.754202898550725e-06, + "loss": 1.6187, + "step": 18180 + }, + { + "epoch": 1.52, + "grad_norm": 4.61587381362915, + "learning_rate": 9.74840579710145e-06, + "loss": 1.5927, + "step": 18190 + }, + { + "epoch": 1.52, + "grad_norm": 2.6593663692474365, + "learning_rate": 9.742608695652175e-06, + "loss": 1.8459, + "step": 18200 + }, + { + "epoch": 1.52, + "grad_norm": 3.2275209426879883, + "learning_rate": 9.736811594202899e-06, + "loss": 1.5672, + "step": 18210 + }, + { + "epoch": 1.52, + "grad_norm": 4.009672164916992, + "learning_rate": 9.731014492753625e-06, + "loss": 1.613, + "step": 18220 + }, + { + "epoch": 1.52, + "grad_norm": 0.9421452879905701, + "learning_rate": 9.725217391304349e-06, + "loss": 1.8051, + "step": 18230 + }, + { + "epoch": 1.52, + "grad_norm": 4.675242900848389, + "learning_rate": 9.719420289855075e-06, + "loss": 1.5781, + "step": 18240 + }, + { + "epoch": 1.52, + "grad_norm": 4.626055717468262, + "learning_rate": 9.713623188405797e-06, + "loss": 1.6158, + "step": 18250 + }, + { + "epoch": 1.52, + "grad_norm": 2.181307554244995, + "learning_rate": 9.707826086956523e-06, + "loss": 1.6871, + "step": 18260 + }, + { + "epoch": 1.52, + "grad_norm": 2.5483827590942383, + "learning_rate": 9.702028985507247e-06, + "loss": 1.6884, + "step": 18270 + }, + { + "epoch": 1.52, + "grad_norm": 2.3285396099090576, + "learning_rate": 9.696231884057972e-06, + "loss": 1.7371, + "step": 18280 + }, + { + "epoch": 1.52, + "grad_norm": 6.175654888153076, + "learning_rate": 9.690434782608696e-06, + "loss": 1.7425, + "step": 18290 + }, + { + "epoch": 1.52, + "grad_norm": 2.972175359725952, + "learning_rate": 9.684637681159422e-06, + "loss": 1.838, + "step": 18300 + }, + { + "epoch": 1.53, + "grad_norm": 3.2489099502563477, + "learning_rate": 9.678840579710146e-06, + "loss": 1.6443, + "step": 18310 + }, + { + "epoch": 1.53, + "grad_norm": 4.820380210876465, + "learning_rate": 9.67304347826087e-06, + "loss": 1.6839, + "step": 18320 + }, + { + "epoch": 1.53, + "grad_norm": 1.2533926963806152, + "learning_rate": 9.667246376811596e-06, + "loss": 1.6476, + "step": 18330 + }, + { + "epoch": 1.53, + "grad_norm": 4.8515777587890625, + "learning_rate": 9.66144927536232e-06, + "loss": 1.6342, + "step": 18340 + }, + { + "epoch": 1.53, + "grad_norm": 1.8389453887939453, + "learning_rate": 9.655652173913044e-06, + "loss": 1.5585, + "step": 18350 + }, + { + "epoch": 1.53, + "grad_norm": 1.700596570968628, + "learning_rate": 9.649855072463768e-06, + "loss": 1.5371, + "step": 18360 + }, + { + "epoch": 1.53, + "grad_norm": 2.20552396774292, + "learning_rate": 9.644057971014493e-06, + "loss": 1.7413, + "step": 18370 + }, + { + "epoch": 1.53, + "grad_norm": 9.809810638427734, + "learning_rate": 9.638260869565217e-06, + "loss": 1.6194, + "step": 18380 + }, + { + "epoch": 1.53, + "grad_norm": 1.2184398174285889, + "learning_rate": 9.632463768115943e-06, + "loss": 1.7167, + "step": 18390 + }, + { + "epoch": 1.53, + "grad_norm": 1.6306843757629395, + "learning_rate": 9.626666666666667e-06, + "loss": 1.7024, + "step": 18400 + }, + { + "epoch": 1.53, + "grad_norm": 4.053318500518799, + "learning_rate": 9.620869565217393e-06, + "loss": 1.6515, + "step": 18410 + }, + { + "epoch": 1.54, + "grad_norm": 7.242835998535156, + "learning_rate": 9.615072463768117e-06, + "loss": 1.542, + "step": 18420 + }, + { + "epoch": 1.54, + "grad_norm": 3.169401168823242, + "learning_rate": 9.609275362318843e-06, + "loss": 1.6536, + "step": 18430 + }, + { + "epoch": 1.54, + "grad_norm": 2.6384594440460205, + "learning_rate": 9.603478260869567e-06, + "loss": 1.6468, + "step": 18440 + }, + { + "epoch": 1.54, + "grad_norm": 4.198095321655273, + "learning_rate": 9.59768115942029e-06, + "loss": 1.6955, + "step": 18450 + }, + { + "epoch": 1.54, + "grad_norm": 5.319828987121582, + "learning_rate": 9.591884057971015e-06, + "loss": 1.7273, + "step": 18460 + }, + { + "epoch": 1.54, + "grad_norm": 4.001225471496582, + "learning_rate": 9.58608695652174e-06, + "loss": 1.8093, + "step": 18470 + }, + { + "epoch": 1.54, + "grad_norm": 4.059565544128418, + "learning_rate": 9.580289855072464e-06, + "loss": 1.6228, + "step": 18480 + }, + { + "epoch": 1.54, + "grad_norm": 0.9582866430282593, + "learning_rate": 9.574492753623188e-06, + "loss": 1.6258, + "step": 18490 + }, + { + "epoch": 1.54, + "grad_norm": 1.2751723527908325, + "learning_rate": 9.568695652173914e-06, + "loss": 1.7295, + "step": 18500 + }, + { + "epoch": 1.54, + "eval_loss": 1.6792817115783691, + "eval_runtime": 107.5135, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 18500 + }, + { + "epoch": 1.54, + "grad_norm": 2.7914340496063232, + "learning_rate": 9.562898550724638e-06, + "loss": 1.6173, + "step": 18510 + }, + { + "epoch": 1.54, + "grad_norm": 7.133903980255127, + "learning_rate": 9.557101449275364e-06, + "loss": 1.6678, + "step": 18520 + }, + { + "epoch": 1.54, + "grad_norm": 2.9746956825256348, + "learning_rate": 9.551304347826088e-06, + "loss": 1.6362, + "step": 18530 + }, + { + "epoch": 1.54, + "grad_norm": 5.67676305770874, + "learning_rate": 9.545507246376813e-06, + "loss": 1.6312, + "step": 18540 + }, + { + "epoch": 1.55, + "grad_norm": 1.4198997020721436, + "learning_rate": 9.539710144927537e-06, + "loss": 1.6763, + "step": 18550 + }, + { + "epoch": 1.55, + "grad_norm": 1.5471702814102173, + "learning_rate": 9.533913043478261e-06, + "loss": 1.6033, + "step": 18560 + }, + { + "epoch": 1.55, + "grad_norm": 2.1242103576660156, + "learning_rate": 9.528115942028985e-06, + "loss": 1.5981, + "step": 18570 + }, + { + "epoch": 1.55, + "grad_norm": 3.2516043186187744, + "learning_rate": 9.522318840579711e-06, + "loss": 1.7269, + "step": 18580 + }, + { + "epoch": 1.55, + "grad_norm": 2.795421600341797, + "learning_rate": 9.516521739130435e-06, + "loss": 1.6065, + "step": 18590 + }, + { + "epoch": 1.55, + "grad_norm": 3.5633459091186523, + "learning_rate": 9.51072463768116e-06, + "loss": 1.621, + "step": 18600 + }, + { + "epoch": 1.55, + "grad_norm": 4.669309616088867, + "learning_rate": 9.504927536231885e-06, + "loss": 1.6455, + "step": 18610 + }, + { + "epoch": 1.55, + "grad_norm": 4.369694232940674, + "learning_rate": 9.49913043478261e-06, + "loss": 1.55, + "step": 18620 + }, + { + "epoch": 1.55, + "grad_norm": 2.2723934650421143, + "learning_rate": 9.493333333333334e-06, + "loss": 1.7658, + "step": 18630 + }, + { + "epoch": 1.55, + "grad_norm": 3.9826152324676514, + "learning_rate": 9.487536231884058e-06, + "loss": 1.6784, + "step": 18640 + }, + { + "epoch": 1.55, + "grad_norm": 2.489523410797119, + "learning_rate": 9.481739130434784e-06, + "loss": 1.7932, + "step": 18650 + }, + { + "epoch": 1.56, + "grad_norm": 2.9629557132720947, + "learning_rate": 9.475942028985508e-06, + "loss": 1.7041, + "step": 18660 + }, + { + "epoch": 1.56, + "grad_norm": 4.74808406829834, + "learning_rate": 9.470144927536232e-06, + "loss": 1.6448, + "step": 18670 + }, + { + "epoch": 1.56, + "grad_norm": 1.5019195079803467, + "learning_rate": 9.464347826086956e-06, + "loss": 1.7318, + "step": 18680 + }, + { + "epoch": 1.56, + "grad_norm": 4.352245330810547, + "learning_rate": 9.458550724637682e-06, + "loss": 1.7178, + "step": 18690 + }, + { + "epoch": 1.56, + "grad_norm": 2.7317593097686768, + "learning_rate": 9.452753623188406e-06, + "loss": 1.6883, + "step": 18700 + }, + { + "epoch": 1.56, + "grad_norm": 1.4841550588607788, + "learning_rate": 9.446956521739132e-06, + "loss": 1.654, + "step": 18710 + }, + { + "epoch": 1.56, + "grad_norm": 2.4384384155273438, + "learning_rate": 9.441159420289856e-06, + "loss": 1.6427, + "step": 18720 + }, + { + "epoch": 1.56, + "grad_norm": 2.350482225418091, + "learning_rate": 9.435362318840581e-06, + "loss": 1.5817, + "step": 18730 + }, + { + "epoch": 1.56, + "grad_norm": 2.0648393630981445, + "learning_rate": 9.429565217391305e-06, + "loss": 1.4933, + "step": 18740 + }, + { + "epoch": 1.56, + "grad_norm": 4.213344573974609, + "learning_rate": 9.423768115942031e-06, + "loss": 1.575, + "step": 18750 + }, + { + "epoch": 1.56, + "grad_norm": 7.273472309112549, + "learning_rate": 9.417971014492753e-06, + "loss": 1.726, + "step": 18760 + }, + { + "epoch": 1.56, + "grad_norm": 2.119070053100586, + "learning_rate": 9.412173913043479e-06, + "loss": 1.7348, + "step": 18770 + }, + { + "epoch": 1.56, + "grad_norm": 4.862486362457275, + "learning_rate": 9.406376811594203e-06, + "loss": 1.6862, + "step": 18780 + }, + { + "epoch": 1.57, + "grad_norm": 0.6140770316123962, + "learning_rate": 9.400579710144929e-06, + "loss": 1.5872, + "step": 18790 + }, + { + "epoch": 1.57, + "grad_norm": 3.3810670375823975, + "learning_rate": 9.394782608695653e-06, + "loss": 1.6539, + "step": 18800 + }, + { + "epoch": 1.57, + "grad_norm": 1.982848882675171, + "learning_rate": 9.388985507246377e-06, + "loss": 1.8119, + "step": 18810 + }, + { + "epoch": 1.57, + "grad_norm": 1.6384955644607544, + "learning_rate": 9.383188405797102e-06, + "loss": 1.5366, + "step": 18820 + }, + { + "epoch": 1.57, + "grad_norm": 6.467543125152588, + "learning_rate": 9.377391304347826e-06, + "loss": 1.641, + "step": 18830 + }, + { + "epoch": 1.57, + "grad_norm": 6.90090799331665, + "learning_rate": 9.371594202898552e-06, + "loss": 1.6931, + "step": 18840 + }, + { + "epoch": 1.57, + "grad_norm": 5.378636837005615, + "learning_rate": 9.365797101449276e-06, + "loss": 1.7595, + "step": 18850 + }, + { + "epoch": 1.57, + "grad_norm": 4.310837268829346, + "learning_rate": 9.360000000000002e-06, + "loss": 1.6787, + "step": 18860 + }, + { + "epoch": 1.57, + "grad_norm": 2.714045524597168, + "learning_rate": 9.354202898550724e-06, + "loss": 1.7402, + "step": 18870 + }, + { + "epoch": 1.57, + "grad_norm": 2.784644842147827, + "learning_rate": 9.34840579710145e-06, + "loss": 1.565, + "step": 18880 + }, + { + "epoch": 1.57, + "grad_norm": 8.056758880615234, + "learning_rate": 9.342608695652174e-06, + "loss": 1.6375, + "step": 18890 + }, + { + "epoch": 1.57, + "grad_norm": 5.494594097137451, + "learning_rate": 9.3368115942029e-06, + "loss": 1.6977, + "step": 18900 + }, + { + "epoch": 1.58, + "grad_norm": 1.0827395915985107, + "learning_rate": 9.331014492753624e-06, + "loss": 1.6315, + "step": 18910 + }, + { + "epoch": 1.58, + "grad_norm": 6.295031547546387, + "learning_rate": 9.32521739130435e-06, + "loss": 1.7304, + "step": 18920 + }, + { + "epoch": 1.58, + "grad_norm": 1.8913993835449219, + "learning_rate": 9.319420289855073e-06, + "loss": 1.7426, + "step": 18930 + }, + { + "epoch": 1.58, + "grad_norm": 2.0160393714904785, + "learning_rate": 9.313623188405799e-06, + "loss": 1.7997, + "step": 18940 + }, + { + "epoch": 1.58, + "grad_norm": 4.568789482116699, + "learning_rate": 9.307826086956523e-06, + "loss": 1.699, + "step": 18950 + }, + { + "epoch": 1.58, + "grad_norm": 2.200634479522705, + "learning_rate": 9.302028985507247e-06, + "loss": 1.5647, + "step": 18960 + }, + { + "epoch": 1.58, + "grad_norm": 5.3525495529174805, + "learning_rate": 9.296231884057971e-06, + "loss": 1.6899, + "step": 18970 + }, + { + "epoch": 1.58, + "grad_norm": 2.9189863204956055, + "learning_rate": 9.290434782608697e-06, + "loss": 1.7537, + "step": 18980 + }, + { + "epoch": 1.58, + "grad_norm": 3.9687070846557617, + "learning_rate": 9.28463768115942e-06, + "loss": 1.6875, + "step": 18990 + }, + { + "epoch": 1.58, + "grad_norm": 1.2662925720214844, + "learning_rate": 9.278840579710145e-06, + "loss": 1.6292, + "step": 19000 + }, + { + "epoch": 1.58, + "eval_loss": 1.665460228919983, + "eval_runtime": 107.4967, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 19000 + }, + { + "epoch": 1.58, + "grad_norm": 2.05936336517334, + "learning_rate": 9.27304347826087e-06, + "loss": 1.5103, + "step": 19010 + }, + { + "epoch": 1.58, + "grad_norm": 1.9469785690307617, + "learning_rate": 9.267246376811594e-06, + "loss": 1.6837, + "step": 19020 + }, + { + "epoch": 1.59, + "grad_norm": 2.0714728832244873, + "learning_rate": 9.26144927536232e-06, + "loss": 1.725, + "step": 19030 + }, + { + "epoch": 1.59, + "grad_norm": 3.3942904472351074, + "learning_rate": 9.255652173913044e-06, + "loss": 1.6261, + "step": 19040 + }, + { + "epoch": 1.59, + "grad_norm": 3.1022446155548096, + "learning_rate": 9.24985507246377e-06, + "loss": 1.7056, + "step": 19050 + }, + { + "epoch": 1.59, + "grad_norm": 1.8929376602172852, + "learning_rate": 9.244057971014494e-06, + "loss": 1.666, + "step": 19060 + }, + { + "epoch": 1.59, + "grad_norm": 1.3702325820922852, + "learning_rate": 9.238260869565218e-06, + "loss": 1.6023, + "step": 19070 + }, + { + "epoch": 1.59, + "grad_norm": 2.481436014175415, + "learning_rate": 9.232463768115942e-06, + "loss": 1.6922, + "step": 19080 + }, + { + "epoch": 1.59, + "grad_norm": 4.172421455383301, + "learning_rate": 9.226666666666668e-06, + "loss": 1.7799, + "step": 19090 + }, + { + "epoch": 1.59, + "grad_norm": 2.3141672611236572, + "learning_rate": 9.220869565217392e-06, + "loss": 1.326, + "step": 19100 + }, + { + "epoch": 1.59, + "grad_norm": 4.576709270477295, + "learning_rate": 9.215072463768117e-06, + "loss": 1.6309, + "step": 19110 + }, + { + "epoch": 1.59, + "grad_norm": 1.9501328468322754, + "learning_rate": 9.209275362318841e-06, + "loss": 1.6507, + "step": 19120 + }, + { + "epoch": 1.59, + "grad_norm": 3.715846300125122, + "learning_rate": 9.203478260869565e-06, + "loss": 1.6828, + "step": 19130 + }, + { + "epoch": 1.59, + "grad_norm": 2.534573554992676, + "learning_rate": 9.197681159420291e-06, + "loss": 1.619, + "step": 19140 + }, + { + "epoch": 1.6, + "grad_norm": 2.9621031284332275, + "learning_rate": 9.191884057971015e-06, + "loss": 1.6573, + "step": 19150 + }, + { + "epoch": 1.6, + "grad_norm": 2.051302194595337, + "learning_rate": 9.18608695652174e-06, + "loss": 1.71, + "step": 19160 + }, + { + "epoch": 1.6, + "grad_norm": 5.671502113342285, + "learning_rate": 9.180289855072465e-06, + "loss": 1.6008, + "step": 19170 + }, + { + "epoch": 1.6, + "grad_norm": 3.0547165870666504, + "learning_rate": 9.174492753623189e-06, + "loss": 1.7074, + "step": 19180 + }, + { + "epoch": 1.6, + "grad_norm": 2.7989306449890137, + "learning_rate": 9.168695652173913e-06, + "loss": 1.7383, + "step": 19190 + }, + { + "epoch": 1.6, + "grad_norm": 2.843214750289917, + "learning_rate": 9.162898550724638e-06, + "loss": 1.6794, + "step": 19200 + }, + { + "epoch": 1.6, + "grad_norm": 2.1126368045806885, + "learning_rate": 9.157101449275362e-06, + "loss": 1.6367, + "step": 19210 + }, + { + "epoch": 1.6, + "grad_norm": 1.7680147886276245, + "learning_rate": 9.151304347826088e-06, + "loss": 1.6034, + "step": 19220 + }, + { + "epoch": 1.6, + "grad_norm": 5.450879096984863, + "learning_rate": 9.145507246376812e-06, + "loss": 1.632, + "step": 19230 + }, + { + "epoch": 1.6, + "grad_norm": 2.6349544525146484, + "learning_rate": 9.139710144927538e-06, + "loss": 1.6385, + "step": 19240 + }, + { + "epoch": 1.6, + "grad_norm": 0.9699334502220154, + "learning_rate": 9.133913043478262e-06, + "loss": 1.5493, + "step": 19250 + }, + { + "epoch": 1.6, + "grad_norm": 6.04218053817749, + "learning_rate": 9.128115942028986e-06, + "loss": 1.6542, + "step": 19260 + }, + { + "epoch": 1.61, + "grad_norm": 3.5029866695404053, + "learning_rate": 9.122318840579712e-06, + "loss": 1.8018, + "step": 19270 + }, + { + "epoch": 1.61, + "grad_norm": 3.081104278564453, + "learning_rate": 9.116521739130436e-06, + "loss": 1.6778, + "step": 19280 + }, + { + "epoch": 1.61, + "grad_norm": 6.336639881134033, + "learning_rate": 9.11072463768116e-06, + "loss": 1.5925, + "step": 19290 + }, + { + "epoch": 1.61, + "grad_norm": 2.3441286087036133, + "learning_rate": 9.104927536231885e-06, + "loss": 1.6447, + "step": 19300 + }, + { + "epoch": 1.61, + "grad_norm": 9.648870468139648, + "learning_rate": 9.09913043478261e-06, + "loss": 1.8095, + "step": 19310 + }, + { + "epoch": 1.61, + "grad_norm": 2.4305033683776855, + "learning_rate": 9.093333333333333e-06, + "loss": 1.6808, + "step": 19320 + }, + { + "epoch": 1.61, + "grad_norm": 1.9530631303787231, + "learning_rate": 9.087536231884059e-06, + "loss": 1.6648, + "step": 19330 + }, + { + "epoch": 1.61, + "grad_norm": 2.785933494567871, + "learning_rate": 9.081739130434783e-06, + "loss": 1.5965, + "step": 19340 + }, + { + "epoch": 1.61, + "grad_norm": 2.8842251300811768, + "learning_rate": 9.075942028985509e-06, + "loss": 1.6147, + "step": 19350 + }, + { + "epoch": 1.61, + "grad_norm": 5.127828121185303, + "learning_rate": 9.070144927536233e-06, + "loss": 1.7384, + "step": 19360 + }, + { + "epoch": 1.61, + "grad_norm": 0.9837595224380493, + "learning_rate": 9.064347826086958e-06, + "loss": 1.8099, + "step": 19370 + }, + { + "epoch": 1.61, + "grad_norm": 3.733158588409424, + "learning_rate": 9.05855072463768e-06, + "loss": 1.5608, + "step": 19380 + }, + { + "epoch": 1.62, + "grad_norm": 2.9853909015655518, + "learning_rate": 9.052753623188406e-06, + "loss": 1.6227, + "step": 19390 + }, + { + "epoch": 1.62, + "grad_norm": 5.49697208404541, + "learning_rate": 9.04695652173913e-06, + "loss": 1.5671, + "step": 19400 + }, + { + "epoch": 1.62, + "grad_norm": 1.7057849168777466, + "learning_rate": 9.041159420289856e-06, + "loss": 1.6561, + "step": 19410 + }, + { + "epoch": 1.62, + "grad_norm": 2.0063562393188477, + "learning_rate": 9.03536231884058e-06, + "loss": 1.7223, + "step": 19420 + }, + { + "epoch": 1.62, + "grad_norm": 8.740435600280762, + "learning_rate": 9.029565217391306e-06, + "loss": 1.6543, + "step": 19430 + }, + { + "epoch": 1.62, + "grad_norm": 4.221389293670654, + "learning_rate": 9.02376811594203e-06, + "loss": 1.673, + "step": 19440 + }, + { + "epoch": 1.62, + "grad_norm": 6.550647735595703, + "learning_rate": 9.017971014492754e-06, + "loss": 1.5946, + "step": 19450 + }, + { + "epoch": 1.62, + "grad_norm": 2.3443706035614014, + "learning_rate": 9.01217391304348e-06, + "loss": 1.6832, + "step": 19460 + }, + { + "epoch": 1.62, + "grad_norm": 1.3949636220932007, + "learning_rate": 9.006376811594203e-06, + "loss": 1.5993, + "step": 19470 + }, + { + "epoch": 1.62, + "grad_norm": 3.4200923442840576, + "learning_rate": 9.00057971014493e-06, + "loss": 1.6081, + "step": 19480 + }, + { + "epoch": 1.62, + "grad_norm": 5.661345958709717, + "learning_rate": 8.994782608695653e-06, + "loss": 1.7607, + "step": 19490 + }, + { + "epoch": 1.62, + "grad_norm": 2.108694314956665, + "learning_rate": 8.988985507246377e-06, + "loss": 1.6209, + "step": 19500 + }, + { + "epoch": 1.62, + "eval_loss": 1.6830703020095825, + "eval_runtime": 107.5287, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 19500 + }, + { + "epoch": 1.63, + "grad_norm": 5.532451629638672, + "learning_rate": 8.983188405797101e-06, + "loss": 1.6172, + "step": 19510 + }, + { + "epoch": 1.63, + "grad_norm": 1.787123441696167, + "learning_rate": 8.977391304347827e-06, + "loss": 1.6074, + "step": 19520 + }, + { + "epoch": 1.63, + "grad_norm": 1.405480146408081, + "learning_rate": 8.971594202898551e-06, + "loss": 1.6032, + "step": 19530 + }, + { + "epoch": 1.63, + "grad_norm": 7.917876243591309, + "learning_rate": 8.965797101449277e-06, + "loss": 1.6732, + "step": 19540 + }, + { + "epoch": 1.63, + "grad_norm": 3.5339460372924805, + "learning_rate": 8.96e-06, + "loss": 1.7914, + "step": 19550 + }, + { + "epoch": 1.63, + "grad_norm": 3.5086684226989746, + "learning_rate": 8.954202898550726e-06, + "loss": 1.5816, + "step": 19560 + }, + { + "epoch": 1.63, + "grad_norm": 9.773869514465332, + "learning_rate": 8.94840579710145e-06, + "loss": 1.67, + "step": 19570 + }, + { + "epoch": 1.63, + "grad_norm": 5.713991165161133, + "learning_rate": 8.942608695652174e-06, + "loss": 1.7089, + "step": 19580 + }, + { + "epoch": 1.63, + "grad_norm": 2.009612560272217, + "learning_rate": 8.936811594202898e-06, + "loss": 1.7485, + "step": 19590 + }, + { + "epoch": 1.63, + "grad_norm": 2.547402858734131, + "learning_rate": 8.931014492753624e-06, + "loss": 1.5412, + "step": 19600 + }, + { + "epoch": 1.63, + "grad_norm": 1.649285078048706, + "learning_rate": 8.925217391304348e-06, + "loss": 1.7208, + "step": 19610 + }, + { + "epoch": 1.64, + "grad_norm": 5.029441833496094, + "learning_rate": 8.919420289855074e-06, + "loss": 1.6938, + "step": 19620 + }, + { + "epoch": 1.64, + "grad_norm": 2.3328795433044434, + "learning_rate": 8.913623188405798e-06, + "loss": 1.5802, + "step": 19630 + }, + { + "epoch": 1.64, + "grad_norm": 2.51261043548584, + "learning_rate": 8.907826086956522e-06, + "loss": 1.6576, + "step": 19640 + }, + { + "epoch": 1.64, + "grad_norm": 5.171065330505371, + "learning_rate": 8.902028985507247e-06, + "loss": 1.5348, + "step": 19650 + }, + { + "epoch": 1.64, + "grad_norm": 5.7769856452941895, + "learning_rate": 8.896231884057971e-06, + "loss": 1.6869, + "step": 19660 + }, + { + "epoch": 1.64, + "grad_norm": 4.950839042663574, + "learning_rate": 8.890434782608697e-06, + "loss": 1.6537, + "step": 19670 + }, + { + "epoch": 1.64, + "grad_norm": 4.615936279296875, + "learning_rate": 8.884637681159421e-06, + "loss": 1.7185, + "step": 19680 + }, + { + "epoch": 1.64, + "grad_norm": 1.862908959388733, + "learning_rate": 8.878840579710145e-06, + "loss": 1.6145, + "step": 19690 + }, + { + "epoch": 1.64, + "grad_norm": 2.8000476360321045, + "learning_rate": 8.87304347826087e-06, + "loss": 1.736, + "step": 19700 + }, + { + "epoch": 1.64, + "grad_norm": 2.0812816619873047, + "learning_rate": 8.867246376811595e-06, + "loss": 1.4757, + "step": 19710 + }, + { + "epoch": 1.64, + "grad_norm": 3.9957151412963867, + "learning_rate": 8.861449275362319e-06, + "loss": 1.6096, + "step": 19720 + }, + { + "epoch": 1.64, + "grad_norm": 4.853338241577148, + "learning_rate": 8.855652173913045e-06, + "loss": 1.6642, + "step": 19730 + }, + { + "epoch": 1.65, + "grad_norm": 3.507507085800171, + "learning_rate": 8.849855072463769e-06, + "loss": 1.6589, + "step": 19740 + }, + { + "epoch": 1.65, + "grad_norm": 3.7916064262390137, + "learning_rate": 8.844057971014494e-06, + "loss": 1.6656, + "step": 19750 + }, + { + "epoch": 1.65, + "grad_norm": 9.049138069152832, + "learning_rate": 8.838260869565218e-06, + "loss": 1.6074, + "step": 19760 + }, + { + "epoch": 1.65, + "grad_norm": 4.988990306854248, + "learning_rate": 8.832463768115942e-06, + "loss": 1.697, + "step": 19770 + }, + { + "epoch": 1.65, + "grad_norm": 3.35215163230896, + "learning_rate": 8.826666666666668e-06, + "loss": 1.6842, + "step": 19780 + }, + { + "epoch": 1.65, + "grad_norm": 3.3260128498077393, + "learning_rate": 8.820869565217392e-06, + "loss": 1.6981, + "step": 19790 + }, + { + "epoch": 1.65, + "grad_norm": 1.1436783075332642, + "learning_rate": 8.815072463768116e-06, + "loss": 1.7511, + "step": 19800 + }, + { + "epoch": 1.65, + "grad_norm": 2.495922565460205, + "learning_rate": 8.809275362318842e-06, + "loss": 1.5865, + "step": 19810 + }, + { + "epoch": 1.65, + "grad_norm": 2.033712148666382, + "learning_rate": 8.803478260869566e-06, + "loss": 1.6375, + "step": 19820 + }, + { + "epoch": 1.65, + "grad_norm": 0.9689141511917114, + "learning_rate": 8.79768115942029e-06, + "loss": 1.5311, + "step": 19830 + }, + { + "epoch": 1.65, + "grad_norm": 2.1666340827941895, + "learning_rate": 8.791884057971015e-06, + "loss": 1.7137, + "step": 19840 + }, + { + "epoch": 1.65, + "grad_norm": 5.735472202301025, + "learning_rate": 8.78608695652174e-06, + "loss": 1.7429, + "step": 19850 + }, + { + "epoch": 1.66, + "grad_norm": 7.482568264007568, + "learning_rate": 8.780289855072465e-06, + "loss": 1.5834, + "step": 19860 + }, + { + "epoch": 1.66, + "grad_norm": 3.3337314128875732, + "learning_rate": 8.774492753623189e-06, + "loss": 1.7067, + "step": 19870 + }, + { + "epoch": 1.66, + "grad_norm": 3.9348156452178955, + "learning_rate": 8.768695652173915e-06, + "loss": 1.7376, + "step": 19880 + }, + { + "epoch": 1.66, + "grad_norm": 3.5238430500030518, + "learning_rate": 8.762898550724639e-06, + "loss": 1.5539, + "step": 19890 + }, + { + "epoch": 1.66, + "grad_norm": 1.653454303741455, + "learning_rate": 8.757101449275363e-06, + "loss": 1.7208, + "step": 19900 + }, + { + "epoch": 1.66, + "grad_norm": 5.208953857421875, + "learning_rate": 8.751304347826087e-06, + "loss": 1.5903, + "step": 19910 + }, + { + "epoch": 1.66, + "grad_norm": 4.089755535125732, + "learning_rate": 8.745507246376813e-06, + "loss": 1.4553, + "step": 19920 + }, + { + "epoch": 1.66, + "grad_norm": 2.7506258487701416, + "learning_rate": 8.739710144927537e-06, + "loss": 1.6508, + "step": 19930 + }, + { + "epoch": 1.66, + "grad_norm": 5.436467170715332, + "learning_rate": 8.733913043478262e-06, + "loss": 1.8371, + "step": 19940 + }, + { + "epoch": 1.66, + "grad_norm": 3.3517704010009766, + "learning_rate": 8.728115942028986e-06, + "loss": 1.7193, + "step": 19950 + }, + { + "epoch": 1.66, + "grad_norm": 2.238973379135132, + "learning_rate": 8.72231884057971e-06, + "loss": 1.7621, + "step": 19960 + }, + { + "epoch": 1.66, + "grad_norm": 3.6009724140167236, + "learning_rate": 8.716521739130436e-06, + "loss": 1.8128, + "step": 19970 + }, + { + "epoch": 1.67, + "grad_norm": 4.952020645141602, + "learning_rate": 8.71072463768116e-06, + "loss": 1.7867, + "step": 19980 + }, + { + "epoch": 1.67, + "grad_norm": 4.133454322814941, + "learning_rate": 8.704927536231886e-06, + "loss": 1.7099, + "step": 19990 + }, + { + "epoch": 1.67, + "grad_norm": 1.35225510597229, + "learning_rate": 8.69913043478261e-06, + "loss": 1.7125, + "step": 20000 + }, + { + "epoch": 1.67, + "eval_loss": 1.6788996458053589, + "eval_runtime": 107.4956, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 20000 + }, + { + "epoch": 1.67, + "grad_norm": 2.7249436378479004, + "learning_rate": 8.693333333333334e-06, + "loss": 1.6412, + "step": 20010 + }, + { + "epoch": 1.67, + "grad_norm": 4.052044868469238, + "learning_rate": 8.687536231884058e-06, + "loss": 1.4474, + "step": 20020 + }, + { + "epoch": 1.67, + "grad_norm": 2.286026954650879, + "learning_rate": 8.681739130434783e-06, + "loss": 1.7352, + "step": 20030 + }, + { + "epoch": 1.67, + "grad_norm": 9.765252113342285, + "learning_rate": 8.675942028985507e-06, + "loss": 1.6815, + "step": 20040 + }, + { + "epoch": 1.67, + "grad_norm": 2.1208224296569824, + "learning_rate": 8.670144927536233e-06, + "loss": 1.7426, + "step": 20050 + }, + { + "epoch": 1.67, + "grad_norm": 4.312246799468994, + "learning_rate": 8.664347826086957e-06, + "loss": 1.7116, + "step": 20060 + }, + { + "epoch": 1.67, + "grad_norm": 3.1685314178466797, + "learning_rate": 8.658550724637683e-06, + "loss": 1.7183, + "step": 20070 + }, + { + "epoch": 1.67, + "grad_norm": 1.873128890991211, + "learning_rate": 8.652753623188407e-06, + "loss": 1.811, + "step": 20080 + }, + { + "epoch": 1.67, + "grad_norm": 4.21943473815918, + "learning_rate": 8.64695652173913e-06, + "loss": 1.6304, + "step": 20090 + }, + { + "epoch": 1.68, + "grad_norm": 3.778602123260498, + "learning_rate": 8.641159420289857e-06, + "loss": 1.6324, + "step": 20100 + }, + { + "epoch": 1.68, + "grad_norm": 2.6876754760742188, + "learning_rate": 8.63536231884058e-06, + "loss": 1.6039, + "step": 20110 + }, + { + "epoch": 1.68, + "grad_norm": 4.973613739013672, + "learning_rate": 8.629565217391305e-06, + "loss": 1.7671, + "step": 20120 + }, + { + "epoch": 1.68, + "grad_norm": 1.4355108737945557, + "learning_rate": 8.62376811594203e-06, + "loss": 1.6335, + "step": 20130 + }, + { + "epoch": 1.68, + "grad_norm": 2.8951032161712646, + "learning_rate": 8.617971014492754e-06, + "loss": 1.7804, + "step": 20140 + }, + { + "epoch": 1.68, + "grad_norm": 2.68106746673584, + "learning_rate": 8.612173913043478e-06, + "loss": 1.5194, + "step": 20150 + }, + { + "epoch": 1.68, + "grad_norm": 4.569166660308838, + "learning_rate": 8.606376811594204e-06, + "loss": 1.6177, + "step": 20160 + }, + { + "epoch": 1.68, + "grad_norm": 5.555330276489258, + "learning_rate": 8.600579710144928e-06, + "loss": 1.6004, + "step": 20170 + }, + { + "epoch": 1.68, + "grad_norm": 1.8976370096206665, + "learning_rate": 8.59536231884058e-06, + "loss": 1.5999, + "step": 20180 + }, + { + "epoch": 1.68, + "grad_norm": 1.5247642993927002, + "learning_rate": 8.589565217391305e-06, + "loss": 1.6919, + "step": 20190 + }, + { + "epoch": 1.68, + "grad_norm": 5.274435997009277, + "learning_rate": 8.583768115942029e-06, + "loss": 1.5242, + "step": 20200 + }, + { + "epoch": 1.68, + "grad_norm": 4.763355255126953, + "learning_rate": 8.577971014492755e-06, + "loss": 1.4803, + "step": 20210 + }, + { + "epoch": 1.69, + "grad_norm": 3.4387519359588623, + "learning_rate": 8.572173913043479e-06, + "loss": 1.5794, + "step": 20220 + }, + { + "epoch": 1.69, + "grad_norm": 4.496255874633789, + "learning_rate": 8.566376811594204e-06, + "loss": 1.6001, + "step": 20230 + }, + { + "epoch": 1.69, + "grad_norm": 1.4461989402770996, + "learning_rate": 8.560579710144928e-06, + "loss": 1.7451, + "step": 20240 + }, + { + "epoch": 1.69, + "grad_norm": 5.702919006347656, + "learning_rate": 8.554782608695652e-06, + "loss": 1.6566, + "step": 20250 + }, + { + "epoch": 1.69, + "grad_norm": 1.9157620668411255, + "learning_rate": 8.548985507246378e-06, + "loss": 1.5958, + "step": 20260 + }, + { + "epoch": 1.69, + "grad_norm": 4.364008903503418, + "learning_rate": 8.543188405797102e-06, + "loss": 1.6164, + "step": 20270 + }, + { + "epoch": 1.69, + "grad_norm": 1.2819979190826416, + "learning_rate": 8.537391304347826e-06, + "loss": 1.7358, + "step": 20280 + }, + { + "epoch": 1.69, + "grad_norm": 6.845746994018555, + "learning_rate": 8.531594202898552e-06, + "loss": 1.6241, + "step": 20290 + }, + { + "epoch": 1.69, + "grad_norm": 3.371389627456665, + "learning_rate": 8.525797101449276e-06, + "loss": 1.6148, + "step": 20300 + }, + { + "epoch": 1.69, + "grad_norm": 6.634834289550781, + "learning_rate": 8.52e-06, + "loss": 1.7313, + "step": 20310 + }, + { + "epoch": 1.69, + "grad_norm": 11.510677337646484, + "learning_rate": 8.514202898550725e-06, + "loss": 1.6605, + "step": 20320 + }, + { + "epoch": 1.69, + "grad_norm": 3.1439342498779297, + "learning_rate": 8.50840579710145e-06, + "loss": 1.5686, + "step": 20330 + }, + { + "epoch": 1.69, + "grad_norm": 1.642844796180725, + "learning_rate": 8.502608695652175e-06, + "loss": 1.6276, + "step": 20340 + }, + { + "epoch": 1.7, + "grad_norm": 4.143886089324951, + "learning_rate": 8.496811594202899e-06, + "loss": 1.5472, + "step": 20350 + }, + { + "epoch": 1.7, + "grad_norm": 4.869246006011963, + "learning_rate": 8.491014492753625e-06, + "loss": 1.6785, + "step": 20360 + }, + { + "epoch": 1.7, + "grad_norm": 2.9599814414978027, + "learning_rate": 8.485217391304349e-06, + "loss": 1.6191, + "step": 20370 + }, + { + "epoch": 1.7, + "grad_norm": 2.2835745811462402, + "learning_rate": 8.479420289855073e-06, + "loss": 1.7113, + "step": 20380 + }, + { + "epoch": 1.7, + "grad_norm": 4.400454998016357, + "learning_rate": 8.473623188405797e-06, + "loss": 1.7069, + "step": 20390 + }, + { + "epoch": 1.7, + "grad_norm": 3.021839141845703, + "learning_rate": 8.467826086956523e-06, + "loss": 1.7014, + "step": 20400 + }, + { + "epoch": 1.7, + "grad_norm": 3.3753602504730225, + "learning_rate": 8.462028985507247e-06, + "loss": 1.6844, + "step": 20410 + }, + { + "epoch": 1.7, + "grad_norm": 1.501394271850586, + "learning_rate": 8.456231884057972e-06, + "loss": 1.7469, + "step": 20420 + }, + { + "epoch": 1.7, + "grad_norm": 16.001710891723633, + "learning_rate": 8.450434782608696e-06, + "loss": 1.7221, + "step": 20430 + }, + { + "epoch": 1.7, + "grad_norm": 4.816986083984375, + "learning_rate": 8.44463768115942e-06, + "loss": 1.7513, + "step": 20440 + }, + { + "epoch": 1.7, + "grad_norm": 1.2568550109863281, + "learning_rate": 8.438840579710146e-06, + "loss": 1.5909, + "step": 20450 + }, + { + "epoch": 1.71, + "grad_norm": 1.2014676332473755, + "learning_rate": 8.43304347826087e-06, + "loss": 1.5539, + "step": 20460 + }, + { + "epoch": 1.71, + "grad_norm": 2.841648817062378, + "learning_rate": 8.427246376811596e-06, + "loss": 1.6945, + "step": 20470 + }, + { + "epoch": 1.71, + "grad_norm": 2.8036043643951416, + "learning_rate": 8.42144927536232e-06, + "loss": 1.6231, + "step": 20480 + }, + { + "epoch": 1.71, + "grad_norm": 2.289719343185425, + "learning_rate": 8.415652173913044e-06, + "loss": 1.5679, + "step": 20490 + }, + { + "epoch": 1.71, + "grad_norm": 1.5245754718780518, + "learning_rate": 8.409855072463768e-06, + "loss": 1.5689, + "step": 20500 + }, + { + "epoch": 1.71, + "eval_loss": 1.6623921394348145, + "eval_runtime": 107.5043, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 20500 + }, + { + "epoch": 1.71, + "grad_norm": 9.230620384216309, + "learning_rate": 8.404057971014493e-06, + "loss": 1.4972, + "step": 20510 + }, + { + "epoch": 1.71, + "grad_norm": 4.701317310333252, + "learning_rate": 8.398260869565217e-06, + "loss": 1.7549, + "step": 20520 + }, + { + "epoch": 1.71, + "grad_norm": 2.25839900970459, + "learning_rate": 8.392463768115943e-06, + "loss": 1.66, + "step": 20530 + }, + { + "epoch": 1.71, + "grad_norm": 3.2549502849578857, + "learning_rate": 8.386666666666667e-06, + "loss": 1.8211, + "step": 20540 + }, + { + "epoch": 1.71, + "grad_norm": 5.639694690704346, + "learning_rate": 8.380869565217393e-06, + "loss": 1.6038, + "step": 20550 + }, + { + "epoch": 1.71, + "grad_norm": 4.658710956573486, + "learning_rate": 8.375072463768117e-06, + "loss": 1.7501, + "step": 20560 + }, + { + "epoch": 1.71, + "grad_norm": 2.85872745513916, + "learning_rate": 8.36927536231884e-06, + "loss": 1.5652, + "step": 20570 + }, + { + "epoch": 1.71, + "grad_norm": 3.761110544204712, + "learning_rate": 8.363478260869567e-06, + "loss": 1.6325, + "step": 20580 + }, + { + "epoch": 1.72, + "grad_norm": 2.664401054382324, + "learning_rate": 8.35768115942029e-06, + "loss": 1.6404, + "step": 20590 + }, + { + "epoch": 1.72, + "grad_norm": 1.662811279296875, + "learning_rate": 8.351884057971015e-06, + "loss": 1.5619, + "step": 20600 + }, + { + "epoch": 1.72, + "grad_norm": 2.5120227336883545, + "learning_rate": 8.34608695652174e-06, + "loss": 1.7325, + "step": 20610 + }, + { + "epoch": 1.72, + "grad_norm": 6.742555618286133, + "learning_rate": 8.340289855072464e-06, + "loss": 1.7704, + "step": 20620 + }, + { + "epoch": 1.72, + "grad_norm": 1.6222686767578125, + "learning_rate": 8.334492753623188e-06, + "loss": 1.6901, + "step": 20630 + }, + { + "epoch": 1.72, + "grad_norm": 4.6312642097473145, + "learning_rate": 8.328695652173914e-06, + "loss": 1.728, + "step": 20640 + }, + { + "epoch": 1.72, + "grad_norm": 1.7402911186218262, + "learning_rate": 8.322898550724638e-06, + "loss": 1.6404, + "step": 20650 + }, + { + "epoch": 1.72, + "grad_norm": 2.214266777038574, + "learning_rate": 8.317101449275364e-06, + "loss": 1.7745, + "step": 20660 + }, + { + "epoch": 1.72, + "grad_norm": 2.402463674545288, + "learning_rate": 8.311304347826088e-06, + "loss": 1.6937, + "step": 20670 + }, + { + "epoch": 1.72, + "grad_norm": 4.309727668762207, + "learning_rate": 8.305507246376813e-06, + "loss": 1.633, + "step": 20680 + }, + { + "epoch": 1.72, + "grad_norm": 1.6282193660736084, + "learning_rate": 8.299710144927537e-06, + "loss": 1.6727, + "step": 20690 + }, + { + "epoch": 1.73, + "grad_norm": 4.9179277420043945, + "learning_rate": 8.293913043478261e-06, + "loss": 1.6977, + "step": 20700 + }, + { + "epoch": 1.73, + "grad_norm": 5.5713701248168945, + "learning_rate": 8.288115942028985e-06, + "loss": 1.7506, + "step": 20710 + }, + { + "epoch": 1.73, + "grad_norm": 2.8336901664733887, + "learning_rate": 8.282318840579711e-06, + "loss": 1.6477, + "step": 20720 + }, + { + "epoch": 1.73, + "grad_norm": 3.1530611515045166, + "learning_rate": 8.276521739130435e-06, + "loss": 1.6213, + "step": 20730 + }, + { + "epoch": 1.73, + "grad_norm": 2.452496290206909, + "learning_rate": 8.27072463768116e-06, + "loss": 1.5587, + "step": 20740 + }, + { + "epoch": 1.73, + "grad_norm": 10.321528434753418, + "learning_rate": 8.264927536231885e-06, + "loss": 1.6628, + "step": 20750 + }, + { + "epoch": 1.73, + "grad_norm": 1.9270588159561157, + "learning_rate": 8.259130434782609e-06, + "loss": 1.6388, + "step": 20760 + }, + { + "epoch": 1.73, + "grad_norm": 2.518328905105591, + "learning_rate": 8.253333333333334e-06, + "loss": 1.6791, + "step": 20770 + }, + { + "epoch": 1.73, + "grad_norm": 1.031456470489502, + "learning_rate": 8.247536231884059e-06, + "loss": 1.5683, + "step": 20780 + }, + { + "epoch": 1.73, + "grad_norm": 2.8340535163879395, + "learning_rate": 8.241739130434784e-06, + "loss": 1.5371, + "step": 20790 + }, + { + "epoch": 1.73, + "grad_norm": 6.309947967529297, + "learning_rate": 8.235942028985508e-06, + "loss": 1.5707, + "step": 20800 + }, + { + "epoch": 1.73, + "grad_norm": 2.8457183837890625, + "learning_rate": 8.230144927536232e-06, + "loss": 1.6439, + "step": 20810 + }, + { + "epoch": 1.73, + "grad_norm": 2.240163803100586, + "learning_rate": 8.224347826086956e-06, + "loss": 1.7126, + "step": 20820 + }, + { + "epoch": 1.74, + "grad_norm": 2.0228397846221924, + "learning_rate": 8.218550724637682e-06, + "loss": 1.5457, + "step": 20830 + }, + { + "epoch": 1.74, + "grad_norm": 2.194091796875, + "learning_rate": 8.212753623188406e-06, + "loss": 1.6615, + "step": 20840 + }, + { + "epoch": 1.74, + "grad_norm": 3.5026867389678955, + "learning_rate": 8.206956521739132e-06, + "loss": 1.5305, + "step": 20850 + }, + { + "epoch": 1.74, + "grad_norm": 4.275241374969482, + "learning_rate": 8.201159420289856e-06, + "loss": 1.5742, + "step": 20860 + }, + { + "epoch": 1.74, + "grad_norm": 2.040112257003784, + "learning_rate": 8.195362318840581e-06, + "loss": 1.6762, + "step": 20870 + }, + { + "epoch": 1.74, + "grad_norm": 3.858863592147827, + "learning_rate": 8.189565217391305e-06, + "loss": 1.7056, + "step": 20880 + }, + { + "epoch": 1.74, + "grad_norm": 3.349377155303955, + "learning_rate": 8.18376811594203e-06, + "loss": 1.5865, + "step": 20890 + }, + { + "epoch": 1.74, + "grad_norm": 4.167051792144775, + "learning_rate": 8.177971014492753e-06, + "loss": 1.712, + "step": 20900 + }, + { + "epoch": 1.74, + "grad_norm": 5.692874431610107, + "learning_rate": 8.172173913043479e-06, + "loss": 1.6484, + "step": 20910 + }, + { + "epoch": 1.74, + "grad_norm": 13.255517959594727, + "learning_rate": 8.166376811594203e-06, + "loss": 1.7715, + "step": 20920 + }, + { + "epoch": 1.74, + "grad_norm": 5.60584831237793, + "learning_rate": 8.160579710144929e-06, + "loss": 1.5116, + "step": 20930 + }, + { + "epoch": 1.75, + "grad_norm": 4.476940631866455, + "learning_rate": 8.154782608695653e-06, + "loss": 1.6632, + "step": 20940 + }, + { + "epoch": 1.75, + "grad_norm": 3.8701999187469482, + "learning_rate": 8.148985507246377e-06, + "loss": 1.6628, + "step": 20950 + }, + { + "epoch": 1.75, + "grad_norm": 1.236228346824646, + "learning_rate": 8.143188405797102e-06, + "loss": 1.4092, + "step": 20960 + }, + { + "epoch": 1.75, + "grad_norm": 4.624843120574951, + "learning_rate": 8.137391304347826e-06, + "loss": 1.742, + "step": 20970 + }, + { + "epoch": 1.75, + "grad_norm": 1.9257400035858154, + "learning_rate": 8.131594202898552e-06, + "loss": 1.6487, + "step": 20980 + }, + { + "epoch": 1.75, + "grad_norm": 5.001676082611084, + "learning_rate": 8.125797101449276e-06, + "loss": 1.6715, + "step": 20990 + }, + { + "epoch": 1.75, + "grad_norm": 1.8285595178604126, + "learning_rate": 8.120000000000002e-06, + "loss": 1.5451, + "step": 21000 + }, + { + "epoch": 1.75, + "eval_loss": 1.6759874820709229, + "eval_runtime": 107.5007, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 21000 + }, + { + "epoch": 1.75, + "grad_norm": 6.6983747482299805, + "learning_rate": 8.114202898550724e-06, + "loss": 1.6217, + "step": 21010 + }, + { + "epoch": 1.75, + "grad_norm": 3.789233446121216, + "learning_rate": 8.10840579710145e-06, + "loss": 1.5388, + "step": 21020 + }, + { + "epoch": 1.75, + "grad_norm": 9.76033878326416, + "learning_rate": 8.102608695652174e-06, + "loss": 1.7604, + "step": 21030 + }, + { + "epoch": 1.75, + "grad_norm": 2.5290467739105225, + "learning_rate": 8.0968115942029e-06, + "loss": 1.6859, + "step": 21040 + }, + { + "epoch": 1.75, + "grad_norm": 3.0937581062316895, + "learning_rate": 8.091014492753624e-06, + "loss": 1.481, + "step": 21050 + }, + { + "epoch": 1.75, + "grad_norm": 4.91987943649292, + "learning_rate": 8.08521739130435e-06, + "loss": 1.5735, + "step": 21060 + }, + { + "epoch": 1.76, + "grad_norm": 2.2947590351104736, + "learning_rate": 8.079420289855073e-06, + "loss": 1.6143, + "step": 21070 + }, + { + "epoch": 1.76, + "grad_norm": 4.225331783294678, + "learning_rate": 8.073623188405797e-06, + "loss": 1.6024, + "step": 21080 + }, + { + "epoch": 1.76, + "grad_norm": 6.730823993682861, + "learning_rate": 8.067826086956523e-06, + "loss": 1.667, + "step": 21090 + }, + { + "epoch": 1.76, + "grad_norm": 1.8926206827163696, + "learning_rate": 8.062028985507247e-06, + "loss": 1.6495, + "step": 21100 + }, + { + "epoch": 1.76, + "grad_norm": 7.650490760803223, + "learning_rate": 8.056231884057971e-06, + "loss": 1.7194, + "step": 21110 + }, + { + "epoch": 1.76, + "grad_norm": 3.5820090770721436, + "learning_rate": 8.050434782608697e-06, + "loss": 1.591, + "step": 21120 + }, + { + "epoch": 1.76, + "grad_norm": 1.5368926525115967, + "learning_rate": 8.04463768115942e-06, + "loss": 1.6234, + "step": 21130 + }, + { + "epoch": 1.76, + "grad_norm": 1.6347370147705078, + "learning_rate": 8.038840579710145e-06, + "loss": 1.7416, + "step": 21140 + }, + { + "epoch": 1.76, + "grad_norm": 10.997393608093262, + "learning_rate": 8.03304347826087e-06, + "loss": 1.7531, + "step": 21150 + }, + { + "epoch": 1.76, + "grad_norm": 4.327144145965576, + "learning_rate": 8.027246376811594e-06, + "loss": 1.4644, + "step": 21160 + }, + { + "epoch": 1.76, + "grad_norm": 1.6970939636230469, + "learning_rate": 8.02144927536232e-06, + "loss": 1.761, + "step": 21170 + }, + { + "epoch": 1.77, + "grad_norm": 2.163623332977295, + "learning_rate": 8.015652173913044e-06, + "loss": 1.6956, + "step": 21180 + }, + { + "epoch": 1.77, + "grad_norm": 1.5294394493103027, + "learning_rate": 8.00985507246377e-06, + "loss": 1.7231, + "step": 21190 + }, + { + "epoch": 1.77, + "grad_norm": 7.195105075836182, + "learning_rate": 8.004057971014494e-06, + "loss": 1.7115, + "step": 21200 + }, + { + "epoch": 1.77, + "grad_norm": 2.7161829471588135, + "learning_rate": 7.998260869565218e-06, + "loss": 1.7947, + "step": 21210 + }, + { + "epoch": 1.77, + "grad_norm": 1.7141447067260742, + "learning_rate": 7.992463768115942e-06, + "loss": 1.6888, + "step": 21220 + }, + { + "epoch": 1.77, + "grad_norm": 2.2814693450927734, + "learning_rate": 7.986666666666668e-06, + "loss": 1.5235, + "step": 21230 + }, + { + "epoch": 1.77, + "grad_norm": 3.9310901165008545, + "learning_rate": 7.980869565217392e-06, + "loss": 1.6429, + "step": 21240 + }, + { + "epoch": 1.77, + "grad_norm": 3.654700994491577, + "learning_rate": 7.975072463768117e-06, + "loss": 1.6168, + "step": 21250 + }, + { + "epoch": 1.77, + "grad_norm": 6.43727445602417, + "learning_rate": 7.969275362318841e-06, + "loss": 1.6492, + "step": 21260 + }, + { + "epoch": 1.77, + "grad_norm": 9.46625804901123, + "learning_rate": 7.963478260869565e-06, + "loss": 1.735, + "step": 21270 + }, + { + "epoch": 1.77, + "grad_norm": 1.2169768810272217, + "learning_rate": 7.957681159420291e-06, + "loss": 1.5743, + "step": 21280 + }, + { + "epoch": 1.77, + "grad_norm": 5.560244560241699, + "learning_rate": 7.951884057971015e-06, + "loss": 1.5637, + "step": 21290 + }, + { + "epoch": 1.77, + "grad_norm": 7.38496208190918, + "learning_rate": 7.94608695652174e-06, + "loss": 1.6104, + "step": 21300 + }, + { + "epoch": 1.78, + "grad_norm": 3.6433284282684326, + "learning_rate": 7.940289855072465e-06, + "loss": 1.644, + "step": 21310 + }, + { + "epoch": 1.78, + "grad_norm": 2.7574374675750732, + "learning_rate": 7.934492753623189e-06, + "loss": 1.6456, + "step": 21320 + }, + { + "epoch": 1.78, + "grad_norm": 3.709298610687256, + "learning_rate": 7.928695652173913e-06, + "loss": 1.6377, + "step": 21330 + }, + { + "epoch": 1.78, + "grad_norm": 3.04289174079895, + "learning_rate": 7.922898550724638e-06, + "loss": 1.6613, + "step": 21340 + }, + { + "epoch": 1.78, + "grad_norm": 4.023538112640381, + "learning_rate": 7.917101449275362e-06, + "loss": 1.7339, + "step": 21350 + }, + { + "epoch": 1.78, + "grad_norm": 6.419352054595947, + "learning_rate": 7.911304347826088e-06, + "loss": 1.7423, + "step": 21360 + }, + { + "epoch": 1.78, + "grad_norm": 6.251063823699951, + "learning_rate": 7.905507246376812e-06, + "loss": 1.5134, + "step": 21370 + }, + { + "epoch": 1.78, + "grad_norm": 2.289116859436035, + "learning_rate": 7.899710144927538e-06, + "loss": 1.6577, + "step": 21380 + }, + { + "epoch": 1.78, + "grad_norm": 2.699251651763916, + "learning_rate": 7.893913043478262e-06, + "loss": 1.6832, + "step": 21390 + }, + { + "epoch": 1.78, + "grad_norm": 3.5587689876556396, + "learning_rate": 7.888115942028986e-06, + "loss": 1.6295, + "step": 21400 + }, + { + "epoch": 1.78, + "grad_norm": 1.560650110244751, + "learning_rate": 7.882318840579712e-06, + "loss": 1.6037, + "step": 21410 + }, + { + "epoch": 1.79, + "grad_norm": 2.8277647495269775, + "learning_rate": 7.876521739130436e-06, + "loss": 1.6725, + "step": 21420 + }, + { + "epoch": 1.79, + "grad_norm": 2.283670425415039, + "learning_rate": 7.87072463768116e-06, + "loss": 1.6771, + "step": 21430 + }, + { + "epoch": 1.79, + "grad_norm": 5.659524917602539, + "learning_rate": 7.864927536231885e-06, + "loss": 1.7715, + "step": 21440 + }, + { + "epoch": 1.79, + "grad_norm": 4.301861763000488, + "learning_rate": 7.85913043478261e-06, + "loss": 1.5021, + "step": 21450 + }, + { + "epoch": 1.79, + "grad_norm": 3.5721640586853027, + "learning_rate": 7.853333333333333e-06, + "loss": 1.5547, + "step": 21460 + }, + { + "epoch": 1.79, + "grad_norm": 3.1962008476257324, + "learning_rate": 7.847536231884059e-06, + "loss": 1.6434, + "step": 21470 + }, + { + "epoch": 1.79, + "grad_norm": 1.1904305219650269, + "learning_rate": 7.841739130434783e-06, + "loss": 1.7324, + "step": 21480 + }, + { + "epoch": 1.79, + "grad_norm": 6.97566556930542, + "learning_rate": 7.835942028985509e-06, + "loss": 1.5438, + "step": 21490 + }, + { + "epoch": 1.79, + "grad_norm": 2.0194718837738037, + "learning_rate": 7.830144927536233e-06, + "loss": 1.8216, + "step": 21500 + }, + { + "epoch": 1.79, + "eval_loss": 1.6438640356063843, + "eval_runtime": 107.5066, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 21500 + }, + { + "epoch": 1.79, + "grad_norm": 2.1328859329223633, + "learning_rate": 7.824347826086958e-06, + "loss": 1.6511, + "step": 21510 + }, + { + "epoch": 1.79, + "grad_norm": 16.312145233154297, + "learning_rate": 7.81855072463768e-06, + "loss": 1.6316, + "step": 21520 + }, + { + "epoch": 1.79, + "grad_norm": 2.9844260215759277, + "learning_rate": 7.812753623188406e-06, + "loss": 1.6512, + "step": 21530 + }, + { + "epoch": 1.79, + "grad_norm": 3.2778491973876953, + "learning_rate": 7.80695652173913e-06, + "loss": 1.5582, + "step": 21540 + }, + { + "epoch": 1.8, + "grad_norm": 4.904211044311523, + "learning_rate": 7.801159420289856e-06, + "loss": 1.55, + "step": 21550 + }, + { + "epoch": 1.8, + "grad_norm": 4.345255374908447, + "learning_rate": 7.79536231884058e-06, + "loss": 1.6164, + "step": 21560 + }, + { + "epoch": 1.8, + "grad_norm": 5.088725566864014, + "learning_rate": 7.789565217391306e-06, + "loss": 1.5831, + "step": 21570 + }, + { + "epoch": 1.8, + "grad_norm": 4.778326988220215, + "learning_rate": 7.78376811594203e-06, + "loss": 1.6626, + "step": 21580 + }, + { + "epoch": 1.8, + "grad_norm": 3.572787284851074, + "learning_rate": 7.777971014492754e-06, + "loss": 1.7164, + "step": 21590 + }, + { + "epoch": 1.8, + "grad_norm": 7.807311534881592, + "learning_rate": 7.77217391304348e-06, + "loss": 1.5888, + "step": 21600 + }, + { + "epoch": 1.8, + "grad_norm": 2.578456401824951, + "learning_rate": 7.766376811594203e-06, + "loss": 1.7378, + "step": 21610 + }, + { + "epoch": 1.8, + "grad_norm": 3.514986276626587, + "learning_rate": 7.76057971014493e-06, + "loss": 1.72, + "step": 21620 + }, + { + "epoch": 1.8, + "grad_norm": 1.335219383239746, + "learning_rate": 7.754782608695653e-06, + "loss": 1.6237, + "step": 21630 + }, + { + "epoch": 1.8, + "grad_norm": 4.087151527404785, + "learning_rate": 7.748985507246377e-06, + "loss": 1.6961, + "step": 21640 + }, + { + "epoch": 1.8, + "grad_norm": 5.020003318786621, + "learning_rate": 7.743188405797101e-06, + "loss": 1.5885, + "step": 21650 + }, + { + "epoch": 1.81, + "grad_norm": 3.0945472717285156, + "learning_rate": 7.737391304347827e-06, + "loss": 1.656, + "step": 21660 + }, + { + "epoch": 1.81, + "grad_norm": 10.43069076538086, + "learning_rate": 7.731594202898551e-06, + "loss": 1.7332, + "step": 21670 + }, + { + "epoch": 1.81, + "grad_norm": 2.4677326679229736, + "learning_rate": 7.725797101449277e-06, + "loss": 1.5584, + "step": 21680 + }, + { + "epoch": 1.81, + "grad_norm": 1.4591469764709473, + "learning_rate": 7.72e-06, + "loss": 1.777, + "step": 21690 + }, + { + "epoch": 1.81, + "grad_norm": 2.6109650135040283, + "learning_rate": 7.714202898550726e-06, + "loss": 1.7946, + "step": 21700 + }, + { + "epoch": 1.81, + "grad_norm": 2.215765953063965, + "learning_rate": 7.70840579710145e-06, + "loss": 1.5901, + "step": 21710 + }, + { + "epoch": 1.81, + "grad_norm": 2.505645751953125, + "learning_rate": 7.702608695652174e-06, + "loss": 1.6892, + "step": 21720 + }, + { + "epoch": 1.81, + "grad_norm": 1.5885299444198608, + "learning_rate": 7.696811594202898e-06, + "loss": 1.7118, + "step": 21730 + }, + { + "epoch": 1.81, + "grad_norm": 2.334027051925659, + "learning_rate": 7.691014492753624e-06, + "loss": 1.647, + "step": 21740 + }, + { + "epoch": 1.81, + "grad_norm": 8.775703430175781, + "learning_rate": 7.685217391304348e-06, + "loss": 1.6092, + "step": 21750 + }, + { + "epoch": 1.81, + "grad_norm": 4.955008506774902, + "learning_rate": 7.679420289855074e-06, + "loss": 1.5739, + "step": 21760 + }, + { + "epoch": 1.81, + "grad_norm": 4.728649139404297, + "learning_rate": 7.673623188405798e-06, + "loss": 1.532, + "step": 21770 + }, + { + "epoch": 1.81, + "grad_norm": 2.9815008640289307, + "learning_rate": 7.667826086956522e-06, + "loss": 1.6954, + "step": 21780 + }, + { + "epoch": 1.82, + "grad_norm": 6.196691513061523, + "learning_rate": 7.662028985507247e-06, + "loss": 1.6589, + "step": 21790 + }, + { + "epoch": 1.82, + "grad_norm": 6.2602643966674805, + "learning_rate": 7.656231884057971e-06, + "loss": 1.6467, + "step": 21800 + }, + { + "epoch": 1.82, + "grad_norm": 2.570413112640381, + "learning_rate": 7.650434782608697e-06, + "loss": 1.6406, + "step": 21810 + }, + { + "epoch": 1.82, + "grad_norm": 11.799811363220215, + "learning_rate": 7.644637681159421e-06, + "loss": 1.7035, + "step": 21820 + }, + { + "epoch": 1.82, + "grad_norm": 6.307993412017822, + "learning_rate": 7.638840579710145e-06, + "loss": 1.6075, + "step": 21830 + }, + { + "epoch": 1.82, + "grad_norm": 4.131196975708008, + "learning_rate": 7.63304347826087e-06, + "loss": 1.5969, + "step": 21840 + }, + { + "epoch": 1.82, + "grad_norm": 2.6842682361602783, + "learning_rate": 7.627246376811595e-06, + "loss": 1.6489, + "step": 21850 + }, + { + "epoch": 1.82, + "grad_norm": 3.3321924209594727, + "learning_rate": 7.621449275362319e-06, + "loss": 1.5213, + "step": 21860 + }, + { + "epoch": 1.82, + "grad_norm": 8.997515678405762, + "learning_rate": 7.615652173913044e-06, + "loss": 1.7068, + "step": 21870 + }, + { + "epoch": 1.82, + "grad_norm": 2.286634922027588, + "learning_rate": 7.609855072463769e-06, + "loss": 1.6143, + "step": 21880 + }, + { + "epoch": 1.82, + "grad_norm": 5.148158073425293, + "learning_rate": 7.6040579710144934e-06, + "loss": 1.7376, + "step": 21890 + }, + { + "epoch": 1.82, + "grad_norm": 4.665143013000488, + "learning_rate": 7.598260869565218e-06, + "loss": 1.6027, + "step": 21900 + }, + { + "epoch": 1.83, + "grad_norm": 1.7268582582473755, + "learning_rate": 7.592463768115943e-06, + "loss": 1.5526, + "step": 21910 + }, + { + "epoch": 1.83, + "grad_norm": 1.673620581626892, + "learning_rate": 7.586666666666668e-06, + "loss": 1.7982, + "step": 21920 + }, + { + "epoch": 1.83, + "grad_norm": 2.819336175918579, + "learning_rate": 7.580869565217393e-06, + "loss": 1.7283, + "step": 21930 + }, + { + "epoch": 1.83, + "grad_norm": 6.207809925079346, + "learning_rate": 7.575072463768116e-06, + "loss": 1.5976, + "step": 21940 + }, + { + "epoch": 1.83, + "grad_norm": 3.5421364307403564, + "learning_rate": 7.569275362318841e-06, + "loss": 1.7216, + "step": 21950 + }, + { + "epoch": 1.83, + "grad_norm": 2.2121992111206055, + "learning_rate": 7.563478260869566e-06, + "loss": 1.5512, + "step": 21960 + }, + { + "epoch": 1.83, + "grad_norm": 6.52865743637085, + "learning_rate": 7.5576811594202906e-06, + "loss": 1.6024, + "step": 21970 + }, + { + "epoch": 1.83, + "grad_norm": 6.420586109161377, + "learning_rate": 7.5518840579710154e-06, + "loss": 1.6298, + "step": 21980 + }, + { + "epoch": 1.83, + "grad_norm": 9.54375171661377, + "learning_rate": 7.5460869565217394e-06, + "loss": 1.7235, + "step": 21990 + }, + { + "epoch": 1.83, + "grad_norm": 3.1675570011138916, + "learning_rate": 7.540289855072464e-06, + "loss": 1.8252, + "step": 22000 + }, + { + "epoch": 1.83, + "eval_loss": 1.6268221139907837, + "eval_runtime": 107.5262, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 22000 + }, + { + "epoch": 1.83, + "grad_norm": 5.670650005340576, + "learning_rate": 7.534492753623189e-06, + "loss": 1.7059, + "step": 22010 + }, + { + "epoch": 1.83, + "grad_norm": 1.6870843172073364, + "learning_rate": 7.528695652173914e-06, + "loss": 1.8291, + "step": 22020 + }, + { + "epoch": 1.84, + "grad_norm": 3.0037596225738525, + "learning_rate": 7.522898550724639e-06, + "loss": 1.7288, + "step": 22030 + }, + { + "epoch": 1.84, + "grad_norm": 2.6715378761291504, + "learning_rate": 7.517101449275363e-06, + "loss": 1.5929, + "step": 22040 + }, + { + "epoch": 1.84, + "grad_norm": 3.457519769668579, + "learning_rate": 7.511304347826087e-06, + "loss": 1.4734, + "step": 22050 + }, + { + "epoch": 1.84, + "grad_norm": 7.1646342277526855, + "learning_rate": 7.505507246376812e-06, + "loss": 1.7495, + "step": 22060 + }, + { + "epoch": 1.84, + "grad_norm": 4.050849437713623, + "learning_rate": 7.4997101449275366e-06, + "loss": 1.667, + "step": 22070 + }, + { + "epoch": 1.84, + "grad_norm": 3.6044504642486572, + "learning_rate": 7.493913043478261e-06, + "loss": 1.4592, + "step": 22080 + }, + { + "epoch": 1.84, + "grad_norm": 1.669327974319458, + "learning_rate": 7.488115942028986e-06, + "loss": 1.563, + "step": 22090 + }, + { + "epoch": 1.84, + "grad_norm": 4.431074142456055, + "learning_rate": 7.482318840579711e-06, + "loss": 1.5672, + "step": 22100 + }, + { + "epoch": 1.84, + "grad_norm": 6.121443748474121, + "learning_rate": 7.476521739130436e-06, + "loss": 1.6698, + "step": 22110 + }, + { + "epoch": 1.84, + "grad_norm": 1.799599289894104, + "learning_rate": 7.470724637681161e-06, + "loss": 1.6929, + "step": 22120 + }, + { + "epoch": 1.84, + "grad_norm": 3.3442468643188477, + "learning_rate": 7.464927536231885e-06, + "loss": 1.7485, + "step": 22130 + }, + { + "epoch": 1.84, + "grad_norm": 2.691938638687134, + "learning_rate": 7.459130434782609e-06, + "loss": 1.727, + "step": 22140 + }, + { + "epoch": 1.85, + "grad_norm": 3.82914400100708, + "learning_rate": 7.453333333333334e-06, + "loss": 1.5514, + "step": 22150 + }, + { + "epoch": 1.85, + "grad_norm": 3.622535467147827, + "learning_rate": 7.4475362318840585e-06, + "loss": 1.7807, + "step": 22160 + }, + { + "epoch": 1.85, + "grad_norm": 1.5094232559204102, + "learning_rate": 7.441739130434783e-06, + "loss": 1.7863, + "step": 22170 + }, + { + "epoch": 1.85, + "grad_norm": 2.016897201538086, + "learning_rate": 7.435942028985507e-06, + "loss": 1.7699, + "step": 22180 + }, + { + "epoch": 1.85, + "grad_norm": 2.2854208946228027, + "learning_rate": 7.430144927536232e-06, + "loss": 1.7514, + "step": 22190 + }, + { + "epoch": 1.85, + "grad_norm": 8.095964431762695, + "learning_rate": 7.424347826086957e-06, + "loss": 1.6181, + "step": 22200 + }, + { + "epoch": 1.85, + "grad_norm": 2.0803048610687256, + "learning_rate": 7.418550724637682e-06, + "loss": 1.6808, + "step": 22210 + }, + { + "epoch": 1.85, + "grad_norm": 5.521609306335449, + "learning_rate": 7.413333333333333e-06, + "loss": 1.641, + "step": 22220 + }, + { + "epoch": 1.85, + "grad_norm": 4.716726303100586, + "learning_rate": 7.407536231884058e-06, + "loss": 1.7521, + "step": 22230 + }, + { + "epoch": 1.85, + "grad_norm": 4.207428455352783, + "learning_rate": 7.401739130434783e-06, + "loss": 1.6185, + "step": 22240 + }, + { + "epoch": 1.85, + "grad_norm": 3.840510368347168, + "learning_rate": 7.395942028985508e-06, + "loss": 1.4995, + "step": 22250 + }, + { + "epoch": 1.85, + "grad_norm": 2.4733941555023193, + "learning_rate": 7.390144927536233e-06, + "loss": 1.5198, + "step": 22260 + }, + { + "epoch": 1.86, + "grad_norm": 3.7702128887176514, + "learning_rate": 7.3843478260869575e-06, + "loss": 1.543, + "step": 22270 + }, + { + "epoch": 1.86, + "grad_norm": 3.0251762866973877, + "learning_rate": 7.378550724637682e-06, + "loss": 1.5978, + "step": 22280 + }, + { + "epoch": 1.86, + "grad_norm": 7.0895233154296875, + "learning_rate": 7.372753623188406e-06, + "loss": 1.6949, + "step": 22290 + }, + { + "epoch": 1.86, + "grad_norm": 2.8260159492492676, + "learning_rate": 7.366956521739131e-06, + "loss": 1.7792, + "step": 22300 + }, + { + "epoch": 1.86, + "grad_norm": 3.7007031440734863, + "learning_rate": 7.361159420289856e-06, + "loss": 1.6979, + "step": 22310 + }, + { + "epoch": 1.86, + "grad_norm": 4.845219612121582, + "learning_rate": 7.35536231884058e-06, + "loss": 1.3902, + "step": 22320 + }, + { + "epoch": 1.86, + "grad_norm": 4.411277770996094, + "learning_rate": 7.349565217391305e-06, + "loss": 1.6546, + "step": 22330 + }, + { + "epoch": 1.86, + "grad_norm": 3.6635563373565674, + "learning_rate": 7.343768115942029e-06, + "loss": 1.7551, + "step": 22340 + }, + { + "epoch": 1.86, + "grad_norm": 8.20759391784668, + "learning_rate": 7.337971014492754e-06, + "loss": 1.574, + "step": 22350 + }, + { + "epoch": 1.86, + "grad_norm": 1.9890128374099731, + "learning_rate": 7.332173913043479e-06, + "loss": 1.6738, + "step": 22360 + }, + { + "epoch": 1.86, + "grad_norm": 8.436697959899902, + "learning_rate": 7.3263768115942035e-06, + "loss": 1.6415, + "step": 22370 + }, + { + "epoch": 1.86, + "grad_norm": 1.9131646156311035, + "learning_rate": 7.320579710144928e-06, + "loss": 1.5153, + "step": 22380 + }, + { + "epoch": 1.87, + "grad_norm": 2.4065988063812256, + "learning_rate": 7.314782608695653e-06, + "loss": 1.6153, + "step": 22390 + }, + { + "epoch": 1.87, + "grad_norm": 2.279060125350952, + "learning_rate": 7.308985507246378e-06, + "loss": 1.6871, + "step": 22400 + }, + { + "epoch": 1.87, + "grad_norm": 2.864788770675659, + "learning_rate": 7.303188405797103e-06, + "loss": 1.6629, + "step": 22410 + }, + { + "epoch": 1.87, + "grad_norm": 6.575710296630859, + "learning_rate": 7.297391304347826e-06, + "loss": 1.5627, + "step": 22420 + }, + { + "epoch": 1.87, + "grad_norm": 5.330643653869629, + "learning_rate": 7.291594202898551e-06, + "loss": 1.7939, + "step": 22430 + }, + { + "epoch": 1.87, + "grad_norm": 1.9671368598937988, + "learning_rate": 7.285797101449276e-06, + "loss": 1.6338, + "step": 22440 + }, + { + "epoch": 1.87, + "grad_norm": 8.052875518798828, + "learning_rate": 7.280000000000001e-06, + "loss": 1.7247, + "step": 22450 + }, + { + "epoch": 1.87, + "grad_norm": 8.003440856933594, + "learning_rate": 7.2742028985507255e-06, + "loss": 1.6002, + "step": 22460 + }, + { + "epoch": 1.87, + "grad_norm": 15.555729866027832, + "learning_rate": 7.26840579710145e-06, + "loss": 1.7289, + "step": 22470 + }, + { + "epoch": 1.87, + "grad_norm": 2.3797450065612793, + "learning_rate": 7.262608695652174e-06, + "loss": 1.5819, + "step": 22480 + }, + { + "epoch": 1.87, + "grad_norm": 5.605469226837158, + "learning_rate": 7.256811594202899e-06, + "loss": 1.6578, + "step": 22490 + }, + { + "epoch": 1.88, + "grad_norm": 4.348262786865234, + "learning_rate": 7.251014492753624e-06, + "loss": 1.6862, + "step": 22500 + }, + { + "epoch": 1.88, + "eval_loss": 1.624435544013977, + "eval_runtime": 107.5057, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 22500 + }, + { + "epoch": 1.88, + "grad_norm": 1.7244126796722412, + "learning_rate": 7.245217391304349e-06, + "loss": 1.6801, + "step": 22510 + }, + { + "epoch": 1.88, + "grad_norm": 3.5117290019989014, + "learning_rate": 7.239420289855073e-06, + "loss": 1.6471, + "step": 22520 + }, + { + "epoch": 1.88, + "grad_norm": 3.5990796089172363, + "learning_rate": 7.233623188405797e-06, + "loss": 1.625, + "step": 22530 + }, + { + "epoch": 1.88, + "grad_norm": 4.40592622756958, + "learning_rate": 7.227826086956522e-06, + "loss": 1.6896, + "step": 22540 + }, + { + "epoch": 1.88, + "grad_norm": 2.524301528930664, + "learning_rate": 7.222028985507247e-06, + "loss": 1.6493, + "step": 22550 + }, + { + "epoch": 1.88, + "grad_norm": 3.01413631439209, + "learning_rate": 7.2162318840579714e-06, + "loss": 1.6456, + "step": 22560 + }, + { + "epoch": 1.88, + "grad_norm": 2.1266565322875977, + "learning_rate": 7.210434782608696e-06, + "loss": 1.6527, + "step": 22570 + }, + { + "epoch": 1.88, + "grad_norm": 1.7390304803848267, + "learning_rate": 7.204637681159421e-06, + "loss": 1.6855, + "step": 22580 + }, + { + "epoch": 1.88, + "grad_norm": 2.171137571334839, + "learning_rate": 7.198840579710146e-06, + "loss": 1.6591, + "step": 22590 + }, + { + "epoch": 1.88, + "grad_norm": 2.5304112434387207, + "learning_rate": 7.193043478260871e-06, + "loss": 1.62, + "step": 22600 + }, + { + "epoch": 1.88, + "grad_norm": 1.766373872756958, + "learning_rate": 7.187246376811595e-06, + "loss": 1.7098, + "step": 22610 + }, + { + "epoch": 1.89, + "grad_norm": 2.5186986923217773, + "learning_rate": 7.18144927536232e-06, + "loss": 1.5517, + "step": 22620 + }, + { + "epoch": 1.89, + "grad_norm": 1.9023741483688354, + "learning_rate": 7.175652173913044e-06, + "loss": 1.7088, + "step": 22630 + }, + { + "epoch": 1.89, + "grad_norm": 6.54264497756958, + "learning_rate": 7.1698550724637686e-06, + "loss": 1.6399, + "step": 22640 + }, + { + "epoch": 1.89, + "grad_norm": 7.001004695892334, + "learning_rate": 7.164057971014493e-06, + "loss": 1.5971, + "step": 22650 + }, + { + "epoch": 1.89, + "grad_norm": 6.7402238845825195, + "learning_rate": 7.1582608695652174e-06, + "loss": 1.6201, + "step": 22660 + }, + { + "epoch": 1.89, + "grad_norm": 5.761517524719238, + "learning_rate": 7.152463768115942e-06, + "loss": 1.6743, + "step": 22670 + }, + { + "epoch": 1.89, + "grad_norm": 1.6771907806396484, + "learning_rate": 7.146666666666667e-06, + "loss": 1.5944, + "step": 22680 + }, + { + "epoch": 1.89, + "grad_norm": 9.115729331970215, + "learning_rate": 7.140869565217392e-06, + "loss": 1.6292, + "step": 22690 + }, + { + "epoch": 1.89, + "grad_norm": 3.5204763412475586, + "learning_rate": 7.135072463768117e-06, + "loss": 1.6079, + "step": 22700 + }, + { + "epoch": 1.89, + "grad_norm": 1.9801039695739746, + "learning_rate": 7.129275362318842e-06, + "loss": 1.6728, + "step": 22710 + }, + { + "epoch": 1.89, + "grad_norm": 4.535811424255371, + "learning_rate": 7.1234782608695665e-06, + "loss": 1.5367, + "step": 22720 + }, + { + "epoch": 1.89, + "grad_norm": 4.705811023712158, + "learning_rate": 7.11768115942029e-06, + "loss": 1.6359, + "step": 22730 + }, + { + "epoch": 1.9, + "grad_norm": 5.373073577880859, + "learning_rate": 7.1118840579710146e-06, + "loss": 1.5461, + "step": 22740 + }, + { + "epoch": 1.9, + "grad_norm": 2.482778549194336, + "learning_rate": 7.106086956521739e-06, + "loss": 1.6968, + "step": 22750 + }, + { + "epoch": 1.9, + "grad_norm": 7.682448387145996, + "learning_rate": 7.100289855072464e-06, + "loss": 1.6213, + "step": 22760 + }, + { + "epoch": 1.9, + "grad_norm": 3.669218063354492, + "learning_rate": 7.094492753623189e-06, + "loss": 1.6408, + "step": 22770 + }, + { + "epoch": 1.9, + "grad_norm": 2.680028200149536, + "learning_rate": 7.088695652173914e-06, + "loss": 1.6225, + "step": 22780 + }, + { + "epoch": 1.9, + "grad_norm": 9.829727172851562, + "learning_rate": 7.082898550724638e-06, + "loss": 1.5706, + "step": 22790 + }, + { + "epoch": 1.9, + "grad_norm": 5.793923377990723, + "learning_rate": 7.077101449275363e-06, + "loss": 1.6649, + "step": 22800 + }, + { + "epoch": 1.9, + "grad_norm": 3.1820645332336426, + "learning_rate": 7.071304347826088e-06, + "loss": 1.518, + "step": 22810 + }, + { + "epoch": 1.9, + "grad_norm": 3.055821418762207, + "learning_rate": 7.0655072463768125e-06, + "loss": 1.6835, + "step": 22820 + }, + { + "epoch": 1.9, + "grad_norm": 6.942118167877197, + "learning_rate": 7.0597101449275365e-06, + "loss": 1.5547, + "step": 22830 + }, + { + "epoch": 1.9, + "grad_norm": 3.1956920623779297, + "learning_rate": 7.053913043478261e-06, + "loss": 1.7392, + "step": 22840 + }, + { + "epoch": 1.9, + "grad_norm": 1.6930139064788818, + "learning_rate": 7.048115942028985e-06, + "loss": 1.7377, + "step": 22850 + }, + { + "epoch": 1.91, + "grad_norm": 3.8593344688415527, + "learning_rate": 7.04231884057971e-06, + "loss": 1.6003, + "step": 22860 + }, + { + "epoch": 1.91, + "grad_norm": 6.152258396148682, + "learning_rate": 7.036521739130435e-06, + "loss": 1.671, + "step": 22870 + }, + { + "epoch": 1.91, + "grad_norm": 2.3631834983825684, + "learning_rate": 7.03072463768116e-06, + "loss": 1.5857, + "step": 22880 + }, + { + "epoch": 1.91, + "grad_norm": 1.4059417247772217, + "learning_rate": 7.024927536231885e-06, + "loss": 1.7629, + "step": 22890 + }, + { + "epoch": 1.91, + "grad_norm": 1.9621164798736572, + "learning_rate": 7.01913043478261e-06, + "loss": 1.7076, + "step": 22900 + }, + { + "epoch": 1.91, + "grad_norm": 0.742729127407074, + "learning_rate": 7.0133333333333345e-06, + "loss": 1.6253, + "step": 22910 + }, + { + "epoch": 1.91, + "grad_norm": 3.2152140140533447, + "learning_rate": 7.007536231884059e-06, + "loss": 1.6246, + "step": 22920 + }, + { + "epoch": 1.91, + "grad_norm": 3.5290796756744385, + "learning_rate": 7.001739130434783e-06, + "loss": 1.4322, + "step": 22930 + }, + { + "epoch": 1.91, + "grad_norm": 5.470486164093018, + "learning_rate": 6.995942028985507e-06, + "loss": 1.6678, + "step": 22940 + }, + { + "epoch": 1.91, + "grad_norm": 3.923020124435425, + "learning_rate": 6.990144927536232e-06, + "loss": 1.5589, + "step": 22950 + }, + { + "epoch": 1.91, + "grad_norm": 2.6417629718780518, + "learning_rate": 6.984347826086957e-06, + "loss": 1.5959, + "step": 22960 + }, + { + "epoch": 1.91, + "grad_norm": 3.574911117553711, + "learning_rate": 6.978550724637682e-06, + "loss": 1.7439, + "step": 22970 + }, + { + "epoch": 1.92, + "grad_norm": 1.8391579389572144, + "learning_rate": 6.972753623188406e-06, + "loss": 1.6204, + "step": 22980 + }, + { + "epoch": 1.92, + "grad_norm": 3.0558955669403076, + "learning_rate": 6.966956521739131e-06, + "loss": 1.6833, + "step": 22990 + }, + { + "epoch": 1.92, + "grad_norm": 6.424144744873047, + "learning_rate": 6.961159420289856e-06, + "loss": 1.7008, + "step": 23000 + }, + { + "epoch": 1.92, + "eval_loss": 1.6694403886795044, + "eval_runtime": 107.5169, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 23000 + }, + { + "epoch": 1.92, + "grad_norm": 2.5954649448394775, + "learning_rate": 6.9553623188405805e-06, + "loss": 1.6293, + "step": 23010 + }, + { + "epoch": 1.92, + "grad_norm": 3.2413156032562256, + "learning_rate": 6.949565217391305e-06, + "loss": 1.6196, + "step": 23020 + }, + { + "epoch": 1.92, + "grad_norm": 2.4454963207244873, + "learning_rate": 6.94376811594203e-06, + "loss": 1.689, + "step": 23030 + }, + { + "epoch": 1.92, + "grad_norm": 3.627885580062866, + "learning_rate": 6.937971014492753e-06, + "loss": 1.623, + "step": 23040 + }, + { + "epoch": 1.92, + "grad_norm": 1.9731435775756836, + "learning_rate": 6.932173913043478e-06, + "loss": 1.5804, + "step": 23050 + }, + { + "epoch": 1.92, + "grad_norm": 1.2060045003890991, + "learning_rate": 6.926376811594203e-06, + "loss": 1.5979, + "step": 23060 + }, + { + "epoch": 1.92, + "grad_norm": 4.452988147735596, + "learning_rate": 6.920579710144928e-06, + "loss": 1.693, + "step": 23070 + }, + { + "epoch": 1.92, + "grad_norm": 2.145307779312134, + "learning_rate": 6.914782608695653e-06, + "loss": 1.7325, + "step": 23080 + }, + { + "epoch": 1.92, + "grad_norm": 5.082767009735107, + "learning_rate": 6.908985507246378e-06, + "loss": 1.6337, + "step": 23090 + }, + { + "epoch": 1.93, + "grad_norm": 3.040972948074341, + "learning_rate": 6.9031884057971025e-06, + "loss": 1.6223, + "step": 23100 + }, + { + "epoch": 1.93, + "grad_norm": 1.870182752609253, + "learning_rate": 6.8973913043478265e-06, + "loss": 1.7026, + "step": 23110 + }, + { + "epoch": 1.93, + "grad_norm": 0.9211017489433289, + "learning_rate": 6.891594202898551e-06, + "loss": 1.642, + "step": 23120 + }, + { + "epoch": 1.93, + "grad_norm": 2.97402286529541, + "learning_rate": 6.885797101449276e-06, + "loss": 1.6483, + "step": 23130 + }, + { + "epoch": 1.93, + "grad_norm": 2.9657325744628906, + "learning_rate": 6.88e-06, + "loss": 1.6167, + "step": 23140 + }, + { + "epoch": 1.93, + "grad_norm": 4.580372333526611, + "learning_rate": 6.874202898550725e-06, + "loss": 1.8135, + "step": 23150 + }, + { + "epoch": 1.93, + "grad_norm": 3.190333843231201, + "learning_rate": 6.86840579710145e-06, + "loss": 1.669, + "step": 23160 + }, + { + "epoch": 1.93, + "grad_norm": 9.4367094039917, + "learning_rate": 6.862608695652174e-06, + "loss": 1.5822, + "step": 23170 + }, + { + "epoch": 1.93, + "grad_norm": 4.261363506317139, + "learning_rate": 6.856811594202899e-06, + "loss": 1.5777, + "step": 23180 + }, + { + "epoch": 1.93, + "grad_norm": 5.0194172859191895, + "learning_rate": 6.851014492753624e-06, + "loss": 1.7024, + "step": 23190 + }, + { + "epoch": 1.93, + "grad_norm": 5.917288780212402, + "learning_rate": 6.8452173913043485e-06, + "loss": 1.565, + "step": 23200 + }, + { + "epoch": 1.93, + "grad_norm": 3.0884828567504883, + "learning_rate": 6.839420289855073e-06, + "loss": 1.7616, + "step": 23210 + }, + { + "epoch": 1.94, + "grad_norm": 8.573901176452637, + "learning_rate": 6.833623188405798e-06, + "loss": 1.7546, + "step": 23220 + }, + { + "epoch": 1.94, + "grad_norm": 5.3873114585876465, + "learning_rate": 6.827826086956523e-06, + "loss": 1.69, + "step": 23230 + }, + { + "epoch": 1.94, + "grad_norm": 2.1748428344726562, + "learning_rate": 6.822028985507248e-06, + "loss": 1.5809, + "step": 23240 + }, + { + "epoch": 1.94, + "grad_norm": 1.9165292978286743, + "learning_rate": 6.816231884057971e-06, + "loss": 1.6822, + "step": 23250 + }, + { + "epoch": 1.94, + "grad_norm": 1.971045970916748, + "learning_rate": 6.810434782608696e-06, + "loss": 1.7906, + "step": 23260 + }, + { + "epoch": 1.94, + "grad_norm": 7.549450397491455, + "learning_rate": 6.804637681159421e-06, + "loss": 1.6903, + "step": 23270 + }, + { + "epoch": 1.94, + "grad_norm": 3.463688611984253, + "learning_rate": 6.798840579710146e-06, + "loss": 1.601, + "step": 23280 + }, + { + "epoch": 1.94, + "grad_norm": 2.8097610473632812, + "learning_rate": 6.7930434782608704e-06, + "loss": 1.6345, + "step": 23290 + }, + { + "epoch": 1.94, + "grad_norm": 5.737390518188477, + "learning_rate": 6.7872463768115945e-06, + "loss": 1.5124, + "step": 23300 + }, + { + "epoch": 1.94, + "grad_norm": 4.170960426330566, + "learning_rate": 6.781449275362319e-06, + "loss": 1.6134, + "step": 23310 + }, + { + "epoch": 1.94, + "grad_norm": 2.446510076522827, + "learning_rate": 6.775652173913044e-06, + "loss": 1.6565, + "step": 23320 + }, + { + "epoch": 1.94, + "grad_norm": 3.7264490127563477, + "learning_rate": 6.769855072463769e-06, + "loss": 1.7299, + "step": 23330 + }, + { + "epoch": 1.94, + "grad_norm": 3.8352794647216797, + "learning_rate": 6.764057971014494e-06, + "loss": 1.6823, + "step": 23340 + }, + { + "epoch": 1.95, + "grad_norm": 4.742753982543945, + "learning_rate": 6.758260869565217e-06, + "loss": 1.6586, + "step": 23350 + }, + { + "epoch": 1.95, + "grad_norm": 2.161562204360962, + "learning_rate": 6.752463768115942e-06, + "loss": 1.5763, + "step": 23360 + }, + { + "epoch": 1.95, + "grad_norm": 5.118688583374023, + "learning_rate": 6.746666666666667e-06, + "loss": 1.6764, + "step": 23370 + }, + { + "epoch": 1.95, + "grad_norm": 2.455047369003296, + "learning_rate": 6.740869565217392e-06, + "loss": 1.7486, + "step": 23380 + }, + { + "epoch": 1.95, + "grad_norm": 3.586467742919922, + "learning_rate": 6.7350724637681164e-06, + "loss": 1.6002, + "step": 23390 + }, + { + "epoch": 1.95, + "grad_norm": 1.8399118185043335, + "learning_rate": 6.729275362318841e-06, + "loss": 1.617, + "step": 23400 + }, + { + "epoch": 1.95, + "grad_norm": 2.191316604614258, + "learning_rate": 6.723478260869566e-06, + "loss": 1.6777, + "step": 23410 + }, + { + "epoch": 1.95, + "grad_norm": 3.110720634460449, + "learning_rate": 6.717681159420291e-06, + "loss": 1.627, + "step": 23420 + }, + { + "epoch": 1.95, + "grad_norm": 6.99454927444458, + "learning_rate": 6.711884057971015e-06, + "loss": 1.6708, + "step": 23430 + }, + { + "epoch": 1.95, + "grad_norm": 4.553925514221191, + "learning_rate": 6.70608695652174e-06, + "loss": 1.5284, + "step": 23440 + }, + { + "epoch": 1.95, + "grad_norm": 5.1461501121521, + "learning_rate": 6.700289855072464e-06, + "loss": 1.6239, + "step": 23450 + }, + { + "epoch": 1.96, + "grad_norm": 2.479620933532715, + "learning_rate": 6.694492753623189e-06, + "loss": 1.7803, + "step": 23460 + }, + { + "epoch": 1.96, + "grad_norm": 4.193335056304932, + "learning_rate": 6.6886956521739136e-06, + "loss": 1.726, + "step": 23470 + }, + { + "epoch": 1.96, + "grad_norm": 5.909477710723877, + "learning_rate": 6.682898550724638e-06, + "loss": 1.7277, + "step": 23480 + }, + { + "epoch": 1.96, + "grad_norm": 1.2803231477737427, + "learning_rate": 6.677101449275362e-06, + "loss": 1.6677, + "step": 23490 + }, + { + "epoch": 1.96, + "grad_norm": 2.0764338970184326, + "learning_rate": 6.671304347826087e-06, + "loss": 1.6192, + "step": 23500 + }, + { + "epoch": 1.96, + "eval_loss": 1.663988471031189, + "eval_runtime": 107.5268, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 23500 + }, + { + "epoch": 1.96, + "grad_norm": 4.041726112365723, + "learning_rate": 6.665507246376812e-06, + "loss": 1.6856, + "step": 23510 + }, + { + "epoch": 1.96, + "grad_norm": 7.944361209869385, + "learning_rate": 6.659710144927537e-06, + "loss": 1.7495, + "step": 23520 + }, + { + "epoch": 1.96, + "grad_norm": 2.918194532394409, + "learning_rate": 6.653913043478262e-06, + "loss": 1.6635, + "step": 23530 + }, + { + "epoch": 1.96, + "grad_norm": 8.257862091064453, + "learning_rate": 6.648115942028987e-06, + "loss": 1.717, + "step": 23540 + }, + { + "epoch": 1.96, + "grad_norm": 6.422058582305908, + "learning_rate": 6.6423188405797115e-06, + "loss": 1.6294, + "step": 23550 + }, + { + "epoch": 1.96, + "grad_norm": 5.261116027832031, + "learning_rate": 6.636521739130435e-06, + "loss": 1.712, + "step": 23560 + }, + { + "epoch": 1.96, + "grad_norm": 2.29148006439209, + "learning_rate": 6.6307246376811595e-06, + "loss": 1.7626, + "step": 23570 + }, + { + "epoch": 1.96, + "grad_norm": 4.524306774139404, + "learning_rate": 6.624927536231884e-06, + "loss": 1.4419, + "step": 23580 + }, + { + "epoch": 1.97, + "grad_norm": 5.170078277587891, + "learning_rate": 6.619130434782609e-06, + "loss": 1.7131, + "step": 23590 + }, + { + "epoch": 1.97, + "grad_norm": 7.53287410736084, + "learning_rate": 6.613333333333334e-06, + "loss": 1.6708, + "step": 23600 + }, + { + "epoch": 1.97, + "grad_norm": 3.6637678146362305, + "learning_rate": 6.607536231884059e-06, + "loss": 1.8157, + "step": 23610 + }, + { + "epoch": 1.97, + "grad_norm": 1.637990117073059, + "learning_rate": 6.601739130434783e-06, + "loss": 1.5326, + "step": 23620 + }, + { + "epoch": 1.97, + "grad_norm": 5.433206081390381, + "learning_rate": 6.595942028985508e-06, + "loss": 1.8064, + "step": 23630 + }, + { + "epoch": 1.97, + "grad_norm": 4.04767370223999, + "learning_rate": 6.590144927536233e-06, + "loss": 1.5902, + "step": 23640 + }, + { + "epoch": 1.97, + "grad_norm": 3.667262554168701, + "learning_rate": 6.5843478260869575e-06, + "loss": 1.5682, + "step": 23650 + }, + { + "epoch": 1.97, + "grad_norm": 3.9290482997894287, + "learning_rate": 6.5785507246376815e-06, + "loss": 1.6237, + "step": 23660 + }, + { + "epoch": 1.97, + "grad_norm": 3.297651529312134, + "learning_rate": 6.5727536231884055e-06, + "loss": 1.537, + "step": 23670 + }, + { + "epoch": 1.97, + "grad_norm": 3.4150543212890625, + "learning_rate": 6.56695652173913e-06, + "loss": 1.6064, + "step": 23680 + }, + { + "epoch": 1.97, + "grad_norm": 1.4117975234985352, + "learning_rate": 6.561159420289855e-06, + "loss": 1.6012, + "step": 23690 + }, + { + "epoch": 1.98, + "grad_norm": 4.487409591674805, + "learning_rate": 6.55536231884058e-06, + "loss": 1.6483, + "step": 23700 + }, + { + "epoch": 1.98, + "grad_norm": 2.014012575149536, + "learning_rate": 6.549565217391305e-06, + "loss": 1.592, + "step": 23710 + }, + { + "epoch": 1.98, + "grad_norm": 2.575296401977539, + "learning_rate": 6.54376811594203e-06, + "loss": 1.574, + "step": 23720 + }, + { + "epoch": 1.98, + "grad_norm": 1.4668456315994263, + "learning_rate": 6.537971014492755e-06, + "loss": 1.6368, + "step": 23730 + }, + { + "epoch": 1.98, + "grad_norm": 6.705700874328613, + "learning_rate": 6.5321739130434795e-06, + "loss": 1.6754, + "step": 23740 + }, + { + "epoch": 1.98, + "grad_norm": 2.414182424545288, + "learning_rate": 6.5263768115942035e-06, + "loss": 1.5873, + "step": 23750 + }, + { + "epoch": 1.98, + "grad_norm": 3.606837749481201, + "learning_rate": 6.5205797101449275e-06, + "loss": 1.7331, + "step": 23760 + }, + { + "epoch": 1.98, + "grad_norm": 6.832721710205078, + "learning_rate": 6.514782608695652e-06, + "loss": 1.668, + "step": 23770 + }, + { + "epoch": 1.98, + "grad_norm": 3.788400173187256, + "learning_rate": 6.508985507246377e-06, + "loss": 1.5374, + "step": 23780 + }, + { + "epoch": 1.98, + "grad_norm": 3.2363767623901367, + "learning_rate": 6.503188405797102e-06, + "loss": 1.6609, + "step": 23790 + }, + { + "epoch": 1.98, + "grad_norm": 1.4389679431915283, + "learning_rate": 6.497391304347826e-06, + "loss": 1.6405, + "step": 23800 + }, + { + "epoch": 1.98, + "grad_norm": 4.902224540710449, + "learning_rate": 6.491594202898551e-06, + "loss": 1.6073, + "step": 23810 + }, + { + "epoch": 1.98, + "grad_norm": 3.991961717605591, + "learning_rate": 6.485797101449276e-06, + "loss": 1.6646, + "step": 23820 + }, + { + "epoch": 1.99, + "grad_norm": 4.761397838592529, + "learning_rate": 6.480000000000001e-06, + "loss": 1.6553, + "step": 23830 + }, + { + "epoch": 1.99, + "grad_norm": 3.5798428058624268, + "learning_rate": 6.4742028985507255e-06, + "loss": 1.6409, + "step": 23840 + }, + { + "epoch": 1.99, + "grad_norm": 1.4175989627838135, + "learning_rate": 6.46840579710145e-06, + "loss": 1.7199, + "step": 23850 + }, + { + "epoch": 1.99, + "grad_norm": 2.976370334625244, + "learning_rate": 6.462608695652175e-06, + "loss": 1.6925, + "step": 23860 + }, + { + "epoch": 1.99, + "grad_norm": 5.479163646697998, + "learning_rate": 6.456811594202898e-06, + "loss": 1.6045, + "step": 23870 + }, + { + "epoch": 1.99, + "grad_norm": 2.5710270404815674, + "learning_rate": 6.451014492753623e-06, + "loss": 1.6237, + "step": 23880 + }, + { + "epoch": 1.99, + "grad_norm": 7.615454196929932, + "learning_rate": 6.445217391304348e-06, + "loss": 1.7599, + "step": 23890 + }, + { + "epoch": 1.99, + "grad_norm": 7.336389064788818, + "learning_rate": 6.439420289855073e-06, + "loss": 1.4353, + "step": 23900 + }, + { + "epoch": 1.99, + "grad_norm": 8.621050834655762, + "learning_rate": 6.433623188405798e-06, + "loss": 1.6964, + "step": 23910 + }, + { + "epoch": 1.99, + "grad_norm": 6.770620346069336, + "learning_rate": 6.427826086956523e-06, + "loss": 1.467, + "step": 23920 + }, + { + "epoch": 1.99, + "grad_norm": 4.004281520843506, + "learning_rate": 6.4220289855072475e-06, + "loss": 1.6323, + "step": 23930 + }, + { + "epoch": 2.0, + "grad_norm": 3.508981704711914, + "learning_rate": 6.4162318840579715e-06, + "loss": 1.648, + "step": 23940 + }, + { + "epoch": 2.0, + "grad_norm": 6.081500053405762, + "learning_rate": 6.410434782608696e-06, + "loss": 1.5768, + "step": 23950 + }, + { + "epoch": 2.0, + "grad_norm": 6.746595859527588, + "learning_rate": 6.404637681159421e-06, + "loss": 1.6595, + "step": 23960 + }, + { + "epoch": 2.0, + "grad_norm": 3.1742167472839355, + "learning_rate": 6.398840579710145e-06, + "loss": 1.6841, + "step": 23970 + }, + { + "epoch": 2.0, + "grad_norm": 3.1276462078094482, + "learning_rate": 6.39304347826087e-06, + "loss": 1.5213, + "step": 23980 + }, + { + "epoch": 2.0, + "grad_norm": 3.458132028579712, + "learning_rate": 6.387246376811594e-06, + "loss": 1.5896, + "step": 23990 + }, + { + "epoch": 2.0, + "grad_norm": 5.596000671386719, + "learning_rate": 6.381449275362319e-06, + "loss": 1.6449, + "step": 24000 + }, + { + "epoch": 2.0, + "eval_loss": 1.6730765104293823, + "eval_runtime": 107.5278, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 24000 + }, + { + "epoch": 2.0, + "grad_norm": 4.354638576507568, + "learning_rate": 6.375652173913044e-06, + "loss": 1.5624, + "step": 24010 + }, + { + "epoch": 2.0, + "grad_norm": 1.6478346586227417, + "learning_rate": 6.369855072463769e-06, + "loss": 1.7343, + "step": 24020 + }, + { + "epoch": 2.0, + "grad_norm": 4.3177080154418945, + "learning_rate": 6.3640579710144935e-06, + "loss": 1.4786, + "step": 24030 + }, + { + "epoch": 2.0, + "grad_norm": 6.199551582336426, + "learning_rate": 6.358260869565218e-06, + "loss": 1.5448, + "step": 24040 + }, + { + "epoch": 2.0, + "grad_norm": 6.634474754333496, + "learning_rate": 6.352463768115943e-06, + "loss": 1.4954, + "step": 24050 + }, + { + "epoch": 2.0, + "grad_norm": 2.2182674407958984, + "learning_rate": 6.346666666666668e-06, + "loss": 1.6519, + "step": 24060 + }, + { + "epoch": 2.01, + "grad_norm": 12.482495307922363, + "learning_rate": 6.340869565217391e-06, + "loss": 1.5864, + "step": 24070 + }, + { + "epoch": 2.01, + "grad_norm": 4.257279872894287, + "learning_rate": 6.335072463768116e-06, + "loss": 1.4682, + "step": 24080 + }, + { + "epoch": 2.01, + "grad_norm": 5.081076622009277, + "learning_rate": 6.329275362318841e-06, + "loss": 1.5972, + "step": 24090 + }, + { + "epoch": 2.01, + "grad_norm": 4.3276896476745605, + "learning_rate": 6.323478260869566e-06, + "loss": 1.658, + "step": 24100 + }, + { + "epoch": 2.01, + "grad_norm": 2.144803524017334, + "learning_rate": 6.317681159420291e-06, + "loss": 1.4938, + "step": 24110 + }, + { + "epoch": 2.01, + "grad_norm": 11.347249031066895, + "learning_rate": 6.311884057971015e-06, + "loss": 1.6704, + "step": 24120 + }, + { + "epoch": 2.01, + "grad_norm": 5.875922679901123, + "learning_rate": 6.3060869565217394e-06, + "loss": 1.5384, + "step": 24130 + }, + { + "epoch": 2.01, + "grad_norm": 2.232929229736328, + "learning_rate": 6.300289855072464e-06, + "loss": 1.6183, + "step": 24140 + }, + { + "epoch": 2.01, + "grad_norm": 2.2293903827667236, + "learning_rate": 6.294492753623189e-06, + "loss": 1.4965, + "step": 24150 + }, + { + "epoch": 2.01, + "grad_norm": 7.873164653778076, + "learning_rate": 6.288695652173914e-06, + "loss": 1.6817, + "step": 24160 + }, + { + "epoch": 2.01, + "grad_norm": 6.767528057098389, + "learning_rate": 6.282898550724639e-06, + "loss": 1.7035, + "step": 24170 + }, + { + "epoch": 2.02, + "grad_norm": 2.819685220718384, + "learning_rate": 6.277101449275362e-06, + "loss": 1.6476, + "step": 24180 + }, + { + "epoch": 2.02, + "grad_norm": 8.485639572143555, + "learning_rate": 6.271304347826087e-06, + "loss": 1.6737, + "step": 24190 + }, + { + "epoch": 2.02, + "grad_norm": 4.753501892089844, + "learning_rate": 6.265507246376812e-06, + "loss": 1.5851, + "step": 24200 + }, + { + "epoch": 2.02, + "grad_norm": 2.462150812149048, + "learning_rate": 6.2597101449275366e-06, + "loss": 1.5416, + "step": 24210 + }, + { + "epoch": 2.02, + "grad_norm": 4.648767948150635, + "learning_rate": 6.2539130434782614e-06, + "loss": 1.6021, + "step": 24220 + }, + { + "epoch": 2.02, + "grad_norm": 4.275688648223877, + "learning_rate": 6.248115942028986e-06, + "loss": 1.6221, + "step": 24230 + }, + { + "epoch": 2.02, + "grad_norm": 5.949949264526367, + "learning_rate": 6.242318840579711e-06, + "loss": 1.5643, + "step": 24240 + }, + { + "epoch": 2.02, + "grad_norm": 0.8903957605361938, + "learning_rate": 6.236521739130436e-06, + "loss": 1.8374, + "step": 24250 + }, + { + "epoch": 2.02, + "grad_norm": 12.835528373718262, + "learning_rate": 6.23072463768116e-06, + "loss": 1.7176, + "step": 24260 + }, + { + "epoch": 2.02, + "grad_norm": 2.02567195892334, + "learning_rate": 6.224927536231885e-06, + "loss": 1.7012, + "step": 24270 + }, + { + "epoch": 2.02, + "grad_norm": 2.2769625186920166, + "learning_rate": 6.219130434782609e-06, + "loss": 1.6408, + "step": 24280 + }, + { + "epoch": 2.02, + "grad_norm": 4.010924339294434, + "learning_rate": 6.213333333333334e-06, + "loss": 1.7554, + "step": 24290 + }, + { + "epoch": 2.02, + "grad_norm": 3.4019651412963867, + "learning_rate": 6.2075362318840586e-06, + "loss": 1.6405, + "step": 24300 + }, + { + "epoch": 2.03, + "grad_norm": 12.412097930908203, + "learning_rate": 6.2017391304347826e-06, + "loss": 1.5947, + "step": 24310 + }, + { + "epoch": 2.03, + "grad_norm": 5.544475555419922, + "learning_rate": 6.195942028985507e-06, + "loss": 1.6161, + "step": 24320 + }, + { + "epoch": 2.03, + "grad_norm": 4.096646308898926, + "learning_rate": 6.190144927536232e-06, + "loss": 1.5265, + "step": 24330 + }, + { + "epoch": 2.03, + "grad_norm": 6.934004783630371, + "learning_rate": 6.184347826086957e-06, + "loss": 1.6078, + "step": 24340 + }, + { + "epoch": 2.03, + "grad_norm": 3.017855405807495, + "learning_rate": 6.178550724637682e-06, + "loss": 1.7315, + "step": 24350 + }, + { + "epoch": 2.03, + "grad_norm": 4.79693603515625, + "learning_rate": 6.172753623188407e-06, + "loss": 1.6719, + "step": 24360 + }, + { + "epoch": 2.03, + "grad_norm": 2.320878267288208, + "learning_rate": 6.166956521739132e-06, + "loss": 1.7665, + "step": 24370 + }, + { + "epoch": 2.03, + "grad_norm": 1.4704517126083374, + "learning_rate": 6.161159420289855e-06, + "loss": 1.6487, + "step": 24380 + }, + { + "epoch": 2.03, + "grad_norm": 1.9392638206481934, + "learning_rate": 6.15536231884058e-06, + "loss": 1.6703, + "step": 24390 + }, + { + "epoch": 2.03, + "grad_norm": 1.3862284421920776, + "learning_rate": 6.1495652173913045e-06, + "loss": 1.5763, + "step": 24400 + }, + { + "epoch": 2.03, + "grad_norm": 2.0127170085906982, + "learning_rate": 6.143768115942029e-06, + "loss": 1.6736, + "step": 24410 + }, + { + "epoch": 2.04, + "grad_norm": 2.7247025966644287, + "learning_rate": 6.137971014492754e-06, + "loss": 1.5715, + "step": 24420 + }, + { + "epoch": 2.04, + "grad_norm": 1.8624944686889648, + "learning_rate": 6.132173913043479e-06, + "loss": 1.7204, + "step": 24430 + }, + { + "epoch": 2.04, + "grad_norm": 9.080060005187988, + "learning_rate": 6.126376811594203e-06, + "loss": 1.6873, + "step": 24440 + }, + { + "epoch": 2.04, + "grad_norm": 5.190478801727295, + "learning_rate": 6.120579710144928e-06, + "loss": 1.6602, + "step": 24450 + }, + { + "epoch": 2.04, + "grad_norm": 2.000903606414795, + "learning_rate": 6.114782608695653e-06, + "loss": 1.6189, + "step": 24460 + }, + { + "epoch": 2.04, + "grad_norm": 2.4447052478790283, + "learning_rate": 6.108985507246378e-06, + "loss": 1.6426, + "step": 24470 + }, + { + "epoch": 2.04, + "grad_norm": 2.6354033946990967, + "learning_rate": 6.1031884057971025e-06, + "loss": 1.5703, + "step": 24480 + }, + { + "epoch": 2.04, + "grad_norm": 3.4452903270721436, + "learning_rate": 6.0973913043478265e-06, + "loss": 1.6759, + "step": 24490 + }, + { + "epoch": 2.04, + "grad_norm": 3.570955991744995, + "learning_rate": 6.0915942028985505e-06, + "loss": 1.7965, + "step": 24500 + }, + { + "epoch": 2.04, + "eval_loss": 1.6831989288330078, + "eval_runtime": 107.5466, + "eval_samples_per_second": 9.298, + "eval_steps_per_second": 2.325, + "step": 24500 + }, + { + "epoch": 2.04, + "grad_norm": 3.475395917892456, + "learning_rate": 6.085797101449275e-06, + "loss": 1.5667, + "step": 24510 + }, + { + "epoch": 2.04, + "grad_norm": 4.197578430175781, + "learning_rate": 6.08e-06, + "loss": 1.443, + "step": 24520 + }, + { + "epoch": 2.04, + "grad_norm": 3.9446558952331543, + "learning_rate": 6.074202898550725e-06, + "loss": 1.6287, + "step": 24530 + }, + { + "epoch": 2.04, + "grad_norm": 2.6334280967712402, + "learning_rate": 6.06840579710145e-06, + "loss": 1.6424, + "step": 24540 + }, + { + "epoch": 2.05, + "grad_norm": 3.000389575958252, + "learning_rate": 6.062608695652175e-06, + "loss": 1.5955, + "step": 24550 + }, + { + "epoch": 2.05, + "grad_norm": 2.637186050415039, + "learning_rate": 6.0568115942029e-06, + "loss": 1.5686, + "step": 24560 + }, + { + "epoch": 2.05, + "grad_norm": 2.8098504543304443, + "learning_rate": 6.051014492753624e-06, + "loss": 1.6036, + "step": 24570 + }, + { + "epoch": 2.05, + "grad_norm": 4.713962078094482, + "learning_rate": 6.0452173913043485e-06, + "loss": 1.7327, + "step": 24580 + }, + { + "epoch": 2.05, + "grad_norm": 3.344076156616211, + "learning_rate": 6.0394202898550725e-06, + "loss": 1.7235, + "step": 24590 + }, + { + "epoch": 2.05, + "grad_norm": 6.279098033905029, + "learning_rate": 6.0342028985507255e-06, + "loss": 1.7064, + "step": 24600 + }, + { + "epoch": 2.05, + "grad_norm": 3.025627851486206, + "learning_rate": 6.0284057971014495e-06, + "loss": 1.6894, + "step": 24610 + }, + { + "epoch": 2.05, + "grad_norm": 22.400253295898438, + "learning_rate": 6.022608695652174e-06, + "loss": 1.6231, + "step": 24620 + }, + { + "epoch": 2.05, + "grad_norm": 3.277951955795288, + "learning_rate": 6.016811594202899e-06, + "loss": 1.6483, + "step": 24630 + }, + { + "epoch": 2.05, + "grad_norm": 3.1129698753356934, + "learning_rate": 6.011014492753624e-06, + "loss": 1.6069, + "step": 24640 + }, + { + "epoch": 2.05, + "grad_norm": 1.6595231294631958, + "learning_rate": 6.005217391304349e-06, + "loss": 1.5796, + "step": 24650 + }, + { + "epoch": 2.06, + "grad_norm": 4.5862345695495605, + "learning_rate": 5.999420289855072e-06, + "loss": 1.6568, + "step": 24660 + }, + { + "epoch": 2.06, + "grad_norm": 4.170677661895752, + "learning_rate": 5.993623188405797e-06, + "loss": 1.608, + "step": 24670 + }, + { + "epoch": 2.06, + "grad_norm": 5.7852559089660645, + "learning_rate": 5.987826086956522e-06, + "loss": 1.6639, + "step": 24680 + }, + { + "epoch": 2.06, + "grad_norm": 2.2292609214782715, + "learning_rate": 5.982028985507247e-06, + "loss": 1.7244, + "step": 24690 + }, + { + "epoch": 2.06, + "grad_norm": 1.7636544704437256, + "learning_rate": 5.9762318840579715e-06, + "loss": 1.6089, + "step": 24700 + }, + { + "epoch": 2.06, + "grad_norm": 2.0442698001861572, + "learning_rate": 5.970434782608696e-06, + "loss": 1.6019, + "step": 24710 + }, + { + "epoch": 2.06, + "grad_norm": 5.134725093841553, + "learning_rate": 5.964637681159421e-06, + "loss": 1.5547, + "step": 24720 + }, + { + "epoch": 2.06, + "grad_norm": 4.921815872192383, + "learning_rate": 5.958840579710146e-06, + "loss": 1.5884, + "step": 24730 + }, + { + "epoch": 2.06, + "grad_norm": 2.0440871715545654, + "learning_rate": 5.95304347826087e-06, + "loss": 1.7173, + "step": 24740 + }, + { + "epoch": 2.06, + "grad_norm": 9.600371360778809, + "learning_rate": 5.947246376811595e-06, + "loss": 1.6929, + "step": 24750 + }, + { + "epoch": 2.06, + "grad_norm": 5.437139987945557, + "learning_rate": 5.94144927536232e-06, + "loss": 1.5867, + "step": 24760 + }, + { + "epoch": 2.06, + "grad_norm": 2.7451493740081787, + "learning_rate": 5.935652173913044e-06, + "loss": 1.5753, + "step": 24770 + }, + { + "epoch": 2.06, + "grad_norm": 5.03071928024292, + "learning_rate": 5.929855072463769e-06, + "loss": 1.7336, + "step": 24780 + }, + { + "epoch": 2.07, + "grad_norm": 5.349041938781738, + "learning_rate": 5.924057971014493e-06, + "loss": 1.5073, + "step": 24790 + }, + { + "epoch": 2.07, + "grad_norm": 2.7859678268432617, + "learning_rate": 5.9182608695652174e-06, + "loss": 1.5372, + "step": 24800 + }, + { + "epoch": 2.07, + "grad_norm": 2.020723819732666, + "learning_rate": 5.912463768115942e-06, + "loss": 1.4775, + "step": 24810 + }, + { + "epoch": 2.07, + "grad_norm": 5.465567111968994, + "learning_rate": 5.906666666666667e-06, + "loss": 1.6493, + "step": 24820 + }, + { + "epoch": 2.07, + "grad_norm": 5.622159004211426, + "learning_rate": 5.900869565217392e-06, + "loss": 1.5543, + "step": 24830 + }, + { + "epoch": 2.07, + "grad_norm": 3.285824775695801, + "learning_rate": 5.895072463768117e-06, + "loss": 1.5926, + "step": 24840 + }, + { + "epoch": 2.07, + "grad_norm": 18.595064163208008, + "learning_rate": 5.889275362318842e-06, + "loss": 1.5055, + "step": 24850 + }, + { + "epoch": 2.07, + "grad_norm": 1.7584556341171265, + "learning_rate": 5.8834782608695666e-06, + "loss": 1.6876, + "step": 24860 + }, + { + "epoch": 2.07, + "grad_norm": 9.670427322387695, + "learning_rate": 5.87768115942029e-06, + "loss": 1.5141, + "step": 24870 + }, + { + "epoch": 2.07, + "grad_norm": 1.9229793548583984, + "learning_rate": 5.8718840579710146e-06, + "loss": 1.5989, + "step": 24880 + }, + { + "epoch": 2.07, + "grad_norm": 2.915713310241699, + "learning_rate": 5.866086956521739e-06, + "loss": 1.6928, + "step": 24890 + }, + { + "epoch": 2.08, + "grad_norm": 2.840363025665283, + "learning_rate": 5.860289855072464e-06, + "loss": 1.6718, + "step": 24900 + }, + { + "epoch": 2.08, + "grad_norm": 9.295182228088379, + "learning_rate": 5.854492753623189e-06, + "loss": 1.6016, + "step": 24910 + }, + { + "epoch": 2.08, + "grad_norm": 3.412475109100342, + "learning_rate": 5.848695652173913e-06, + "loss": 1.6354, + "step": 24920 + }, + { + "epoch": 2.08, + "grad_norm": 8.369576454162598, + "learning_rate": 5.842898550724638e-06, + "loss": 1.6169, + "step": 24930 + }, + { + "epoch": 2.08, + "grad_norm": 1.795441746711731, + "learning_rate": 5.837101449275363e-06, + "loss": 1.6285, + "step": 24940 + }, + { + "epoch": 2.08, + "grad_norm": 4.794939994812012, + "learning_rate": 5.831304347826088e-06, + "loss": 1.5579, + "step": 24950 + }, + { + "epoch": 2.08, + "grad_norm": 1.6949840784072876, + "learning_rate": 5.8255072463768125e-06, + "loss": 1.6978, + "step": 24960 + }, + { + "epoch": 2.08, + "grad_norm": 3.829167366027832, + "learning_rate": 5.8197101449275366e-06, + "loss": 1.6054, + "step": 24970 + }, + { + "epoch": 2.08, + "grad_norm": 5.521337509155273, + "learning_rate": 5.8139130434782606e-06, + "loss": 1.6029, + "step": 24980 + }, + { + "epoch": 2.08, + "grad_norm": 7.663822174072266, + "learning_rate": 5.808115942028985e-06, + "loss": 1.5795, + "step": 24990 + }, + { + "epoch": 2.08, + "grad_norm": 2.8638572692871094, + "learning_rate": 5.80231884057971e-06, + "loss": 1.715, + "step": 25000 + }, + { + "epoch": 2.08, + "eval_loss": 1.62440824508667, + "eval_runtime": 107.4912, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 25000 + }, + { + "epoch": 2.08, + "grad_norm": 2.584785223007202, + "learning_rate": 5.796521739130435e-06, + "loss": 1.6872, + "step": 25010 + }, + { + "epoch": 2.08, + "grad_norm": 2.3847389221191406, + "learning_rate": 5.79072463768116e-06, + "loss": 1.6624, + "step": 25020 + }, + { + "epoch": 2.09, + "grad_norm": 4.764808654785156, + "learning_rate": 5.784927536231885e-06, + "loss": 1.682, + "step": 25030 + }, + { + "epoch": 2.09, + "grad_norm": 4.870251178741455, + "learning_rate": 5.77913043478261e-06, + "loss": 1.4977, + "step": 25040 + }, + { + "epoch": 2.09, + "grad_norm": 4.905124187469482, + "learning_rate": 5.7733333333333345e-06, + "loss": 1.573, + "step": 25050 + }, + { + "epoch": 2.09, + "grad_norm": 11.395575523376465, + "learning_rate": 5.7675362318840585e-06, + "loss": 1.6053, + "step": 25060 + }, + { + "epoch": 2.09, + "grad_norm": 2.989179849624634, + "learning_rate": 5.761739130434783e-06, + "loss": 1.6168, + "step": 25070 + }, + { + "epoch": 2.09, + "grad_norm": 4.6165571212768555, + "learning_rate": 5.755942028985507e-06, + "loss": 1.6171, + "step": 25080 + }, + { + "epoch": 2.09, + "grad_norm": 5.178078651428223, + "learning_rate": 5.750144927536232e-06, + "loss": 1.6461, + "step": 25090 + }, + { + "epoch": 2.09, + "grad_norm": 2.245847702026367, + "learning_rate": 5.744347826086957e-06, + "loss": 1.581, + "step": 25100 + }, + { + "epoch": 2.09, + "grad_norm": 8.20195484161377, + "learning_rate": 5.738550724637681e-06, + "loss": 1.6275, + "step": 25110 + }, + { + "epoch": 2.09, + "grad_norm": 1.7283331155776978, + "learning_rate": 5.732753623188406e-06, + "loss": 1.6411, + "step": 25120 + }, + { + "epoch": 2.09, + "grad_norm": 2.998704195022583, + "learning_rate": 5.726956521739131e-06, + "loss": 1.6231, + "step": 25130 + }, + { + "epoch": 2.1, + "grad_norm": 2.490156412124634, + "learning_rate": 5.721159420289856e-06, + "loss": 1.6043, + "step": 25140 + }, + { + "epoch": 2.1, + "grad_norm": 2.0524210929870605, + "learning_rate": 5.7153623188405805e-06, + "loss": 1.7545, + "step": 25150 + }, + { + "epoch": 2.1, + "grad_norm": 9.426039695739746, + "learning_rate": 5.709565217391305e-06, + "loss": 1.5325, + "step": 25160 + }, + { + "epoch": 2.1, + "grad_norm": 2.069500207901001, + "learning_rate": 5.70376811594203e-06, + "loss": 1.6505, + "step": 25170 + }, + { + "epoch": 2.1, + "grad_norm": 6.134079933166504, + "learning_rate": 5.697971014492753e-06, + "loss": 1.5572, + "step": 25180 + }, + { + "epoch": 2.1, + "grad_norm": 3.0880463123321533, + "learning_rate": 5.692173913043478e-06, + "loss": 1.6467, + "step": 25190 + }, + { + "epoch": 2.1, + "grad_norm": 5.196481227874756, + "learning_rate": 5.686376811594203e-06, + "loss": 1.5883, + "step": 25200 + }, + { + "epoch": 2.1, + "grad_norm": 3.574937343597412, + "learning_rate": 5.680579710144928e-06, + "loss": 1.7104, + "step": 25210 + }, + { + "epoch": 2.1, + "grad_norm": 7.061581611633301, + "learning_rate": 5.674782608695653e-06, + "loss": 1.6283, + "step": 25220 + }, + { + "epoch": 2.1, + "grad_norm": 4.538878917694092, + "learning_rate": 5.668985507246378e-06, + "loss": 1.5797, + "step": 25230 + }, + { + "epoch": 2.1, + "grad_norm": 5.407559394836426, + "learning_rate": 5.663188405797102e-06, + "loss": 1.6253, + "step": 25240 + }, + { + "epoch": 2.1, + "grad_norm": 4.123929500579834, + "learning_rate": 5.6573913043478265e-06, + "loss": 1.7386, + "step": 25250 + }, + { + "epoch": 2.1, + "grad_norm": 10.668411254882812, + "learning_rate": 5.651594202898551e-06, + "loss": 1.6129, + "step": 25260 + }, + { + "epoch": 2.11, + "grad_norm": 7.001271724700928, + "learning_rate": 5.645797101449276e-06, + "loss": 1.6302, + "step": 25270 + }, + { + "epoch": 2.11, + "grad_norm": 9.053343772888184, + "learning_rate": 5.64e-06, + "loss": 1.7988, + "step": 25280 + }, + { + "epoch": 2.11, + "grad_norm": 2.4710533618927, + "learning_rate": 5.634202898550725e-06, + "loss": 1.6616, + "step": 25290 + }, + { + "epoch": 2.11, + "grad_norm": 3.005622148513794, + "learning_rate": 5.628405797101449e-06, + "loss": 1.5469, + "step": 25300 + }, + { + "epoch": 2.11, + "grad_norm": 5.569432258605957, + "learning_rate": 5.623188405797102e-06, + "loss": 1.5047, + "step": 25310 + }, + { + "epoch": 2.11, + "grad_norm": 2.52546763420105, + "learning_rate": 5.617391304347827e-06, + "loss": 1.6252, + "step": 25320 + }, + { + "epoch": 2.11, + "grad_norm": 2.454789400100708, + "learning_rate": 5.611594202898552e-06, + "loss": 1.6849, + "step": 25330 + }, + { + "epoch": 2.11, + "grad_norm": 6.595395565032959, + "learning_rate": 5.605797101449277e-06, + "loss": 1.5866, + "step": 25340 + }, + { + "epoch": 2.11, + "grad_norm": 1.6814996004104614, + "learning_rate": 5.600000000000001e-06, + "loss": 1.6539, + "step": 25350 + }, + { + "epoch": 2.11, + "grad_norm": 1.3363173007965088, + "learning_rate": 5.594202898550725e-06, + "loss": 1.7953, + "step": 25360 + }, + { + "epoch": 2.11, + "grad_norm": 4.2266364097595215, + "learning_rate": 5.5884057971014495e-06, + "loss": 1.459, + "step": 25370 + }, + { + "epoch": 2.12, + "grad_norm": 8.90019702911377, + "learning_rate": 5.582608695652174e-06, + "loss": 1.5548, + "step": 25380 + }, + { + "epoch": 2.12, + "grad_norm": 8.731317520141602, + "learning_rate": 5.576811594202899e-06, + "loss": 1.6977, + "step": 25390 + }, + { + "epoch": 2.12, + "grad_norm": 3.7743234634399414, + "learning_rate": 5.571014492753624e-06, + "loss": 1.632, + "step": 25400 + }, + { + "epoch": 2.12, + "grad_norm": 3.33520245552063, + "learning_rate": 5.565217391304348e-06, + "loss": 1.634, + "step": 25410 + }, + { + "epoch": 2.12, + "grad_norm": 8.20616340637207, + "learning_rate": 5.559420289855073e-06, + "loss": 1.6429, + "step": 25420 + }, + { + "epoch": 2.12, + "grad_norm": 4.349891662597656, + "learning_rate": 5.553623188405798e-06, + "loss": 1.5035, + "step": 25430 + }, + { + "epoch": 2.12, + "grad_norm": 2.2954518795013428, + "learning_rate": 5.5478260869565226e-06, + "loss": 1.6812, + "step": 25440 + }, + { + "epoch": 2.12, + "grad_norm": 7.884774684906006, + "learning_rate": 5.5420289855072474e-06, + "loss": 1.6622, + "step": 25450 + }, + { + "epoch": 2.12, + "grad_norm": 3.3407180309295654, + "learning_rate": 5.536231884057971e-06, + "loss": 1.7026, + "step": 25460 + }, + { + "epoch": 2.12, + "grad_norm": 4.300926685333252, + "learning_rate": 5.5304347826086954e-06, + "loss": 1.4354, + "step": 25470 + }, + { + "epoch": 2.12, + "grad_norm": 6.3809895515441895, + "learning_rate": 5.52463768115942e-06, + "loss": 1.5977, + "step": 25480 + }, + { + "epoch": 2.12, + "grad_norm": 1.672784447669983, + "learning_rate": 5.518840579710145e-06, + "loss": 1.69, + "step": 25490 + }, + { + "epoch": 2.12, + "grad_norm": 1.11064875125885, + "learning_rate": 5.51304347826087e-06, + "loss": 1.4795, + "step": 25500 + }, + { + "epoch": 2.12, + "eval_loss": 1.6643449068069458, + "eval_runtime": 107.487, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 25500 + }, + { + "epoch": 2.13, + "grad_norm": 1.4832613468170166, + "learning_rate": 5.507246376811595e-06, + "loss": 1.631, + "step": 25510 + }, + { + "epoch": 2.13, + "grad_norm": 4.866186141967773, + "learning_rate": 5.50144927536232e-06, + "loss": 1.6299, + "step": 25520 + }, + { + "epoch": 2.13, + "grad_norm": 5.978271961212158, + "learning_rate": 5.4956521739130446e-06, + "loss": 1.69, + "step": 25530 + }, + { + "epoch": 2.13, + "grad_norm": 2.5575902462005615, + "learning_rate": 5.4898550724637686e-06, + "loss": 1.7588, + "step": 25540 + }, + { + "epoch": 2.13, + "grad_norm": 3.088501214981079, + "learning_rate": 5.484057971014493e-06, + "loss": 1.5889, + "step": 25550 + }, + { + "epoch": 2.13, + "grad_norm": 4.234321594238281, + "learning_rate": 5.478260869565217e-06, + "loss": 1.615, + "step": 25560 + }, + { + "epoch": 2.13, + "grad_norm": 2.050870895385742, + "learning_rate": 5.472463768115942e-06, + "loss": 1.8112, + "step": 25570 + }, + { + "epoch": 2.13, + "grad_norm": 2.9914329051971436, + "learning_rate": 5.466666666666667e-06, + "loss": 1.4678, + "step": 25580 + }, + { + "epoch": 2.13, + "grad_norm": 4.666996479034424, + "learning_rate": 5.460869565217391e-06, + "loss": 1.6458, + "step": 25590 + }, + { + "epoch": 2.13, + "grad_norm": 3.144287109375, + "learning_rate": 5.455072463768116e-06, + "loss": 1.6335, + "step": 25600 + }, + { + "epoch": 2.13, + "grad_norm": 8.561944007873535, + "learning_rate": 5.449275362318841e-06, + "loss": 1.7007, + "step": 25610 + }, + { + "epoch": 2.13, + "grad_norm": 8.11923599243164, + "learning_rate": 5.443478260869566e-06, + "loss": 1.5806, + "step": 25620 + }, + { + "epoch": 2.14, + "grad_norm": 2.6465091705322266, + "learning_rate": 5.4376811594202905e-06, + "loss": 1.5382, + "step": 25630 + }, + { + "epoch": 2.14, + "grad_norm": 5.644728183746338, + "learning_rate": 5.431884057971015e-06, + "loss": 1.6434, + "step": 25640 + }, + { + "epoch": 2.14, + "grad_norm": 3.5903024673461914, + "learning_rate": 5.42608695652174e-06, + "loss": 1.7184, + "step": 25650 + }, + { + "epoch": 2.14, + "grad_norm": 6.686862945556641, + "learning_rate": 5.420289855072465e-06, + "loss": 1.6501, + "step": 25660 + }, + { + "epoch": 2.14, + "grad_norm": 4.454892158508301, + "learning_rate": 5.414492753623188e-06, + "loss": 1.4907, + "step": 25670 + }, + { + "epoch": 2.14, + "grad_norm": 6.291064262390137, + "learning_rate": 5.408695652173913e-06, + "loss": 1.6432, + "step": 25680 + }, + { + "epoch": 2.14, + "grad_norm": 0.9967716336250305, + "learning_rate": 5.402898550724638e-06, + "loss": 1.6613, + "step": 25690 + }, + { + "epoch": 2.14, + "grad_norm": 10.572330474853516, + "learning_rate": 5.397101449275363e-06, + "loss": 1.6701, + "step": 25700 + }, + { + "epoch": 2.14, + "grad_norm": 2.3549177646636963, + "learning_rate": 5.391304347826088e-06, + "loss": 1.5943, + "step": 25710 + }, + { + "epoch": 2.14, + "grad_norm": 6.504870891571045, + "learning_rate": 5.3855072463768125e-06, + "loss": 1.5729, + "step": 25720 + }, + { + "epoch": 2.14, + "grad_norm": 1.8512842655181885, + "learning_rate": 5.3797101449275365e-06, + "loss": 1.653, + "step": 25730 + }, + { + "epoch": 2.15, + "grad_norm": 1.5188934803009033, + "learning_rate": 5.373913043478261e-06, + "loss": 1.6456, + "step": 25740 + }, + { + "epoch": 2.15, + "grad_norm": 2.3979270458221436, + "learning_rate": 5.368115942028986e-06, + "loss": 1.7375, + "step": 25750 + }, + { + "epoch": 2.15, + "grad_norm": 6.565988540649414, + "learning_rate": 5.362318840579711e-06, + "loss": 1.6154, + "step": 25760 + }, + { + "epoch": 2.15, + "grad_norm": 5.995846271514893, + "learning_rate": 5.356521739130435e-06, + "loss": 1.6018, + "step": 25770 + }, + { + "epoch": 2.15, + "grad_norm": 5.048307418823242, + "learning_rate": 5.350724637681159e-06, + "loss": 1.5733, + "step": 25780 + }, + { + "epoch": 2.15, + "grad_norm": 3.6430954933166504, + "learning_rate": 5.344927536231884e-06, + "loss": 1.6623, + "step": 25790 + }, + { + "epoch": 2.15, + "grad_norm": 5.271862983703613, + "learning_rate": 5.339130434782609e-06, + "loss": 1.7376, + "step": 25800 + }, + { + "epoch": 2.15, + "grad_norm": 4.284571647644043, + "learning_rate": 5.333333333333334e-06, + "loss": 1.6058, + "step": 25810 + }, + { + "epoch": 2.15, + "grad_norm": 3.8240787982940674, + "learning_rate": 5.3275362318840585e-06, + "loss": 1.6588, + "step": 25820 + }, + { + "epoch": 2.15, + "grad_norm": 2.922074556350708, + "learning_rate": 5.321739130434783e-06, + "loss": 1.504, + "step": 25830 + }, + { + "epoch": 2.15, + "grad_norm": 3.5163557529449463, + "learning_rate": 5.315942028985508e-06, + "loss": 1.7208, + "step": 25840 + }, + { + "epoch": 2.15, + "grad_norm": 4.59926176071167, + "learning_rate": 5.310144927536233e-06, + "loss": 1.5188, + "step": 25850 + }, + { + "epoch": 2.15, + "grad_norm": 7.922865867614746, + "learning_rate": 5.304347826086957e-06, + "loss": 1.7082, + "step": 25860 + }, + { + "epoch": 2.16, + "grad_norm": 2.3443105220794678, + "learning_rate": 5.298550724637681e-06, + "loss": 1.5892, + "step": 25870 + }, + { + "epoch": 2.16, + "grad_norm": 1.4057115316390991, + "learning_rate": 5.292753623188406e-06, + "loss": 1.5478, + "step": 25880 + }, + { + "epoch": 2.16, + "grad_norm": 5.410768985748291, + "learning_rate": 5.286956521739131e-06, + "loss": 1.6926, + "step": 25890 + }, + { + "epoch": 2.16, + "grad_norm": 3.0512008666992188, + "learning_rate": 5.281159420289856e-06, + "loss": 1.6573, + "step": 25900 + }, + { + "epoch": 2.16, + "grad_norm": 4.070111274719238, + "learning_rate": 5.27536231884058e-06, + "loss": 1.6467, + "step": 25910 + }, + { + "epoch": 2.16, + "grad_norm": 13.7697114944458, + "learning_rate": 5.2695652173913045e-06, + "loss": 1.4524, + "step": 25920 + }, + { + "epoch": 2.16, + "grad_norm": 6.01429557800293, + "learning_rate": 5.263768115942029e-06, + "loss": 1.6422, + "step": 25930 + }, + { + "epoch": 2.16, + "grad_norm": 1.9737797975540161, + "learning_rate": 5.257971014492754e-06, + "loss": 1.8127, + "step": 25940 + }, + { + "epoch": 2.16, + "grad_norm": 4.113332271575928, + "learning_rate": 5.252173913043479e-06, + "loss": 1.6502, + "step": 25950 + }, + { + "epoch": 2.16, + "grad_norm": 4.877610683441162, + "learning_rate": 5.246376811594204e-06, + "loss": 1.7218, + "step": 25960 + }, + { + "epoch": 2.16, + "grad_norm": 4.620281219482422, + "learning_rate": 5.240579710144929e-06, + "loss": 1.7076, + "step": 25970 + }, + { + "epoch": 2.17, + "grad_norm": 4.5956878662109375, + "learning_rate": 5.234782608695652e-06, + "loss": 1.5189, + "step": 25980 + }, + { + "epoch": 2.17, + "grad_norm": 9.458510398864746, + "learning_rate": 5.228985507246377e-06, + "loss": 1.5011, + "step": 25990 + }, + { + "epoch": 2.17, + "grad_norm": 3.703575372695923, + "learning_rate": 5.223188405797102e-06, + "loss": 1.6217, + "step": 26000 + }, + { + "epoch": 2.17, + "eval_loss": 1.641511082649231, + "eval_runtime": 107.5147, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 26000 + }, + { + "epoch": 2.17, + "grad_norm": 4.613325595855713, + "learning_rate": 5.2173913043478265e-06, + "loss": 1.5841, + "step": 26010 + }, + { + "epoch": 2.17, + "grad_norm": 5.493524551391602, + "learning_rate": 5.211594202898551e-06, + "loss": 1.6712, + "step": 26020 + }, + { + "epoch": 2.17, + "grad_norm": 4.726990699768066, + "learning_rate": 5.205797101449276e-06, + "loss": 1.562, + "step": 26030 + }, + { + "epoch": 2.17, + "grad_norm": 4.116297721862793, + "learning_rate": 5.2e-06, + "loss": 1.6151, + "step": 26040 + }, + { + "epoch": 2.17, + "grad_norm": 5.906617641448975, + "learning_rate": 5.194202898550725e-06, + "loss": 1.8011, + "step": 26050 + }, + { + "epoch": 2.17, + "grad_norm": 5.110503196716309, + "learning_rate": 5.18840579710145e-06, + "loss": 1.5389, + "step": 26060 + }, + { + "epoch": 2.17, + "grad_norm": 6.075992584228516, + "learning_rate": 5.182608695652175e-06, + "loss": 1.6278, + "step": 26070 + }, + { + "epoch": 2.17, + "grad_norm": 4.633796215057373, + "learning_rate": 5.176811594202899e-06, + "loss": 1.6312, + "step": 26080 + }, + { + "epoch": 2.17, + "grad_norm": 8.259605407714844, + "learning_rate": 5.171014492753624e-06, + "loss": 1.6943, + "step": 26090 + }, + { + "epoch": 2.17, + "grad_norm": 5.173551082611084, + "learning_rate": 5.165217391304348e-06, + "loss": 1.6731, + "step": 26100 + }, + { + "epoch": 2.18, + "grad_norm": 1.1462539434432983, + "learning_rate": 5.1594202898550725e-06, + "loss": 1.6857, + "step": 26110 + }, + { + "epoch": 2.18, + "grad_norm": 1.7535749673843384, + "learning_rate": 5.153623188405797e-06, + "loss": 1.6063, + "step": 26120 + }, + { + "epoch": 2.18, + "grad_norm": 0.7708317041397095, + "learning_rate": 5.147826086956522e-06, + "loss": 1.5231, + "step": 26130 + }, + { + "epoch": 2.18, + "grad_norm": 5.939055919647217, + "learning_rate": 5.142028985507247e-06, + "loss": 1.5663, + "step": 26140 + }, + { + "epoch": 2.18, + "grad_norm": 1.7642502784729004, + "learning_rate": 5.136231884057972e-06, + "loss": 1.4762, + "step": 26150 + }, + { + "epoch": 2.18, + "grad_norm": 2.4374587535858154, + "learning_rate": 5.130434782608697e-06, + "loss": 1.5594, + "step": 26160 + }, + { + "epoch": 2.18, + "grad_norm": 6.011373996734619, + "learning_rate": 5.124637681159422e-06, + "loss": 1.6385, + "step": 26170 + }, + { + "epoch": 2.18, + "grad_norm": 6.366337299346924, + "learning_rate": 5.118840579710145e-06, + "loss": 1.6437, + "step": 26180 + }, + { + "epoch": 2.18, + "grad_norm": 4.652083873748779, + "learning_rate": 5.11304347826087e-06, + "loss": 1.6124, + "step": 26190 + }, + { + "epoch": 2.18, + "grad_norm": 7.6200690269470215, + "learning_rate": 5.1072463768115944e-06, + "loss": 1.6403, + "step": 26200 + }, + { + "epoch": 2.18, + "grad_norm": 0.9343698620796204, + "learning_rate": 5.101449275362319e-06, + "loss": 1.5959, + "step": 26210 + }, + { + "epoch": 2.19, + "grad_norm": 6.836380481719971, + "learning_rate": 5.095652173913044e-06, + "loss": 1.6459, + "step": 26220 + }, + { + "epoch": 2.19, + "grad_norm": 0.8471882343292236, + "learning_rate": 5.089855072463768e-06, + "loss": 1.7146, + "step": 26230 + }, + { + "epoch": 2.19, + "grad_norm": 1.5582016706466675, + "learning_rate": 5.084057971014493e-06, + "loss": 1.752, + "step": 26240 + }, + { + "epoch": 2.19, + "grad_norm": 4.58816385269165, + "learning_rate": 5.078260869565218e-06, + "loss": 1.7379, + "step": 26250 + }, + { + "epoch": 2.19, + "grad_norm": 12.132075309753418, + "learning_rate": 5.072463768115943e-06, + "loss": 1.7138, + "step": 26260 + }, + { + "epoch": 2.19, + "grad_norm": 7.812301158905029, + "learning_rate": 5.0666666666666676e-06, + "loss": 1.522, + "step": 26270 + }, + { + "epoch": 2.19, + "grad_norm": 8.638223648071289, + "learning_rate": 5.060869565217392e-06, + "loss": 1.4687, + "step": 26280 + }, + { + "epoch": 2.19, + "grad_norm": 1.7802847623825073, + "learning_rate": 5.055072463768116e-06, + "loss": 1.6814, + "step": 26290 + }, + { + "epoch": 2.19, + "grad_norm": 3.118692398071289, + "learning_rate": 5.0492753623188404e-06, + "loss": 1.5805, + "step": 26300 + }, + { + "epoch": 2.19, + "grad_norm": 1.812072515487671, + "learning_rate": 5.043478260869565e-06, + "loss": 1.5966, + "step": 26310 + }, + { + "epoch": 2.19, + "grad_norm": 3.6763594150543213, + "learning_rate": 5.03768115942029e-06, + "loss": 1.5277, + "step": 26320 + }, + { + "epoch": 2.19, + "grad_norm": 5.058526992797852, + "learning_rate": 5.031884057971015e-06, + "loss": 1.5608, + "step": 26330 + }, + { + "epoch": 2.19, + "grad_norm": 6.4017653465271, + "learning_rate": 5.02608695652174e-06, + "loss": 1.702, + "step": 26340 + }, + { + "epoch": 2.2, + "grad_norm": 4.419919490814209, + "learning_rate": 5.020289855072465e-06, + "loss": 1.7232, + "step": 26350 + }, + { + "epoch": 2.2, + "grad_norm": 3.014277458190918, + "learning_rate": 5.014492753623189e-06, + "loss": 1.7648, + "step": 26360 + }, + { + "epoch": 2.2, + "grad_norm": 2.1701581478118896, + "learning_rate": 5.0086956521739136e-06, + "loss": 1.3927, + "step": 26370 + }, + { + "epoch": 2.2, + "grad_norm": 4.6772942543029785, + "learning_rate": 5.002898550724638e-06, + "loss": 1.5294, + "step": 26380 + }, + { + "epoch": 2.2, + "grad_norm": 6.949331283569336, + "learning_rate": 4.997101449275362e-06, + "loss": 1.7033, + "step": 26390 + }, + { + "epoch": 2.2, + "grad_norm": 7.254538536071777, + "learning_rate": 4.991304347826087e-06, + "loss": 1.6756, + "step": 26400 + }, + { + "epoch": 2.2, + "grad_norm": 2.6317763328552246, + "learning_rate": 4.985507246376812e-06, + "loss": 1.5334, + "step": 26410 + }, + { + "epoch": 2.2, + "grad_norm": 6.098857402801514, + "learning_rate": 4.979710144927536e-06, + "loss": 1.5211, + "step": 26420 + }, + { + "epoch": 2.2, + "grad_norm": 10.028970718383789, + "learning_rate": 4.973913043478261e-06, + "loss": 1.6545, + "step": 26430 + }, + { + "epoch": 2.2, + "grad_norm": 1.9619600772857666, + "learning_rate": 4.968115942028986e-06, + "loss": 1.5913, + "step": 26440 + }, + { + "epoch": 2.2, + "grad_norm": 5.192411422729492, + "learning_rate": 4.962318840579711e-06, + "loss": 1.5367, + "step": 26450 + }, + { + "epoch": 2.21, + "grad_norm": 3.78745698928833, + "learning_rate": 4.9565217391304355e-06, + "loss": 1.6372, + "step": 26460 + }, + { + "epoch": 2.21, + "grad_norm": 3.531749963760376, + "learning_rate": 4.9507246376811595e-06, + "loss": 1.6329, + "step": 26470 + }, + { + "epoch": 2.21, + "grad_norm": 4.566132068634033, + "learning_rate": 4.944927536231884e-06, + "loss": 1.6664, + "step": 26480 + }, + { + "epoch": 2.21, + "grad_norm": 11.072998046875, + "learning_rate": 4.939130434782609e-06, + "loss": 1.6001, + "step": 26490 + }, + { + "epoch": 2.21, + "grad_norm": 14.129816055297852, + "learning_rate": 4.933333333333334e-06, + "loss": 1.6051, + "step": 26500 + }, + { + "epoch": 2.21, + "eval_loss": 1.6546690464019775, + "eval_runtime": 107.5057, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 26500 + }, + { + "epoch": 2.21, + "grad_norm": 7.938653945922852, + "learning_rate": 4.927536231884059e-06, + "loss": 1.549, + "step": 26510 + }, + { + "epoch": 2.21, + "grad_norm": 3.820674419403076, + "learning_rate": 4.921739130434783e-06, + "loss": 1.5996, + "step": 26520 + }, + { + "epoch": 2.21, + "grad_norm": 3.9461727142333984, + "learning_rate": 4.915942028985508e-06, + "loss": 1.5242, + "step": 26530 + }, + { + "epoch": 2.21, + "grad_norm": 2.117321729660034, + "learning_rate": 4.910144927536233e-06, + "loss": 1.6593, + "step": 26540 + }, + { + "epoch": 2.21, + "grad_norm": 6.951183795928955, + "learning_rate": 4.904347826086957e-06, + "loss": 1.6066, + "step": 26550 + }, + { + "epoch": 2.21, + "grad_norm": 7.954775333404541, + "learning_rate": 4.8985507246376815e-06, + "loss": 1.6093, + "step": 26560 + }, + { + "epoch": 2.21, + "grad_norm": 2.857492208480835, + "learning_rate": 4.892753623188406e-06, + "loss": 1.6337, + "step": 26570 + }, + { + "epoch": 2.21, + "grad_norm": 6.1911091804504395, + "learning_rate": 4.88695652173913e-06, + "loss": 1.6241, + "step": 26580 + }, + { + "epoch": 2.22, + "grad_norm": 5.39035701751709, + "learning_rate": 4.881159420289855e-06, + "loss": 1.4798, + "step": 26590 + }, + { + "epoch": 2.22, + "grad_norm": 4.794005870819092, + "learning_rate": 4.87536231884058e-06, + "loss": 1.6761, + "step": 26600 + }, + { + "epoch": 2.22, + "grad_norm": 3.688593626022339, + "learning_rate": 4.869565217391305e-06, + "loss": 1.7455, + "step": 26610 + }, + { + "epoch": 2.22, + "grad_norm": 2.9753758907318115, + "learning_rate": 4.863768115942029e-06, + "loss": 1.6043, + "step": 26620 + }, + { + "epoch": 2.22, + "grad_norm": 4.76633882522583, + "learning_rate": 4.857971014492754e-06, + "loss": 1.5899, + "step": 26630 + }, + { + "epoch": 2.22, + "grad_norm": 9.436808586120605, + "learning_rate": 4.852173913043479e-06, + "loss": 1.5456, + "step": 26640 + }, + { + "epoch": 2.22, + "grad_norm": 3.1801443099975586, + "learning_rate": 4.8463768115942035e-06, + "loss": 1.4973, + "step": 26650 + }, + { + "epoch": 2.22, + "grad_norm": 3.6295244693756104, + "learning_rate": 4.840579710144928e-06, + "loss": 1.6056, + "step": 26660 + }, + { + "epoch": 2.22, + "grad_norm": 2.5441739559173584, + "learning_rate": 4.834782608695652e-06, + "loss": 1.5993, + "step": 26670 + }, + { + "epoch": 2.22, + "grad_norm": 5.536238193511963, + "learning_rate": 4.828985507246377e-06, + "loss": 1.7003, + "step": 26680 + }, + { + "epoch": 2.22, + "grad_norm": 8.867695808410645, + "learning_rate": 4.823188405797102e-06, + "loss": 1.5816, + "step": 26690 + }, + { + "epoch": 2.23, + "grad_norm": 2.4776854515075684, + "learning_rate": 4.817391304347827e-06, + "loss": 1.6089, + "step": 26700 + }, + { + "epoch": 2.23, + "grad_norm": 3.4541239738464355, + "learning_rate": 4.811594202898551e-06, + "loss": 1.5486, + "step": 26710 + }, + { + "epoch": 2.23, + "grad_norm": 5.7615251541137695, + "learning_rate": 4.805797101449276e-06, + "loss": 1.4767, + "step": 26720 + }, + { + "epoch": 2.23, + "grad_norm": 5.553129196166992, + "learning_rate": 4.800000000000001e-06, + "loss": 1.6036, + "step": 26730 + }, + { + "epoch": 2.23, + "grad_norm": 1.3022050857543945, + "learning_rate": 4.794202898550725e-06, + "loss": 1.5677, + "step": 26740 + }, + { + "epoch": 2.23, + "grad_norm": 2.267906427383423, + "learning_rate": 4.7884057971014495e-06, + "loss": 1.5816, + "step": 26750 + }, + { + "epoch": 2.23, + "grad_norm": 3.8131909370422363, + "learning_rate": 4.782608695652174e-06, + "loss": 1.7683, + "step": 26760 + }, + { + "epoch": 2.23, + "grad_norm": 4.468419551849365, + "learning_rate": 4.776811594202899e-06, + "loss": 1.4521, + "step": 26770 + }, + { + "epoch": 2.23, + "grad_norm": 5.726469039916992, + "learning_rate": 4.771014492753623e-06, + "loss": 1.6083, + "step": 26780 + }, + { + "epoch": 2.23, + "grad_norm": 2.375027894973755, + "learning_rate": 4.765217391304348e-06, + "loss": 1.6481, + "step": 26790 + }, + { + "epoch": 2.23, + "grad_norm": 8.728473663330078, + "learning_rate": 4.759420289855073e-06, + "loss": 1.6519, + "step": 26800 + }, + { + "epoch": 2.23, + "grad_norm": 13.2547607421875, + "learning_rate": 4.753623188405798e-06, + "loss": 1.646, + "step": 26810 + }, + { + "epoch": 2.23, + "grad_norm": 5.739991664886475, + "learning_rate": 4.747826086956523e-06, + "loss": 1.6067, + "step": 26820 + }, + { + "epoch": 2.24, + "grad_norm": 5.9073076248168945, + "learning_rate": 4.742028985507247e-06, + "loss": 1.4971, + "step": 26830 + }, + { + "epoch": 2.24, + "grad_norm": 8.832313537597656, + "learning_rate": 4.7362318840579715e-06, + "loss": 1.6724, + "step": 26840 + }, + { + "epoch": 2.24, + "grad_norm": 1.5870254039764404, + "learning_rate": 4.730434782608696e-06, + "loss": 1.7331, + "step": 26850 + }, + { + "epoch": 2.24, + "grad_norm": 7.8640055656433105, + "learning_rate": 4.724637681159421e-06, + "loss": 1.5264, + "step": 26860 + }, + { + "epoch": 2.24, + "grad_norm": 6.005427360534668, + "learning_rate": 4.718840579710145e-06, + "loss": 1.5717, + "step": 26870 + }, + { + "epoch": 2.24, + "grad_norm": 3.7698235511779785, + "learning_rate": 4.71304347826087e-06, + "loss": 1.7427, + "step": 26880 + }, + { + "epoch": 2.24, + "grad_norm": 6.162291049957275, + "learning_rate": 4.707246376811595e-06, + "loss": 1.7923, + "step": 26890 + }, + { + "epoch": 2.24, + "grad_norm": 1.3494528532028198, + "learning_rate": 4.701449275362319e-06, + "loss": 1.7313, + "step": 26900 + }, + { + "epoch": 2.24, + "grad_norm": 3.9067726135253906, + "learning_rate": 4.695652173913044e-06, + "loss": 1.7465, + "step": 26910 + }, + { + "epoch": 2.24, + "grad_norm": 2.578123092651367, + "learning_rate": 4.689855072463769e-06, + "loss": 1.5809, + "step": 26920 + }, + { + "epoch": 2.24, + "grad_norm": 3.2738120555877686, + "learning_rate": 4.684057971014493e-06, + "loss": 1.662, + "step": 26930 + }, + { + "epoch": 2.25, + "grad_norm": 2.7239420413970947, + "learning_rate": 4.6782608695652175e-06, + "loss": 1.5232, + "step": 26940 + }, + { + "epoch": 2.25, + "grad_norm": 2.876070022583008, + "learning_rate": 4.672463768115942e-06, + "loss": 1.7607, + "step": 26950 + }, + { + "epoch": 2.25, + "grad_norm": 5.976984977722168, + "learning_rate": 4.666666666666667e-06, + "loss": 1.6693, + "step": 26960 + }, + { + "epoch": 2.25, + "grad_norm": 7.821426868438721, + "learning_rate": 4.660869565217392e-06, + "loss": 1.6281, + "step": 26970 + }, + { + "epoch": 2.25, + "grad_norm": 4.057045936584473, + "learning_rate": 4.655072463768116e-06, + "loss": 1.6141, + "step": 26980 + }, + { + "epoch": 2.25, + "grad_norm": 6.272705554962158, + "learning_rate": 4.649275362318841e-06, + "loss": 1.6509, + "step": 26990 + }, + { + "epoch": 2.25, + "grad_norm": 2.8042986392974854, + "learning_rate": 4.643478260869566e-06, + "loss": 1.6283, + "step": 27000 + }, + { + "epoch": 2.25, + "eval_loss": 1.599829912185669, + "eval_runtime": 107.518, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 27000 + }, + { + "epoch": 2.25, + "grad_norm": 2.672848701477051, + "learning_rate": 4.637681159420291e-06, + "loss": 1.587, + "step": 27010 + }, + { + "epoch": 2.25, + "grad_norm": 3.5648651123046875, + "learning_rate": 4.6318840579710154e-06, + "loss": 1.6535, + "step": 27020 + }, + { + "epoch": 2.25, + "grad_norm": 5.619153022766113, + "learning_rate": 4.6260869565217394e-06, + "loss": 1.7095, + "step": 27030 + }, + { + "epoch": 2.25, + "grad_norm": 3.661830425262451, + "learning_rate": 4.620289855072464e-06, + "loss": 1.5421, + "step": 27040 + }, + { + "epoch": 2.25, + "grad_norm": 2.601088285446167, + "learning_rate": 4.614492753623188e-06, + "loss": 1.6169, + "step": 27050 + }, + { + "epoch": 2.25, + "grad_norm": 1.6670879125595093, + "learning_rate": 4.608695652173913e-06, + "loss": 1.7808, + "step": 27060 + }, + { + "epoch": 2.26, + "grad_norm": 5.285208702087402, + "learning_rate": 4.602898550724638e-06, + "loss": 1.6018, + "step": 27070 + }, + { + "epoch": 2.26, + "grad_norm": 13.955137252807617, + "learning_rate": 4.597101449275363e-06, + "loss": 1.6071, + "step": 27080 + }, + { + "epoch": 2.26, + "grad_norm": 3.246000289916992, + "learning_rate": 4.591304347826087e-06, + "loss": 1.74, + "step": 27090 + }, + { + "epoch": 2.26, + "grad_norm": 4.297258377075195, + "learning_rate": 4.585507246376812e-06, + "loss": 1.6233, + "step": 27100 + }, + { + "epoch": 2.26, + "grad_norm": 6.08992338180542, + "learning_rate": 4.5797101449275366e-06, + "loss": 1.6511, + "step": 27110 + }, + { + "epoch": 2.26, + "grad_norm": 4.511499404907227, + "learning_rate": 4.573913043478261e-06, + "loss": 1.5344, + "step": 27120 + }, + { + "epoch": 2.26, + "grad_norm": 2.989239454269409, + "learning_rate": 4.568115942028986e-06, + "loss": 1.6172, + "step": 27130 + }, + { + "epoch": 2.26, + "grad_norm": 2.358445882797241, + "learning_rate": 4.56231884057971e-06, + "loss": 1.6004, + "step": 27140 + }, + { + "epoch": 2.26, + "grad_norm": 2.533210039138794, + "learning_rate": 4.556521739130435e-06, + "loss": 1.6651, + "step": 27150 + }, + { + "epoch": 2.26, + "grad_norm": 1.6375526189804077, + "learning_rate": 4.55072463768116e-06, + "loss": 1.603, + "step": 27160 + }, + { + "epoch": 2.26, + "grad_norm": 16.163049697875977, + "learning_rate": 4.544927536231885e-06, + "loss": 1.7045, + "step": 27170 + }, + { + "epoch": 2.27, + "grad_norm": 3.7427773475646973, + "learning_rate": 4.53913043478261e-06, + "loss": 1.7164, + "step": 27180 + }, + { + "epoch": 2.27, + "grad_norm": 3.3884432315826416, + "learning_rate": 4.533333333333334e-06, + "loss": 1.5997, + "step": 27190 + }, + { + "epoch": 2.27, + "grad_norm": 11.96393871307373, + "learning_rate": 4.5275362318840585e-06, + "loss": 1.6458, + "step": 27200 + }, + { + "epoch": 2.27, + "grad_norm": 3.4181969165802, + "learning_rate": 4.5217391304347826e-06, + "loss": 1.5166, + "step": 27210 + }, + { + "epoch": 2.27, + "grad_norm": 6.070342540740967, + "learning_rate": 4.515942028985507e-06, + "loss": 1.5037, + "step": 27220 + }, + { + "epoch": 2.27, + "grad_norm": 4.330051898956299, + "learning_rate": 4.510144927536232e-06, + "loss": 1.6322, + "step": 27230 + }, + { + "epoch": 2.27, + "grad_norm": 4.178266525268555, + "learning_rate": 4.504347826086956e-06, + "loss": 1.5712, + "step": 27240 + }, + { + "epoch": 2.27, + "grad_norm": 10.042227745056152, + "learning_rate": 4.498550724637681e-06, + "loss": 1.5195, + "step": 27250 + }, + { + "epoch": 2.27, + "grad_norm": 4.68449592590332, + "learning_rate": 4.492753623188406e-06, + "loss": 1.5767, + "step": 27260 + }, + { + "epoch": 2.27, + "grad_norm": 6.098611354827881, + "learning_rate": 4.486956521739131e-06, + "loss": 1.5826, + "step": 27270 + }, + { + "epoch": 2.27, + "grad_norm": 3.7514631748199463, + "learning_rate": 4.481159420289856e-06, + "loss": 1.6519, + "step": 27280 + }, + { + "epoch": 2.27, + "grad_norm": 6.086874961853027, + "learning_rate": 4.47536231884058e-06, + "loss": 1.6356, + "step": 27290 + }, + { + "epoch": 2.27, + "grad_norm": 2.969055414199829, + "learning_rate": 4.4695652173913045e-06, + "loss": 1.5372, + "step": 27300 + }, + { + "epoch": 2.28, + "grad_norm": 3.0818963050842285, + "learning_rate": 4.463768115942029e-06, + "loss": 1.6195, + "step": 27310 + }, + { + "epoch": 2.28, + "grad_norm": 2.355165481567383, + "learning_rate": 4.457971014492754e-06, + "loss": 1.5498, + "step": 27320 + }, + { + "epoch": 2.28, + "grad_norm": 3.69412899017334, + "learning_rate": 4.452173913043479e-06, + "loss": 1.6927, + "step": 27330 + }, + { + "epoch": 2.28, + "grad_norm": 4.266238212585449, + "learning_rate": 4.446376811594204e-06, + "loss": 1.5444, + "step": 27340 + }, + { + "epoch": 2.28, + "grad_norm": 10.938898086547852, + "learning_rate": 4.440579710144928e-06, + "loss": 1.5767, + "step": 27350 + }, + { + "epoch": 2.28, + "grad_norm": 3.8983266353607178, + "learning_rate": 4.434782608695653e-06, + "loss": 1.6385, + "step": 27360 + }, + { + "epoch": 2.28, + "grad_norm": 6.039658069610596, + "learning_rate": 4.428985507246377e-06, + "loss": 1.6902, + "step": 27370 + }, + { + "epoch": 2.28, + "grad_norm": 3.6886088848114014, + "learning_rate": 4.423188405797102e-06, + "loss": 1.5464, + "step": 27380 + }, + { + "epoch": 2.28, + "grad_norm": 3.805041551589966, + "learning_rate": 4.4173913043478265e-06, + "loss": 1.5516, + "step": 27390 + }, + { + "epoch": 2.28, + "grad_norm": 6.767727851867676, + "learning_rate": 4.4115942028985505e-06, + "loss": 1.6857, + "step": 27400 + }, + { + "epoch": 2.28, + "grad_norm": 2.386350631713867, + "learning_rate": 4.405797101449275e-06, + "loss": 1.7113, + "step": 27410 + }, + { + "epoch": 2.29, + "grad_norm": 1.8526190519332886, + "learning_rate": 4.4e-06, + "loss": 1.6934, + "step": 27420 + }, + { + "epoch": 2.29, + "grad_norm": 6.962111949920654, + "learning_rate": 4.394202898550725e-06, + "loss": 1.5782, + "step": 27430 + }, + { + "epoch": 2.29, + "grad_norm": 3.9486684799194336, + "learning_rate": 4.38840579710145e-06, + "loss": 1.5233, + "step": 27440 + }, + { + "epoch": 2.29, + "grad_norm": 4.742575645446777, + "learning_rate": 4.382608695652174e-06, + "loss": 1.5672, + "step": 27450 + }, + { + "epoch": 2.29, + "grad_norm": 7.8938422203063965, + "learning_rate": 4.376811594202899e-06, + "loss": 1.5662, + "step": 27460 + }, + { + "epoch": 2.29, + "grad_norm": 4.794543743133545, + "learning_rate": 4.371014492753624e-06, + "loss": 1.6878, + "step": 27470 + }, + { + "epoch": 2.29, + "grad_norm": 2.3197507858276367, + "learning_rate": 4.3652173913043485e-06, + "loss": 1.6168, + "step": 27480 + }, + { + "epoch": 2.29, + "grad_norm": 3.5373730659484863, + "learning_rate": 4.359420289855073e-06, + "loss": 1.6601, + "step": 27490 + }, + { + "epoch": 2.29, + "grad_norm": 1.5637263059616089, + "learning_rate": 4.353623188405797e-06, + "loss": 1.6436, + "step": 27500 + }, + { + "epoch": 2.29, + "eval_loss": 1.637457251548767, + "eval_runtime": 107.504, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 27500 + }, + { + "epoch": 2.29, + "grad_norm": 4.249775409698486, + "learning_rate": 4.347826086956522e-06, + "loss": 1.5759, + "step": 27510 + }, + { + "epoch": 2.29, + "grad_norm": 9.582161903381348, + "learning_rate": 4.342028985507247e-06, + "loss": 1.5566, + "step": 27520 + }, + { + "epoch": 2.29, + "grad_norm": 3.517526626586914, + "learning_rate": 4.336231884057971e-06, + "loss": 1.5944, + "step": 27530 + }, + { + "epoch": 2.29, + "grad_norm": 8.537724494934082, + "learning_rate": 4.330434782608696e-06, + "loss": 1.5976, + "step": 27540 + }, + { + "epoch": 2.3, + "grad_norm": 3.978813886642456, + "learning_rate": 4.324637681159421e-06, + "loss": 1.6186, + "step": 27550 + }, + { + "epoch": 2.3, + "grad_norm": 2.7505924701690674, + "learning_rate": 4.318840579710145e-06, + "loss": 1.5513, + "step": 27560 + }, + { + "epoch": 2.3, + "grad_norm": 2.5586419105529785, + "learning_rate": 4.31304347826087e-06, + "loss": 1.7299, + "step": 27570 + }, + { + "epoch": 2.3, + "grad_norm": 4.848520278930664, + "learning_rate": 4.3072463768115945e-06, + "loss": 1.6996, + "step": 27580 + }, + { + "epoch": 2.3, + "grad_norm": 10.454072952270508, + "learning_rate": 4.301449275362319e-06, + "loss": 1.748, + "step": 27590 + }, + { + "epoch": 2.3, + "grad_norm": 5.605503559112549, + "learning_rate": 4.295652173913043e-06, + "loss": 1.6378, + "step": 27600 + }, + { + "epoch": 2.3, + "grad_norm": 7.994022369384766, + "learning_rate": 4.289855072463768e-06, + "loss": 1.5866, + "step": 27610 + }, + { + "epoch": 2.3, + "grad_norm": 3.65685772895813, + "learning_rate": 4.284057971014493e-06, + "loss": 1.5671, + "step": 27620 + }, + { + "epoch": 2.3, + "grad_norm": 8.096505165100098, + "learning_rate": 4.278260869565218e-06, + "loss": 1.5329, + "step": 27630 + }, + { + "epoch": 2.3, + "grad_norm": 4.837905406951904, + "learning_rate": 4.272463768115943e-06, + "loss": 1.7046, + "step": 27640 + }, + { + "epoch": 2.3, + "grad_norm": 4.230941295623779, + "learning_rate": 4.266666666666668e-06, + "loss": 1.6219, + "step": 27650 + }, + { + "epoch": 2.31, + "grad_norm": 6.498970031738281, + "learning_rate": 4.260869565217392e-06, + "loss": 1.5791, + "step": 27660 + }, + { + "epoch": 2.31, + "grad_norm": 2.494067430496216, + "learning_rate": 4.2550724637681165e-06, + "loss": 1.5099, + "step": 27670 + }, + { + "epoch": 2.31, + "grad_norm": 3.1975343227386475, + "learning_rate": 4.249275362318841e-06, + "loss": 1.5382, + "step": 27680 + }, + { + "epoch": 2.31, + "grad_norm": 4.870430946350098, + "learning_rate": 4.243478260869565e-06, + "loss": 1.4734, + "step": 27690 + }, + { + "epoch": 2.31, + "grad_norm": 3.4920201301574707, + "learning_rate": 4.23768115942029e-06, + "loss": 1.5764, + "step": 27700 + }, + { + "epoch": 2.31, + "grad_norm": 2.826338052749634, + "learning_rate": 4.231884057971015e-06, + "loss": 1.766, + "step": 27710 + }, + { + "epoch": 2.31, + "grad_norm": 9.284537315368652, + "learning_rate": 4.226086956521739e-06, + "loss": 1.5152, + "step": 27720 + }, + { + "epoch": 2.31, + "grad_norm": 4.703794479370117, + "learning_rate": 4.220289855072464e-06, + "loss": 1.5506, + "step": 27730 + }, + { + "epoch": 2.31, + "grad_norm": 4.4055094718933105, + "learning_rate": 4.214492753623189e-06, + "loss": 1.6593, + "step": 27740 + }, + { + "epoch": 2.31, + "grad_norm": 2.2604644298553467, + "learning_rate": 4.208695652173914e-06, + "loss": 1.5203, + "step": 27750 + }, + { + "epoch": 2.31, + "grad_norm": 2.278348445892334, + "learning_rate": 4.202898550724638e-06, + "loss": 1.6086, + "step": 27760 + }, + { + "epoch": 2.31, + "grad_norm": 8.626958847045898, + "learning_rate": 4.1971014492753624e-06, + "loss": 1.569, + "step": 27770 + }, + { + "epoch": 2.31, + "grad_norm": 6.3467583656311035, + "learning_rate": 4.191304347826087e-06, + "loss": 1.5253, + "step": 27780 + }, + { + "epoch": 2.32, + "grad_norm": 2.055905818939209, + "learning_rate": 4.185507246376812e-06, + "loss": 1.6537, + "step": 27790 + }, + { + "epoch": 2.32, + "grad_norm": 2.519817352294922, + "learning_rate": 4.179710144927537e-06, + "loss": 1.5329, + "step": 27800 + }, + { + "epoch": 2.32, + "grad_norm": 3.1581406593322754, + "learning_rate": 4.173913043478261e-06, + "loss": 1.616, + "step": 27810 + }, + { + "epoch": 2.32, + "grad_norm": 1.114212155342102, + "learning_rate": 4.168115942028986e-06, + "loss": 1.4248, + "step": 27820 + }, + { + "epoch": 2.32, + "grad_norm": 4.742372989654541, + "learning_rate": 4.162318840579711e-06, + "loss": 1.7195, + "step": 27830 + }, + { + "epoch": 2.32, + "grad_norm": 2.707331895828247, + "learning_rate": 4.1565217391304356e-06, + "loss": 1.5344, + "step": 27840 + }, + { + "epoch": 2.32, + "grad_norm": 4.827779293060303, + "learning_rate": 4.1507246376811596e-06, + "loss": 1.6523, + "step": 27850 + }, + { + "epoch": 2.32, + "grad_norm": 3.977830410003662, + "learning_rate": 4.1449275362318844e-06, + "loss": 1.6839, + "step": 27860 + }, + { + "epoch": 2.32, + "grad_norm": 8.55379867553711, + "learning_rate": 4.139130434782609e-06, + "loss": 1.6007, + "step": 27870 + }, + { + "epoch": 2.32, + "grad_norm": 4.7034173011779785, + "learning_rate": 4.133333333333333e-06, + "loss": 1.5704, + "step": 27880 + }, + { + "epoch": 2.32, + "grad_norm": 2.772855520248413, + "learning_rate": 4.127536231884058e-06, + "loss": 1.7209, + "step": 27890 + }, + { + "epoch": 2.33, + "grad_norm": 2.952080011367798, + "learning_rate": 4.121739130434783e-06, + "loss": 1.5282, + "step": 27900 + }, + { + "epoch": 2.33, + "grad_norm": 2.7901113033294678, + "learning_rate": 4.115942028985507e-06, + "loss": 1.6545, + "step": 27910 + }, + { + "epoch": 2.33, + "grad_norm": 8.063865661621094, + "learning_rate": 4.110144927536232e-06, + "loss": 1.4981, + "step": 27920 + }, + { + "epoch": 2.33, + "grad_norm": 9.868477821350098, + "learning_rate": 4.104347826086957e-06, + "loss": 1.6642, + "step": 27930 + }, + { + "epoch": 2.33, + "grad_norm": 5.399648666381836, + "learning_rate": 4.0985507246376816e-06, + "loss": 1.6422, + "step": 27940 + }, + { + "epoch": 2.33, + "grad_norm": 3.0602734088897705, + "learning_rate": 4.092753623188406e-06, + "loss": 1.7643, + "step": 27950 + }, + { + "epoch": 2.33, + "grad_norm": 5.221512794494629, + "learning_rate": 4.086956521739131e-06, + "loss": 1.5439, + "step": 27960 + }, + { + "epoch": 2.33, + "grad_norm": 3.30639386177063, + "learning_rate": 4.081159420289855e-06, + "loss": 1.5125, + "step": 27970 + }, + { + "epoch": 2.33, + "grad_norm": 4.630527973175049, + "learning_rate": 4.07536231884058e-06, + "loss": 1.5665, + "step": 27980 + }, + { + "epoch": 2.33, + "grad_norm": 9.667155265808105, + "learning_rate": 4.069565217391305e-06, + "loss": 1.6917, + "step": 27990 + }, + { + "epoch": 2.33, + "grad_norm": 2.4528236389160156, + "learning_rate": 4.06376811594203e-06, + "loss": 1.6995, + "step": 28000 + }, + { + "epoch": 2.33, + "eval_loss": 1.644856333732605, + "eval_runtime": 107.5008, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 28000 + }, + { + "epoch": 2.33, + "grad_norm": 3.805037260055542, + "learning_rate": 4.057971014492754e-06, + "loss": 1.5207, + "step": 28010 + }, + { + "epoch": 2.33, + "grad_norm": 3.819047451019287, + "learning_rate": 4.052173913043479e-06, + "loss": 1.6518, + "step": 28020 + }, + { + "epoch": 2.34, + "grad_norm": 8.75721263885498, + "learning_rate": 4.0463768115942035e-06, + "loss": 1.6758, + "step": 28030 + }, + { + "epoch": 2.34, + "grad_norm": 5.432452201843262, + "learning_rate": 4.0405797101449275e-06, + "loss": 1.7066, + "step": 28040 + }, + { + "epoch": 2.34, + "grad_norm": 3.6831490993499756, + "learning_rate": 4.034782608695652e-06, + "loss": 1.6379, + "step": 28050 + }, + { + "epoch": 2.34, + "grad_norm": 6.150071620941162, + "learning_rate": 4.028985507246377e-06, + "loss": 1.5458, + "step": 28060 + }, + { + "epoch": 2.34, + "grad_norm": 6.020603179931641, + "learning_rate": 4.023188405797101e-06, + "loss": 1.6909, + "step": 28070 + }, + { + "epoch": 2.34, + "grad_norm": 11.10090446472168, + "learning_rate": 4.017391304347826e-06, + "loss": 1.7057, + "step": 28080 + }, + { + "epoch": 2.34, + "grad_norm": 2.3139498233795166, + "learning_rate": 4.011594202898551e-06, + "loss": 1.6334, + "step": 28090 + }, + { + "epoch": 2.34, + "grad_norm": 11.909808158874512, + "learning_rate": 4.005797101449276e-06, + "loss": 1.6384, + "step": 28100 + }, + { + "epoch": 2.34, + "grad_norm": 2.950678586959839, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6356, + "step": 28110 + }, + { + "epoch": 2.34, + "grad_norm": 2.596022844314575, + "learning_rate": 3.994202898550725e-06, + "loss": 1.4085, + "step": 28120 + }, + { + "epoch": 2.34, + "grad_norm": 9.143681526184082, + "learning_rate": 3.9884057971014495e-06, + "loss": 1.5866, + "step": 28130 + }, + { + "epoch": 2.34, + "grad_norm": 2.4474549293518066, + "learning_rate": 3.982608695652174e-06, + "loss": 1.5971, + "step": 28140 + }, + { + "epoch": 2.35, + "grad_norm": 3.5778965950012207, + "learning_rate": 3.976811594202899e-06, + "loss": 1.5654, + "step": 28150 + }, + { + "epoch": 2.35, + "grad_norm": 1.3869068622589111, + "learning_rate": 3.971014492753624e-06, + "loss": 1.6479, + "step": 28160 + }, + { + "epoch": 2.35, + "grad_norm": 6.334060192108154, + "learning_rate": 3.965217391304348e-06, + "loss": 1.4869, + "step": 28170 + }, + { + "epoch": 2.35, + "grad_norm": 3.896120071411133, + "learning_rate": 3.959420289855073e-06, + "loss": 1.5473, + "step": 28180 + }, + { + "epoch": 2.35, + "grad_norm": 4.137752532958984, + "learning_rate": 3.953623188405798e-06, + "loss": 1.5977, + "step": 28190 + }, + { + "epoch": 2.35, + "grad_norm": 3.7057723999023438, + "learning_rate": 3.947826086956522e-06, + "loss": 1.6333, + "step": 28200 + }, + { + "epoch": 2.35, + "grad_norm": 7.279668807983398, + "learning_rate": 3.942028985507247e-06, + "loss": 1.6963, + "step": 28210 + }, + { + "epoch": 2.35, + "grad_norm": 4.471526145935059, + "learning_rate": 3.936231884057971e-06, + "loss": 1.6287, + "step": 28220 + }, + { + "epoch": 2.35, + "grad_norm": 3.1095166206359863, + "learning_rate": 3.9304347826086955e-06, + "loss": 1.5774, + "step": 28230 + }, + { + "epoch": 2.35, + "grad_norm": 5.295337677001953, + "learning_rate": 3.92463768115942e-06, + "loss": 1.6255, + "step": 28240 + }, + { + "epoch": 2.35, + "grad_norm": 2.934601306915283, + "learning_rate": 3.918840579710145e-06, + "loss": 1.6194, + "step": 28250 + }, + { + "epoch": 2.35, + "grad_norm": 3.069929361343384, + "learning_rate": 3.91304347826087e-06, + "loss": 1.7143, + "step": 28260 + }, + { + "epoch": 2.36, + "grad_norm": 4.211582660675049, + "learning_rate": 3.907246376811595e-06, + "loss": 1.5541, + "step": 28270 + }, + { + "epoch": 2.36, + "grad_norm": 8.216347694396973, + "learning_rate": 3.901449275362319e-06, + "loss": 1.5638, + "step": 28280 + }, + { + "epoch": 2.36, + "grad_norm": 8.743631362915039, + "learning_rate": 3.895652173913044e-06, + "loss": 1.5687, + "step": 28290 + }, + { + "epoch": 2.36, + "grad_norm": 4.041175842285156, + "learning_rate": 3.889855072463769e-06, + "loss": 1.5939, + "step": 28300 + }, + { + "epoch": 2.36, + "grad_norm": 8.4274263381958, + "learning_rate": 3.8840579710144935e-06, + "loss": 1.7864, + "step": 28310 + }, + { + "epoch": 2.36, + "grad_norm": 6.043384552001953, + "learning_rate": 3.878260869565218e-06, + "loss": 1.5686, + "step": 28320 + }, + { + "epoch": 2.36, + "grad_norm": 2.899559736251831, + "learning_rate": 3.872463768115942e-06, + "loss": 1.7651, + "step": 28330 + }, + { + "epoch": 2.36, + "grad_norm": 6.566895008087158, + "learning_rate": 3.866666666666667e-06, + "loss": 1.6548, + "step": 28340 + }, + { + "epoch": 2.36, + "grad_norm": 4.242386341094971, + "learning_rate": 3.860869565217392e-06, + "loss": 1.5816, + "step": 28350 + }, + { + "epoch": 2.36, + "grad_norm": 3.0503904819488525, + "learning_rate": 3.855072463768116e-06, + "loss": 1.7181, + "step": 28360 + }, + { + "epoch": 2.36, + "grad_norm": 8.445423126220703, + "learning_rate": 3.849275362318841e-06, + "loss": 1.6133, + "step": 28370 + }, + { + "epoch": 2.37, + "grad_norm": 4.893218040466309, + "learning_rate": 3.843478260869565e-06, + "loss": 1.6204, + "step": 28380 + }, + { + "epoch": 2.37, + "grad_norm": 3.2958295345306396, + "learning_rate": 3.83768115942029e-06, + "loss": 1.524, + "step": 28390 + }, + { + "epoch": 2.37, + "grad_norm": 3.6230056285858154, + "learning_rate": 3.831884057971015e-06, + "loss": 1.7141, + "step": 28400 + }, + { + "epoch": 2.37, + "grad_norm": 4.484030246734619, + "learning_rate": 3.8260869565217395e-06, + "loss": 1.7142, + "step": 28410 + }, + { + "epoch": 2.37, + "grad_norm": 5.462039470672607, + "learning_rate": 3.820289855072464e-06, + "loss": 1.6588, + "step": 28420 + }, + { + "epoch": 2.37, + "grad_norm": 1.6934198141098022, + "learning_rate": 3.8144927536231883e-06, + "loss": 1.6298, + "step": 28430 + }, + { + "epoch": 2.37, + "grad_norm": 4.632734298706055, + "learning_rate": 3.808695652173913e-06, + "loss": 1.523, + "step": 28440 + }, + { + "epoch": 2.37, + "grad_norm": 1.3441051244735718, + "learning_rate": 3.802898550724638e-06, + "loss": 1.6716, + "step": 28450 + }, + { + "epoch": 2.37, + "grad_norm": 14.253397941589355, + "learning_rate": 3.797101449275363e-06, + "loss": 1.6655, + "step": 28460 + }, + { + "epoch": 2.37, + "grad_norm": 3.6277294158935547, + "learning_rate": 3.7913043478260873e-06, + "loss": 1.6434, + "step": 28470 + }, + { + "epoch": 2.37, + "grad_norm": 3.960604429244995, + "learning_rate": 3.7855072463768117e-06, + "loss": 1.6302, + "step": 28480 + }, + { + "epoch": 2.37, + "grad_norm": 8.853973388671875, + "learning_rate": 3.7797101449275366e-06, + "loss": 1.5669, + "step": 28490 + }, + { + "epoch": 2.38, + "grad_norm": 6.044206619262695, + "learning_rate": 3.773913043478261e-06, + "loss": 1.6932, + "step": 28500 + }, + { + "epoch": 2.38, + "eval_loss": 1.6286858320236206, + "eval_runtime": 107.5089, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.325, + "step": 28500 + }, + { + "epoch": 2.38, + "grad_norm": 2.0954174995422363, + "learning_rate": 3.768115942028986e-06, + "loss": 1.7568, + "step": 28510 + }, + { + "epoch": 2.38, + "grad_norm": 7.709628105163574, + "learning_rate": 3.7623188405797107e-06, + "loss": 1.4997, + "step": 28520 + }, + { + "epoch": 2.38, + "grad_norm": 3.7862284183502197, + "learning_rate": 3.7565217391304347e-06, + "loss": 1.6189, + "step": 28530 + }, + { + "epoch": 2.38, + "grad_norm": 2.443877696990967, + "learning_rate": 3.7507246376811596e-06, + "loss": 1.6114, + "step": 28540 + }, + { + "epoch": 2.38, + "grad_norm": 9.974430084228516, + "learning_rate": 3.7449275362318844e-06, + "loss": 1.7863, + "step": 28550 + }, + { + "epoch": 2.38, + "grad_norm": 4.032052516937256, + "learning_rate": 3.739130434782609e-06, + "loss": 1.5454, + "step": 28560 + }, + { + "epoch": 2.38, + "grad_norm": 15.427571296691895, + "learning_rate": 3.7333333333333337e-06, + "loss": 1.583, + "step": 28570 + }, + { + "epoch": 2.38, + "grad_norm": 5.56199836730957, + "learning_rate": 3.727536231884058e-06, + "loss": 1.5224, + "step": 28580 + }, + { + "epoch": 2.38, + "grad_norm": 2.051842212677002, + "learning_rate": 3.7217391304347826e-06, + "loss": 1.4863, + "step": 28590 + }, + { + "epoch": 2.38, + "grad_norm": 2.057842493057251, + "learning_rate": 3.7159420289855074e-06, + "loss": 1.6266, + "step": 28600 + }, + { + "epoch": 2.38, + "grad_norm": 2.723554849624634, + "learning_rate": 3.7101449275362323e-06, + "loss": 1.4752, + "step": 28610 + }, + { + "epoch": 2.38, + "grad_norm": 2.5356404781341553, + "learning_rate": 3.704347826086957e-06, + "loss": 1.6376, + "step": 28620 + }, + { + "epoch": 2.39, + "grad_norm": 2.806687593460083, + "learning_rate": 3.6985507246376816e-06, + "loss": 1.5874, + "step": 28630 + }, + { + "epoch": 2.39, + "grad_norm": 2.9221794605255127, + "learning_rate": 3.692753623188406e-06, + "loss": 1.5031, + "step": 28640 + }, + { + "epoch": 2.39, + "grad_norm": 4.143265724182129, + "learning_rate": 3.686956521739131e-06, + "loss": 1.5727, + "step": 28650 + }, + { + "epoch": 2.39, + "grad_norm": 2.832207679748535, + "learning_rate": 3.6811594202898553e-06, + "loss": 1.6113, + "step": 28660 + }, + { + "epoch": 2.39, + "grad_norm": 9.17288875579834, + "learning_rate": 3.67536231884058e-06, + "loss": 1.593, + "step": 28670 + }, + { + "epoch": 2.39, + "grad_norm": 2.8944125175476074, + "learning_rate": 3.669565217391305e-06, + "loss": 1.6084, + "step": 28680 + }, + { + "epoch": 2.39, + "grad_norm": 9.046699523925781, + "learning_rate": 3.663768115942029e-06, + "loss": 1.5391, + "step": 28690 + }, + { + "epoch": 2.39, + "grad_norm": 6.482266902923584, + "learning_rate": 3.657971014492754e-06, + "loss": 1.6621, + "step": 28700 + }, + { + "epoch": 2.39, + "grad_norm": 4.573151588439941, + "learning_rate": 3.6521739130434787e-06, + "loss": 1.6467, + "step": 28710 + }, + { + "epoch": 2.39, + "grad_norm": 13.082232475280762, + "learning_rate": 3.646376811594203e-06, + "loss": 1.6243, + "step": 28720 + }, + { + "epoch": 2.39, + "grad_norm": 5.799623012542725, + "learning_rate": 3.640579710144928e-06, + "loss": 1.5777, + "step": 28730 + }, + { + "epoch": 2.4, + "grad_norm": 4.449925422668457, + "learning_rate": 3.6347826086956524e-06, + "loss": 1.7436, + "step": 28740 + }, + { + "epoch": 2.4, + "grad_norm": 6.415322780609131, + "learning_rate": 3.628985507246377e-06, + "loss": 1.5015, + "step": 28750 + }, + { + "epoch": 2.4, + "grad_norm": 3.3753433227539062, + "learning_rate": 3.6231884057971017e-06, + "loss": 1.696, + "step": 28760 + }, + { + "epoch": 2.4, + "grad_norm": 3.1241776943206787, + "learning_rate": 3.6173913043478265e-06, + "loss": 1.5431, + "step": 28770 + }, + { + "epoch": 2.4, + "grad_norm": 6.052039623260498, + "learning_rate": 3.6115942028985514e-06, + "loss": 1.5878, + "step": 28780 + }, + { + "epoch": 2.4, + "grad_norm": 4.090257167816162, + "learning_rate": 3.6057971014492754e-06, + "loss": 1.5928, + "step": 28790 + }, + { + "epoch": 2.4, + "grad_norm": 2.91934871673584, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.6831, + "step": 28800 + }, + { + "epoch": 2.4, + "grad_norm": 8.889466285705566, + "learning_rate": 3.594202898550725e-06, + "loss": 1.599, + "step": 28810 + }, + { + "epoch": 2.4, + "grad_norm": 5.158334255218506, + "learning_rate": 3.5884057971014495e-06, + "loss": 1.6809, + "step": 28820 + }, + { + "epoch": 2.4, + "grad_norm": 4.268106460571289, + "learning_rate": 3.5826086956521744e-06, + "loss": 1.6967, + "step": 28830 + }, + { + "epoch": 2.4, + "grad_norm": 5.080434322357178, + "learning_rate": 3.5768115942028984e-06, + "loss": 1.7103, + "step": 28840 + }, + { + "epoch": 2.4, + "grad_norm": 2.549793243408203, + "learning_rate": 3.5710144927536233e-06, + "loss": 1.5933, + "step": 28850 + }, + { + "epoch": 2.41, + "grad_norm": 5.401309013366699, + "learning_rate": 3.565217391304348e-06, + "loss": 1.6199, + "step": 28860 + }, + { + "epoch": 2.41, + "grad_norm": 1.3040978908538818, + "learning_rate": 3.559420289855073e-06, + "loss": 1.7081, + "step": 28870 + }, + { + "epoch": 2.41, + "grad_norm": 2.473288059234619, + "learning_rate": 3.5536231884057974e-06, + "loss": 1.626, + "step": 28880 + }, + { + "epoch": 2.41, + "grad_norm": 1.599831461906433, + "learning_rate": 3.547826086956522e-06, + "loss": 1.7197, + "step": 28890 + }, + { + "epoch": 2.41, + "grad_norm": 3.284329414367676, + "learning_rate": 3.5420289855072467e-06, + "loss": 1.6332, + "step": 28900 + }, + { + "epoch": 2.41, + "grad_norm": 8.579666137695312, + "learning_rate": 3.536231884057971e-06, + "loss": 1.7461, + "step": 28910 + }, + { + "epoch": 2.41, + "grad_norm": 0.893390953540802, + "learning_rate": 3.530434782608696e-06, + "loss": 1.588, + "step": 28920 + }, + { + "epoch": 2.41, + "grad_norm": 6.217680931091309, + "learning_rate": 3.524637681159421e-06, + "loss": 1.6019, + "step": 28930 + }, + { + "epoch": 2.41, + "grad_norm": 3.5240437984466553, + "learning_rate": 3.5188405797101457e-06, + "loss": 1.5526, + "step": 28940 + }, + { + "epoch": 2.41, + "grad_norm": 1.2045577764511108, + "learning_rate": 3.5130434782608697e-06, + "loss": 1.5679, + "step": 28950 + }, + { + "epoch": 2.41, + "grad_norm": 7.309528350830078, + "learning_rate": 3.5072463768115945e-06, + "loss": 1.6655, + "step": 28960 + }, + { + "epoch": 2.41, + "grad_norm": 7.580850601196289, + "learning_rate": 3.501449275362319e-06, + "loss": 1.6961, + "step": 28970 + }, + { + "epoch": 2.42, + "grad_norm": 3.6850571632385254, + "learning_rate": 3.495652173913044e-06, + "loss": 1.6288, + "step": 28980 + }, + { + "epoch": 2.42, + "grad_norm": 6.09687614440918, + "learning_rate": 3.4898550724637687e-06, + "loss": 1.7291, + "step": 28990 + }, + { + "epoch": 2.42, + "grad_norm": 2.748039960861206, + "learning_rate": 3.4840579710144927e-06, + "loss": 1.5591, + "step": 29000 + }, + { + "epoch": 2.42, + "eval_loss": 1.657044529914856, + "eval_runtime": 107.5181, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 29000 + }, + { + "epoch": 2.42, + "grad_norm": 3.007520914077759, + "learning_rate": 3.4782608695652175e-06, + "loss": 1.6379, + "step": 29010 + }, + { + "epoch": 2.42, + "grad_norm": 4.748697280883789, + "learning_rate": 3.4724637681159424e-06, + "loss": 1.7399, + "step": 29020 + }, + { + "epoch": 2.42, + "grad_norm": 15.37971305847168, + "learning_rate": 3.4666666666666672e-06, + "loss": 1.5297, + "step": 29030 + }, + { + "epoch": 2.42, + "grad_norm": 3.962806463241577, + "learning_rate": 3.4608695652173916e-06, + "loss": 1.6266, + "step": 29040 + }, + { + "epoch": 2.42, + "grad_norm": 13.86978530883789, + "learning_rate": 3.455072463768116e-06, + "loss": 1.6452, + "step": 29050 + }, + { + "epoch": 2.42, + "grad_norm": 3.6884708404541016, + "learning_rate": 3.449275362318841e-06, + "loss": 1.63, + "step": 29060 + }, + { + "epoch": 2.42, + "grad_norm": 2.2268710136413574, + "learning_rate": 3.4434782608695654e-06, + "loss": 1.7602, + "step": 29070 + }, + { + "epoch": 2.42, + "grad_norm": 8.4721040725708, + "learning_rate": 3.43768115942029e-06, + "loss": 1.7621, + "step": 29080 + }, + { + "epoch": 2.42, + "grad_norm": 9.810726165771484, + "learning_rate": 3.431884057971015e-06, + "loss": 1.5289, + "step": 29090 + }, + { + "epoch": 2.42, + "grad_norm": 4.837118625640869, + "learning_rate": 3.426086956521739e-06, + "loss": 1.6023, + "step": 29100 + }, + { + "epoch": 2.43, + "grad_norm": 13.719472885131836, + "learning_rate": 3.420289855072464e-06, + "loss": 1.6571, + "step": 29110 + }, + { + "epoch": 2.43, + "grad_norm": 1.3916150331497192, + "learning_rate": 3.4144927536231888e-06, + "loss": 1.6231, + "step": 29120 + }, + { + "epoch": 2.43, + "grad_norm": 4.7587890625, + "learning_rate": 3.408695652173913e-06, + "loss": 1.6458, + "step": 29130 + }, + { + "epoch": 2.43, + "grad_norm": 5.0899200439453125, + "learning_rate": 3.402898550724638e-06, + "loss": 1.681, + "step": 29140 + }, + { + "epoch": 2.43, + "grad_norm": 3.8998475074768066, + "learning_rate": 3.3971014492753625e-06, + "loss": 1.5843, + "step": 29150 + }, + { + "epoch": 2.43, + "grad_norm": 13.847112655639648, + "learning_rate": 3.391304347826087e-06, + "loss": 1.5932, + "step": 29160 + }, + { + "epoch": 2.43, + "grad_norm": 16.733882904052734, + "learning_rate": 3.3855072463768118e-06, + "loss": 1.6131, + "step": 29170 + }, + { + "epoch": 2.43, + "grad_norm": 3.589210033416748, + "learning_rate": 3.3797101449275366e-06, + "loss": 1.5347, + "step": 29180 + }, + { + "epoch": 2.43, + "grad_norm": 2.7056937217712402, + "learning_rate": 3.3739130434782615e-06, + "loss": 1.6423, + "step": 29190 + }, + { + "epoch": 2.43, + "grad_norm": 5.761937618255615, + "learning_rate": 3.3681159420289855e-06, + "loss": 1.5252, + "step": 29200 + }, + { + "epoch": 2.43, + "grad_norm": 4.621154308319092, + "learning_rate": 3.3623188405797103e-06, + "loss": 1.4666, + "step": 29210 + }, + { + "epoch": 2.44, + "grad_norm": 9.775867462158203, + "learning_rate": 3.356521739130435e-06, + "loss": 1.6301, + "step": 29220 + }, + { + "epoch": 2.44, + "grad_norm": 4.965156078338623, + "learning_rate": 3.3507246376811596e-06, + "loss": 1.7043, + "step": 29230 + }, + { + "epoch": 2.44, + "grad_norm": 11.6044282913208, + "learning_rate": 3.3449275362318845e-06, + "loss": 1.6566, + "step": 29240 + }, + { + "epoch": 2.44, + "grad_norm": 2.9858810901641846, + "learning_rate": 3.3391304347826093e-06, + "loss": 1.5687, + "step": 29250 + }, + { + "epoch": 2.44, + "grad_norm": 5.0615386962890625, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.4778, + "step": 29260 + }, + { + "epoch": 2.44, + "grad_norm": 8.376899719238281, + "learning_rate": 3.327536231884058e-06, + "loss": 1.6104, + "step": 29270 + }, + { + "epoch": 2.44, + "grad_norm": 4.015613555908203, + "learning_rate": 3.321739130434783e-06, + "loss": 1.5788, + "step": 29280 + }, + { + "epoch": 2.44, + "grad_norm": 2.4846994876861572, + "learning_rate": 3.3159420289855075e-06, + "loss": 1.5098, + "step": 29290 + }, + { + "epoch": 2.44, + "grad_norm": 3.8240742683410645, + "learning_rate": 3.3101449275362323e-06, + "loss": 1.7382, + "step": 29300 + }, + { + "epoch": 2.44, + "grad_norm": 3.34401798248291, + "learning_rate": 3.3043478260869567e-06, + "loss": 1.6224, + "step": 29310 + }, + { + "epoch": 2.44, + "grad_norm": 4.242111682891846, + "learning_rate": 3.298550724637681e-06, + "loss": 1.7168, + "step": 29320 + }, + { + "epoch": 2.44, + "grad_norm": 5.714269638061523, + "learning_rate": 3.292753623188406e-06, + "loss": 1.6101, + "step": 29330 + }, + { + "epoch": 2.44, + "grad_norm": 2.2656619548797607, + "learning_rate": 3.286956521739131e-06, + "loss": 1.6284, + "step": 29340 + }, + { + "epoch": 2.45, + "grad_norm": 8.77830696105957, + "learning_rate": 3.2811594202898557e-06, + "loss": 1.6186, + "step": 29350 + }, + { + "epoch": 2.45, + "grad_norm": 3.6399919986724854, + "learning_rate": 3.2753623188405797e-06, + "loss": 1.5704, + "step": 29360 + }, + { + "epoch": 2.45, + "grad_norm": 1.7771320343017578, + "learning_rate": 3.2695652173913046e-06, + "loss": 1.6302, + "step": 29370 + }, + { + "epoch": 2.45, + "grad_norm": 5.038367748260498, + "learning_rate": 3.2637681159420294e-06, + "loss": 1.4699, + "step": 29380 + }, + { + "epoch": 2.45, + "grad_norm": 1.5462366342544556, + "learning_rate": 3.257971014492754e-06, + "loss": 1.5656, + "step": 29390 + }, + { + "epoch": 2.45, + "grad_norm": 8.673677444458008, + "learning_rate": 3.252753623188406e-06, + "loss": 1.5968, + "step": 29400 + }, + { + "epoch": 2.45, + "grad_norm": 3.5238194465637207, + "learning_rate": 3.246956521739131e-06, + "loss": 1.7341, + "step": 29410 + }, + { + "epoch": 2.45, + "grad_norm": 3.633085012435913, + "learning_rate": 3.2411594202898557e-06, + "loss": 1.6154, + "step": 29420 + }, + { + "epoch": 2.45, + "grad_norm": 7.964749336242676, + "learning_rate": 3.2353623188405797e-06, + "loss": 1.4897, + "step": 29430 + }, + { + "epoch": 2.45, + "grad_norm": 6.036418437957764, + "learning_rate": 3.2295652173913045e-06, + "loss": 1.74, + "step": 29440 + }, + { + "epoch": 2.45, + "grad_norm": 5.448636054992676, + "learning_rate": 3.2237681159420294e-06, + "loss": 1.7668, + "step": 29450 + }, + { + "epoch": 2.46, + "grad_norm": 5.758767604827881, + "learning_rate": 3.217971014492754e-06, + "loss": 1.5285, + "step": 29460 + }, + { + "epoch": 2.46, + "grad_norm": 13.795671463012695, + "learning_rate": 3.2121739130434787e-06, + "loss": 1.5928, + "step": 29470 + }, + { + "epoch": 2.46, + "grad_norm": 0.950065553188324, + "learning_rate": 3.2063768115942027e-06, + "loss": 1.6188, + "step": 29480 + }, + { + "epoch": 2.46, + "grad_norm": 9.130352020263672, + "learning_rate": 3.2005797101449275e-06, + "loss": 1.5059, + "step": 29490 + }, + { + "epoch": 2.46, + "grad_norm": 3.847170114517212, + "learning_rate": 3.1947826086956524e-06, + "loss": 1.6167, + "step": 29500 + }, + { + "epoch": 2.46, + "eval_loss": 1.6451491117477417, + "eval_runtime": 107.4982, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 29500 + }, + { + "epoch": 2.46, + "grad_norm": 3.0313570499420166, + "learning_rate": 3.1889855072463772e-06, + "loss": 1.6151, + "step": 29510 + }, + { + "epoch": 2.46, + "grad_norm": 2.9217429161071777, + "learning_rate": 3.1831884057971017e-06, + "loss": 1.643, + "step": 29520 + }, + { + "epoch": 2.46, + "grad_norm": 3.5794498920440674, + "learning_rate": 3.1773913043478265e-06, + "loss": 1.6273, + "step": 29530 + }, + { + "epoch": 2.46, + "grad_norm": 2.837756395339966, + "learning_rate": 3.171594202898551e-06, + "loss": 1.5733, + "step": 29540 + }, + { + "epoch": 2.46, + "grad_norm": 4.105989933013916, + "learning_rate": 3.1657971014492754e-06, + "loss": 1.5915, + "step": 29550 + }, + { + "epoch": 2.46, + "grad_norm": 9.32128620147705, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.5463, + "step": 29560 + }, + { + "epoch": 2.46, + "grad_norm": 4.723803520202637, + "learning_rate": 3.154202898550725e-06, + "loss": 1.7413, + "step": 29570 + }, + { + "epoch": 2.46, + "grad_norm": 8.316760063171387, + "learning_rate": 3.14840579710145e-06, + "loss": 1.6757, + "step": 29580 + }, + { + "epoch": 2.47, + "grad_norm": 7.694721698760986, + "learning_rate": 3.142608695652174e-06, + "loss": 1.5203, + "step": 29590 + }, + { + "epoch": 2.47, + "grad_norm": 3.9127633571624756, + "learning_rate": 3.136811594202899e-06, + "loss": 1.5992, + "step": 29600 + }, + { + "epoch": 2.47, + "grad_norm": 4.171841144561768, + "learning_rate": 3.1310144927536237e-06, + "loss": 1.6754, + "step": 29610 + }, + { + "epoch": 2.47, + "grad_norm": 6.929067134857178, + "learning_rate": 3.125217391304348e-06, + "loss": 1.6738, + "step": 29620 + }, + { + "epoch": 2.47, + "grad_norm": 7.294098854064941, + "learning_rate": 3.119420289855073e-06, + "loss": 1.6663, + "step": 29630 + }, + { + "epoch": 2.47, + "grad_norm": 3.76629376411438, + "learning_rate": 3.113623188405797e-06, + "loss": 1.6625, + "step": 29640 + }, + { + "epoch": 2.47, + "grad_norm": 6.9984612464904785, + "learning_rate": 3.107826086956522e-06, + "loss": 1.5415, + "step": 29650 + }, + { + "epoch": 2.47, + "grad_norm": 2.3784775733947754, + "learning_rate": 3.1020289855072466e-06, + "loss": 1.7637, + "step": 29660 + }, + { + "epoch": 2.47, + "grad_norm": 5.289485454559326, + "learning_rate": 3.0962318840579715e-06, + "loss": 1.7363, + "step": 29670 + }, + { + "epoch": 2.47, + "grad_norm": 3.363936424255371, + "learning_rate": 3.090434782608696e-06, + "loss": 1.7829, + "step": 29680 + }, + { + "epoch": 2.47, + "grad_norm": 0.904037356376648, + "learning_rate": 3.0846376811594204e-06, + "loss": 1.6108, + "step": 29690 + }, + { + "epoch": 2.48, + "grad_norm": 4.6849846839904785, + "learning_rate": 3.0788405797101452e-06, + "loss": 1.53, + "step": 29700 + }, + { + "epoch": 2.48, + "grad_norm": 13.439961433410645, + "learning_rate": 3.0730434782608696e-06, + "loss": 1.6294, + "step": 29710 + }, + { + "epoch": 2.48, + "grad_norm": 14.236470222473145, + "learning_rate": 3.0672463768115945e-06, + "loss": 1.5566, + "step": 29720 + }, + { + "epoch": 2.48, + "grad_norm": 2.827075481414795, + "learning_rate": 3.0614492753623193e-06, + "loss": 1.6481, + "step": 29730 + }, + { + "epoch": 2.48, + "grad_norm": 2.0929653644561768, + "learning_rate": 3.0556521739130434e-06, + "loss": 1.6433, + "step": 29740 + }, + { + "epoch": 2.48, + "grad_norm": 6.4234700202941895, + "learning_rate": 3.049855072463768e-06, + "loss": 1.5093, + "step": 29750 + }, + { + "epoch": 2.48, + "grad_norm": 3.105968952178955, + "learning_rate": 3.044057971014493e-06, + "loss": 1.5319, + "step": 29760 + }, + { + "epoch": 2.48, + "grad_norm": 2.3987698554992676, + "learning_rate": 3.038260869565218e-06, + "loss": 1.5574, + "step": 29770 + }, + { + "epoch": 2.48, + "grad_norm": 6.3514299392700195, + "learning_rate": 3.0324637681159423e-06, + "loss": 1.5673, + "step": 29780 + }, + { + "epoch": 2.48, + "grad_norm": 10.782695770263672, + "learning_rate": 3.0266666666666668e-06, + "loss": 1.5443, + "step": 29790 + }, + { + "epoch": 2.48, + "grad_norm": 5.090881824493408, + "learning_rate": 3.020869565217391e-06, + "loss": 1.5812, + "step": 29800 + }, + { + "epoch": 2.48, + "grad_norm": 11.7606782913208, + "learning_rate": 3.015072463768116e-06, + "loss": 1.6019, + "step": 29810 + }, + { + "epoch": 2.48, + "grad_norm": 10.391364097595215, + "learning_rate": 3.009275362318841e-06, + "loss": 1.5642, + "step": 29820 + }, + { + "epoch": 2.49, + "grad_norm": 5.301925182342529, + "learning_rate": 3.0034782608695658e-06, + "loss": 1.575, + "step": 29830 + }, + { + "epoch": 2.49, + "grad_norm": 4.747443675994873, + "learning_rate": 2.99768115942029e-06, + "loss": 1.7891, + "step": 29840 + }, + { + "epoch": 2.49, + "grad_norm": 3.40224027633667, + "learning_rate": 2.9918840579710146e-06, + "loss": 1.7931, + "step": 29850 + }, + { + "epoch": 2.49, + "grad_norm": 1.9095501899719238, + "learning_rate": 2.9860869565217395e-06, + "loss": 1.727, + "step": 29860 + }, + { + "epoch": 2.49, + "grad_norm": 5.217514514923096, + "learning_rate": 2.980289855072464e-06, + "loss": 1.5056, + "step": 29870 + }, + { + "epoch": 2.49, + "grad_norm": 3.045649766921997, + "learning_rate": 2.9744927536231888e-06, + "loss": 1.5775, + "step": 29880 + }, + { + "epoch": 2.49, + "grad_norm": 3.6917686462402344, + "learning_rate": 2.9686956521739136e-06, + "loss": 1.6197, + "step": 29890 + }, + { + "epoch": 2.49, + "grad_norm": 2.3253226280212402, + "learning_rate": 2.9628985507246376e-06, + "loss": 1.7079, + "step": 29900 + }, + { + "epoch": 2.49, + "grad_norm": 2.5302798748016357, + "learning_rate": 2.9571014492753625e-06, + "loss": 1.5915, + "step": 29910 + }, + { + "epoch": 2.49, + "grad_norm": 2.9401729106903076, + "learning_rate": 2.9513043478260873e-06, + "loss": 1.5172, + "step": 29920 + }, + { + "epoch": 2.49, + "grad_norm": 4.63379430770874, + "learning_rate": 2.945507246376812e-06, + "loss": 1.6475, + "step": 29930 + }, + { + "epoch": 2.5, + "grad_norm": 3.7005679607391357, + "learning_rate": 2.9397101449275366e-06, + "loss": 1.383, + "step": 29940 + }, + { + "epoch": 2.5, + "grad_norm": 1.5383400917053223, + "learning_rate": 2.933913043478261e-06, + "loss": 1.6116, + "step": 29950 + }, + { + "epoch": 2.5, + "grad_norm": 3.307799816131592, + "learning_rate": 2.9281159420289855e-06, + "loss": 1.7254, + "step": 29960 + }, + { + "epoch": 2.5, + "grad_norm": 5.233972549438477, + "learning_rate": 2.9223188405797103e-06, + "loss": 1.5307, + "step": 29970 + }, + { + "epoch": 2.5, + "grad_norm": 2.475644588470459, + "learning_rate": 2.916521739130435e-06, + "loss": 1.6835, + "step": 29980 + }, + { + "epoch": 2.5, + "grad_norm": 3.9054574966430664, + "learning_rate": 2.91072463768116e-06, + "loss": 1.5383, + "step": 29990 + }, + { + "epoch": 2.5, + "grad_norm": 2.8980517387390137, + "learning_rate": 2.904927536231884e-06, + "loss": 1.6759, + "step": 30000 + }, + { + "epoch": 2.5, + "eval_loss": 1.6585332155227661, + "eval_runtime": 107.501, + "eval_samples_per_second": 9.302, + "eval_steps_per_second": 2.326, + "step": 30000 + }, + { + "epoch": 2.5, + "grad_norm": 1.9723654985427856, + "learning_rate": 2.899130434782609e-06, + "loss": 1.6399, + "step": 30010 + }, + { + "epoch": 2.5, + "grad_norm": 7.082981586456299, + "learning_rate": 2.8933333333333337e-06, + "loss": 1.6246, + "step": 30020 + }, + { + "epoch": 2.5, + "grad_norm": 5.984813690185547, + "learning_rate": 2.887536231884058e-06, + "loss": 1.6357, + "step": 30030 + }, + { + "epoch": 2.5, + "grad_norm": 9.565591812133789, + "learning_rate": 2.881739130434783e-06, + "loss": 1.6206, + "step": 30040 + }, + { + "epoch": 2.5, + "grad_norm": 13.118734359741211, + "learning_rate": 2.8759420289855074e-06, + "loss": 1.6352, + "step": 30050 + }, + { + "epoch": 2.5, + "grad_norm": 2.744248867034912, + "learning_rate": 2.870144927536232e-06, + "loss": 1.4582, + "step": 30060 + }, + { + "epoch": 2.51, + "grad_norm": 3.0195579528808594, + "learning_rate": 2.8643478260869567e-06, + "loss": 1.6441, + "step": 30070 + }, + { + "epoch": 2.51, + "grad_norm": 3.9167885780334473, + "learning_rate": 2.8585507246376816e-06, + "loss": 1.7603, + "step": 30080 + }, + { + "epoch": 2.51, + "grad_norm": 6.419768810272217, + "learning_rate": 2.852753623188406e-06, + "loss": 1.598, + "step": 30090 + }, + { + "epoch": 2.51, + "grad_norm": 3.9617881774902344, + "learning_rate": 2.8469565217391304e-06, + "loss": 1.639, + "step": 30100 + }, + { + "epoch": 2.51, + "grad_norm": 2.755096673965454, + "learning_rate": 2.8411594202898553e-06, + "loss": 1.6888, + "step": 30110 + }, + { + "epoch": 2.51, + "grad_norm": 9.04216194152832, + "learning_rate": 2.8353623188405797e-06, + "loss": 1.6008, + "step": 30120 + }, + { + "epoch": 2.51, + "grad_norm": 2.4999070167541504, + "learning_rate": 2.8295652173913046e-06, + "loss": 1.6094, + "step": 30130 + }, + { + "epoch": 2.51, + "grad_norm": 10.484206199645996, + "learning_rate": 2.8237681159420294e-06, + "loss": 1.6802, + "step": 30140 + }, + { + "epoch": 2.51, + "grad_norm": 7.458893299102783, + "learning_rate": 2.8179710144927543e-06, + "loss": 1.559, + "step": 30150 + }, + { + "epoch": 2.51, + "grad_norm": 8.366874694824219, + "learning_rate": 2.8121739130434783e-06, + "loss": 1.5715, + "step": 30160 + }, + { + "epoch": 2.51, + "grad_norm": 10.303510665893555, + "learning_rate": 2.806376811594203e-06, + "loss": 1.5516, + "step": 30170 + }, + { + "epoch": 2.52, + "grad_norm": 4.16677713394165, + "learning_rate": 2.800579710144928e-06, + "loss": 1.5497, + "step": 30180 + }, + { + "epoch": 2.52, + "grad_norm": 8.9423246383667, + "learning_rate": 2.7947826086956524e-06, + "loss": 1.7121, + "step": 30190 + }, + { + "epoch": 2.52, + "grad_norm": 3.328890562057495, + "learning_rate": 2.7889855072463773e-06, + "loss": 1.5159, + "step": 30200 + }, + { + "epoch": 2.52, + "grad_norm": 7.184723854064941, + "learning_rate": 2.7831884057971013e-06, + "loss": 1.6757, + "step": 30210 + }, + { + "epoch": 2.52, + "grad_norm": 5.9649858474731445, + "learning_rate": 2.777391304347826e-06, + "loss": 1.4885, + "step": 30220 + }, + { + "epoch": 2.52, + "grad_norm": 5.443271160125732, + "learning_rate": 2.771594202898551e-06, + "loss": 1.5998, + "step": 30230 + }, + { + "epoch": 2.52, + "grad_norm": 2.292788028717041, + "learning_rate": 2.765797101449276e-06, + "loss": 1.5722, + "step": 30240 + }, + { + "epoch": 2.52, + "grad_norm": 9.991342544555664, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.6356, + "step": 30250 + }, + { + "epoch": 2.52, + "grad_norm": 2.1068317890167236, + "learning_rate": 2.7542028985507247e-06, + "loss": 1.651, + "step": 30260 + }, + { + "epoch": 2.52, + "grad_norm": 7.125411033630371, + "learning_rate": 2.7484057971014495e-06, + "loss": 1.5846, + "step": 30270 + }, + { + "epoch": 2.52, + "grad_norm": 2.9197959899902344, + "learning_rate": 2.742608695652174e-06, + "loss": 1.6979, + "step": 30280 + }, + { + "epoch": 2.52, + "grad_norm": 2.4163668155670166, + "learning_rate": 2.736811594202899e-06, + "loss": 1.6169, + "step": 30290 + }, + { + "epoch": 2.52, + "grad_norm": 8.496438026428223, + "learning_rate": 2.7310144927536237e-06, + "loss": 1.6185, + "step": 30300 + }, + { + "epoch": 2.53, + "grad_norm": 2.996656894683838, + "learning_rate": 2.7252173913043477e-06, + "loss": 1.747, + "step": 30310 + }, + { + "epoch": 2.53, + "grad_norm": 2.3623204231262207, + "learning_rate": 2.7194202898550725e-06, + "loss": 1.6652, + "step": 30320 + }, + { + "epoch": 2.53, + "grad_norm": 6.751153945922852, + "learning_rate": 2.7136231884057974e-06, + "loss": 1.5636, + "step": 30330 + }, + { + "epoch": 2.53, + "grad_norm": 2.7132070064544678, + "learning_rate": 2.7078260869565222e-06, + "loss": 1.5571, + "step": 30340 + }, + { + "epoch": 2.53, + "grad_norm": 12.558267593383789, + "learning_rate": 2.7020289855072467e-06, + "loss": 1.6319, + "step": 30350 + }, + { + "epoch": 2.53, + "grad_norm": 4.379528522491455, + "learning_rate": 2.696231884057971e-06, + "loss": 1.6902, + "step": 30360 + }, + { + "epoch": 2.53, + "grad_norm": 6.271668910980225, + "learning_rate": 2.6904347826086955e-06, + "loss": 1.7105, + "step": 30370 + }, + { + "epoch": 2.53, + "grad_norm": 5.071247577667236, + "learning_rate": 2.6846376811594204e-06, + "loss": 1.5997, + "step": 30380 + }, + { + "epoch": 2.53, + "grad_norm": 5.980905532836914, + "learning_rate": 2.6788405797101452e-06, + "loss": 1.6008, + "step": 30390 + }, + { + "epoch": 2.53, + "grad_norm": 3.2732436656951904, + "learning_rate": 2.67304347826087e-06, + "loss": 1.6783, + "step": 30400 + }, + { + "epoch": 2.53, + "grad_norm": 3.015439748764038, + "learning_rate": 2.667246376811594e-06, + "loss": 1.6118, + "step": 30410 + }, + { + "epoch": 2.54, + "grad_norm": 3.7114615440368652, + "learning_rate": 2.661449275362319e-06, + "loss": 1.6513, + "step": 30420 + }, + { + "epoch": 2.54, + "grad_norm": 4.484541893005371, + "learning_rate": 2.655652173913044e-06, + "loss": 1.6789, + "step": 30430 + }, + { + "epoch": 2.54, + "grad_norm": 6.272510528564453, + "learning_rate": 2.6498550724637682e-06, + "loss": 1.5341, + "step": 30440 + }, + { + "epoch": 2.54, + "grad_norm": 2.3156094551086426, + "learning_rate": 2.644057971014493e-06, + "loss": 1.7266, + "step": 30450 + }, + { + "epoch": 2.54, + "grad_norm": 15.670822143554688, + "learning_rate": 2.638260869565218e-06, + "loss": 1.6492, + "step": 30460 + }, + { + "epoch": 2.54, + "grad_norm": 10.792664527893066, + "learning_rate": 2.632463768115942e-06, + "loss": 1.7282, + "step": 30470 + }, + { + "epoch": 2.54, + "grad_norm": 1.0802708864212036, + "learning_rate": 2.6266666666666668e-06, + "loss": 1.6272, + "step": 30480 + }, + { + "epoch": 2.54, + "grad_norm": 1.8151583671569824, + "learning_rate": 2.6208695652173916e-06, + "loss": 1.6041, + "step": 30490 + }, + { + "epoch": 2.54, + "grad_norm": 2.1548194885253906, + "learning_rate": 2.6150724637681165e-06, + "loss": 1.6457, + "step": 30500 + }, + { + "epoch": 2.54, + "eval_loss": 1.642120599746704, + "eval_runtime": 107.4944, + "eval_samples_per_second": 9.303, + "eval_steps_per_second": 2.326, + "step": 30500 + }, + { + "epoch": 2.54, + "grad_norm": 1.8505381345748901, + "learning_rate": 2.609275362318841e-06, + "loss": 1.5916, + "step": 30510 + }, + { + "epoch": 2.54, + "grad_norm": 4.077845096588135, + "learning_rate": 2.6034782608695654e-06, + "loss": 1.5464, + "step": 30520 + }, + { + "epoch": 2.54, + "grad_norm": 3.143275022506714, + "learning_rate": 2.5976811594202898e-06, + "loss": 1.5341, + "step": 30530 + }, + { + "epoch": 2.54, + "grad_norm": 4.190448760986328, + "learning_rate": 2.5918840579710146e-06, + "loss": 1.5817, + "step": 30540 + }, + { + "epoch": 2.55, + "grad_norm": 2.009246587753296, + "learning_rate": 2.5860869565217395e-06, + "loss": 1.5787, + "step": 30550 + }, + { + "epoch": 2.55, + "grad_norm": 6.875110149383545, + "learning_rate": 2.5802898550724643e-06, + "loss": 1.5827, + "step": 30560 + }, + { + "epoch": 2.55, + "grad_norm": 8.125142097473145, + "learning_rate": 2.5744927536231883e-06, + "loss": 1.4877, + "step": 30570 + }, + { + "epoch": 2.55, + "grad_norm": 6.384122848510742, + "learning_rate": 2.568695652173913e-06, + "loss": 1.6345, + "step": 30580 + }, + { + "epoch": 2.55, + "grad_norm": 10.49345874786377, + "learning_rate": 2.562898550724638e-06, + "loss": 1.6619, + "step": 30590 + }, + { + "epoch": 2.55, + "grad_norm": 1.870846152305603, + "learning_rate": 2.5571014492753625e-06, + "loss": 1.5736, + "step": 30600 + }, + { + "epoch": 2.55, + "grad_norm": 3.0048367977142334, + "learning_rate": 2.5513043478260873e-06, + "loss": 1.6575, + "step": 30610 + }, + { + "epoch": 2.55, + "grad_norm": 7.404007434844971, + "learning_rate": 2.5455072463768118e-06, + "loss": 1.6789, + "step": 30620 + }, + { + "epoch": 2.55, + "grad_norm": 5.966566562652588, + "learning_rate": 2.539710144927536e-06, + "loss": 1.5581, + "step": 30630 + }, + { + "epoch": 2.55, + "grad_norm": 2.988835096359253, + "learning_rate": 2.533913043478261e-06, + "loss": 1.4879, + "step": 30640 + }, + { + "epoch": 2.55, + "grad_norm": 7.500990390777588, + "learning_rate": 2.528115942028986e-06, + "loss": 1.6214, + "step": 30650 + }, + { + "epoch": 2.56, + "grad_norm": 6.991439342498779, + "learning_rate": 2.5223188405797107e-06, + "loss": 1.5002, + "step": 30660 + }, + { + "epoch": 2.56, + "grad_norm": 2.588336706161499, + "learning_rate": 2.5165217391304348e-06, + "loss": 1.6576, + "step": 30670 + }, + { + "epoch": 2.56, + "grad_norm": 2.5451226234436035, + "learning_rate": 2.5107246376811596e-06, + "loss": 1.6218, + "step": 30680 + }, + { + "epoch": 2.56, + "grad_norm": 10.67144775390625, + "learning_rate": 2.504927536231884e-06, + "loss": 1.6439, + "step": 30690 + }, + { + "epoch": 2.56, + "grad_norm": 14.395882606506348, + "learning_rate": 2.499130434782609e-06, + "loss": 1.5812, + "step": 30700 + }, + { + "epoch": 2.56, + "grad_norm": 3.2323532104492188, + "learning_rate": 2.4933333333333333e-06, + "loss": 1.6072, + "step": 30710 + }, + { + "epoch": 2.56, + "grad_norm": 2.6720938682556152, + "learning_rate": 2.487536231884058e-06, + "loss": 1.5967, + "step": 30720 + }, + { + "epoch": 2.56, + "grad_norm": 2.9183871746063232, + "learning_rate": 2.481739130434783e-06, + "loss": 1.659, + "step": 30730 + }, + { + "epoch": 2.56, + "grad_norm": 8.462002754211426, + "learning_rate": 2.4759420289855075e-06, + "loss": 1.6629, + "step": 30740 + }, + { + "epoch": 2.56, + "grad_norm": 9.175357818603516, + "learning_rate": 2.4701449275362323e-06, + "loss": 1.6591, + "step": 30750 + }, + { + "epoch": 2.56, + "grad_norm": 11.072102546691895, + "learning_rate": 2.4643478260869567e-06, + "loss": 1.5515, + "step": 30760 + }, + { + "epoch": 2.56, + "grad_norm": 5.013851642608643, + "learning_rate": 2.458550724637681e-06, + "loss": 1.5813, + "step": 30770 + }, + { + "epoch": 2.56, + "grad_norm": 1.7383862733840942, + "learning_rate": 2.452753623188406e-06, + "loss": 1.7057, + "step": 30780 + }, + { + "epoch": 2.57, + "grad_norm": 5.308183670043945, + "learning_rate": 2.4469565217391304e-06, + "loss": 1.5989, + "step": 30790 + }, + { + "epoch": 2.57, + "grad_norm": 4.486476898193359, + "learning_rate": 2.4411594202898553e-06, + "loss": 1.6127, + "step": 30800 + }, + { + "epoch": 2.57, + "grad_norm": 4.1956095695495605, + "learning_rate": 2.4353623188405797e-06, + "loss": 1.515, + "step": 30810 + }, + { + "epoch": 2.57, + "grad_norm": 6.074836730957031, + "learning_rate": 2.4295652173913046e-06, + "loss": 1.5636, + "step": 30820 + }, + { + "epoch": 2.57, + "grad_norm": 3.5480659008026123, + "learning_rate": 2.4237681159420294e-06, + "loss": 1.6296, + "step": 30830 + }, + { + "epoch": 2.57, + "grad_norm": 3.6157848834991455, + "learning_rate": 2.417971014492754e-06, + "loss": 1.7141, + "step": 30840 + }, + { + "epoch": 2.57, + "grad_norm": 1.7036499977111816, + "learning_rate": 2.4121739130434783e-06, + "loss": 1.6024, + "step": 30850 + }, + { + "epoch": 2.57, + "grad_norm": 9.184311866760254, + "learning_rate": 2.406376811594203e-06, + "loss": 1.6096, + "step": 30860 + }, + { + "epoch": 2.57, + "grad_norm": 10.717764854431152, + "learning_rate": 2.4005797101449276e-06, + "loss": 1.6807, + "step": 30870 + }, + { + "epoch": 2.57, + "grad_norm": 6.877434253692627, + "learning_rate": 2.3947826086956524e-06, + "loss": 1.6401, + "step": 30880 + }, + { + "epoch": 2.57, + "grad_norm": 4.13861083984375, + "learning_rate": 2.388985507246377e-06, + "loss": 1.6329, + "step": 30890 + }, + { + "epoch": 2.58, + "grad_norm": 3.996332883834839, + "learning_rate": 2.3831884057971017e-06, + "loss": 1.5384, + "step": 30900 + }, + { + "epoch": 2.58, + "grad_norm": 1.9450478553771973, + "learning_rate": 2.3773913043478266e-06, + "loss": 1.6154, + "step": 30910 + }, + { + "epoch": 2.58, + "grad_norm": 5.448505401611328, + "learning_rate": 2.371594202898551e-06, + "loss": 1.7094, + "step": 30920 + }, + { + "epoch": 2.58, + "grad_norm": 8.060430526733398, + "learning_rate": 2.3657971014492754e-06, + "loss": 1.5228, + "step": 30930 + }, + { + "epoch": 2.58, + "grad_norm": 11.698649406433105, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.6043, + "step": 30940 + }, + { + "epoch": 2.58, + "grad_norm": 3.175755262374878, + "learning_rate": 2.3542028985507247e-06, + "loss": 1.5744, + "step": 30950 + }, + { + "epoch": 2.58, + "grad_norm": 0.9357259273529053, + "learning_rate": 2.3484057971014496e-06, + "loss": 1.5971, + "step": 30960 + }, + { + "epoch": 2.58, + "grad_norm": 3.1497623920440674, + "learning_rate": 2.342608695652174e-06, + "loss": 1.5522, + "step": 30970 + }, + { + "epoch": 2.58, + "grad_norm": 3.6525659561157227, + "learning_rate": 2.336811594202899e-06, + "loss": 1.6543, + "step": 30980 + }, + { + "epoch": 2.58, + "grad_norm": 11.570157051086426, + "learning_rate": 2.3310144927536237e-06, + "loss": 1.5808, + "step": 30990 + }, + { + "epoch": 2.58, + "grad_norm": 7.399476528167725, + "learning_rate": 2.325217391304348e-06, + "loss": 1.701, + "step": 31000 + }, + { + "epoch": 2.58, + "eval_loss": 1.6337002515792847, + "eval_runtime": 107.5309, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 31000 + }, + { + "epoch": 2.58, + "grad_norm": 4.558446407318115, + "learning_rate": 2.3194202898550725e-06, + "loss": 1.6179, + "step": 31010 + }, + { + "epoch": 2.58, + "grad_norm": 2.1192896366119385, + "learning_rate": 2.3142028985507247e-06, + "loss": 1.4463, + "step": 31020 + }, + { + "epoch": 2.59, + "grad_norm": 1.965945839881897, + "learning_rate": 2.3084057971014495e-06, + "loss": 1.6299, + "step": 31030 + }, + { + "epoch": 2.59, + "grad_norm": 3.286752223968506, + "learning_rate": 2.302608695652174e-06, + "loss": 1.5018, + "step": 31040 + }, + { + "epoch": 2.59, + "grad_norm": 1.3863645792007446, + "learning_rate": 2.296811594202899e-06, + "loss": 1.6844, + "step": 31050 + }, + { + "epoch": 2.59, + "grad_norm": 13.46044921875, + "learning_rate": 2.2910144927536237e-06, + "loss": 1.6116, + "step": 31060 + }, + { + "epoch": 2.59, + "grad_norm": 7.819109916687012, + "learning_rate": 2.285217391304348e-06, + "loss": 1.5284, + "step": 31070 + }, + { + "epoch": 2.59, + "grad_norm": 3.452699899673462, + "learning_rate": 2.2794202898550725e-06, + "loss": 1.6716, + "step": 31080 + }, + { + "epoch": 2.59, + "grad_norm": 4.367595195770264, + "learning_rate": 2.2736231884057974e-06, + "loss": 1.4822, + "step": 31090 + }, + { + "epoch": 2.59, + "grad_norm": 9.822129249572754, + "learning_rate": 2.267826086956522e-06, + "loss": 1.6761, + "step": 31100 + }, + { + "epoch": 2.59, + "grad_norm": 3.4133310317993164, + "learning_rate": 2.2620289855072466e-06, + "loss": 1.655, + "step": 31110 + }, + { + "epoch": 2.59, + "grad_norm": 6.704021453857422, + "learning_rate": 2.256231884057971e-06, + "loss": 1.531, + "step": 31120 + }, + { + "epoch": 2.59, + "grad_norm": 6.551307201385498, + "learning_rate": 2.250434782608696e-06, + "loss": 1.6892, + "step": 31130 + }, + { + "epoch": 2.59, + "grad_norm": 4.172097682952881, + "learning_rate": 2.2446376811594208e-06, + "loss": 1.6129, + "step": 31140 + }, + { + "epoch": 2.6, + "grad_norm": 7.622472286224365, + "learning_rate": 2.238840579710145e-06, + "loss": 1.5993, + "step": 31150 + }, + { + "epoch": 2.6, + "grad_norm": 3.656231641769409, + "learning_rate": 2.2330434782608696e-06, + "loss": 1.6874, + "step": 31160 + }, + { + "epoch": 2.6, + "grad_norm": 2.335791826248169, + "learning_rate": 2.2272463768115945e-06, + "loss": 1.5399, + "step": 31170 + }, + { + "epoch": 2.6, + "grad_norm": 0.9562406539916992, + "learning_rate": 2.221449275362319e-06, + "loss": 1.7015, + "step": 31180 + }, + { + "epoch": 2.6, + "grad_norm": 5.210826873779297, + "learning_rate": 2.2156521739130438e-06, + "loss": 1.5114, + "step": 31190 + }, + { + "epoch": 2.6, + "grad_norm": 9.8193998336792, + "learning_rate": 2.209855072463768e-06, + "loss": 1.509, + "step": 31200 + }, + { + "epoch": 2.6, + "grad_norm": 4.946864128112793, + "learning_rate": 2.204057971014493e-06, + "loss": 1.6648, + "step": 31210 + }, + { + "epoch": 2.6, + "grad_norm": 4.112027168273926, + "learning_rate": 2.1982608695652175e-06, + "loss": 1.6629, + "step": 31220 + }, + { + "epoch": 2.6, + "grad_norm": 1.7467082738876343, + "learning_rate": 2.1924637681159423e-06, + "loss": 1.637, + "step": 31230 + }, + { + "epoch": 2.6, + "grad_norm": 5.126824855804443, + "learning_rate": 2.1866666666666668e-06, + "loss": 1.6365, + "step": 31240 + }, + { + "epoch": 2.6, + "grad_norm": 7.7713398933410645, + "learning_rate": 2.180869565217391e-06, + "loss": 1.4703, + "step": 31250 + }, + { + "epoch": 2.6, + "grad_norm": 5.900889873504639, + "learning_rate": 2.175072463768116e-06, + "loss": 1.5412, + "step": 31260 + }, + { + "epoch": 2.61, + "grad_norm": 3.582507848739624, + "learning_rate": 2.169275362318841e-06, + "loss": 1.51, + "step": 31270 + }, + { + "epoch": 2.61, + "grad_norm": 4.354831218719482, + "learning_rate": 2.1634782608695653e-06, + "loss": 1.7147, + "step": 31280 + }, + { + "epoch": 2.61, + "grad_norm": 5.068592071533203, + "learning_rate": 2.15768115942029e-06, + "loss": 1.5849, + "step": 31290 + }, + { + "epoch": 2.61, + "grad_norm": 3.196958065032959, + "learning_rate": 2.1518840579710146e-06, + "loss": 1.6493, + "step": 31300 + }, + { + "epoch": 2.61, + "grad_norm": 6.429135322570801, + "learning_rate": 2.1460869565217395e-06, + "loss": 1.5347, + "step": 31310 + }, + { + "epoch": 2.61, + "grad_norm": 5.728844165802002, + "learning_rate": 2.140289855072464e-06, + "loss": 1.5515, + "step": 31320 + }, + { + "epoch": 2.61, + "grad_norm": 13.872916221618652, + "learning_rate": 2.1344927536231883e-06, + "loss": 1.602, + "step": 31330 + }, + { + "epoch": 2.61, + "grad_norm": 4.392234802246094, + "learning_rate": 2.128695652173913e-06, + "loss": 1.7038, + "step": 31340 + }, + { + "epoch": 2.61, + "grad_norm": 6.907491683959961, + "learning_rate": 2.1228985507246376e-06, + "loss": 1.5619, + "step": 31350 + }, + { + "epoch": 2.61, + "grad_norm": 5.335078239440918, + "learning_rate": 2.1171014492753625e-06, + "loss": 1.5478, + "step": 31360 + }, + { + "epoch": 2.61, + "grad_norm": 1.8956576585769653, + "learning_rate": 2.1113043478260873e-06, + "loss": 1.6199, + "step": 31370 + }, + { + "epoch": 2.62, + "grad_norm": 12.366223335266113, + "learning_rate": 2.1055072463768117e-06, + "loss": 1.4663, + "step": 31380 + }, + { + "epoch": 2.62, + "grad_norm": 5.7240118980407715, + "learning_rate": 2.0997101449275366e-06, + "loss": 1.7283, + "step": 31390 + }, + { + "epoch": 2.62, + "grad_norm": 3.118774652481079, + "learning_rate": 2.093913043478261e-06, + "loss": 1.6955, + "step": 31400 + }, + { + "epoch": 2.62, + "grad_norm": 2.2114875316619873, + "learning_rate": 2.0881159420289855e-06, + "loss": 1.5495, + "step": 31410 + }, + { + "epoch": 2.62, + "grad_norm": 2.9266295433044434, + "learning_rate": 2.0823188405797103e-06, + "loss": 1.5095, + "step": 31420 + }, + { + "epoch": 2.62, + "grad_norm": 2.1518819332122803, + "learning_rate": 2.0765217391304347e-06, + "loss": 1.6809, + "step": 31430 + }, + { + "epoch": 2.62, + "grad_norm": 5.437044620513916, + "learning_rate": 2.0707246376811596e-06, + "loss": 1.6229, + "step": 31440 + }, + { + "epoch": 2.62, + "grad_norm": 4.866549491882324, + "learning_rate": 2.0649275362318844e-06, + "loss": 1.6808, + "step": 31450 + }, + { + "epoch": 2.62, + "grad_norm": 6.811148166656494, + "learning_rate": 2.059130434782609e-06, + "loss": 1.4217, + "step": 31460 + }, + { + "epoch": 2.62, + "grad_norm": 5.14778470993042, + "learning_rate": 2.0533333333333337e-06, + "loss": 1.6761, + "step": 31470 + }, + { + "epoch": 2.62, + "grad_norm": 2.273214340209961, + "learning_rate": 2.047536231884058e-06, + "loss": 1.5311, + "step": 31480 + }, + { + "epoch": 2.62, + "grad_norm": 0.9031133055686951, + "learning_rate": 2.0417391304347826e-06, + "loss": 1.7013, + "step": 31490 + }, + { + "epoch": 2.62, + "grad_norm": 10.49181079864502, + "learning_rate": 2.0359420289855074e-06, + "loss": 1.6723, + "step": 31500 + }, + { + "epoch": 2.62, + "eval_loss": 1.6217410564422607, + "eval_runtime": 107.5528, + "eval_samples_per_second": 9.298, + "eval_steps_per_second": 2.324, + "step": 31500 + }, + { + "epoch": 2.63, + "grad_norm": 2.9454545974731445, + "learning_rate": 2.030144927536232e-06, + "loss": 1.6591, + "step": 31510 + }, + { + "epoch": 2.63, + "grad_norm": 9.571122169494629, + "learning_rate": 2.0243478260869567e-06, + "loss": 1.6785, + "step": 31520 + }, + { + "epoch": 2.63, + "grad_norm": 7.118087291717529, + "learning_rate": 2.018550724637681e-06, + "loss": 1.5377, + "step": 31530 + }, + { + "epoch": 2.63, + "grad_norm": 16.46897315979004, + "learning_rate": 2.012753623188406e-06, + "loss": 1.5683, + "step": 31540 + }, + { + "epoch": 2.63, + "grad_norm": 2.300513505935669, + "learning_rate": 2.006956521739131e-06, + "loss": 1.6441, + "step": 31550 + }, + { + "epoch": 2.63, + "grad_norm": 1.414223551750183, + "learning_rate": 2.0011594202898553e-06, + "loss": 1.4831, + "step": 31560 + }, + { + "epoch": 2.63, + "grad_norm": 2.0008349418640137, + "learning_rate": 1.9953623188405797e-06, + "loss": 1.6988, + "step": 31570 + }, + { + "epoch": 2.63, + "grad_norm": 2.3853042125701904, + "learning_rate": 1.9895652173913046e-06, + "loss": 1.6968, + "step": 31580 + }, + { + "epoch": 2.63, + "grad_norm": 5.788661956787109, + "learning_rate": 1.983768115942029e-06, + "loss": 1.7416, + "step": 31590 + }, + { + "epoch": 2.63, + "grad_norm": 3.9878652095794678, + "learning_rate": 1.977971014492754e-06, + "loss": 1.733, + "step": 31600 + }, + { + "epoch": 2.63, + "grad_norm": 2.0681121349334717, + "learning_rate": 1.9721739130434783e-06, + "loss": 1.5268, + "step": 31610 + }, + { + "epoch": 2.63, + "grad_norm": 6.360716342926025, + "learning_rate": 1.966376811594203e-06, + "loss": 1.6611, + "step": 31620 + }, + { + "epoch": 2.64, + "grad_norm": 1.8834378719329834, + "learning_rate": 1.960579710144928e-06, + "loss": 1.59, + "step": 31630 + }, + { + "epoch": 2.64, + "grad_norm": 5.510819911956787, + "learning_rate": 1.9547826086956524e-06, + "loss": 1.6145, + "step": 31640 + }, + { + "epoch": 2.64, + "grad_norm": 2.6274240016937256, + "learning_rate": 1.948985507246377e-06, + "loss": 1.5678, + "step": 31650 + }, + { + "epoch": 2.64, + "grad_norm": 2.6434683799743652, + "learning_rate": 1.9431884057971017e-06, + "loss": 1.6534, + "step": 31660 + }, + { + "epoch": 2.64, + "grad_norm": 5.2217583656311035, + "learning_rate": 1.937391304347826e-06, + "loss": 1.5893, + "step": 31670 + }, + { + "epoch": 2.64, + "grad_norm": 6.410333156585693, + "learning_rate": 1.931594202898551e-06, + "loss": 1.7107, + "step": 31680 + }, + { + "epoch": 2.64, + "grad_norm": 7.742589473724365, + "learning_rate": 1.9257971014492754e-06, + "loss": 1.7014, + "step": 31690 + }, + { + "epoch": 2.64, + "grad_norm": 5.723726272583008, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.4988, + "step": 31700 + }, + { + "epoch": 2.64, + "grad_norm": 6.500274181365967, + "learning_rate": 1.9142028985507247e-06, + "loss": 1.5023, + "step": 31710 + }, + { + "epoch": 2.64, + "grad_norm": 2.310852527618408, + "learning_rate": 1.9084057971014495e-06, + "loss": 1.65, + "step": 31720 + }, + { + "epoch": 2.64, + "grad_norm": 8.708599090576172, + "learning_rate": 1.9026086956521742e-06, + "loss": 1.6035, + "step": 31730 + }, + { + "epoch": 2.65, + "grad_norm": 7.076088905334473, + "learning_rate": 1.8968115942028986e-06, + "loss": 1.5787, + "step": 31740 + }, + { + "epoch": 2.65, + "grad_norm": 4.023421287536621, + "learning_rate": 1.8910144927536235e-06, + "loss": 1.6103, + "step": 31750 + }, + { + "epoch": 2.65, + "grad_norm": 10.380520820617676, + "learning_rate": 1.885217391304348e-06, + "loss": 1.5645, + "step": 31760 + }, + { + "epoch": 2.65, + "grad_norm": 12.353821754455566, + "learning_rate": 1.8794202898550725e-06, + "loss": 1.487, + "step": 31770 + }, + { + "epoch": 2.65, + "grad_norm": 10.361377716064453, + "learning_rate": 1.8736231884057974e-06, + "loss": 1.7336, + "step": 31780 + }, + { + "epoch": 2.65, + "grad_norm": 2.4413888454437256, + "learning_rate": 1.8678260869565218e-06, + "loss": 1.6445, + "step": 31790 + }, + { + "epoch": 2.65, + "grad_norm": 5.693152904510498, + "learning_rate": 1.8620289855072465e-06, + "loss": 1.5021, + "step": 31800 + }, + { + "epoch": 2.65, + "grad_norm": 27.886554718017578, + "learning_rate": 1.8562318840579713e-06, + "loss": 1.5851, + "step": 31810 + }, + { + "epoch": 2.65, + "grad_norm": 0.8801918029785156, + "learning_rate": 1.8504347826086957e-06, + "loss": 1.6223, + "step": 31820 + }, + { + "epoch": 2.65, + "grad_norm": 3.8942666053771973, + "learning_rate": 1.8446376811594206e-06, + "loss": 1.7608, + "step": 31830 + }, + { + "epoch": 2.65, + "grad_norm": 2.6183860301971436, + "learning_rate": 1.838840579710145e-06, + "loss": 1.644, + "step": 31840 + }, + { + "epoch": 2.65, + "grad_norm": 2.7478418350219727, + "learning_rate": 1.8330434782608697e-06, + "loss": 1.4282, + "step": 31850 + }, + { + "epoch": 2.66, + "grad_norm": 3.082428216934204, + "learning_rate": 1.8272463768115945e-06, + "loss": 1.6814, + "step": 31860 + }, + { + "epoch": 2.66, + "grad_norm": 4.891972064971924, + "learning_rate": 1.821449275362319e-06, + "loss": 1.5985, + "step": 31870 + }, + { + "epoch": 2.66, + "grad_norm": 10.067206382751465, + "learning_rate": 1.8156521739130436e-06, + "loss": 1.411, + "step": 31880 + }, + { + "epoch": 2.66, + "grad_norm": 7.1874613761901855, + "learning_rate": 1.8098550724637682e-06, + "loss": 1.5593, + "step": 31890 + }, + { + "epoch": 2.66, + "grad_norm": 2.310758590698242, + "learning_rate": 1.8040579710144929e-06, + "loss": 1.5513, + "step": 31900 + }, + { + "epoch": 2.66, + "grad_norm": 3.2715983390808105, + "learning_rate": 1.7982608695652177e-06, + "loss": 1.7019, + "step": 31910 + }, + { + "epoch": 2.66, + "grad_norm": 8.090458869934082, + "learning_rate": 1.7924637681159421e-06, + "loss": 1.6117, + "step": 31920 + }, + { + "epoch": 2.66, + "grad_norm": 12.502359390258789, + "learning_rate": 1.7866666666666668e-06, + "loss": 1.4899, + "step": 31930 + }, + { + "epoch": 2.66, + "grad_norm": 13.158916473388672, + "learning_rate": 1.7808695652173916e-06, + "loss": 1.7233, + "step": 31940 + }, + { + "epoch": 2.66, + "grad_norm": 15.319182395935059, + "learning_rate": 1.775072463768116e-06, + "loss": 1.5413, + "step": 31950 + }, + { + "epoch": 2.66, + "grad_norm": 4.916179180145264, + "learning_rate": 1.7692753623188407e-06, + "loss": 1.6006, + "step": 31960 + }, + { + "epoch": 2.66, + "grad_norm": 8.421045303344727, + "learning_rate": 1.7634782608695653e-06, + "loss": 1.5972, + "step": 31970 + }, + { + "epoch": 2.67, + "grad_norm": 2.742832660675049, + "learning_rate": 1.75768115942029e-06, + "loss": 1.6625, + "step": 31980 + }, + { + "epoch": 2.67, + "grad_norm": 1.5600188970565796, + "learning_rate": 1.7518840579710148e-06, + "loss": 1.647, + "step": 31990 + }, + { + "epoch": 2.67, + "grad_norm": 3.127601385116577, + "learning_rate": 1.7460869565217393e-06, + "loss": 1.5884, + "step": 32000 + }, + { + "epoch": 2.67, + "eval_loss": 1.6017138957977295, + "eval_runtime": 107.5185, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 32000 + }, + { + "epoch": 2.67, + "grad_norm": 8.60046672821045, + "learning_rate": 1.740289855072464e-06, + "loss": 1.5752, + "step": 32010 + }, + { + "epoch": 2.67, + "grad_norm": 14.514521598815918, + "learning_rate": 1.7344927536231883e-06, + "loss": 1.6261, + "step": 32020 + }, + { + "epoch": 2.67, + "grad_norm": 8.017278671264648, + "learning_rate": 1.7286956521739132e-06, + "loss": 1.6809, + "step": 32030 + }, + { + "epoch": 2.67, + "grad_norm": 1.6506872177124023, + "learning_rate": 1.7228985507246378e-06, + "loss": 1.6235, + "step": 32040 + }, + { + "epoch": 2.67, + "grad_norm": 5.44380521774292, + "learning_rate": 1.7171014492753625e-06, + "loss": 1.6831, + "step": 32050 + }, + { + "epoch": 2.67, + "grad_norm": 6.318562984466553, + "learning_rate": 1.7113043478260871e-06, + "loss": 1.726, + "step": 32060 + }, + { + "epoch": 2.67, + "grad_norm": 2.484785556793213, + "learning_rate": 1.705507246376812e-06, + "loss": 1.63, + "step": 32070 + }, + { + "epoch": 2.67, + "grad_norm": 3.5618247985839844, + "learning_rate": 1.6997101449275364e-06, + "loss": 1.6426, + "step": 32080 + }, + { + "epoch": 2.67, + "grad_norm": 1.5634952783584595, + "learning_rate": 1.693913043478261e-06, + "loss": 1.506, + "step": 32090 + }, + { + "epoch": 2.67, + "grad_norm": 5.788817882537842, + "learning_rate": 1.6881159420289855e-06, + "loss": 1.6246, + "step": 32100 + }, + { + "epoch": 2.68, + "grad_norm": 7.975259780883789, + "learning_rate": 1.6823188405797103e-06, + "loss": 1.5248, + "step": 32110 + }, + { + "epoch": 2.68, + "grad_norm": 8.401017189025879, + "learning_rate": 1.676521739130435e-06, + "loss": 1.5491, + "step": 32120 + }, + { + "epoch": 2.68, + "grad_norm": 1.3177763223648071, + "learning_rate": 1.6707246376811596e-06, + "loss": 1.6666, + "step": 32130 + }, + { + "epoch": 2.68, + "grad_norm": 3.550060749053955, + "learning_rate": 1.6649275362318842e-06, + "loss": 1.4496, + "step": 32140 + }, + { + "epoch": 2.68, + "grad_norm": 4.780557155609131, + "learning_rate": 1.6591304347826087e-06, + "loss": 1.5778, + "step": 32150 + }, + { + "epoch": 2.68, + "grad_norm": 1.2793478965759277, + "learning_rate": 1.6533333333333335e-06, + "loss": 1.7047, + "step": 32160 + }, + { + "epoch": 2.68, + "grad_norm": 2.5238006114959717, + "learning_rate": 1.6475362318840582e-06, + "loss": 1.6762, + "step": 32170 + }, + { + "epoch": 2.68, + "grad_norm": 5.78254508972168, + "learning_rate": 1.6417391304347826e-06, + "loss": 1.473, + "step": 32180 + }, + { + "epoch": 2.68, + "grad_norm": 3.1378421783447266, + "learning_rate": 1.6359420289855074e-06, + "loss": 1.5478, + "step": 32190 + }, + { + "epoch": 2.68, + "grad_norm": 4.800199508666992, + "learning_rate": 1.6301449275362319e-06, + "loss": 1.7282, + "step": 32200 + }, + { + "epoch": 2.68, + "grad_norm": 11.791644096374512, + "learning_rate": 1.6243478260869565e-06, + "loss": 1.6786, + "step": 32210 + }, + { + "epoch": 2.69, + "grad_norm": 6.185345649719238, + "learning_rate": 1.6185507246376814e-06, + "loss": 1.5419, + "step": 32220 + }, + { + "epoch": 2.69, + "grad_norm": 4.825332164764404, + "learning_rate": 1.6127536231884058e-06, + "loss": 1.7859, + "step": 32230 + }, + { + "epoch": 2.69, + "grad_norm": 2.289442300796509, + "learning_rate": 1.6069565217391307e-06, + "loss": 1.7831, + "step": 32240 + }, + { + "epoch": 2.69, + "grad_norm": 11.469537734985352, + "learning_rate": 1.6011594202898553e-06, + "loss": 1.591, + "step": 32250 + }, + { + "epoch": 2.69, + "grad_norm": 2.8224618434906006, + "learning_rate": 1.5953623188405797e-06, + "loss": 1.7497, + "step": 32260 + }, + { + "epoch": 2.69, + "grad_norm": 3.616290807723999, + "learning_rate": 1.5895652173913046e-06, + "loss": 1.6872, + "step": 32270 + }, + { + "epoch": 2.69, + "grad_norm": 1.3341703414916992, + "learning_rate": 1.583768115942029e-06, + "loss": 1.593, + "step": 32280 + }, + { + "epoch": 2.69, + "grad_norm": 8.595649719238281, + "learning_rate": 1.5779710144927536e-06, + "loss": 1.5657, + "step": 32290 + }, + { + "epoch": 2.69, + "grad_norm": 2.221670389175415, + "learning_rate": 1.5721739130434785e-06, + "loss": 1.6259, + "step": 32300 + }, + { + "epoch": 2.69, + "grad_norm": 2.5493180751800537, + "learning_rate": 1.566376811594203e-06, + "loss": 1.6628, + "step": 32310 + }, + { + "epoch": 2.69, + "grad_norm": 1.4357565641403198, + "learning_rate": 1.5605797101449278e-06, + "loss": 1.5747, + "step": 32320 + }, + { + "epoch": 2.69, + "grad_norm": 16.11993408203125, + "learning_rate": 1.5547826086956522e-06, + "loss": 1.4607, + "step": 32330 + }, + { + "epoch": 2.69, + "grad_norm": 2.597362995147705, + "learning_rate": 1.5489855072463769e-06, + "loss": 1.5912, + "step": 32340 + }, + { + "epoch": 2.7, + "grad_norm": 4.872979164123535, + "learning_rate": 1.5431884057971017e-06, + "loss": 1.4533, + "step": 32350 + }, + { + "epoch": 2.7, + "grad_norm": 3.2023985385894775, + "learning_rate": 1.5373913043478261e-06, + "loss": 1.5058, + "step": 32360 + }, + { + "epoch": 2.7, + "grad_norm": 4.435876369476318, + "learning_rate": 1.5315942028985508e-06, + "loss": 1.5454, + "step": 32370 + }, + { + "epoch": 2.7, + "grad_norm": 2.7985036373138428, + "learning_rate": 1.5257971014492756e-06, + "loss": 1.6741, + "step": 32380 + }, + { + "epoch": 2.7, + "grad_norm": 6.3253607749938965, + "learning_rate": 1.52e-06, + "loss": 1.7431, + "step": 32390 + }, + { + "epoch": 2.7, + "grad_norm": 1.4724886417388916, + "learning_rate": 1.514202898550725e-06, + "loss": 1.6594, + "step": 32400 + }, + { + "epoch": 2.7, + "grad_norm": 2.5811171531677246, + "learning_rate": 1.5084057971014493e-06, + "loss": 1.5847, + "step": 32410 + }, + { + "epoch": 2.7, + "grad_norm": 9.18812084197998, + "learning_rate": 1.502608695652174e-06, + "loss": 1.6295, + "step": 32420 + }, + { + "epoch": 2.7, + "grad_norm": 9.978080749511719, + "learning_rate": 1.4968115942028988e-06, + "loss": 1.5189, + "step": 32430 + }, + { + "epoch": 2.7, + "grad_norm": 2.765017509460449, + "learning_rate": 1.4910144927536233e-06, + "loss": 1.6167, + "step": 32440 + }, + { + "epoch": 2.7, + "grad_norm": 3.3332769870758057, + "learning_rate": 1.485217391304348e-06, + "loss": 1.6536, + "step": 32450 + }, + { + "epoch": 2.71, + "grad_norm": 7.815036773681641, + "learning_rate": 1.4794202898550725e-06, + "loss": 1.6844, + "step": 32460 + }, + { + "epoch": 2.71, + "grad_norm": 8.239996910095215, + "learning_rate": 1.4736231884057972e-06, + "loss": 1.6551, + "step": 32470 + }, + { + "epoch": 2.71, + "grad_norm": 10.380647659301758, + "learning_rate": 1.467826086956522e-06, + "loss": 1.6455, + "step": 32480 + }, + { + "epoch": 2.71, + "grad_norm": 14.243300437927246, + "learning_rate": 1.4620289855072465e-06, + "loss": 1.6603, + "step": 32490 + }, + { + "epoch": 2.71, + "grad_norm": 2.9138994216918945, + "learning_rate": 1.4562318840579711e-06, + "loss": 1.56, + "step": 32500 + }, + { + "epoch": 2.71, + "eval_loss": 1.6212100982666016, + "eval_runtime": 107.5288, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 32500 + }, + { + "epoch": 2.71, + "grad_norm": 10.24506664276123, + "learning_rate": 1.4504347826086955e-06, + "loss": 1.6814, + "step": 32510 + }, + { + "epoch": 2.71, + "grad_norm": 12.093935012817383, + "learning_rate": 1.4446376811594204e-06, + "loss": 1.611, + "step": 32520 + }, + { + "epoch": 2.71, + "grad_norm": 5.0004658699035645, + "learning_rate": 1.438840579710145e-06, + "loss": 1.5318, + "step": 32530 + }, + { + "epoch": 2.71, + "grad_norm": 3.960986614227295, + "learning_rate": 1.4330434782608697e-06, + "loss": 1.6179, + "step": 32540 + }, + { + "epoch": 2.71, + "grad_norm": 3.640979290008545, + "learning_rate": 1.4272463768115943e-06, + "loss": 1.6522, + "step": 32550 + }, + { + "epoch": 2.71, + "grad_norm": 3.157996892929077, + "learning_rate": 1.4214492753623192e-06, + "loss": 1.6118, + "step": 32560 + }, + { + "epoch": 2.71, + "grad_norm": 7.887393474578857, + "learning_rate": 1.4156521739130436e-06, + "loss": 1.6207, + "step": 32570 + }, + { + "epoch": 2.71, + "grad_norm": 11.028023719787598, + "learning_rate": 1.4098550724637682e-06, + "loss": 1.5071, + "step": 32580 + }, + { + "epoch": 2.72, + "grad_norm": 2.2174723148345947, + "learning_rate": 1.4040579710144927e-06, + "loss": 1.6467, + "step": 32590 + }, + { + "epoch": 2.72, + "grad_norm": 2.945136785507202, + "learning_rate": 1.3982608695652175e-06, + "loss": 1.6025, + "step": 32600 + }, + { + "epoch": 2.72, + "grad_norm": 2.605086326599121, + "learning_rate": 1.3924637681159422e-06, + "loss": 1.5723, + "step": 32610 + }, + { + "epoch": 2.72, + "grad_norm": 1.6031261682510376, + "learning_rate": 1.3866666666666668e-06, + "loss": 1.6667, + "step": 32620 + }, + { + "epoch": 2.72, + "grad_norm": 10.576859474182129, + "learning_rate": 1.3808695652173914e-06, + "loss": 1.5214, + "step": 32630 + }, + { + "epoch": 2.72, + "grad_norm": 3.951624870300293, + "learning_rate": 1.3750724637681159e-06, + "loss": 1.4939, + "step": 32640 + }, + { + "epoch": 2.72, + "grad_norm": 7.097296714782715, + "learning_rate": 1.3692753623188407e-06, + "loss": 1.4366, + "step": 32650 + }, + { + "epoch": 2.72, + "grad_norm": 2.388517379760742, + "learning_rate": 1.3634782608695654e-06, + "loss": 1.5237, + "step": 32660 + }, + { + "epoch": 2.72, + "grad_norm": 7.370208263397217, + "learning_rate": 1.3576811594202898e-06, + "loss": 1.7699, + "step": 32670 + }, + { + "epoch": 2.72, + "grad_norm": 3.332535982131958, + "learning_rate": 1.3518840579710146e-06, + "loss": 1.6375, + "step": 32680 + }, + { + "epoch": 2.72, + "grad_norm": 1.344327449798584, + "learning_rate": 1.3460869565217393e-06, + "loss": 1.4128, + "step": 32690 + }, + { + "epoch": 2.73, + "grad_norm": 9.12246036529541, + "learning_rate": 1.340289855072464e-06, + "loss": 1.4211, + "step": 32700 + }, + { + "epoch": 2.73, + "grad_norm": 2.78950572013855, + "learning_rate": 1.3344927536231886e-06, + "loss": 1.6012, + "step": 32710 + }, + { + "epoch": 2.73, + "grad_norm": 3.5925843715667725, + "learning_rate": 1.328695652173913e-06, + "loss": 1.5014, + "step": 32720 + }, + { + "epoch": 2.73, + "grad_norm": 1.6471080780029297, + "learning_rate": 1.3228985507246379e-06, + "loss": 1.6523, + "step": 32730 + }, + { + "epoch": 2.73, + "grad_norm": 12.4561128616333, + "learning_rate": 1.3171014492753625e-06, + "loss": 1.6558, + "step": 32740 + }, + { + "epoch": 2.73, + "grad_norm": 2.051928997039795, + "learning_rate": 1.311304347826087e-06, + "loss": 1.5797, + "step": 32750 + }, + { + "epoch": 2.73, + "grad_norm": 6.791071891784668, + "learning_rate": 1.3055072463768118e-06, + "loss": 1.5058, + "step": 32760 + }, + { + "epoch": 2.73, + "grad_norm": 4.921985149383545, + "learning_rate": 1.2997101449275362e-06, + "loss": 1.727, + "step": 32770 + }, + { + "epoch": 2.73, + "grad_norm": 3.559007406234741, + "learning_rate": 1.293913043478261e-06, + "loss": 1.5479, + "step": 32780 + }, + { + "epoch": 2.73, + "grad_norm": 7.684724807739258, + "learning_rate": 1.2881159420289857e-06, + "loss": 1.5711, + "step": 32790 + }, + { + "epoch": 2.73, + "grad_norm": 6.558244228363037, + "learning_rate": 1.2823188405797101e-06, + "loss": 1.5848, + "step": 32800 + }, + { + "epoch": 2.73, + "grad_norm": 4.056134223937988, + "learning_rate": 1.276521739130435e-06, + "loss": 1.6693, + "step": 32810 + }, + { + "epoch": 2.73, + "grad_norm": 16.4197998046875, + "learning_rate": 1.2707246376811594e-06, + "loss": 1.5769, + "step": 32820 + }, + { + "epoch": 2.74, + "grad_norm": 4.05332612991333, + "learning_rate": 1.264927536231884e-06, + "loss": 1.6521, + "step": 32830 + }, + { + "epoch": 2.74, + "grad_norm": 10.956727027893066, + "learning_rate": 1.259130434782609e-06, + "loss": 1.596, + "step": 32840 + }, + { + "epoch": 2.74, + "grad_norm": 4.413317680358887, + "learning_rate": 1.2533333333333333e-06, + "loss": 1.496, + "step": 32850 + }, + { + "epoch": 2.74, + "grad_norm": 3.767711639404297, + "learning_rate": 1.2475362318840582e-06, + "loss": 1.5371, + "step": 32860 + }, + { + "epoch": 2.74, + "grad_norm": 5.680372714996338, + "learning_rate": 1.2417391304347826e-06, + "loss": 1.531, + "step": 32870 + }, + { + "epoch": 2.74, + "grad_norm": 1.4916908740997314, + "learning_rate": 1.2359420289855073e-06, + "loss": 1.6898, + "step": 32880 + }, + { + "epoch": 2.74, + "grad_norm": 9.154963493347168, + "learning_rate": 1.2301449275362321e-06, + "loss": 1.4688, + "step": 32890 + }, + { + "epoch": 2.74, + "grad_norm": 2.380986213684082, + "learning_rate": 1.2243478260869567e-06, + "loss": 1.7119, + "step": 32900 + }, + { + "epoch": 2.74, + "grad_norm": 10.677800178527832, + "learning_rate": 1.2185507246376812e-06, + "loss": 1.616, + "step": 32910 + }, + { + "epoch": 2.74, + "grad_norm": 7.30240535736084, + "learning_rate": 1.2127536231884058e-06, + "loss": 1.5754, + "step": 32920 + }, + { + "epoch": 2.74, + "grad_norm": 9.45409870147705, + "learning_rate": 1.2069565217391305e-06, + "loss": 1.6461, + "step": 32930 + }, + { + "epoch": 2.75, + "grad_norm": 6.102117538452148, + "learning_rate": 1.2011594202898553e-06, + "loss": 1.4394, + "step": 32940 + }, + { + "epoch": 2.75, + "grad_norm": 6.18507194519043, + "learning_rate": 1.1953623188405797e-06, + "loss": 1.6139, + "step": 32950 + }, + { + "epoch": 2.75, + "grad_norm": 7.034895896911621, + "learning_rate": 1.1895652173913044e-06, + "loss": 1.571, + "step": 32960 + }, + { + "epoch": 2.75, + "grad_norm": 1.3450069427490234, + "learning_rate": 1.183768115942029e-06, + "loss": 1.4893, + "step": 32970 + }, + { + "epoch": 2.75, + "grad_norm": 4.201149940490723, + "learning_rate": 1.1779710144927539e-06, + "loss": 1.4029, + "step": 32980 + }, + { + "epoch": 2.75, + "grad_norm": 1.9617148637771606, + "learning_rate": 1.1721739130434783e-06, + "loss": 1.6637, + "step": 32990 + }, + { + "epoch": 2.75, + "grad_norm": 1.6870609521865845, + "learning_rate": 1.166376811594203e-06, + "loss": 1.5653, + "step": 33000 + }, + { + "epoch": 2.75, + "eval_loss": 1.617643117904663, + "eval_runtime": 107.5322, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 33000 + }, + { + "epoch": 2.75, + "grad_norm": 2.035808563232422, + "learning_rate": 1.1605797101449276e-06, + "loss": 1.5895, + "step": 33010 + }, + { + "epoch": 2.75, + "grad_norm": 3.8778083324432373, + "learning_rate": 1.1547826086956522e-06, + "loss": 1.5143, + "step": 33020 + }, + { + "epoch": 2.75, + "grad_norm": 2.143531322479248, + "learning_rate": 1.1489855072463769e-06, + "loss": 1.5878, + "step": 33030 + }, + { + "epoch": 2.75, + "grad_norm": 2.5766236782073975, + "learning_rate": 1.1431884057971015e-06, + "loss": 1.5166, + "step": 33040 + }, + { + "epoch": 2.75, + "grad_norm": 3.0880119800567627, + "learning_rate": 1.1373913043478262e-06, + "loss": 1.5383, + "step": 33050 + }, + { + "epoch": 2.75, + "grad_norm": 6.4174723625183105, + "learning_rate": 1.1315942028985508e-06, + "loss": 1.6729, + "step": 33060 + }, + { + "epoch": 2.76, + "grad_norm": 2.1581320762634277, + "learning_rate": 1.1257971014492754e-06, + "loss": 1.7279, + "step": 33070 + }, + { + "epoch": 2.76, + "grad_norm": 4.984561443328857, + "learning_rate": 1.12e-06, + "loss": 1.651, + "step": 33080 + }, + { + "epoch": 2.76, + "grad_norm": 7.201277732849121, + "learning_rate": 1.1142028985507247e-06, + "loss": 1.6384, + "step": 33090 + }, + { + "epoch": 2.76, + "grad_norm": 2.131202459335327, + "learning_rate": 1.1084057971014494e-06, + "loss": 1.5721, + "step": 33100 + }, + { + "epoch": 2.76, + "grad_norm": 10.104864120483398, + "learning_rate": 1.102608695652174e-06, + "loss": 1.7424, + "step": 33110 + }, + { + "epoch": 2.76, + "grad_norm": 6.0055012702941895, + "learning_rate": 1.0968115942028986e-06, + "loss": 1.5323, + "step": 33120 + }, + { + "epoch": 2.76, + "grad_norm": 5.302622318267822, + "learning_rate": 1.0910144927536233e-06, + "loss": 1.4631, + "step": 33130 + }, + { + "epoch": 2.76, + "grad_norm": 7.973222732543945, + "learning_rate": 1.085217391304348e-06, + "loss": 1.7163, + "step": 33140 + }, + { + "epoch": 2.76, + "grad_norm": 9.29218578338623, + "learning_rate": 1.0794202898550726e-06, + "loss": 1.5899, + "step": 33150 + }, + { + "epoch": 2.76, + "grad_norm": 9.235881805419922, + "learning_rate": 1.0736231884057972e-06, + "loss": 1.612, + "step": 33160 + }, + { + "epoch": 2.76, + "grad_norm": 11.965916633605957, + "learning_rate": 1.0678260869565218e-06, + "loss": 1.602, + "step": 33170 + }, + { + "epoch": 2.77, + "grad_norm": 10.200899124145508, + "learning_rate": 1.0620289855072465e-06, + "loss": 1.5964, + "step": 33180 + }, + { + "epoch": 2.77, + "grad_norm": 10.006152153015137, + "learning_rate": 1.0562318840579711e-06, + "loss": 1.541, + "step": 33190 + }, + { + "epoch": 2.77, + "grad_norm": 2.9720447063446045, + "learning_rate": 1.0504347826086958e-06, + "loss": 1.5875, + "step": 33200 + }, + { + "epoch": 2.77, + "grad_norm": 2.2912840843200684, + "learning_rate": 1.0446376811594204e-06, + "loss": 1.5263, + "step": 33210 + }, + { + "epoch": 2.77, + "grad_norm": 2.361422538757324, + "learning_rate": 1.038840579710145e-06, + "loss": 1.6695, + "step": 33220 + }, + { + "epoch": 2.77, + "grad_norm": 1.8953531980514526, + "learning_rate": 1.0330434782608697e-06, + "loss": 1.6833, + "step": 33230 + }, + { + "epoch": 2.77, + "grad_norm": 4.783647537231445, + "learning_rate": 1.0272463768115941e-06, + "loss": 1.483, + "step": 33240 + }, + { + "epoch": 2.77, + "grad_norm": 4.266748905181885, + "learning_rate": 1.021449275362319e-06, + "loss": 1.5473, + "step": 33250 + }, + { + "epoch": 2.77, + "grad_norm": 5.104844093322754, + "learning_rate": 1.0156521739130436e-06, + "loss": 1.5915, + "step": 33260 + }, + { + "epoch": 2.77, + "grad_norm": 2.9186527729034424, + "learning_rate": 1.0098550724637683e-06, + "loss": 1.6114, + "step": 33270 + }, + { + "epoch": 2.77, + "grad_norm": 7.935939311981201, + "learning_rate": 1.0040579710144927e-06, + "loss": 1.4601, + "step": 33280 + }, + { + "epoch": 2.77, + "grad_norm": 5.332785129547119, + "learning_rate": 9.982608695652175e-07, + "loss": 1.7318, + "step": 33290 + }, + { + "epoch": 2.77, + "grad_norm": 7.0045485496521, + "learning_rate": 9.924637681159422e-07, + "loss": 1.5863, + "step": 33300 + }, + { + "epoch": 2.78, + "grad_norm": 10.329780578613281, + "learning_rate": 9.866666666666668e-07, + "loss": 1.5792, + "step": 33310 + }, + { + "epoch": 2.78, + "grad_norm": 3.1837329864501953, + "learning_rate": 9.808695652173912e-07, + "loss": 1.4957, + "step": 33320 + }, + { + "epoch": 2.78, + "grad_norm": 14.454916954040527, + "learning_rate": 9.750724637681159e-07, + "loss": 1.6237, + "step": 33330 + }, + { + "epoch": 2.78, + "grad_norm": 2.9528446197509766, + "learning_rate": 9.692753623188407e-07, + "loss": 1.6035, + "step": 33340 + }, + { + "epoch": 2.78, + "grad_norm": 2.9662082195281982, + "learning_rate": 9.634782608695654e-07, + "loss": 1.5402, + "step": 33350 + }, + { + "epoch": 2.78, + "grad_norm": 4.538402557373047, + "learning_rate": 9.576811594202898e-07, + "loss": 1.6336, + "step": 33360 + }, + { + "epoch": 2.78, + "grad_norm": 5.312148571014404, + "learning_rate": 9.518840579710146e-07, + "loss": 1.5634, + "step": 33370 + }, + { + "epoch": 2.78, + "grad_norm": 4.3434953689575195, + "learning_rate": 9.460869565217393e-07, + "loss": 1.587, + "step": 33380 + }, + { + "epoch": 2.78, + "grad_norm": 4.069890975952148, + "learning_rate": 9.402898550724638e-07, + "loss": 1.5212, + "step": 33390 + }, + { + "epoch": 2.78, + "grad_norm": 19.578651428222656, + "learning_rate": 9.344927536231885e-07, + "loss": 1.6608, + "step": 33400 + }, + { + "epoch": 2.78, + "grad_norm": 5.462759494781494, + "learning_rate": 9.286956521739131e-07, + "loss": 1.6739, + "step": 33410 + }, + { + "epoch": 2.79, + "grad_norm": 6.227196216583252, + "learning_rate": 9.228985507246377e-07, + "loss": 1.6326, + "step": 33420 + }, + { + "epoch": 2.79, + "grad_norm": 2.722791910171509, + "learning_rate": 9.171014492753624e-07, + "loss": 1.6126, + "step": 33430 + }, + { + "epoch": 2.79, + "grad_norm": 15.881592750549316, + "learning_rate": 9.11304347826087e-07, + "loss": 1.7239, + "step": 33440 + }, + { + "epoch": 2.79, + "grad_norm": 6.390254020690918, + "learning_rate": 9.055072463768117e-07, + "loss": 1.6419, + "step": 33450 + }, + { + "epoch": 2.79, + "grad_norm": 3.134054660797119, + "learning_rate": 8.997101449275362e-07, + "loss": 1.6066, + "step": 33460 + }, + { + "epoch": 2.79, + "grad_norm": 5.273614406585693, + "learning_rate": 8.93913043478261e-07, + "loss": 1.6421, + "step": 33470 + }, + { + "epoch": 2.79, + "grad_norm": 4.4810967445373535, + "learning_rate": 8.881159420289856e-07, + "loss": 1.705, + "step": 33480 + }, + { + "epoch": 2.79, + "grad_norm": 7.226357460021973, + "learning_rate": 8.823188405797103e-07, + "loss": 1.5615, + "step": 33490 + }, + { + "epoch": 2.79, + "grad_norm": 3.2726452350616455, + "learning_rate": 8.765217391304348e-07, + "loss": 1.713, + "step": 33500 + }, + { + "epoch": 2.79, + "eval_loss": 1.6301745176315308, + "eval_runtime": 107.5184, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 33500 + }, + { + "epoch": 2.79, + "grad_norm": 9.75317096710205, + "learning_rate": 8.707246376811595e-07, + "loss": 1.6305, + "step": 33510 + }, + { + "epoch": 2.79, + "grad_norm": 8.24269962310791, + "learning_rate": 8.649275362318842e-07, + "loss": 1.6059, + "step": 33520 + }, + { + "epoch": 2.79, + "grad_norm": 7.880953788757324, + "learning_rate": 8.591304347826088e-07, + "loss": 1.6878, + "step": 33530 + }, + { + "epoch": 2.79, + "grad_norm": 3.4583754539489746, + "learning_rate": 8.533333333333334e-07, + "loss": 1.7179, + "step": 33540 + }, + { + "epoch": 2.8, + "grad_norm": 5.8554558753967285, + "learning_rate": 8.47536231884058e-07, + "loss": 1.535, + "step": 33550 + }, + { + "epoch": 2.8, + "grad_norm": 5.77476167678833, + "learning_rate": 8.417391304347827e-07, + "loss": 1.6241, + "step": 33560 + }, + { + "epoch": 2.8, + "grad_norm": 1.786972999572754, + "learning_rate": 8.359420289855074e-07, + "loss": 1.6576, + "step": 33570 + }, + { + "epoch": 2.8, + "grad_norm": 3.2922704219818115, + "learning_rate": 8.301449275362319e-07, + "loss": 1.7354, + "step": 33580 + }, + { + "epoch": 2.8, + "grad_norm": 10.593795776367188, + "learning_rate": 8.243478260869566e-07, + "loss": 1.5815, + "step": 33590 + }, + { + "epoch": 2.8, + "grad_norm": 3.0302324295043945, + "learning_rate": 8.185507246376813e-07, + "loss": 1.6433, + "step": 33600 + }, + { + "epoch": 2.8, + "grad_norm": 10.300827026367188, + "learning_rate": 8.127536231884059e-07, + "loss": 1.613, + "step": 33610 + }, + { + "epoch": 2.8, + "grad_norm": 4.340428352355957, + "learning_rate": 8.069565217391305e-07, + "loss": 1.5346, + "step": 33620 + }, + { + "epoch": 2.8, + "grad_norm": 6.048191547393799, + "learning_rate": 8.011594202898551e-07, + "loss": 1.5197, + "step": 33630 + }, + { + "epoch": 2.8, + "grad_norm": 1.3901071548461914, + "learning_rate": 7.953623188405798e-07, + "loss": 1.4772, + "step": 33640 + }, + { + "epoch": 2.8, + "grad_norm": 2.204463481903076, + "learning_rate": 7.895652173913045e-07, + "loss": 1.6636, + "step": 33650 + }, + { + "epoch": 2.81, + "grad_norm": 3.664523124694824, + "learning_rate": 7.83768115942029e-07, + "loss": 1.5472, + "step": 33660 + }, + { + "epoch": 2.81, + "grad_norm": 6.5395827293396, + "learning_rate": 7.779710144927537e-07, + "loss": 1.6717, + "step": 33670 + }, + { + "epoch": 2.81, + "grad_norm": 4.011569976806641, + "learning_rate": 7.721739130434783e-07, + "loss": 1.7751, + "step": 33680 + }, + { + "epoch": 2.81, + "grad_norm": 6.7899556159973145, + "learning_rate": 7.66376811594203e-07, + "loss": 1.7148, + "step": 33690 + }, + { + "epoch": 2.81, + "grad_norm": 5.4738054275512695, + "learning_rate": 7.605797101449276e-07, + "loss": 1.4787, + "step": 33700 + }, + { + "epoch": 2.81, + "grad_norm": 8.042850494384766, + "learning_rate": 7.547826086956522e-07, + "loss": 1.675, + "step": 33710 + }, + { + "epoch": 2.81, + "grad_norm": 13.439699172973633, + "learning_rate": 7.489855072463768e-07, + "loss": 1.6719, + "step": 33720 + }, + { + "epoch": 2.81, + "grad_norm": 3.596892833709717, + "learning_rate": 7.431884057971014e-07, + "loss": 1.6386, + "step": 33730 + }, + { + "epoch": 2.81, + "grad_norm": 4.695061683654785, + "learning_rate": 7.373913043478262e-07, + "loss": 1.6444, + "step": 33740 + }, + { + "epoch": 2.81, + "grad_norm": 3.027362108230591, + "learning_rate": 7.315942028985508e-07, + "loss": 1.6984, + "step": 33750 + }, + { + "epoch": 2.81, + "grad_norm": 1.8642385005950928, + "learning_rate": 7.257971014492753e-07, + "loss": 1.5937, + "step": 33760 + }, + { + "epoch": 2.81, + "grad_norm": 2.800645112991333, + "learning_rate": 7.2e-07, + "loss": 1.5741, + "step": 33770 + }, + { + "epoch": 2.81, + "grad_norm": 10.626157760620117, + "learning_rate": 7.142028985507247e-07, + "loss": 1.5336, + "step": 33780 + }, + { + "epoch": 2.82, + "grad_norm": 9.157116889953613, + "learning_rate": 7.084057971014494e-07, + "loss": 1.5535, + "step": 33790 + }, + { + "epoch": 2.82, + "grad_norm": 7.302629470825195, + "learning_rate": 7.026086956521739e-07, + "loss": 1.5688, + "step": 33800 + }, + { + "epoch": 2.82, + "grad_norm": 3.5727436542510986, + "learning_rate": 6.968115942028986e-07, + "loss": 1.6339, + "step": 33810 + }, + { + "epoch": 2.82, + "grad_norm": 16.979459762573242, + "learning_rate": 6.910144927536233e-07, + "loss": 1.5552, + "step": 33820 + }, + { + "epoch": 2.82, + "grad_norm": 3.843989133834839, + "learning_rate": 6.852173913043479e-07, + "loss": 1.5359, + "step": 33830 + }, + { + "epoch": 2.82, + "grad_norm": 2.522932767868042, + "learning_rate": 6.794202898550725e-07, + "loss": 1.7011, + "step": 33840 + }, + { + "epoch": 2.82, + "grad_norm": 5.601334095001221, + "learning_rate": 6.736231884057971e-07, + "loss": 1.6746, + "step": 33850 + }, + { + "epoch": 2.82, + "grad_norm": 3.354949712753296, + "learning_rate": 6.678260869565218e-07, + "loss": 1.589, + "step": 33860 + }, + { + "epoch": 2.82, + "grad_norm": 1.3252222537994385, + "learning_rate": 6.620289855072465e-07, + "loss": 1.5887, + "step": 33870 + }, + { + "epoch": 2.82, + "grad_norm": 3.7831883430480957, + "learning_rate": 6.56231884057971e-07, + "loss": 1.5805, + "step": 33880 + }, + { + "epoch": 2.82, + "grad_norm": 8.868619918823242, + "learning_rate": 6.504347826086957e-07, + "loss": 1.6854, + "step": 33890 + }, + { + "epoch": 2.83, + "grad_norm": 7.663196563720703, + "learning_rate": 6.446376811594203e-07, + "loss": 1.7296, + "step": 33900 + }, + { + "epoch": 2.83, + "grad_norm": 5.567215919494629, + "learning_rate": 6.388405797101451e-07, + "loss": 1.5271, + "step": 33910 + }, + { + "epoch": 2.83, + "grad_norm": 5.213868141174316, + "learning_rate": 6.330434782608696e-07, + "loss": 1.4958, + "step": 33920 + }, + { + "epoch": 2.83, + "grad_norm": 2.9527573585510254, + "learning_rate": 6.272463768115942e-07, + "loss": 1.5046, + "step": 33930 + }, + { + "epoch": 2.83, + "grad_norm": 5.8709187507629395, + "learning_rate": 6.214492753623189e-07, + "loss": 1.5993, + "step": 33940 + }, + { + "epoch": 2.83, + "grad_norm": 17.194610595703125, + "learning_rate": 6.156521739130435e-07, + "loss": 1.6132, + "step": 33950 + }, + { + "epoch": 2.83, + "grad_norm": 6.302470684051514, + "learning_rate": 6.098550724637682e-07, + "loss": 1.6321, + "step": 33960 + }, + { + "epoch": 2.83, + "grad_norm": 1.782871127128601, + "learning_rate": 6.040579710144928e-07, + "loss": 1.4957, + "step": 33970 + }, + { + "epoch": 2.83, + "grad_norm": 4.105221748352051, + "learning_rate": 5.982608695652174e-07, + "loss": 1.6038, + "step": 33980 + }, + { + "epoch": 2.83, + "grad_norm": 3.2038047313690186, + "learning_rate": 5.924637681159421e-07, + "loss": 1.454, + "step": 33990 + }, + { + "epoch": 2.83, + "grad_norm": 11.589030265808105, + "learning_rate": 5.866666666666667e-07, + "loss": 1.583, + "step": 34000 + }, + { + "epoch": 2.83, + "eval_loss": 1.6165950298309326, + "eval_runtime": 107.5234, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 34000 + }, + { + "epoch": 2.83, + "grad_norm": 5.653379917144775, + "learning_rate": 5.808695652173914e-07, + "loss": 1.5756, + "step": 34010 + }, + { + "epoch": 2.83, + "grad_norm": 4.994700908660889, + "learning_rate": 5.75072463768116e-07, + "loss": 1.6044, + "step": 34020 + }, + { + "epoch": 2.84, + "grad_norm": 5.27285099029541, + "learning_rate": 5.692753623188407e-07, + "loss": 1.5561, + "step": 34030 + }, + { + "epoch": 2.84, + "grad_norm": 3.3184444904327393, + "learning_rate": 5.634782608695653e-07, + "loss": 1.5874, + "step": 34040 + }, + { + "epoch": 2.84, + "grad_norm": 3.3386433124542236, + "learning_rate": 5.576811594202898e-07, + "loss": 1.6027, + "step": 34050 + }, + { + "epoch": 2.84, + "grad_norm": 9.001840591430664, + "learning_rate": 5.518840579710146e-07, + "loss": 1.5911, + "step": 34060 + }, + { + "epoch": 2.84, + "grad_norm": 11.12852954864502, + "learning_rate": 5.460869565217391e-07, + "loss": 1.5201, + "step": 34070 + }, + { + "epoch": 2.84, + "grad_norm": 5.407654285430908, + "learning_rate": 5.402898550724639e-07, + "loss": 1.5665, + "step": 34080 + }, + { + "epoch": 2.84, + "grad_norm": 4.4106831550598145, + "learning_rate": 5.344927536231884e-07, + "loss": 1.7836, + "step": 34090 + }, + { + "epoch": 2.84, + "grad_norm": 8.831493377685547, + "learning_rate": 5.286956521739131e-07, + "loss": 1.5497, + "step": 34100 + }, + { + "epoch": 2.84, + "grad_norm": 3.7488794326782227, + "learning_rate": 5.228985507246377e-07, + "loss": 1.6616, + "step": 34110 + }, + { + "epoch": 2.84, + "grad_norm": 0.5988351702690125, + "learning_rate": 5.171014492753624e-07, + "loss": 1.4637, + "step": 34120 + }, + { + "epoch": 2.84, + "grad_norm": 7.661751747131348, + "learning_rate": 5.11304347826087e-07, + "loss": 1.6349, + "step": 34130 + }, + { + "epoch": 2.84, + "grad_norm": 1.7503852844238281, + "learning_rate": 5.055072463768116e-07, + "loss": 1.7236, + "step": 34140 + }, + { + "epoch": 2.85, + "grad_norm": 4.78981876373291, + "learning_rate": 4.997101449275362e-07, + "loss": 1.6712, + "step": 34150 + }, + { + "epoch": 2.85, + "grad_norm": 3.7767815589904785, + "learning_rate": 4.939130434782609e-07, + "loss": 1.5735, + "step": 34160 + }, + { + "epoch": 2.85, + "grad_norm": 8.948434829711914, + "learning_rate": 4.881159420289855e-07, + "loss": 1.4895, + "step": 34170 + }, + { + "epoch": 2.85, + "grad_norm": 5.406666278839111, + "learning_rate": 4.823188405797102e-07, + "loss": 1.5253, + "step": 34180 + }, + { + "epoch": 2.85, + "grad_norm": 1.939326524734497, + "learning_rate": 4.7652173913043486e-07, + "loss": 1.6299, + "step": 34190 + }, + { + "epoch": 2.85, + "grad_norm": 12.841042518615723, + "learning_rate": 4.7072463768115945e-07, + "loss": 1.6818, + "step": 34200 + }, + { + "epoch": 2.85, + "grad_norm": 2.7751150131225586, + "learning_rate": 4.6492753623188414e-07, + "loss": 1.7415, + "step": 34210 + }, + { + "epoch": 2.85, + "grad_norm": 5.986341953277588, + "learning_rate": 4.5913043478260873e-07, + "loss": 1.5771, + "step": 34220 + }, + { + "epoch": 2.85, + "grad_norm": 7.021378517150879, + "learning_rate": 4.533333333333334e-07, + "loss": 1.4319, + "step": 34230 + }, + { + "epoch": 2.85, + "grad_norm": 6.669992446899414, + "learning_rate": 4.47536231884058e-07, + "loss": 1.6399, + "step": 34240 + }, + { + "epoch": 2.85, + "grad_norm": 1.224604845046997, + "learning_rate": 4.417391304347826e-07, + "loss": 1.7313, + "step": 34250 + }, + { + "epoch": 2.85, + "grad_norm": 4.330286026000977, + "learning_rate": 4.359420289855073e-07, + "loss": 1.658, + "step": 34260 + }, + { + "epoch": 2.86, + "grad_norm": 1.8469319343566895, + "learning_rate": 4.301449275362319e-07, + "loss": 1.5495, + "step": 34270 + }, + { + "epoch": 2.86, + "grad_norm": 1.5525469779968262, + "learning_rate": 4.243478260869566e-07, + "loss": 1.5885, + "step": 34280 + }, + { + "epoch": 2.86, + "grad_norm": 3.59576416015625, + "learning_rate": 4.1855072463768116e-07, + "loss": 1.6725, + "step": 34290 + }, + { + "epoch": 2.86, + "grad_norm": 14.655567169189453, + "learning_rate": 4.1275362318840586e-07, + "loss": 1.4917, + "step": 34300 + }, + { + "epoch": 2.86, + "grad_norm": 4.645742416381836, + "learning_rate": 4.0695652173913044e-07, + "loss": 1.5779, + "step": 34310 + }, + { + "epoch": 2.86, + "grad_norm": 2.458324670791626, + "learning_rate": 4.0115942028985514e-07, + "loss": 1.6004, + "step": 34320 + }, + { + "epoch": 2.86, + "grad_norm": 4.773531913757324, + "learning_rate": 3.9536231884057973e-07, + "loss": 1.504, + "step": 34330 + }, + { + "epoch": 2.86, + "grad_norm": 4.06483268737793, + "learning_rate": 3.8956521739130437e-07, + "loss": 1.6038, + "step": 34340 + }, + { + "epoch": 2.86, + "grad_norm": 3.4933018684387207, + "learning_rate": 3.83768115942029e-07, + "loss": 1.6455, + "step": 34350 + }, + { + "epoch": 2.86, + "grad_norm": 2.6547389030456543, + "learning_rate": 3.7797101449275365e-07, + "loss": 1.5391, + "step": 34360 + }, + { + "epoch": 2.86, + "grad_norm": 2.740145683288574, + "learning_rate": 3.721739130434783e-07, + "loss": 1.6146, + "step": 34370 + }, + { + "epoch": 2.87, + "grad_norm": 2.028878688812256, + "learning_rate": 3.6637681159420293e-07, + "loss": 1.5921, + "step": 34380 + }, + { + "epoch": 2.87, + "grad_norm": 6.614234447479248, + "learning_rate": 3.6057971014492757e-07, + "loss": 1.5552, + "step": 34390 + }, + { + "epoch": 2.87, + "grad_norm": 2.7156872749328613, + "learning_rate": 3.547826086956522e-07, + "loss": 1.6545, + "step": 34400 + }, + { + "epoch": 2.87, + "grad_norm": 2.235097646713257, + "learning_rate": 3.4898550724637685e-07, + "loss": 1.5333, + "step": 34410 + }, + { + "epoch": 2.87, + "grad_norm": 4.5841474533081055, + "learning_rate": 3.431884057971015e-07, + "loss": 1.6009, + "step": 34420 + }, + { + "epoch": 2.87, + "grad_norm": 3.2173547744750977, + "learning_rate": 3.3739130434782614e-07, + "loss": 1.5737, + "step": 34430 + }, + { + "epoch": 2.87, + "grad_norm": 2.977945327758789, + "learning_rate": 3.315942028985508e-07, + "loss": 1.6099, + "step": 34440 + }, + { + "epoch": 2.87, + "grad_norm": 4.717473983764648, + "learning_rate": 3.2579710144927537e-07, + "loss": 1.6938, + "step": 34450 + }, + { + "epoch": 2.87, + "grad_norm": 3.8826589584350586, + "learning_rate": 3.2e-07, + "loss": 1.5333, + "step": 34460 + }, + { + "epoch": 2.87, + "grad_norm": 1.7628260850906372, + "learning_rate": 3.1420289855072465e-07, + "loss": 1.4657, + "step": 34470 + }, + { + "epoch": 2.87, + "grad_norm": 2.397793769836426, + "learning_rate": 3.084057971014493e-07, + "loss": 1.7186, + "step": 34480 + }, + { + "epoch": 2.87, + "grad_norm": 13.669360160827637, + "learning_rate": 3.0260869565217393e-07, + "loss": 1.6144, + "step": 34490 + }, + { + "epoch": 2.88, + "grad_norm": 1.7966532707214355, + "learning_rate": 2.9681159420289857e-07, + "loss": 1.7335, + "step": 34500 + }, + { + "epoch": 2.88, + "eval_loss": 1.6453466415405273, + "eval_runtime": 107.5244, + "eval_samples_per_second": 9.3, + "eval_steps_per_second": 2.325, + "step": 34500 + }, + { + "epoch": 2.88, + "grad_norm": 5.828636646270752, + "learning_rate": 2.910144927536232e-07, + "loss": 1.4402, + "step": 34510 + }, + { + "epoch": 2.88, + "grad_norm": 1.665623664855957, + "learning_rate": 2.8521739130434785e-07, + "loss": 1.5562, + "step": 34520 + }, + { + "epoch": 2.88, + "grad_norm": 7.800908088684082, + "learning_rate": 2.794202898550725e-07, + "loss": 1.5813, + "step": 34530 + }, + { + "epoch": 2.88, + "grad_norm": 6.734397888183594, + "learning_rate": 2.7362318840579713e-07, + "loss": 1.7721, + "step": 34540 + }, + { + "epoch": 2.88, + "grad_norm": 2.1839168071746826, + "learning_rate": 2.678260869565218e-07, + "loss": 1.5236, + "step": 34550 + }, + { + "epoch": 2.88, + "grad_norm": 3.9042229652404785, + "learning_rate": 2.620289855072464e-07, + "loss": 1.6205, + "step": 34560 + }, + { + "epoch": 2.88, + "grad_norm": 6.442493438720703, + "learning_rate": 2.5623188405797106e-07, + "loss": 1.6334, + "step": 34570 + }, + { + "epoch": 2.88, + "grad_norm": 11.60261058807373, + "learning_rate": 2.504347826086957e-07, + "loss": 1.6631, + "step": 34580 + }, + { + "epoch": 2.88, + "grad_norm": 1.5220028162002563, + "learning_rate": 2.4463768115942034e-07, + "loss": 1.5903, + "step": 34590 + }, + { + "epoch": 2.88, + "grad_norm": 6.463686466217041, + "learning_rate": 2.3884057971014493e-07, + "loss": 1.505, + "step": 34600 + }, + { + "epoch": 2.88, + "grad_norm": 1.3802415132522583, + "learning_rate": 2.3304347826086957e-07, + "loss": 1.5574, + "step": 34610 + }, + { + "epoch": 2.88, + "grad_norm": 5.931606292724609, + "learning_rate": 2.272463768115942e-07, + "loss": 1.6049, + "step": 34620 + }, + { + "epoch": 2.89, + "grad_norm": 4.055948734283447, + "learning_rate": 2.2144927536231885e-07, + "loss": 1.7223, + "step": 34630 + }, + { + "epoch": 2.89, + "grad_norm": 3.3893649578094482, + "learning_rate": 2.156521739130435e-07, + "loss": 1.6336, + "step": 34640 + }, + { + "epoch": 2.89, + "grad_norm": 4.007665157318115, + "learning_rate": 2.0985507246376813e-07, + "loss": 1.5563, + "step": 34650 + }, + { + "epoch": 2.89, + "grad_norm": 3.499795913696289, + "learning_rate": 2.0405797101449277e-07, + "loss": 1.6813, + "step": 34660 + }, + { + "epoch": 2.89, + "grad_norm": 3.31160569190979, + "learning_rate": 1.9826086956521742e-07, + "loss": 1.5491, + "step": 34670 + }, + { + "epoch": 2.89, + "grad_norm": 2.493201732635498, + "learning_rate": 1.9246376811594206e-07, + "loss": 1.7265, + "step": 34680 + }, + { + "epoch": 2.89, + "grad_norm": 6.865158557891846, + "learning_rate": 1.866666666666667e-07, + "loss": 1.6564, + "step": 34690 + }, + { + "epoch": 2.89, + "grad_norm": 2.9603042602539062, + "learning_rate": 1.808695652173913e-07, + "loss": 1.6632, + "step": 34700 + }, + { + "epoch": 2.89, + "grad_norm": 10.82306957244873, + "learning_rate": 1.7507246376811595e-07, + "loss": 1.555, + "step": 34710 + }, + { + "epoch": 2.89, + "grad_norm": 4.0846428871154785, + "learning_rate": 1.692753623188406e-07, + "loss": 1.6608, + "step": 34720 + }, + { + "epoch": 2.89, + "grad_norm": 5.046102523803711, + "learning_rate": 1.6347826086956523e-07, + "loss": 1.6355, + "step": 34730 + }, + { + "epoch": 2.9, + "grad_norm": 5.843890190124512, + "learning_rate": 1.5768115942028988e-07, + "loss": 1.7457, + "step": 34740 + }, + { + "epoch": 2.9, + "grad_norm": 2.0817549228668213, + "learning_rate": 1.5188405797101452e-07, + "loss": 1.5093, + "step": 34750 + }, + { + "epoch": 2.9, + "grad_norm": 3.4835991859436035, + "learning_rate": 1.4608695652173916e-07, + "loss": 1.5516, + "step": 34760 + }, + { + "epoch": 2.9, + "grad_norm": 4.763504505157471, + "learning_rate": 1.402898550724638e-07, + "loss": 1.5014, + "step": 34770 + }, + { + "epoch": 2.9, + "grad_norm": 7.537784576416016, + "learning_rate": 1.344927536231884e-07, + "loss": 1.5698, + "step": 34780 + }, + { + "epoch": 2.9, + "grad_norm": 3.3694698810577393, + "learning_rate": 1.2869565217391305e-07, + "loss": 1.7388, + "step": 34790 + }, + { + "epoch": 2.9, + "grad_norm": 3.1657655239105225, + "learning_rate": 1.228985507246377e-07, + "loss": 1.6657, + "step": 34800 + }, + { + "epoch": 2.9, + "grad_norm": 3.10614013671875, + "learning_rate": 1.1710144927536234e-07, + "loss": 1.6406, + "step": 34810 + }, + { + "epoch": 2.9, + "grad_norm": 8.151602745056152, + "learning_rate": 1.1130434782608698e-07, + "loss": 1.7237, + "step": 34820 + }, + { + "epoch": 2.9, + "grad_norm": 2.7785089015960693, + "learning_rate": 1.0550724637681159e-07, + "loss": 1.6641, + "step": 34830 + }, + { + "epoch": 2.9, + "grad_norm": 1.2298667430877686, + "learning_rate": 9.971014492753623e-08, + "loss": 1.659, + "step": 34840 + }, + { + "epoch": 2.9, + "grad_norm": 4.855597972869873, + "learning_rate": 9.391304347826087e-08, + "loss": 1.5468, + "step": 34850 + }, + { + "epoch": 2.91, + "grad_norm": 7.356712818145752, + "learning_rate": 8.811594202898551e-08, + "loss": 1.5559, + "step": 34860 + }, + { + "epoch": 2.91, + "grad_norm": 6.330800533294678, + "learning_rate": 8.231884057971016e-08, + "loss": 1.5451, + "step": 34870 + }, + { + "epoch": 2.91, + "grad_norm": 4.4921183586120605, + "learning_rate": 7.65217391304348e-08, + "loss": 1.6324, + "step": 34880 + }, + { + "epoch": 2.91, + "grad_norm": 11.110346794128418, + "learning_rate": 7.072463768115942e-08, + "loss": 1.5751, + "step": 34890 + }, + { + "epoch": 2.91, + "grad_norm": 5.403293609619141, + "learning_rate": 6.492753623188407e-08, + "loss": 1.6089, + "step": 34900 + }, + { + "epoch": 2.91, + "grad_norm": 11.329370498657227, + "learning_rate": 5.9130434782608707e-08, + "loss": 1.7421, + "step": 34910 + }, + { + "epoch": 2.91, + "grad_norm": 6.819499969482422, + "learning_rate": 5.3333333333333334e-08, + "loss": 1.6757, + "step": 34920 + }, + { + "epoch": 2.91, + "grad_norm": 3.4054958820343018, + "learning_rate": 4.7536231884057975e-08, + "loss": 1.6065, + "step": 34930 + }, + { + "epoch": 2.91, + "grad_norm": 6.741161823272705, + "learning_rate": 4.173913043478261e-08, + "loss": 1.705, + "step": 34940 + }, + { + "epoch": 2.91, + "grad_norm": 12.431832313537598, + "learning_rate": 3.594202898550725e-08, + "loss": 1.637, + "step": 34950 + }, + { + "epoch": 2.91, + "grad_norm": 3.2292697429656982, + "learning_rate": 3.0144927536231885e-08, + "loss": 1.6626, + "step": 34960 + }, + { + "epoch": 2.91, + "grad_norm": 6.705583095550537, + "learning_rate": 2.4347826086956523e-08, + "loss": 1.7878, + "step": 34970 + }, + { + "epoch": 2.92, + "grad_norm": 4.077023506164551, + "learning_rate": 1.855072463768116e-08, + "loss": 1.5233, + "step": 34980 + }, + { + "epoch": 2.92, + "grad_norm": 3.2491979598999023, + "learning_rate": 1.2753623188405798e-08, + "loss": 1.6806, + "step": 34990 + }, + { + "epoch": 2.92, + "grad_norm": 5.141303062438965, + "learning_rate": 6.956521739130436e-09, + "loss": 1.6406, + "step": 35000 + }, + { + "epoch": 2.92, + "eval_loss": 1.6195024251937866, + "eval_runtime": 107.5117, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 2.325, + "step": 35000 + } + ], + "logging_steps": 10, + "max_steps": 35000, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 5.6357440978944e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}