{ "best_metric": 1.599829912185669, "best_model_checkpoint": "runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_/home/audrey/air2/runs/deepseek_lora_20240420-031057/checkpoint-10000_20240420-061900/checkpoint-10000_20240420-141714/checkpoint-30000_20240421-001954/checkpoint-20000_20240421-063809/checkpoint-27000", "epoch": 2.9166666666666665, "eval_steps": 500, "global_step": 35000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.2827765941619873, "learning_rate": 4.0000000000000003e-07, "loss": 2.2577, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.5021369457244873, "learning_rate": 8.000000000000001e-07, "loss": 2.392, "step": 20 }, { "epoch": 0.0, "grad_norm": 3.0452425479888916, "learning_rate": 1.2000000000000002e-06, "loss": 2.3256, "step": 30 }, { "epoch": 0.0, "grad_norm": 3.0641889572143555, "learning_rate": 1.6000000000000001e-06, "loss": 2.3476, "step": 40 }, { "epoch": 0.0, "grad_norm": 4.357707977294922, "learning_rate": 2.0000000000000003e-06, "loss": 2.2706, "step": 50 }, { "epoch": 0.01, "grad_norm": 5.017151355743408, "learning_rate": 2.4000000000000003e-06, "loss": 2.125, "step": 60 }, { "epoch": 0.01, "grad_norm": 2.5771751403808594, "learning_rate": 2.8000000000000003e-06, "loss": 2.2445, "step": 70 }, { "epoch": 0.01, "grad_norm": 1.8890554904937744, "learning_rate": 3.2000000000000003e-06, "loss": 2.2334, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.2498245239257812, "learning_rate": 3.6000000000000003e-06, "loss": 2.2226, "step": 90 }, { "epoch": 0.01, "grad_norm": 4.084829807281494, "learning_rate": 4.000000000000001e-06, "loss": 2.0497, "step": 100 }, { "epoch": 0.01, "grad_norm": 3.884516716003418, "learning_rate": 4.4e-06, "loss": 2.0363, "step": 110 }, { "epoch": 0.01, "grad_norm": 2.7535431385040283, "learning_rate": 4.800000000000001e-06, "loss": 2.1011, "step": 120 }, { "epoch": 0.01, "grad_norm": 3.637084722518921, "learning_rate": 5.2e-06, "loss": 2.0225, "step": 130 }, { "epoch": 0.01, "grad_norm": 2.6097259521484375, "learning_rate": 5.600000000000001e-06, "loss": 1.9933, "step": 140 }, { "epoch": 0.01, "grad_norm": 3.4520087242126465, "learning_rate": 6e-06, "loss": 2.0717, "step": 150 }, { "epoch": 0.01, "grad_norm": 2.878129005432129, "learning_rate": 6.4000000000000006e-06, "loss": 2.078, "step": 160 }, { "epoch": 0.01, "grad_norm": 2.6473491191864014, "learning_rate": 6.800000000000001e-06, "loss": 1.8528, "step": 170 }, { "epoch": 0.01, "grad_norm": 2.8662400245666504, "learning_rate": 7.2000000000000005e-06, "loss": 1.9476, "step": 180 }, { "epoch": 0.02, "grad_norm": 2.6106956005096436, "learning_rate": 7.600000000000001e-06, "loss": 1.9378, "step": 190 }, { "epoch": 0.02, "grad_norm": 3.1151645183563232, "learning_rate": 8.000000000000001e-06, "loss": 1.8459, "step": 200 }, { "epoch": 0.02, "grad_norm": 2.933845281600952, "learning_rate": 8.400000000000001e-06, "loss": 1.9992, "step": 210 }, { "epoch": 0.02, "grad_norm": 2.989990472793579, "learning_rate": 8.8e-06, "loss": 1.9832, "step": 220 }, { "epoch": 0.02, "grad_norm": 5.849925994873047, "learning_rate": 9.200000000000002e-06, "loss": 1.9316, "step": 230 }, { "epoch": 0.02, "grad_norm": 10.09212875366211, "learning_rate": 9.600000000000001e-06, "loss": 1.9109, "step": 240 }, { "epoch": 0.02, "grad_norm": 3.965531587600708, "learning_rate": 1e-05, "loss": 1.9034, "step": 250 }, { "epoch": 0.02, "grad_norm": 3.259747266769409, "learning_rate": 1.04e-05, "loss": 1.9923, "step": 260 }, { "epoch": 0.02, "grad_norm": 4.087090015411377, "learning_rate": 1.0800000000000002e-05, "loss": 1.9494, "step": 270 }, { "epoch": 0.02, "grad_norm": 2.418077230453491, "learning_rate": 1.1200000000000001e-05, "loss": 1.8784, "step": 280 }, { "epoch": 0.02, "grad_norm": 3.9205658435821533, "learning_rate": 1.16e-05, "loss": 1.7895, "step": 290 }, { "epoch": 0.03, "grad_norm": 7.845510959625244, "learning_rate": 1.2e-05, "loss": 1.8195, "step": 300 }, { "epoch": 0.03, "grad_norm": 6.274329662322998, "learning_rate": 1.2400000000000002e-05, "loss": 1.9255, "step": 310 }, { "epoch": 0.03, "grad_norm": 3.376899242401123, "learning_rate": 1.2800000000000001e-05, "loss": 1.9822, "step": 320 }, { "epoch": 0.03, "grad_norm": 4.367288589477539, "learning_rate": 1.3200000000000002e-05, "loss": 1.8333, "step": 330 }, { "epoch": 0.03, "grad_norm": 2.786440849304199, "learning_rate": 1.3600000000000002e-05, "loss": 1.8435, "step": 340 }, { "epoch": 0.03, "grad_norm": 2.883002758026123, "learning_rate": 1.4e-05, "loss": 1.8751, "step": 350 }, { "epoch": 0.03, "grad_norm": 2.210261583328247, "learning_rate": 1.4400000000000001e-05, "loss": 1.8073, "step": 360 }, { "epoch": 0.03, "grad_norm": 2.7850048542022705, "learning_rate": 1.48e-05, "loss": 1.8612, "step": 370 }, { "epoch": 0.03, "grad_norm": 2.726701259613037, "learning_rate": 1.5200000000000002e-05, "loss": 1.9211, "step": 380 }, { "epoch": 0.03, "grad_norm": 3.326230764389038, "learning_rate": 1.5600000000000003e-05, "loss": 1.9087, "step": 390 }, { "epoch": 0.03, "grad_norm": 1.577868938446045, "learning_rate": 1.6000000000000003e-05, "loss": 2.0082, "step": 400 }, { "epoch": 0.03, "grad_norm": 2.0567996501922607, "learning_rate": 1.64e-05, "loss": 1.9232, "step": 410 }, { "epoch": 0.04, "grad_norm": 6.262115478515625, "learning_rate": 1.6800000000000002e-05, "loss": 1.9156, "step": 420 }, { "epoch": 0.04, "grad_norm": 1.7286055088043213, "learning_rate": 1.72e-05, "loss": 1.7268, "step": 430 }, { "epoch": 0.04, "grad_norm": 2.0500264167785645, "learning_rate": 1.76e-05, "loss": 1.9313, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.8140387535095215, "learning_rate": 1.8e-05, "loss": 1.8847, "step": 450 }, { "epoch": 0.04, "grad_norm": 2.911093235015869, "learning_rate": 1.8400000000000003e-05, "loss": 1.6759, "step": 460 }, { "epoch": 0.04, "grad_norm": 4.010791778564453, "learning_rate": 1.88e-05, "loss": 1.9112, "step": 470 }, { "epoch": 0.04, "grad_norm": 2.5784366130828857, "learning_rate": 1.9200000000000003e-05, "loss": 1.9196, "step": 480 }, { "epoch": 0.04, "grad_norm": 1.904146671295166, "learning_rate": 1.9600000000000002e-05, "loss": 1.9137, "step": 490 }, { "epoch": 0.04, "grad_norm": 2.3381693363189697, "learning_rate": 2e-05, "loss": 1.8686, "step": 500 }, { "epoch": 0.04, "eval_loss": 1.8578028678894043, "eval_runtime": 107.4833, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.6459054946899414, "learning_rate": 1.9994202898550726e-05, "loss": 1.8223, "step": 510 }, { "epoch": 0.04, "grad_norm": 5.068203926086426, "learning_rate": 1.998840579710145e-05, "loss": 1.7466, "step": 520 }, { "epoch": 0.04, "grad_norm": 2.4843411445617676, "learning_rate": 1.9982608695652174e-05, "loss": 1.9179, "step": 530 }, { "epoch": 0.04, "grad_norm": 1.6055413484573364, "learning_rate": 1.99768115942029e-05, "loss": 1.9317, "step": 540 }, { "epoch": 0.05, "grad_norm": 3.9912831783294678, "learning_rate": 1.9971014492753625e-05, "loss": 1.9632, "step": 550 }, { "epoch": 0.05, "grad_norm": 1.7726389169692993, "learning_rate": 1.996521739130435e-05, "loss": 1.8542, "step": 560 }, { "epoch": 0.05, "grad_norm": 3.2357091903686523, "learning_rate": 1.9959420289855073e-05, "loss": 1.8574, "step": 570 }, { "epoch": 0.05, "grad_norm": 1.8849786520004272, "learning_rate": 1.99536231884058e-05, "loss": 1.8169, "step": 580 }, { "epoch": 0.05, "grad_norm": 1.756426215171814, "learning_rate": 1.9947826086956524e-05, "loss": 1.8116, "step": 590 }, { "epoch": 0.05, "grad_norm": 1.3438060283660889, "learning_rate": 1.994202898550725e-05, "loss": 1.9238, "step": 600 }, { "epoch": 0.05, "grad_norm": 1.7150946855545044, "learning_rate": 1.9936231884057972e-05, "loss": 1.9171, "step": 610 }, { "epoch": 0.05, "grad_norm": 2.1640572547912598, "learning_rate": 1.9930434782608696e-05, "loss": 1.8279, "step": 620 }, { "epoch": 0.05, "grad_norm": 2.2766189575195312, "learning_rate": 1.9924637681159424e-05, "loss": 1.8898, "step": 630 }, { "epoch": 0.05, "grad_norm": 5.365070819854736, "learning_rate": 1.9918840579710144e-05, "loss": 1.862, "step": 640 }, { "epoch": 0.05, "grad_norm": 2.1916489601135254, "learning_rate": 1.9913043478260872e-05, "loss": 1.8582, "step": 650 }, { "epoch": 0.06, "grad_norm": 2.0066256523132324, "learning_rate": 1.9907246376811596e-05, "loss": 1.7583, "step": 660 }, { "epoch": 0.06, "grad_norm": 2.382798671722412, "learning_rate": 1.990144927536232e-05, "loss": 1.8526, "step": 670 }, { "epoch": 0.06, "grad_norm": 2.918565273284912, "learning_rate": 1.9895652173913044e-05, "loss": 1.8112, "step": 680 }, { "epoch": 0.06, "grad_norm": 1.8301721811294556, "learning_rate": 1.988985507246377e-05, "loss": 1.8805, "step": 690 }, { "epoch": 0.06, "grad_norm": 2.482556104660034, "learning_rate": 1.9884057971014495e-05, "loss": 1.778, "step": 700 }, { "epoch": 0.06, "grad_norm": 2.739922046661377, "learning_rate": 1.987826086956522e-05, "loss": 1.8817, "step": 710 }, { "epoch": 0.06, "grad_norm": 1.8400835990905762, "learning_rate": 1.9872463768115943e-05, "loss": 1.9506, "step": 720 }, { "epoch": 0.06, "grad_norm": 1.7623933553695679, "learning_rate": 1.9866666666666667e-05, "loss": 1.8342, "step": 730 }, { "epoch": 0.06, "grad_norm": 4.191768169403076, "learning_rate": 1.9860869565217395e-05, "loss": 1.8549, "step": 740 }, { "epoch": 0.06, "grad_norm": 3.0716779232025146, "learning_rate": 1.9855072463768115e-05, "loss": 1.7888, "step": 750 }, { "epoch": 0.06, "grad_norm": 2.673297643661499, "learning_rate": 1.9849275362318843e-05, "loss": 1.8287, "step": 760 }, { "epoch": 0.06, "grad_norm": 3.45849609375, "learning_rate": 1.9843478260869567e-05, "loss": 1.8583, "step": 770 }, { "epoch": 0.07, "grad_norm": 1.941355586051941, "learning_rate": 1.983768115942029e-05, "loss": 1.7381, "step": 780 }, { "epoch": 0.07, "grad_norm": 2.047844886779785, "learning_rate": 1.9831884057971015e-05, "loss": 1.8842, "step": 790 }, { "epoch": 0.07, "grad_norm": 1.3743892908096313, "learning_rate": 1.9826086956521742e-05, "loss": 1.9104, "step": 800 }, { "epoch": 0.07, "grad_norm": 1.7510887384414673, "learning_rate": 1.9820289855072466e-05, "loss": 1.8223, "step": 810 }, { "epoch": 0.07, "grad_norm": 1.888203740119934, "learning_rate": 1.981449275362319e-05, "loss": 1.7711, "step": 820 }, { "epoch": 0.07, "grad_norm": 3.0050196647644043, "learning_rate": 1.9808695652173914e-05, "loss": 1.756, "step": 830 }, { "epoch": 0.07, "grad_norm": 1.3720544576644897, "learning_rate": 1.9802898550724638e-05, "loss": 1.8173, "step": 840 }, { "epoch": 0.07, "grad_norm": 2.654989242553711, "learning_rate": 1.9797101449275366e-05, "loss": 1.7568, "step": 850 }, { "epoch": 0.07, "grad_norm": 3.6716177463531494, "learning_rate": 1.979130434782609e-05, "loss": 1.9341, "step": 860 }, { "epoch": 0.07, "grad_norm": 2.005023241043091, "learning_rate": 1.9785507246376814e-05, "loss": 1.9117, "step": 870 }, { "epoch": 0.07, "grad_norm": 1.9368737936019897, "learning_rate": 1.9779710144927538e-05, "loss": 1.6635, "step": 880 }, { "epoch": 0.07, "grad_norm": 2.550541400909424, "learning_rate": 1.9773913043478265e-05, "loss": 1.8452, "step": 890 }, { "epoch": 0.07, "grad_norm": 1.289937138557434, "learning_rate": 1.9768115942028986e-05, "loss": 1.8579, "step": 900 }, { "epoch": 0.08, "grad_norm": 2.2402184009552, "learning_rate": 1.9762318840579713e-05, "loss": 1.8988, "step": 910 }, { "epoch": 0.08, "grad_norm": 1.5080454349517822, "learning_rate": 1.9756521739130437e-05, "loss": 1.9026, "step": 920 }, { "epoch": 0.08, "grad_norm": 1.0446062088012695, "learning_rate": 1.975072463768116e-05, "loss": 1.8181, "step": 930 }, { "epoch": 0.08, "grad_norm": 2.787261724472046, "learning_rate": 1.9744927536231885e-05, "loss": 1.8197, "step": 940 }, { "epoch": 0.08, "grad_norm": 2.6586248874664307, "learning_rate": 1.973913043478261e-05, "loss": 1.8774, "step": 950 }, { "epoch": 0.08, "grad_norm": 2.1193392276763916, "learning_rate": 1.9733333333333336e-05, "loss": 1.9113, "step": 960 }, { "epoch": 0.08, "grad_norm": 3.299938917160034, "learning_rate": 1.972753623188406e-05, "loss": 1.8613, "step": 970 }, { "epoch": 0.08, "grad_norm": 2.443349838256836, "learning_rate": 1.9721739130434784e-05, "loss": 1.8193, "step": 980 }, { "epoch": 0.08, "grad_norm": 1.3777451515197754, "learning_rate": 1.971594202898551e-05, "loss": 1.7872, "step": 990 }, { "epoch": 0.08, "grad_norm": 2.1168971061706543, "learning_rate": 1.9710144927536236e-05, "loss": 1.7879, "step": 1000 }, { "epoch": 0.08, "eval_loss": 1.8089497089385986, "eval_runtime": 107.4884, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 1000 }, { "epoch": 0.08, "grad_norm": 1.817137360572815, "learning_rate": 1.9704347826086956e-05, "loss": 1.9, "step": 1010 }, { "epoch": 0.09, "grad_norm": 2.249089479446411, "learning_rate": 1.969855072463768e-05, "loss": 1.9053, "step": 1020 }, { "epoch": 0.09, "grad_norm": 2.998011589050293, "learning_rate": 1.9692753623188408e-05, "loss": 1.8222, "step": 1030 }, { "epoch": 0.09, "grad_norm": 4.248562335968018, "learning_rate": 1.9686956521739132e-05, "loss": 1.7655, "step": 1040 }, { "epoch": 0.09, "grad_norm": 4.401443004608154, "learning_rate": 1.9681159420289856e-05, "loss": 1.9682, "step": 1050 }, { "epoch": 0.09, "grad_norm": 3.3086190223693848, "learning_rate": 1.967536231884058e-05, "loss": 1.7689, "step": 1060 }, { "epoch": 0.09, "grad_norm": 3.4858791828155518, "learning_rate": 1.9669565217391307e-05, "loss": 1.6351, "step": 1070 }, { "epoch": 0.09, "grad_norm": 0.9979552030563354, "learning_rate": 1.966376811594203e-05, "loss": 1.8494, "step": 1080 }, { "epoch": 0.09, "grad_norm": 1.989320158958435, "learning_rate": 1.9657971014492755e-05, "loss": 1.7798, "step": 1090 }, { "epoch": 0.09, "grad_norm": 4.888331890106201, "learning_rate": 1.965217391304348e-05, "loss": 1.8133, "step": 1100 }, { "epoch": 0.09, "grad_norm": 1.666988730430603, "learning_rate": 1.9646376811594207e-05, "loss": 1.8949, "step": 1110 }, { "epoch": 0.09, "grad_norm": 1.6761118173599243, "learning_rate": 1.964057971014493e-05, "loss": 1.9065, "step": 1120 }, { "epoch": 0.09, "grad_norm": 1.6665714979171753, "learning_rate": 1.963478260869565e-05, "loss": 1.8475, "step": 1130 }, { "epoch": 0.1, "grad_norm": 3.513988971710205, "learning_rate": 1.962898550724638e-05, "loss": 1.6495, "step": 1140 }, { "epoch": 0.1, "grad_norm": 1.7759839296340942, "learning_rate": 1.9623188405797103e-05, "loss": 1.7775, "step": 1150 }, { "epoch": 0.1, "grad_norm": 1.4246402978897095, "learning_rate": 1.9617391304347827e-05, "loss": 1.765, "step": 1160 }, { "epoch": 0.1, "grad_norm": 1.7224540710449219, "learning_rate": 1.961159420289855e-05, "loss": 1.7599, "step": 1170 }, { "epoch": 0.1, "grad_norm": 2.1602578163146973, "learning_rate": 1.9605797101449278e-05, "loss": 1.63, "step": 1180 }, { "epoch": 0.1, "grad_norm": 1.389148473739624, "learning_rate": 1.9600000000000002e-05, "loss": 1.771, "step": 1190 }, { "epoch": 0.1, "grad_norm": 1.9033640623092651, "learning_rate": 1.9594202898550726e-05, "loss": 1.8217, "step": 1200 }, { "epoch": 0.1, "grad_norm": 1.8465853929519653, "learning_rate": 1.958840579710145e-05, "loss": 1.8974, "step": 1210 }, { "epoch": 0.1, "grad_norm": 1.0622667074203491, "learning_rate": 1.9582608695652177e-05, "loss": 1.7574, "step": 1220 }, { "epoch": 0.1, "grad_norm": 2.802722692489624, "learning_rate": 1.95768115942029e-05, "loss": 1.7193, "step": 1230 }, { "epoch": 0.1, "grad_norm": 1.6129810810089111, "learning_rate": 1.9571014492753625e-05, "loss": 1.8253, "step": 1240 }, { "epoch": 0.1, "grad_norm": 2.9575114250183105, "learning_rate": 1.956521739130435e-05, "loss": 1.8882, "step": 1250 }, { "epoch": 0.1, "grad_norm": 2.4238998889923096, "learning_rate": 1.9559420289855074e-05, "loss": 1.7379, "step": 1260 }, { "epoch": 0.11, "grad_norm": 3.9338934421539307, "learning_rate": 1.9553623188405798e-05, "loss": 1.7347, "step": 1270 }, { "epoch": 0.11, "grad_norm": 1.6793437004089355, "learning_rate": 1.954782608695652e-05, "loss": 1.9141, "step": 1280 }, { "epoch": 0.11, "grad_norm": 4.266021728515625, "learning_rate": 1.954202898550725e-05, "loss": 1.8423, "step": 1290 }, { "epoch": 0.11, "grad_norm": 1.8466399908065796, "learning_rate": 1.9536231884057973e-05, "loss": 1.8266, "step": 1300 }, { "epoch": 0.11, "grad_norm": 1.8755276203155518, "learning_rate": 1.9530434782608697e-05, "loss": 1.9117, "step": 1310 }, { "epoch": 0.11, "grad_norm": 1.3602133989334106, "learning_rate": 1.952463768115942e-05, "loss": 1.7265, "step": 1320 }, { "epoch": 0.11, "grad_norm": 1.353096842765808, "learning_rate": 1.9518840579710145e-05, "loss": 1.8355, "step": 1330 }, { "epoch": 0.11, "grad_norm": 1.312878966331482, "learning_rate": 1.9513043478260872e-05, "loss": 1.7331, "step": 1340 }, { "epoch": 0.11, "grad_norm": 2.431149959564209, "learning_rate": 1.9507246376811596e-05, "loss": 1.7316, "step": 1350 }, { "epoch": 0.11, "grad_norm": 1.3108808994293213, "learning_rate": 1.950144927536232e-05, "loss": 1.8539, "step": 1360 }, { "epoch": 0.11, "grad_norm": 3.86793851852417, "learning_rate": 1.9495652173913044e-05, "loss": 1.8277, "step": 1370 }, { "epoch": 0.12, "grad_norm": 1.5965479612350464, "learning_rate": 1.9489855072463772e-05, "loss": 1.7548, "step": 1380 }, { "epoch": 0.12, "grad_norm": 3.187406063079834, "learning_rate": 1.9484057971014492e-05, "loss": 1.8396, "step": 1390 }, { "epoch": 0.12, "grad_norm": 1.5422654151916504, "learning_rate": 1.947826086956522e-05, "loss": 1.7876, "step": 1400 }, { "epoch": 0.12, "grad_norm": 2.088440418243408, "learning_rate": 1.9472463768115944e-05, "loss": 1.8558, "step": 1410 }, { "epoch": 0.12, "grad_norm": 1.8927093744277954, "learning_rate": 1.9466666666666668e-05, "loss": 1.8849, "step": 1420 }, { "epoch": 0.12, "grad_norm": 3.945380687713623, "learning_rate": 1.9460869565217392e-05, "loss": 1.7351, "step": 1430 }, { "epoch": 0.12, "grad_norm": 4.608578681945801, "learning_rate": 1.9455072463768116e-05, "loss": 1.7438, "step": 1440 }, { "epoch": 0.12, "grad_norm": 3.4089462757110596, "learning_rate": 1.9449275362318843e-05, "loss": 1.8227, "step": 1450 }, { "epoch": 0.12, "grad_norm": 4.93408203125, "learning_rate": 1.9443478260869567e-05, "loss": 1.8358, "step": 1460 }, { "epoch": 0.12, "grad_norm": 2.577270269393921, "learning_rate": 1.943768115942029e-05, "loss": 1.8707, "step": 1470 }, { "epoch": 0.12, "grad_norm": 2.3741328716278076, "learning_rate": 1.9431884057971015e-05, "loss": 1.764, "step": 1480 }, { "epoch": 0.12, "grad_norm": 2.4061601161956787, "learning_rate": 1.9426086956521743e-05, "loss": 1.8593, "step": 1490 }, { "epoch": 0.12, "grad_norm": 3.7175543308258057, "learning_rate": 1.9420289855072467e-05, "loss": 1.9067, "step": 1500 }, { "epoch": 0.12, "eval_loss": 1.7743674516677856, "eval_runtime": 107.4624, "eval_samples_per_second": 9.306, "eval_steps_per_second": 2.326, "step": 1500 }, { "epoch": 0.13, "grad_norm": 1.384735107421875, "learning_rate": 1.941449275362319e-05, "loss": 1.6986, "step": 1510 }, { "epoch": 0.13, "grad_norm": 2.8683741092681885, "learning_rate": 1.9408695652173915e-05, "loss": 1.8999, "step": 1520 }, { "epoch": 0.13, "grad_norm": 2.08429217338562, "learning_rate": 1.9402898550724642e-05, "loss": 1.8664, "step": 1530 }, { "epoch": 0.13, "grad_norm": 1.030617117881775, "learning_rate": 1.9397101449275363e-05, "loss": 1.5804, "step": 1540 }, { "epoch": 0.13, "grad_norm": 2.734713315963745, "learning_rate": 1.9391304347826087e-05, "loss": 1.8053, "step": 1550 }, { "epoch": 0.13, "grad_norm": 1.2831270694732666, "learning_rate": 1.9385507246376814e-05, "loss": 1.7894, "step": 1560 }, { "epoch": 0.13, "grad_norm": 4.505608558654785, "learning_rate": 1.9379710144927538e-05, "loss": 1.6539, "step": 1570 }, { "epoch": 0.13, "grad_norm": 1.0595582723617554, "learning_rate": 1.9373913043478262e-05, "loss": 1.8655, "step": 1580 }, { "epoch": 0.13, "grad_norm": 2.5555408000946045, "learning_rate": 1.9368115942028986e-05, "loss": 1.763, "step": 1590 }, { "epoch": 0.13, "grad_norm": 3.7931909561157227, "learning_rate": 1.9362318840579713e-05, "loss": 1.8121, "step": 1600 }, { "epoch": 0.13, "grad_norm": 4.358292579650879, "learning_rate": 1.9356521739130437e-05, "loss": 1.8876, "step": 1610 }, { "epoch": 0.14, "grad_norm": 4.060112953186035, "learning_rate": 1.935072463768116e-05, "loss": 1.7779, "step": 1620 }, { "epoch": 0.14, "grad_norm": 1.94615638256073, "learning_rate": 1.9344927536231885e-05, "loss": 1.7276, "step": 1630 }, { "epoch": 0.14, "grad_norm": 2.230520009994507, "learning_rate": 1.933913043478261e-05, "loss": 1.7617, "step": 1640 }, { "epoch": 0.14, "grad_norm": 2.058115243911743, "learning_rate": 1.9333333333333333e-05, "loss": 1.8117, "step": 1650 }, { "epoch": 0.14, "grad_norm": 1.2918510437011719, "learning_rate": 1.9327536231884057e-05, "loss": 1.6933, "step": 1660 }, { "epoch": 0.14, "grad_norm": 1.8761500120162964, "learning_rate": 1.9321739130434785e-05, "loss": 1.8075, "step": 1670 }, { "epoch": 0.14, "grad_norm": 3.5313031673431396, "learning_rate": 1.931594202898551e-05, "loss": 1.6154, "step": 1680 }, { "epoch": 0.14, "grad_norm": 1.8059382438659668, "learning_rate": 1.9310144927536233e-05, "loss": 1.7508, "step": 1690 }, { "epoch": 0.14, "grad_norm": 3.9243950843811035, "learning_rate": 1.9304347826086957e-05, "loss": 1.832, "step": 1700 }, { "epoch": 0.14, "grad_norm": 5.17168664932251, "learning_rate": 1.9298550724637684e-05, "loss": 1.8438, "step": 1710 }, { "epoch": 0.14, "grad_norm": 2.7666871547698975, "learning_rate": 1.9292753623188408e-05, "loss": 1.8314, "step": 1720 }, { "epoch": 0.14, "grad_norm": 4.743995666503906, "learning_rate": 1.9286956521739132e-05, "loss": 1.7588, "step": 1730 }, { "epoch": 0.14, "grad_norm": 2.5909578800201416, "learning_rate": 1.9281159420289856e-05, "loss": 1.839, "step": 1740 }, { "epoch": 0.15, "grad_norm": 1.395579218864441, "learning_rate": 1.927536231884058e-05, "loss": 1.6756, "step": 1750 }, { "epoch": 0.15, "grad_norm": 2.7446272373199463, "learning_rate": 1.9269565217391308e-05, "loss": 1.8533, "step": 1760 }, { "epoch": 0.15, "grad_norm": 2.249009132385254, "learning_rate": 1.9263768115942028e-05, "loss": 1.8051, "step": 1770 }, { "epoch": 0.15, "grad_norm": 0.8617609143257141, "learning_rate": 1.9257971014492756e-05, "loss": 1.7813, "step": 1780 }, { "epoch": 0.15, "grad_norm": 1.3332220315933228, "learning_rate": 1.925217391304348e-05, "loss": 1.7937, "step": 1790 }, { "epoch": 0.15, "grad_norm": 3.5308120250701904, "learning_rate": 1.9246376811594204e-05, "loss": 1.7395, "step": 1800 }, { "epoch": 0.15, "grad_norm": 3.629775047302246, "learning_rate": 1.9240579710144928e-05, "loss": 1.7794, "step": 1810 }, { "epoch": 0.15, "grad_norm": 3.5039477348327637, "learning_rate": 1.9234782608695655e-05, "loss": 1.7868, "step": 1820 }, { "epoch": 0.15, "grad_norm": 2.4312188625335693, "learning_rate": 1.922898550724638e-05, "loss": 1.7787, "step": 1830 }, { "epoch": 0.15, "grad_norm": 2.299351930618286, "learning_rate": 1.9223188405797103e-05, "loss": 1.5264, "step": 1840 }, { "epoch": 0.15, "grad_norm": 2.888294219970703, "learning_rate": 1.9217391304347827e-05, "loss": 1.7851, "step": 1850 }, { "epoch": 0.15, "grad_norm": 2.4942209720611572, "learning_rate": 1.921159420289855e-05, "loss": 1.8019, "step": 1860 }, { "epoch": 0.16, "grad_norm": 1.9073644876480103, "learning_rate": 1.920579710144928e-05, "loss": 1.8111, "step": 1870 }, { "epoch": 0.16, "grad_norm": 1.672957420349121, "learning_rate": 1.9200000000000003e-05, "loss": 1.8331, "step": 1880 }, { "epoch": 0.16, "grad_norm": 1.514133095741272, "learning_rate": 1.9194202898550727e-05, "loss": 1.7383, "step": 1890 }, { "epoch": 0.16, "grad_norm": 3.5273349285125732, "learning_rate": 1.918840579710145e-05, "loss": 1.7974, "step": 1900 }, { "epoch": 0.16, "grad_norm": 3.798083782196045, "learning_rate": 1.9182608695652175e-05, "loss": 1.7457, "step": 1910 }, { "epoch": 0.16, "grad_norm": 5.434926509857178, "learning_rate": 1.91768115942029e-05, "loss": 1.7104, "step": 1920 }, { "epoch": 0.16, "grad_norm": 1.5063635110855103, "learning_rate": 1.9171014492753626e-05, "loss": 1.7753, "step": 1930 }, { "epoch": 0.16, "grad_norm": 3.0606720447540283, "learning_rate": 1.916521739130435e-05, "loss": 1.7012, "step": 1940 }, { "epoch": 0.16, "grad_norm": 0.9884568452835083, "learning_rate": 1.9159420289855074e-05, "loss": 1.7646, "step": 1950 }, { "epoch": 0.16, "grad_norm": 1.3289304971694946, "learning_rate": 1.9153623188405798e-05, "loss": 1.8317, "step": 1960 }, { "epoch": 0.16, "grad_norm": 2.8612945079803467, "learning_rate": 1.9147826086956522e-05, "loss": 1.8826, "step": 1970 }, { "epoch": 0.17, "grad_norm": 2.970140218734741, "learning_rate": 1.914202898550725e-05, "loss": 1.9036, "step": 1980 }, { "epoch": 0.17, "grad_norm": 2.007293939590454, "learning_rate": 1.9136231884057973e-05, "loss": 1.7557, "step": 1990 }, { "epoch": 0.17, "grad_norm": 1.1491663455963135, "learning_rate": 1.9130434782608697e-05, "loss": 1.7295, "step": 2000 }, { "epoch": 0.17, "eval_loss": 1.7665390968322754, "eval_runtime": 107.4574, "eval_samples_per_second": 9.306, "eval_steps_per_second": 2.327, "step": 2000 }, { "epoch": 0.17, "grad_norm": 3.889491319656372, "learning_rate": 1.912463768115942e-05, "loss": 1.7628, "step": 2010 }, { "epoch": 0.17, "grad_norm": 1.507429838180542, "learning_rate": 1.911884057971015e-05, "loss": 1.7585, "step": 2020 }, { "epoch": 0.17, "grad_norm": 2.2127914428710938, "learning_rate": 1.911304347826087e-05, "loss": 1.8121, "step": 2030 }, { "epoch": 0.17, "grad_norm": 2.9668962955474854, "learning_rate": 1.9107246376811597e-05, "loss": 1.8563, "step": 2040 }, { "epoch": 0.17, "grad_norm": 0.9941543936729431, "learning_rate": 1.910144927536232e-05, "loss": 1.8258, "step": 2050 }, { "epoch": 0.17, "grad_norm": 3.4788460731506348, "learning_rate": 1.9095652173913045e-05, "loss": 1.7617, "step": 2060 }, { "epoch": 0.17, "grad_norm": 2.523179769515991, "learning_rate": 1.908985507246377e-05, "loss": 1.6908, "step": 2070 }, { "epoch": 0.17, "grad_norm": 2.2159104347229004, "learning_rate": 1.9084057971014493e-05, "loss": 1.6652, "step": 2080 }, { "epoch": 0.17, "grad_norm": 2.2359981536865234, "learning_rate": 1.907826086956522e-05, "loss": 1.8826, "step": 2090 }, { "epoch": 0.17, "grad_norm": 3.3493943214416504, "learning_rate": 1.9072463768115944e-05, "loss": 1.8261, "step": 2100 }, { "epoch": 0.18, "grad_norm": 1.5862370729446411, "learning_rate": 1.9066666666666668e-05, "loss": 1.6941, "step": 2110 }, { "epoch": 0.18, "grad_norm": 1.1425403356552124, "learning_rate": 1.9060869565217392e-05, "loss": 1.8313, "step": 2120 }, { "epoch": 0.18, "grad_norm": 4.16150426864624, "learning_rate": 1.905507246376812e-05, "loss": 1.7649, "step": 2130 }, { "epoch": 0.18, "grad_norm": 2.25124192237854, "learning_rate": 1.9049275362318844e-05, "loss": 1.8382, "step": 2140 }, { "epoch": 0.18, "grad_norm": 3.4652185440063477, "learning_rate": 1.9043478260869568e-05, "loss": 1.8486, "step": 2150 }, { "epoch": 0.18, "grad_norm": 3.7186965942382812, "learning_rate": 1.903768115942029e-05, "loss": 1.8783, "step": 2160 }, { "epoch": 0.18, "grad_norm": 2.2005832195281982, "learning_rate": 1.9031884057971016e-05, "loss": 1.8612, "step": 2170 }, { "epoch": 0.18, "grad_norm": 2.344748020172119, "learning_rate": 1.902608695652174e-05, "loss": 1.6858, "step": 2180 }, { "epoch": 0.18, "grad_norm": 1.9262315034866333, "learning_rate": 1.9020289855072464e-05, "loss": 1.7719, "step": 2190 }, { "epoch": 0.18, "grad_norm": 2.292480945587158, "learning_rate": 1.901449275362319e-05, "loss": 1.883, "step": 2200 }, { "epoch": 0.18, "grad_norm": 3.0080437660217285, "learning_rate": 1.9008695652173915e-05, "loss": 1.7567, "step": 2210 }, { "epoch": 0.18, "grad_norm": 1.655610203742981, "learning_rate": 1.900289855072464e-05, "loss": 1.7931, "step": 2220 }, { "epoch": 0.19, "grad_norm": 1.8952635526657104, "learning_rate": 1.8997101449275363e-05, "loss": 1.7877, "step": 2230 }, { "epoch": 0.19, "grad_norm": 3.0049967765808105, "learning_rate": 1.899130434782609e-05, "loss": 1.8179, "step": 2240 }, { "epoch": 0.19, "grad_norm": 1.809584379196167, "learning_rate": 1.8985507246376814e-05, "loss": 1.7893, "step": 2250 }, { "epoch": 0.19, "grad_norm": 3.3408210277557373, "learning_rate": 1.8979710144927535e-05, "loss": 1.76, "step": 2260 }, { "epoch": 0.19, "grad_norm": 2.576713800430298, "learning_rate": 1.8973913043478262e-05, "loss": 1.6123, "step": 2270 }, { "epoch": 0.19, "grad_norm": 2.999994993209839, "learning_rate": 1.8968115942028986e-05, "loss": 1.748, "step": 2280 }, { "epoch": 0.19, "grad_norm": 1.8222105503082275, "learning_rate": 1.896231884057971e-05, "loss": 1.6274, "step": 2290 }, { "epoch": 0.19, "grad_norm": 1.565905213356018, "learning_rate": 1.8956521739130434e-05, "loss": 1.7498, "step": 2300 }, { "epoch": 0.19, "grad_norm": 1.2533594369888306, "learning_rate": 1.8950724637681162e-05, "loss": 1.8822, "step": 2310 }, { "epoch": 0.19, "grad_norm": 2.8874733448028564, "learning_rate": 1.8944927536231886e-05, "loss": 1.7679, "step": 2320 }, { "epoch": 0.19, "grad_norm": 1.1580071449279785, "learning_rate": 1.893913043478261e-05, "loss": 1.829, "step": 2330 }, { "epoch": 0.2, "grad_norm": 1.6889914274215698, "learning_rate": 1.8933333333333334e-05, "loss": 1.7765, "step": 2340 }, { "epoch": 0.2, "grad_norm": 1.708866000175476, "learning_rate": 1.892753623188406e-05, "loss": 1.7612, "step": 2350 }, { "epoch": 0.2, "grad_norm": 2.0311877727508545, "learning_rate": 1.8921739130434785e-05, "loss": 1.8271, "step": 2360 }, { "epoch": 0.2, "grad_norm": 1.4897469282150269, "learning_rate": 1.891594202898551e-05, "loss": 1.67, "step": 2370 }, { "epoch": 0.2, "grad_norm": 1.0771639347076416, "learning_rate": 1.8910144927536233e-05, "loss": 1.8175, "step": 2380 }, { "epoch": 0.2, "grad_norm": 3.115084171295166, "learning_rate": 1.8904347826086957e-05, "loss": 1.8247, "step": 2390 }, { "epoch": 0.2, "grad_norm": 2.106081008911133, "learning_rate": 1.8898550724637685e-05, "loss": 1.833, "step": 2400 }, { "epoch": 0.2, "grad_norm": 1.4720683097839355, "learning_rate": 1.8892753623188405e-05, "loss": 1.8494, "step": 2410 }, { "epoch": 0.2, "grad_norm": 1.4541406631469727, "learning_rate": 1.8886956521739133e-05, "loss": 1.8464, "step": 2420 }, { "epoch": 0.2, "grad_norm": 1.226954698562622, "learning_rate": 1.8881159420289857e-05, "loss": 1.8729, "step": 2430 }, { "epoch": 0.2, "grad_norm": 1.2664247751235962, "learning_rate": 1.887536231884058e-05, "loss": 1.8722, "step": 2440 }, { "epoch": 0.2, "grad_norm": 2.0264010429382324, "learning_rate": 1.8869565217391305e-05, "loss": 1.7936, "step": 2450 }, { "epoch": 0.2, "grad_norm": 1.336003303527832, "learning_rate": 1.8863768115942032e-05, "loss": 1.9073, "step": 2460 }, { "epoch": 0.21, "grad_norm": 2.730409622192383, "learning_rate": 1.8857971014492756e-05, "loss": 1.9041, "step": 2470 }, { "epoch": 0.21, "grad_norm": 2.9845330715179443, "learning_rate": 1.885217391304348e-05, "loss": 1.7753, "step": 2480 }, { "epoch": 0.21, "grad_norm": 1.5443974733352661, "learning_rate": 1.8846376811594204e-05, "loss": 1.7274, "step": 2490 }, { "epoch": 0.21, "grad_norm": 1.1884684562683105, "learning_rate": 1.8840579710144928e-05, "loss": 1.8284, "step": 2500 }, { "epoch": 0.21, "eval_loss": 1.7748754024505615, "eval_runtime": 107.4612, "eval_samples_per_second": 9.306, "eval_steps_per_second": 2.326, "step": 2500 }, { "epoch": 0.21, "grad_norm": 2.1604175567626953, "learning_rate": 1.8834782608695656e-05, "loss": 1.7344, "step": 2510 }, { "epoch": 0.21, "grad_norm": 1.7605400085449219, "learning_rate": 1.882898550724638e-05, "loss": 1.7241, "step": 2520 }, { "epoch": 0.21, "grad_norm": 3.8347537517547607, "learning_rate": 1.8823188405797104e-05, "loss": 1.7196, "step": 2530 }, { "epoch": 0.21, "grad_norm": 1.828438639640808, "learning_rate": 1.8817391304347828e-05, "loss": 1.8529, "step": 2540 }, { "epoch": 0.21, "grad_norm": 1.69232976436615, "learning_rate": 1.881159420289855e-05, "loss": 1.7768, "step": 2550 }, { "epoch": 0.21, "grad_norm": 3.592120885848999, "learning_rate": 1.8805797101449276e-05, "loss": 1.7074, "step": 2560 }, { "epoch": 0.21, "grad_norm": 1.56288743019104, "learning_rate": 1.88e-05, "loss": 1.8631, "step": 2570 }, { "epoch": 0.21, "grad_norm": 0.6752883791923523, "learning_rate": 1.8794202898550727e-05, "loss": 1.8049, "step": 2580 }, { "epoch": 0.22, "grad_norm": 2.010446071624756, "learning_rate": 1.878840579710145e-05, "loss": 1.7486, "step": 2590 }, { "epoch": 0.22, "grad_norm": 1.9133752584457397, "learning_rate": 1.8782608695652175e-05, "loss": 1.7831, "step": 2600 }, { "epoch": 0.22, "grad_norm": 1.1954097747802734, "learning_rate": 1.87768115942029e-05, "loss": 1.7329, "step": 2610 }, { "epoch": 0.22, "grad_norm": 2.0870425701141357, "learning_rate": 1.8771014492753626e-05, "loss": 1.7143, "step": 2620 }, { "epoch": 0.22, "grad_norm": 2.162560224533081, "learning_rate": 1.876521739130435e-05, "loss": 1.8418, "step": 2630 }, { "epoch": 0.22, "grad_norm": 1.2718247175216675, "learning_rate": 1.8759420289855074e-05, "loss": 1.7022, "step": 2640 }, { "epoch": 0.22, "grad_norm": 2.4909746646881104, "learning_rate": 1.87536231884058e-05, "loss": 1.8087, "step": 2650 }, { "epoch": 0.22, "grad_norm": 5.101371765136719, "learning_rate": 1.8747826086956526e-05, "loss": 1.8178, "step": 2660 }, { "epoch": 0.22, "grad_norm": 3.9989445209503174, "learning_rate": 1.8742028985507246e-05, "loss": 1.7954, "step": 2670 }, { "epoch": 0.22, "grad_norm": 3.0736329555511475, "learning_rate": 1.873623188405797e-05, "loss": 1.8488, "step": 2680 }, { "epoch": 0.22, "grad_norm": 2.652923822402954, "learning_rate": 1.8730434782608698e-05, "loss": 1.7438, "step": 2690 }, { "epoch": 0.23, "grad_norm": 4.146462917327881, "learning_rate": 1.8724637681159422e-05, "loss": 1.8169, "step": 2700 }, { "epoch": 0.23, "grad_norm": 1.5568904876708984, "learning_rate": 1.8718840579710146e-05, "loss": 1.7904, "step": 2710 }, { "epoch": 0.23, "grad_norm": 2.2244021892547607, "learning_rate": 1.871304347826087e-05, "loss": 1.8384, "step": 2720 }, { "epoch": 0.23, "grad_norm": 1.3431702852249146, "learning_rate": 1.8707246376811597e-05, "loss": 1.8376, "step": 2730 }, { "epoch": 0.23, "grad_norm": 3.871310234069824, "learning_rate": 1.870144927536232e-05, "loss": 1.6912, "step": 2740 }, { "epoch": 0.23, "grad_norm": 2.3337674140930176, "learning_rate": 1.8695652173913045e-05, "loss": 1.7164, "step": 2750 }, { "epoch": 0.23, "grad_norm": 1.844233751296997, "learning_rate": 1.868985507246377e-05, "loss": 1.7873, "step": 2760 }, { "epoch": 0.23, "grad_norm": 1.0465248823165894, "learning_rate": 1.8684057971014497e-05, "loss": 1.5893, "step": 2770 }, { "epoch": 0.23, "grad_norm": 2.0744643211364746, "learning_rate": 1.867826086956522e-05, "loss": 1.8043, "step": 2780 }, { "epoch": 0.23, "grad_norm": 1.2488594055175781, "learning_rate": 1.867246376811594e-05, "loss": 1.7215, "step": 2790 }, { "epoch": 0.23, "grad_norm": 3.04681658744812, "learning_rate": 1.866666666666667e-05, "loss": 1.901, "step": 2800 }, { "epoch": 0.23, "grad_norm": 2.6009609699249268, "learning_rate": 1.8660869565217393e-05, "loss": 1.7446, "step": 2810 }, { "epoch": 0.23, "grad_norm": 3.019435167312622, "learning_rate": 1.8655072463768117e-05, "loss": 1.8343, "step": 2820 }, { "epoch": 0.24, "grad_norm": 2.385256290435791, "learning_rate": 1.864927536231884e-05, "loss": 1.6675, "step": 2830 }, { "epoch": 0.24, "grad_norm": 3.222172737121582, "learning_rate": 1.8643478260869568e-05, "loss": 1.6239, "step": 2840 }, { "epoch": 0.24, "grad_norm": 4.963768005371094, "learning_rate": 1.8637681159420292e-05, "loss": 1.711, "step": 2850 }, { "epoch": 0.24, "grad_norm": 3.1189117431640625, "learning_rate": 1.8631884057971016e-05, "loss": 1.886, "step": 2860 }, { "epoch": 0.24, "grad_norm": 1.5243103504180908, "learning_rate": 1.862608695652174e-05, "loss": 1.8398, "step": 2870 }, { "epoch": 0.24, "grad_norm": 1.8455665111541748, "learning_rate": 1.8620289855072464e-05, "loss": 1.8868, "step": 2880 }, { "epoch": 0.24, "grad_norm": 1.1385709047317505, "learning_rate": 1.861449275362319e-05, "loss": 1.8302, "step": 2890 }, { "epoch": 0.24, "grad_norm": 3.1462037563323975, "learning_rate": 1.8608695652173912e-05, "loss": 1.8159, "step": 2900 }, { "epoch": 0.24, "grad_norm": 1.0207372903823853, "learning_rate": 1.860289855072464e-05, "loss": 1.8887, "step": 2910 }, { "epoch": 0.24, "grad_norm": 1.9228794574737549, "learning_rate": 1.8597101449275363e-05, "loss": 1.801, "step": 2920 }, { "epoch": 0.24, "grad_norm": 2.8184621334075928, "learning_rate": 1.8591304347826087e-05, "loss": 1.7667, "step": 2930 }, { "epoch": 0.24, "grad_norm": 2.6740689277648926, "learning_rate": 1.858550724637681e-05, "loss": 1.8734, "step": 2940 }, { "epoch": 0.25, "grad_norm": 2.6178441047668457, "learning_rate": 1.857971014492754e-05, "loss": 1.7861, "step": 2950 }, { "epoch": 0.25, "grad_norm": 3.4509289264678955, "learning_rate": 1.8573913043478263e-05, "loss": 1.7024, "step": 2960 }, { "epoch": 0.25, "grad_norm": 7.4058709144592285, "learning_rate": 1.8568115942028987e-05, "loss": 1.7561, "step": 2970 }, { "epoch": 0.25, "grad_norm": 1.0747387409210205, "learning_rate": 1.856231884057971e-05, "loss": 1.8715, "step": 2980 }, { "epoch": 0.25, "grad_norm": 4.675367832183838, "learning_rate": 1.8556521739130435e-05, "loss": 1.6404, "step": 2990 }, { "epoch": 0.25, "grad_norm": 2.9367339611053467, "learning_rate": 1.8550724637681162e-05, "loss": 1.7286, "step": 3000 }, { "epoch": 0.25, "eval_loss": 1.7500901222229004, "eval_runtime": 107.4483, "eval_samples_per_second": 9.307, "eval_steps_per_second": 2.327, "step": 3000 }, { "epoch": 0.25, "grad_norm": 2.8943095207214355, "learning_rate": 1.8544927536231886e-05, "loss": 1.6788, "step": 3010 }, { "epoch": 0.25, "grad_norm": 3.3797285556793213, "learning_rate": 1.853913043478261e-05, "loss": 1.8931, "step": 3020 }, { "epoch": 0.25, "grad_norm": 3.7746102809906006, "learning_rate": 1.8533333333333334e-05, "loss": 1.7836, "step": 3030 }, { "epoch": 0.25, "grad_norm": 1.9604554176330566, "learning_rate": 1.8527536231884062e-05, "loss": 1.704, "step": 3040 }, { "epoch": 0.25, "grad_norm": 1.5160584449768066, "learning_rate": 1.8521739130434782e-05, "loss": 1.7964, "step": 3050 }, { "epoch": 0.26, "grad_norm": 1.3620455265045166, "learning_rate": 1.851594202898551e-05, "loss": 1.7291, "step": 3060 }, { "epoch": 0.26, "grad_norm": 2.6815402507781982, "learning_rate": 1.8510144927536234e-05, "loss": 1.744, "step": 3070 }, { "epoch": 0.26, "grad_norm": 1.7348963022232056, "learning_rate": 1.8504347826086958e-05, "loss": 1.6285, "step": 3080 }, { "epoch": 0.26, "grad_norm": 1.5644665956497192, "learning_rate": 1.8498550724637682e-05, "loss": 1.6034, "step": 3090 }, { "epoch": 0.26, "grad_norm": 2.5588579177856445, "learning_rate": 1.8492753623188406e-05, "loss": 1.6455, "step": 3100 }, { "epoch": 0.26, "grad_norm": 2.486201763153076, "learning_rate": 1.8486956521739133e-05, "loss": 1.6324, "step": 3110 }, { "epoch": 0.26, "grad_norm": 4.734580039978027, "learning_rate": 1.8481159420289857e-05, "loss": 1.7564, "step": 3120 }, { "epoch": 0.26, "grad_norm": 2.060638427734375, "learning_rate": 1.847536231884058e-05, "loss": 1.8093, "step": 3130 }, { "epoch": 0.26, "grad_norm": 1.396253228187561, "learning_rate": 1.8469565217391305e-05, "loss": 1.8422, "step": 3140 }, { "epoch": 0.26, "grad_norm": 3.647871494293213, "learning_rate": 1.8463768115942033e-05, "loss": 1.692, "step": 3150 }, { "epoch": 0.26, "grad_norm": 4.792811870574951, "learning_rate": 1.8457971014492753e-05, "loss": 1.8721, "step": 3160 }, { "epoch": 0.26, "grad_norm": 1.6648412942886353, "learning_rate": 1.845217391304348e-05, "loss": 1.8176, "step": 3170 }, { "epoch": 0.27, "grad_norm": 2.2956972122192383, "learning_rate": 1.8446376811594205e-05, "loss": 1.7783, "step": 3180 }, { "epoch": 0.27, "grad_norm": 1.968624472618103, "learning_rate": 1.844057971014493e-05, "loss": 1.75, "step": 3190 }, { "epoch": 0.27, "grad_norm": 2.384540557861328, "learning_rate": 1.8434782608695653e-05, "loss": 1.8703, "step": 3200 }, { "epoch": 0.27, "grad_norm": 0.963712751865387, "learning_rate": 1.8428985507246377e-05, "loss": 1.8838, "step": 3210 }, { "epoch": 0.27, "grad_norm": 3.5194199085235596, "learning_rate": 1.8423188405797104e-05, "loss": 1.7351, "step": 3220 }, { "epoch": 0.27, "grad_norm": 3.223130226135254, "learning_rate": 1.8417971014492754e-05, "loss": 1.7526, "step": 3230 }, { "epoch": 0.27, "grad_norm": 1.9289439916610718, "learning_rate": 1.841217391304348e-05, "loss": 1.8648, "step": 3240 }, { "epoch": 0.27, "grad_norm": 1.942191481590271, "learning_rate": 1.8406376811594205e-05, "loss": 1.7607, "step": 3250 }, { "epoch": 0.27, "grad_norm": 1.61818265914917, "learning_rate": 1.840057971014493e-05, "loss": 1.6653, "step": 3260 }, { "epoch": 0.27, "grad_norm": 2.776350736618042, "learning_rate": 1.8394782608695653e-05, "loss": 1.7829, "step": 3270 }, { "epoch": 0.27, "grad_norm": 1.71359384059906, "learning_rate": 1.8388985507246377e-05, "loss": 1.8434, "step": 3280 }, { "epoch": 0.27, "grad_norm": 1.9720557928085327, "learning_rate": 1.8383188405797104e-05, "loss": 1.749, "step": 3290 }, { "epoch": 0.28, "grad_norm": 2.1935908794403076, "learning_rate": 1.837739130434783e-05, "loss": 1.8992, "step": 3300 }, { "epoch": 0.28, "grad_norm": 4.199913024902344, "learning_rate": 1.8371594202898552e-05, "loss": 1.8549, "step": 3310 }, { "epoch": 0.28, "grad_norm": 3.7523651123046875, "learning_rate": 1.8365797101449276e-05, "loss": 1.8814, "step": 3320 }, { "epoch": 0.28, "grad_norm": 2.2951200008392334, "learning_rate": 1.8360000000000004e-05, "loss": 1.8184, "step": 3330 }, { "epoch": 0.28, "grad_norm": 2.8267765045166016, "learning_rate": 1.8354202898550724e-05, "loss": 1.8589, "step": 3340 }, { "epoch": 0.28, "grad_norm": 1.0535434484481812, "learning_rate": 1.8348405797101452e-05, "loss": 1.609, "step": 3350 }, { "epoch": 0.28, "grad_norm": 5.798702716827393, "learning_rate": 1.8342608695652176e-05, "loss": 1.7119, "step": 3360 }, { "epoch": 0.28, "grad_norm": 4.116357326507568, "learning_rate": 1.83368115942029e-05, "loss": 1.7238, "step": 3370 }, { "epoch": 0.28, "grad_norm": 1.4448415040969849, "learning_rate": 1.8331014492753624e-05, "loss": 1.6308, "step": 3380 }, { "epoch": 0.28, "grad_norm": 2.775078296661377, "learning_rate": 1.8325217391304348e-05, "loss": 1.8911, "step": 3390 }, { "epoch": 0.28, "grad_norm": 2.1946187019348145, "learning_rate": 1.8319420289855075e-05, "loss": 1.7165, "step": 3400 }, { "epoch": 0.28, "grad_norm": 2.4208180904388428, "learning_rate": 1.83136231884058e-05, "loss": 1.851, "step": 3410 }, { "epoch": 0.28, "grad_norm": 2.1975769996643066, "learning_rate": 1.8307826086956523e-05, "loss": 1.7533, "step": 3420 }, { "epoch": 0.29, "grad_norm": 5.09836483001709, "learning_rate": 1.8302028985507247e-05, "loss": 1.7399, "step": 3430 }, { "epoch": 0.29, "grad_norm": 2.047780990600586, "learning_rate": 1.8296231884057975e-05, "loss": 1.6752, "step": 3440 }, { "epoch": 0.29, "grad_norm": 2.692073106765747, "learning_rate": 1.82904347826087e-05, "loss": 1.649, "step": 3450 }, { "epoch": 0.29, "grad_norm": 2.907693862915039, "learning_rate": 1.8284637681159423e-05, "loss": 1.7774, "step": 3460 }, { "epoch": 0.29, "grad_norm": 2.013425350189209, "learning_rate": 1.8278840579710147e-05, "loss": 1.8186, "step": 3470 }, { "epoch": 0.29, "grad_norm": 1.129022479057312, "learning_rate": 1.827304347826087e-05, "loss": 1.827, "step": 3480 }, { "epoch": 0.29, "grad_norm": 2.979015350341797, "learning_rate": 1.8267246376811595e-05, "loss": 1.6826, "step": 3490 }, { "epoch": 0.29, "grad_norm": 2.317847967147827, "learning_rate": 1.826144927536232e-05, "loss": 1.8459, "step": 3500 }, { "epoch": 0.29, "eval_loss": 1.7489032745361328, "eval_runtime": 107.4616, "eval_samples_per_second": 9.306, "eval_steps_per_second": 2.326, "step": 3500 }, { "epoch": 0.29, "grad_norm": 1.7150217294692993, "learning_rate": 1.8255652173913046e-05, "loss": 1.734, "step": 3510 }, { "epoch": 0.29, "grad_norm": 1.6586185693740845, "learning_rate": 1.824985507246377e-05, "loss": 1.7426, "step": 3520 }, { "epoch": 0.29, "grad_norm": 1.2894079685211182, "learning_rate": 1.8244057971014494e-05, "loss": 1.7626, "step": 3530 }, { "epoch": 0.29, "grad_norm": 2.0136561393737793, "learning_rate": 1.8238260869565218e-05, "loss": 1.7033, "step": 3540 }, { "epoch": 0.3, "grad_norm": 2.7215077877044678, "learning_rate": 1.8232463768115945e-05, "loss": 1.8255, "step": 3550 }, { "epoch": 0.3, "grad_norm": 3.4265096187591553, "learning_rate": 1.822666666666667e-05, "loss": 1.7646, "step": 3560 }, { "epoch": 0.3, "grad_norm": 2.2528915405273438, "learning_rate": 1.822086956521739e-05, "loss": 1.809, "step": 3570 }, { "epoch": 0.3, "grad_norm": 1.1465145349502563, "learning_rate": 1.8215072463768117e-05, "loss": 1.7062, "step": 3580 }, { "epoch": 0.3, "grad_norm": 1.697144865989685, "learning_rate": 1.820927536231884e-05, "loss": 1.7734, "step": 3590 }, { "epoch": 0.3, "grad_norm": 1.9666852951049805, "learning_rate": 1.8203478260869565e-05, "loss": 1.751, "step": 3600 }, { "epoch": 0.3, "grad_norm": 0.8137166500091553, "learning_rate": 1.819768115942029e-05, "loss": 1.7801, "step": 3610 }, { "epoch": 0.3, "grad_norm": 2.7446413040161133, "learning_rate": 1.8191884057971017e-05, "loss": 1.7313, "step": 3620 }, { "epoch": 0.3, "grad_norm": 3.2775075435638428, "learning_rate": 1.818608695652174e-05, "loss": 1.7378, "step": 3630 }, { "epoch": 0.3, "grad_norm": 1.90742027759552, "learning_rate": 1.8180289855072465e-05, "loss": 1.8367, "step": 3640 }, { "epoch": 0.3, "grad_norm": 1.5931909084320068, "learning_rate": 1.817449275362319e-05, "loss": 1.7352, "step": 3650 }, { "epoch": 0.3, "grad_norm": 2.411388397216797, "learning_rate": 1.8168695652173916e-05, "loss": 1.8184, "step": 3660 }, { "epoch": 0.31, "grad_norm": 2.2610573768615723, "learning_rate": 1.816289855072464e-05, "loss": 1.8392, "step": 3670 }, { "epoch": 0.31, "grad_norm": 1.2999207973480225, "learning_rate": 1.8157101449275364e-05, "loss": 1.7333, "step": 3680 }, { "epoch": 0.31, "grad_norm": 1.7404083013534546, "learning_rate": 1.815130434782609e-05, "loss": 1.734, "step": 3690 }, { "epoch": 0.31, "grad_norm": 1.5067249536514282, "learning_rate": 1.8145507246376812e-05, "loss": 1.7091, "step": 3700 }, { "epoch": 0.31, "grad_norm": 1.4666568040847778, "learning_rate": 1.813971014492754e-05, "loss": 1.6971, "step": 3710 }, { "epoch": 0.31, "grad_norm": 1.8228498697280884, "learning_rate": 1.813391304347826e-05, "loss": 1.6056, "step": 3720 }, { "epoch": 0.31, "grad_norm": 6.056992530822754, "learning_rate": 1.8128115942028988e-05, "loss": 1.7473, "step": 3730 }, { "epoch": 0.31, "grad_norm": 3.1190054416656494, "learning_rate": 1.8122318840579712e-05, "loss": 1.7543, "step": 3740 }, { "epoch": 0.31, "grad_norm": 2.7520599365234375, "learning_rate": 1.8116521739130436e-05, "loss": 1.8603, "step": 3750 }, { "epoch": 0.31, "grad_norm": 3.523573637008667, "learning_rate": 1.811072463768116e-05, "loss": 1.8091, "step": 3760 }, { "epoch": 0.31, "grad_norm": 1.5726646184921265, "learning_rate": 1.8104927536231887e-05, "loss": 1.728, "step": 3770 }, { "epoch": 0.32, "grad_norm": 2.465932846069336, "learning_rate": 1.809913043478261e-05, "loss": 1.5628, "step": 3780 }, { "epoch": 0.32, "grad_norm": 2.4633328914642334, "learning_rate": 1.8093333333333335e-05, "loss": 1.6955, "step": 3790 }, { "epoch": 0.32, "grad_norm": 1.3645209074020386, "learning_rate": 1.808753623188406e-05, "loss": 1.8479, "step": 3800 }, { "epoch": 0.32, "grad_norm": 1.8331495523452759, "learning_rate": 1.8081739130434783e-05, "loss": 1.7356, "step": 3810 }, { "epoch": 0.32, "grad_norm": 1.2319082021713257, "learning_rate": 1.807594202898551e-05, "loss": 1.6966, "step": 3820 }, { "epoch": 0.32, "grad_norm": 2.204197645187378, "learning_rate": 1.807014492753623e-05, "loss": 1.8275, "step": 3830 }, { "epoch": 0.32, "grad_norm": 2.6139984130859375, "learning_rate": 1.806434782608696e-05, "loss": 1.73, "step": 3840 }, { "epoch": 0.32, "grad_norm": 3.1955862045288086, "learning_rate": 1.8058550724637683e-05, "loss": 1.7662, "step": 3850 }, { "epoch": 0.32, "grad_norm": 0.9615164399147034, "learning_rate": 1.8052753623188407e-05, "loss": 1.8208, "step": 3860 }, { "epoch": 0.32, "grad_norm": 1.718482494354248, "learning_rate": 1.804695652173913e-05, "loss": 1.7241, "step": 3870 }, { "epoch": 0.32, "grad_norm": 3.189242124557495, "learning_rate": 1.8041159420289855e-05, "loss": 1.8602, "step": 3880 }, { "epoch": 0.32, "grad_norm": 3.950349807739258, "learning_rate": 1.8035362318840582e-05, "loss": 1.866, "step": 3890 }, { "epoch": 0.33, "grad_norm": 1.566131591796875, "learning_rate": 1.8029565217391306e-05, "loss": 1.528, "step": 3900 }, { "epoch": 0.33, "grad_norm": 1.409193515777588, "learning_rate": 1.802376811594203e-05, "loss": 1.7897, "step": 3910 }, { "epoch": 0.33, "grad_norm": 2.5493619441986084, "learning_rate": 1.8017971014492754e-05, "loss": 1.8192, "step": 3920 }, { "epoch": 0.33, "grad_norm": 3.087979793548584, "learning_rate": 1.801217391304348e-05, "loss": 1.7661, "step": 3930 }, { "epoch": 0.33, "grad_norm": 2.98085355758667, "learning_rate": 1.8006376811594205e-05, "loss": 1.7836, "step": 3940 }, { "epoch": 0.33, "grad_norm": 1.6179476976394653, "learning_rate": 1.800057971014493e-05, "loss": 1.8128, "step": 3950 }, { "epoch": 0.33, "grad_norm": 1.0330349206924438, "learning_rate": 1.7994782608695653e-05, "loss": 1.8616, "step": 3960 }, { "epoch": 0.33, "grad_norm": 1.4651768207550049, "learning_rate": 1.798898550724638e-05, "loss": 1.8311, "step": 3970 }, { "epoch": 0.33, "grad_norm": 0.7031122446060181, "learning_rate": 1.79831884057971e-05, "loss": 1.6809, "step": 3980 }, { "epoch": 0.33, "grad_norm": 1.734285831451416, "learning_rate": 1.7977391304347825e-05, "loss": 1.7419, "step": 3990 }, { "epoch": 0.33, "grad_norm": 2.079700469970703, "learning_rate": 1.7971594202898553e-05, "loss": 1.7631, "step": 4000 }, { "epoch": 0.33, "eval_loss": 1.7669097185134888, "eval_runtime": 107.4754, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 4000 }, { "epoch": 0.33, "grad_norm": 1.9647717475891113, "learning_rate": 1.7965797101449277e-05, "loss": 1.723, "step": 4010 }, { "epoch": 0.34, "grad_norm": 2.0198094844818115, "learning_rate": 1.796e-05, "loss": 1.5237, "step": 4020 }, { "epoch": 0.34, "grad_norm": 1.165492296218872, "learning_rate": 1.7954202898550725e-05, "loss": 1.7683, "step": 4030 }, { "epoch": 0.34, "grad_norm": 2.6819276809692383, "learning_rate": 1.7948405797101452e-05, "loss": 1.7101, "step": 4040 }, { "epoch": 0.34, "grad_norm": 1.1148452758789062, "learning_rate": 1.7942608695652176e-05, "loss": 1.6593, "step": 4050 }, { "epoch": 0.34, "grad_norm": 1.5147534608840942, "learning_rate": 1.79368115942029e-05, "loss": 1.599, "step": 4060 }, { "epoch": 0.34, "grad_norm": 1.868163824081421, "learning_rate": 1.7931014492753624e-05, "loss": 1.7865, "step": 4070 }, { "epoch": 0.34, "grad_norm": 2.7418978214263916, "learning_rate": 1.792521739130435e-05, "loss": 1.7432, "step": 4080 }, { "epoch": 0.34, "grad_norm": 1.6911894083023071, "learning_rate": 1.7919420289855076e-05, "loss": 1.8255, "step": 4090 }, { "epoch": 0.34, "grad_norm": 2.916471004486084, "learning_rate": 1.7913623188405796e-05, "loss": 1.8335, "step": 4100 }, { "epoch": 0.34, "grad_norm": 1.2466706037521362, "learning_rate": 1.7907826086956524e-05, "loss": 1.8451, "step": 4110 }, { "epoch": 0.34, "grad_norm": 1.5247670412063599, "learning_rate": 1.7902028985507248e-05, "loss": 1.7443, "step": 4120 }, { "epoch": 0.34, "grad_norm": 1.580941081047058, "learning_rate": 1.789623188405797e-05, "loss": 1.8384, "step": 4130 }, { "epoch": 0.34, "grad_norm": 4.8174824714660645, "learning_rate": 1.7890434782608696e-05, "loss": 1.6805, "step": 4140 }, { "epoch": 0.35, "grad_norm": 1.682310938835144, "learning_rate": 1.7884637681159423e-05, "loss": 1.7765, "step": 4150 }, { "epoch": 0.35, "grad_norm": 3.392796039581299, "learning_rate": 1.7878840579710147e-05, "loss": 1.6769, "step": 4160 }, { "epoch": 0.35, "grad_norm": 1.4888694286346436, "learning_rate": 1.787304347826087e-05, "loss": 1.7414, "step": 4170 }, { "epoch": 0.35, "grad_norm": 3.9246952533721924, "learning_rate": 1.7867246376811595e-05, "loss": 1.7297, "step": 4180 }, { "epoch": 0.35, "grad_norm": 1.4315427541732788, "learning_rate": 1.786144927536232e-05, "loss": 1.7288, "step": 4190 }, { "epoch": 0.35, "grad_norm": 1.2288168668746948, "learning_rate": 1.7855652173913046e-05, "loss": 1.9408, "step": 4200 }, { "epoch": 0.35, "grad_norm": 1.8101353645324707, "learning_rate": 1.7849855072463767e-05, "loss": 1.7743, "step": 4210 }, { "epoch": 0.35, "grad_norm": 1.800456166267395, "learning_rate": 1.7844057971014495e-05, "loss": 1.6975, "step": 4220 }, { "epoch": 0.35, "grad_norm": 1.9426621198654175, "learning_rate": 1.783826086956522e-05, "loss": 1.9247, "step": 4230 }, { "epoch": 0.35, "grad_norm": 1.1264923810958862, "learning_rate": 1.7832463768115943e-05, "loss": 1.8713, "step": 4240 }, { "epoch": 0.35, "grad_norm": 3.0976786613464355, "learning_rate": 1.7826666666666667e-05, "loss": 1.8226, "step": 4250 }, { "epoch": 0.35, "grad_norm": 3.625213146209717, "learning_rate": 1.7820869565217394e-05, "loss": 1.7695, "step": 4260 }, { "epoch": 0.36, "grad_norm": 1.8756457567214966, "learning_rate": 1.7815072463768118e-05, "loss": 1.6924, "step": 4270 }, { "epoch": 0.36, "grad_norm": 1.0314109325408936, "learning_rate": 1.7809275362318842e-05, "loss": 1.8747, "step": 4280 }, { "epoch": 0.36, "grad_norm": 2.296934127807617, "learning_rate": 1.7803478260869566e-05, "loss": 1.7441, "step": 4290 }, { "epoch": 0.36, "grad_norm": 3.1548571586608887, "learning_rate": 1.779768115942029e-05, "loss": 1.7059, "step": 4300 }, { "epoch": 0.36, "grad_norm": 4.312896728515625, "learning_rate": 1.7791884057971017e-05, "loss": 1.8683, "step": 4310 }, { "epoch": 0.36, "grad_norm": 3.2402548789978027, "learning_rate": 1.778608695652174e-05, "loss": 1.8141, "step": 4320 }, { "epoch": 0.36, "grad_norm": 3.2827465534210205, "learning_rate": 1.7780289855072465e-05, "loss": 1.8275, "step": 4330 }, { "epoch": 0.36, "grad_norm": 2.1937191486358643, "learning_rate": 1.777449275362319e-05, "loss": 1.8033, "step": 4340 }, { "epoch": 0.36, "grad_norm": 2.4238903522491455, "learning_rate": 1.7768695652173917e-05, "loss": 1.8152, "step": 4350 }, { "epoch": 0.36, "grad_norm": 3.212376117706299, "learning_rate": 1.7762898550724637e-05, "loss": 1.699, "step": 4360 }, { "epoch": 0.36, "grad_norm": 3.7652318477630615, "learning_rate": 1.7757101449275365e-05, "loss": 1.6854, "step": 4370 }, { "epoch": 0.36, "grad_norm": 1.9386508464813232, "learning_rate": 1.775130434782609e-05, "loss": 1.7143, "step": 4380 }, { "epoch": 0.37, "grad_norm": 3.230910062789917, "learning_rate": 1.7745507246376813e-05, "loss": 1.7512, "step": 4390 }, { "epoch": 0.37, "grad_norm": 4.213496685028076, "learning_rate": 1.7739710144927537e-05, "loss": 1.7486, "step": 4400 }, { "epoch": 0.37, "grad_norm": 3.4668760299682617, "learning_rate": 1.773391304347826e-05, "loss": 1.687, "step": 4410 }, { "epoch": 0.37, "grad_norm": 1.9463025331497192, "learning_rate": 1.7728115942028988e-05, "loss": 1.7829, "step": 4420 }, { "epoch": 0.37, "grad_norm": 2.4058449268341064, "learning_rate": 1.7722318840579712e-05, "loss": 1.6866, "step": 4430 }, { "epoch": 0.37, "grad_norm": 1.426607370376587, "learning_rate": 1.7716521739130436e-05, "loss": 1.7626, "step": 4440 }, { "epoch": 0.37, "grad_norm": 2.6386098861694336, "learning_rate": 1.771072463768116e-05, "loss": 1.7471, "step": 4450 }, { "epoch": 0.37, "grad_norm": 3.66159987449646, "learning_rate": 1.7704927536231888e-05, "loss": 1.6963, "step": 4460 }, { "epoch": 0.37, "grad_norm": 1.790877103805542, "learning_rate": 1.7699130434782608e-05, "loss": 1.8013, "step": 4470 }, { "epoch": 0.37, "grad_norm": 1.4159287214279175, "learning_rate": 1.7693333333333336e-05, "loss": 1.6159, "step": 4480 }, { "epoch": 0.37, "grad_norm": 4.394288539886475, "learning_rate": 1.768753623188406e-05, "loss": 1.7185, "step": 4490 }, { "epoch": 0.38, "grad_norm": 2.0887296199798584, "learning_rate": 1.7681739130434784e-05, "loss": 1.833, "step": 4500 }, { "epoch": 0.38, "eval_loss": 1.7682534456253052, "eval_runtime": 107.4724, "eval_samples_per_second": 9.305, "eval_steps_per_second": 2.326, "step": 4500 }, { "epoch": 0.38, "grad_norm": 4.085282325744629, "learning_rate": 1.7675942028985508e-05, "loss": 1.7992, "step": 4510 }, { "epoch": 0.38, "grad_norm": 2.182236671447754, "learning_rate": 1.767014492753623e-05, "loss": 1.8982, "step": 4520 }, { "epoch": 0.38, "grad_norm": 2.9894583225250244, "learning_rate": 1.766434782608696e-05, "loss": 1.6623, "step": 4530 }, { "epoch": 0.38, "grad_norm": 2.1609718799591064, "learning_rate": 1.7658550724637683e-05, "loss": 1.7441, "step": 4540 }, { "epoch": 0.38, "grad_norm": 1.7761262655258179, "learning_rate": 1.7652753623188407e-05, "loss": 1.7824, "step": 4550 }, { "epoch": 0.38, "grad_norm": 3.564272880554199, "learning_rate": 1.764695652173913e-05, "loss": 1.7627, "step": 4560 }, { "epoch": 0.38, "grad_norm": 1.6390734910964966, "learning_rate": 1.764115942028986e-05, "loss": 1.7783, "step": 4570 }, { "epoch": 0.38, "grad_norm": 1.8177413940429688, "learning_rate": 1.7635362318840582e-05, "loss": 1.6956, "step": 4580 }, { "epoch": 0.38, "grad_norm": 1.2485848665237427, "learning_rate": 1.7629565217391306e-05, "loss": 1.7598, "step": 4590 }, { "epoch": 0.38, "grad_norm": 2.094052791595459, "learning_rate": 1.762376811594203e-05, "loss": 1.8477, "step": 4600 }, { "epoch": 0.38, "grad_norm": 2.854745864868164, "learning_rate": 1.7617971014492754e-05, "loss": 1.6767, "step": 4610 }, { "epoch": 0.39, "grad_norm": 2.275739908218384, "learning_rate": 1.761217391304348e-05, "loss": 1.7992, "step": 4620 }, { "epoch": 0.39, "grad_norm": 4.860130786895752, "learning_rate": 1.7606376811594202e-05, "loss": 1.6274, "step": 4630 }, { "epoch": 0.39, "grad_norm": 1.5182762145996094, "learning_rate": 1.760057971014493e-05, "loss": 1.8214, "step": 4640 }, { "epoch": 0.39, "grad_norm": 1.3440704345703125, "learning_rate": 1.7594782608695654e-05, "loss": 1.778, "step": 4650 }, { "epoch": 0.39, "grad_norm": 1.2678601741790771, "learning_rate": 1.7588985507246378e-05, "loss": 1.6729, "step": 4660 }, { "epoch": 0.39, "grad_norm": 2.45300030708313, "learning_rate": 1.7583188405797102e-05, "loss": 1.7167, "step": 4670 }, { "epoch": 0.39, "grad_norm": 1.4436949491500854, "learning_rate": 1.757739130434783e-05, "loss": 1.7222, "step": 4680 }, { "epoch": 0.39, "grad_norm": 1.70391047000885, "learning_rate": 1.7571594202898553e-05, "loss": 1.802, "step": 4690 }, { "epoch": 0.39, "grad_norm": 4.7926859855651855, "learning_rate": 1.7565797101449277e-05, "loss": 1.7951, "step": 4700 }, { "epoch": 0.39, "grad_norm": 1.777199387550354, "learning_rate": 1.756e-05, "loss": 1.7408, "step": 4710 }, { "epoch": 0.39, "grad_norm": 0.9820401072502136, "learning_rate": 1.7554202898550725e-05, "loss": 1.8236, "step": 4720 }, { "epoch": 0.39, "grad_norm": 2.505861759185791, "learning_rate": 1.754840579710145e-05, "loss": 1.6859, "step": 4730 }, { "epoch": 0.4, "grad_norm": 3.0576391220092773, "learning_rate": 1.7542608695652173e-05, "loss": 1.7222, "step": 4740 }, { "epoch": 0.4, "grad_norm": 3.96050763130188, "learning_rate": 1.75368115942029e-05, "loss": 1.8761, "step": 4750 }, { "epoch": 0.4, "grad_norm": 4.387930870056152, "learning_rate": 1.7531014492753625e-05, "loss": 1.6781, "step": 4760 }, { "epoch": 0.4, "grad_norm": 6.898324012756348, "learning_rate": 1.752521739130435e-05, "loss": 1.783, "step": 4770 }, { "epoch": 0.4, "grad_norm": 4.5490803718566895, "learning_rate": 1.7519420289855073e-05, "loss": 1.7627, "step": 4780 }, { "epoch": 0.4, "grad_norm": 5.1455864906311035, "learning_rate": 1.75136231884058e-05, "loss": 1.7222, "step": 4790 }, { "epoch": 0.4, "grad_norm": 2.632255792617798, "learning_rate": 1.7507826086956524e-05, "loss": 1.8319, "step": 4800 }, { "epoch": 0.4, "grad_norm": 1.351136565208435, "learning_rate": 1.7502028985507248e-05, "loss": 1.7507, "step": 4810 }, { "epoch": 0.4, "grad_norm": 1.6380447149276733, "learning_rate": 1.7496231884057972e-05, "loss": 1.6645, "step": 4820 }, { "epoch": 0.4, "grad_norm": 1.601467490196228, "learning_rate": 1.7490434782608696e-05, "loss": 1.8176, "step": 4830 }, { "epoch": 0.4, "grad_norm": 1.0333625078201294, "learning_rate": 1.7484637681159424e-05, "loss": 1.6752, "step": 4840 }, { "epoch": 0.4, "grad_norm": 2.7758312225341797, "learning_rate": 1.7478840579710144e-05, "loss": 1.7522, "step": 4850 }, { "epoch": 0.41, "grad_norm": 3.394230842590332, "learning_rate": 1.747304347826087e-05, "loss": 1.6703, "step": 4860 }, { "epoch": 0.41, "grad_norm": 1.6446850299835205, "learning_rate": 1.7467246376811596e-05, "loss": 1.6016, "step": 4870 }, { "epoch": 0.41, "grad_norm": 3.261119842529297, "learning_rate": 1.746144927536232e-05, "loss": 1.598, "step": 4880 }, { "epoch": 0.41, "grad_norm": 3.5779495239257812, "learning_rate": 1.7455652173913044e-05, "loss": 1.7129, "step": 4890 }, { "epoch": 0.41, "grad_norm": 2.2901227474212646, "learning_rate": 1.744985507246377e-05, "loss": 1.6935, "step": 4900 }, { "epoch": 0.41, "grad_norm": 2.433979034423828, "learning_rate": 1.7444057971014495e-05, "loss": 1.6954, "step": 4910 }, { "epoch": 0.41, "grad_norm": 0.8954328894615173, "learning_rate": 1.743826086956522e-05, "loss": 1.7721, "step": 4920 }, { "epoch": 0.41, "grad_norm": 2.513370990753174, "learning_rate": 1.7432463768115943e-05, "loss": 1.6607, "step": 4930 }, { "epoch": 0.41, "grad_norm": 2.7021384239196777, "learning_rate": 1.7426666666666667e-05, "loss": 1.7342, "step": 4940 }, { "epoch": 0.41, "grad_norm": 3.947293519973755, "learning_rate": 1.7420869565217394e-05, "loss": 1.6413, "step": 4950 }, { "epoch": 0.41, "grad_norm": 1.9602371454238892, "learning_rate": 1.741507246376812e-05, "loss": 1.6927, "step": 4960 }, { "epoch": 0.41, "grad_norm": 2.6332249641418457, "learning_rate": 1.7409275362318842e-05, "loss": 1.5976, "step": 4970 }, { "epoch": 0.41, "grad_norm": 2.293816328048706, "learning_rate": 1.7403478260869566e-05, "loss": 1.7207, "step": 4980 }, { "epoch": 0.42, "grad_norm": 6.466865539550781, "learning_rate": 1.7397681159420294e-05, "loss": 1.7308, "step": 4990 }, { "epoch": 0.42, "grad_norm": 4.4341559410095215, "learning_rate": 1.7391884057971014e-05, "loss": 1.7434, "step": 5000 }, { "epoch": 0.42, "eval_loss": 1.7550283670425415, "eval_runtime": 107.4768, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 5000 }, { "epoch": 0.42, "grad_norm": 1.294936180114746, "learning_rate": 1.7386086956521742e-05, "loss": 1.746, "step": 5010 }, { "epoch": 0.42, "grad_norm": 3.660215139389038, "learning_rate": 1.7380289855072466e-05, "loss": 1.7183, "step": 5020 }, { "epoch": 0.42, "grad_norm": 1.8590058088302612, "learning_rate": 1.737449275362319e-05, "loss": 1.8525, "step": 5030 }, { "epoch": 0.42, "grad_norm": 1.5797920227050781, "learning_rate": 1.7368695652173914e-05, "loss": 1.7257, "step": 5040 }, { "epoch": 0.42, "grad_norm": 1.3411496877670288, "learning_rate": 1.7362898550724638e-05, "loss": 1.8922, "step": 5050 }, { "epoch": 0.42, "grad_norm": 3.5491766929626465, "learning_rate": 1.7357101449275365e-05, "loss": 1.7402, "step": 5060 }, { "epoch": 0.42, "grad_norm": 2.72056245803833, "learning_rate": 1.735130434782609e-05, "loss": 1.7225, "step": 5070 }, { "epoch": 0.42, "grad_norm": 2.4798800945281982, "learning_rate": 1.7345507246376813e-05, "loss": 1.7624, "step": 5080 }, { "epoch": 0.42, "grad_norm": 2.2017393112182617, "learning_rate": 1.7339710144927537e-05, "loss": 1.7921, "step": 5090 }, { "epoch": 0.42, "grad_norm": 3.185555934906006, "learning_rate": 1.7333913043478265e-05, "loss": 1.615, "step": 5100 }, { "epoch": 0.43, "grad_norm": 1.8740787506103516, "learning_rate": 1.7328115942028985e-05, "loss": 1.778, "step": 5110 }, { "epoch": 0.43, "grad_norm": 2.6810436248779297, "learning_rate": 1.732231884057971e-05, "loss": 1.7957, "step": 5120 }, { "epoch": 0.43, "grad_norm": 2.130495309829712, "learning_rate": 1.7316521739130437e-05, "loss": 1.8177, "step": 5130 }, { "epoch": 0.43, "grad_norm": 4.660665988922119, "learning_rate": 1.731072463768116e-05, "loss": 1.8078, "step": 5140 }, { "epoch": 0.43, "grad_norm": 1.669716238975525, "learning_rate": 1.7304927536231885e-05, "loss": 1.7621, "step": 5150 }, { "epoch": 0.43, "grad_norm": 1.6844958066940308, "learning_rate": 1.729913043478261e-05, "loss": 1.6396, "step": 5160 }, { "epoch": 0.43, "grad_norm": 2.097205877304077, "learning_rate": 1.7293333333333336e-05, "loss": 1.7409, "step": 5170 }, { "epoch": 0.43, "grad_norm": 0.8433555960655212, "learning_rate": 1.728753623188406e-05, "loss": 1.8663, "step": 5180 }, { "epoch": 0.43, "grad_norm": 3.1566624641418457, "learning_rate": 1.7281739130434784e-05, "loss": 1.5808, "step": 5190 }, { "epoch": 0.43, "grad_norm": 2.130633592605591, "learning_rate": 1.7275942028985508e-05, "loss": 1.6708, "step": 5200 }, { "epoch": 0.43, "grad_norm": 2.3637614250183105, "learning_rate": 1.7270144927536235e-05, "loss": 1.7204, "step": 5210 }, { "epoch": 0.43, "grad_norm": 6.3107686042785645, "learning_rate": 1.726434782608696e-05, "loss": 1.7913, "step": 5220 }, { "epoch": 0.44, "grad_norm": 1.3103054761886597, "learning_rate": 1.725855072463768e-05, "loss": 1.7843, "step": 5230 }, { "epoch": 0.44, "grad_norm": 4.409878730773926, "learning_rate": 1.7252753623188407e-05, "loss": 1.5763, "step": 5240 }, { "epoch": 0.44, "grad_norm": 1.2016843557357788, "learning_rate": 1.724695652173913e-05, "loss": 1.7474, "step": 5250 }, { "epoch": 0.44, "grad_norm": 2.6358120441436768, "learning_rate": 1.7241159420289855e-05, "loss": 1.6569, "step": 5260 }, { "epoch": 0.44, "grad_norm": 2.816072702407837, "learning_rate": 1.723536231884058e-05, "loss": 1.7288, "step": 5270 }, { "epoch": 0.44, "grad_norm": 5.151131629943848, "learning_rate": 1.7229565217391307e-05, "loss": 1.7776, "step": 5280 }, { "epoch": 0.44, "grad_norm": 1.895945429801941, "learning_rate": 1.722376811594203e-05, "loss": 1.7824, "step": 5290 }, { "epoch": 0.44, "grad_norm": 2.5758895874023438, "learning_rate": 1.7217971014492755e-05, "loss": 1.6245, "step": 5300 }, { "epoch": 0.44, "grad_norm": 1.6517353057861328, "learning_rate": 1.721217391304348e-05, "loss": 1.5666, "step": 5310 }, { "epoch": 0.44, "grad_norm": 3.3738932609558105, "learning_rate": 1.7206376811594206e-05, "loss": 1.6949, "step": 5320 }, { "epoch": 0.44, "grad_norm": 4.818183898925781, "learning_rate": 1.720057971014493e-05, "loss": 1.7589, "step": 5330 }, { "epoch": 0.45, "grad_norm": 3.0807089805603027, "learning_rate": 1.7194782608695654e-05, "loss": 1.8133, "step": 5340 }, { "epoch": 0.45, "grad_norm": 0.9534027576446533, "learning_rate": 1.7188985507246378e-05, "loss": 1.7675, "step": 5350 }, { "epoch": 0.45, "grad_norm": 2.3471546173095703, "learning_rate": 1.7183188405797102e-05, "loss": 1.4739, "step": 5360 }, { "epoch": 0.45, "grad_norm": 2.1540451049804688, "learning_rate": 1.7177391304347826e-05, "loss": 1.7602, "step": 5370 }, { "epoch": 0.45, "grad_norm": 2.6220521926879883, "learning_rate": 1.717159420289855e-05, "loss": 1.8812, "step": 5380 }, { "epoch": 0.45, "grad_norm": 2.6644699573516846, "learning_rate": 1.7165797101449278e-05, "loss": 1.8191, "step": 5390 }, { "epoch": 0.45, "grad_norm": 4.378482818603516, "learning_rate": 1.7160000000000002e-05, "loss": 1.7765, "step": 5400 }, { "epoch": 0.45, "grad_norm": 3.0857391357421875, "learning_rate": 1.7154202898550726e-05, "loss": 1.7762, "step": 5410 }, { "epoch": 0.45, "grad_norm": 0.7650538086891174, "learning_rate": 1.714840579710145e-05, "loss": 1.7468, "step": 5420 }, { "epoch": 0.45, "grad_norm": 1.3827682733535767, "learning_rate": 1.7142608695652174e-05, "loss": 1.8392, "step": 5430 }, { "epoch": 0.45, "grad_norm": 7.6188459396362305, "learning_rate": 1.71368115942029e-05, "loss": 1.765, "step": 5440 }, { "epoch": 0.45, "grad_norm": 1.591964602470398, "learning_rate": 1.7131014492753625e-05, "loss": 1.6735, "step": 5450 }, { "epoch": 0.46, "grad_norm": 2.4797956943511963, "learning_rate": 1.712521739130435e-05, "loss": 1.8693, "step": 5460 }, { "epoch": 0.46, "grad_norm": 1.759324312210083, "learning_rate": 1.7119420289855073e-05, "loss": 1.691, "step": 5470 }, { "epoch": 0.46, "grad_norm": 2.8067173957824707, "learning_rate": 1.71136231884058e-05, "loss": 1.8131, "step": 5480 }, { "epoch": 0.46, "grad_norm": 1.9415967464447021, "learning_rate": 1.710782608695652e-05, "loss": 1.8269, "step": 5490 }, { "epoch": 0.46, "grad_norm": 3.027404308319092, "learning_rate": 1.710202898550725e-05, "loss": 1.8457, "step": 5500 }, { "epoch": 0.46, "eval_loss": 1.7384296655654907, "eval_runtime": 107.4854, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 5500 }, { "epoch": 0.46, "grad_norm": 2.0170934200286865, "learning_rate": 1.7096231884057973e-05, "loss": 1.7555, "step": 5510 }, { "epoch": 0.46, "grad_norm": 1.8536728620529175, "learning_rate": 1.7090434782608697e-05, "loss": 1.6274, "step": 5520 }, { "epoch": 0.46, "grad_norm": 0.8969680070877075, "learning_rate": 1.708463768115942e-05, "loss": 1.7977, "step": 5530 }, { "epoch": 0.46, "grad_norm": 1.3134697675704956, "learning_rate": 1.7078840579710145e-05, "loss": 1.7819, "step": 5540 }, { "epoch": 0.46, "grad_norm": 2.6828033924102783, "learning_rate": 1.7073043478260872e-05, "loss": 1.924, "step": 5550 }, { "epoch": 0.46, "grad_norm": 1.980905294418335, "learning_rate": 1.7067246376811596e-05, "loss": 1.7677, "step": 5560 }, { "epoch": 0.46, "grad_norm": 4.521270751953125, "learning_rate": 1.706144927536232e-05, "loss": 1.7319, "step": 5570 }, { "epoch": 0.47, "grad_norm": 1.6345350742340088, "learning_rate": 1.7055652173913044e-05, "loss": 1.8736, "step": 5580 }, { "epoch": 0.47, "grad_norm": 1.6569340229034424, "learning_rate": 1.704985507246377e-05, "loss": 1.7549, "step": 5590 }, { "epoch": 0.47, "grad_norm": 2.3906219005584717, "learning_rate": 1.7044057971014495e-05, "loss": 1.8223, "step": 5600 }, { "epoch": 0.47, "grad_norm": 1.6184358596801758, "learning_rate": 1.703826086956522e-05, "loss": 1.7187, "step": 5610 }, { "epoch": 0.47, "grad_norm": 1.9662846326828003, "learning_rate": 1.7032463768115943e-05, "loss": 1.6826, "step": 5620 }, { "epoch": 0.47, "grad_norm": 1.2513630390167236, "learning_rate": 1.702666666666667e-05, "loss": 1.8264, "step": 5630 }, { "epoch": 0.47, "grad_norm": 2.757554769515991, "learning_rate": 1.702086956521739e-05, "loss": 1.7352, "step": 5640 }, { "epoch": 0.47, "grad_norm": 0.9750083088874817, "learning_rate": 1.7015072463768115e-05, "loss": 1.68, "step": 5650 }, { "epoch": 0.47, "grad_norm": 2.763793706893921, "learning_rate": 1.7009275362318843e-05, "loss": 1.7913, "step": 5660 }, { "epoch": 0.47, "grad_norm": 2.929349899291992, "learning_rate": 1.7003478260869567e-05, "loss": 1.7713, "step": 5670 }, { "epoch": 0.47, "grad_norm": 4.068824768066406, "learning_rate": 1.699768115942029e-05, "loss": 1.6546, "step": 5680 }, { "epoch": 0.47, "grad_norm": 3.953929901123047, "learning_rate": 1.6991884057971015e-05, "loss": 1.8524, "step": 5690 }, { "epoch": 0.47, "grad_norm": 1.38455069065094, "learning_rate": 1.6986086956521742e-05, "loss": 1.8452, "step": 5700 }, { "epoch": 0.48, "grad_norm": 1.4616096019744873, "learning_rate": 1.6980289855072466e-05, "loss": 1.7802, "step": 5710 }, { "epoch": 0.48, "grad_norm": 3.079864263534546, "learning_rate": 1.697449275362319e-05, "loss": 1.6293, "step": 5720 }, { "epoch": 0.48, "grad_norm": 4.769149303436279, "learning_rate": 1.6968695652173914e-05, "loss": 1.7868, "step": 5730 }, { "epoch": 0.48, "grad_norm": 3.6357669830322266, "learning_rate": 1.6962898550724638e-05, "loss": 1.8268, "step": 5740 }, { "epoch": 0.48, "grad_norm": 4.242451190948486, "learning_rate": 1.6957101449275362e-05, "loss": 1.8289, "step": 5750 }, { "epoch": 0.48, "grad_norm": 1.4025174379348755, "learning_rate": 1.6951304347826086e-05, "loss": 1.7262, "step": 5760 }, { "epoch": 0.48, "grad_norm": 2.201425790786743, "learning_rate": 1.6945507246376814e-05, "loss": 1.7995, "step": 5770 }, { "epoch": 0.48, "grad_norm": 3.1672089099884033, "learning_rate": 1.6939710144927538e-05, "loss": 1.7804, "step": 5780 }, { "epoch": 0.48, "grad_norm": 1.394217848777771, "learning_rate": 1.693391304347826e-05, "loss": 1.6865, "step": 5790 }, { "epoch": 0.48, "grad_norm": 4.455097198486328, "learning_rate": 1.6928115942028986e-05, "loss": 1.6132, "step": 5800 }, { "epoch": 0.48, "grad_norm": 1.6794978380203247, "learning_rate": 1.6922318840579713e-05, "loss": 1.7146, "step": 5810 }, { "epoch": 0.48, "grad_norm": 4.268734455108643, "learning_rate": 1.6916521739130437e-05, "loss": 1.8458, "step": 5820 }, { "epoch": 0.49, "grad_norm": 0.9144909381866455, "learning_rate": 1.691072463768116e-05, "loss": 1.7298, "step": 5830 }, { "epoch": 0.49, "grad_norm": 1.2349727153778076, "learning_rate": 1.6904927536231885e-05, "loss": 1.6982, "step": 5840 }, { "epoch": 0.49, "grad_norm": 2.7180557250976562, "learning_rate": 1.689913043478261e-05, "loss": 1.8027, "step": 5850 }, { "epoch": 0.49, "grad_norm": 3.6468281745910645, "learning_rate": 1.6893913043478262e-05, "loss": 1.8093, "step": 5860 }, { "epoch": 0.49, "grad_norm": 2.979691982269287, "learning_rate": 1.6888115942028986e-05, "loss": 1.5826, "step": 5870 }, { "epoch": 0.49, "grad_norm": 5.700094699859619, "learning_rate": 1.6882318840579713e-05, "loss": 1.6295, "step": 5880 }, { "epoch": 0.49, "grad_norm": 5.222003936767578, "learning_rate": 1.6876521739130437e-05, "loss": 1.655, "step": 5890 }, { "epoch": 0.49, "grad_norm": 1.3589376211166382, "learning_rate": 1.687072463768116e-05, "loss": 1.7627, "step": 5900 }, { "epoch": 0.49, "grad_norm": 2.710470676422119, "learning_rate": 1.6864927536231885e-05, "loss": 1.7624, "step": 5910 }, { "epoch": 0.49, "grad_norm": 1.3515254259109497, "learning_rate": 1.685913043478261e-05, "loss": 1.754, "step": 5920 }, { "epoch": 0.49, "grad_norm": 1.581727385520935, "learning_rate": 1.6853333333333333e-05, "loss": 1.7327, "step": 5930 }, { "epoch": 0.49, "grad_norm": 2.4707586765289307, "learning_rate": 1.6847536231884057e-05, "loss": 1.796, "step": 5940 }, { "epoch": 0.5, "grad_norm": 2.6579630374908447, "learning_rate": 1.6841739130434785e-05, "loss": 1.664, "step": 5950 }, { "epoch": 0.5, "grad_norm": 2.639225959777832, "learning_rate": 1.683594202898551e-05, "loss": 1.7041, "step": 5960 }, { "epoch": 0.5, "grad_norm": 2.8888185024261475, "learning_rate": 1.6830144927536233e-05, "loss": 1.7493, "step": 5970 }, { "epoch": 0.5, "grad_norm": 3.3176138401031494, "learning_rate": 1.6824347826086957e-05, "loss": 1.5773, "step": 5980 }, { "epoch": 0.5, "grad_norm": 1.5189319849014282, "learning_rate": 1.6818550724637684e-05, "loss": 1.6852, "step": 5990 }, { "epoch": 0.5, "grad_norm": 1.535059928894043, "learning_rate": 1.6812753623188408e-05, "loss": 1.7166, "step": 6000 }, { "epoch": 0.5, "eval_loss": 1.727913498878479, "eval_runtime": 107.4778, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 6000 }, { "epoch": 0.5, "grad_norm": 2.6467275619506836, "learning_rate": 1.6806956521739132e-05, "loss": 1.8415, "step": 6010 }, { "epoch": 0.5, "grad_norm": 1.4421344995498657, "learning_rate": 1.6801159420289856e-05, "loss": 1.6914, "step": 6020 }, { "epoch": 0.5, "grad_norm": 1.6661970615386963, "learning_rate": 1.679536231884058e-05, "loss": 1.8147, "step": 6030 }, { "epoch": 0.5, "grad_norm": 1.5970772504806519, "learning_rate": 1.6789565217391304e-05, "loss": 1.6534, "step": 6040 }, { "epoch": 0.5, "grad_norm": 3.1191587448120117, "learning_rate": 1.678376811594203e-05, "loss": 1.7521, "step": 6050 }, { "epoch": 0.51, "grad_norm": 1.529994249343872, "learning_rate": 1.6777971014492756e-05, "loss": 1.6383, "step": 6060 }, { "epoch": 0.51, "grad_norm": 1.4743109941482544, "learning_rate": 1.677217391304348e-05, "loss": 1.6432, "step": 6070 }, { "epoch": 0.51, "grad_norm": 2.2322564125061035, "learning_rate": 1.6766376811594204e-05, "loss": 1.6845, "step": 6080 }, { "epoch": 0.51, "grad_norm": 5.006726264953613, "learning_rate": 1.6760579710144928e-05, "loss": 1.8527, "step": 6090 }, { "epoch": 0.51, "grad_norm": 3.6824820041656494, "learning_rate": 1.6754782608695655e-05, "loss": 1.7306, "step": 6100 }, { "epoch": 0.51, "grad_norm": 1.2515430450439453, "learning_rate": 1.674898550724638e-05, "loss": 1.6419, "step": 6110 }, { "epoch": 0.51, "grad_norm": 1.548213005065918, "learning_rate": 1.6743188405797103e-05, "loss": 1.6744, "step": 6120 }, { "epoch": 0.51, "grad_norm": 2.4482362270355225, "learning_rate": 1.6737391304347827e-05, "loss": 1.8767, "step": 6130 }, { "epoch": 0.51, "grad_norm": 2.4814817905426025, "learning_rate": 1.673159420289855e-05, "loss": 1.71, "step": 6140 }, { "epoch": 0.51, "grad_norm": 1.6053385734558105, "learning_rate": 1.672579710144928e-05, "loss": 1.557, "step": 6150 }, { "epoch": 0.51, "grad_norm": 2.154989004135132, "learning_rate": 1.672e-05, "loss": 1.7413, "step": 6160 }, { "epoch": 0.51, "grad_norm": 3.2812576293945312, "learning_rate": 1.6714202898550727e-05, "loss": 1.7674, "step": 6170 }, { "epoch": 0.52, "grad_norm": 1.1822229623794556, "learning_rate": 1.670840579710145e-05, "loss": 1.7635, "step": 6180 }, { "epoch": 0.52, "grad_norm": 3.022081136703491, "learning_rate": 1.6702608695652175e-05, "loss": 1.9249, "step": 6190 }, { "epoch": 0.52, "grad_norm": 1.4836503267288208, "learning_rate": 1.66968115942029e-05, "loss": 1.7538, "step": 6200 }, { "epoch": 0.52, "grad_norm": 2.186819314956665, "learning_rate": 1.6691014492753626e-05, "loss": 1.7048, "step": 6210 }, { "epoch": 0.52, "grad_norm": 2.98888897895813, "learning_rate": 1.668521739130435e-05, "loss": 1.7195, "step": 6220 }, { "epoch": 0.52, "grad_norm": 1.2272439002990723, "learning_rate": 1.6679420289855074e-05, "loss": 1.7088, "step": 6230 }, { "epoch": 0.52, "grad_norm": 5.195132255554199, "learning_rate": 1.6673623188405798e-05, "loss": 1.9109, "step": 6240 }, { "epoch": 0.52, "grad_norm": 1.4767738580703735, "learning_rate": 1.6667826086956522e-05, "loss": 1.7119, "step": 6250 }, { "epoch": 0.52, "grad_norm": 4.870778560638428, "learning_rate": 1.666202898550725e-05, "loss": 1.6744, "step": 6260 }, { "epoch": 0.52, "grad_norm": 2.495901584625244, "learning_rate": 1.6656231884057973e-05, "loss": 1.7129, "step": 6270 }, { "epoch": 0.52, "grad_norm": 3.2989490032196045, "learning_rate": 1.6650434782608697e-05, "loss": 1.7543, "step": 6280 }, { "epoch": 0.52, "grad_norm": 1.2560532093048096, "learning_rate": 1.664463768115942e-05, "loss": 1.6504, "step": 6290 }, { "epoch": 0.53, "grad_norm": 1.1828808784484863, "learning_rate": 1.663884057971015e-05, "loss": 1.8215, "step": 6300 }, { "epoch": 0.53, "grad_norm": 1.3935143947601318, "learning_rate": 1.663304347826087e-05, "loss": 1.7343, "step": 6310 }, { "epoch": 0.53, "grad_norm": 2.2186152935028076, "learning_rate": 1.6627246376811597e-05, "loss": 1.8291, "step": 6320 }, { "epoch": 0.53, "grad_norm": 2.0307185649871826, "learning_rate": 1.662144927536232e-05, "loss": 1.7496, "step": 6330 }, { "epoch": 0.53, "grad_norm": 1.4176080226898193, "learning_rate": 1.6615652173913045e-05, "loss": 1.7847, "step": 6340 }, { "epoch": 0.53, "grad_norm": 2.357802391052246, "learning_rate": 1.660985507246377e-05, "loss": 1.7378, "step": 6350 }, { "epoch": 0.53, "grad_norm": 2.5888750553131104, "learning_rate": 1.6604057971014493e-05, "loss": 1.7028, "step": 6360 }, { "epoch": 0.53, "grad_norm": 1.2639836072921753, "learning_rate": 1.659826086956522e-05, "loss": 1.7333, "step": 6370 }, { "epoch": 0.53, "grad_norm": 2.8823935985565186, "learning_rate": 1.6592463768115944e-05, "loss": 1.7696, "step": 6380 }, { "epoch": 0.53, "grad_norm": 1.7665212154388428, "learning_rate": 1.6586666666666668e-05, "loss": 1.8186, "step": 6390 }, { "epoch": 0.53, "grad_norm": 3.463416814804077, "learning_rate": 1.6580869565217392e-05, "loss": 1.8244, "step": 6400 }, { "epoch": 0.53, "grad_norm": 1.7725675106048584, "learning_rate": 1.657507246376812e-05, "loss": 1.7399, "step": 6410 }, { "epoch": 0.54, "grad_norm": 2.1055688858032227, "learning_rate": 1.656927536231884e-05, "loss": 1.6752, "step": 6420 }, { "epoch": 0.54, "grad_norm": 3.611109972000122, "learning_rate": 1.6563478260869568e-05, "loss": 1.5842, "step": 6430 }, { "epoch": 0.54, "grad_norm": 2.2504234313964844, "learning_rate": 1.655768115942029e-05, "loss": 1.6229, "step": 6440 }, { "epoch": 0.54, "grad_norm": 2.377387046813965, "learning_rate": 1.6551884057971016e-05, "loss": 1.7606, "step": 6450 }, { "epoch": 0.54, "grad_norm": 1.7958356142044067, "learning_rate": 1.654608695652174e-05, "loss": 1.7271, "step": 6460 }, { "epoch": 0.54, "grad_norm": 1.7075881958007812, "learning_rate": 1.6540289855072464e-05, "loss": 1.7918, "step": 6470 }, { "epoch": 0.54, "grad_norm": 4.526883602142334, "learning_rate": 1.653449275362319e-05, "loss": 1.721, "step": 6480 }, { "epoch": 0.54, "grad_norm": 3.219193458557129, "learning_rate": 1.6528695652173915e-05, "loss": 1.8068, "step": 6490 }, { "epoch": 0.54, "grad_norm": 1.7453542947769165, "learning_rate": 1.652289855072464e-05, "loss": 1.7639, "step": 6500 }, { "epoch": 0.54, "eval_loss": 1.727725625038147, "eval_runtime": 107.4666, "eval_samples_per_second": 9.305, "eval_steps_per_second": 2.326, "step": 6500 }, { "epoch": 0.54, "grad_norm": 0.9024394750595093, "learning_rate": 1.6517101449275363e-05, "loss": 1.7855, "step": 6510 }, { "epoch": 0.54, "grad_norm": 2.008007287979126, "learning_rate": 1.651130434782609e-05, "loss": 1.7057, "step": 6520 }, { "epoch": 0.54, "grad_norm": 1.9877010583877563, "learning_rate": 1.6505507246376814e-05, "loss": 1.7263, "step": 6530 }, { "epoch": 0.55, "grad_norm": 4.027751445770264, "learning_rate": 1.6499710144927535e-05, "loss": 1.6714, "step": 6540 }, { "epoch": 0.55, "grad_norm": 5.0006256103515625, "learning_rate": 1.6493913043478262e-05, "loss": 1.6872, "step": 6550 }, { "epoch": 0.55, "grad_norm": 3.7696921825408936, "learning_rate": 1.6488115942028986e-05, "loss": 1.7761, "step": 6560 }, { "epoch": 0.55, "grad_norm": 4.299485683441162, "learning_rate": 1.648231884057971e-05, "loss": 1.6696, "step": 6570 }, { "epoch": 0.55, "grad_norm": 2.5291144847869873, "learning_rate": 1.6476521739130435e-05, "loss": 1.6788, "step": 6580 }, { "epoch": 0.55, "grad_norm": 4.1017231941223145, "learning_rate": 1.6470724637681162e-05, "loss": 1.6339, "step": 6590 }, { "epoch": 0.55, "grad_norm": 2.975684881210327, "learning_rate": 1.6464927536231886e-05, "loss": 1.63, "step": 6600 }, { "epoch": 0.55, "grad_norm": 1.5306545495986938, "learning_rate": 1.645913043478261e-05, "loss": 1.6622, "step": 6610 }, { "epoch": 0.55, "grad_norm": 1.816344976425171, "learning_rate": 1.6453333333333334e-05, "loss": 1.8507, "step": 6620 }, { "epoch": 0.55, "grad_norm": 3.180396556854248, "learning_rate": 1.644753623188406e-05, "loss": 1.6871, "step": 6630 }, { "epoch": 0.55, "grad_norm": 1.9468295574188232, "learning_rate": 1.6441739130434785e-05, "loss": 1.7409, "step": 6640 }, { "epoch": 0.55, "grad_norm": 1.6706266403198242, "learning_rate": 1.6435942028985506e-05, "loss": 1.6801, "step": 6650 }, { "epoch": 0.56, "grad_norm": 1.4290724992752075, "learning_rate": 1.6430144927536233e-05, "loss": 1.6932, "step": 6660 }, { "epoch": 0.56, "grad_norm": 2.8305716514587402, "learning_rate": 1.6424347826086957e-05, "loss": 1.8525, "step": 6670 }, { "epoch": 0.56, "grad_norm": 1.744149088859558, "learning_rate": 1.641855072463768e-05, "loss": 1.6997, "step": 6680 }, { "epoch": 0.56, "grad_norm": 3.0578601360321045, "learning_rate": 1.6412753623188405e-05, "loss": 1.706, "step": 6690 }, { "epoch": 0.56, "grad_norm": 2.352912425994873, "learning_rate": 1.6406956521739133e-05, "loss": 1.7743, "step": 6700 }, { "epoch": 0.56, "grad_norm": 1.978705883026123, "learning_rate": 1.6401159420289857e-05, "loss": 1.7195, "step": 6710 }, { "epoch": 0.56, "grad_norm": 1.2666630744934082, "learning_rate": 1.639536231884058e-05, "loss": 1.6889, "step": 6720 }, { "epoch": 0.56, "grad_norm": 1.681015133857727, "learning_rate": 1.6389565217391305e-05, "loss": 1.6464, "step": 6730 }, { "epoch": 0.56, "grad_norm": 1.7917625904083252, "learning_rate": 1.6383768115942032e-05, "loss": 1.7322, "step": 6740 }, { "epoch": 0.56, "grad_norm": 2.438145875930786, "learning_rate": 1.6377971014492756e-05, "loss": 1.6975, "step": 6750 }, { "epoch": 0.56, "grad_norm": 2.554532766342163, "learning_rate": 1.637217391304348e-05, "loss": 1.7592, "step": 6760 }, { "epoch": 0.56, "grad_norm": 3.3768386840820312, "learning_rate": 1.6366376811594204e-05, "loss": 1.6267, "step": 6770 }, { "epoch": 0.56, "grad_norm": 3.807661533355713, "learning_rate": 1.6360579710144928e-05, "loss": 1.6814, "step": 6780 }, { "epoch": 0.57, "grad_norm": 5.202757835388184, "learning_rate": 1.6354782608695656e-05, "loss": 1.7636, "step": 6790 }, { "epoch": 0.57, "grad_norm": 4.915995121002197, "learning_rate": 1.6348985507246376e-05, "loss": 1.6552, "step": 6800 }, { "epoch": 0.57, "grad_norm": 3.9319915771484375, "learning_rate": 1.6343188405797104e-05, "loss": 1.6542, "step": 6810 }, { "epoch": 0.57, "grad_norm": 2.227419137954712, "learning_rate": 1.6337391304347828e-05, "loss": 1.771, "step": 6820 }, { "epoch": 0.57, "grad_norm": 6.618062973022461, "learning_rate": 1.633159420289855e-05, "loss": 1.7122, "step": 6830 }, { "epoch": 0.57, "grad_norm": 1.2178608179092407, "learning_rate": 1.6325797101449276e-05, "loss": 1.6398, "step": 6840 }, { "epoch": 0.57, "grad_norm": 2.5877292156219482, "learning_rate": 1.632e-05, "loss": 1.8347, "step": 6850 }, { "epoch": 0.57, "grad_norm": 3.831631898880005, "learning_rate": 1.6314202898550727e-05, "loss": 1.6153, "step": 6860 }, { "epoch": 0.57, "grad_norm": 2.031569242477417, "learning_rate": 1.630840579710145e-05, "loss": 1.5382, "step": 6870 }, { "epoch": 0.57, "grad_norm": 5.036290645599365, "learning_rate": 1.6302608695652175e-05, "loss": 1.7124, "step": 6880 }, { "epoch": 0.57, "grad_norm": 3.04699444770813, "learning_rate": 1.62968115942029e-05, "loss": 1.8437, "step": 6890 }, { "epoch": 0.57, "grad_norm": 1.757121205329895, "learning_rate": 1.6291014492753626e-05, "loss": 1.7331, "step": 6900 }, { "epoch": 0.58, "grad_norm": 2.107645273208618, "learning_rate": 1.628521739130435e-05, "loss": 1.7234, "step": 6910 }, { "epoch": 0.58, "grad_norm": 2.109172821044922, "learning_rate": 1.6279420289855074e-05, "loss": 1.8055, "step": 6920 }, { "epoch": 0.58, "grad_norm": 2.344881772994995, "learning_rate": 1.62736231884058e-05, "loss": 1.7585, "step": 6930 }, { "epoch": 0.58, "grad_norm": 2.4652295112609863, "learning_rate": 1.6267826086956522e-05, "loss": 1.7634, "step": 6940 }, { "epoch": 0.58, "grad_norm": 3.3896074295043945, "learning_rate": 1.6262028985507246e-05, "loss": 1.8021, "step": 6950 }, { "epoch": 0.58, "grad_norm": 1.5588115453720093, "learning_rate": 1.625623188405797e-05, "loss": 1.7534, "step": 6960 }, { "epoch": 0.58, "grad_norm": 3.033330202102661, "learning_rate": 1.6250434782608698e-05, "loss": 1.6904, "step": 6970 }, { "epoch": 0.58, "grad_norm": 1.9910866022109985, "learning_rate": 1.6244637681159422e-05, "loss": 1.708, "step": 6980 }, { "epoch": 0.58, "grad_norm": 3.4834866523742676, "learning_rate": 1.6238840579710146e-05, "loss": 1.717, "step": 6990 }, { "epoch": 0.58, "grad_norm": 1.1782163381576538, "learning_rate": 1.623304347826087e-05, "loss": 1.5833, "step": 7000 }, { "epoch": 0.58, "eval_loss": 1.7107688188552856, "eval_runtime": 107.4755, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 7000 }, { "epoch": 0.58, "grad_norm": 3.3755738735198975, "learning_rate": 1.6227246376811597e-05, "loss": 1.845, "step": 7010 }, { "epoch": 0.58, "grad_norm": 1.7433241605758667, "learning_rate": 1.622144927536232e-05, "loss": 1.8534, "step": 7020 }, { "epoch": 0.59, "grad_norm": 6.202053070068359, "learning_rate": 1.6215652173913045e-05, "loss": 1.6611, "step": 7030 }, { "epoch": 0.59, "grad_norm": 1.540313720703125, "learning_rate": 1.620985507246377e-05, "loss": 1.7816, "step": 7040 }, { "epoch": 0.59, "grad_norm": 2.980997085571289, "learning_rate": 1.6204057971014497e-05, "loss": 1.8204, "step": 7050 }, { "epoch": 0.59, "grad_norm": 3.4950547218322754, "learning_rate": 1.6198260869565217e-05, "loss": 1.779, "step": 7060 }, { "epoch": 0.59, "grad_norm": 1.1530729532241821, "learning_rate": 1.619246376811594e-05, "loss": 1.7435, "step": 7070 }, { "epoch": 0.59, "grad_norm": 1.9771350622177124, "learning_rate": 1.618666666666667e-05, "loss": 1.65, "step": 7080 }, { "epoch": 0.59, "grad_norm": 3.06072735786438, "learning_rate": 1.6180869565217393e-05, "loss": 1.741, "step": 7090 }, { "epoch": 0.59, "grad_norm": 1.4323557615280151, "learning_rate": 1.6175072463768117e-05, "loss": 1.807, "step": 7100 }, { "epoch": 0.59, "grad_norm": 1.6237205266952515, "learning_rate": 1.616927536231884e-05, "loss": 1.742, "step": 7110 }, { "epoch": 0.59, "grad_norm": 2.375201463699341, "learning_rate": 1.6163478260869568e-05, "loss": 1.6201, "step": 7120 }, { "epoch": 0.59, "grad_norm": 3.091128349304199, "learning_rate": 1.6157681159420292e-05, "loss": 1.7904, "step": 7130 }, { "epoch": 0.59, "grad_norm": 1.4652012586593628, "learning_rate": 1.6151884057971016e-05, "loss": 1.6906, "step": 7140 }, { "epoch": 0.6, "grad_norm": 2.8465373516082764, "learning_rate": 1.614608695652174e-05, "loss": 1.8106, "step": 7150 }, { "epoch": 0.6, "grad_norm": 2.7708303928375244, "learning_rate": 1.6140289855072464e-05, "loss": 1.7154, "step": 7160 }, { "epoch": 0.6, "grad_norm": 2.694922924041748, "learning_rate": 1.613449275362319e-05, "loss": 1.738, "step": 7170 }, { "epoch": 0.6, "grad_norm": 2.2050352096557617, "learning_rate": 1.6128695652173912e-05, "loss": 1.7701, "step": 7180 }, { "epoch": 0.6, "grad_norm": 3.498955249786377, "learning_rate": 1.612289855072464e-05, "loss": 1.7794, "step": 7190 }, { "epoch": 0.6, "grad_norm": 1.483665108680725, "learning_rate": 1.6117101449275364e-05, "loss": 1.7243, "step": 7200 }, { "epoch": 0.6, "grad_norm": 2.6146090030670166, "learning_rate": 1.6111304347826088e-05, "loss": 1.7547, "step": 7210 }, { "epoch": 0.6, "grad_norm": 0.9853881597518921, "learning_rate": 1.610550724637681e-05, "loss": 1.6421, "step": 7220 }, { "epoch": 0.6, "grad_norm": 1.687666654586792, "learning_rate": 1.609971014492754e-05, "loss": 1.6646, "step": 7230 }, { "epoch": 0.6, "grad_norm": 1.8231513500213623, "learning_rate": 1.6093913043478263e-05, "loss": 1.8263, "step": 7240 }, { "epoch": 0.6, "grad_norm": 1.8006614446640015, "learning_rate": 1.6088115942028987e-05, "loss": 1.6252, "step": 7250 }, { "epoch": 0.6, "grad_norm": 1.6685547828674316, "learning_rate": 1.608231884057971e-05, "loss": 1.6308, "step": 7260 }, { "epoch": 0.61, "grad_norm": 4.637762069702148, "learning_rate": 1.6076521739130435e-05, "loss": 1.6501, "step": 7270 }, { "epoch": 0.61, "grad_norm": 2.1397578716278076, "learning_rate": 1.6070724637681162e-05, "loss": 1.7779, "step": 7280 }, { "epoch": 0.61, "grad_norm": 2.391406536102295, "learning_rate": 1.6064927536231883e-05, "loss": 1.7377, "step": 7290 }, { "epoch": 0.61, "grad_norm": 4.646698474884033, "learning_rate": 1.605913043478261e-05, "loss": 1.7138, "step": 7300 }, { "epoch": 0.61, "grad_norm": 1.9890764951705933, "learning_rate": 1.6053333333333334e-05, "loss": 1.636, "step": 7310 }, { "epoch": 0.61, "grad_norm": 1.770165205001831, "learning_rate": 1.604753623188406e-05, "loss": 1.8327, "step": 7320 }, { "epoch": 0.61, "grad_norm": 1.544312834739685, "learning_rate": 1.6041739130434782e-05, "loss": 1.7628, "step": 7330 }, { "epoch": 0.61, "grad_norm": 3.551856517791748, "learning_rate": 1.603594202898551e-05, "loss": 1.7112, "step": 7340 }, { "epoch": 0.61, "grad_norm": 4.357561111450195, "learning_rate": 1.6030144927536234e-05, "loss": 1.8364, "step": 7350 }, { "epoch": 0.61, "grad_norm": 8.165691375732422, "learning_rate": 1.6024347826086958e-05, "loss": 1.6699, "step": 7360 }, { "epoch": 0.61, "grad_norm": 1.3608006238937378, "learning_rate": 1.6018550724637682e-05, "loss": 1.8786, "step": 7370 }, { "epoch": 0.61, "grad_norm": 1.6346604824066162, "learning_rate": 1.6012753623188406e-05, "loss": 1.6912, "step": 7380 }, { "epoch": 0.62, "grad_norm": 2.323948383331299, "learning_rate": 1.6006956521739133e-05, "loss": 1.7305, "step": 7390 }, { "epoch": 0.62, "grad_norm": 1.645804762840271, "learning_rate": 1.6001159420289857e-05, "loss": 1.6702, "step": 7400 }, { "epoch": 0.62, "grad_norm": 1.5827347040176392, "learning_rate": 1.599536231884058e-05, "loss": 1.6253, "step": 7410 }, { "epoch": 0.62, "grad_norm": 2.5143661499023438, "learning_rate": 1.5989565217391305e-05, "loss": 1.6295, "step": 7420 }, { "epoch": 0.62, "grad_norm": 3.025846242904663, "learning_rate": 1.5983768115942033e-05, "loss": 1.6297, "step": 7430 }, { "epoch": 0.62, "grad_norm": 1.31910240650177, "learning_rate": 1.5977971014492753e-05, "loss": 1.66, "step": 7440 }, { "epoch": 0.62, "grad_norm": 3.5227108001708984, "learning_rate": 1.597217391304348e-05, "loss": 1.5631, "step": 7450 }, { "epoch": 0.62, "grad_norm": 2.1496291160583496, "learning_rate": 1.5966376811594205e-05, "loss": 1.7575, "step": 7460 }, { "epoch": 0.62, "grad_norm": 1.5027258396148682, "learning_rate": 1.596057971014493e-05, "loss": 1.7693, "step": 7470 }, { "epoch": 0.62, "grad_norm": 2.8915343284606934, "learning_rate": 1.5954782608695653e-05, "loss": 1.5916, "step": 7480 }, { "epoch": 0.62, "grad_norm": 1.5063656568527222, "learning_rate": 1.5948985507246377e-05, "loss": 1.8137, "step": 7490 }, { "epoch": 0.62, "grad_norm": 2.6433496475219727, "learning_rate": 1.5943188405797104e-05, "loss": 1.7322, "step": 7500 }, { "epoch": 0.62, "eval_loss": 1.705659031867981, "eval_runtime": 107.4816, "eval_samples_per_second": 9.304, "eval_steps_per_second": 2.326, "step": 7500 }, { "epoch": 0.63, "grad_norm": 4.358209609985352, "learning_rate": 1.5937391304347828e-05, "loss": 1.7619, "step": 7510 }, { "epoch": 0.63, "grad_norm": 4.150690078735352, "learning_rate": 1.5931594202898552e-05, "loss": 1.6083, "step": 7520 }, { "epoch": 0.63, "grad_norm": 5.220200061798096, "learning_rate": 1.5925797101449276e-05, "loss": 1.7934, "step": 7530 }, { "epoch": 0.63, "grad_norm": 1.5552024841308594, "learning_rate": 1.5920000000000003e-05, "loss": 1.7875, "step": 7540 }, { "epoch": 0.63, "grad_norm": 2.2470431327819824, "learning_rate": 1.5914202898550727e-05, "loss": 1.8069, "step": 7550 }, { "epoch": 0.63, "grad_norm": 2.775547981262207, "learning_rate": 1.590840579710145e-05, "loss": 1.7393, "step": 7560 }, { "epoch": 0.63, "grad_norm": 3.7677903175354004, "learning_rate": 1.5902608695652175e-05, "loss": 1.7661, "step": 7570 }, { "epoch": 0.63, "grad_norm": 1.0370768308639526, "learning_rate": 1.58968115942029e-05, "loss": 1.7814, "step": 7580 }, { "epoch": 0.63, "grad_norm": 1.4217703342437744, "learning_rate": 1.5891014492753623e-05, "loss": 1.7731, "step": 7590 }, { "epoch": 0.63, "grad_norm": 3.532466173171997, "learning_rate": 1.5885217391304347e-05, "loss": 1.68, "step": 7600 }, { "epoch": 0.63, "grad_norm": 3.8123104572296143, "learning_rate": 1.5879420289855075e-05, "loss": 1.6958, "step": 7610 }, { "epoch": 0.64, "grad_norm": 5.266079425811768, "learning_rate": 1.58736231884058e-05, "loss": 1.8838, "step": 7620 }, { "epoch": 0.64, "grad_norm": 4.8042216300964355, "learning_rate": 1.5867826086956523e-05, "loss": 1.7128, "step": 7630 }, { "epoch": 0.64, "grad_norm": 0.8047385215759277, "learning_rate": 1.5862028985507247e-05, "loss": 1.7068, "step": 7640 }, { "epoch": 0.64, "grad_norm": 2.157292604446411, "learning_rate": 1.5856231884057974e-05, "loss": 1.8066, "step": 7650 }, { "epoch": 0.64, "grad_norm": 1.3345887660980225, "learning_rate": 1.5850434782608698e-05, "loss": 1.6732, "step": 7660 }, { "epoch": 0.64, "grad_norm": 1.439162254333496, "learning_rate": 1.5844637681159422e-05, "loss": 1.698, "step": 7670 }, { "epoch": 0.64, "grad_norm": 4.751528263092041, "learning_rate": 1.5838840579710146e-05, "loss": 1.7296, "step": 7680 }, { "epoch": 0.64, "grad_norm": 3.4270384311676025, "learning_rate": 1.583304347826087e-05, "loss": 1.84, "step": 7690 }, { "epoch": 0.64, "grad_norm": 2.0358874797821045, "learning_rate": 1.5827246376811594e-05, "loss": 1.7292, "step": 7700 }, { "epoch": 0.64, "grad_norm": 1.3515872955322266, "learning_rate": 1.5821449275362318e-05, "loss": 1.6988, "step": 7710 }, { "epoch": 0.64, "grad_norm": 2.5109755992889404, "learning_rate": 1.5815652173913046e-05, "loss": 1.6356, "step": 7720 }, { "epoch": 0.64, "grad_norm": 2.2800748348236084, "learning_rate": 1.580985507246377e-05, "loss": 1.6617, "step": 7730 }, { "epoch": 0.65, "grad_norm": 7.169689178466797, "learning_rate": 1.5804057971014494e-05, "loss": 1.8323, "step": 7740 }, { "epoch": 0.65, "grad_norm": 1.6964682340621948, "learning_rate": 1.5798260869565218e-05, "loss": 1.7671, "step": 7750 }, { "epoch": 0.65, "grad_norm": 2.151012420654297, "learning_rate": 1.5792463768115945e-05, "loss": 1.7451, "step": 7760 }, { "epoch": 0.65, "grad_norm": 1.739316463470459, "learning_rate": 1.578666666666667e-05, "loss": 1.7101, "step": 7770 }, { "epoch": 0.65, "grad_norm": 1.8832248449325562, "learning_rate": 1.5780869565217393e-05, "loss": 1.7384, "step": 7780 }, { "epoch": 0.65, "grad_norm": 3.235473155975342, "learning_rate": 1.5775072463768117e-05, "loss": 1.6944, "step": 7790 }, { "epoch": 0.65, "grad_norm": 2.9296913146972656, "learning_rate": 1.576927536231884e-05, "loss": 1.7979, "step": 7800 }, { "epoch": 0.65, "grad_norm": 3.7439193725585938, "learning_rate": 1.576347826086957e-05, "loss": 1.6823, "step": 7810 }, { "epoch": 0.65, "grad_norm": 2.104919195175171, "learning_rate": 1.575768115942029e-05, "loss": 1.9278, "step": 7820 }, { "epoch": 0.65, "grad_norm": 2.0970284938812256, "learning_rate": 1.5751884057971017e-05, "loss": 1.6475, "step": 7830 }, { "epoch": 0.65, "grad_norm": 2.109387159347534, "learning_rate": 1.574608695652174e-05, "loss": 1.7482, "step": 7840 }, { "epoch": 0.65, "grad_norm": 2.093923807144165, "learning_rate": 1.5740289855072465e-05, "loss": 1.708, "step": 7850 }, { "epoch": 0.66, "grad_norm": 1.431930422782898, "learning_rate": 1.573449275362319e-05, "loss": 1.7585, "step": 7860 }, { "epoch": 0.66, "grad_norm": 3.5843312740325928, "learning_rate": 1.5728695652173916e-05, "loss": 1.7017, "step": 7870 }, { "epoch": 0.66, "grad_norm": 2.592013120651245, "learning_rate": 1.572289855072464e-05, "loss": 1.6862, "step": 7880 }, { "epoch": 0.66, "grad_norm": 3.1581757068634033, "learning_rate": 1.5717101449275364e-05, "loss": 1.8041, "step": 7890 }, { "epoch": 0.66, "grad_norm": 1.719002604484558, "learning_rate": 1.5711304347826088e-05, "loss": 1.7885, "step": 7900 }, { "epoch": 0.66, "grad_norm": 1.8887274265289307, "learning_rate": 1.5705507246376812e-05, "loss": 1.741, "step": 7910 }, { "epoch": 0.66, "grad_norm": 1.5597342252731323, "learning_rate": 1.569971014492754e-05, "loss": 1.7687, "step": 7920 }, { "epoch": 0.66, "grad_norm": 3.3231523036956787, "learning_rate": 1.569391304347826e-05, "loss": 1.7038, "step": 7930 }, { "epoch": 0.66, "grad_norm": 2.540226936340332, "learning_rate": 1.5688115942028987e-05, "loss": 1.8191, "step": 7940 }, { "epoch": 0.66, "grad_norm": 2.614171266555786, "learning_rate": 1.568231884057971e-05, "loss": 1.5657, "step": 7950 }, { "epoch": 0.66, "grad_norm": 3.2686009407043457, "learning_rate": 1.5676521739130435e-05, "loss": 1.6592, "step": 7960 }, { "epoch": 0.66, "grad_norm": 2.4677610397338867, "learning_rate": 1.567072463768116e-05, "loss": 1.702, "step": 7970 }, { "epoch": 0.67, "grad_norm": 2.712357521057129, "learning_rate": 1.5664927536231887e-05, "loss": 1.7096, "step": 7980 }, { "epoch": 0.67, "grad_norm": 3.5932676792144775, "learning_rate": 1.565913043478261e-05, "loss": 1.6477, "step": 7990 }, { "epoch": 0.67, "grad_norm": 2.4786651134490967, "learning_rate": 1.5653333333333335e-05, "loss": 1.7013, "step": 8000 }, { "epoch": 0.67, "eval_loss": 1.751250982284546, "eval_runtime": 107.5079, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 8000 }, { "epoch": 0.67, "grad_norm": 2.696633815765381, "learning_rate": 1.564753623188406e-05, "loss": 1.7176, "step": 8010 }, { "epoch": 0.67, "grad_norm": 2.963792324066162, "learning_rate": 1.5641739130434783e-05, "loss": 1.7605, "step": 8020 }, { "epoch": 0.67, "grad_norm": 2.107274055480957, "learning_rate": 1.563594202898551e-05, "loss": 1.5939, "step": 8030 }, { "epoch": 0.67, "grad_norm": 3.7392399311065674, "learning_rate": 1.5630144927536234e-05, "loss": 1.7734, "step": 8040 }, { "epoch": 0.67, "grad_norm": 2.943000078201294, "learning_rate": 1.5624347826086958e-05, "loss": 1.6703, "step": 8050 }, { "epoch": 0.67, "grad_norm": 2.6716084480285645, "learning_rate": 1.5618550724637682e-05, "loss": 1.7177, "step": 8060 }, { "epoch": 0.67, "grad_norm": 3.429185390472412, "learning_rate": 1.561275362318841e-05, "loss": 1.7043, "step": 8070 }, { "epoch": 0.67, "grad_norm": 3.558910608291626, "learning_rate": 1.560695652173913e-05, "loss": 1.7066, "step": 8080 }, { "epoch": 0.67, "grad_norm": 3.654998302459717, "learning_rate": 1.5601159420289854e-05, "loss": 1.7063, "step": 8090 }, { "epoch": 0.68, "grad_norm": 3.6076571941375732, "learning_rate": 1.559536231884058e-05, "loss": 1.6818, "step": 8100 }, { "epoch": 0.68, "grad_norm": 1.5298787355422974, "learning_rate": 1.5589565217391306e-05, "loss": 1.6751, "step": 8110 }, { "epoch": 0.68, "grad_norm": 3.7017557621002197, "learning_rate": 1.558376811594203e-05, "loss": 1.7395, "step": 8120 }, { "epoch": 0.68, "grad_norm": 2.02172589302063, "learning_rate": 1.5577971014492754e-05, "loss": 1.777, "step": 8130 }, { "epoch": 0.68, "grad_norm": 4.633596897125244, "learning_rate": 1.557217391304348e-05, "loss": 1.7152, "step": 8140 }, { "epoch": 0.68, "grad_norm": 1.3255687952041626, "learning_rate": 1.5566376811594205e-05, "loss": 1.6739, "step": 8150 }, { "epoch": 0.68, "grad_norm": 2.6454014778137207, "learning_rate": 1.556057971014493e-05, "loss": 1.7551, "step": 8160 }, { "epoch": 0.68, "grad_norm": 2.2562685012817383, "learning_rate": 1.5554782608695653e-05, "loss": 1.7421, "step": 8170 }, { "epoch": 0.68, "grad_norm": 3.259690761566162, "learning_rate": 1.554898550724638e-05, "loss": 1.6761, "step": 8180 }, { "epoch": 0.68, "grad_norm": 1.7266426086425781, "learning_rate": 1.55431884057971e-05, "loss": 1.8321, "step": 8190 }, { "epoch": 0.68, "grad_norm": 1.9167896509170532, "learning_rate": 1.5537391304347825e-05, "loss": 1.7285, "step": 8200 }, { "epoch": 0.68, "grad_norm": 4.338323593139648, "learning_rate": 1.5531594202898552e-05, "loss": 1.7145, "step": 8210 }, { "epoch": 0.69, "grad_norm": 3.5180749893188477, "learning_rate": 1.5525797101449276e-05, "loss": 1.7482, "step": 8220 }, { "epoch": 0.69, "grad_norm": 5.429786682128906, "learning_rate": 1.552e-05, "loss": 1.7808, "step": 8230 }, { "epoch": 0.69, "grad_norm": 4.005053997039795, "learning_rate": 1.5514202898550724e-05, "loss": 1.7514, "step": 8240 }, { "epoch": 0.69, "grad_norm": 3.007596969604492, "learning_rate": 1.5508405797101452e-05, "loss": 1.739, "step": 8250 }, { "epoch": 0.69, "grad_norm": 2.8556976318359375, "learning_rate": 1.5502608695652176e-05, "loss": 1.6269, "step": 8260 }, { "epoch": 0.69, "grad_norm": 3.3058815002441406, "learning_rate": 1.54968115942029e-05, "loss": 1.8055, "step": 8270 }, { "epoch": 0.69, "grad_norm": 2.482654094696045, "learning_rate": 1.5491014492753624e-05, "loss": 1.7335, "step": 8280 }, { "epoch": 0.69, "grad_norm": 2.104722499847412, "learning_rate": 1.548521739130435e-05, "loss": 1.6697, "step": 8290 }, { "epoch": 0.69, "grad_norm": 1.4763132333755493, "learning_rate": 1.5479420289855075e-05, "loss": 1.7573, "step": 8300 }, { "epoch": 0.69, "grad_norm": 2.5669054985046387, "learning_rate": 1.5473623188405796e-05, "loss": 1.737, "step": 8310 }, { "epoch": 0.69, "grad_norm": 1.9973231554031372, "learning_rate": 1.5467826086956523e-05, "loss": 1.828, "step": 8320 }, { "epoch": 0.69, "grad_norm": 2.6836585998535156, "learning_rate": 1.5462028985507247e-05, "loss": 1.6509, "step": 8330 }, { "epoch": 0.69, "grad_norm": 1.6782662868499756, "learning_rate": 1.545623188405797e-05, "loss": 1.7843, "step": 8340 }, { "epoch": 0.7, "grad_norm": 4.171507835388184, "learning_rate": 1.5450434782608695e-05, "loss": 1.697, "step": 8350 }, { "epoch": 0.7, "grad_norm": 5.005525588989258, "learning_rate": 1.5444637681159423e-05, "loss": 1.6807, "step": 8360 }, { "epoch": 0.7, "grad_norm": 2.1129069328308105, "learning_rate": 1.5438840579710147e-05, "loss": 1.6926, "step": 8370 }, { "epoch": 0.7, "grad_norm": 2.524050235748291, "learning_rate": 1.543304347826087e-05, "loss": 1.7259, "step": 8380 }, { "epoch": 0.7, "grad_norm": 3.451939105987549, "learning_rate": 1.5427246376811595e-05, "loss": 1.8682, "step": 8390 }, { "epoch": 0.7, "grad_norm": 2.816455841064453, "learning_rate": 1.542144927536232e-05, "loss": 1.6865, "step": 8400 }, { "epoch": 0.7, "grad_norm": 1.2041363716125488, "learning_rate": 1.5415652173913046e-05, "loss": 1.7065, "step": 8410 }, { "epoch": 0.7, "grad_norm": 2.183321475982666, "learning_rate": 1.540985507246377e-05, "loss": 1.7404, "step": 8420 }, { "epoch": 0.7, "grad_norm": 1.7429885864257812, "learning_rate": 1.5404057971014494e-05, "loss": 1.7694, "step": 8430 }, { "epoch": 0.7, "grad_norm": 3.275554656982422, "learning_rate": 1.5398260869565218e-05, "loss": 1.6809, "step": 8440 }, { "epoch": 0.7, "grad_norm": 2.737149953842163, "learning_rate": 1.5392463768115946e-05, "loss": 1.8422, "step": 8450 }, { "epoch": 0.7, "grad_norm": 1.7504504919052124, "learning_rate": 1.5386666666666666e-05, "loss": 1.7135, "step": 8460 }, { "epoch": 0.71, "grad_norm": 2.513226270675659, "learning_rate": 1.5380869565217394e-05, "loss": 1.7704, "step": 8470 }, { "epoch": 0.71, "grad_norm": 3.151536226272583, "learning_rate": 1.5375072463768118e-05, "loss": 1.6754, "step": 8480 }, { "epoch": 0.71, "grad_norm": 4.003200054168701, "learning_rate": 1.536927536231884e-05, "loss": 1.7605, "step": 8490 }, { "epoch": 0.71, "grad_norm": 2.3642451763153076, "learning_rate": 1.5363478260869566e-05, "loss": 1.7791, "step": 8500 }, { "epoch": 0.71, "eval_loss": 1.728246808052063, "eval_runtime": 107.4982, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 8500 }, { "epoch": 0.71, "grad_norm": 1.0081084966659546, "learning_rate": 1.535768115942029e-05, "loss": 1.5997, "step": 8510 }, { "epoch": 0.71, "grad_norm": 4.5596394538879395, "learning_rate": 1.5351884057971017e-05, "loss": 1.5862, "step": 8520 }, { "epoch": 0.71, "grad_norm": 3.325129508972168, "learning_rate": 1.534608695652174e-05, "loss": 1.6014, "step": 8530 }, { "epoch": 0.71, "grad_norm": 3.541696071624756, "learning_rate": 1.5340289855072465e-05, "loss": 1.7766, "step": 8540 }, { "epoch": 0.71, "grad_norm": 1.9516916275024414, "learning_rate": 1.533449275362319e-05, "loss": 1.7535, "step": 8550 }, { "epoch": 0.71, "grad_norm": 2.0460009574890137, "learning_rate": 1.5328695652173916e-05, "loss": 1.6929, "step": 8560 }, { "epoch": 0.71, "grad_norm": 2.826079845428467, "learning_rate": 1.5322898550724637e-05, "loss": 1.7532, "step": 8570 }, { "epoch": 0.71, "grad_norm": 2.370847702026367, "learning_rate": 1.5317101449275364e-05, "loss": 1.8819, "step": 8580 }, { "epoch": 0.72, "grad_norm": 4.229450702667236, "learning_rate": 1.531130434782609e-05, "loss": 1.7454, "step": 8590 }, { "epoch": 0.72, "grad_norm": 1.1140810251235962, "learning_rate": 1.5305507246376812e-05, "loss": 1.6904, "step": 8600 }, { "epoch": 0.72, "grad_norm": 2.1529414653778076, "learning_rate": 1.5299710144927536e-05, "loss": 1.8351, "step": 8610 }, { "epoch": 0.72, "grad_norm": 2.1377217769622803, "learning_rate": 1.529391304347826e-05, "loss": 1.7735, "step": 8620 }, { "epoch": 0.72, "grad_norm": 2.921389102935791, "learning_rate": 1.5288115942028988e-05, "loss": 1.7565, "step": 8630 }, { "epoch": 0.72, "grad_norm": 4.921605110168457, "learning_rate": 1.5282318840579712e-05, "loss": 1.8189, "step": 8640 }, { "epoch": 0.72, "grad_norm": 1.4307273626327515, "learning_rate": 1.5276521739130436e-05, "loss": 1.8747, "step": 8650 }, { "epoch": 0.72, "grad_norm": 3.513711452484131, "learning_rate": 1.527072463768116e-05, "loss": 1.7317, "step": 8660 }, { "epoch": 0.72, "grad_norm": 1.3792582750320435, "learning_rate": 1.5264927536231887e-05, "loss": 1.8341, "step": 8670 }, { "epoch": 0.72, "grad_norm": 2.2375993728637695, "learning_rate": 1.525913043478261e-05, "loss": 1.665, "step": 8680 }, { "epoch": 0.72, "grad_norm": 1.7629612684249878, "learning_rate": 1.5253333333333335e-05, "loss": 1.7347, "step": 8690 }, { "epoch": 0.72, "grad_norm": 1.4616599082946777, "learning_rate": 1.524753623188406e-05, "loss": 1.6149, "step": 8700 }, { "epoch": 0.73, "grad_norm": 2.200507164001465, "learning_rate": 1.5241739130434783e-05, "loss": 1.6675, "step": 8710 }, { "epoch": 0.73, "grad_norm": 7.254258632659912, "learning_rate": 1.5235942028985509e-05, "loss": 1.7182, "step": 8720 }, { "epoch": 0.73, "grad_norm": 1.435335636138916, "learning_rate": 1.5230144927536233e-05, "loss": 1.738, "step": 8730 }, { "epoch": 0.73, "grad_norm": 2.216738224029541, "learning_rate": 1.5224347826086959e-05, "loss": 1.6982, "step": 8740 }, { "epoch": 0.73, "grad_norm": 2.1532115936279297, "learning_rate": 1.5218550724637681e-05, "loss": 1.6887, "step": 8750 }, { "epoch": 0.73, "grad_norm": 2.216334581375122, "learning_rate": 1.5212753623188408e-05, "loss": 1.7531, "step": 8760 }, { "epoch": 0.73, "grad_norm": 1.3834766149520874, "learning_rate": 1.520695652173913e-05, "loss": 1.7209, "step": 8770 }, { "epoch": 0.73, "grad_norm": 3.171159267425537, "learning_rate": 1.5201159420289856e-05, "loss": 1.7489, "step": 8780 }, { "epoch": 0.73, "grad_norm": 2.972515344619751, "learning_rate": 1.519536231884058e-05, "loss": 1.641, "step": 8790 }, { "epoch": 0.73, "grad_norm": 1.618605136871338, "learning_rate": 1.5189565217391306e-05, "loss": 1.7289, "step": 8800 }, { "epoch": 0.73, "grad_norm": 1.3899489641189575, "learning_rate": 1.518376811594203e-05, "loss": 1.7706, "step": 8810 }, { "epoch": 0.73, "grad_norm": 3.194946527481079, "learning_rate": 1.5177971014492754e-05, "loss": 1.8283, "step": 8820 }, { "epoch": 0.74, "grad_norm": 3.15311598777771, "learning_rate": 1.517217391304348e-05, "loss": 1.704, "step": 8830 }, { "epoch": 0.74, "grad_norm": 1.4149380922317505, "learning_rate": 1.5166376811594204e-05, "loss": 1.6738, "step": 8840 }, { "epoch": 0.74, "grad_norm": 3.4329111576080322, "learning_rate": 1.516057971014493e-05, "loss": 1.7411, "step": 8850 }, { "epoch": 0.74, "grad_norm": 3.406054973602295, "learning_rate": 1.5154782608695654e-05, "loss": 1.8027, "step": 8860 }, { "epoch": 0.74, "grad_norm": 4.0179643630981445, "learning_rate": 1.514898550724638e-05, "loss": 1.7083, "step": 8870 }, { "epoch": 0.74, "grad_norm": 2.3201992511749268, "learning_rate": 1.5143188405797103e-05, "loss": 1.6307, "step": 8880 }, { "epoch": 0.74, "grad_norm": 4.243584632873535, "learning_rate": 1.5137391304347829e-05, "loss": 1.7504, "step": 8890 }, { "epoch": 0.74, "grad_norm": 1.8807439804077148, "learning_rate": 1.5131594202898551e-05, "loss": 1.7607, "step": 8900 }, { "epoch": 0.74, "grad_norm": 3.392615556716919, "learning_rate": 1.5125797101449277e-05, "loss": 1.6733, "step": 8910 }, { "epoch": 0.74, "grad_norm": 2.388437032699585, "learning_rate": 1.5120000000000001e-05, "loss": 1.6187, "step": 8920 }, { "epoch": 0.74, "grad_norm": 7.407155990600586, "learning_rate": 1.5114202898550725e-05, "loss": 1.6473, "step": 8930 }, { "epoch": 0.74, "grad_norm": 3.9519150257110596, "learning_rate": 1.510840579710145e-05, "loss": 1.7244, "step": 8940 }, { "epoch": 0.75, "grad_norm": 3.5977227687835693, "learning_rate": 1.5102608695652175e-05, "loss": 1.7157, "step": 8950 }, { "epoch": 0.75, "grad_norm": 5.951735019683838, "learning_rate": 1.50968115942029e-05, "loss": 1.7765, "step": 8960 }, { "epoch": 0.75, "grad_norm": 2.0863535404205322, "learning_rate": 1.5091014492753624e-05, "loss": 1.5254, "step": 8970 }, { "epoch": 0.75, "grad_norm": 1.2637622356414795, "learning_rate": 1.508521739130435e-05, "loss": 1.808, "step": 8980 }, { "epoch": 0.75, "grad_norm": 2.3032875061035156, "learning_rate": 1.5079420289855074e-05, "loss": 1.7458, "step": 8990 }, { "epoch": 0.75, "grad_norm": 3.13789701461792, "learning_rate": 1.50736231884058e-05, "loss": 1.6349, "step": 9000 }, { "epoch": 0.75, "eval_loss": 1.7237039804458618, "eval_runtime": 107.5052, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 9000 }, { "epoch": 0.75, "grad_norm": 1.9929821491241455, "learning_rate": 1.5067826086956524e-05, "loss": 1.7925, "step": 9010 }, { "epoch": 0.75, "grad_norm": 5.004290580749512, "learning_rate": 1.5062028985507246e-05, "loss": 1.6467, "step": 9020 }, { "epoch": 0.75, "grad_norm": 1.387837529182434, "learning_rate": 1.5056231884057972e-05, "loss": 1.6188, "step": 9030 }, { "epoch": 0.75, "grad_norm": 2.9127049446105957, "learning_rate": 1.5050434782608696e-05, "loss": 1.6898, "step": 9040 }, { "epoch": 0.75, "grad_norm": 2.460002899169922, "learning_rate": 1.5044637681159421e-05, "loss": 1.7306, "step": 9050 }, { "epoch": 0.76, "grad_norm": 6.018058776855469, "learning_rate": 1.5038840579710145e-05, "loss": 1.6259, "step": 9060 }, { "epoch": 0.76, "grad_norm": 1.8971534967422485, "learning_rate": 1.5033043478260871e-05, "loss": 1.6011, "step": 9070 }, { "epoch": 0.76, "grad_norm": 4.7335357666015625, "learning_rate": 1.5027246376811595e-05, "loss": 1.7402, "step": 9080 }, { "epoch": 0.76, "grad_norm": 2.477769613265991, "learning_rate": 1.5021449275362321e-05, "loss": 1.7765, "step": 9090 }, { "epoch": 0.76, "grad_norm": 2.670304775238037, "learning_rate": 1.5015652173913045e-05, "loss": 1.8922, "step": 9100 }, { "epoch": 0.76, "grad_norm": 3.747793197631836, "learning_rate": 1.500985507246377e-05, "loss": 1.6625, "step": 9110 }, { "epoch": 0.76, "grad_norm": 3.719919204711914, "learning_rate": 1.5004057971014495e-05, "loss": 1.7383, "step": 9120 }, { "epoch": 0.76, "grad_norm": 2.1044466495513916, "learning_rate": 1.4998260869565217e-05, "loss": 1.8098, "step": 9130 }, { "epoch": 0.76, "grad_norm": 2.7055985927581787, "learning_rate": 1.4992463768115944e-05, "loss": 1.4545, "step": 9140 }, { "epoch": 0.76, "grad_norm": 0.7693852186203003, "learning_rate": 1.4986666666666667e-05, "loss": 1.6745, "step": 9150 }, { "epoch": 0.76, "grad_norm": 2.0475828647613525, "learning_rate": 1.4980869565217392e-05, "loss": 1.5843, "step": 9160 }, { "epoch": 0.76, "grad_norm": 3.5320639610290527, "learning_rate": 1.4975072463768116e-05, "loss": 1.6735, "step": 9170 }, { "epoch": 0.77, "grad_norm": 3.41857647895813, "learning_rate": 1.4969275362318842e-05, "loss": 1.6797, "step": 9180 }, { "epoch": 0.77, "grad_norm": 2.8437910079956055, "learning_rate": 1.4963478260869566e-05, "loss": 1.7409, "step": 9190 }, { "epoch": 0.77, "grad_norm": 3.104484796524048, "learning_rate": 1.4957681159420292e-05, "loss": 1.5646, "step": 9200 }, { "epoch": 0.77, "grad_norm": 3.2742903232574463, "learning_rate": 1.4951884057971016e-05, "loss": 1.8273, "step": 9210 }, { "epoch": 0.77, "grad_norm": 3.759800434112549, "learning_rate": 1.494608695652174e-05, "loss": 1.5638, "step": 9220 }, { "epoch": 0.77, "grad_norm": 1.4003862142562866, "learning_rate": 1.4940289855072465e-05, "loss": 1.8205, "step": 9230 }, { "epoch": 0.77, "grad_norm": 3.187140703201294, "learning_rate": 1.493449275362319e-05, "loss": 1.6775, "step": 9240 }, { "epoch": 0.77, "grad_norm": 3.953629493713379, "learning_rate": 1.4928695652173915e-05, "loss": 1.7948, "step": 9250 }, { "epoch": 0.77, "grad_norm": 3.153158664703369, "learning_rate": 1.4922898550724637e-05, "loss": 1.7641, "step": 9260 }, { "epoch": 0.77, "grad_norm": 3.8816325664520264, "learning_rate": 1.4917101449275365e-05, "loss": 1.7967, "step": 9270 }, { "epoch": 0.77, "grad_norm": 2.1909523010253906, "learning_rate": 1.4911304347826087e-05, "loss": 1.7295, "step": 9280 }, { "epoch": 0.77, "grad_norm": 3.3025991916656494, "learning_rate": 1.4905507246376813e-05, "loss": 1.7594, "step": 9290 }, { "epoch": 0.78, "grad_norm": 4.461934566497803, "learning_rate": 1.4899710144927537e-05, "loss": 1.8171, "step": 9300 }, { "epoch": 0.78, "grad_norm": 4.337406158447266, "learning_rate": 1.4893913043478263e-05, "loss": 1.6085, "step": 9310 }, { "epoch": 0.78, "grad_norm": 1.752158284187317, "learning_rate": 1.4888115942028987e-05, "loss": 1.6972, "step": 9320 }, { "epoch": 0.78, "grad_norm": 2.966576099395752, "learning_rate": 1.488231884057971e-05, "loss": 1.6919, "step": 9330 }, { "epoch": 0.78, "grad_norm": 2.894684076309204, "learning_rate": 1.4876521739130436e-05, "loss": 1.4976, "step": 9340 }, { "epoch": 0.78, "grad_norm": 1.2293604612350464, "learning_rate": 1.487072463768116e-05, "loss": 1.6831, "step": 9350 }, { "epoch": 0.78, "grad_norm": 1.8790899515151978, "learning_rate": 1.4864927536231886e-05, "loss": 1.7635, "step": 9360 }, { "epoch": 0.78, "grad_norm": 2.168088436126709, "learning_rate": 1.485913043478261e-05, "loss": 1.6389, "step": 9370 }, { "epoch": 0.78, "grad_norm": 1.0390825271606445, "learning_rate": 1.4853333333333336e-05, "loss": 1.6824, "step": 9380 }, { "epoch": 0.78, "grad_norm": 2.15370512008667, "learning_rate": 1.4847536231884058e-05, "loss": 1.6074, "step": 9390 }, { "epoch": 0.78, "grad_norm": 2.2792294025421143, "learning_rate": 1.4841739130434785e-05, "loss": 1.6977, "step": 9400 }, { "epoch": 0.78, "grad_norm": 3.7052159309387207, "learning_rate": 1.4835942028985508e-05, "loss": 1.6734, "step": 9410 }, { "epoch": 0.79, "grad_norm": 0.8690690994262695, "learning_rate": 1.4830144927536233e-05, "loss": 1.7114, "step": 9420 }, { "epoch": 0.79, "grad_norm": 1.2574138641357422, "learning_rate": 1.4824347826086957e-05, "loss": 1.4424, "step": 9430 }, { "epoch": 0.79, "grad_norm": 0.904484748840332, "learning_rate": 1.4818550724637681e-05, "loss": 1.7067, "step": 9440 }, { "epoch": 0.79, "grad_norm": 2.294973611831665, "learning_rate": 1.4812753623188407e-05, "loss": 1.8665, "step": 9450 }, { "epoch": 0.79, "grad_norm": 4.394903182983398, "learning_rate": 1.4806956521739131e-05, "loss": 1.5927, "step": 9460 }, { "epoch": 0.79, "grad_norm": 2.6690807342529297, "learning_rate": 1.4801159420289857e-05, "loss": 1.6617, "step": 9470 }, { "epoch": 0.79, "grad_norm": 1.6983529329299927, "learning_rate": 1.479536231884058e-05, "loss": 1.7189, "step": 9480 }, { "epoch": 0.79, "grad_norm": 2.5802037715911865, "learning_rate": 1.4789565217391307e-05, "loss": 1.4436, "step": 9490 }, { "epoch": 0.79, "grad_norm": 3.2202868461608887, "learning_rate": 1.478376811594203e-05, "loss": 1.8035, "step": 9500 }, { "epoch": 0.79, "eval_loss": 1.7105869054794312, "eval_runtime": 107.5846, "eval_samples_per_second": 9.295, "eval_steps_per_second": 2.324, "step": 9500 }, { "epoch": 0.79, "grad_norm": 3.4393227100372314, "learning_rate": 1.4777971014492756e-05, "loss": 1.5999, "step": 9510 }, { "epoch": 0.79, "grad_norm": 2.443208694458008, "learning_rate": 1.4772173913043479e-05, "loss": 1.6505, "step": 9520 }, { "epoch": 0.79, "grad_norm": 5.468209743499756, "learning_rate": 1.4766376811594203e-05, "loss": 1.6897, "step": 9530 }, { "epoch": 0.8, "grad_norm": 1.3215961456298828, "learning_rate": 1.4760579710144928e-05, "loss": 1.8336, "step": 9540 }, { "epoch": 0.8, "grad_norm": 3.739179849624634, "learning_rate": 1.4754782608695652e-05, "loss": 1.8085, "step": 9550 }, { "epoch": 0.8, "grad_norm": 1.8474160432815552, "learning_rate": 1.4748985507246378e-05, "loss": 1.8288, "step": 9560 }, { "epoch": 0.8, "grad_norm": 2.9103896617889404, "learning_rate": 1.4743188405797102e-05, "loss": 1.7058, "step": 9570 }, { "epoch": 0.8, "grad_norm": 1.432541847229004, "learning_rate": 1.4737391304347828e-05, "loss": 1.7497, "step": 9580 }, { "epoch": 0.8, "grad_norm": 2.380267381668091, "learning_rate": 1.4731594202898552e-05, "loss": 1.6908, "step": 9590 }, { "epoch": 0.8, "grad_norm": 1.8983180522918701, "learning_rate": 1.4725797101449277e-05, "loss": 1.7913, "step": 9600 }, { "epoch": 0.8, "grad_norm": 1.5178720951080322, "learning_rate": 1.4720000000000001e-05, "loss": 1.6481, "step": 9610 }, { "epoch": 0.8, "grad_norm": 4.430023670196533, "learning_rate": 1.4714202898550727e-05, "loss": 1.7796, "step": 9620 }, { "epoch": 0.8, "grad_norm": 1.9804303646087646, "learning_rate": 1.4708405797101451e-05, "loss": 1.7676, "step": 9630 }, { "epoch": 0.8, "grad_norm": 4.044782638549805, "learning_rate": 1.4702608695652173e-05, "loss": 1.6331, "step": 9640 }, { "epoch": 0.8, "grad_norm": 3.2377848625183105, "learning_rate": 1.46968115942029e-05, "loss": 1.6992, "step": 9650 }, { "epoch": 0.81, "grad_norm": 2.13010835647583, "learning_rate": 1.4691014492753623e-05, "loss": 1.7363, "step": 9660 }, { "epoch": 0.81, "grad_norm": 2.638706684112549, "learning_rate": 1.4685217391304349e-05, "loss": 1.6856, "step": 9670 }, { "epoch": 0.81, "grad_norm": 1.5270848274230957, "learning_rate": 1.4679420289855073e-05, "loss": 1.694, "step": 9680 }, { "epoch": 0.81, "grad_norm": 2.1668355464935303, "learning_rate": 1.4673623188405798e-05, "loss": 1.5912, "step": 9690 }, { "epoch": 0.81, "grad_norm": 1.8836257457733154, "learning_rate": 1.466840579710145e-05, "loss": 1.7058, "step": 9700 }, { "epoch": 0.81, "grad_norm": 1.6809861660003662, "learning_rate": 1.4662608695652174e-05, "loss": 1.6679, "step": 9710 }, { "epoch": 0.81, "grad_norm": 3.5727224349975586, "learning_rate": 1.46568115942029e-05, "loss": 1.7444, "step": 9720 }, { "epoch": 0.81, "grad_norm": 5.230935573577881, "learning_rate": 1.4651014492753623e-05, "loss": 1.649, "step": 9730 }, { "epoch": 0.81, "grad_norm": 2.9964046478271484, "learning_rate": 1.464521739130435e-05, "loss": 1.6816, "step": 9740 }, { "epoch": 0.81, "grad_norm": 2.299757480621338, "learning_rate": 1.4639420289855073e-05, "loss": 1.7576, "step": 9750 }, { "epoch": 0.81, "grad_norm": 2.057912588119507, "learning_rate": 1.4633623188405799e-05, "loss": 1.7915, "step": 9760 }, { "epoch": 0.81, "grad_norm": 2.693455934524536, "learning_rate": 1.4627826086956523e-05, "loss": 1.7852, "step": 9770 }, { "epoch": 0.81, "grad_norm": 1.2792394161224365, "learning_rate": 1.4622028985507249e-05, "loss": 1.7702, "step": 9780 }, { "epoch": 0.82, "grad_norm": 1.9166826009750366, "learning_rate": 1.4616231884057973e-05, "loss": 1.5197, "step": 9790 }, { "epoch": 0.82, "grad_norm": 4.76140832901001, "learning_rate": 1.4610434782608698e-05, "loss": 1.7217, "step": 9800 }, { "epoch": 0.82, "grad_norm": 1.99151611328125, "learning_rate": 1.4604637681159422e-05, "loss": 1.8153, "step": 9810 }, { "epoch": 0.82, "grad_norm": 1.7407947778701782, "learning_rate": 1.4598840579710145e-05, "loss": 1.7537, "step": 9820 }, { "epoch": 0.82, "grad_norm": 4.095398426055908, "learning_rate": 1.459304347826087e-05, "loss": 1.7267, "step": 9830 }, { "epoch": 0.82, "grad_norm": 2.100191593170166, "learning_rate": 1.4587246376811594e-05, "loss": 1.6822, "step": 9840 }, { "epoch": 0.82, "grad_norm": 1.774409294128418, "learning_rate": 1.458144927536232e-05, "loss": 1.7082, "step": 9850 }, { "epoch": 0.82, "grad_norm": 1.7792176008224487, "learning_rate": 1.4575652173913044e-05, "loss": 1.6193, "step": 9860 }, { "epoch": 0.82, "grad_norm": 2.539458751678467, "learning_rate": 1.456985507246377e-05, "loss": 1.6587, "step": 9870 }, { "epoch": 0.82, "grad_norm": 2.321563243865967, "learning_rate": 1.4564057971014494e-05, "loss": 1.7235, "step": 9880 }, { "epoch": 0.82, "grad_norm": 1.7294243574142456, "learning_rate": 1.455826086956522e-05, "loss": 1.7623, "step": 9890 }, { "epoch": 0.82, "grad_norm": 3.641298532485962, "learning_rate": 1.4552463768115943e-05, "loss": 1.7657, "step": 9900 }, { "epoch": 0.83, "grad_norm": 1.3333618640899658, "learning_rate": 1.4546666666666669e-05, "loss": 1.6669, "step": 9910 }, { "epoch": 0.83, "grad_norm": 3.8383522033691406, "learning_rate": 1.4540869565217393e-05, "loss": 1.7376, "step": 9920 }, { "epoch": 0.83, "grad_norm": 13.071063995361328, "learning_rate": 1.4535072463768115e-05, "loss": 1.6751, "step": 9930 }, { "epoch": 0.83, "grad_norm": 2.008157730102539, "learning_rate": 1.4529275362318843e-05, "loss": 1.6904, "step": 9940 }, { "epoch": 0.83, "grad_norm": 6.280172824859619, "learning_rate": 1.4523478260869565e-05, "loss": 1.5522, "step": 9950 }, { "epoch": 0.83, "grad_norm": 4.36051607131958, "learning_rate": 1.4517681159420291e-05, "loss": 1.6922, "step": 9960 }, { "epoch": 0.83, "grad_norm": 4.0582804679870605, "learning_rate": 1.4511884057971015e-05, "loss": 1.7, "step": 9970 }, { "epoch": 0.83, "grad_norm": 2.5071215629577637, "learning_rate": 1.450608695652174e-05, "loss": 1.6065, "step": 9980 }, { "epoch": 0.83, "grad_norm": 4.5151567459106445, "learning_rate": 1.4500289855072465e-05, "loss": 1.646, "step": 9990 }, { "epoch": 0.83, "grad_norm": 4.011928081512451, "learning_rate": 1.449449275362319e-05, "loss": 1.8767, "step": 10000 }, { "epoch": 0.83, "eval_loss": 1.7204252481460571, "eval_runtime": 107.4981, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 10000 }, { "epoch": 0.83, "grad_norm": 0.8452903032302856, "learning_rate": 1.4488695652173914e-05, "loss": 1.6305, "step": 10010 }, { "epoch": 0.83, "grad_norm": 1.8112026453018188, "learning_rate": 1.4482898550724638e-05, "loss": 1.6268, "step": 10020 }, { "epoch": 0.84, "grad_norm": 0.9594013094902039, "learning_rate": 1.4477101449275364e-05, "loss": 1.4332, "step": 10030 }, { "epoch": 0.84, "grad_norm": 3.1587038040161133, "learning_rate": 1.4471304347826088e-05, "loss": 1.8313, "step": 10040 }, { "epoch": 0.84, "grad_norm": 2.3806848526000977, "learning_rate": 1.4465507246376814e-05, "loss": 1.6507, "step": 10050 }, { "epoch": 0.84, "grad_norm": 3.119511127471924, "learning_rate": 1.4459710144927536e-05, "loss": 1.8907, "step": 10060 }, { "epoch": 0.84, "grad_norm": 2.085169553756714, "learning_rate": 1.4453913043478263e-05, "loss": 1.7827, "step": 10070 }, { "epoch": 0.84, "grad_norm": 2.273642063140869, "learning_rate": 1.4448115942028986e-05, "loss": 1.8521, "step": 10080 }, { "epoch": 0.84, "grad_norm": 2.0806262493133545, "learning_rate": 1.4442318840579711e-05, "loss": 1.7231, "step": 10090 }, { "epoch": 0.84, "grad_norm": 1.6042985916137695, "learning_rate": 1.4436521739130435e-05, "loss": 1.6173, "step": 10100 }, { "epoch": 0.84, "grad_norm": 2.670179843902588, "learning_rate": 1.4430724637681161e-05, "loss": 1.8075, "step": 10110 }, { "epoch": 0.84, "grad_norm": 4.045022010803223, "learning_rate": 1.4424927536231885e-05, "loss": 1.7247, "step": 10120 }, { "epoch": 0.84, "grad_norm": 1.5309295654296875, "learning_rate": 1.4419130434782609e-05, "loss": 1.7216, "step": 10130 }, { "epoch": 0.84, "grad_norm": 2.8289453983306885, "learning_rate": 1.4413333333333335e-05, "loss": 1.7299, "step": 10140 }, { "epoch": 0.85, "grad_norm": 2.870276927947998, "learning_rate": 1.4407536231884059e-05, "loss": 1.6632, "step": 10150 }, { "epoch": 0.85, "grad_norm": 1.4293287992477417, "learning_rate": 1.4401739130434785e-05, "loss": 1.7234, "step": 10160 }, { "epoch": 0.85, "grad_norm": 2.1435534954071045, "learning_rate": 1.4395942028985509e-05, "loss": 1.7715, "step": 10170 }, { "epoch": 0.85, "grad_norm": 1.4607239961624146, "learning_rate": 1.4390144927536234e-05, "loss": 1.728, "step": 10180 }, { "epoch": 0.85, "grad_norm": 2.555734872817993, "learning_rate": 1.4384347826086957e-05, "loss": 1.8154, "step": 10190 }, { "epoch": 0.85, "grad_norm": 2.3336338996887207, "learning_rate": 1.4378550724637684e-05, "loss": 1.6378, "step": 10200 }, { "epoch": 0.85, "grad_norm": 1.9876394271850586, "learning_rate": 1.4372753623188406e-05, "loss": 1.6636, "step": 10210 }, { "epoch": 0.85, "grad_norm": 1.9591132402420044, "learning_rate": 1.4366956521739132e-05, "loss": 1.7456, "step": 10220 }, { "epoch": 0.85, "grad_norm": 2.272648572921753, "learning_rate": 1.4361159420289856e-05, "loss": 1.8266, "step": 10230 }, { "epoch": 0.85, "grad_norm": 6.858809947967529, "learning_rate": 1.435536231884058e-05, "loss": 1.6315, "step": 10240 }, { "epoch": 0.85, "grad_norm": 0.7406672835350037, "learning_rate": 1.4349565217391306e-05, "loss": 1.4722, "step": 10250 }, { "epoch": 0.85, "grad_norm": 1.0749115943908691, "learning_rate": 1.434376811594203e-05, "loss": 1.8539, "step": 10260 }, { "epoch": 0.86, "grad_norm": 2.3175392150878906, "learning_rate": 1.4337971014492755e-05, "loss": 1.7049, "step": 10270 }, { "epoch": 0.86, "grad_norm": 1.374159812927246, "learning_rate": 1.433217391304348e-05, "loss": 1.7605, "step": 10280 }, { "epoch": 0.86, "grad_norm": 1.9364595413208008, "learning_rate": 1.4326376811594205e-05, "loss": 1.6288, "step": 10290 }, { "epoch": 0.86, "grad_norm": 5.389453887939453, "learning_rate": 1.4320579710144929e-05, "loss": 1.6014, "step": 10300 }, { "epoch": 0.86, "grad_norm": 3.7157788276672363, "learning_rate": 1.4314782608695655e-05, "loss": 1.738, "step": 10310 }, { "epoch": 0.86, "grad_norm": 2.8316123485565186, "learning_rate": 1.4308985507246377e-05, "loss": 1.727, "step": 10320 }, { "epoch": 0.86, "grad_norm": 3.3950917720794678, "learning_rate": 1.4303188405797101e-05, "loss": 1.6789, "step": 10330 }, { "epoch": 0.86, "grad_norm": 4.218912601470947, "learning_rate": 1.4297391304347827e-05, "loss": 1.8176, "step": 10340 }, { "epoch": 0.86, "grad_norm": 1.7928733825683594, "learning_rate": 1.429159420289855e-05, "loss": 1.6744, "step": 10350 }, { "epoch": 0.86, "grad_norm": 1.1876684427261353, "learning_rate": 1.4285797101449276e-05, "loss": 1.6813, "step": 10360 }, { "epoch": 0.86, "grad_norm": 1.4012842178344727, "learning_rate": 1.428e-05, "loss": 1.7983, "step": 10370 }, { "epoch": 0.86, "grad_norm": 6.704860210418701, "learning_rate": 1.4274202898550726e-05, "loss": 1.6289, "step": 10380 }, { "epoch": 0.87, "grad_norm": 1.7611334323883057, "learning_rate": 1.426840579710145e-05, "loss": 1.6068, "step": 10390 }, { "epoch": 0.87, "grad_norm": 5.28679084777832, "learning_rate": 1.4262608695652176e-05, "loss": 1.644, "step": 10400 }, { "epoch": 0.87, "grad_norm": 3.1040380001068115, "learning_rate": 1.42568115942029e-05, "loss": 1.8251, "step": 10410 }, { "epoch": 0.87, "grad_norm": 2.5401835441589355, "learning_rate": 1.4251014492753626e-05, "loss": 1.6375, "step": 10420 }, { "epoch": 0.87, "grad_norm": 3.4024248123168945, "learning_rate": 1.424521739130435e-05, "loss": 1.6935, "step": 10430 }, { "epoch": 0.87, "grad_norm": 4.5380167961120605, "learning_rate": 1.4239420289855072e-05, "loss": 1.8278, "step": 10440 }, { "epoch": 0.87, "grad_norm": 2.322249412536621, "learning_rate": 1.42336231884058e-05, "loss": 1.6748, "step": 10450 }, { "epoch": 0.87, "grad_norm": 2.3199710845947266, "learning_rate": 1.4227826086956522e-05, "loss": 1.7673, "step": 10460 }, { "epoch": 0.87, "grad_norm": 1.2281827926635742, "learning_rate": 1.4222028985507247e-05, "loss": 1.7763, "step": 10470 }, { "epoch": 0.87, "grad_norm": 1.571286678314209, "learning_rate": 1.4216231884057971e-05, "loss": 1.5969, "step": 10480 }, { "epoch": 0.87, "grad_norm": 4.81419038772583, "learning_rate": 1.4210434782608697e-05, "loss": 1.7263, "step": 10490 }, { "epoch": 0.88, "grad_norm": 2.55985951423645, "learning_rate": 1.4204637681159421e-05, "loss": 1.7246, "step": 10500 }, { "epoch": 0.88, "eval_loss": 1.7039064168930054, "eval_runtime": 107.5109, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 10500 }, { "epoch": 0.88, "grad_norm": 2.4727416038513184, "learning_rate": 1.4198840579710147e-05, "loss": 1.7325, "step": 10510 }, { "epoch": 0.88, "grad_norm": 2.254564046859741, "learning_rate": 1.419304347826087e-05, "loss": 1.6793, "step": 10520 }, { "epoch": 0.88, "grad_norm": 2.339053153991699, "learning_rate": 1.4187246376811596e-05, "loss": 1.8367, "step": 10530 }, { "epoch": 0.88, "grad_norm": 2.8006272315979004, "learning_rate": 1.418144927536232e-05, "loss": 1.5286, "step": 10540 }, { "epoch": 0.88, "grad_norm": 1.7751678228378296, "learning_rate": 1.4175652173913044e-05, "loss": 1.7536, "step": 10550 }, { "epoch": 0.88, "grad_norm": 7.587690830230713, "learning_rate": 1.416985507246377e-05, "loss": 1.8107, "step": 10560 }, { "epoch": 0.88, "grad_norm": 1.8632152080535889, "learning_rate": 1.4164057971014492e-05, "loss": 1.6903, "step": 10570 }, { "epoch": 0.88, "grad_norm": 3.762665033340454, "learning_rate": 1.415826086956522e-05, "loss": 1.7828, "step": 10580 }, { "epoch": 0.88, "grad_norm": 2.1041312217712402, "learning_rate": 1.4152463768115942e-05, "loss": 1.8221, "step": 10590 }, { "epoch": 0.88, "grad_norm": 1.7822948694229126, "learning_rate": 1.4146666666666668e-05, "loss": 1.7623, "step": 10600 }, { "epoch": 0.88, "grad_norm": 3.2196130752563477, "learning_rate": 1.4140869565217392e-05, "loss": 1.8126, "step": 10610 }, { "epoch": 0.89, "grad_norm": 2.2900569438934326, "learning_rate": 1.4135072463768118e-05, "loss": 1.7031, "step": 10620 }, { "epoch": 0.89, "grad_norm": 4.033328056335449, "learning_rate": 1.4129275362318842e-05, "loss": 1.7968, "step": 10630 }, { "epoch": 0.89, "grad_norm": 3.2577106952667236, "learning_rate": 1.4123478260869566e-05, "loss": 1.69, "step": 10640 }, { "epoch": 0.89, "grad_norm": 6.332272052764893, "learning_rate": 1.4117681159420291e-05, "loss": 1.6143, "step": 10650 }, { "epoch": 0.89, "grad_norm": 2.976055383682251, "learning_rate": 1.4111884057971015e-05, "loss": 1.7698, "step": 10660 }, { "epoch": 0.89, "grad_norm": 1.821053385734558, "learning_rate": 1.4106086956521741e-05, "loss": 1.8864, "step": 10670 }, { "epoch": 0.89, "grad_norm": 2.310410261154175, "learning_rate": 1.4100289855072465e-05, "loss": 1.583, "step": 10680 }, { "epoch": 0.89, "grad_norm": 3.4083127975463867, "learning_rate": 1.409449275362319e-05, "loss": 1.7432, "step": 10690 }, { "epoch": 0.89, "grad_norm": 1.7858573198318481, "learning_rate": 1.4088695652173913e-05, "loss": 1.566, "step": 10700 }, { "epoch": 0.89, "grad_norm": 1.7986173629760742, "learning_rate": 1.408289855072464e-05, "loss": 1.7651, "step": 10710 }, { "epoch": 0.89, "grad_norm": 1.9657052755355835, "learning_rate": 1.4077101449275363e-05, "loss": 1.7637, "step": 10720 }, { "epoch": 0.89, "grad_norm": 3.002399206161499, "learning_rate": 1.4071304347826088e-05, "loss": 1.8534, "step": 10730 }, { "epoch": 0.9, "grad_norm": 3.677050828933716, "learning_rate": 1.4065507246376812e-05, "loss": 1.8067, "step": 10740 }, { "epoch": 0.9, "grad_norm": 2.9738922119140625, "learning_rate": 1.4059710144927536e-05, "loss": 1.7928, "step": 10750 }, { "epoch": 0.9, "grad_norm": 2.2202324867248535, "learning_rate": 1.4053913043478262e-05, "loss": 1.7436, "step": 10760 }, { "epoch": 0.9, "grad_norm": 2.8412764072418213, "learning_rate": 1.4048115942028986e-05, "loss": 1.7523, "step": 10770 }, { "epoch": 0.9, "grad_norm": 0.8097102046012878, "learning_rate": 1.4042318840579712e-05, "loss": 1.7362, "step": 10780 }, { "epoch": 0.9, "grad_norm": 2.471013069152832, "learning_rate": 1.4036521739130436e-05, "loss": 1.7287, "step": 10790 }, { "epoch": 0.9, "grad_norm": 1.12705659866333, "learning_rate": 1.4030724637681162e-05, "loss": 1.7023, "step": 10800 }, { "epoch": 0.9, "grad_norm": 2.3543355464935303, "learning_rate": 1.4024927536231886e-05, "loss": 1.7299, "step": 10810 }, { "epoch": 0.9, "grad_norm": 6.779575824737549, "learning_rate": 1.4019130434782611e-05, "loss": 1.668, "step": 10820 }, { "epoch": 0.9, "grad_norm": 3.1106367111206055, "learning_rate": 1.4013333333333334e-05, "loss": 1.713, "step": 10830 }, { "epoch": 0.9, "grad_norm": 1.9769501686096191, "learning_rate": 1.4007536231884061e-05, "loss": 1.6006, "step": 10840 }, { "epoch": 0.9, "grad_norm": 3.193175792694092, "learning_rate": 1.4001739130434783e-05, "loss": 1.734, "step": 10850 }, { "epoch": 0.91, "grad_norm": 1.645627737045288, "learning_rate": 1.3995942028985507e-05, "loss": 1.5706, "step": 10860 }, { "epoch": 0.91, "grad_norm": 6.674108982086182, "learning_rate": 1.3990144927536233e-05, "loss": 1.702, "step": 10870 }, { "epoch": 0.91, "grad_norm": 1.0062819719314575, "learning_rate": 1.3984347826086957e-05, "loss": 1.7267, "step": 10880 }, { "epoch": 0.91, "grad_norm": 4.037877559661865, "learning_rate": 1.3978550724637683e-05, "loss": 1.7847, "step": 10890 }, { "epoch": 0.91, "grad_norm": 2.889549493789673, "learning_rate": 1.3972753623188407e-05, "loss": 1.6711, "step": 10900 }, { "epoch": 0.91, "grad_norm": 3.1683433055877686, "learning_rate": 1.3966956521739132e-05, "loss": 1.7289, "step": 10910 }, { "epoch": 0.91, "grad_norm": 3.776911973953247, "learning_rate": 1.3961159420289856e-05, "loss": 1.6323, "step": 10920 }, { "epoch": 0.91, "grad_norm": 4.037374973297119, "learning_rate": 1.3955942028985508e-05, "loss": 1.7624, "step": 10930 }, { "epoch": 0.91, "grad_norm": 6.559633255004883, "learning_rate": 1.3950144927536233e-05, "loss": 1.6858, "step": 10940 }, { "epoch": 0.91, "grad_norm": 1.2170414924621582, "learning_rate": 1.3944347826086957e-05, "loss": 1.7849, "step": 10950 }, { "epoch": 0.91, "grad_norm": 3.384916305541992, "learning_rate": 1.3938550724637683e-05, "loss": 1.6399, "step": 10960 }, { "epoch": 0.91, "grad_norm": 2.4139840602874756, "learning_rate": 1.3932753623188407e-05, "loss": 1.7363, "step": 10970 }, { "epoch": 0.92, "grad_norm": 3.0569369792938232, "learning_rate": 1.3926956521739133e-05, "loss": 1.8243, "step": 10980 }, { "epoch": 0.92, "grad_norm": 3.9499967098236084, "learning_rate": 1.3921159420289855e-05, "loss": 1.8001, "step": 10990 }, { "epoch": 0.92, "grad_norm": 2.538534164428711, "learning_rate": 1.3915362318840582e-05, "loss": 1.5845, "step": 11000 }, { "epoch": 0.92, "eval_loss": 1.7137649059295654, "eval_runtime": 107.4981, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 11000 }, { "epoch": 0.92, "grad_norm": 1.2500182390213013, "learning_rate": 1.3909565217391305e-05, "loss": 1.7198, "step": 11010 }, { "epoch": 0.92, "grad_norm": 1.7554583549499512, "learning_rate": 1.3903768115942029e-05, "loss": 1.678, "step": 11020 }, { "epoch": 0.92, "grad_norm": 2.5758776664733887, "learning_rate": 1.3897971014492754e-05, "loss": 1.7378, "step": 11030 }, { "epoch": 0.92, "grad_norm": 6.143527984619141, "learning_rate": 1.3892173913043478e-05, "loss": 1.7865, "step": 11040 }, { "epoch": 0.92, "grad_norm": 2.270503282546997, "learning_rate": 1.3886376811594204e-05, "loss": 1.7676, "step": 11050 }, { "epoch": 0.92, "grad_norm": 3.5987863540649414, "learning_rate": 1.3880579710144928e-05, "loss": 1.7073, "step": 11060 }, { "epoch": 0.92, "grad_norm": 3.498621702194214, "learning_rate": 1.3874782608695654e-05, "loss": 1.7672, "step": 11070 }, { "epoch": 0.92, "grad_norm": 1.1236436367034912, "learning_rate": 1.3868985507246378e-05, "loss": 1.8432, "step": 11080 }, { "epoch": 0.92, "grad_norm": 8.279869079589844, "learning_rate": 1.3863188405797104e-05, "loss": 1.6825, "step": 11090 }, { "epoch": 0.93, "grad_norm": 7.814366817474365, "learning_rate": 1.3857391304347828e-05, "loss": 1.6321, "step": 11100 }, { "epoch": 0.93, "grad_norm": 1.510978102684021, "learning_rate": 1.3851594202898553e-05, "loss": 1.6527, "step": 11110 }, { "epoch": 0.93, "grad_norm": 2.76588773727417, "learning_rate": 1.3845797101449276e-05, "loss": 1.7147, "step": 11120 }, { "epoch": 0.93, "grad_norm": 4.148089408874512, "learning_rate": 1.384e-05, "loss": 1.7523, "step": 11130 }, { "epoch": 0.93, "grad_norm": 2.291975736618042, "learning_rate": 1.3834202898550725e-05, "loss": 1.6122, "step": 11140 }, { "epoch": 0.93, "grad_norm": 3.4009017944335938, "learning_rate": 1.382840579710145e-05, "loss": 1.7563, "step": 11150 }, { "epoch": 0.93, "grad_norm": 1.2660974264144897, "learning_rate": 1.3822608695652175e-05, "loss": 1.8469, "step": 11160 }, { "epoch": 0.93, "grad_norm": 1.1221381425857544, "learning_rate": 1.3816811594202899e-05, "loss": 1.8096, "step": 11170 }, { "epoch": 0.93, "grad_norm": 1.3825074434280396, "learning_rate": 1.3811014492753625e-05, "loss": 1.7913, "step": 11180 }, { "epoch": 0.93, "grad_norm": 8.367438316345215, "learning_rate": 1.3805217391304349e-05, "loss": 1.7117, "step": 11190 }, { "epoch": 0.93, "grad_norm": 3.15596866607666, "learning_rate": 1.3799420289855074e-05, "loss": 1.7031, "step": 11200 }, { "epoch": 0.93, "grad_norm": 6.4687042236328125, "learning_rate": 1.3793623188405798e-05, "loss": 1.6699, "step": 11210 }, { "epoch": 0.94, "grad_norm": 3.699357509613037, "learning_rate": 1.3787826086956524e-05, "loss": 1.829, "step": 11220 }, { "epoch": 0.94, "grad_norm": 1.6832666397094727, "learning_rate": 1.3782028985507248e-05, "loss": 1.5892, "step": 11230 }, { "epoch": 0.94, "grad_norm": 3.3249785900115967, "learning_rate": 1.377623188405797e-05, "loss": 1.8073, "step": 11240 }, { "epoch": 0.94, "grad_norm": 3.6117970943450928, "learning_rate": 1.3770434782608698e-05, "loss": 1.684, "step": 11250 }, { "epoch": 0.94, "grad_norm": 2.9994568824768066, "learning_rate": 1.376463768115942e-05, "loss": 1.6926, "step": 11260 }, { "epoch": 0.94, "grad_norm": 2.465999126434326, "learning_rate": 1.3758840579710146e-05, "loss": 1.6104, "step": 11270 }, { "epoch": 0.94, "grad_norm": 4.657724380493164, "learning_rate": 1.375304347826087e-05, "loss": 1.8288, "step": 11280 }, { "epoch": 0.94, "grad_norm": 1.94265615940094, "learning_rate": 1.3747246376811596e-05, "loss": 1.7035, "step": 11290 }, { "epoch": 0.94, "grad_norm": 1.6312084197998047, "learning_rate": 1.374144927536232e-05, "loss": 1.7352, "step": 11300 }, { "epoch": 0.94, "grad_norm": 2.722726583480835, "learning_rate": 1.3735652173913045e-05, "loss": 1.7667, "step": 11310 }, { "epoch": 0.94, "grad_norm": 6.727392196655273, "learning_rate": 1.372985507246377e-05, "loss": 1.5222, "step": 11320 }, { "epoch": 0.94, "grad_norm": 5.453555583953857, "learning_rate": 1.3724057971014493e-05, "loss": 1.8861, "step": 11330 }, { "epoch": 0.94, "grad_norm": 4.511663913726807, "learning_rate": 1.3718260869565219e-05, "loss": 1.836, "step": 11340 }, { "epoch": 0.95, "grad_norm": 2.072493553161621, "learning_rate": 1.3712463768115943e-05, "loss": 1.5072, "step": 11350 }, { "epoch": 0.95, "grad_norm": 1.9541881084442139, "learning_rate": 1.3706666666666669e-05, "loss": 1.6182, "step": 11360 }, { "epoch": 0.95, "grad_norm": 4.463501453399658, "learning_rate": 1.3700869565217391e-05, "loss": 1.8108, "step": 11370 }, { "epoch": 0.95, "grad_norm": 2.4841718673706055, "learning_rate": 1.3695072463768118e-05, "loss": 1.8205, "step": 11380 }, { "epoch": 0.95, "grad_norm": 3.357886791229248, "learning_rate": 1.368927536231884e-05, "loss": 1.6904, "step": 11390 }, { "epoch": 0.95, "grad_norm": 2.65535306930542, "learning_rate": 1.3683478260869566e-05, "loss": 1.7598, "step": 11400 }, { "epoch": 0.95, "grad_norm": 6.2453694343566895, "learning_rate": 1.367768115942029e-05, "loss": 1.6009, "step": 11410 }, { "epoch": 0.95, "grad_norm": 5.134276866912842, "learning_rate": 1.3671884057971016e-05, "loss": 1.7633, "step": 11420 }, { "epoch": 0.95, "grad_norm": 6.813711643218994, "learning_rate": 1.366608695652174e-05, "loss": 1.5795, "step": 11430 }, { "epoch": 0.95, "grad_norm": 1.364342451095581, "learning_rate": 1.3660289855072464e-05, "loss": 1.6675, "step": 11440 }, { "epoch": 0.95, "grad_norm": 1.627918004989624, "learning_rate": 1.365449275362319e-05, "loss": 1.7405, "step": 11450 }, { "epoch": 0.95, "grad_norm": 3.7319889068603516, "learning_rate": 1.3648695652173914e-05, "loss": 1.6259, "step": 11460 }, { "epoch": 0.96, "grad_norm": 0.867110013961792, "learning_rate": 1.364289855072464e-05, "loss": 1.5038, "step": 11470 }, { "epoch": 0.96, "grad_norm": 3.0498149394989014, "learning_rate": 1.3637101449275364e-05, "loss": 1.557, "step": 11480 }, { "epoch": 0.96, "grad_norm": 3.804266929626465, "learning_rate": 1.363130434782609e-05, "loss": 1.4778, "step": 11490 }, { "epoch": 0.96, "grad_norm": 4.833094120025635, "learning_rate": 1.3625507246376812e-05, "loss": 1.7432, "step": 11500 }, { "epoch": 0.96, "eval_loss": 1.6946756839752197, "eval_runtime": 107.5258, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 11500 }, { "epoch": 0.96, "grad_norm": 1.91051185131073, "learning_rate": 1.3619710144927539e-05, "loss": 1.6593, "step": 11510 }, { "epoch": 0.96, "grad_norm": 1.2992429733276367, "learning_rate": 1.3613913043478261e-05, "loss": 1.6403, "step": 11520 }, { "epoch": 0.96, "grad_norm": 1.9277349710464478, "learning_rate": 1.3608115942028987e-05, "loss": 1.7522, "step": 11530 }, { "epoch": 0.96, "grad_norm": 3.0008699893951416, "learning_rate": 1.3602318840579711e-05, "loss": 1.6957, "step": 11540 }, { "epoch": 0.96, "grad_norm": 1.4071532487869263, "learning_rate": 1.3596521739130435e-05, "loss": 1.6937, "step": 11550 }, { "epoch": 0.96, "grad_norm": 2.890350103378296, "learning_rate": 1.359072463768116e-05, "loss": 1.6852, "step": 11560 }, { "epoch": 0.96, "grad_norm": 5.77126932144165, "learning_rate": 1.3584927536231885e-05, "loss": 1.6281, "step": 11570 }, { "epoch": 0.96, "grad_norm": 1.499045729637146, "learning_rate": 1.357913043478261e-05, "loss": 1.6476, "step": 11580 }, { "epoch": 0.97, "grad_norm": 1.360612154006958, "learning_rate": 1.3573333333333334e-05, "loss": 1.7523, "step": 11590 }, { "epoch": 0.97, "grad_norm": 3.634352922439575, "learning_rate": 1.356753623188406e-05, "loss": 1.7197, "step": 11600 }, { "epoch": 0.97, "grad_norm": 2.7271361351013184, "learning_rate": 1.3561739130434784e-05, "loss": 1.7322, "step": 11610 }, { "epoch": 0.97, "grad_norm": 4.323874473571777, "learning_rate": 1.355594202898551e-05, "loss": 1.6905, "step": 11620 }, { "epoch": 0.97, "grad_norm": 4.2105393409729, "learning_rate": 1.3550144927536232e-05, "loss": 1.7726, "step": 11630 }, { "epoch": 0.97, "grad_norm": 4.96962833404541, "learning_rate": 1.3544347826086956e-05, "loss": 1.6487, "step": 11640 }, { "epoch": 0.97, "grad_norm": 3.641913652420044, "learning_rate": 1.3538550724637682e-05, "loss": 1.6343, "step": 11650 }, { "epoch": 0.97, "grad_norm": 6.166054725646973, "learning_rate": 1.3532753623188406e-05, "loss": 1.6923, "step": 11660 }, { "epoch": 0.97, "grad_norm": 2.171774387359619, "learning_rate": 1.3526956521739132e-05, "loss": 1.8464, "step": 11670 }, { "epoch": 0.97, "grad_norm": 2.8756415843963623, "learning_rate": 1.3521159420289856e-05, "loss": 1.6128, "step": 11680 }, { "epoch": 0.97, "grad_norm": 4.395898818969727, "learning_rate": 1.3515362318840581e-05, "loss": 1.7561, "step": 11690 }, { "epoch": 0.97, "grad_norm": 3.581686019897461, "learning_rate": 1.3509565217391305e-05, "loss": 1.6437, "step": 11700 }, { "epoch": 0.98, "grad_norm": 2.5616443157196045, "learning_rate": 1.3503768115942031e-05, "loss": 1.7065, "step": 11710 }, { "epoch": 0.98, "grad_norm": 3.503169536590576, "learning_rate": 1.3497971014492755e-05, "loss": 1.7289, "step": 11720 }, { "epoch": 0.98, "grad_norm": 2.8909196853637695, "learning_rate": 1.349217391304348e-05, "loss": 1.708, "step": 11730 }, { "epoch": 0.98, "grad_norm": 2.47155499458313, "learning_rate": 1.3486376811594205e-05, "loss": 1.6653, "step": 11740 }, { "epoch": 0.98, "grad_norm": 3.7819576263427734, "learning_rate": 1.3480579710144927e-05, "loss": 1.765, "step": 11750 }, { "epoch": 0.98, "grad_norm": 4.265803337097168, "learning_rate": 1.3474782608695653e-05, "loss": 1.8526, "step": 11760 }, { "epoch": 0.98, "grad_norm": 1.643122673034668, "learning_rate": 1.3468985507246377e-05, "loss": 1.7633, "step": 11770 }, { "epoch": 0.98, "grad_norm": 8.063491821289062, "learning_rate": 1.3463188405797102e-05, "loss": 1.7809, "step": 11780 }, { "epoch": 0.98, "grad_norm": 1.299911618232727, "learning_rate": 1.3457391304347826e-05, "loss": 1.7441, "step": 11790 }, { "epoch": 0.98, "grad_norm": 2.312831401824951, "learning_rate": 1.3451594202898552e-05, "loss": 1.7987, "step": 11800 }, { "epoch": 0.98, "grad_norm": 3.44124698638916, "learning_rate": 1.3445797101449276e-05, "loss": 1.6447, "step": 11810 }, { "epoch": 0.98, "grad_norm": 3.1065027713775635, "learning_rate": 1.3440000000000002e-05, "loss": 1.8101, "step": 11820 }, { "epoch": 0.99, "grad_norm": 4.687283039093018, "learning_rate": 1.3434202898550726e-05, "loss": 1.7435, "step": 11830 }, { "epoch": 0.99, "grad_norm": 4.883983135223389, "learning_rate": 1.3428405797101451e-05, "loss": 1.6305, "step": 11840 }, { "epoch": 0.99, "grad_norm": 1.454877257347107, "learning_rate": 1.3422608695652175e-05, "loss": 1.7328, "step": 11850 }, { "epoch": 0.99, "grad_norm": 5.217712879180908, "learning_rate": 1.34168115942029e-05, "loss": 1.8219, "step": 11860 }, { "epoch": 0.99, "grad_norm": 3.0228896141052246, "learning_rate": 1.3411014492753625e-05, "loss": 1.6406, "step": 11870 }, { "epoch": 0.99, "grad_norm": 1.98235285282135, "learning_rate": 1.3405217391304347e-05, "loss": 1.814, "step": 11880 }, { "epoch": 0.99, "grad_norm": 4.972719192504883, "learning_rate": 1.3399420289855073e-05, "loss": 1.7487, "step": 11890 }, { "epoch": 0.99, "grad_norm": 2.730912446975708, "learning_rate": 1.3393623188405797e-05, "loss": 1.6201, "step": 11900 }, { "epoch": 0.99, "grad_norm": 3.768197774887085, "learning_rate": 1.3387826086956523e-05, "loss": 1.5983, "step": 11910 }, { "epoch": 0.99, "grad_norm": 1.7705687284469604, "learning_rate": 1.3382028985507247e-05, "loss": 1.8149, "step": 11920 }, { "epoch": 0.99, "grad_norm": 3.303154230117798, "learning_rate": 1.3376231884057973e-05, "loss": 1.5987, "step": 11930 }, { "epoch": 0.99, "grad_norm": 3.428690195083618, "learning_rate": 1.3370434782608697e-05, "loss": 1.7096, "step": 11940 }, { "epoch": 1.0, "grad_norm": 1.814437985420227, "learning_rate": 1.336463768115942e-05, "loss": 1.5782, "step": 11950 }, { "epoch": 1.0, "grad_norm": 3.096653938293457, "learning_rate": 1.3358840579710146e-05, "loss": 1.7079, "step": 11960 }, { "epoch": 1.0, "grad_norm": 2.4044532775878906, "learning_rate": 1.335304347826087e-05, "loss": 1.5858, "step": 11970 }, { "epoch": 1.0, "grad_norm": 4.206218719482422, "learning_rate": 1.3347246376811596e-05, "loss": 1.7468, "step": 11980 }, { "epoch": 1.0, "grad_norm": 4.609010219573975, "learning_rate": 1.334144927536232e-05, "loss": 1.7859, "step": 11990 }, { "epoch": 1.0, "grad_norm": 2.646367311477661, "learning_rate": 1.3335652173913046e-05, "loss": 1.4624, "step": 12000 }, { "epoch": 1.0, "eval_loss": 1.7011674642562866, "eval_runtime": 107.5473, "eval_samples_per_second": 9.298, "eval_steps_per_second": 2.325, "step": 12000 }, { "epoch": 1.0, "grad_norm": 8.173443794250488, "learning_rate": 1.3329855072463768e-05, "loss": 1.6229, "step": 12010 }, { "epoch": 1.0, "grad_norm": 1.7814215421676636, "learning_rate": 1.3324057971014495e-05, "loss": 1.4929, "step": 12020 }, { "epoch": 1.0, "grad_norm": 1.4045521020889282, "learning_rate": 1.3318260869565218e-05, "loss": 1.6919, "step": 12030 }, { "epoch": 1.0, "grad_norm": 1.0634101629257202, "learning_rate": 1.3312463768115943e-05, "loss": 1.6938, "step": 12040 }, { "epoch": 1.0, "grad_norm": 1.312534213066101, "learning_rate": 1.3306666666666667e-05, "loss": 1.718, "step": 12050 }, { "epoch": 1.0, "grad_norm": 6.818319320678711, "learning_rate": 1.3300869565217391e-05, "loss": 1.7151, "step": 12060 }, { "epoch": 1.01, "grad_norm": 3.955862045288086, "learning_rate": 1.3295072463768117e-05, "loss": 1.7341, "step": 12070 }, { "epoch": 1.01, "grad_norm": 1.5620914697647095, "learning_rate": 1.3289275362318841e-05, "loss": 1.8083, "step": 12080 }, { "epoch": 1.01, "grad_norm": 3.745596170425415, "learning_rate": 1.3283478260869567e-05, "loss": 1.6492, "step": 12090 }, { "epoch": 1.01, "grad_norm": 6.607650279998779, "learning_rate": 1.3277681159420291e-05, "loss": 1.6268, "step": 12100 }, { "epoch": 1.01, "grad_norm": 1.8814033269882202, "learning_rate": 1.3271884057971017e-05, "loss": 1.6127, "step": 12110 }, { "epoch": 1.01, "grad_norm": 1.7189909219741821, "learning_rate": 1.326608695652174e-05, "loss": 1.7186, "step": 12120 }, { "epoch": 1.01, "grad_norm": 3.2475483417510986, "learning_rate": 1.3260289855072466e-05, "loss": 1.7642, "step": 12130 }, { "epoch": 1.01, "grad_norm": 5.229576587677002, "learning_rate": 1.3254492753623189e-05, "loss": 1.7376, "step": 12140 }, { "epoch": 1.01, "grad_norm": 5.324844837188721, "learning_rate": 1.3248695652173916e-05, "loss": 1.753, "step": 12150 }, { "epoch": 1.01, "grad_norm": 1.727643370628357, "learning_rate": 1.3242898550724638e-05, "loss": 1.7612, "step": 12160 }, { "epoch": 1.01, "grad_norm": 2.785902976989746, "learning_rate": 1.3237101449275362e-05, "loss": 1.8341, "step": 12170 }, { "epoch": 1.01, "grad_norm": 1.65829598903656, "learning_rate": 1.3231304347826088e-05, "loss": 1.6544, "step": 12180 }, { "epoch": 1.02, "grad_norm": 5.451395511627197, "learning_rate": 1.3225507246376812e-05, "loss": 1.7604, "step": 12190 }, { "epoch": 1.02, "grad_norm": 11.722532272338867, "learning_rate": 1.3219710144927538e-05, "loss": 1.6377, "step": 12200 }, { "epoch": 1.02, "grad_norm": 3.6431243419647217, "learning_rate": 1.3213913043478262e-05, "loss": 1.6285, "step": 12210 }, { "epoch": 1.02, "grad_norm": 3.2399184703826904, "learning_rate": 1.3208115942028987e-05, "loss": 1.6122, "step": 12220 }, { "epoch": 1.02, "grad_norm": 1.8696668148040771, "learning_rate": 1.3202318840579711e-05, "loss": 1.7179, "step": 12230 }, { "epoch": 1.02, "grad_norm": 3.199878692626953, "learning_rate": 1.3196521739130437e-05, "loss": 1.6551, "step": 12240 }, { "epoch": 1.02, "grad_norm": 9.326812744140625, "learning_rate": 1.3190724637681161e-05, "loss": 1.6373, "step": 12250 }, { "epoch": 1.02, "grad_norm": 1.3237980604171753, "learning_rate": 1.3184927536231883e-05, "loss": 1.6698, "step": 12260 }, { "epoch": 1.02, "grad_norm": 1.768131971359253, "learning_rate": 1.3179130434782609e-05, "loss": 1.7116, "step": 12270 }, { "epoch": 1.02, "grad_norm": 4.411503791809082, "learning_rate": 1.3173333333333333e-05, "loss": 1.6156, "step": 12280 }, { "epoch": 1.02, "grad_norm": 4.180882930755615, "learning_rate": 1.3167536231884059e-05, "loss": 1.6427, "step": 12290 }, { "epoch": 1.02, "grad_norm": 4.48996639251709, "learning_rate": 1.3161739130434783e-05, "loss": 1.8268, "step": 12300 }, { "epoch": 1.03, "grad_norm": 3.8137004375457764, "learning_rate": 1.3155942028985509e-05, "loss": 1.6113, "step": 12310 }, { "epoch": 1.03, "grad_norm": 1.2274035215377808, "learning_rate": 1.3150144927536233e-05, "loss": 1.5674, "step": 12320 }, { "epoch": 1.03, "grad_norm": 1.9615777730941772, "learning_rate": 1.3144347826086958e-05, "loss": 1.7014, "step": 12330 }, { "epoch": 1.03, "grad_norm": 2.100159168243408, "learning_rate": 1.3138550724637682e-05, "loss": 1.6939, "step": 12340 }, { "epoch": 1.03, "grad_norm": 1.2069636583328247, "learning_rate": 1.3132753623188408e-05, "loss": 1.8285, "step": 12350 }, { "epoch": 1.03, "grad_norm": 4.2240705490112305, "learning_rate": 1.3126956521739132e-05, "loss": 1.7967, "step": 12360 }, { "epoch": 1.03, "grad_norm": 4.67422342300415, "learning_rate": 1.3121159420289856e-05, "loss": 1.6853, "step": 12370 }, { "epoch": 1.03, "grad_norm": 2.0639190673828125, "learning_rate": 1.3115362318840582e-05, "loss": 1.8051, "step": 12380 }, { "epoch": 1.03, "grad_norm": 1.48922598361969, "learning_rate": 1.3109565217391304e-05, "loss": 1.744, "step": 12390 }, { "epoch": 1.03, "grad_norm": 0.7941759824752808, "learning_rate": 1.310376811594203e-05, "loss": 1.6758, "step": 12400 }, { "epoch": 1.03, "grad_norm": 1.099454402923584, "learning_rate": 1.3097971014492754e-05, "loss": 1.6345, "step": 12410 }, { "epoch": 1.03, "grad_norm": 1.9024593830108643, "learning_rate": 1.309217391304348e-05, "loss": 1.5965, "step": 12420 }, { "epoch": 1.04, "grad_norm": 3.2126779556274414, "learning_rate": 1.3086376811594203e-05, "loss": 1.6138, "step": 12430 }, { "epoch": 1.04, "grad_norm": 3.7725675106048584, "learning_rate": 1.3080579710144929e-05, "loss": 1.7117, "step": 12440 }, { "epoch": 1.04, "grad_norm": 3.924130439758301, "learning_rate": 1.3074782608695653e-05, "loss": 1.7234, "step": 12450 }, { "epoch": 1.04, "grad_norm": 2.2998852729797363, "learning_rate": 1.3068985507246379e-05, "loss": 1.6755, "step": 12460 }, { "epoch": 1.04, "grad_norm": 4.43101167678833, "learning_rate": 1.3063188405797103e-05, "loss": 1.6946, "step": 12470 }, { "epoch": 1.04, "grad_norm": 5.988603115081787, "learning_rate": 1.3057391304347827e-05, "loss": 1.6585, "step": 12480 }, { "epoch": 1.04, "grad_norm": 3.184678316116333, "learning_rate": 1.3051594202898552e-05, "loss": 1.6695, "step": 12490 }, { "epoch": 1.04, "grad_norm": 6.5645880699157715, "learning_rate": 1.3045797101449277e-05, "loss": 1.7914, "step": 12500 }, { "epoch": 1.04, "eval_loss": 1.7145652770996094, "eval_runtime": 107.5024, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 12500 }, { "epoch": 1.04, "grad_norm": 1.6192374229431152, "learning_rate": 1.3040000000000002e-05, "loss": 1.7134, "step": 12510 }, { "epoch": 1.04, "grad_norm": 1.7455005645751953, "learning_rate": 1.3034202898550725e-05, "loss": 1.7623, "step": 12520 }, { "epoch": 1.04, "grad_norm": 3.3086254596710205, "learning_rate": 1.302840579710145e-05, "loss": 1.6877, "step": 12530 }, { "epoch": 1.04, "grad_norm": 1.5355310440063477, "learning_rate": 1.3022608695652174e-05, "loss": 1.7792, "step": 12540 }, { "epoch": 1.05, "grad_norm": 1.7353932857513428, "learning_rate": 1.30168115942029e-05, "loss": 1.7148, "step": 12550 }, { "epoch": 1.05, "grad_norm": 6.468092918395996, "learning_rate": 1.3011014492753624e-05, "loss": 1.7166, "step": 12560 }, { "epoch": 1.05, "grad_norm": 4.603166580200195, "learning_rate": 1.3005217391304348e-05, "loss": 1.7689, "step": 12570 }, { "epoch": 1.05, "grad_norm": 3.521178960800171, "learning_rate": 1.2999420289855074e-05, "loss": 1.6698, "step": 12580 }, { "epoch": 1.05, "grad_norm": 2.06740665435791, "learning_rate": 1.2993623188405798e-05, "loss": 1.6722, "step": 12590 }, { "epoch": 1.05, "grad_norm": 2.7508833408355713, "learning_rate": 1.2987826086956523e-05, "loss": 1.6326, "step": 12600 }, { "epoch": 1.05, "grad_norm": 2.8367888927459717, "learning_rate": 1.2982028985507247e-05, "loss": 1.7824, "step": 12610 }, { "epoch": 1.05, "grad_norm": 2.0329577922821045, "learning_rate": 1.2976231884057973e-05, "loss": 1.6271, "step": 12620 }, { "epoch": 1.05, "grad_norm": 1.7697292566299438, "learning_rate": 1.2970434782608697e-05, "loss": 1.5485, "step": 12630 }, { "epoch": 1.05, "grad_norm": 2.019354820251465, "learning_rate": 1.2964637681159423e-05, "loss": 1.764, "step": 12640 }, { "epoch": 1.05, "grad_norm": 3.5531795024871826, "learning_rate": 1.2958840579710145e-05, "loss": 1.5918, "step": 12650 }, { "epoch": 1.05, "grad_norm": 4.986388206481934, "learning_rate": 1.295304347826087e-05, "loss": 1.7121, "step": 12660 }, { "epoch": 1.06, "grad_norm": 1.05776047706604, "learning_rate": 1.2947246376811595e-05, "loss": 1.8649, "step": 12670 }, { "epoch": 1.06, "grad_norm": 8.3735990524292, "learning_rate": 1.2941449275362319e-05, "loss": 1.7337, "step": 12680 }, { "epoch": 1.06, "grad_norm": 1.643716812133789, "learning_rate": 1.2935652173913044e-05, "loss": 1.8637, "step": 12690 }, { "epoch": 1.06, "grad_norm": 6.21591854095459, "learning_rate": 1.2929855072463768e-05, "loss": 1.6824, "step": 12700 }, { "epoch": 1.06, "grad_norm": 1.3029552698135376, "learning_rate": 1.2924057971014494e-05, "loss": 1.7475, "step": 12710 }, { "epoch": 1.06, "grad_norm": 1.8332914113998413, "learning_rate": 1.2918260869565218e-05, "loss": 1.6892, "step": 12720 }, { "epoch": 1.06, "grad_norm": 3.6949455738067627, "learning_rate": 1.2912463768115944e-05, "loss": 1.6439, "step": 12730 }, { "epoch": 1.06, "grad_norm": 2.436410427093506, "learning_rate": 1.2906666666666668e-05, "loss": 1.7154, "step": 12740 }, { "epoch": 1.06, "grad_norm": 2.0431625843048096, "learning_rate": 1.2900869565217394e-05, "loss": 1.6907, "step": 12750 }, { "epoch": 1.06, "grad_norm": 6.340989112854004, "learning_rate": 1.2895072463768118e-05, "loss": 1.6155, "step": 12760 }, { "epoch": 1.06, "grad_norm": 2.4236230850219727, "learning_rate": 1.2889275362318843e-05, "loss": 1.7229, "step": 12770 }, { "epoch": 1.06, "grad_norm": 2.5978729724884033, "learning_rate": 1.2883478260869566e-05, "loss": 1.7474, "step": 12780 }, { "epoch": 1.07, "grad_norm": 1.729132056236267, "learning_rate": 1.287768115942029e-05, "loss": 1.6459, "step": 12790 }, { "epoch": 1.07, "grad_norm": 2.0741076469421387, "learning_rate": 1.2871884057971015e-05, "loss": 1.7362, "step": 12800 }, { "epoch": 1.07, "grad_norm": 2.056138038635254, "learning_rate": 1.286608695652174e-05, "loss": 1.6319, "step": 12810 }, { "epoch": 1.07, "grad_norm": 3.7434098720550537, "learning_rate": 1.2860289855072465e-05, "loss": 1.6161, "step": 12820 }, { "epoch": 1.07, "grad_norm": 1.8511974811553955, "learning_rate": 1.2854492753623189e-05, "loss": 1.5639, "step": 12830 }, { "epoch": 1.07, "grad_norm": 4.42405891418457, "learning_rate": 1.2848695652173915e-05, "loss": 1.7232, "step": 12840 }, { "epoch": 1.07, "grad_norm": 6.329336166381836, "learning_rate": 1.2842898550724639e-05, "loss": 1.4743, "step": 12850 }, { "epoch": 1.07, "grad_norm": 2.943577527999878, "learning_rate": 1.2837101449275364e-05, "loss": 1.724, "step": 12860 }, { "epoch": 1.07, "grad_norm": 2.932284116744995, "learning_rate": 1.2831304347826088e-05, "loss": 1.7264, "step": 12870 }, { "epoch": 1.07, "grad_norm": 5.513105392456055, "learning_rate": 1.282550724637681e-05, "loss": 1.7304, "step": 12880 }, { "epoch": 1.07, "grad_norm": 0.8422374129295349, "learning_rate": 1.2819710144927538e-05, "loss": 1.7715, "step": 12890 }, { "epoch": 1.07, "grad_norm": 2.7153878211975098, "learning_rate": 1.281391304347826e-05, "loss": 1.6772, "step": 12900 }, { "epoch": 1.08, "grad_norm": 4.201610088348389, "learning_rate": 1.2808115942028986e-05, "loss": 1.6741, "step": 12910 }, { "epoch": 1.08, "grad_norm": 2.2562031745910645, "learning_rate": 1.280231884057971e-05, "loss": 1.7259, "step": 12920 }, { "epoch": 1.08, "grad_norm": 3.5883278846740723, "learning_rate": 1.2797101449275365e-05, "loss": 1.6095, "step": 12930 }, { "epoch": 1.08, "grad_norm": 2.405609130859375, "learning_rate": 1.2791304347826087e-05, "loss": 1.6715, "step": 12940 }, { "epoch": 1.08, "grad_norm": 4.966475486755371, "learning_rate": 1.2785507246376815e-05, "loss": 1.7278, "step": 12950 }, { "epoch": 1.08, "grad_norm": 2.7825913429260254, "learning_rate": 1.2779710144927537e-05, "loss": 1.7728, "step": 12960 }, { "epoch": 1.08, "grad_norm": 1.6676304340362549, "learning_rate": 1.277391304347826e-05, "loss": 1.604, "step": 12970 }, { "epoch": 1.08, "grad_norm": 8.661015510559082, "learning_rate": 1.2768115942028987e-05, "loss": 1.6988, "step": 12980 }, { "epoch": 1.08, "grad_norm": 1.4278024435043335, "learning_rate": 1.276231884057971e-05, "loss": 1.6335, "step": 12990 }, { "epoch": 1.08, "grad_norm": 1.0306954383850098, "learning_rate": 1.2756521739130436e-05, "loss": 1.7252, "step": 13000 }, { "epoch": 1.08, "eval_loss": 1.6973094940185547, "eval_runtime": 107.5084, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 13000 }, { "epoch": 1.08, "grad_norm": 4.311023712158203, "learning_rate": 1.275072463768116e-05, "loss": 1.6827, "step": 13010 }, { "epoch": 1.08, "grad_norm": 5.270779132843018, "learning_rate": 1.2744927536231886e-05, "loss": 1.6404, "step": 13020 }, { "epoch": 1.09, "grad_norm": 1.767482876777649, "learning_rate": 1.273913043478261e-05, "loss": 1.6437, "step": 13030 }, { "epoch": 1.09, "grad_norm": 4.334309101104736, "learning_rate": 1.2733333333333336e-05, "loss": 1.8447, "step": 13040 }, { "epoch": 1.09, "grad_norm": 8.177881240844727, "learning_rate": 1.272753623188406e-05, "loss": 1.7184, "step": 13050 }, { "epoch": 1.09, "grad_norm": 0.994053304195404, "learning_rate": 1.2721739130434782e-05, "loss": 1.5713, "step": 13060 }, { "epoch": 1.09, "grad_norm": 2.124035120010376, "learning_rate": 1.2715942028985508e-05, "loss": 1.7455, "step": 13070 }, { "epoch": 1.09, "grad_norm": 4.642811298370361, "learning_rate": 1.2710144927536232e-05, "loss": 1.6266, "step": 13080 }, { "epoch": 1.09, "grad_norm": 5.6701507568359375, "learning_rate": 1.2704347826086957e-05, "loss": 1.5264, "step": 13090 }, { "epoch": 1.09, "grad_norm": 3.8725638389587402, "learning_rate": 1.2698550724637681e-05, "loss": 1.6485, "step": 13100 }, { "epoch": 1.09, "grad_norm": 4.132758140563965, "learning_rate": 1.2692753623188407e-05, "loss": 1.7384, "step": 13110 }, { "epoch": 1.09, "grad_norm": 1.472156047821045, "learning_rate": 1.2686956521739131e-05, "loss": 1.7509, "step": 13120 }, { "epoch": 1.09, "grad_norm": 2.3745663166046143, "learning_rate": 1.2681159420289857e-05, "loss": 1.8063, "step": 13130 }, { "epoch": 1.09, "grad_norm": 3.400909423828125, "learning_rate": 1.267536231884058e-05, "loss": 1.502, "step": 13140 }, { "epoch": 1.1, "grad_norm": 2.341827154159546, "learning_rate": 1.2669565217391306e-05, "loss": 1.8114, "step": 13150 }, { "epoch": 1.1, "grad_norm": 4.864140033721924, "learning_rate": 1.266376811594203e-05, "loss": 1.6826, "step": 13160 }, { "epoch": 1.1, "grad_norm": 1.464040756225586, "learning_rate": 1.2657971014492755e-05, "loss": 1.8461, "step": 13170 }, { "epoch": 1.1, "grad_norm": 1.3931642770767212, "learning_rate": 1.265217391304348e-05, "loss": 1.5932, "step": 13180 }, { "epoch": 1.1, "grad_norm": 1.9136666059494019, "learning_rate": 1.2646376811594203e-05, "loss": 1.655, "step": 13190 }, { "epoch": 1.1, "grad_norm": 0.6585626006126404, "learning_rate": 1.2640579710144928e-05, "loss": 1.6953, "step": 13200 }, { "epoch": 1.1, "grad_norm": 1.4719610214233398, "learning_rate": 1.2634782608695652e-05, "loss": 1.7122, "step": 13210 }, { "epoch": 1.1, "grad_norm": 2.93149995803833, "learning_rate": 1.2628985507246378e-05, "loss": 1.7336, "step": 13220 }, { "epoch": 1.1, "grad_norm": 2.2544167041778564, "learning_rate": 1.2623188405797102e-05, "loss": 1.5404, "step": 13230 }, { "epoch": 1.1, "grad_norm": 2.97430682182312, "learning_rate": 1.2617391304347828e-05, "loss": 1.7408, "step": 13240 }, { "epoch": 1.1, "grad_norm": 1.8780460357666016, "learning_rate": 1.2611594202898552e-05, "loss": 1.5946, "step": 13250 }, { "epoch": 1.1, "grad_norm": 3.108811140060425, "learning_rate": 1.2605797101449277e-05, "loss": 1.7989, "step": 13260 }, { "epoch": 1.11, "grad_norm": 1.7901716232299805, "learning_rate": 1.2600000000000001e-05, "loss": 1.715, "step": 13270 }, { "epoch": 1.11, "grad_norm": 2.103832244873047, "learning_rate": 1.2594202898550725e-05, "loss": 1.5802, "step": 13280 }, { "epoch": 1.11, "grad_norm": 3.0903170108795166, "learning_rate": 1.2588405797101451e-05, "loss": 1.7531, "step": 13290 }, { "epoch": 1.11, "grad_norm": 2.20255184173584, "learning_rate": 1.2582608695652175e-05, "loss": 1.7651, "step": 13300 }, { "epoch": 1.11, "grad_norm": 1.1130000352859497, "learning_rate": 1.25768115942029e-05, "loss": 1.6944, "step": 13310 }, { "epoch": 1.11, "grad_norm": 2.6338019371032715, "learning_rate": 1.2571014492753623e-05, "loss": 1.6242, "step": 13320 }, { "epoch": 1.11, "grad_norm": 3.7077834606170654, "learning_rate": 1.2565217391304349e-05, "loss": 1.5732, "step": 13330 }, { "epoch": 1.11, "grad_norm": 2.3346569538116455, "learning_rate": 1.2559420289855073e-05, "loss": 1.8445, "step": 13340 }, { "epoch": 1.11, "grad_norm": 9.315794944763184, "learning_rate": 1.2553623188405798e-05, "loss": 1.6678, "step": 13350 }, { "epoch": 1.11, "grad_norm": 3.115358591079712, "learning_rate": 1.2547826086956522e-05, "loss": 1.719, "step": 13360 }, { "epoch": 1.11, "grad_norm": 7.239322662353516, "learning_rate": 1.2542028985507246e-05, "loss": 1.6076, "step": 13370 }, { "epoch": 1.11, "grad_norm": 5.326850891113281, "learning_rate": 1.2536231884057972e-05, "loss": 1.722, "step": 13380 }, { "epoch": 1.12, "grad_norm": 5.592775344848633, "learning_rate": 1.2530434782608696e-05, "loss": 1.7467, "step": 13390 }, { "epoch": 1.12, "grad_norm": 7.9691667556762695, "learning_rate": 1.2524637681159422e-05, "loss": 1.5619, "step": 13400 }, { "epoch": 1.12, "grad_norm": 1.5476248264312744, "learning_rate": 1.2518840579710146e-05, "loss": 1.7403, "step": 13410 }, { "epoch": 1.12, "grad_norm": 2.466348886489868, "learning_rate": 1.2513043478260872e-05, "loss": 1.7568, "step": 13420 }, { "epoch": 1.12, "grad_norm": 1.7985725402832031, "learning_rate": 1.2507246376811596e-05, "loss": 1.6505, "step": 13430 }, { "epoch": 1.12, "grad_norm": 2.469550848007202, "learning_rate": 1.2501449275362321e-05, "loss": 1.686, "step": 13440 }, { "epoch": 1.12, "grad_norm": 2.672849178314209, "learning_rate": 1.2495652173913044e-05, "loss": 1.6282, "step": 13450 }, { "epoch": 1.12, "grad_norm": 2.2976021766662598, "learning_rate": 1.2489855072463771e-05, "loss": 1.7232, "step": 13460 }, { "epoch": 1.12, "grad_norm": 4.909353256225586, "learning_rate": 1.2484057971014493e-05, "loss": 1.5368, "step": 13470 }, { "epoch": 1.12, "grad_norm": 4.343336582183838, "learning_rate": 1.2478260869565217e-05, "loss": 1.8103, "step": 13480 }, { "epoch": 1.12, "grad_norm": 4.372796058654785, "learning_rate": 1.2472463768115943e-05, "loss": 1.6098, "step": 13490 }, { "epoch": 1.12, "grad_norm": 1.7971751689910889, "learning_rate": 1.2466666666666667e-05, "loss": 1.6064, "step": 13500 }, { "epoch": 1.12, "eval_loss": 1.6684225797653198, "eval_runtime": 107.5063, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 13500 }, { "epoch": 1.13, "grad_norm": 3.6352314949035645, "learning_rate": 1.2460869565217393e-05, "loss": 1.7123, "step": 13510 }, { "epoch": 1.13, "grad_norm": 3.39823317527771, "learning_rate": 1.2455072463768117e-05, "loss": 1.7258, "step": 13520 }, { "epoch": 1.13, "grad_norm": 0.8198342323303223, "learning_rate": 1.2449275362318842e-05, "loss": 1.6146, "step": 13530 }, { "epoch": 1.13, "grad_norm": 2.5022692680358887, "learning_rate": 1.2443478260869566e-05, "loss": 1.6835, "step": 13540 }, { "epoch": 1.13, "grad_norm": 3.4644007682800293, "learning_rate": 1.2437681159420292e-05, "loss": 1.7186, "step": 13550 }, { "epoch": 1.13, "grad_norm": 2.09822416305542, "learning_rate": 1.2431884057971016e-05, "loss": 1.7713, "step": 13560 }, { "epoch": 1.13, "grad_norm": 3.5740549564361572, "learning_rate": 1.2426086956521742e-05, "loss": 1.7463, "step": 13570 }, { "epoch": 1.13, "grad_norm": 2.3453164100646973, "learning_rate": 1.2420289855072464e-05, "loss": 1.6706, "step": 13580 }, { "epoch": 1.13, "grad_norm": 4.465335369110107, "learning_rate": 1.2414492753623188e-05, "loss": 1.5353, "step": 13590 }, { "epoch": 1.13, "grad_norm": 3.2128164768218994, "learning_rate": 1.2408695652173914e-05, "loss": 1.7146, "step": 13600 }, { "epoch": 1.13, "grad_norm": 1.6610260009765625, "learning_rate": 1.2402898550724638e-05, "loss": 1.6992, "step": 13610 }, { "epoch": 1.14, "grad_norm": 2.8867626190185547, "learning_rate": 1.2397101449275364e-05, "loss": 1.7159, "step": 13620 }, { "epoch": 1.14, "grad_norm": 1.6658381223678589, "learning_rate": 1.2391304347826088e-05, "loss": 1.6678, "step": 13630 }, { "epoch": 1.14, "grad_norm": 3.239511728286743, "learning_rate": 1.2385507246376813e-05, "loss": 1.6895, "step": 13640 }, { "epoch": 1.14, "grad_norm": 3.459529399871826, "learning_rate": 1.2379710144927537e-05, "loss": 1.7424, "step": 13650 }, { "epoch": 1.14, "grad_norm": 4.84796142578125, "learning_rate": 1.2373913043478263e-05, "loss": 1.6314, "step": 13660 }, { "epoch": 1.14, "grad_norm": 3.1975581645965576, "learning_rate": 1.2368115942028987e-05, "loss": 1.7106, "step": 13670 }, { "epoch": 1.14, "grad_norm": 2.918569564819336, "learning_rate": 1.236231884057971e-05, "loss": 1.6561, "step": 13680 }, { "epoch": 1.14, "grad_norm": 2.9456100463867188, "learning_rate": 1.2356521739130437e-05, "loss": 1.5517, "step": 13690 }, { "epoch": 1.14, "grad_norm": 2.2089602947235107, "learning_rate": 1.2350724637681159e-05, "loss": 1.6579, "step": 13700 }, { "epoch": 1.14, "grad_norm": 4.878429889678955, "learning_rate": 1.2344927536231885e-05, "loss": 1.6001, "step": 13710 }, { "epoch": 1.14, "grad_norm": 1.9600847959518433, "learning_rate": 1.2339130434782609e-05, "loss": 1.6324, "step": 13720 }, { "epoch": 1.14, "grad_norm": 4.215534687042236, "learning_rate": 1.2333333333333334e-05, "loss": 1.76, "step": 13730 }, { "epoch": 1.15, "grad_norm": 1.4697761535644531, "learning_rate": 1.2327536231884058e-05, "loss": 1.6336, "step": 13740 }, { "epoch": 1.15, "grad_norm": 2.4715230464935303, "learning_rate": 1.2321739130434784e-05, "loss": 1.7028, "step": 13750 }, { "epoch": 1.15, "grad_norm": 2.2741434574127197, "learning_rate": 1.2315942028985508e-05, "loss": 1.713, "step": 13760 }, { "epoch": 1.15, "grad_norm": 1.9270362854003906, "learning_rate": 1.2310144927536234e-05, "loss": 1.6415, "step": 13770 }, { "epoch": 1.15, "grad_norm": 4.036669731140137, "learning_rate": 1.2304347826086958e-05, "loss": 1.7856, "step": 13780 }, { "epoch": 1.15, "grad_norm": 3.498189926147461, "learning_rate": 1.2298550724637682e-05, "loss": 1.575, "step": 13790 }, { "epoch": 1.15, "grad_norm": 2.0799221992492676, "learning_rate": 1.2292753623188408e-05, "loss": 1.4947, "step": 13800 }, { "epoch": 1.15, "grad_norm": 2.5803940296173096, "learning_rate": 1.228695652173913e-05, "loss": 1.7288, "step": 13810 }, { "epoch": 1.15, "grad_norm": 4.215254306793213, "learning_rate": 1.2281159420289857e-05, "loss": 1.76, "step": 13820 }, { "epoch": 1.15, "grad_norm": 4.722815990447998, "learning_rate": 1.227536231884058e-05, "loss": 1.6812, "step": 13830 }, { "epoch": 1.15, "grad_norm": 3.2459473609924316, "learning_rate": 1.2269565217391305e-05, "loss": 1.6047, "step": 13840 }, { "epoch": 1.15, "grad_norm": 1.8425486087799072, "learning_rate": 1.226376811594203e-05, "loss": 1.6592, "step": 13850 }, { "epoch": 1.16, "grad_norm": 3.949090003967285, "learning_rate": 1.2257971014492755e-05, "loss": 1.644, "step": 13860 }, { "epoch": 1.16, "grad_norm": 2.5830769538879395, "learning_rate": 1.2252173913043479e-05, "loss": 1.7824, "step": 13870 }, { "epoch": 1.16, "grad_norm": 1.8728455305099487, "learning_rate": 1.2246376811594205e-05, "loss": 1.7766, "step": 13880 }, { "epoch": 1.16, "grad_norm": 2.3987927436828613, "learning_rate": 1.2240579710144929e-05, "loss": 1.6303, "step": 13890 }, { "epoch": 1.16, "grad_norm": 6.897881031036377, "learning_rate": 1.2234782608695653e-05, "loss": 1.6986, "step": 13900 }, { "epoch": 1.16, "grad_norm": 1.8493584394454956, "learning_rate": 1.2228985507246378e-05, "loss": 1.6711, "step": 13910 }, { "epoch": 1.16, "grad_norm": 2.1658079624176025, "learning_rate": 1.2223188405797102e-05, "loss": 1.5714, "step": 13920 }, { "epoch": 1.16, "grad_norm": 7.026312828063965, "learning_rate": 1.2217391304347828e-05, "loss": 1.6648, "step": 13930 }, { "epoch": 1.16, "grad_norm": 4.64369010925293, "learning_rate": 1.2211594202898552e-05, "loss": 1.622, "step": 13940 }, { "epoch": 1.16, "grad_norm": 1.4812980890274048, "learning_rate": 1.2205797101449278e-05, "loss": 1.6563, "step": 13950 }, { "epoch": 1.16, "grad_norm": 3.3375606536865234, "learning_rate": 1.22e-05, "loss": 1.7177, "step": 13960 }, { "epoch": 1.16, "grad_norm": 3.0398337841033936, "learning_rate": 1.2194202898550726e-05, "loss": 1.8483, "step": 13970 }, { "epoch": 1.17, "grad_norm": 2.477541446685791, "learning_rate": 1.218840579710145e-05, "loss": 1.7636, "step": 13980 }, { "epoch": 1.17, "grad_norm": 2.2809600830078125, "learning_rate": 1.2182608695652174e-05, "loss": 1.6272, "step": 13990 }, { "epoch": 1.17, "grad_norm": 3.5135672092437744, "learning_rate": 1.21768115942029e-05, "loss": 1.6053, "step": 14000 }, { "epoch": 1.17, "eval_loss": 1.655966877937317, "eval_runtime": 107.5028, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 14000 }, { "epoch": 1.17, "grad_norm": 6.65315580368042, "learning_rate": 1.2171014492753624e-05, "loss": 1.6857, "step": 14010 }, { "epoch": 1.17, "grad_norm": 3.5551207065582275, "learning_rate": 1.216521739130435e-05, "loss": 1.6957, "step": 14020 }, { "epoch": 1.17, "grad_norm": 3.662346839904785, "learning_rate": 1.2159420289855073e-05, "loss": 1.709, "step": 14030 }, { "epoch": 1.17, "grad_norm": 3.7407190799713135, "learning_rate": 1.2153623188405799e-05, "loss": 1.7064, "step": 14040 }, { "epoch": 1.17, "grad_norm": 3.3801932334899902, "learning_rate": 1.2147826086956523e-05, "loss": 1.5048, "step": 14050 }, { "epoch": 1.17, "grad_norm": 1.1840766668319702, "learning_rate": 1.2142028985507249e-05, "loss": 1.6463, "step": 14060 }, { "epoch": 1.17, "grad_norm": 5.610103130340576, "learning_rate": 1.2136231884057973e-05, "loss": 1.6947, "step": 14070 }, { "epoch": 1.17, "grad_norm": 3.217325210571289, "learning_rate": 1.2130434782608698e-05, "loss": 1.6183, "step": 14080 }, { "epoch": 1.17, "grad_norm": 3.316359281539917, "learning_rate": 1.212463768115942e-05, "loss": 1.6692, "step": 14090 }, { "epoch": 1.18, "grad_norm": 4.7778801918029785, "learning_rate": 1.2118840579710145e-05, "loss": 1.7233, "step": 14100 }, { "epoch": 1.18, "grad_norm": 4.980316162109375, "learning_rate": 1.211304347826087e-05, "loss": 1.5753, "step": 14110 }, { "epoch": 1.18, "grad_norm": 2.6318607330322266, "learning_rate": 1.2107246376811594e-05, "loss": 1.5216, "step": 14120 }, { "epoch": 1.18, "grad_norm": 4.447054386138916, "learning_rate": 1.210144927536232e-05, "loss": 1.82, "step": 14130 }, { "epoch": 1.18, "grad_norm": 1.368971586227417, "learning_rate": 1.2095652173913044e-05, "loss": 1.7372, "step": 14140 }, { "epoch": 1.18, "grad_norm": 3.2000670433044434, "learning_rate": 1.208985507246377e-05, "loss": 1.7553, "step": 14150 }, { "epoch": 1.18, "grad_norm": 3.1410953998565674, "learning_rate": 1.2084057971014494e-05, "loss": 1.778, "step": 14160 }, { "epoch": 1.18, "grad_norm": 1.6340982913970947, "learning_rate": 1.207826086956522e-05, "loss": 1.5523, "step": 14170 }, { "epoch": 1.18, "grad_norm": 2.945784568786621, "learning_rate": 1.2072463768115943e-05, "loss": 1.6813, "step": 14180 }, { "epoch": 1.18, "grad_norm": 1.7603634595870972, "learning_rate": 1.206666666666667e-05, "loss": 1.7787, "step": 14190 }, { "epoch": 1.18, "grad_norm": 3.8819363117218018, "learning_rate": 1.2060869565217393e-05, "loss": 1.7003, "step": 14200 }, { "epoch": 1.18, "grad_norm": 3.882131576538086, "learning_rate": 1.2055072463768115e-05, "loss": 1.7552, "step": 14210 }, { "epoch": 1.19, "grad_norm": 2.339003324508667, "learning_rate": 1.2049275362318841e-05, "loss": 1.6867, "step": 14220 }, { "epoch": 1.19, "grad_norm": 5.375877857208252, "learning_rate": 1.2043478260869565e-05, "loss": 1.7049, "step": 14230 }, { "epoch": 1.19, "grad_norm": 3.6439826488494873, "learning_rate": 1.2037681159420291e-05, "loss": 1.7307, "step": 14240 }, { "epoch": 1.19, "grad_norm": 2.504507541656494, "learning_rate": 1.2031884057971015e-05, "loss": 1.7403, "step": 14250 }, { "epoch": 1.19, "grad_norm": 2.381837844848633, "learning_rate": 1.202608695652174e-05, "loss": 1.6733, "step": 14260 }, { "epoch": 1.19, "grad_norm": 5.466141700744629, "learning_rate": 1.2020289855072465e-05, "loss": 1.7054, "step": 14270 }, { "epoch": 1.19, "grad_norm": 2.095202922821045, "learning_rate": 1.201449275362319e-05, "loss": 1.6852, "step": 14280 }, { "epoch": 1.19, "grad_norm": 2.33658766746521, "learning_rate": 1.2008695652173914e-05, "loss": 1.7658, "step": 14290 }, { "epoch": 1.19, "grad_norm": 3.439746379852295, "learning_rate": 1.2002898550724638e-05, "loss": 1.5291, "step": 14300 }, { "epoch": 1.19, "grad_norm": 6.367286682128906, "learning_rate": 1.1997101449275364e-05, "loss": 1.6469, "step": 14310 }, { "epoch": 1.19, "grad_norm": 4.05848503112793, "learning_rate": 1.1991304347826086e-05, "loss": 1.8017, "step": 14320 }, { "epoch": 1.19, "grad_norm": 4.569128036499023, "learning_rate": 1.1985507246376814e-05, "loss": 1.5023, "step": 14330 }, { "epoch": 1.2, "grad_norm": 3.5229172706604004, "learning_rate": 1.1979710144927536e-05, "loss": 1.7475, "step": 14340 }, { "epoch": 1.2, "grad_norm": 4.319218635559082, "learning_rate": 1.1973913043478262e-05, "loss": 1.6949, "step": 14350 }, { "epoch": 1.2, "grad_norm": 5.897936820983887, "learning_rate": 1.1968115942028986e-05, "loss": 1.6718, "step": 14360 }, { "epoch": 1.2, "grad_norm": 2.435079574584961, "learning_rate": 1.1962318840579711e-05, "loss": 1.6979, "step": 14370 }, { "epoch": 1.2, "grad_norm": 3.011115312576294, "learning_rate": 1.1956521739130435e-05, "loss": 1.4714, "step": 14380 }, { "epoch": 1.2, "grad_norm": 1.5290638208389282, "learning_rate": 1.1950724637681161e-05, "loss": 1.5394, "step": 14390 }, { "epoch": 1.2, "grad_norm": 5.277037620544434, "learning_rate": 1.1944927536231885e-05, "loss": 1.6627, "step": 14400 }, { "epoch": 1.2, "grad_norm": 5.786652088165283, "learning_rate": 1.1939130434782609e-05, "loss": 1.7029, "step": 14410 }, { "epoch": 1.2, "grad_norm": 1.4963030815124512, "learning_rate": 1.1933333333333335e-05, "loss": 1.6999, "step": 14420 }, { "epoch": 1.2, "grad_norm": 2.0223753452301025, "learning_rate": 1.1927536231884059e-05, "loss": 1.6998, "step": 14430 }, { "epoch": 1.2, "grad_norm": 3.4138219356536865, "learning_rate": 1.1921739130434785e-05, "loss": 1.5385, "step": 14440 }, { "epoch": 1.2, "grad_norm": 5.297712802886963, "learning_rate": 1.1915942028985507e-05, "loss": 1.6142, "step": 14450 }, { "epoch": 1.21, "grad_norm": 3.5567455291748047, "learning_rate": 1.1910144927536234e-05, "loss": 1.7254, "step": 14460 }, { "epoch": 1.21, "grad_norm": 4.329671382904053, "learning_rate": 1.1904347826086957e-05, "loss": 1.7098, "step": 14470 }, { "epoch": 1.21, "grad_norm": 1.1015174388885498, "learning_rate": 1.1898550724637682e-05, "loss": 1.7312, "step": 14480 }, { "epoch": 1.21, "grad_norm": 7.632756233215332, "learning_rate": 1.1892753623188406e-05, "loss": 1.5845, "step": 14490 }, { "epoch": 1.21, "grad_norm": 4.536245822906494, "learning_rate": 1.1886956521739132e-05, "loss": 1.7143, "step": 14500 }, { "epoch": 1.21, "eval_loss": 1.686023235321045, "eval_runtime": 107.504, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 14500 }, { "epoch": 1.21, "grad_norm": 2.551739454269409, "learning_rate": 1.1881159420289856e-05, "loss": 1.7145, "step": 14510 }, { "epoch": 1.21, "grad_norm": 2.037818431854248, "learning_rate": 1.187536231884058e-05, "loss": 1.4981, "step": 14520 }, { "epoch": 1.21, "grad_norm": 2.1799204349517822, "learning_rate": 1.1869565217391306e-05, "loss": 1.634, "step": 14530 }, { "epoch": 1.21, "grad_norm": 2.238110065460205, "learning_rate": 1.186376811594203e-05, "loss": 1.5684, "step": 14540 }, { "epoch": 1.21, "grad_norm": 1.419713020324707, "learning_rate": 1.1857971014492755e-05, "loss": 1.6688, "step": 14550 }, { "epoch": 1.21, "grad_norm": 2.3431668281555176, "learning_rate": 1.185217391304348e-05, "loss": 1.7666, "step": 14560 }, { "epoch": 1.21, "grad_norm": 5.332759857177734, "learning_rate": 1.1846376811594205e-05, "loss": 1.7597, "step": 14570 }, { "epoch": 1.22, "grad_norm": 1.4301503896713257, "learning_rate": 1.1840579710144927e-05, "loss": 1.742, "step": 14580 }, { "epoch": 1.22, "grad_norm": 5.501166343688965, "learning_rate": 1.1834782608695655e-05, "loss": 1.6943, "step": 14590 }, { "epoch": 1.22, "grad_norm": 1.4921094179153442, "learning_rate": 1.1828985507246377e-05, "loss": 1.7038, "step": 14600 }, { "epoch": 1.22, "grad_norm": 2.763652801513672, "learning_rate": 1.1823188405797101e-05, "loss": 1.683, "step": 14610 }, { "epoch": 1.22, "grad_norm": 7.096171855926514, "learning_rate": 1.1817391304347827e-05, "loss": 1.5952, "step": 14620 }, { "epoch": 1.22, "grad_norm": 3.0550990104675293, "learning_rate": 1.181159420289855e-05, "loss": 1.6841, "step": 14630 }, { "epoch": 1.22, "grad_norm": 6.7404937744140625, "learning_rate": 1.1805797101449277e-05, "loss": 1.3851, "step": 14640 }, { "epoch": 1.22, "grad_norm": 2.006895065307617, "learning_rate": 1.18e-05, "loss": 1.6884, "step": 14650 }, { "epoch": 1.22, "grad_norm": 4.734574317932129, "learning_rate": 1.1794202898550726e-05, "loss": 1.6566, "step": 14660 }, { "epoch": 1.22, "grad_norm": 2.7309789657592773, "learning_rate": 1.178840579710145e-05, "loss": 1.7182, "step": 14670 }, { "epoch": 1.22, "grad_norm": 2.692793846130371, "learning_rate": 1.1782608695652176e-05, "loss": 1.6907, "step": 14680 }, { "epoch": 1.22, "grad_norm": 2.2033607959747314, "learning_rate": 1.17768115942029e-05, "loss": 1.6187, "step": 14690 }, { "epoch": 1.23, "grad_norm": 1.37400221824646, "learning_rate": 1.1771014492753626e-05, "loss": 1.7607, "step": 14700 }, { "epoch": 1.23, "grad_norm": 1.4525043964385986, "learning_rate": 1.176521739130435e-05, "loss": 1.5481, "step": 14710 }, { "epoch": 1.23, "grad_norm": 4.075309753417969, "learning_rate": 1.1759420289855072e-05, "loss": 1.6437, "step": 14720 }, { "epoch": 1.23, "grad_norm": 1.6363898515701294, "learning_rate": 1.1753623188405798e-05, "loss": 1.6288, "step": 14730 }, { "epoch": 1.23, "grad_norm": 2.210326671600342, "learning_rate": 1.1747826086956522e-05, "loss": 1.6164, "step": 14740 }, { "epoch": 1.23, "grad_norm": 2.808457851409912, "learning_rate": 1.1742028985507247e-05, "loss": 1.7345, "step": 14750 }, { "epoch": 1.23, "grad_norm": 4.7805867195129395, "learning_rate": 1.1736231884057971e-05, "loss": 1.666, "step": 14760 }, { "epoch": 1.23, "grad_norm": 1.179888129234314, "learning_rate": 1.1730434782608697e-05, "loss": 1.66, "step": 14770 }, { "epoch": 1.23, "grad_norm": 3.300703287124634, "learning_rate": 1.1724637681159421e-05, "loss": 1.5873, "step": 14780 }, { "epoch": 1.23, "grad_norm": 4.175435543060303, "learning_rate": 1.1718840579710147e-05, "loss": 1.7415, "step": 14790 }, { "epoch": 1.23, "grad_norm": 3.4557175636291504, "learning_rate": 1.171304347826087e-05, "loss": 1.7369, "step": 14800 }, { "epoch": 1.23, "grad_norm": 2.879883289337158, "learning_rate": 1.1707246376811596e-05, "loss": 1.6883, "step": 14810 }, { "epoch": 1.23, "grad_norm": 7.861069679260254, "learning_rate": 1.170144927536232e-05, "loss": 1.6678, "step": 14820 }, { "epoch": 1.24, "grad_norm": 2.97892427444458, "learning_rate": 1.1695652173913043e-05, "loss": 1.5186, "step": 14830 }, { "epoch": 1.24, "grad_norm": 1.9701839685440063, "learning_rate": 1.168985507246377e-05, "loss": 1.6847, "step": 14840 }, { "epoch": 1.24, "grad_norm": 2.1920979022979736, "learning_rate": 1.1684057971014492e-05, "loss": 1.6583, "step": 14850 }, { "epoch": 1.24, "grad_norm": 4.745516777038574, "learning_rate": 1.1678260869565218e-05, "loss": 1.6884, "step": 14860 }, { "epoch": 1.24, "grad_norm": 7.854526042938232, "learning_rate": 1.1672463768115942e-05, "loss": 1.7469, "step": 14870 }, { "epoch": 1.24, "grad_norm": 2.85132098197937, "learning_rate": 1.1666666666666668e-05, "loss": 1.5597, "step": 14880 }, { "epoch": 1.24, "grad_norm": 1.9505503177642822, "learning_rate": 1.1660869565217392e-05, "loss": 1.701, "step": 14890 }, { "epoch": 1.24, "grad_norm": 4.181471824645996, "learning_rate": 1.1655072463768118e-05, "loss": 1.7843, "step": 14900 }, { "epoch": 1.24, "grad_norm": 3.77048659324646, "learning_rate": 1.1649275362318842e-05, "loss": 1.7381, "step": 14910 }, { "epoch": 1.24, "grad_norm": 7.85846471786499, "learning_rate": 1.1643478260869566e-05, "loss": 1.7673, "step": 14920 }, { "epoch": 1.24, "grad_norm": 7.585514545440674, "learning_rate": 1.1637681159420291e-05, "loss": 1.6633, "step": 14930 }, { "epoch": 1.25, "grad_norm": 10.48277759552002, "learning_rate": 1.1631884057971015e-05, "loss": 1.6939, "step": 14940 }, { "epoch": 1.25, "grad_norm": 3.1543049812316895, "learning_rate": 1.1626086956521741e-05, "loss": 1.6358, "step": 14950 }, { "epoch": 1.25, "grad_norm": 2.4650087356567383, "learning_rate": 1.1620289855072463e-05, "loss": 1.7219, "step": 14960 }, { "epoch": 1.25, "grad_norm": 1.0995783805847168, "learning_rate": 1.161449275362319e-05, "loss": 1.68, "step": 14970 }, { "epoch": 1.25, "grad_norm": 5.351716995239258, "learning_rate": 1.1608695652173913e-05, "loss": 1.7076, "step": 14980 }, { "epoch": 1.25, "grad_norm": 1.2592127323150635, "learning_rate": 1.1602898550724639e-05, "loss": 1.6001, "step": 14990 }, { "epoch": 1.25, "grad_norm": 1.4201385974884033, "learning_rate": 1.1597101449275363e-05, "loss": 1.753, "step": 15000 }, { "epoch": 1.25, "eval_loss": 1.7132278680801392, "eval_runtime": 107.4961, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 15000 }, { "epoch": 1.25, "grad_norm": 2.306769609451294, "learning_rate": 1.1591304347826088e-05, "loss": 1.6939, "step": 15010 }, { "epoch": 1.25, "grad_norm": 1.9689505100250244, "learning_rate": 1.1585507246376812e-05, "loss": 1.6635, "step": 15020 }, { "epoch": 1.25, "grad_norm": 4.207001209259033, "learning_rate": 1.1579710144927536e-05, "loss": 1.7515, "step": 15030 }, { "epoch": 1.25, "grad_norm": 3.0428285598754883, "learning_rate": 1.1573913043478262e-05, "loss": 1.5935, "step": 15040 }, { "epoch": 1.25, "grad_norm": 4.358133792877197, "learning_rate": 1.1568115942028986e-05, "loss": 1.6732, "step": 15050 }, { "epoch": 1.25, "grad_norm": 4.284139633178711, "learning_rate": 1.1562318840579712e-05, "loss": 1.7963, "step": 15060 }, { "epoch": 1.26, "grad_norm": 2.9040706157684326, "learning_rate": 1.1556521739130436e-05, "loss": 1.7937, "step": 15070 }, { "epoch": 1.26, "grad_norm": 4.678374290466309, "learning_rate": 1.1550724637681162e-05, "loss": 1.5987, "step": 15080 }, { "epoch": 1.26, "grad_norm": 1.1551108360290527, "learning_rate": 1.1544927536231884e-05, "loss": 1.7292, "step": 15090 }, { "epoch": 1.26, "grad_norm": 3.153188943862915, "learning_rate": 1.1539130434782611e-05, "loss": 1.5968, "step": 15100 }, { "epoch": 1.26, "grad_norm": 7.321458339691162, "learning_rate": 1.1533333333333334e-05, "loss": 1.7164, "step": 15110 }, { "epoch": 1.26, "grad_norm": 1.9099916219711304, "learning_rate": 1.152753623188406e-05, "loss": 1.7368, "step": 15120 }, { "epoch": 1.26, "grad_norm": 4.101009368896484, "learning_rate": 1.1521739130434783e-05, "loss": 1.7376, "step": 15130 }, { "epoch": 1.26, "grad_norm": 10.590901374816895, "learning_rate": 1.1515942028985507e-05, "loss": 1.7985, "step": 15140 }, { "epoch": 1.26, "grad_norm": 4.799911022186279, "learning_rate": 1.1510144927536233e-05, "loss": 1.7259, "step": 15150 }, { "epoch": 1.26, "grad_norm": 1.911226511001587, "learning_rate": 1.1504347826086957e-05, "loss": 1.7439, "step": 15160 }, { "epoch": 1.26, "grad_norm": 1.5930674076080322, "learning_rate": 1.1498550724637683e-05, "loss": 1.7223, "step": 15170 }, { "epoch": 1.27, "grad_norm": 3.125276565551758, "learning_rate": 1.1492753623188407e-05, "loss": 1.645, "step": 15180 }, { "epoch": 1.27, "grad_norm": 1.448198914527893, "learning_rate": 1.1486956521739132e-05, "loss": 1.6675, "step": 15190 }, { "epoch": 1.27, "grad_norm": 2.842217445373535, "learning_rate": 1.1481159420289856e-05, "loss": 1.761, "step": 15200 }, { "epoch": 1.27, "grad_norm": 1.6608208417892456, "learning_rate": 1.1475362318840582e-05, "loss": 1.6394, "step": 15210 }, { "epoch": 1.27, "grad_norm": 2.386859178543091, "learning_rate": 1.1469565217391304e-05, "loss": 1.6165, "step": 15220 }, { "epoch": 1.27, "grad_norm": 1.5258991718292236, "learning_rate": 1.1463768115942028e-05, "loss": 1.7252, "step": 15230 }, { "epoch": 1.27, "grad_norm": 2.106661081314087, "learning_rate": 1.1457971014492754e-05, "loss": 1.6343, "step": 15240 }, { "epoch": 1.27, "grad_norm": 1.260819435119629, "learning_rate": 1.1452173913043478e-05, "loss": 1.7971, "step": 15250 }, { "epoch": 1.27, "grad_norm": 3.5960564613342285, "learning_rate": 1.1446376811594204e-05, "loss": 1.6154, "step": 15260 }, { "epoch": 1.27, "grad_norm": 3.4843623638153076, "learning_rate": 1.1440579710144928e-05, "loss": 1.8285, "step": 15270 }, { "epoch": 1.27, "grad_norm": 2.6920154094696045, "learning_rate": 1.1434782608695654e-05, "loss": 1.7324, "step": 15280 }, { "epoch": 1.27, "grad_norm": 3.802539110183716, "learning_rate": 1.1428985507246378e-05, "loss": 1.6399, "step": 15290 }, { "epoch": 1.27, "grad_norm": 3.4101133346557617, "learning_rate": 1.1423188405797103e-05, "loss": 1.7186, "step": 15300 }, { "epoch": 1.28, "grad_norm": 1.7906736135482788, "learning_rate": 1.1417391304347827e-05, "loss": 1.7397, "step": 15310 }, { "epoch": 1.28, "grad_norm": 2.0107662677764893, "learning_rate": 1.1411594202898553e-05, "loss": 1.6297, "step": 15320 }, { "epoch": 1.28, "grad_norm": 2.830557107925415, "learning_rate": 1.1405797101449277e-05, "loss": 1.6362, "step": 15330 }, { "epoch": 1.28, "grad_norm": 1.7907980680465698, "learning_rate": 1.14e-05, "loss": 1.6913, "step": 15340 }, { "epoch": 1.28, "grad_norm": 1.5224626064300537, "learning_rate": 1.1394202898550725e-05, "loss": 1.7274, "step": 15350 }, { "epoch": 1.28, "grad_norm": 7.044021129608154, "learning_rate": 1.1388405797101449e-05, "loss": 1.6653, "step": 15360 }, { "epoch": 1.28, "grad_norm": 2.0197882652282715, "learning_rate": 1.1382608695652175e-05, "loss": 1.7462, "step": 15370 }, { "epoch": 1.28, "grad_norm": 3.1112966537475586, "learning_rate": 1.1376811594202899e-05, "loss": 1.7024, "step": 15380 }, { "epoch": 1.28, "grad_norm": 2.9264869689941406, "learning_rate": 1.1371014492753624e-05, "loss": 1.6285, "step": 15390 }, { "epoch": 1.28, "grad_norm": 2.849034309387207, "learning_rate": 1.1365217391304348e-05, "loss": 1.7542, "step": 15400 }, { "epoch": 1.28, "grad_norm": 5.498842239379883, "learning_rate": 1.1359420289855074e-05, "loss": 1.6733, "step": 15410 }, { "epoch": 1.28, "grad_norm": 3.047708034515381, "learning_rate": 1.1353623188405798e-05, "loss": 1.6027, "step": 15420 }, { "epoch": 1.29, "grad_norm": 4.122775077819824, "learning_rate": 1.1347826086956524e-05, "loss": 1.7346, "step": 15430 }, { "epoch": 1.29, "grad_norm": 2.988992929458618, "learning_rate": 1.1342028985507248e-05, "loss": 1.5555, "step": 15440 }, { "epoch": 1.29, "grad_norm": 1.4278367757797241, "learning_rate": 1.1336231884057972e-05, "loss": 1.6886, "step": 15450 }, { "epoch": 1.29, "grad_norm": 1.543633222579956, "learning_rate": 1.1330434782608698e-05, "loss": 1.5712, "step": 15460 }, { "epoch": 1.29, "grad_norm": 2.563520908355713, "learning_rate": 1.132463768115942e-05, "loss": 1.6082, "step": 15470 }, { "epoch": 1.29, "grad_norm": 3.311037540435791, "learning_rate": 1.1318840579710147e-05, "loss": 1.56, "step": 15480 }, { "epoch": 1.29, "grad_norm": 1.979817271232605, "learning_rate": 1.131304347826087e-05, "loss": 1.7833, "step": 15490 }, { "epoch": 1.29, "grad_norm": 2.7615084648132324, "learning_rate": 1.1307246376811595e-05, "loss": 1.6641, "step": 15500 }, { "epoch": 1.29, "eval_loss": 1.6843485832214355, "eval_runtime": 107.5057, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 15500 }, { "epoch": 1.29, "grad_norm": 2.724392890930176, "learning_rate": 1.130144927536232e-05, "loss": 1.5888, "step": 15510 }, { "epoch": 1.29, "grad_norm": 4.149465560913086, "learning_rate": 1.1295652173913045e-05, "loss": 1.7467, "step": 15520 }, { "epoch": 1.29, "grad_norm": 1.9117258787155151, "learning_rate": 1.1289855072463769e-05, "loss": 1.6763, "step": 15530 }, { "epoch": 1.29, "grad_norm": 2.0557730197906494, "learning_rate": 1.1284057971014493e-05, "loss": 1.7202, "step": 15540 }, { "epoch": 1.3, "grad_norm": 2.089738130569458, "learning_rate": 1.1278260869565219e-05, "loss": 1.7192, "step": 15550 }, { "epoch": 1.3, "grad_norm": 4.925050735473633, "learning_rate": 1.1272463768115943e-05, "loss": 1.7204, "step": 15560 }, { "epoch": 1.3, "grad_norm": 7.140872001647949, "learning_rate": 1.1266666666666668e-05, "loss": 1.5706, "step": 15570 }, { "epoch": 1.3, "grad_norm": 6.26693058013916, "learning_rate": 1.1260869565217392e-05, "loss": 1.796, "step": 15580 }, { "epoch": 1.3, "grad_norm": 1.269399881362915, "learning_rate": 1.1255072463768118e-05, "loss": 1.5746, "step": 15590 }, { "epoch": 1.3, "grad_norm": 1.2779074907302856, "learning_rate": 1.124927536231884e-05, "loss": 1.5743, "step": 15600 }, { "epoch": 1.3, "grad_norm": 4.808041572570801, "learning_rate": 1.1243478260869568e-05, "loss": 1.7358, "step": 15610 }, { "epoch": 1.3, "grad_norm": 2.192412853240967, "learning_rate": 1.123768115942029e-05, "loss": 1.7665, "step": 15620 }, { "epoch": 1.3, "grad_norm": 0.8821000456809998, "learning_rate": 1.1231884057971016e-05, "loss": 1.6442, "step": 15630 }, { "epoch": 1.3, "grad_norm": 2.8440635204315186, "learning_rate": 1.122608695652174e-05, "loss": 1.6318, "step": 15640 }, { "epoch": 1.3, "grad_norm": 3.6958911418914795, "learning_rate": 1.1220289855072464e-05, "loss": 1.7744, "step": 15650 }, { "epoch": 1.3, "grad_norm": 12.584587097167969, "learning_rate": 1.121449275362319e-05, "loss": 1.7242, "step": 15660 }, { "epoch": 1.31, "grad_norm": 4.3364410400390625, "learning_rate": 1.1208695652173913e-05, "loss": 1.7048, "step": 15670 }, { "epoch": 1.31, "grad_norm": 3.9475982189178467, "learning_rate": 1.120289855072464e-05, "loss": 1.6612, "step": 15680 }, { "epoch": 1.31, "grad_norm": 5.1554036140441895, "learning_rate": 1.1197101449275363e-05, "loss": 1.6869, "step": 15690 }, { "epoch": 1.31, "grad_norm": 2.2033803462982178, "learning_rate": 1.1191304347826089e-05, "loss": 1.6598, "step": 15700 }, { "epoch": 1.31, "grad_norm": 3.7358901500701904, "learning_rate": 1.1185507246376813e-05, "loss": 1.7445, "step": 15710 }, { "epoch": 1.31, "grad_norm": 1.531160593032837, "learning_rate": 1.1179710144927539e-05, "loss": 1.6378, "step": 15720 }, { "epoch": 1.31, "grad_norm": 4.818709373474121, "learning_rate": 1.1173913043478261e-05, "loss": 1.6497, "step": 15730 }, { "epoch": 1.31, "grad_norm": 4.811004161834717, "learning_rate": 1.1168115942028988e-05, "loss": 1.7243, "step": 15740 }, { "epoch": 1.31, "grad_norm": 4.756585597991943, "learning_rate": 1.116231884057971e-05, "loss": 1.7284, "step": 15750 }, { "epoch": 1.31, "grad_norm": 1.713197946548462, "learning_rate": 1.1156521739130435e-05, "loss": 1.7706, "step": 15760 }, { "epoch": 1.31, "grad_norm": 5.140644073486328, "learning_rate": 1.115072463768116e-05, "loss": 1.6415, "step": 15770 }, { "epoch": 1.31, "grad_norm": 1.6640381813049316, "learning_rate": 1.1144927536231884e-05, "loss": 1.7129, "step": 15780 }, { "epoch": 1.32, "grad_norm": 2.814182996749878, "learning_rate": 1.113913043478261e-05, "loss": 1.5169, "step": 15790 }, { "epoch": 1.32, "grad_norm": 4.703028678894043, "learning_rate": 1.1133333333333334e-05, "loss": 1.6786, "step": 15800 }, { "epoch": 1.32, "grad_norm": 5.120509147644043, "learning_rate": 1.112753623188406e-05, "loss": 1.7163, "step": 15810 }, { "epoch": 1.32, "grad_norm": 1.0847809314727783, "learning_rate": 1.1121739130434784e-05, "loss": 1.6943, "step": 15820 }, { "epoch": 1.32, "grad_norm": 10.130949974060059, "learning_rate": 1.111594202898551e-05, "loss": 1.729, "step": 15830 }, { "epoch": 1.32, "grad_norm": 2.5162465572357178, "learning_rate": 1.1110144927536233e-05, "loss": 1.7251, "step": 15840 }, { "epoch": 1.32, "grad_norm": 4.79127311706543, "learning_rate": 1.1104347826086956e-05, "loss": 1.7665, "step": 15850 }, { "epoch": 1.32, "grad_norm": 1.3719213008880615, "learning_rate": 1.1098550724637681e-05, "loss": 1.6865, "step": 15860 }, { "epoch": 1.32, "grad_norm": 3.254711151123047, "learning_rate": 1.1092753623188405e-05, "loss": 1.7149, "step": 15870 }, { "epoch": 1.32, "grad_norm": 11.944002151489258, "learning_rate": 1.1086956521739131e-05, "loss": 1.6525, "step": 15880 }, { "epoch": 1.32, "grad_norm": 4.941390037536621, "learning_rate": 1.1081159420289855e-05, "loss": 1.5516, "step": 15890 }, { "epoch": 1.32, "grad_norm": 7.373305797576904, "learning_rate": 1.1075362318840581e-05, "loss": 1.473, "step": 15900 }, { "epoch": 1.33, "grad_norm": 5.447077751159668, "learning_rate": 1.1069565217391305e-05, "loss": 1.6378, "step": 15910 }, { "epoch": 1.33, "grad_norm": 4.908672332763672, "learning_rate": 1.106376811594203e-05, "loss": 1.7512, "step": 15920 }, { "epoch": 1.33, "grad_norm": 2.0174648761749268, "learning_rate": 1.1057971014492755e-05, "loss": 1.6252, "step": 15930 }, { "epoch": 1.33, "grad_norm": 7.651984214782715, "learning_rate": 1.105217391304348e-05, "loss": 1.5333, "step": 15940 }, { "epoch": 1.33, "grad_norm": 4.4102983474731445, "learning_rate": 1.1046376811594204e-05, "loss": 1.5242, "step": 15950 }, { "epoch": 1.33, "grad_norm": 1.826643943786621, "learning_rate": 1.1040579710144928e-05, "loss": 1.6655, "step": 15960 }, { "epoch": 1.33, "grad_norm": 2.4821903705596924, "learning_rate": 1.1034782608695654e-05, "loss": 1.6926, "step": 15970 }, { "epoch": 1.33, "grad_norm": 1.6242338418960571, "learning_rate": 1.1028985507246376e-05, "loss": 1.7808, "step": 15980 }, { "epoch": 1.33, "grad_norm": 8.218697547912598, "learning_rate": 1.1023188405797102e-05, "loss": 1.648, "step": 15990 }, { "epoch": 1.33, "grad_norm": 2.398993730545044, "learning_rate": 1.1017391304347826e-05, "loss": 1.624, "step": 16000 }, { "epoch": 1.33, "eval_loss": 1.6754653453826904, "eval_runtime": 107.521, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 16000 }, { "epoch": 1.33, "grad_norm": 4.014492511749268, "learning_rate": 1.1011594202898552e-05, "loss": 1.5021, "step": 16010 }, { "epoch": 1.33, "grad_norm": 3.1234304904937744, "learning_rate": 1.1006376811594203e-05, "loss": 1.6802, "step": 16020 }, { "epoch": 1.34, "grad_norm": 4.326152801513672, "learning_rate": 1.1000579710144927e-05, "loss": 1.7546, "step": 16030 }, { "epoch": 1.34, "grad_norm": 1.9750605821609497, "learning_rate": 1.0994782608695653e-05, "loss": 1.6459, "step": 16040 }, { "epoch": 1.34, "grad_norm": 2.2335305213928223, "learning_rate": 1.0988985507246377e-05, "loss": 1.6248, "step": 16050 }, { "epoch": 1.34, "grad_norm": 2.1429505348205566, "learning_rate": 1.0983188405797102e-05, "loss": 1.7079, "step": 16060 }, { "epoch": 1.34, "grad_norm": 3.822464942932129, "learning_rate": 1.0977391304347826e-05, "loss": 1.7218, "step": 16070 }, { "epoch": 1.34, "grad_norm": 4.145028114318848, "learning_rate": 1.0971594202898552e-05, "loss": 1.7098, "step": 16080 }, { "epoch": 1.34, "grad_norm": 5.912876129150391, "learning_rate": 1.0965797101449276e-05, "loss": 1.7075, "step": 16090 }, { "epoch": 1.34, "grad_norm": 6.154576778411865, "learning_rate": 1.0960000000000002e-05, "loss": 1.6158, "step": 16100 }, { "epoch": 1.34, "grad_norm": 1.7767752408981323, "learning_rate": 1.0954202898550726e-05, "loss": 1.5413, "step": 16110 }, { "epoch": 1.34, "grad_norm": 3.51847243309021, "learning_rate": 1.0948405797101451e-05, "loss": 1.6848, "step": 16120 }, { "epoch": 1.34, "grad_norm": 0.7433416843414307, "learning_rate": 1.0942608695652176e-05, "loss": 1.6288, "step": 16130 }, { "epoch": 1.34, "grad_norm": 3.0296177864074707, "learning_rate": 1.0936811594202898e-05, "loss": 1.5401, "step": 16140 }, { "epoch": 1.35, "grad_norm": 4.524282932281494, "learning_rate": 1.0931014492753625e-05, "loss": 1.5648, "step": 16150 }, { "epoch": 1.35, "grad_norm": 7.122012138366699, "learning_rate": 1.0925217391304348e-05, "loss": 1.7171, "step": 16160 }, { "epoch": 1.35, "grad_norm": 2.0002593994140625, "learning_rate": 1.0919420289855073e-05, "loss": 1.7173, "step": 16170 }, { "epoch": 1.35, "grad_norm": 4.547085762023926, "learning_rate": 1.0913623188405797e-05, "loss": 1.7737, "step": 16180 }, { "epoch": 1.35, "grad_norm": 2.6324098110198975, "learning_rate": 1.0907826086956523e-05, "loss": 1.7222, "step": 16190 }, { "epoch": 1.35, "grad_norm": 4.268777847290039, "learning_rate": 1.0902028985507247e-05, "loss": 1.7525, "step": 16200 }, { "epoch": 1.35, "grad_norm": 2.096090793609619, "learning_rate": 1.0896231884057973e-05, "loss": 1.5166, "step": 16210 }, { "epoch": 1.35, "grad_norm": 3.225011110305786, "learning_rate": 1.0890434782608697e-05, "loss": 1.7065, "step": 16220 }, { "epoch": 1.35, "grad_norm": 2.468010425567627, "learning_rate": 1.088463768115942e-05, "loss": 1.7441, "step": 16230 }, { "epoch": 1.35, "grad_norm": 3.6063239574432373, "learning_rate": 1.0878840579710146e-05, "loss": 1.7374, "step": 16240 }, { "epoch": 1.35, "grad_norm": 3.4506964683532715, "learning_rate": 1.087304347826087e-05, "loss": 1.8498, "step": 16250 }, { "epoch": 1.35, "grad_norm": 2.358245611190796, "learning_rate": 1.0867246376811596e-05, "loss": 1.7641, "step": 16260 }, { "epoch": 1.36, "grad_norm": 1.0673184394836426, "learning_rate": 1.0861449275362318e-05, "loss": 1.7407, "step": 16270 }, { "epoch": 1.36, "grad_norm": 1.860028624534607, "learning_rate": 1.0855652173913046e-05, "loss": 1.7653, "step": 16280 }, { "epoch": 1.36, "grad_norm": 4.493389129638672, "learning_rate": 1.0849855072463768e-05, "loss": 1.5432, "step": 16290 }, { "epoch": 1.36, "grad_norm": 3.1333208084106445, "learning_rate": 1.0844057971014494e-05, "loss": 1.5558, "step": 16300 }, { "epoch": 1.36, "grad_norm": 2.722856283187866, "learning_rate": 1.0838260869565218e-05, "loss": 1.7136, "step": 16310 }, { "epoch": 1.36, "grad_norm": 6.029232501983643, "learning_rate": 1.0832463768115943e-05, "loss": 1.7611, "step": 16320 }, { "epoch": 1.36, "grad_norm": 3.4627902507781982, "learning_rate": 1.0826666666666667e-05, "loss": 1.6839, "step": 16330 }, { "epoch": 1.36, "grad_norm": 1.860472559928894, "learning_rate": 1.0820869565217391e-05, "loss": 1.6267, "step": 16340 }, { "epoch": 1.36, "grad_norm": 1.3834178447723389, "learning_rate": 1.0815072463768117e-05, "loss": 1.6913, "step": 16350 }, { "epoch": 1.36, "grad_norm": 2.249950885772705, "learning_rate": 1.0809275362318841e-05, "loss": 1.6362, "step": 16360 }, { "epoch": 1.36, "grad_norm": 4.343896865844727, "learning_rate": 1.0803478260869567e-05, "loss": 1.7563, "step": 16370 }, { "epoch": 1.36, "grad_norm": 2.7473175525665283, "learning_rate": 1.0797681159420291e-05, "loss": 1.6708, "step": 16380 }, { "epoch": 1.37, "grad_norm": 1.320664882659912, "learning_rate": 1.0791884057971017e-05, "loss": 1.8427, "step": 16390 }, { "epoch": 1.37, "grad_norm": 6.2452778816223145, "learning_rate": 1.0786086956521739e-05, "loss": 1.6387, "step": 16400 }, { "epoch": 1.37, "grad_norm": 4.248746395111084, "learning_rate": 1.0780289855072466e-05, "loss": 1.5971, "step": 16410 }, { "epoch": 1.37, "grad_norm": 4.274169921875, "learning_rate": 1.0774492753623189e-05, "loss": 1.6122, "step": 16420 }, { "epoch": 1.37, "grad_norm": 1.0745168924331665, "learning_rate": 1.0768695652173914e-05, "loss": 1.6966, "step": 16430 }, { "epoch": 1.37, "grad_norm": 1.370829463005066, "learning_rate": 1.0762898550724638e-05, "loss": 1.5828, "step": 16440 }, { "epoch": 1.37, "grad_norm": 4.591113567352295, "learning_rate": 1.0757101449275362e-05, "loss": 1.6895, "step": 16450 }, { "epoch": 1.37, "grad_norm": 2.2173213958740234, "learning_rate": 1.0751304347826088e-05, "loss": 1.734, "step": 16460 }, { "epoch": 1.37, "grad_norm": 2.926522970199585, "learning_rate": 1.0745507246376812e-05, "loss": 1.6714, "step": 16470 }, { "epoch": 1.37, "grad_norm": 1.1437100172042847, "learning_rate": 1.0739710144927538e-05, "loss": 1.7062, "step": 16480 }, { "epoch": 1.37, "grad_norm": 3.2627272605895996, "learning_rate": 1.0733913043478262e-05, "loss": 1.5612, "step": 16490 }, { "epoch": 1.38, "grad_norm": 2.8069777488708496, "learning_rate": 1.0728115942028987e-05, "loss": 1.7116, "step": 16500 }, { "epoch": 1.38, "eval_loss": 1.7130539417266846, "eval_runtime": 107.5132, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 16500 }, { "epoch": 1.38, "grad_norm": 5.705499172210693, "learning_rate": 1.0722318840579711e-05, "loss": 1.6609, "step": 16510 }, { "epoch": 1.38, "grad_norm": 2.6209776401519775, "learning_rate": 1.0716521739130437e-05, "loss": 1.683, "step": 16520 }, { "epoch": 1.38, "grad_norm": 2.3433544635772705, "learning_rate": 1.071072463768116e-05, "loss": 1.7747, "step": 16530 }, { "epoch": 1.38, "grad_norm": 4.4690423011779785, "learning_rate": 1.0704927536231883e-05, "loss": 1.6671, "step": 16540 }, { "epoch": 1.38, "grad_norm": 4.0458269119262695, "learning_rate": 1.069913043478261e-05, "loss": 1.7513, "step": 16550 }, { "epoch": 1.38, "grad_norm": 7.192028522491455, "learning_rate": 1.0693333333333333e-05, "loss": 1.7646, "step": 16560 }, { "epoch": 1.38, "grad_norm": 4.764258861541748, "learning_rate": 1.0687536231884059e-05, "loss": 1.6782, "step": 16570 }, { "epoch": 1.38, "grad_norm": 3.255431890487671, "learning_rate": 1.0681739130434783e-05, "loss": 1.6417, "step": 16580 }, { "epoch": 1.38, "grad_norm": 0.955453634262085, "learning_rate": 1.0675942028985509e-05, "loss": 1.7354, "step": 16590 }, { "epoch": 1.38, "grad_norm": 2.478759765625, "learning_rate": 1.0670144927536233e-05, "loss": 1.7076, "step": 16600 }, { "epoch": 1.38, "grad_norm": 10.808612823486328, "learning_rate": 1.0664347826086958e-05, "loss": 1.7509, "step": 16610 }, { "epoch": 1.39, "grad_norm": 3.3443713188171387, "learning_rate": 1.0658550724637682e-05, "loss": 1.6233, "step": 16620 }, { "epoch": 1.39, "grad_norm": 6.435446262359619, "learning_rate": 1.0652753623188408e-05, "loss": 1.7493, "step": 16630 }, { "epoch": 1.39, "grad_norm": 2.0918169021606445, "learning_rate": 1.0646956521739132e-05, "loss": 1.7184, "step": 16640 }, { "epoch": 1.39, "grad_norm": 0.8072077631950378, "learning_rate": 1.0641159420289854e-05, "loss": 1.6724, "step": 16650 }, { "epoch": 1.39, "grad_norm": 2.0545530319213867, "learning_rate": 1.063536231884058e-05, "loss": 1.7191, "step": 16660 }, { "epoch": 1.39, "grad_norm": 4.794948101043701, "learning_rate": 1.0629565217391304e-05, "loss": 1.6553, "step": 16670 }, { "epoch": 1.39, "grad_norm": 6.329624176025391, "learning_rate": 1.062376811594203e-05, "loss": 1.7232, "step": 16680 }, { "epoch": 1.39, "grad_norm": 2.5596625804901123, "learning_rate": 1.0617971014492754e-05, "loss": 1.77, "step": 16690 }, { "epoch": 1.39, "grad_norm": 2.4740288257598877, "learning_rate": 1.061217391304348e-05, "loss": 1.6394, "step": 16700 }, { "epoch": 1.39, "grad_norm": 0.8927739858627319, "learning_rate": 1.0606376811594203e-05, "loss": 1.7045, "step": 16710 }, { "epoch": 1.39, "grad_norm": 2.0956900119781494, "learning_rate": 1.0600579710144929e-05, "loss": 1.7058, "step": 16720 }, { "epoch": 1.39, "grad_norm": 3.9234437942504883, "learning_rate": 1.0594782608695653e-05, "loss": 1.6324, "step": 16730 }, { "epoch": 1.4, "grad_norm": 4.277024745941162, "learning_rate": 1.0588985507246379e-05, "loss": 1.7152, "step": 16740 }, { "epoch": 1.4, "grad_norm": 3.355858564376831, "learning_rate": 1.0583188405797103e-05, "loss": 1.7431, "step": 16750 }, { "epoch": 1.4, "grad_norm": 2.8377106189727783, "learning_rate": 1.0577391304347827e-05, "loss": 1.6996, "step": 16760 }, { "epoch": 1.4, "grad_norm": 3.7551558017730713, "learning_rate": 1.0571594202898553e-05, "loss": 1.597, "step": 16770 }, { "epoch": 1.4, "grad_norm": 4.006781578063965, "learning_rate": 1.0565797101449275e-05, "loss": 1.7018, "step": 16780 }, { "epoch": 1.4, "grad_norm": 2.4323341846466064, "learning_rate": 1.056e-05, "loss": 1.5812, "step": 16790 }, { "epoch": 1.4, "grad_norm": 3.4357855319976807, "learning_rate": 1.0554202898550725e-05, "loss": 1.7026, "step": 16800 }, { "epoch": 1.4, "grad_norm": 2.7937941551208496, "learning_rate": 1.054840579710145e-05, "loss": 1.7299, "step": 16810 }, { "epoch": 1.4, "grad_norm": 2.6717631816864014, "learning_rate": 1.0542608695652174e-05, "loss": 1.759, "step": 16820 }, { "epoch": 1.4, "grad_norm": 2.9999706745147705, "learning_rate": 1.05368115942029e-05, "loss": 1.736, "step": 16830 }, { "epoch": 1.4, "grad_norm": 2.3674368858337402, "learning_rate": 1.0531014492753624e-05, "loss": 1.7221, "step": 16840 }, { "epoch": 1.4, "grad_norm": 1.2170531749725342, "learning_rate": 1.0525217391304348e-05, "loss": 1.7482, "step": 16850 }, { "epoch": 1.41, "grad_norm": 1.1290643215179443, "learning_rate": 1.0519420289855074e-05, "loss": 1.5598, "step": 16860 }, { "epoch": 1.41, "grad_norm": 1.8863322734832764, "learning_rate": 1.0513623188405798e-05, "loss": 1.5603, "step": 16870 }, { "epoch": 1.41, "grad_norm": 5.208227634429932, "learning_rate": 1.0507826086956523e-05, "loss": 1.7284, "step": 16880 }, { "epoch": 1.41, "grad_norm": 6.908130168914795, "learning_rate": 1.0502028985507247e-05, "loss": 1.6787, "step": 16890 }, { "epoch": 1.41, "grad_norm": 7.316353797912598, "learning_rate": 1.0496231884057973e-05, "loss": 1.8262, "step": 16900 }, { "epoch": 1.41, "grad_norm": 8.9257173538208, "learning_rate": 1.0490434782608695e-05, "loss": 1.7177, "step": 16910 }, { "epoch": 1.41, "grad_norm": 8.56701946258545, "learning_rate": 1.0484637681159423e-05, "loss": 1.6473, "step": 16920 }, { "epoch": 1.41, "grad_norm": 2.9926609992980957, "learning_rate": 1.0478840579710145e-05, "loss": 1.7083, "step": 16930 }, { "epoch": 1.41, "grad_norm": 3.859083414077759, "learning_rate": 1.047304347826087e-05, "loss": 1.7799, "step": 16940 }, { "epoch": 1.41, "grad_norm": 1.5566657781600952, "learning_rate": 1.0467246376811595e-05, "loss": 1.7282, "step": 16950 }, { "epoch": 1.41, "grad_norm": 8.074483871459961, "learning_rate": 1.0461449275362319e-05, "loss": 1.6706, "step": 16960 }, { "epoch": 1.41, "grad_norm": 3.9916458129882812, "learning_rate": 1.0455652173913045e-05, "loss": 1.6624, "step": 16970 }, { "epoch": 1.42, "grad_norm": 4.076849937438965, "learning_rate": 1.0449855072463769e-05, "loss": 1.7047, "step": 16980 }, { "epoch": 1.42, "grad_norm": 2.859311819076538, "learning_rate": 1.0444057971014494e-05, "loss": 1.6748, "step": 16990 }, { "epoch": 1.42, "grad_norm": 2.4924099445343018, "learning_rate": 1.0438260869565218e-05, "loss": 1.7947, "step": 17000 }, { "epoch": 1.42, "eval_loss": 1.6634085178375244, "eval_runtime": 107.5078, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 17000 }, { "epoch": 1.42, "grad_norm": 1.4697929620742798, "learning_rate": 1.0432463768115944e-05, "loss": 1.652, "step": 17010 }, { "epoch": 1.42, "grad_norm": 2.3207297325134277, "learning_rate": 1.0426666666666668e-05, "loss": 1.6632, "step": 17020 }, { "epoch": 1.42, "grad_norm": 2.104315757751465, "learning_rate": 1.0420869565217394e-05, "loss": 1.6028, "step": 17030 }, { "epoch": 1.42, "grad_norm": 1.517398715019226, "learning_rate": 1.0415072463768116e-05, "loss": 1.7276, "step": 17040 }, { "epoch": 1.42, "grad_norm": 3.3302900791168213, "learning_rate": 1.0409275362318843e-05, "loss": 1.5883, "step": 17050 }, { "epoch": 1.42, "grad_norm": 3.5856385231018066, "learning_rate": 1.0403478260869566e-05, "loss": 1.6082, "step": 17060 }, { "epoch": 1.42, "grad_norm": 7.210336685180664, "learning_rate": 1.039768115942029e-05, "loss": 1.7695, "step": 17070 }, { "epoch": 1.42, "grad_norm": 2.3868658542633057, "learning_rate": 1.0391884057971015e-05, "loss": 1.6391, "step": 17080 }, { "epoch": 1.42, "grad_norm": 7.809682369232178, "learning_rate": 1.038608695652174e-05, "loss": 1.7408, "step": 17090 }, { "epoch": 1.43, "grad_norm": 3.6190929412841797, "learning_rate": 1.0380289855072465e-05, "loss": 1.5544, "step": 17100 }, { "epoch": 1.43, "grad_norm": 1.5290706157684326, "learning_rate": 1.0374492753623189e-05, "loss": 1.6121, "step": 17110 }, { "epoch": 1.43, "grad_norm": 0.6158972382545471, "learning_rate": 1.0368695652173915e-05, "loss": 1.6292, "step": 17120 }, { "epoch": 1.43, "grad_norm": 2.539661407470703, "learning_rate": 1.0362898550724639e-05, "loss": 1.6785, "step": 17130 }, { "epoch": 1.43, "grad_norm": 2.0767712593078613, "learning_rate": 1.0357101449275364e-05, "loss": 1.7784, "step": 17140 }, { "epoch": 1.43, "grad_norm": 3.4671566486358643, "learning_rate": 1.0351304347826088e-05, "loss": 1.6613, "step": 17150 }, { "epoch": 1.43, "grad_norm": 3.9954442977905273, "learning_rate": 1.034550724637681e-05, "loss": 1.6233, "step": 17160 }, { "epoch": 1.43, "grad_norm": 1.9763154983520508, "learning_rate": 1.0339710144927536e-05, "loss": 1.6937, "step": 17170 }, { "epoch": 1.43, "grad_norm": 1.507520079612732, "learning_rate": 1.033391304347826e-05, "loss": 1.765, "step": 17180 }, { "epoch": 1.43, "grad_norm": 3.03114652633667, "learning_rate": 1.0328115942028986e-05, "loss": 1.6805, "step": 17190 }, { "epoch": 1.43, "grad_norm": 3.9964370727539062, "learning_rate": 1.032231884057971e-05, "loss": 1.7913, "step": 17200 }, { "epoch": 1.43, "grad_norm": 1.9785213470458984, "learning_rate": 1.0316521739130436e-05, "loss": 1.6369, "step": 17210 }, { "epoch": 1.44, "grad_norm": 5.543067455291748, "learning_rate": 1.031072463768116e-05, "loss": 1.6102, "step": 17220 }, { "epoch": 1.44, "grad_norm": 4.464430332183838, "learning_rate": 1.0304927536231886e-05, "loss": 1.6151, "step": 17230 }, { "epoch": 1.44, "grad_norm": 2.594529390335083, "learning_rate": 1.029913043478261e-05, "loss": 1.68, "step": 17240 }, { "epoch": 1.44, "grad_norm": 5.4555535316467285, "learning_rate": 1.0293333333333335e-05, "loss": 1.6307, "step": 17250 }, { "epoch": 1.44, "grad_norm": 8.041542053222656, "learning_rate": 1.028753623188406e-05, "loss": 1.7092, "step": 17260 }, { "epoch": 1.44, "grad_norm": 2.2063286304473877, "learning_rate": 1.0281739130434782e-05, "loss": 1.7111, "step": 17270 }, { "epoch": 1.44, "grad_norm": 5.018704891204834, "learning_rate": 1.0275942028985509e-05, "loss": 1.6344, "step": 17280 }, { "epoch": 1.44, "grad_norm": 6.327304840087891, "learning_rate": 1.0270144927536231e-05, "loss": 1.6644, "step": 17290 }, { "epoch": 1.44, "grad_norm": 2.130178213119507, "learning_rate": 1.0264347826086957e-05, "loss": 1.7562, "step": 17300 }, { "epoch": 1.44, "grad_norm": 3.026088237762451, "learning_rate": 1.0258550724637681e-05, "loss": 1.6024, "step": 17310 }, { "epoch": 1.44, "grad_norm": 1.9865357875823975, "learning_rate": 1.0252753623188407e-05, "loss": 1.6829, "step": 17320 }, { "epoch": 1.44, "grad_norm": 1.843345046043396, "learning_rate": 1.024695652173913e-05, "loss": 1.7297, "step": 17330 }, { "epoch": 1.45, "grad_norm": 2.475865125656128, "learning_rate": 1.0241159420289856e-05, "loss": 1.6453, "step": 17340 }, { "epoch": 1.45, "grad_norm": 5.109663963317871, "learning_rate": 1.023536231884058e-05, "loss": 1.6039, "step": 17350 }, { "epoch": 1.45, "grad_norm": 4.058063507080078, "learning_rate": 1.0229565217391306e-05, "loss": 1.7368, "step": 17360 }, { "epoch": 1.45, "grad_norm": 7.7936859130859375, "learning_rate": 1.022376811594203e-05, "loss": 1.6527, "step": 17370 }, { "epoch": 1.45, "grad_norm": 1.9645744562149048, "learning_rate": 1.0217971014492754e-05, "loss": 1.6359, "step": 17380 }, { "epoch": 1.45, "grad_norm": 1.9280155897140503, "learning_rate": 1.021217391304348e-05, "loss": 1.6755, "step": 17390 }, { "epoch": 1.45, "grad_norm": 11.76801586151123, "learning_rate": 1.0206376811594204e-05, "loss": 1.5459, "step": 17400 }, { "epoch": 1.45, "grad_norm": 2.5407767295837402, "learning_rate": 1.020057971014493e-05, "loss": 1.6835, "step": 17410 }, { "epoch": 1.45, "grad_norm": 3.4772510528564453, "learning_rate": 1.0194782608695652e-05, "loss": 1.7516, "step": 17420 }, { "epoch": 1.45, "grad_norm": 1.1580195426940918, "learning_rate": 1.0188985507246378e-05, "loss": 1.7234, "step": 17430 }, { "epoch": 1.45, "grad_norm": 1.6305845975875854, "learning_rate": 1.0183188405797102e-05, "loss": 1.5738, "step": 17440 }, { "epoch": 1.45, "grad_norm": 3.306994676589966, "learning_rate": 1.0177391304347827e-05, "loss": 1.6702, "step": 17450 }, { "epoch": 1.46, "grad_norm": 4.651767253875732, "learning_rate": 1.0171594202898551e-05, "loss": 1.6585, "step": 17460 }, { "epoch": 1.46, "grad_norm": 8.080167770385742, "learning_rate": 1.0165797101449275e-05, "loss": 1.8123, "step": 17470 }, { "epoch": 1.46, "grad_norm": 3.453287363052368, "learning_rate": 1.0160000000000001e-05, "loss": 1.601, "step": 17480 }, { "epoch": 1.46, "grad_norm": 2.5909740924835205, "learning_rate": 1.0154202898550725e-05, "loss": 1.7291, "step": 17490 }, { "epoch": 1.46, "grad_norm": 4.380848407745361, "learning_rate": 1.014840579710145e-05, "loss": 1.6845, "step": 17500 }, { "epoch": 1.46, "eval_loss": 1.6659685373306274, "eval_runtime": 107.5088, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 17500 }, { "epoch": 1.46, "grad_norm": 1.1240538358688354, "learning_rate": 1.0142608695652175e-05, "loss": 1.6778, "step": 17510 }, { "epoch": 1.46, "grad_norm": 4.835226535797119, "learning_rate": 1.01368115942029e-05, "loss": 1.7074, "step": 17520 }, { "epoch": 1.46, "grad_norm": 4.039511680603027, "learning_rate": 1.0131014492753624e-05, "loss": 1.6447, "step": 17530 }, { "epoch": 1.46, "grad_norm": 11.439852714538574, "learning_rate": 1.012521739130435e-05, "loss": 1.5716, "step": 17540 }, { "epoch": 1.46, "grad_norm": 7.208695411682129, "learning_rate": 1.0119420289855072e-05, "loss": 1.5687, "step": 17550 }, { "epoch": 1.46, "grad_norm": 12.316939353942871, "learning_rate": 1.0113623188405798e-05, "loss": 1.6564, "step": 17560 }, { "epoch": 1.46, "grad_norm": 4.0833916664123535, "learning_rate": 1.0107826086956522e-05, "loss": 1.5983, "step": 17570 }, { "epoch": 1.47, "grad_norm": 6.681445121765137, "learning_rate": 1.0102028985507246e-05, "loss": 1.6755, "step": 17580 }, { "epoch": 1.47, "grad_norm": 5.685708045959473, "learning_rate": 1.0096231884057972e-05, "loss": 1.7461, "step": 17590 }, { "epoch": 1.47, "grad_norm": 0.8818423748016357, "learning_rate": 1.0090434782608696e-05, "loss": 1.6818, "step": 17600 }, { "epoch": 1.47, "grad_norm": 3.1815342903137207, "learning_rate": 1.0084637681159422e-05, "loss": 1.7476, "step": 17610 }, { "epoch": 1.47, "grad_norm": 4.193720817565918, "learning_rate": 1.0078840579710146e-05, "loss": 1.8282, "step": 17620 }, { "epoch": 1.47, "grad_norm": 2.380415439605713, "learning_rate": 1.0073043478260871e-05, "loss": 1.5551, "step": 17630 }, { "epoch": 1.47, "grad_norm": 2.54988956451416, "learning_rate": 1.0067246376811595e-05, "loss": 1.6923, "step": 17640 }, { "epoch": 1.47, "grad_norm": 3.875814914703369, "learning_rate": 1.0061449275362321e-05, "loss": 1.6452, "step": 17650 }, { "epoch": 1.47, "grad_norm": 1.4553890228271484, "learning_rate": 1.0055652173913045e-05, "loss": 1.734, "step": 17660 }, { "epoch": 1.47, "grad_norm": 3.171555280685425, "learning_rate": 1.004985507246377e-05, "loss": 1.4397, "step": 17670 }, { "epoch": 1.47, "grad_norm": 3.2089016437530518, "learning_rate": 1.0044057971014493e-05, "loss": 1.5919, "step": 17680 }, { "epoch": 1.47, "grad_norm": 6.439237117767334, "learning_rate": 1.0038260869565217e-05, "loss": 1.7071, "step": 17690 }, { "epoch": 1.48, "grad_norm": 3.091012954711914, "learning_rate": 1.0032463768115943e-05, "loss": 1.6278, "step": 17700 }, { "epoch": 1.48, "grad_norm": 1.4860005378723145, "learning_rate": 1.0026666666666667e-05, "loss": 1.6406, "step": 17710 }, { "epoch": 1.48, "grad_norm": 8.64132308959961, "learning_rate": 1.0020869565217392e-05, "loss": 1.6317, "step": 17720 }, { "epoch": 1.48, "grad_norm": 2.587737798690796, "learning_rate": 1.0015072463768116e-05, "loss": 1.5913, "step": 17730 }, { "epoch": 1.48, "grad_norm": 5.59721040725708, "learning_rate": 1.0009275362318842e-05, "loss": 1.4954, "step": 17740 }, { "epoch": 1.48, "grad_norm": 7.057238578796387, "learning_rate": 1.0003478260869566e-05, "loss": 1.7462, "step": 17750 }, { "epoch": 1.48, "grad_norm": 4.500436782836914, "learning_rate": 9.99768115942029e-06, "loss": 1.6778, "step": 17760 }, { "epoch": 1.48, "grad_norm": 1.1724953651428223, "learning_rate": 9.991884057971016e-06, "loss": 1.6173, "step": 17770 }, { "epoch": 1.48, "grad_norm": 3.133427619934082, "learning_rate": 9.98608695652174e-06, "loss": 1.6226, "step": 17780 }, { "epoch": 1.48, "grad_norm": 4.147580623626709, "learning_rate": 9.980289855072465e-06, "loss": 1.6144, "step": 17790 }, { "epoch": 1.48, "grad_norm": 4.114604949951172, "learning_rate": 9.97449275362319e-06, "loss": 1.7495, "step": 17800 }, { "epoch": 1.48, "grad_norm": 2.3972349166870117, "learning_rate": 9.968695652173913e-06, "loss": 1.686, "step": 17810 }, { "epoch": 1.48, "grad_norm": 4.335039138793945, "learning_rate": 9.96289855072464e-06, "loss": 1.6874, "step": 17820 }, { "epoch": 1.49, "grad_norm": 2.4044220447540283, "learning_rate": 9.957101449275363e-06, "loss": 1.6847, "step": 17830 }, { "epoch": 1.49, "grad_norm": 7.1201982498168945, "learning_rate": 9.951304347826087e-06, "loss": 1.6875, "step": 17840 }, { "epoch": 1.49, "grad_norm": 1.6048938035964966, "learning_rate": 9.945507246376813e-06, "loss": 1.6163, "step": 17850 }, { "epoch": 1.49, "grad_norm": 1.1629419326782227, "learning_rate": 9.939710144927537e-06, "loss": 1.7531, "step": 17860 }, { "epoch": 1.49, "grad_norm": 9.67171573638916, "learning_rate": 9.933913043478261e-06, "loss": 1.804, "step": 17870 }, { "epoch": 1.49, "grad_norm": 1.4759604930877686, "learning_rate": 9.928115942028987e-06, "loss": 1.762, "step": 17880 }, { "epoch": 1.49, "grad_norm": 3.9004738330841064, "learning_rate": 9.92231884057971e-06, "loss": 1.6533, "step": 17890 }, { "epoch": 1.49, "grad_norm": 4.209875106811523, "learning_rate": 9.916521739130436e-06, "loss": 1.6523, "step": 17900 }, { "epoch": 1.49, "grad_norm": 3.8896372318267822, "learning_rate": 9.91072463768116e-06, "loss": 1.6475, "step": 17910 }, { "epoch": 1.49, "grad_norm": 1.572081208229065, "learning_rate": 9.904927536231886e-06, "loss": 1.5881, "step": 17920 }, { "epoch": 1.49, "grad_norm": 3.5894155502319336, "learning_rate": 9.89913043478261e-06, "loss": 1.6233, "step": 17930 }, { "epoch": 1.5, "grad_norm": 4.091580867767334, "learning_rate": 9.893333333333334e-06, "loss": 1.7261, "step": 17940 }, { "epoch": 1.5, "grad_norm": 2.4239258766174316, "learning_rate": 9.887536231884058e-06, "loss": 1.6872, "step": 17950 }, { "epoch": 1.5, "grad_norm": 2.6650853157043457, "learning_rate": 9.881739130434784e-06, "loss": 1.5972, "step": 17960 }, { "epoch": 1.5, "grad_norm": 2.779630661010742, "learning_rate": 9.875942028985508e-06, "loss": 1.5442, "step": 17970 }, { "epoch": 1.5, "grad_norm": 0.8651297092437744, "learning_rate": 9.870144927536233e-06, "loss": 1.6394, "step": 17980 }, { "epoch": 1.5, "grad_norm": 5.072394371032715, "learning_rate": 9.864347826086957e-06, "loss": 1.6355, "step": 17990 }, { "epoch": 1.5, "grad_norm": 4.223601341247559, "learning_rate": 9.858550724637681e-06, "loss": 1.7034, "step": 18000 }, { "epoch": 1.5, "eval_loss": 1.6817396879196167, "eval_runtime": 107.5133, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 18000 }, { "epoch": 1.5, "grad_norm": 5.345555782318115, "learning_rate": 9.852753623188407e-06, "loss": 1.6598, "step": 18010 }, { "epoch": 1.5, "grad_norm": 4.217700958251953, "learning_rate": 9.846956521739131e-06, "loss": 1.7581, "step": 18020 }, { "epoch": 1.5, "grad_norm": 5.60097074508667, "learning_rate": 9.841159420289857e-06, "loss": 1.4824, "step": 18030 }, { "epoch": 1.5, "grad_norm": 2.772341012954712, "learning_rate": 9.83536231884058e-06, "loss": 1.6643, "step": 18040 }, { "epoch": 1.5, "grad_norm": 1.193320631980896, "learning_rate": 9.829565217391305e-06, "loss": 1.8004, "step": 18050 }, { "epoch": 1.5, "grad_norm": 5.566644191741943, "learning_rate": 9.823768115942029e-06, "loss": 1.7329, "step": 18060 }, { "epoch": 1.51, "grad_norm": 5.771097183227539, "learning_rate": 9.817971014492755e-06, "loss": 1.5573, "step": 18070 }, { "epoch": 1.51, "grad_norm": 5.004515647888184, "learning_rate": 9.812173913043479e-06, "loss": 1.5583, "step": 18080 }, { "epoch": 1.51, "grad_norm": 3.670802116394043, "learning_rate": 9.806376811594204e-06, "loss": 1.6425, "step": 18090 }, { "epoch": 1.51, "grad_norm": 4.591397762298584, "learning_rate": 9.800579710144928e-06, "loss": 1.7255, "step": 18100 }, { "epoch": 1.51, "grad_norm": 4.184049606323242, "learning_rate": 9.794782608695654e-06, "loss": 1.6388, "step": 18110 }, { "epoch": 1.51, "grad_norm": 5.081575870513916, "learning_rate": 9.788985507246378e-06, "loss": 1.741, "step": 18120 }, { "epoch": 1.51, "grad_norm": 2.1452643871307373, "learning_rate": 9.783188405797102e-06, "loss": 1.5816, "step": 18130 }, { "epoch": 1.51, "grad_norm": 2.788238048553467, "learning_rate": 9.777391304347826e-06, "loss": 1.6184, "step": 18140 }, { "epoch": 1.51, "grad_norm": 5.260552883148193, "learning_rate": 9.771594202898552e-06, "loss": 1.6836, "step": 18150 }, { "epoch": 1.51, "grad_norm": 4.803924083709717, "learning_rate": 9.765797101449276e-06, "loss": 1.5638, "step": 18160 }, { "epoch": 1.51, "grad_norm": 1.803575873374939, "learning_rate": 9.760000000000001e-06, "loss": 1.7536, "step": 18170 }, { "epoch": 1.52, "grad_norm": 2.703744888305664, "learning_rate": 9.754202898550725e-06, "loss": 1.6187, "step": 18180 }, { "epoch": 1.52, "grad_norm": 4.61587381362915, "learning_rate": 9.74840579710145e-06, "loss": 1.5927, "step": 18190 }, { "epoch": 1.52, "grad_norm": 2.6593663692474365, "learning_rate": 9.742608695652175e-06, "loss": 1.8459, "step": 18200 }, { "epoch": 1.52, "grad_norm": 3.2275209426879883, "learning_rate": 9.736811594202899e-06, "loss": 1.5672, "step": 18210 }, { "epoch": 1.52, "grad_norm": 4.009672164916992, "learning_rate": 9.731014492753625e-06, "loss": 1.613, "step": 18220 }, { "epoch": 1.52, "grad_norm": 0.9421452879905701, "learning_rate": 9.725217391304349e-06, "loss": 1.8051, "step": 18230 }, { "epoch": 1.52, "grad_norm": 4.675242900848389, "learning_rate": 9.719420289855075e-06, "loss": 1.5781, "step": 18240 }, { "epoch": 1.52, "grad_norm": 4.626055717468262, "learning_rate": 9.713623188405797e-06, "loss": 1.6158, "step": 18250 }, { "epoch": 1.52, "grad_norm": 2.181307554244995, "learning_rate": 9.707826086956523e-06, "loss": 1.6871, "step": 18260 }, { "epoch": 1.52, "grad_norm": 2.5483827590942383, "learning_rate": 9.702028985507247e-06, "loss": 1.6884, "step": 18270 }, { "epoch": 1.52, "grad_norm": 2.3285396099090576, "learning_rate": 9.696231884057972e-06, "loss": 1.7371, "step": 18280 }, { "epoch": 1.52, "grad_norm": 6.175654888153076, "learning_rate": 9.690434782608696e-06, "loss": 1.7425, "step": 18290 }, { "epoch": 1.52, "grad_norm": 2.972175359725952, "learning_rate": 9.684637681159422e-06, "loss": 1.838, "step": 18300 }, { "epoch": 1.53, "grad_norm": 3.2489099502563477, "learning_rate": 9.678840579710146e-06, "loss": 1.6443, "step": 18310 }, { "epoch": 1.53, "grad_norm": 4.820380210876465, "learning_rate": 9.67304347826087e-06, "loss": 1.6839, "step": 18320 }, { "epoch": 1.53, "grad_norm": 1.2533926963806152, "learning_rate": 9.667246376811596e-06, "loss": 1.6476, "step": 18330 }, { "epoch": 1.53, "grad_norm": 4.8515777587890625, "learning_rate": 9.66144927536232e-06, "loss": 1.6342, "step": 18340 }, { "epoch": 1.53, "grad_norm": 1.8389453887939453, "learning_rate": 9.655652173913044e-06, "loss": 1.5585, "step": 18350 }, { "epoch": 1.53, "grad_norm": 1.700596570968628, "learning_rate": 9.649855072463768e-06, "loss": 1.5371, "step": 18360 }, { "epoch": 1.53, "grad_norm": 2.20552396774292, "learning_rate": 9.644057971014493e-06, "loss": 1.7413, "step": 18370 }, { "epoch": 1.53, "grad_norm": 9.809810638427734, "learning_rate": 9.638260869565217e-06, "loss": 1.6194, "step": 18380 }, { "epoch": 1.53, "grad_norm": 1.2184398174285889, "learning_rate": 9.632463768115943e-06, "loss": 1.7167, "step": 18390 }, { "epoch": 1.53, "grad_norm": 1.6306843757629395, "learning_rate": 9.626666666666667e-06, "loss": 1.7024, "step": 18400 }, { "epoch": 1.53, "grad_norm": 4.053318500518799, "learning_rate": 9.620869565217393e-06, "loss": 1.6515, "step": 18410 }, { "epoch": 1.54, "grad_norm": 7.242835998535156, "learning_rate": 9.615072463768117e-06, "loss": 1.542, "step": 18420 }, { "epoch": 1.54, "grad_norm": 3.169401168823242, "learning_rate": 9.609275362318843e-06, "loss": 1.6536, "step": 18430 }, { "epoch": 1.54, "grad_norm": 2.6384594440460205, "learning_rate": 9.603478260869567e-06, "loss": 1.6468, "step": 18440 }, { "epoch": 1.54, "grad_norm": 4.198095321655273, "learning_rate": 9.59768115942029e-06, "loss": 1.6955, "step": 18450 }, { "epoch": 1.54, "grad_norm": 5.319828987121582, "learning_rate": 9.591884057971015e-06, "loss": 1.7273, "step": 18460 }, { "epoch": 1.54, "grad_norm": 4.001225471496582, "learning_rate": 9.58608695652174e-06, "loss": 1.8093, "step": 18470 }, { "epoch": 1.54, "grad_norm": 4.059565544128418, "learning_rate": 9.580289855072464e-06, "loss": 1.6228, "step": 18480 }, { "epoch": 1.54, "grad_norm": 0.9582866430282593, "learning_rate": 9.574492753623188e-06, "loss": 1.6258, "step": 18490 }, { "epoch": 1.54, "grad_norm": 1.2751723527908325, "learning_rate": 9.568695652173914e-06, "loss": 1.7295, "step": 18500 }, { "epoch": 1.54, "eval_loss": 1.6792817115783691, "eval_runtime": 107.5135, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 18500 }, { "epoch": 1.54, "grad_norm": 2.7914340496063232, "learning_rate": 9.562898550724638e-06, "loss": 1.6173, "step": 18510 }, { "epoch": 1.54, "grad_norm": 7.133903980255127, "learning_rate": 9.557101449275364e-06, "loss": 1.6678, "step": 18520 }, { "epoch": 1.54, "grad_norm": 2.9746956825256348, "learning_rate": 9.551304347826088e-06, "loss": 1.6362, "step": 18530 }, { "epoch": 1.54, "grad_norm": 5.67676305770874, "learning_rate": 9.545507246376813e-06, "loss": 1.6312, "step": 18540 }, { "epoch": 1.55, "grad_norm": 1.4198997020721436, "learning_rate": 9.539710144927537e-06, "loss": 1.6763, "step": 18550 }, { "epoch": 1.55, "grad_norm": 1.5471702814102173, "learning_rate": 9.533913043478261e-06, "loss": 1.6033, "step": 18560 }, { "epoch": 1.55, "grad_norm": 2.1242103576660156, "learning_rate": 9.528115942028985e-06, "loss": 1.5981, "step": 18570 }, { "epoch": 1.55, "grad_norm": 3.2516043186187744, "learning_rate": 9.522318840579711e-06, "loss": 1.7269, "step": 18580 }, { "epoch": 1.55, "grad_norm": 2.795421600341797, "learning_rate": 9.516521739130435e-06, "loss": 1.6065, "step": 18590 }, { "epoch": 1.55, "grad_norm": 3.5633459091186523, "learning_rate": 9.51072463768116e-06, "loss": 1.621, "step": 18600 }, { "epoch": 1.55, "grad_norm": 4.669309616088867, "learning_rate": 9.504927536231885e-06, "loss": 1.6455, "step": 18610 }, { "epoch": 1.55, "grad_norm": 4.369694232940674, "learning_rate": 9.49913043478261e-06, "loss": 1.55, "step": 18620 }, { "epoch": 1.55, "grad_norm": 2.2723934650421143, "learning_rate": 9.493333333333334e-06, "loss": 1.7658, "step": 18630 }, { "epoch": 1.55, "grad_norm": 3.9826152324676514, "learning_rate": 9.487536231884058e-06, "loss": 1.6784, "step": 18640 }, { "epoch": 1.55, "grad_norm": 2.489523410797119, "learning_rate": 9.481739130434784e-06, "loss": 1.7932, "step": 18650 }, { "epoch": 1.56, "grad_norm": 2.9629557132720947, "learning_rate": 9.475942028985508e-06, "loss": 1.7041, "step": 18660 }, { "epoch": 1.56, "grad_norm": 4.74808406829834, "learning_rate": 9.470144927536232e-06, "loss": 1.6448, "step": 18670 }, { "epoch": 1.56, "grad_norm": 1.5019195079803467, "learning_rate": 9.464347826086956e-06, "loss": 1.7318, "step": 18680 }, { "epoch": 1.56, "grad_norm": 4.352245330810547, "learning_rate": 9.458550724637682e-06, "loss": 1.7178, "step": 18690 }, { "epoch": 1.56, "grad_norm": 2.7317593097686768, "learning_rate": 9.452753623188406e-06, "loss": 1.6883, "step": 18700 }, { "epoch": 1.56, "grad_norm": 1.4841550588607788, "learning_rate": 9.446956521739132e-06, "loss": 1.654, "step": 18710 }, { "epoch": 1.56, "grad_norm": 2.4384384155273438, "learning_rate": 9.441159420289856e-06, "loss": 1.6427, "step": 18720 }, { "epoch": 1.56, "grad_norm": 2.350482225418091, "learning_rate": 9.435362318840581e-06, "loss": 1.5817, "step": 18730 }, { "epoch": 1.56, "grad_norm": 2.0648393630981445, "learning_rate": 9.429565217391305e-06, "loss": 1.4933, "step": 18740 }, { "epoch": 1.56, "grad_norm": 4.213344573974609, "learning_rate": 9.423768115942031e-06, "loss": 1.575, "step": 18750 }, { "epoch": 1.56, "grad_norm": 7.273472309112549, "learning_rate": 9.417971014492753e-06, "loss": 1.726, "step": 18760 }, { "epoch": 1.56, "grad_norm": 2.119070053100586, "learning_rate": 9.412173913043479e-06, "loss": 1.7348, "step": 18770 }, { "epoch": 1.56, "grad_norm": 4.862486362457275, "learning_rate": 9.406376811594203e-06, "loss": 1.6862, "step": 18780 }, { "epoch": 1.57, "grad_norm": 0.6140770316123962, "learning_rate": 9.400579710144929e-06, "loss": 1.5872, "step": 18790 }, { "epoch": 1.57, "grad_norm": 3.3810670375823975, "learning_rate": 9.394782608695653e-06, "loss": 1.6539, "step": 18800 }, { "epoch": 1.57, "grad_norm": 1.982848882675171, "learning_rate": 9.388985507246377e-06, "loss": 1.8119, "step": 18810 }, { "epoch": 1.57, "grad_norm": 1.6384955644607544, "learning_rate": 9.383188405797102e-06, "loss": 1.5366, "step": 18820 }, { "epoch": 1.57, "grad_norm": 6.467543125152588, "learning_rate": 9.377391304347826e-06, "loss": 1.641, "step": 18830 }, { "epoch": 1.57, "grad_norm": 6.90090799331665, "learning_rate": 9.371594202898552e-06, "loss": 1.6931, "step": 18840 }, { "epoch": 1.57, "grad_norm": 5.378636837005615, "learning_rate": 9.365797101449276e-06, "loss": 1.7595, "step": 18850 }, { "epoch": 1.57, "grad_norm": 4.310837268829346, "learning_rate": 9.360000000000002e-06, "loss": 1.6787, "step": 18860 }, { "epoch": 1.57, "grad_norm": 2.714045524597168, "learning_rate": 9.354202898550724e-06, "loss": 1.7402, "step": 18870 }, { "epoch": 1.57, "grad_norm": 2.784644842147827, "learning_rate": 9.34840579710145e-06, "loss": 1.565, "step": 18880 }, { "epoch": 1.57, "grad_norm": 8.056758880615234, "learning_rate": 9.342608695652174e-06, "loss": 1.6375, "step": 18890 }, { "epoch": 1.57, "grad_norm": 5.494594097137451, "learning_rate": 9.3368115942029e-06, "loss": 1.6977, "step": 18900 }, { "epoch": 1.58, "grad_norm": 1.0827395915985107, "learning_rate": 9.331014492753624e-06, "loss": 1.6315, "step": 18910 }, { "epoch": 1.58, "grad_norm": 6.295031547546387, "learning_rate": 9.32521739130435e-06, "loss": 1.7304, "step": 18920 }, { "epoch": 1.58, "grad_norm": 1.8913993835449219, "learning_rate": 9.319420289855073e-06, "loss": 1.7426, "step": 18930 }, { "epoch": 1.58, "grad_norm": 2.0160393714904785, "learning_rate": 9.313623188405799e-06, "loss": 1.7997, "step": 18940 }, { "epoch": 1.58, "grad_norm": 4.568789482116699, "learning_rate": 9.307826086956523e-06, "loss": 1.699, "step": 18950 }, { "epoch": 1.58, "grad_norm": 2.200634479522705, "learning_rate": 9.302028985507247e-06, "loss": 1.5647, "step": 18960 }, { "epoch": 1.58, "grad_norm": 5.3525495529174805, "learning_rate": 9.296231884057971e-06, "loss": 1.6899, "step": 18970 }, { "epoch": 1.58, "grad_norm": 2.9189863204956055, "learning_rate": 9.290434782608697e-06, "loss": 1.7537, "step": 18980 }, { "epoch": 1.58, "grad_norm": 3.9687070846557617, "learning_rate": 9.28463768115942e-06, "loss": 1.6875, "step": 18990 }, { "epoch": 1.58, "grad_norm": 1.2662925720214844, "learning_rate": 9.278840579710145e-06, "loss": 1.6292, "step": 19000 }, { "epoch": 1.58, "eval_loss": 1.665460228919983, "eval_runtime": 107.4967, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 19000 }, { "epoch": 1.58, "grad_norm": 2.05936336517334, "learning_rate": 9.27304347826087e-06, "loss": 1.5103, "step": 19010 }, { "epoch": 1.58, "grad_norm": 1.9469785690307617, "learning_rate": 9.267246376811594e-06, "loss": 1.6837, "step": 19020 }, { "epoch": 1.59, "grad_norm": 2.0714728832244873, "learning_rate": 9.26144927536232e-06, "loss": 1.725, "step": 19030 }, { "epoch": 1.59, "grad_norm": 3.3942904472351074, "learning_rate": 9.255652173913044e-06, "loss": 1.6261, "step": 19040 }, { "epoch": 1.59, "grad_norm": 3.1022446155548096, "learning_rate": 9.24985507246377e-06, "loss": 1.7056, "step": 19050 }, { "epoch": 1.59, "grad_norm": 1.8929376602172852, "learning_rate": 9.244057971014494e-06, "loss": 1.666, "step": 19060 }, { "epoch": 1.59, "grad_norm": 1.3702325820922852, "learning_rate": 9.238260869565218e-06, "loss": 1.6023, "step": 19070 }, { "epoch": 1.59, "grad_norm": 2.481436014175415, "learning_rate": 9.232463768115942e-06, "loss": 1.6922, "step": 19080 }, { "epoch": 1.59, "grad_norm": 4.172421455383301, "learning_rate": 9.226666666666668e-06, "loss": 1.7799, "step": 19090 }, { "epoch": 1.59, "grad_norm": 2.3141672611236572, "learning_rate": 9.220869565217392e-06, "loss": 1.326, "step": 19100 }, { "epoch": 1.59, "grad_norm": 4.576709270477295, "learning_rate": 9.215072463768117e-06, "loss": 1.6309, "step": 19110 }, { "epoch": 1.59, "grad_norm": 1.9501328468322754, "learning_rate": 9.209275362318841e-06, "loss": 1.6507, "step": 19120 }, { "epoch": 1.59, "grad_norm": 3.715846300125122, "learning_rate": 9.203478260869565e-06, "loss": 1.6828, "step": 19130 }, { "epoch": 1.59, "grad_norm": 2.534573554992676, "learning_rate": 9.197681159420291e-06, "loss": 1.619, "step": 19140 }, { "epoch": 1.6, "grad_norm": 2.9621031284332275, "learning_rate": 9.191884057971015e-06, "loss": 1.6573, "step": 19150 }, { "epoch": 1.6, "grad_norm": 2.051302194595337, "learning_rate": 9.18608695652174e-06, "loss": 1.71, "step": 19160 }, { "epoch": 1.6, "grad_norm": 5.671502113342285, "learning_rate": 9.180289855072465e-06, "loss": 1.6008, "step": 19170 }, { "epoch": 1.6, "grad_norm": 3.0547165870666504, "learning_rate": 9.174492753623189e-06, "loss": 1.7074, "step": 19180 }, { "epoch": 1.6, "grad_norm": 2.7989306449890137, "learning_rate": 9.168695652173913e-06, "loss": 1.7383, "step": 19190 }, { "epoch": 1.6, "grad_norm": 2.843214750289917, "learning_rate": 9.162898550724638e-06, "loss": 1.6794, "step": 19200 }, { "epoch": 1.6, "grad_norm": 2.1126368045806885, "learning_rate": 9.157101449275362e-06, "loss": 1.6367, "step": 19210 }, { "epoch": 1.6, "grad_norm": 1.7680147886276245, "learning_rate": 9.151304347826088e-06, "loss": 1.6034, "step": 19220 }, { "epoch": 1.6, "grad_norm": 5.450879096984863, "learning_rate": 9.145507246376812e-06, "loss": 1.632, "step": 19230 }, { "epoch": 1.6, "grad_norm": 2.6349544525146484, "learning_rate": 9.139710144927538e-06, "loss": 1.6385, "step": 19240 }, { "epoch": 1.6, "grad_norm": 0.9699334502220154, "learning_rate": 9.133913043478262e-06, "loss": 1.5493, "step": 19250 }, { "epoch": 1.6, "grad_norm": 6.04218053817749, "learning_rate": 9.128115942028986e-06, "loss": 1.6542, "step": 19260 }, { "epoch": 1.61, "grad_norm": 3.5029866695404053, "learning_rate": 9.122318840579712e-06, "loss": 1.8018, "step": 19270 }, { "epoch": 1.61, "grad_norm": 3.081104278564453, "learning_rate": 9.116521739130436e-06, "loss": 1.6778, "step": 19280 }, { "epoch": 1.61, "grad_norm": 6.336639881134033, "learning_rate": 9.11072463768116e-06, "loss": 1.5925, "step": 19290 }, { "epoch": 1.61, "grad_norm": 2.3441286087036133, "learning_rate": 9.104927536231885e-06, "loss": 1.6447, "step": 19300 }, { "epoch": 1.61, "grad_norm": 9.648870468139648, "learning_rate": 9.09913043478261e-06, "loss": 1.8095, "step": 19310 }, { "epoch": 1.61, "grad_norm": 2.4305033683776855, "learning_rate": 9.093333333333333e-06, "loss": 1.6808, "step": 19320 }, { "epoch": 1.61, "grad_norm": 1.9530631303787231, "learning_rate": 9.087536231884059e-06, "loss": 1.6648, "step": 19330 }, { "epoch": 1.61, "grad_norm": 2.785933494567871, "learning_rate": 9.081739130434783e-06, "loss": 1.5965, "step": 19340 }, { "epoch": 1.61, "grad_norm": 2.8842251300811768, "learning_rate": 9.075942028985509e-06, "loss": 1.6147, "step": 19350 }, { "epoch": 1.61, "grad_norm": 5.127828121185303, "learning_rate": 9.070144927536233e-06, "loss": 1.7384, "step": 19360 }, { "epoch": 1.61, "grad_norm": 0.9837595224380493, "learning_rate": 9.064347826086958e-06, "loss": 1.8099, "step": 19370 }, { "epoch": 1.61, "grad_norm": 3.733158588409424, "learning_rate": 9.05855072463768e-06, "loss": 1.5608, "step": 19380 }, { "epoch": 1.62, "grad_norm": 2.9853909015655518, "learning_rate": 9.052753623188406e-06, "loss": 1.6227, "step": 19390 }, { "epoch": 1.62, "grad_norm": 5.49697208404541, "learning_rate": 9.04695652173913e-06, "loss": 1.5671, "step": 19400 }, { "epoch": 1.62, "grad_norm": 1.7057849168777466, "learning_rate": 9.041159420289856e-06, "loss": 1.6561, "step": 19410 }, { "epoch": 1.62, "grad_norm": 2.0063562393188477, "learning_rate": 9.03536231884058e-06, "loss": 1.7223, "step": 19420 }, { "epoch": 1.62, "grad_norm": 8.740435600280762, "learning_rate": 9.029565217391306e-06, "loss": 1.6543, "step": 19430 }, { "epoch": 1.62, "grad_norm": 4.221389293670654, "learning_rate": 9.02376811594203e-06, "loss": 1.673, "step": 19440 }, { "epoch": 1.62, "grad_norm": 6.550647735595703, "learning_rate": 9.017971014492754e-06, "loss": 1.5946, "step": 19450 }, { "epoch": 1.62, "grad_norm": 2.3443706035614014, "learning_rate": 9.01217391304348e-06, "loss": 1.6832, "step": 19460 }, { "epoch": 1.62, "grad_norm": 1.3949636220932007, "learning_rate": 9.006376811594203e-06, "loss": 1.5993, "step": 19470 }, { "epoch": 1.62, "grad_norm": 3.4200923442840576, "learning_rate": 9.00057971014493e-06, "loss": 1.6081, "step": 19480 }, { "epoch": 1.62, "grad_norm": 5.661345958709717, "learning_rate": 8.994782608695653e-06, "loss": 1.7607, "step": 19490 }, { "epoch": 1.62, "grad_norm": 2.108694314956665, "learning_rate": 8.988985507246377e-06, "loss": 1.6209, "step": 19500 }, { "epoch": 1.62, "eval_loss": 1.6830703020095825, "eval_runtime": 107.5287, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 19500 }, { "epoch": 1.63, "grad_norm": 5.532451629638672, "learning_rate": 8.983188405797101e-06, "loss": 1.6172, "step": 19510 }, { "epoch": 1.63, "grad_norm": 1.787123441696167, "learning_rate": 8.977391304347827e-06, "loss": 1.6074, "step": 19520 }, { "epoch": 1.63, "grad_norm": 1.405480146408081, "learning_rate": 8.971594202898551e-06, "loss": 1.6032, "step": 19530 }, { "epoch": 1.63, "grad_norm": 7.917876243591309, "learning_rate": 8.965797101449277e-06, "loss": 1.6732, "step": 19540 }, { "epoch": 1.63, "grad_norm": 3.5339460372924805, "learning_rate": 8.96e-06, "loss": 1.7914, "step": 19550 }, { "epoch": 1.63, "grad_norm": 3.5086684226989746, "learning_rate": 8.954202898550726e-06, "loss": 1.5816, "step": 19560 }, { "epoch": 1.63, "grad_norm": 9.773869514465332, "learning_rate": 8.94840579710145e-06, "loss": 1.67, "step": 19570 }, { "epoch": 1.63, "grad_norm": 5.713991165161133, "learning_rate": 8.942608695652174e-06, "loss": 1.7089, "step": 19580 }, { "epoch": 1.63, "grad_norm": 2.009612560272217, "learning_rate": 8.936811594202898e-06, "loss": 1.7485, "step": 19590 }, { "epoch": 1.63, "grad_norm": 2.547402858734131, "learning_rate": 8.931014492753624e-06, "loss": 1.5412, "step": 19600 }, { "epoch": 1.63, "grad_norm": 1.649285078048706, "learning_rate": 8.925217391304348e-06, "loss": 1.7208, "step": 19610 }, { "epoch": 1.64, "grad_norm": 5.029441833496094, "learning_rate": 8.919420289855074e-06, "loss": 1.6938, "step": 19620 }, { "epoch": 1.64, "grad_norm": 2.3328795433044434, "learning_rate": 8.913623188405798e-06, "loss": 1.5802, "step": 19630 }, { "epoch": 1.64, "grad_norm": 2.51261043548584, "learning_rate": 8.907826086956522e-06, "loss": 1.6576, "step": 19640 }, { "epoch": 1.64, "grad_norm": 5.171065330505371, "learning_rate": 8.902028985507247e-06, "loss": 1.5348, "step": 19650 }, { "epoch": 1.64, "grad_norm": 5.7769856452941895, "learning_rate": 8.896231884057971e-06, "loss": 1.6869, "step": 19660 }, { "epoch": 1.64, "grad_norm": 4.950839042663574, "learning_rate": 8.890434782608697e-06, "loss": 1.6537, "step": 19670 }, { "epoch": 1.64, "grad_norm": 4.615936279296875, "learning_rate": 8.884637681159421e-06, "loss": 1.7185, "step": 19680 }, { "epoch": 1.64, "grad_norm": 1.862908959388733, "learning_rate": 8.878840579710145e-06, "loss": 1.6145, "step": 19690 }, { "epoch": 1.64, "grad_norm": 2.8000476360321045, "learning_rate": 8.87304347826087e-06, "loss": 1.736, "step": 19700 }, { "epoch": 1.64, "grad_norm": 2.0812816619873047, "learning_rate": 8.867246376811595e-06, "loss": 1.4757, "step": 19710 }, { "epoch": 1.64, "grad_norm": 3.9957151412963867, "learning_rate": 8.861449275362319e-06, "loss": 1.6096, "step": 19720 }, { "epoch": 1.64, "grad_norm": 4.853338241577148, "learning_rate": 8.855652173913045e-06, "loss": 1.6642, "step": 19730 }, { "epoch": 1.65, "grad_norm": 3.507507085800171, "learning_rate": 8.849855072463769e-06, "loss": 1.6589, "step": 19740 }, { "epoch": 1.65, "grad_norm": 3.7916064262390137, "learning_rate": 8.844057971014494e-06, "loss": 1.6656, "step": 19750 }, { "epoch": 1.65, "grad_norm": 9.049138069152832, "learning_rate": 8.838260869565218e-06, "loss": 1.6074, "step": 19760 }, { "epoch": 1.65, "grad_norm": 4.988990306854248, "learning_rate": 8.832463768115942e-06, "loss": 1.697, "step": 19770 }, { "epoch": 1.65, "grad_norm": 3.35215163230896, "learning_rate": 8.826666666666668e-06, "loss": 1.6842, "step": 19780 }, { "epoch": 1.65, "grad_norm": 3.3260128498077393, "learning_rate": 8.820869565217392e-06, "loss": 1.6981, "step": 19790 }, { "epoch": 1.65, "grad_norm": 1.1436783075332642, "learning_rate": 8.815072463768116e-06, "loss": 1.7511, "step": 19800 }, { "epoch": 1.65, "grad_norm": 2.495922565460205, "learning_rate": 8.809275362318842e-06, "loss": 1.5865, "step": 19810 }, { "epoch": 1.65, "grad_norm": 2.033712148666382, "learning_rate": 8.803478260869566e-06, "loss": 1.6375, "step": 19820 }, { "epoch": 1.65, "grad_norm": 0.9689141511917114, "learning_rate": 8.79768115942029e-06, "loss": 1.5311, "step": 19830 }, { "epoch": 1.65, "grad_norm": 2.1666340827941895, "learning_rate": 8.791884057971015e-06, "loss": 1.7137, "step": 19840 }, { "epoch": 1.65, "grad_norm": 5.735472202301025, "learning_rate": 8.78608695652174e-06, "loss": 1.7429, "step": 19850 }, { "epoch": 1.66, "grad_norm": 7.482568264007568, "learning_rate": 8.780289855072465e-06, "loss": 1.5834, "step": 19860 }, { "epoch": 1.66, "grad_norm": 3.3337314128875732, "learning_rate": 8.774492753623189e-06, "loss": 1.7067, "step": 19870 }, { "epoch": 1.66, "grad_norm": 3.9348156452178955, "learning_rate": 8.768695652173915e-06, "loss": 1.7376, "step": 19880 }, { "epoch": 1.66, "grad_norm": 3.5238430500030518, "learning_rate": 8.762898550724639e-06, "loss": 1.5539, "step": 19890 }, { "epoch": 1.66, "grad_norm": 1.653454303741455, "learning_rate": 8.757101449275363e-06, "loss": 1.7208, "step": 19900 }, { "epoch": 1.66, "grad_norm": 5.208953857421875, "learning_rate": 8.751304347826087e-06, "loss": 1.5903, "step": 19910 }, { "epoch": 1.66, "grad_norm": 4.089755535125732, "learning_rate": 8.745507246376813e-06, "loss": 1.4553, "step": 19920 }, { "epoch": 1.66, "grad_norm": 2.7506258487701416, "learning_rate": 8.739710144927537e-06, "loss": 1.6508, "step": 19930 }, { "epoch": 1.66, "grad_norm": 5.436467170715332, "learning_rate": 8.733913043478262e-06, "loss": 1.8371, "step": 19940 }, { "epoch": 1.66, "grad_norm": 3.3517704010009766, "learning_rate": 8.728115942028986e-06, "loss": 1.7193, "step": 19950 }, { "epoch": 1.66, "grad_norm": 2.238973379135132, "learning_rate": 8.72231884057971e-06, "loss": 1.7621, "step": 19960 }, { "epoch": 1.66, "grad_norm": 3.6009724140167236, "learning_rate": 8.716521739130436e-06, "loss": 1.8128, "step": 19970 }, { "epoch": 1.67, "grad_norm": 4.952020645141602, "learning_rate": 8.71072463768116e-06, "loss": 1.7867, "step": 19980 }, { "epoch": 1.67, "grad_norm": 4.133454322814941, "learning_rate": 8.704927536231886e-06, "loss": 1.7099, "step": 19990 }, { "epoch": 1.67, "grad_norm": 1.35225510597229, "learning_rate": 8.69913043478261e-06, "loss": 1.7125, "step": 20000 }, { "epoch": 1.67, "eval_loss": 1.6788996458053589, "eval_runtime": 107.4956, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 20000 }, { "epoch": 1.67, "grad_norm": 2.7249436378479004, "learning_rate": 8.693333333333334e-06, "loss": 1.6412, "step": 20010 }, { "epoch": 1.67, "grad_norm": 4.052044868469238, "learning_rate": 8.687536231884058e-06, "loss": 1.4474, "step": 20020 }, { "epoch": 1.67, "grad_norm": 2.286026954650879, "learning_rate": 8.681739130434783e-06, "loss": 1.7352, "step": 20030 }, { "epoch": 1.67, "grad_norm": 9.765252113342285, "learning_rate": 8.675942028985507e-06, "loss": 1.6815, "step": 20040 }, { "epoch": 1.67, "grad_norm": 2.1208224296569824, "learning_rate": 8.670144927536233e-06, "loss": 1.7426, "step": 20050 }, { "epoch": 1.67, "grad_norm": 4.312246799468994, "learning_rate": 8.664347826086957e-06, "loss": 1.7116, "step": 20060 }, { "epoch": 1.67, "grad_norm": 3.1685314178466797, "learning_rate": 8.658550724637683e-06, "loss": 1.7183, "step": 20070 }, { "epoch": 1.67, "grad_norm": 1.873128890991211, "learning_rate": 8.652753623188407e-06, "loss": 1.811, "step": 20080 }, { "epoch": 1.67, "grad_norm": 4.21943473815918, "learning_rate": 8.64695652173913e-06, "loss": 1.6304, "step": 20090 }, { "epoch": 1.68, "grad_norm": 3.778602123260498, "learning_rate": 8.641159420289857e-06, "loss": 1.6324, "step": 20100 }, { "epoch": 1.68, "grad_norm": 2.6876754760742188, "learning_rate": 8.63536231884058e-06, "loss": 1.6039, "step": 20110 }, { "epoch": 1.68, "grad_norm": 4.973613739013672, "learning_rate": 8.629565217391305e-06, "loss": 1.7671, "step": 20120 }, { "epoch": 1.68, "grad_norm": 1.4355108737945557, "learning_rate": 8.62376811594203e-06, "loss": 1.6335, "step": 20130 }, { "epoch": 1.68, "grad_norm": 2.8951032161712646, "learning_rate": 8.617971014492754e-06, "loss": 1.7804, "step": 20140 }, { "epoch": 1.68, "grad_norm": 2.68106746673584, "learning_rate": 8.612173913043478e-06, "loss": 1.5194, "step": 20150 }, { "epoch": 1.68, "grad_norm": 4.569166660308838, "learning_rate": 8.606376811594204e-06, "loss": 1.6177, "step": 20160 }, { "epoch": 1.68, "grad_norm": 5.555330276489258, "learning_rate": 8.600579710144928e-06, "loss": 1.6004, "step": 20170 }, { "epoch": 1.68, "grad_norm": 1.8976370096206665, "learning_rate": 8.59536231884058e-06, "loss": 1.5999, "step": 20180 }, { "epoch": 1.68, "grad_norm": 1.5247642993927002, "learning_rate": 8.589565217391305e-06, "loss": 1.6919, "step": 20190 }, { "epoch": 1.68, "grad_norm": 5.274435997009277, "learning_rate": 8.583768115942029e-06, "loss": 1.5242, "step": 20200 }, { "epoch": 1.68, "grad_norm": 4.763355255126953, "learning_rate": 8.577971014492755e-06, "loss": 1.4803, "step": 20210 }, { "epoch": 1.69, "grad_norm": 3.4387519359588623, "learning_rate": 8.572173913043479e-06, "loss": 1.5794, "step": 20220 }, { "epoch": 1.69, "grad_norm": 4.496255874633789, "learning_rate": 8.566376811594204e-06, "loss": 1.6001, "step": 20230 }, { "epoch": 1.69, "grad_norm": 1.4461989402770996, "learning_rate": 8.560579710144928e-06, "loss": 1.7451, "step": 20240 }, { "epoch": 1.69, "grad_norm": 5.702919006347656, "learning_rate": 8.554782608695652e-06, "loss": 1.6566, "step": 20250 }, { "epoch": 1.69, "grad_norm": 1.9157620668411255, "learning_rate": 8.548985507246378e-06, "loss": 1.5958, "step": 20260 }, { "epoch": 1.69, "grad_norm": 4.364008903503418, "learning_rate": 8.543188405797102e-06, "loss": 1.6164, "step": 20270 }, { "epoch": 1.69, "grad_norm": 1.2819979190826416, "learning_rate": 8.537391304347826e-06, "loss": 1.7358, "step": 20280 }, { "epoch": 1.69, "grad_norm": 6.845746994018555, "learning_rate": 8.531594202898552e-06, "loss": 1.6241, "step": 20290 }, { "epoch": 1.69, "grad_norm": 3.371389627456665, "learning_rate": 8.525797101449276e-06, "loss": 1.6148, "step": 20300 }, { "epoch": 1.69, "grad_norm": 6.634834289550781, "learning_rate": 8.52e-06, "loss": 1.7313, "step": 20310 }, { "epoch": 1.69, "grad_norm": 11.510677337646484, "learning_rate": 8.514202898550725e-06, "loss": 1.6605, "step": 20320 }, { "epoch": 1.69, "grad_norm": 3.1439342498779297, "learning_rate": 8.50840579710145e-06, "loss": 1.5686, "step": 20330 }, { "epoch": 1.69, "grad_norm": 1.642844796180725, "learning_rate": 8.502608695652175e-06, "loss": 1.6276, "step": 20340 }, { "epoch": 1.7, "grad_norm": 4.143886089324951, "learning_rate": 8.496811594202899e-06, "loss": 1.5472, "step": 20350 }, { "epoch": 1.7, "grad_norm": 4.869246006011963, "learning_rate": 8.491014492753625e-06, "loss": 1.6785, "step": 20360 }, { "epoch": 1.7, "grad_norm": 2.9599814414978027, "learning_rate": 8.485217391304349e-06, "loss": 1.6191, "step": 20370 }, { "epoch": 1.7, "grad_norm": 2.2835745811462402, "learning_rate": 8.479420289855073e-06, "loss": 1.7113, "step": 20380 }, { "epoch": 1.7, "grad_norm": 4.400454998016357, "learning_rate": 8.473623188405797e-06, "loss": 1.7069, "step": 20390 }, { "epoch": 1.7, "grad_norm": 3.021839141845703, "learning_rate": 8.467826086956523e-06, "loss": 1.7014, "step": 20400 }, { "epoch": 1.7, "grad_norm": 3.3753602504730225, "learning_rate": 8.462028985507247e-06, "loss": 1.6844, "step": 20410 }, { "epoch": 1.7, "grad_norm": 1.501394271850586, "learning_rate": 8.456231884057972e-06, "loss": 1.7469, "step": 20420 }, { "epoch": 1.7, "grad_norm": 16.001710891723633, "learning_rate": 8.450434782608696e-06, "loss": 1.7221, "step": 20430 }, { "epoch": 1.7, "grad_norm": 4.816986083984375, "learning_rate": 8.44463768115942e-06, "loss": 1.7513, "step": 20440 }, { "epoch": 1.7, "grad_norm": 1.2568550109863281, "learning_rate": 8.438840579710146e-06, "loss": 1.5909, "step": 20450 }, { "epoch": 1.71, "grad_norm": 1.2014676332473755, "learning_rate": 8.43304347826087e-06, "loss": 1.5539, "step": 20460 }, { "epoch": 1.71, "grad_norm": 2.841648817062378, "learning_rate": 8.427246376811596e-06, "loss": 1.6945, "step": 20470 }, { "epoch": 1.71, "grad_norm": 2.8036043643951416, "learning_rate": 8.42144927536232e-06, "loss": 1.6231, "step": 20480 }, { "epoch": 1.71, "grad_norm": 2.289719343185425, "learning_rate": 8.415652173913044e-06, "loss": 1.5679, "step": 20490 }, { "epoch": 1.71, "grad_norm": 1.5245754718780518, "learning_rate": 8.409855072463768e-06, "loss": 1.5689, "step": 20500 }, { "epoch": 1.71, "eval_loss": 1.6623921394348145, "eval_runtime": 107.5043, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 20500 }, { "epoch": 1.71, "grad_norm": 9.230620384216309, "learning_rate": 8.404057971014493e-06, "loss": 1.4972, "step": 20510 }, { "epoch": 1.71, "grad_norm": 4.701317310333252, "learning_rate": 8.398260869565217e-06, "loss": 1.7549, "step": 20520 }, { "epoch": 1.71, "grad_norm": 2.25839900970459, "learning_rate": 8.392463768115943e-06, "loss": 1.66, "step": 20530 }, { "epoch": 1.71, "grad_norm": 3.2549502849578857, "learning_rate": 8.386666666666667e-06, "loss": 1.8211, "step": 20540 }, { "epoch": 1.71, "grad_norm": 5.639694690704346, "learning_rate": 8.380869565217393e-06, "loss": 1.6038, "step": 20550 }, { "epoch": 1.71, "grad_norm": 4.658710956573486, "learning_rate": 8.375072463768117e-06, "loss": 1.7501, "step": 20560 }, { "epoch": 1.71, "grad_norm": 2.85872745513916, "learning_rate": 8.36927536231884e-06, "loss": 1.5652, "step": 20570 }, { "epoch": 1.71, "grad_norm": 3.761110544204712, "learning_rate": 8.363478260869567e-06, "loss": 1.6325, "step": 20580 }, { "epoch": 1.72, "grad_norm": 2.664401054382324, "learning_rate": 8.35768115942029e-06, "loss": 1.6404, "step": 20590 }, { "epoch": 1.72, "grad_norm": 1.662811279296875, "learning_rate": 8.351884057971015e-06, "loss": 1.5619, "step": 20600 }, { "epoch": 1.72, "grad_norm": 2.5120227336883545, "learning_rate": 8.34608695652174e-06, "loss": 1.7325, "step": 20610 }, { "epoch": 1.72, "grad_norm": 6.742555618286133, "learning_rate": 8.340289855072464e-06, "loss": 1.7704, "step": 20620 }, { "epoch": 1.72, "grad_norm": 1.6222686767578125, "learning_rate": 8.334492753623188e-06, "loss": 1.6901, "step": 20630 }, { "epoch": 1.72, "grad_norm": 4.6312642097473145, "learning_rate": 8.328695652173914e-06, "loss": 1.728, "step": 20640 }, { "epoch": 1.72, "grad_norm": 1.7402911186218262, "learning_rate": 8.322898550724638e-06, "loss": 1.6404, "step": 20650 }, { "epoch": 1.72, "grad_norm": 2.214266777038574, "learning_rate": 8.317101449275364e-06, "loss": 1.7745, "step": 20660 }, { "epoch": 1.72, "grad_norm": 2.402463674545288, "learning_rate": 8.311304347826088e-06, "loss": 1.6937, "step": 20670 }, { "epoch": 1.72, "grad_norm": 4.309727668762207, "learning_rate": 8.305507246376813e-06, "loss": 1.633, "step": 20680 }, { "epoch": 1.72, "grad_norm": 1.6282193660736084, "learning_rate": 8.299710144927537e-06, "loss": 1.6727, "step": 20690 }, { "epoch": 1.73, "grad_norm": 4.9179277420043945, "learning_rate": 8.293913043478261e-06, "loss": 1.6977, "step": 20700 }, { "epoch": 1.73, "grad_norm": 5.5713701248168945, "learning_rate": 8.288115942028985e-06, "loss": 1.7506, "step": 20710 }, { "epoch": 1.73, "grad_norm": 2.8336901664733887, "learning_rate": 8.282318840579711e-06, "loss": 1.6477, "step": 20720 }, { "epoch": 1.73, "grad_norm": 3.1530611515045166, "learning_rate": 8.276521739130435e-06, "loss": 1.6213, "step": 20730 }, { "epoch": 1.73, "grad_norm": 2.452496290206909, "learning_rate": 8.27072463768116e-06, "loss": 1.5587, "step": 20740 }, { "epoch": 1.73, "grad_norm": 10.321528434753418, "learning_rate": 8.264927536231885e-06, "loss": 1.6628, "step": 20750 }, { "epoch": 1.73, "grad_norm": 1.9270588159561157, "learning_rate": 8.259130434782609e-06, "loss": 1.6388, "step": 20760 }, { "epoch": 1.73, "grad_norm": 2.518328905105591, "learning_rate": 8.253333333333334e-06, "loss": 1.6791, "step": 20770 }, { "epoch": 1.73, "grad_norm": 1.031456470489502, "learning_rate": 8.247536231884059e-06, "loss": 1.5683, "step": 20780 }, { "epoch": 1.73, "grad_norm": 2.8340535163879395, "learning_rate": 8.241739130434784e-06, "loss": 1.5371, "step": 20790 }, { "epoch": 1.73, "grad_norm": 6.309947967529297, "learning_rate": 8.235942028985508e-06, "loss": 1.5707, "step": 20800 }, { "epoch": 1.73, "grad_norm": 2.8457183837890625, "learning_rate": 8.230144927536232e-06, "loss": 1.6439, "step": 20810 }, { "epoch": 1.73, "grad_norm": 2.240163803100586, "learning_rate": 8.224347826086956e-06, "loss": 1.7126, "step": 20820 }, { "epoch": 1.74, "grad_norm": 2.0228397846221924, "learning_rate": 8.218550724637682e-06, "loss": 1.5457, "step": 20830 }, { "epoch": 1.74, "grad_norm": 2.194091796875, "learning_rate": 8.212753623188406e-06, "loss": 1.6615, "step": 20840 }, { "epoch": 1.74, "grad_norm": 3.5026867389678955, "learning_rate": 8.206956521739132e-06, "loss": 1.5305, "step": 20850 }, { "epoch": 1.74, "grad_norm": 4.275241374969482, "learning_rate": 8.201159420289856e-06, "loss": 1.5742, "step": 20860 }, { "epoch": 1.74, "grad_norm": 2.040112257003784, "learning_rate": 8.195362318840581e-06, "loss": 1.6762, "step": 20870 }, { "epoch": 1.74, "grad_norm": 3.858863592147827, "learning_rate": 8.189565217391305e-06, "loss": 1.7056, "step": 20880 }, { "epoch": 1.74, "grad_norm": 3.349377155303955, "learning_rate": 8.18376811594203e-06, "loss": 1.5865, "step": 20890 }, { "epoch": 1.74, "grad_norm": 4.167051792144775, "learning_rate": 8.177971014492753e-06, "loss": 1.712, "step": 20900 }, { "epoch": 1.74, "grad_norm": 5.692874431610107, "learning_rate": 8.172173913043479e-06, "loss": 1.6484, "step": 20910 }, { "epoch": 1.74, "grad_norm": 13.255517959594727, "learning_rate": 8.166376811594203e-06, "loss": 1.7715, "step": 20920 }, { "epoch": 1.74, "grad_norm": 5.60584831237793, "learning_rate": 8.160579710144929e-06, "loss": 1.5116, "step": 20930 }, { "epoch": 1.75, "grad_norm": 4.476940631866455, "learning_rate": 8.154782608695653e-06, "loss": 1.6632, "step": 20940 }, { "epoch": 1.75, "grad_norm": 3.8701999187469482, "learning_rate": 8.148985507246377e-06, "loss": 1.6628, "step": 20950 }, { "epoch": 1.75, "grad_norm": 1.236228346824646, "learning_rate": 8.143188405797102e-06, "loss": 1.4092, "step": 20960 }, { "epoch": 1.75, "grad_norm": 4.624843120574951, "learning_rate": 8.137391304347826e-06, "loss": 1.742, "step": 20970 }, { "epoch": 1.75, "grad_norm": 1.9257400035858154, "learning_rate": 8.131594202898552e-06, "loss": 1.6487, "step": 20980 }, { "epoch": 1.75, "grad_norm": 5.001676082611084, "learning_rate": 8.125797101449276e-06, "loss": 1.6715, "step": 20990 }, { "epoch": 1.75, "grad_norm": 1.8285595178604126, "learning_rate": 8.120000000000002e-06, "loss": 1.5451, "step": 21000 }, { "epoch": 1.75, "eval_loss": 1.6759874820709229, "eval_runtime": 107.5007, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 21000 }, { "epoch": 1.75, "grad_norm": 6.6983747482299805, "learning_rate": 8.114202898550724e-06, "loss": 1.6217, "step": 21010 }, { "epoch": 1.75, "grad_norm": 3.789233446121216, "learning_rate": 8.10840579710145e-06, "loss": 1.5388, "step": 21020 }, { "epoch": 1.75, "grad_norm": 9.76033878326416, "learning_rate": 8.102608695652174e-06, "loss": 1.7604, "step": 21030 }, { "epoch": 1.75, "grad_norm": 2.5290467739105225, "learning_rate": 8.0968115942029e-06, "loss": 1.6859, "step": 21040 }, { "epoch": 1.75, "grad_norm": 3.0937581062316895, "learning_rate": 8.091014492753624e-06, "loss": 1.481, "step": 21050 }, { "epoch": 1.75, "grad_norm": 4.91987943649292, "learning_rate": 8.08521739130435e-06, "loss": 1.5735, "step": 21060 }, { "epoch": 1.76, "grad_norm": 2.2947590351104736, "learning_rate": 8.079420289855073e-06, "loss": 1.6143, "step": 21070 }, { "epoch": 1.76, "grad_norm": 4.225331783294678, "learning_rate": 8.073623188405797e-06, "loss": 1.6024, "step": 21080 }, { "epoch": 1.76, "grad_norm": 6.730823993682861, "learning_rate": 8.067826086956523e-06, "loss": 1.667, "step": 21090 }, { "epoch": 1.76, "grad_norm": 1.8926206827163696, "learning_rate": 8.062028985507247e-06, "loss": 1.6495, "step": 21100 }, { "epoch": 1.76, "grad_norm": 7.650490760803223, "learning_rate": 8.056231884057971e-06, "loss": 1.7194, "step": 21110 }, { "epoch": 1.76, "grad_norm": 3.5820090770721436, "learning_rate": 8.050434782608697e-06, "loss": 1.591, "step": 21120 }, { "epoch": 1.76, "grad_norm": 1.5368926525115967, "learning_rate": 8.04463768115942e-06, "loss": 1.6234, "step": 21130 }, { "epoch": 1.76, "grad_norm": 1.6347370147705078, "learning_rate": 8.038840579710145e-06, "loss": 1.7416, "step": 21140 }, { "epoch": 1.76, "grad_norm": 10.997393608093262, "learning_rate": 8.03304347826087e-06, "loss": 1.7531, "step": 21150 }, { "epoch": 1.76, "grad_norm": 4.327144145965576, "learning_rate": 8.027246376811594e-06, "loss": 1.4644, "step": 21160 }, { "epoch": 1.76, "grad_norm": 1.6970939636230469, "learning_rate": 8.02144927536232e-06, "loss": 1.761, "step": 21170 }, { "epoch": 1.77, "grad_norm": 2.163623332977295, "learning_rate": 8.015652173913044e-06, "loss": 1.6956, "step": 21180 }, { "epoch": 1.77, "grad_norm": 1.5294394493103027, "learning_rate": 8.00985507246377e-06, "loss": 1.7231, "step": 21190 }, { "epoch": 1.77, "grad_norm": 7.195105075836182, "learning_rate": 8.004057971014494e-06, "loss": 1.7115, "step": 21200 }, { "epoch": 1.77, "grad_norm": 2.7161829471588135, "learning_rate": 7.998260869565218e-06, "loss": 1.7947, "step": 21210 }, { "epoch": 1.77, "grad_norm": 1.7141447067260742, "learning_rate": 7.992463768115942e-06, "loss": 1.6888, "step": 21220 }, { "epoch": 1.77, "grad_norm": 2.2814693450927734, "learning_rate": 7.986666666666668e-06, "loss": 1.5235, "step": 21230 }, { "epoch": 1.77, "grad_norm": 3.9310901165008545, "learning_rate": 7.980869565217392e-06, "loss": 1.6429, "step": 21240 }, { "epoch": 1.77, "grad_norm": 3.654700994491577, "learning_rate": 7.975072463768117e-06, "loss": 1.6168, "step": 21250 }, { "epoch": 1.77, "grad_norm": 6.43727445602417, "learning_rate": 7.969275362318841e-06, "loss": 1.6492, "step": 21260 }, { "epoch": 1.77, "grad_norm": 9.46625804901123, "learning_rate": 7.963478260869565e-06, "loss": 1.735, "step": 21270 }, { "epoch": 1.77, "grad_norm": 1.2169768810272217, "learning_rate": 7.957681159420291e-06, "loss": 1.5743, "step": 21280 }, { "epoch": 1.77, "grad_norm": 5.560244560241699, "learning_rate": 7.951884057971015e-06, "loss": 1.5637, "step": 21290 }, { "epoch": 1.77, "grad_norm": 7.38496208190918, "learning_rate": 7.94608695652174e-06, "loss": 1.6104, "step": 21300 }, { "epoch": 1.78, "grad_norm": 3.6433284282684326, "learning_rate": 7.940289855072465e-06, "loss": 1.644, "step": 21310 }, { "epoch": 1.78, "grad_norm": 2.7574374675750732, "learning_rate": 7.934492753623189e-06, "loss": 1.6456, "step": 21320 }, { "epoch": 1.78, "grad_norm": 3.709298610687256, "learning_rate": 7.928695652173913e-06, "loss": 1.6377, "step": 21330 }, { "epoch": 1.78, "grad_norm": 3.04289174079895, "learning_rate": 7.922898550724638e-06, "loss": 1.6613, "step": 21340 }, { "epoch": 1.78, "grad_norm": 4.023538112640381, "learning_rate": 7.917101449275362e-06, "loss": 1.7339, "step": 21350 }, { "epoch": 1.78, "grad_norm": 6.419352054595947, "learning_rate": 7.911304347826088e-06, "loss": 1.7423, "step": 21360 }, { "epoch": 1.78, "grad_norm": 6.251063823699951, "learning_rate": 7.905507246376812e-06, "loss": 1.5134, "step": 21370 }, { "epoch": 1.78, "grad_norm": 2.289116859436035, "learning_rate": 7.899710144927538e-06, "loss": 1.6577, "step": 21380 }, { "epoch": 1.78, "grad_norm": 2.699251651763916, "learning_rate": 7.893913043478262e-06, "loss": 1.6832, "step": 21390 }, { "epoch": 1.78, "grad_norm": 3.5587689876556396, "learning_rate": 7.888115942028986e-06, "loss": 1.6295, "step": 21400 }, { "epoch": 1.78, "grad_norm": 1.560650110244751, "learning_rate": 7.882318840579712e-06, "loss": 1.6037, "step": 21410 }, { "epoch": 1.79, "grad_norm": 2.8277647495269775, "learning_rate": 7.876521739130436e-06, "loss": 1.6725, "step": 21420 }, { "epoch": 1.79, "grad_norm": 2.283670425415039, "learning_rate": 7.87072463768116e-06, "loss": 1.6771, "step": 21430 }, { "epoch": 1.79, "grad_norm": 5.659524917602539, "learning_rate": 7.864927536231885e-06, "loss": 1.7715, "step": 21440 }, { "epoch": 1.79, "grad_norm": 4.301861763000488, "learning_rate": 7.85913043478261e-06, "loss": 1.5021, "step": 21450 }, { "epoch": 1.79, "grad_norm": 3.5721640586853027, "learning_rate": 7.853333333333333e-06, "loss": 1.5547, "step": 21460 }, { "epoch": 1.79, "grad_norm": 3.1962008476257324, "learning_rate": 7.847536231884059e-06, "loss": 1.6434, "step": 21470 }, { "epoch": 1.79, "grad_norm": 1.1904305219650269, "learning_rate": 7.841739130434783e-06, "loss": 1.7324, "step": 21480 }, { "epoch": 1.79, "grad_norm": 6.97566556930542, "learning_rate": 7.835942028985509e-06, "loss": 1.5438, "step": 21490 }, { "epoch": 1.79, "grad_norm": 2.0194718837738037, "learning_rate": 7.830144927536233e-06, "loss": 1.8216, "step": 21500 }, { "epoch": 1.79, "eval_loss": 1.6438640356063843, "eval_runtime": 107.5066, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 21500 }, { "epoch": 1.79, "grad_norm": 2.1328859329223633, "learning_rate": 7.824347826086958e-06, "loss": 1.6511, "step": 21510 }, { "epoch": 1.79, "grad_norm": 16.312145233154297, "learning_rate": 7.81855072463768e-06, "loss": 1.6316, "step": 21520 }, { "epoch": 1.79, "grad_norm": 2.9844260215759277, "learning_rate": 7.812753623188406e-06, "loss": 1.6512, "step": 21530 }, { "epoch": 1.79, "grad_norm": 3.2778491973876953, "learning_rate": 7.80695652173913e-06, "loss": 1.5582, "step": 21540 }, { "epoch": 1.8, "grad_norm": 4.904211044311523, "learning_rate": 7.801159420289856e-06, "loss": 1.55, "step": 21550 }, { "epoch": 1.8, "grad_norm": 4.345255374908447, "learning_rate": 7.79536231884058e-06, "loss": 1.6164, "step": 21560 }, { "epoch": 1.8, "grad_norm": 5.088725566864014, "learning_rate": 7.789565217391306e-06, "loss": 1.5831, "step": 21570 }, { "epoch": 1.8, "grad_norm": 4.778326988220215, "learning_rate": 7.78376811594203e-06, "loss": 1.6626, "step": 21580 }, { "epoch": 1.8, "grad_norm": 3.572787284851074, "learning_rate": 7.777971014492754e-06, "loss": 1.7164, "step": 21590 }, { "epoch": 1.8, "grad_norm": 7.807311534881592, "learning_rate": 7.77217391304348e-06, "loss": 1.5888, "step": 21600 }, { "epoch": 1.8, "grad_norm": 2.578456401824951, "learning_rate": 7.766376811594203e-06, "loss": 1.7378, "step": 21610 }, { "epoch": 1.8, "grad_norm": 3.514986276626587, "learning_rate": 7.76057971014493e-06, "loss": 1.72, "step": 21620 }, { "epoch": 1.8, "grad_norm": 1.335219383239746, "learning_rate": 7.754782608695653e-06, "loss": 1.6237, "step": 21630 }, { "epoch": 1.8, "grad_norm": 4.087151527404785, "learning_rate": 7.748985507246377e-06, "loss": 1.6961, "step": 21640 }, { "epoch": 1.8, "grad_norm": 5.020003318786621, "learning_rate": 7.743188405797101e-06, "loss": 1.5885, "step": 21650 }, { "epoch": 1.81, "grad_norm": 3.0945472717285156, "learning_rate": 7.737391304347827e-06, "loss": 1.656, "step": 21660 }, { "epoch": 1.81, "grad_norm": 10.43069076538086, "learning_rate": 7.731594202898551e-06, "loss": 1.7332, "step": 21670 }, { "epoch": 1.81, "grad_norm": 2.4677326679229736, "learning_rate": 7.725797101449277e-06, "loss": 1.5584, "step": 21680 }, { "epoch": 1.81, "grad_norm": 1.4591469764709473, "learning_rate": 7.72e-06, "loss": 1.777, "step": 21690 }, { "epoch": 1.81, "grad_norm": 2.6109650135040283, "learning_rate": 7.714202898550726e-06, "loss": 1.7946, "step": 21700 }, { "epoch": 1.81, "grad_norm": 2.215765953063965, "learning_rate": 7.70840579710145e-06, "loss": 1.5901, "step": 21710 }, { "epoch": 1.81, "grad_norm": 2.505645751953125, "learning_rate": 7.702608695652174e-06, "loss": 1.6892, "step": 21720 }, { "epoch": 1.81, "grad_norm": 1.5885299444198608, "learning_rate": 7.696811594202898e-06, "loss": 1.7118, "step": 21730 }, { "epoch": 1.81, "grad_norm": 2.334027051925659, "learning_rate": 7.691014492753624e-06, "loss": 1.647, "step": 21740 }, { "epoch": 1.81, "grad_norm": 8.775703430175781, "learning_rate": 7.685217391304348e-06, "loss": 1.6092, "step": 21750 }, { "epoch": 1.81, "grad_norm": 4.955008506774902, "learning_rate": 7.679420289855074e-06, "loss": 1.5739, "step": 21760 }, { "epoch": 1.81, "grad_norm": 4.728649139404297, "learning_rate": 7.673623188405798e-06, "loss": 1.532, "step": 21770 }, { "epoch": 1.81, "grad_norm": 2.9815008640289307, "learning_rate": 7.667826086956522e-06, "loss": 1.6954, "step": 21780 }, { "epoch": 1.82, "grad_norm": 6.196691513061523, "learning_rate": 7.662028985507247e-06, "loss": 1.6589, "step": 21790 }, { "epoch": 1.82, "grad_norm": 6.2602643966674805, "learning_rate": 7.656231884057971e-06, "loss": 1.6467, "step": 21800 }, { "epoch": 1.82, "grad_norm": 2.570413112640381, "learning_rate": 7.650434782608697e-06, "loss": 1.6406, "step": 21810 }, { "epoch": 1.82, "grad_norm": 11.799811363220215, "learning_rate": 7.644637681159421e-06, "loss": 1.7035, "step": 21820 }, { "epoch": 1.82, "grad_norm": 6.307993412017822, "learning_rate": 7.638840579710145e-06, "loss": 1.6075, "step": 21830 }, { "epoch": 1.82, "grad_norm": 4.131196975708008, "learning_rate": 7.63304347826087e-06, "loss": 1.5969, "step": 21840 }, { "epoch": 1.82, "grad_norm": 2.6842682361602783, "learning_rate": 7.627246376811595e-06, "loss": 1.6489, "step": 21850 }, { "epoch": 1.82, "grad_norm": 3.3321924209594727, "learning_rate": 7.621449275362319e-06, "loss": 1.5213, "step": 21860 }, { "epoch": 1.82, "grad_norm": 8.997515678405762, "learning_rate": 7.615652173913044e-06, "loss": 1.7068, "step": 21870 }, { "epoch": 1.82, "grad_norm": 2.286634922027588, "learning_rate": 7.609855072463769e-06, "loss": 1.6143, "step": 21880 }, { "epoch": 1.82, "grad_norm": 5.148158073425293, "learning_rate": 7.6040579710144934e-06, "loss": 1.7376, "step": 21890 }, { "epoch": 1.82, "grad_norm": 4.665143013000488, "learning_rate": 7.598260869565218e-06, "loss": 1.6027, "step": 21900 }, { "epoch": 1.83, "grad_norm": 1.7268582582473755, "learning_rate": 7.592463768115943e-06, "loss": 1.5526, "step": 21910 }, { "epoch": 1.83, "grad_norm": 1.673620581626892, "learning_rate": 7.586666666666668e-06, "loss": 1.7982, "step": 21920 }, { "epoch": 1.83, "grad_norm": 2.819336175918579, "learning_rate": 7.580869565217393e-06, "loss": 1.7283, "step": 21930 }, { "epoch": 1.83, "grad_norm": 6.207809925079346, "learning_rate": 7.575072463768116e-06, "loss": 1.5976, "step": 21940 }, { "epoch": 1.83, "grad_norm": 3.5421364307403564, "learning_rate": 7.569275362318841e-06, "loss": 1.7216, "step": 21950 }, { "epoch": 1.83, "grad_norm": 2.2121992111206055, "learning_rate": 7.563478260869566e-06, "loss": 1.5512, "step": 21960 }, { "epoch": 1.83, "grad_norm": 6.52865743637085, "learning_rate": 7.5576811594202906e-06, "loss": 1.6024, "step": 21970 }, { "epoch": 1.83, "grad_norm": 6.420586109161377, "learning_rate": 7.5518840579710154e-06, "loss": 1.6298, "step": 21980 }, { "epoch": 1.83, "grad_norm": 9.54375171661377, "learning_rate": 7.5460869565217394e-06, "loss": 1.7235, "step": 21990 }, { "epoch": 1.83, "grad_norm": 3.1675570011138916, "learning_rate": 7.540289855072464e-06, "loss": 1.8252, "step": 22000 }, { "epoch": 1.83, "eval_loss": 1.6268221139907837, "eval_runtime": 107.5262, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 22000 }, { "epoch": 1.83, "grad_norm": 5.670650005340576, "learning_rate": 7.534492753623189e-06, "loss": 1.7059, "step": 22010 }, { "epoch": 1.83, "grad_norm": 1.6870843172073364, "learning_rate": 7.528695652173914e-06, "loss": 1.8291, "step": 22020 }, { "epoch": 1.84, "grad_norm": 3.0037596225738525, "learning_rate": 7.522898550724639e-06, "loss": 1.7288, "step": 22030 }, { "epoch": 1.84, "grad_norm": 2.6715378761291504, "learning_rate": 7.517101449275363e-06, "loss": 1.5929, "step": 22040 }, { "epoch": 1.84, "grad_norm": 3.457519769668579, "learning_rate": 7.511304347826087e-06, "loss": 1.4734, "step": 22050 }, { "epoch": 1.84, "grad_norm": 7.1646342277526855, "learning_rate": 7.505507246376812e-06, "loss": 1.7495, "step": 22060 }, { "epoch": 1.84, "grad_norm": 4.050849437713623, "learning_rate": 7.4997101449275366e-06, "loss": 1.667, "step": 22070 }, { "epoch": 1.84, "grad_norm": 3.6044504642486572, "learning_rate": 7.493913043478261e-06, "loss": 1.4592, "step": 22080 }, { "epoch": 1.84, "grad_norm": 1.669327974319458, "learning_rate": 7.488115942028986e-06, "loss": 1.563, "step": 22090 }, { "epoch": 1.84, "grad_norm": 4.431074142456055, "learning_rate": 7.482318840579711e-06, "loss": 1.5672, "step": 22100 }, { "epoch": 1.84, "grad_norm": 6.121443748474121, "learning_rate": 7.476521739130436e-06, "loss": 1.6698, "step": 22110 }, { "epoch": 1.84, "grad_norm": 1.799599289894104, "learning_rate": 7.470724637681161e-06, "loss": 1.6929, "step": 22120 }, { "epoch": 1.84, "grad_norm": 3.3442468643188477, "learning_rate": 7.464927536231885e-06, "loss": 1.7485, "step": 22130 }, { "epoch": 1.84, "grad_norm": 2.691938638687134, "learning_rate": 7.459130434782609e-06, "loss": 1.727, "step": 22140 }, { "epoch": 1.85, "grad_norm": 3.82914400100708, "learning_rate": 7.453333333333334e-06, "loss": 1.5514, "step": 22150 }, { "epoch": 1.85, "grad_norm": 3.622535467147827, "learning_rate": 7.4475362318840585e-06, "loss": 1.7807, "step": 22160 }, { "epoch": 1.85, "grad_norm": 1.5094232559204102, "learning_rate": 7.441739130434783e-06, "loss": 1.7863, "step": 22170 }, { "epoch": 1.85, "grad_norm": 2.016897201538086, "learning_rate": 7.435942028985507e-06, "loss": 1.7699, "step": 22180 }, { "epoch": 1.85, "grad_norm": 2.2854208946228027, "learning_rate": 7.430144927536232e-06, "loss": 1.7514, "step": 22190 }, { "epoch": 1.85, "grad_norm": 8.095964431762695, "learning_rate": 7.424347826086957e-06, "loss": 1.6181, "step": 22200 }, { "epoch": 1.85, "grad_norm": 2.0803048610687256, "learning_rate": 7.418550724637682e-06, "loss": 1.6808, "step": 22210 }, { "epoch": 1.85, "grad_norm": 5.521609306335449, "learning_rate": 7.413333333333333e-06, "loss": 1.641, "step": 22220 }, { "epoch": 1.85, "grad_norm": 4.716726303100586, "learning_rate": 7.407536231884058e-06, "loss": 1.7521, "step": 22230 }, { "epoch": 1.85, "grad_norm": 4.207428455352783, "learning_rate": 7.401739130434783e-06, "loss": 1.6185, "step": 22240 }, { "epoch": 1.85, "grad_norm": 3.840510368347168, "learning_rate": 7.395942028985508e-06, "loss": 1.4995, "step": 22250 }, { "epoch": 1.85, "grad_norm": 2.4733941555023193, "learning_rate": 7.390144927536233e-06, "loss": 1.5198, "step": 22260 }, { "epoch": 1.86, "grad_norm": 3.7702128887176514, "learning_rate": 7.3843478260869575e-06, "loss": 1.543, "step": 22270 }, { "epoch": 1.86, "grad_norm": 3.0251762866973877, "learning_rate": 7.378550724637682e-06, "loss": 1.5978, "step": 22280 }, { "epoch": 1.86, "grad_norm": 7.0895233154296875, "learning_rate": 7.372753623188406e-06, "loss": 1.6949, "step": 22290 }, { "epoch": 1.86, "grad_norm": 2.8260159492492676, "learning_rate": 7.366956521739131e-06, "loss": 1.7792, "step": 22300 }, { "epoch": 1.86, "grad_norm": 3.7007031440734863, "learning_rate": 7.361159420289856e-06, "loss": 1.6979, "step": 22310 }, { "epoch": 1.86, "grad_norm": 4.845219612121582, "learning_rate": 7.35536231884058e-06, "loss": 1.3902, "step": 22320 }, { "epoch": 1.86, "grad_norm": 4.411277770996094, "learning_rate": 7.349565217391305e-06, "loss": 1.6546, "step": 22330 }, { "epoch": 1.86, "grad_norm": 3.6635563373565674, "learning_rate": 7.343768115942029e-06, "loss": 1.7551, "step": 22340 }, { "epoch": 1.86, "grad_norm": 8.20759391784668, "learning_rate": 7.337971014492754e-06, "loss": 1.574, "step": 22350 }, { "epoch": 1.86, "grad_norm": 1.9890128374099731, "learning_rate": 7.332173913043479e-06, "loss": 1.6738, "step": 22360 }, { "epoch": 1.86, "grad_norm": 8.436697959899902, "learning_rate": 7.3263768115942035e-06, "loss": 1.6415, "step": 22370 }, { "epoch": 1.86, "grad_norm": 1.9131646156311035, "learning_rate": 7.320579710144928e-06, "loss": 1.5153, "step": 22380 }, { "epoch": 1.87, "grad_norm": 2.4065988063812256, "learning_rate": 7.314782608695653e-06, "loss": 1.6153, "step": 22390 }, { "epoch": 1.87, "grad_norm": 2.279060125350952, "learning_rate": 7.308985507246378e-06, "loss": 1.6871, "step": 22400 }, { "epoch": 1.87, "grad_norm": 2.864788770675659, "learning_rate": 7.303188405797103e-06, "loss": 1.6629, "step": 22410 }, { "epoch": 1.87, "grad_norm": 6.575710296630859, "learning_rate": 7.297391304347826e-06, "loss": 1.5627, "step": 22420 }, { "epoch": 1.87, "grad_norm": 5.330643653869629, "learning_rate": 7.291594202898551e-06, "loss": 1.7939, "step": 22430 }, { "epoch": 1.87, "grad_norm": 1.9671368598937988, "learning_rate": 7.285797101449276e-06, "loss": 1.6338, "step": 22440 }, { "epoch": 1.87, "grad_norm": 8.052875518798828, "learning_rate": 7.280000000000001e-06, "loss": 1.7247, "step": 22450 }, { "epoch": 1.87, "grad_norm": 8.003440856933594, "learning_rate": 7.2742028985507255e-06, "loss": 1.6002, "step": 22460 }, { "epoch": 1.87, "grad_norm": 15.555729866027832, "learning_rate": 7.26840579710145e-06, "loss": 1.7289, "step": 22470 }, { "epoch": 1.87, "grad_norm": 2.3797450065612793, "learning_rate": 7.262608695652174e-06, "loss": 1.5819, "step": 22480 }, { "epoch": 1.87, "grad_norm": 5.605469226837158, "learning_rate": 7.256811594202899e-06, "loss": 1.6578, "step": 22490 }, { "epoch": 1.88, "grad_norm": 4.348262786865234, "learning_rate": 7.251014492753624e-06, "loss": 1.6862, "step": 22500 }, { "epoch": 1.88, "eval_loss": 1.624435544013977, "eval_runtime": 107.5057, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 22500 }, { "epoch": 1.88, "grad_norm": 1.7244126796722412, "learning_rate": 7.245217391304349e-06, "loss": 1.6801, "step": 22510 }, { "epoch": 1.88, "grad_norm": 3.5117290019989014, "learning_rate": 7.239420289855073e-06, "loss": 1.6471, "step": 22520 }, { "epoch": 1.88, "grad_norm": 3.5990796089172363, "learning_rate": 7.233623188405797e-06, "loss": 1.625, "step": 22530 }, { "epoch": 1.88, "grad_norm": 4.40592622756958, "learning_rate": 7.227826086956522e-06, "loss": 1.6896, "step": 22540 }, { "epoch": 1.88, "grad_norm": 2.524301528930664, "learning_rate": 7.222028985507247e-06, "loss": 1.6493, "step": 22550 }, { "epoch": 1.88, "grad_norm": 3.01413631439209, "learning_rate": 7.2162318840579714e-06, "loss": 1.6456, "step": 22560 }, { "epoch": 1.88, "grad_norm": 2.1266565322875977, "learning_rate": 7.210434782608696e-06, "loss": 1.6527, "step": 22570 }, { "epoch": 1.88, "grad_norm": 1.7390304803848267, "learning_rate": 7.204637681159421e-06, "loss": 1.6855, "step": 22580 }, { "epoch": 1.88, "grad_norm": 2.171137571334839, "learning_rate": 7.198840579710146e-06, "loss": 1.6591, "step": 22590 }, { "epoch": 1.88, "grad_norm": 2.5304112434387207, "learning_rate": 7.193043478260871e-06, "loss": 1.62, "step": 22600 }, { "epoch": 1.88, "grad_norm": 1.766373872756958, "learning_rate": 7.187246376811595e-06, "loss": 1.7098, "step": 22610 }, { "epoch": 1.89, "grad_norm": 2.5186986923217773, "learning_rate": 7.18144927536232e-06, "loss": 1.5517, "step": 22620 }, { "epoch": 1.89, "grad_norm": 1.9023741483688354, "learning_rate": 7.175652173913044e-06, "loss": 1.7088, "step": 22630 }, { "epoch": 1.89, "grad_norm": 6.54264497756958, "learning_rate": 7.1698550724637686e-06, "loss": 1.6399, "step": 22640 }, { "epoch": 1.89, "grad_norm": 7.001004695892334, "learning_rate": 7.164057971014493e-06, "loss": 1.5971, "step": 22650 }, { "epoch": 1.89, "grad_norm": 6.7402238845825195, "learning_rate": 7.1582608695652174e-06, "loss": 1.6201, "step": 22660 }, { "epoch": 1.89, "grad_norm": 5.761517524719238, "learning_rate": 7.152463768115942e-06, "loss": 1.6743, "step": 22670 }, { "epoch": 1.89, "grad_norm": 1.6771907806396484, "learning_rate": 7.146666666666667e-06, "loss": 1.5944, "step": 22680 }, { "epoch": 1.89, "grad_norm": 9.115729331970215, "learning_rate": 7.140869565217392e-06, "loss": 1.6292, "step": 22690 }, { "epoch": 1.89, "grad_norm": 3.5204763412475586, "learning_rate": 7.135072463768117e-06, "loss": 1.6079, "step": 22700 }, { "epoch": 1.89, "grad_norm": 1.9801039695739746, "learning_rate": 7.129275362318842e-06, "loss": 1.6728, "step": 22710 }, { "epoch": 1.89, "grad_norm": 4.535811424255371, "learning_rate": 7.1234782608695665e-06, "loss": 1.5367, "step": 22720 }, { "epoch": 1.89, "grad_norm": 4.705811023712158, "learning_rate": 7.11768115942029e-06, "loss": 1.6359, "step": 22730 }, { "epoch": 1.9, "grad_norm": 5.373073577880859, "learning_rate": 7.1118840579710146e-06, "loss": 1.5461, "step": 22740 }, { "epoch": 1.9, "grad_norm": 2.482778549194336, "learning_rate": 7.106086956521739e-06, "loss": 1.6968, "step": 22750 }, { "epoch": 1.9, "grad_norm": 7.682448387145996, "learning_rate": 7.100289855072464e-06, "loss": 1.6213, "step": 22760 }, { "epoch": 1.9, "grad_norm": 3.669218063354492, "learning_rate": 7.094492753623189e-06, "loss": 1.6408, "step": 22770 }, { "epoch": 1.9, "grad_norm": 2.680028200149536, "learning_rate": 7.088695652173914e-06, "loss": 1.6225, "step": 22780 }, { "epoch": 1.9, "grad_norm": 9.829727172851562, "learning_rate": 7.082898550724638e-06, "loss": 1.5706, "step": 22790 }, { "epoch": 1.9, "grad_norm": 5.793923377990723, "learning_rate": 7.077101449275363e-06, "loss": 1.6649, "step": 22800 }, { "epoch": 1.9, "grad_norm": 3.1820645332336426, "learning_rate": 7.071304347826088e-06, "loss": 1.518, "step": 22810 }, { "epoch": 1.9, "grad_norm": 3.055821418762207, "learning_rate": 7.0655072463768125e-06, "loss": 1.6835, "step": 22820 }, { "epoch": 1.9, "grad_norm": 6.942118167877197, "learning_rate": 7.0597101449275365e-06, "loss": 1.5547, "step": 22830 }, { "epoch": 1.9, "grad_norm": 3.1956920623779297, "learning_rate": 7.053913043478261e-06, "loss": 1.7392, "step": 22840 }, { "epoch": 1.9, "grad_norm": 1.6930139064788818, "learning_rate": 7.048115942028985e-06, "loss": 1.7377, "step": 22850 }, { "epoch": 1.91, "grad_norm": 3.8593344688415527, "learning_rate": 7.04231884057971e-06, "loss": 1.6003, "step": 22860 }, { "epoch": 1.91, "grad_norm": 6.152258396148682, "learning_rate": 7.036521739130435e-06, "loss": 1.671, "step": 22870 }, { "epoch": 1.91, "grad_norm": 2.3631834983825684, "learning_rate": 7.03072463768116e-06, "loss": 1.5857, "step": 22880 }, { "epoch": 1.91, "grad_norm": 1.4059417247772217, "learning_rate": 7.024927536231885e-06, "loss": 1.7629, "step": 22890 }, { "epoch": 1.91, "grad_norm": 1.9621164798736572, "learning_rate": 7.01913043478261e-06, "loss": 1.7076, "step": 22900 }, { "epoch": 1.91, "grad_norm": 0.742729127407074, "learning_rate": 7.0133333333333345e-06, "loss": 1.6253, "step": 22910 }, { "epoch": 1.91, "grad_norm": 3.2152140140533447, "learning_rate": 7.007536231884059e-06, "loss": 1.6246, "step": 22920 }, { "epoch": 1.91, "grad_norm": 3.5290796756744385, "learning_rate": 7.001739130434783e-06, "loss": 1.4322, "step": 22930 }, { "epoch": 1.91, "grad_norm": 5.470486164093018, "learning_rate": 6.995942028985507e-06, "loss": 1.6678, "step": 22940 }, { "epoch": 1.91, "grad_norm": 3.923020124435425, "learning_rate": 6.990144927536232e-06, "loss": 1.5589, "step": 22950 }, { "epoch": 1.91, "grad_norm": 2.6417629718780518, "learning_rate": 6.984347826086957e-06, "loss": 1.5959, "step": 22960 }, { "epoch": 1.91, "grad_norm": 3.574911117553711, "learning_rate": 6.978550724637682e-06, "loss": 1.7439, "step": 22970 }, { "epoch": 1.92, "grad_norm": 1.8391579389572144, "learning_rate": 6.972753623188406e-06, "loss": 1.6204, "step": 22980 }, { "epoch": 1.92, "grad_norm": 3.0558955669403076, "learning_rate": 6.966956521739131e-06, "loss": 1.6833, "step": 22990 }, { "epoch": 1.92, "grad_norm": 6.424144744873047, "learning_rate": 6.961159420289856e-06, "loss": 1.7008, "step": 23000 }, { "epoch": 1.92, "eval_loss": 1.6694403886795044, "eval_runtime": 107.5169, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 23000 }, { "epoch": 1.92, "grad_norm": 2.5954649448394775, "learning_rate": 6.9553623188405805e-06, "loss": 1.6293, "step": 23010 }, { "epoch": 1.92, "grad_norm": 3.2413156032562256, "learning_rate": 6.949565217391305e-06, "loss": 1.6196, "step": 23020 }, { "epoch": 1.92, "grad_norm": 2.4454963207244873, "learning_rate": 6.94376811594203e-06, "loss": 1.689, "step": 23030 }, { "epoch": 1.92, "grad_norm": 3.627885580062866, "learning_rate": 6.937971014492753e-06, "loss": 1.623, "step": 23040 }, { "epoch": 1.92, "grad_norm": 1.9731435775756836, "learning_rate": 6.932173913043478e-06, "loss": 1.5804, "step": 23050 }, { "epoch": 1.92, "grad_norm": 1.2060045003890991, "learning_rate": 6.926376811594203e-06, "loss": 1.5979, "step": 23060 }, { "epoch": 1.92, "grad_norm": 4.452988147735596, "learning_rate": 6.920579710144928e-06, "loss": 1.693, "step": 23070 }, { "epoch": 1.92, "grad_norm": 2.145307779312134, "learning_rate": 6.914782608695653e-06, "loss": 1.7325, "step": 23080 }, { "epoch": 1.92, "grad_norm": 5.082767009735107, "learning_rate": 6.908985507246378e-06, "loss": 1.6337, "step": 23090 }, { "epoch": 1.93, "grad_norm": 3.040972948074341, "learning_rate": 6.9031884057971025e-06, "loss": 1.6223, "step": 23100 }, { "epoch": 1.93, "grad_norm": 1.870182752609253, "learning_rate": 6.8973913043478265e-06, "loss": 1.7026, "step": 23110 }, { "epoch": 1.93, "grad_norm": 0.9211017489433289, "learning_rate": 6.891594202898551e-06, "loss": 1.642, "step": 23120 }, { "epoch": 1.93, "grad_norm": 2.97402286529541, "learning_rate": 6.885797101449276e-06, "loss": 1.6483, "step": 23130 }, { "epoch": 1.93, "grad_norm": 2.9657325744628906, "learning_rate": 6.88e-06, "loss": 1.6167, "step": 23140 }, { "epoch": 1.93, "grad_norm": 4.580372333526611, "learning_rate": 6.874202898550725e-06, "loss": 1.8135, "step": 23150 }, { "epoch": 1.93, "grad_norm": 3.190333843231201, "learning_rate": 6.86840579710145e-06, "loss": 1.669, "step": 23160 }, { "epoch": 1.93, "grad_norm": 9.4367094039917, "learning_rate": 6.862608695652174e-06, "loss": 1.5822, "step": 23170 }, { "epoch": 1.93, "grad_norm": 4.261363506317139, "learning_rate": 6.856811594202899e-06, "loss": 1.5777, "step": 23180 }, { "epoch": 1.93, "grad_norm": 5.0194172859191895, "learning_rate": 6.851014492753624e-06, "loss": 1.7024, "step": 23190 }, { "epoch": 1.93, "grad_norm": 5.917288780212402, "learning_rate": 6.8452173913043485e-06, "loss": 1.565, "step": 23200 }, { "epoch": 1.93, "grad_norm": 3.0884828567504883, "learning_rate": 6.839420289855073e-06, "loss": 1.7616, "step": 23210 }, { "epoch": 1.94, "grad_norm": 8.573901176452637, "learning_rate": 6.833623188405798e-06, "loss": 1.7546, "step": 23220 }, { "epoch": 1.94, "grad_norm": 5.3873114585876465, "learning_rate": 6.827826086956523e-06, "loss": 1.69, "step": 23230 }, { "epoch": 1.94, "grad_norm": 2.1748428344726562, "learning_rate": 6.822028985507248e-06, "loss": 1.5809, "step": 23240 }, { "epoch": 1.94, "grad_norm": 1.9165292978286743, "learning_rate": 6.816231884057971e-06, "loss": 1.6822, "step": 23250 }, { "epoch": 1.94, "grad_norm": 1.971045970916748, "learning_rate": 6.810434782608696e-06, "loss": 1.7906, "step": 23260 }, { "epoch": 1.94, "grad_norm": 7.549450397491455, "learning_rate": 6.804637681159421e-06, "loss": 1.6903, "step": 23270 }, { "epoch": 1.94, "grad_norm": 3.463688611984253, "learning_rate": 6.798840579710146e-06, "loss": 1.601, "step": 23280 }, { "epoch": 1.94, "grad_norm": 2.8097610473632812, "learning_rate": 6.7930434782608704e-06, "loss": 1.6345, "step": 23290 }, { "epoch": 1.94, "grad_norm": 5.737390518188477, "learning_rate": 6.7872463768115945e-06, "loss": 1.5124, "step": 23300 }, { "epoch": 1.94, "grad_norm": 4.170960426330566, "learning_rate": 6.781449275362319e-06, "loss": 1.6134, "step": 23310 }, { "epoch": 1.94, "grad_norm": 2.446510076522827, "learning_rate": 6.775652173913044e-06, "loss": 1.6565, "step": 23320 }, { "epoch": 1.94, "grad_norm": 3.7264490127563477, "learning_rate": 6.769855072463769e-06, "loss": 1.7299, "step": 23330 }, { "epoch": 1.94, "grad_norm": 3.8352794647216797, "learning_rate": 6.764057971014494e-06, "loss": 1.6823, "step": 23340 }, { "epoch": 1.95, "grad_norm": 4.742753982543945, "learning_rate": 6.758260869565217e-06, "loss": 1.6586, "step": 23350 }, { "epoch": 1.95, "grad_norm": 2.161562204360962, "learning_rate": 6.752463768115942e-06, "loss": 1.5763, "step": 23360 }, { "epoch": 1.95, "grad_norm": 5.118688583374023, "learning_rate": 6.746666666666667e-06, "loss": 1.6764, "step": 23370 }, { "epoch": 1.95, "grad_norm": 2.455047369003296, "learning_rate": 6.740869565217392e-06, "loss": 1.7486, "step": 23380 }, { "epoch": 1.95, "grad_norm": 3.586467742919922, "learning_rate": 6.7350724637681164e-06, "loss": 1.6002, "step": 23390 }, { "epoch": 1.95, "grad_norm": 1.8399118185043335, "learning_rate": 6.729275362318841e-06, "loss": 1.617, "step": 23400 }, { "epoch": 1.95, "grad_norm": 2.191316604614258, "learning_rate": 6.723478260869566e-06, "loss": 1.6777, "step": 23410 }, { "epoch": 1.95, "grad_norm": 3.110720634460449, "learning_rate": 6.717681159420291e-06, "loss": 1.627, "step": 23420 }, { "epoch": 1.95, "grad_norm": 6.99454927444458, "learning_rate": 6.711884057971015e-06, "loss": 1.6708, "step": 23430 }, { "epoch": 1.95, "grad_norm": 4.553925514221191, "learning_rate": 6.70608695652174e-06, "loss": 1.5284, "step": 23440 }, { "epoch": 1.95, "grad_norm": 5.1461501121521, "learning_rate": 6.700289855072464e-06, "loss": 1.6239, "step": 23450 }, { "epoch": 1.96, "grad_norm": 2.479620933532715, "learning_rate": 6.694492753623189e-06, "loss": 1.7803, "step": 23460 }, { "epoch": 1.96, "grad_norm": 4.193335056304932, "learning_rate": 6.6886956521739136e-06, "loss": 1.726, "step": 23470 }, { "epoch": 1.96, "grad_norm": 5.909477710723877, "learning_rate": 6.682898550724638e-06, "loss": 1.7277, "step": 23480 }, { "epoch": 1.96, "grad_norm": 1.2803231477737427, "learning_rate": 6.677101449275362e-06, "loss": 1.6677, "step": 23490 }, { "epoch": 1.96, "grad_norm": 2.0764338970184326, "learning_rate": 6.671304347826087e-06, "loss": 1.6192, "step": 23500 }, { "epoch": 1.96, "eval_loss": 1.663988471031189, "eval_runtime": 107.5268, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 23500 }, { "epoch": 1.96, "grad_norm": 4.041726112365723, "learning_rate": 6.665507246376812e-06, "loss": 1.6856, "step": 23510 }, { "epoch": 1.96, "grad_norm": 7.944361209869385, "learning_rate": 6.659710144927537e-06, "loss": 1.7495, "step": 23520 }, { "epoch": 1.96, "grad_norm": 2.918194532394409, "learning_rate": 6.653913043478262e-06, "loss": 1.6635, "step": 23530 }, { "epoch": 1.96, "grad_norm": 8.257862091064453, "learning_rate": 6.648115942028987e-06, "loss": 1.717, "step": 23540 }, { "epoch": 1.96, "grad_norm": 6.422058582305908, "learning_rate": 6.6423188405797115e-06, "loss": 1.6294, "step": 23550 }, { "epoch": 1.96, "grad_norm": 5.261116027832031, "learning_rate": 6.636521739130435e-06, "loss": 1.712, "step": 23560 }, { "epoch": 1.96, "grad_norm": 2.29148006439209, "learning_rate": 6.6307246376811595e-06, "loss": 1.7626, "step": 23570 }, { "epoch": 1.96, "grad_norm": 4.524306774139404, "learning_rate": 6.624927536231884e-06, "loss": 1.4419, "step": 23580 }, { "epoch": 1.97, "grad_norm": 5.170078277587891, "learning_rate": 6.619130434782609e-06, "loss": 1.7131, "step": 23590 }, { "epoch": 1.97, "grad_norm": 7.53287410736084, "learning_rate": 6.613333333333334e-06, "loss": 1.6708, "step": 23600 }, { "epoch": 1.97, "grad_norm": 3.6637678146362305, "learning_rate": 6.607536231884059e-06, "loss": 1.8157, "step": 23610 }, { "epoch": 1.97, "grad_norm": 1.637990117073059, "learning_rate": 6.601739130434783e-06, "loss": 1.5326, "step": 23620 }, { "epoch": 1.97, "grad_norm": 5.433206081390381, "learning_rate": 6.595942028985508e-06, "loss": 1.8064, "step": 23630 }, { "epoch": 1.97, "grad_norm": 4.04767370223999, "learning_rate": 6.590144927536233e-06, "loss": 1.5902, "step": 23640 }, { "epoch": 1.97, "grad_norm": 3.667262554168701, "learning_rate": 6.5843478260869575e-06, "loss": 1.5682, "step": 23650 }, { "epoch": 1.97, "grad_norm": 3.9290482997894287, "learning_rate": 6.5785507246376815e-06, "loss": 1.6237, "step": 23660 }, { "epoch": 1.97, "grad_norm": 3.297651529312134, "learning_rate": 6.5727536231884055e-06, "loss": 1.537, "step": 23670 }, { "epoch": 1.97, "grad_norm": 3.4150543212890625, "learning_rate": 6.56695652173913e-06, "loss": 1.6064, "step": 23680 }, { "epoch": 1.97, "grad_norm": 1.4117975234985352, "learning_rate": 6.561159420289855e-06, "loss": 1.6012, "step": 23690 }, { "epoch": 1.98, "grad_norm": 4.487409591674805, "learning_rate": 6.55536231884058e-06, "loss": 1.6483, "step": 23700 }, { "epoch": 1.98, "grad_norm": 2.014012575149536, "learning_rate": 6.549565217391305e-06, "loss": 1.592, "step": 23710 }, { "epoch": 1.98, "grad_norm": 2.575296401977539, "learning_rate": 6.54376811594203e-06, "loss": 1.574, "step": 23720 }, { "epoch": 1.98, "grad_norm": 1.4668456315994263, "learning_rate": 6.537971014492755e-06, "loss": 1.6368, "step": 23730 }, { "epoch": 1.98, "grad_norm": 6.705700874328613, "learning_rate": 6.5321739130434795e-06, "loss": 1.6754, "step": 23740 }, { "epoch": 1.98, "grad_norm": 2.414182424545288, "learning_rate": 6.5263768115942035e-06, "loss": 1.5873, "step": 23750 }, { "epoch": 1.98, "grad_norm": 3.606837749481201, "learning_rate": 6.5205797101449275e-06, "loss": 1.7331, "step": 23760 }, { "epoch": 1.98, "grad_norm": 6.832721710205078, "learning_rate": 6.514782608695652e-06, "loss": 1.668, "step": 23770 }, { "epoch": 1.98, "grad_norm": 3.788400173187256, "learning_rate": 6.508985507246377e-06, "loss": 1.5374, "step": 23780 }, { "epoch": 1.98, "grad_norm": 3.2363767623901367, "learning_rate": 6.503188405797102e-06, "loss": 1.6609, "step": 23790 }, { "epoch": 1.98, "grad_norm": 1.4389679431915283, "learning_rate": 6.497391304347826e-06, "loss": 1.6405, "step": 23800 }, { "epoch": 1.98, "grad_norm": 4.902224540710449, "learning_rate": 6.491594202898551e-06, "loss": 1.6073, "step": 23810 }, { "epoch": 1.98, "grad_norm": 3.991961717605591, "learning_rate": 6.485797101449276e-06, "loss": 1.6646, "step": 23820 }, { "epoch": 1.99, "grad_norm": 4.761397838592529, "learning_rate": 6.480000000000001e-06, "loss": 1.6553, "step": 23830 }, { "epoch": 1.99, "grad_norm": 3.5798428058624268, "learning_rate": 6.4742028985507255e-06, "loss": 1.6409, "step": 23840 }, { "epoch": 1.99, "grad_norm": 1.4175989627838135, "learning_rate": 6.46840579710145e-06, "loss": 1.7199, "step": 23850 }, { "epoch": 1.99, "grad_norm": 2.976370334625244, "learning_rate": 6.462608695652175e-06, "loss": 1.6925, "step": 23860 }, { "epoch": 1.99, "grad_norm": 5.479163646697998, "learning_rate": 6.456811594202898e-06, "loss": 1.6045, "step": 23870 }, { "epoch": 1.99, "grad_norm": 2.5710270404815674, "learning_rate": 6.451014492753623e-06, "loss": 1.6237, "step": 23880 }, { "epoch": 1.99, "grad_norm": 7.615454196929932, "learning_rate": 6.445217391304348e-06, "loss": 1.7599, "step": 23890 }, { "epoch": 1.99, "grad_norm": 7.336389064788818, "learning_rate": 6.439420289855073e-06, "loss": 1.4353, "step": 23900 }, { "epoch": 1.99, "grad_norm": 8.621050834655762, "learning_rate": 6.433623188405798e-06, "loss": 1.6964, "step": 23910 }, { "epoch": 1.99, "grad_norm": 6.770620346069336, "learning_rate": 6.427826086956523e-06, "loss": 1.467, "step": 23920 }, { "epoch": 1.99, "grad_norm": 4.004281520843506, "learning_rate": 6.4220289855072475e-06, "loss": 1.6323, "step": 23930 }, { "epoch": 2.0, "grad_norm": 3.508981704711914, "learning_rate": 6.4162318840579715e-06, "loss": 1.648, "step": 23940 }, { "epoch": 2.0, "grad_norm": 6.081500053405762, "learning_rate": 6.410434782608696e-06, "loss": 1.5768, "step": 23950 }, { "epoch": 2.0, "grad_norm": 6.746595859527588, "learning_rate": 6.404637681159421e-06, "loss": 1.6595, "step": 23960 }, { "epoch": 2.0, "grad_norm": 3.1742167472839355, "learning_rate": 6.398840579710145e-06, "loss": 1.6841, "step": 23970 }, { "epoch": 2.0, "grad_norm": 3.1276462078094482, "learning_rate": 6.39304347826087e-06, "loss": 1.5213, "step": 23980 }, { "epoch": 2.0, "grad_norm": 3.458132028579712, "learning_rate": 6.387246376811594e-06, "loss": 1.5896, "step": 23990 }, { "epoch": 2.0, "grad_norm": 5.596000671386719, "learning_rate": 6.381449275362319e-06, "loss": 1.6449, "step": 24000 }, { "epoch": 2.0, "eval_loss": 1.6730765104293823, "eval_runtime": 107.5278, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 24000 }, { "epoch": 2.0, "grad_norm": 4.354638576507568, "learning_rate": 6.375652173913044e-06, "loss": 1.5624, "step": 24010 }, { "epoch": 2.0, "grad_norm": 1.6478346586227417, "learning_rate": 6.369855072463769e-06, "loss": 1.7343, "step": 24020 }, { "epoch": 2.0, "grad_norm": 4.3177080154418945, "learning_rate": 6.3640579710144935e-06, "loss": 1.4786, "step": 24030 }, { "epoch": 2.0, "grad_norm": 6.199551582336426, "learning_rate": 6.358260869565218e-06, "loss": 1.5448, "step": 24040 }, { "epoch": 2.0, "grad_norm": 6.634474754333496, "learning_rate": 6.352463768115943e-06, "loss": 1.4954, "step": 24050 }, { "epoch": 2.0, "grad_norm": 2.2182674407958984, "learning_rate": 6.346666666666668e-06, "loss": 1.6519, "step": 24060 }, { "epoch": 2.01, "grad_norm": 12.482495307922363, "learning_rate": 6.340869565217391e-06, "loss": 1.5864, "step": 24070 }, { "epoch": 2.01, "grad_norm": 4.257279872894287, "learning_rate": 6.335072463768116e-06, "loss": 1.4682, "step": 24080 }, { "epoch": 2.01, "grad_norm": 5.081076622009277, "learning_rate": 6.329275362318841e-06, "loss": 1.5972, "step": 24090 }, { "epoch": 2.01, "grad_norm": 4.3276896476745605, "learning_rate": 6.323478260869566e-06, "loss": 1.658, "step": 24100 }, { "epoch": 2.01, "grad_norm": 2.144803524017334, "learning_rate": 6.317681159420291e-06, "loss": 1.4938, "step": 24110 }, { "epoch": 2.01, "grad_norm": 11.347249031066895, "learning_rate": 6.311884057971015e-06, "loss": 1.6704, "step": 24120 }, { "epoch": 2.01, "grad_norm": 5.875922679901123, "learning_rate": 6.3060869565217394e-06, "loss": 1.5384, "step": 24130 }, { "epoch": 2.01, "grad_norm": 2.232929229736328, "learning_rate": 6.300289855072464e-06, "loss": 1.6183, "step": 24140 }, { "epoch": 2.01, "grad_norm": 2.2293903827667236, "learning_rate": 6.294492753623189e-06, "loss": 1.4965, "step": 24150 }, { "epoch": 2.01, "grad_norm": 7.873164653778076, "learning_rate": 6.288695652173914e-06, "loss": 1.6817, "step": 24160 }, { "epoch": 2.01, "grad_norm": 6.767528057098389, "learning_rate": 6.282898550724639e-06, "loss": 1.7035, "step": 24170 }, { "epoch": 2.02, "grad_norm": 2.819685220718384, "learning_rate": 6.277101449275362e-06, "loss": 1.6476, "step": 24180 }, { "epoch": 2.02, "grad_norm": 8.485639572143555, "learning_rate": 6.271304347826087e-06, "loss": 1.6737, "step": 24190 }, { "epoch": 2.02, "grad_norm": 4.753501892089844, "learning_rate": 6.265507246376812e-06, "loss": 1.5851, "step": 24200 }, { "epoch": 2.02, "grad_norm": 2.462150812149048, "learning_rate": 6.2597101449275366e-06, "loss": 1.5416, "step": 24210 }, { "epoch": 2.02, "grad_norm": 4.648767948150635, "learning_rate": 6.2539130434782614e-06, "loss": 1.6021, "step": 24220 }, { "epoch": 2.02, "grad_norm": 4.275688648223877, "learning_rate": 6.248115942028986e-06, "loss": 1.6221, "step": 24230 }, { "epoch": 2.02, "grad_norm": 5.949949264526367, "learning_rate": 6.242318840579711e-06, "loss": 1.5643, "step": 24240 }, { "epoch": 2.02, "grad_norm": 0.8903957605361938, "learning_rate": 6.236521739130436e-06, "loss": 1.8374, "step": 24250 }, { "epoch": 2.02, "grad_norm": 12.835528373718262, "learning_rate": 6.23072463768116e-06, "loss": 1.7176, "step": 24260 }, { "epoch": 2.02, "grad_norm": 2.02567195892334, "learning_rate": 6.224927536231885e-06, "loss": 1.7012, "step": 24270 }, { "epoch": 2.02, "grad_norm": 2.2769625186920166, "learning_rate": 6.219130434782609e-06, "loss": 1.6408, "step": 24280 }, { "epoch": 2.02, "grad_norm": 4.010924339294434, "learning_rate": 6.213333333333334e-06, "loss": 1.7554, "step": 24290 }, { "epoch": 2.02, "grad_norm": 3.4019651412963867, "learning_rate": 6.2075362318840586e-06, "loss": 1.6405, "step": 24300 }, { "epoch": 2.03, "grad_norm": 12.412097930908203, "learning_rate": 6.2017391304347826e-06, "loss": 1.5947, "step": 24310 }, { "epoch": 2.03, "grad_norm": 5.544475555419922, "learning_rate": 6.195942028985507e-06, "loss": 1.6161, "step": 24320 }, { "epoch": 2.03, "grad_norm": 4.096646308898926, "learning_rate": 6.190144927536232e-06, "loss": 1.5265, "step": 24330 }, { "epoch": 2.03, "grad_norm": 6.934004783630371, "learning_rate": 6.184347826086957e-06, "loss": 1.6078, "step": 24340 }, { "epoch": 2.03, "grad_norm": 3.017855405807495, "learning_rate": 6.178550724637682e-06, "loss": 1.7315, "step": 24350 }, { "epoch": 2.03, "grad_norm": 4.79693603515625, "learning_rate": 6.172753623188407e-06, "loss": 1.6719, "step": 24360 }, { "epoch": 2.03, "grad_norm": 2.320878267288208, "learning_rate": 6.166956521739132e-06, "loss": 1.7665, "step": 24370 }, { "epoch": 2.03, "grad_norm": 1.4704517126083374, "learning_rate": 6.161159420289855e-06, "loss": 1.6487, "step": 24380 }, { "epoch": 2.03, "grad_norm": 1.9392638206481934, "learning_rate": 6.15536231884058e-06, "loss": 1.6703, "step": 24390 }, { "epoch": 2.03, "grad_norm": 1.3862284421920776, "learning_rate": 6.1495652173913045e-06, "loss": 1.5763, "step": 24400 }, { "epoch": 2.03, "grad_norm": 2.0127170085906982, "learning_rate": 6.143768115942029e-06, "loss": 1.6736, "step": 24410 }, { "epoch": 2.04, "grad_norm": 2.7247025966644287, "learning_rate": 6.137971014492754e-06, "loss": 1.5715, "step": 24420 }, { "epoch": 2.04, "grad_norm": 1.8624944686889648, "learning_rate": 6.132173913043479e-06, "loss": 1.7204, "step": 24430 }, { "epoch": 2.04, "grad_norm": 9.080060005187988, "learning_rate": 6.126376811594203e-06, "loss": 1.6873, "step": 24440 }, { "epoch": 2.04, "grad_norm": 5.190478801727295, "learning_rate": 6.120579710144928e-06, "loss": 1.6602, "step": 24450 }, { "epoch": 2.04, "grad_norm": 2.000903606414795, "learning_rate": 6.114782608695653e-06, "loss": 1.6189, "step": 24460 }, { "epoch": 2.04, "grad_norm": 2.4447052478790283, "learning_rate": 6.108985507246378e-06, "loss": 1.6426, "step": 24470 }, { "epoch": 2.04, "grad_norm": 2.6354033946990967, "learning_rate": 6.1031884057971025e-06, "loss": 1.5703, "step": 24480 }, { "epoch": 2.04, "grad_norm": 3.4452903270721436, "learning_rate": 6.0973913043478265e-06, "loss": 1.6759, "step": 24490 }, { "epoch": 2.04, "grad_norm": 3.570955991744995, "learning_rate": 6.0915942028985505e-06, "loss": 1.7965, "step": 24500 }, { "epoch": 2.04, "eval_loss": 1.6831989288330078, "eval_runtime": 107.5466, "eval_samples_per_second": 9.298, "eval_steps_per_second": 2.325, "step": 24500 }, { "epoch": 2.04, "grad_norm": 3.475395917892456, "learning_rate": 6.085797101449275e-06, "loss": 1.5667, "step": 24510 }, { "epoch": 2.04, "grad_norm": 4.197578430175781, "learning_rate": 6.08e-06, "loss": 1.443, "step": 24520 }, { "epoch": 2.04, "grad_norm": 3.9446558952331543, "learning_rate": 6.074202898550725e-06, "loss": 1.6287, "step": 24530 }, { "epoch": 2.04, "grad_norm": 2.6334280967712402, "learning_rate": 6.06840579710145e-06, "loss": 1.6424, "step": 24540 }, { "epoch": 2.05, "grad_norm": 3.000389575958252, "learning_rate": 6.062608695652175e-06, "loss": 1.5955, "step": 24550 }, { "epoch": 2.05, "grad_norm": 2.637186050415039, "learning_rate": 6.0568115942029e-06, "loss": 1.5686, "step": 24560 }, { "epoch": 2.05, "grad_norm": 2.8098504543304443, "learning_rate": 6.051014492753624e-06, "loss": 1.6036, "step": 24570 }, { "epoch": 2.05, "grad_norm": 4.713962078094482, "learning_rate": 6.0452173913043485e-06, "loss": 1.7327, "step": 24580 }, { "epoch": 2.05, "grad_norm": 3.344076156616211, "learning_rate": 6.0394202898550725e-06, "loss": 1.7235, "step": 24590 }, { "epoch": 2.05, "grad_norm": 6.279098033905029, "learning_rate": 6.0342028985507255e-06, "loss": 1.7064, "step": 24600 }, { "epoch": 2.05, "grad_norm": 3.025627851486206, "learning_rate": 6.0284057971014495e-06, "loss": 1.6894, "step": 24610 }, { "epoch": 2.05, "grad_norm": 22.400253295898438, "learning_rate": 6.022608695652174e-06, "loss": 1.6231, "step": 24620 }, { "epoch": 2.05, "grad_norm": 3.277951955795288, "learning_rate": 6.016811594202899e-06, "loss": 1.6483, "step": 24630 }, { "epoch": 2.05, "grad_norm": 3.1129698753356934, "learning_rate": 6.011014492753624e-06, "loss": 1.6069, "step": 24640 }, { "epoch": 2.05, "grad_norm": 1.6595231294631958, "learning_rate": 6.005217391304349e-06, "loss": 1.5796, "step": 24650 }, { "epoch": 2.06, "grad_norm": 4.5862345695495605, "learning_rate": 5.999420289855072e-06, "loss": 1.6568, "step": 24660 }, { "epoch": 2.06, "grad_norm": 4.170677661895752, "learning_rate": 5.993623188405797e-06, "loss": 1.608, "step": 24670 }, { "epoch": 2.06, "grad_norm": 5.7852559089660645, "learning_rate": 5.987826086956522e-06, "loss": 1.6639, "step": 24680 }, { "epoch": 2.06, "grad_norm": 2.2292609214782715, "learning_rate": 5.982028985507247e-06, "loss": 1.7244, "step": 24690 }, { "epoch": 2.06, "grad_norm": 1.7636544704437256, "learning_rate": 5.9762318840579715e-06, "loss": 1.6089, "step": 24700 }, { "epoch": 2.06, "grad_norm": 2.0442698001861572, "learning_rate": 5.970434782608696e-06, "loss": 1.6019, "step": 24710 }, { "epoch": 2.06, "grad_norm": 5.134725093841553, "learning_rate": 5.964637681159421e-06, "loss": 1.5547, "step": 24720 }, { "epoch": 2.06, "grad_norm": 4.921815872192383, "learning_rate": 5.958840579710146e-06, "loss": 1.5884, "step": 24730 }, { "epoch": 2.06, "grad_norm": 2.0440871715545654, "learning_rate": 5.95304347826087e-06, "loss": 1.7173, "step": 24740 }, { "epoch": 2.06, "grad_norm": 9.600371360778809, "learning_rate": 5.947246376811595e-06, "loss": 1.6929, "step": 24750 }, { "epoch": 2.06, "grad_norm": 5.437139987945557, "learning_rate": 5.94144927536232e-06, "loss": 1.5867, "step": 24760 }, { "epoch": 2.06, "grad_norm": 2.7451493740081787, "learning_rate": 5.935652173913044e-06, "loss": 1.5753, "step": 24770 }, { "epoch": 2.06, "grad_norm": 5.03071928024292, "learning_rate": 5.929855072463769e-06, "loss": 1.7336, "step": 24780 }, { "epoch": 2.07, "grad_norm": 5.349041938781738, "learning_rate": 5.924057971014493e-06, "loss": 1.5073, "step": 24790 }, { "epoch": 2.07, "grad_norm": 2.7859678268432617, "learning_rate": 5.9182608695652174e-06, "loss": 1.5372, "step": 24800 }, { "epoch": 2.07, "grad_norm": 2.020723819732666, "learning_rate": 5.912463768115942e-06, "loss": 1.4775, "step": 24810 }, { "epoch": 2.07, "grad_norm": 5.465567111968994, "learning_rate": 5.906666666666667e-06, "loss": 1.6493, "step": 24820 }, { "epoch": 2.07, "grad_norm": 5.622159004211426, "learning_rate": 5.900869565217392e-06, "loss": 1.5543, "step": 24830 }, { "epoch": 2.07, "grad_norm": 3.285824775695801, "learning_rate": 5.895072463768117e-06, "loss": 1.5926, "step": 24840 }, { "epoch": 2.07, "grad_norm": 18.595064163208008, "learning_rate": 5.889275362318842e-06, "loss": 1.5055, "step": 24850 }, { "epoch": 2.07, "grad_norm": 1.7584556341171265, "learning_rate": 5.8834782608695666e-06, "loss": 1.6876, "step": 24860 }, { "epoch": 2.07, "grad_norm": 9.670427322387695, "learning_rate": 5.87768115942029e-06, "loss": 1.5141, "step": 24870 }, { "epoch": 2.07, "grad_norm": 1.9229793548583984, "learning_rate": 5.8718840579710146e-06, "loss": 1.5989, "step": 24880 }, { "epoch": 2.07, "grad_norm": 2.915713310241699, "learning_rate": 5.866086956521739e-06, "loss": 1.6928, "step": 24890 }, { "epoch": 2.08, "grad_norm": 2.840363025665283, "learning_rate": 5.860289855072464e-06, "loss": 1.6718, "step": 24900 }, { "epoch": 2.08, "grad_norm": 9.295182228088379, "learning_rate": 5.854492753623189e-06, "loss": 1.6016, "step": 24910 }, { "epoch": 2.08, "grad_norm": 3.412475109100342, "learning_rate": 5.848695652173913e-06, "loss": 1.6354, "step": 24920 }, { "epoch": 2.08, "grad_norm": 8.369576454162598, "learning_rate": 5.842898550724638e-06, "loss": 1.6169, "step": 24930 }, { "epoch": 2.08, "grad_norm": 1.795441746711731, "learning_rate": 5.837101449275363e-06, "loss": 1.6285, "step": 24940 }, { "epoch": 2.08, "grad_norm": 4.794939994812012, "learning_rate": 5.831304347826088e-06, "loss": 1.5579, "step": 24950 }, { "epoch": 2.08, "grad_norm": 1.6949840784072876, "learning_rate": 5.8255072463768125e-06, "loss": 1.6978, "step": 24960 }, { "epoch": 2.08, "grad_norm": 3.829167366027832, "learning_rate": 5.8197101449275366e-06, "loss": 1.6054, "step": 24970 }, { "epoch": 2.08, "grad_norm": 5.521337509155273, "learning_rate": 5.8139130434782606e-06, "loss": 1.6029, "step": 24980 }, { "epoch": 2.08, "grad_norm": 7.663822174072266, "learning_rate": 5.808115942028985e-06, "loss": 1.5795, "step": 24990 }, { "epoch": 2.08, "grad_norm": 2.8638572692871094, "learning_rate": 5.80231884057971e-06, "loss": 1.715, "step": 25000 }, { "epoch": 2.08, "eval_loss": 1.62440824508667, "eval_runtime": 107.4912, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 25000 }, { "epoch": 2.08, "grad_norm": 2.584785223007202, "learning_rate": 5.796521739130435e-06, "loss": 1.6872, "step": 25010 }, { "epoch": 2.08, "grad_norm": 2.3847389221191406, "learning_rate": 5.79072463768116e-06, "loss": 1.6624, "step": 25020 }, { "epoch": 2.09, "grad_norm": 4.764808654785156, "learning_rate": 5.784927536231885e-06, "loss": 1.682, "step": 25030 }, { "epoch": 2.09, "grad_norm": 4.870251178741455, "learning_rate": 5.77913043478261e-06, "loss": 1.4977, "step": 25040 }, { "epoch": 2.09, "grad_norm": 4.905124187469482, "learning_rate": 5.7733333333333345e-06, "loss": 1.573, "step": 25050 }, { "epoch": 2.09, "grad_norm": 11.395575523376465, "learning_rate": 5.7675362318840585e-06, "loss": 1.6053, "step": 25060 }, { "epoch": 2.09, "grad_norm": 2.989179849624634, "learning_rate": 5.761739130434783e-06, "loss": 1.6168, "step": 25070 }, { "epoch": 2.09, "grad_norm": 4.6165571212768555, "learning_rate": 5.755942028985507e-06, "loss": 1.6171, "step": 25080 }, { "epoch": 2.09, "grad_norm": 5.178078651428223, "learning_rate": 5.750144927536232e-06, "loss": 1.6461, "step": 25090 }, { "epoch": 2.09, "grad_norm": 2.245847702026367, "learning_rate": 5.744347826086957e-06, "loss": 1.581, "step": 25100 }, { "epoch": 2.09, "grad_norm": 8.20195484161377, "learning_rate": 5.738550724637681e-06, "loss": 1.6275, "step": 25110 }, { "epoch": 2.09, "grad_norm": 1.7283331155776978, "learning_rate": 5.732753623188406e-06, "loss": 1.6411, "step": 25120 }, { "epoch": 2.09, "grad_norm": 2.998704195022583, "learning_rate": 5.726956521739131e-06, "loss": 1.6231, "step": 25130 }, { "epoch": 2.1, "grad_norm": 2.490156412124634, "learning_rate": 5.721159420289856e-06, "loss": 1.6043, "step": 25140 }, { "epoch": 2.1, "grad_norm": 2.0524210929870605, "learning_rate": 5.7153623188405805e-06, "loss": 1.7545, "step": 25150 }, { "epoch": 2.1, "grad_norm": 9.426039695739746, "learning_rate": 5.709565217391305e-06, "loss": 1.5325, "step": 25160 }, { "epoch": 2.1, "grad_norm": 2.069500207901001, "learning_rate": 5.70376811594203e-06, "loss": 1.6505, "step": 25170 }, { "epoch": 2.1, "grad_norm": 6.134079933166504, "learning_rate": 5.697971014492753e-06, "loss": 1.5572, "step": 25180 }, { "epoch": 2.1, "grad_norm": 3.0880463123321533, "learning_rate": 5.692173913043478e-06, "loss": 1.6467, "step": 25190 }, { "epoch": 2.1, "grad_norm": 5.196481227874756, "learning_rate": 5.686376811594203e-06, "loss": 1.5883, "step": 25200 }, { "epoch": 2.1, "grad_norm": 3.574937343597412, "learning_rate": 5.680579710144928e-06, "loss": 1.7104, "step": 25210 }, { "epoch": 2.1, "grad_norm": 7.061581611633301, "learning_rate": 5.674782608695653e-06, "loss": 1.6283, "step": 25220 }, { "epoch": 2.1, "grad_norm": 4.538878917694092, "learning_rate": 5.668985507246378e-06, "loss": 1.5797, "step": 25230 }, { "epoch": 2.1, "grad_norm": 5.407559394836426, "learning_rate": 5.663188405797102e-06, "loss": 1.6253, "step": 25240 }, { "epoch": 2.1, "grad_norm": 4.123929500579834, "learning_rate": 5.6573913043478265e-06, "loss": 1.7386, "step": 25250 }, { "epoch": 2.1, "grad_norm": 10.668411254882812, "learning_rate": 5.651594202898551e-06, "loss": 1.6129, "step": 25260 }, { "epoch": 2.11, "grad_norm": 7.001271724700928, "learning_rate": 5.645797101449276e-06, "loss": 1.6302, "step": 25270 }, { "epoch": 2.11, "grad_norm": 9.053343772888184, "learning_rate": 5.64e-06, "loss": 1.7988, "step": 25280 }, { "epoch": 2.11, "grad_norm": 2.4710533618927, "learning_rate": 5.634202898550725e-06, "loss": 1.6616, "step": 25290 }, { "epoch": 2.11, "grad_norm": 3.005622148513794, "learning_rate": 5.628405797101449e-06, "loss": 1.5469, "step": 25300 }, { "epoch": 2.11, "grad_norm": 5.569432258605957, "learning_rate": 5.623188405797102e-06, "loss": 1.5047, "step": 25310 }, { "epoch": 2.11, "grad_norm": 2.52546763420105, "learning_rate": 5.617391304347827e-06, "loss": 1.6252, "step": 25320 }, { "epoch": 2.11, "grad_norm": 2.454789400100708, "learning_rate": 5.611594202898552e-06, "loss": 1.6849, "step": 25330 }, { "epoch": 2.11, "grad_norm": 6.595395565032959, "learning_rate": 5.605797101449277e-06, "loss": 1.5866, "step": 25340 }, { "epoch": 2.11, "grad_norm": 1.6814996004104614, "learning_rate": 5.600000000000001e-06, "loss": 1.6539, "step": 25350 }, { "epoch": 2.11, "grad_norm": 1.3363173007965088, "learning_rate": 5.594202898550725e-06, "loss": 1.7953, "step": 25360 }, { "epoch": 2.11, "grad_norm": 4.2266364097595215, "learning_rate": 5.5884057971014495e-06, "loss": 1.459, "step": 25370 }, { "epoch": 2.12, "grad_norm": 8.90019702911377, "learning_rate": 5.582608695652174e-06, "loss": 1.5548, "step": 25380 }, { "epoch": 2.12, "grad_norm": 8.731317520141602, "learning_rate": 5.576811594202899e-06, "loss": 1.6977, "step": 25390 }, { "epoch": 2.12, "grad_norm": 3.7743234634399414, "learning_rate": 5.571014492753624e-06, "loss": 1.632, "step": 25400 }, { "epoch": 2.12, "grad_norm": 3.33520245552063, "learning_rate": 5.565217391304348e-06, "loss": 1.634, "step": 25410 }, { "epoch": 2.12, "grad_norm": 8.20616340637207, "learning_rate": 5.559420289855073e-06, "loss": 1.6429, "step": 25420 }, { "epoch": 2.12, "grad_norm": 4.349891662597656, "learning_rate": 5.553623188405798e-06, "loss": 1.5035, "step": 25430 }, { "epoch": 2.12, "grad_norm": 2.2954518795013428, "learning_rate": 5.5478260869565226e-06, "loss": 1.6812, "step": 25440 }, { "epoch": 2.12, "grad_norm": 7.884774684906006, "learning_rate": 5.5420289855072474e-06, "loss": 1.6622, "step": 25450 }, { "epoch": 2.12, "grad_norm": 3.3407180309295654, "learning_rate": 5.536231884057971e-06, "loss": 1.7026, "step": 25460 }, { "epoch": 2.12, "grad_norm": 4.300926685333252, "learning_rate": 5.5304347826086954e-06, "loss": 1.4354, "step": 25470 }, { "epoch": 2.12, "grad_norm": 6.3809895515441895, "learning_rate": 5.52463768115942e-06, "loss": 1.5977, "step": 25480 }, { "epoch": 2.12, "grad_norm": 1.672784447669983, "learning_rate": 5.518840579710145e-06, "loss": 1.69, "step": 25490 }, { "epoch": 2.12, "grad_norm": 1.11064875125885, "learning_rate": 5.51304347826087e-06, "loss": 1.4795, "step": 25500 }, { "epoch": 2.12, "eval_loss": 1.6643449068069458, "eval_runtime": 107.487, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 25500 }, { "epoch": 2.13, "grad_norm": 1.4832613468170166, "learning_rate": 5.507246376811595e-06, "loss": 1.631, "step": 25510 }, { "epoch": 2.13, "grad_norm": 4.866186141967773, "learning_rate": 5.50144927536232e-06, "loss": 1.6299, "step": 25520 }, { "epoch": 2.13, "grad_norm": 5.978271961212158, "learning_rate": 5.4956521739130446e-06, "loss": 1.69, "step": 25530 }, { "epoch": 2.13, "grad_norm": 2.5575902462005615, "learning_rate": 5.4898550724637686e-06, "loss": 1.7588, "step": 25540 }, { "epoch": 2.13, "grad_norm": 3.088501214981079, "learning_rate": 5.484057971014493e-06, "loss": 1.5889, "step": 25550 }, { "epoch": 2.13, "grad_norm": 4.234321594238281, "learning_rate": 5.478260869565217e-06, "loss": 1.615, "step": 25560 }, { "epoch": 2.13, "grad_norm": 2.050870895385742, "learning_rate": 5.472463768115942e-06, "loss": 1.8112, "step": 25570 }, { "epoch": 2.13, "grad_norm": 2.9914329051971436, "learning_rate": 5.466666666666667e-06, "loss": 1.4678, "step": 25580 }, { "epoch": 2.13, "grad_norm": 4.666996479034424, "learning_rate": 5.460869565217391e-06, "loss": 1.6458, "step": 25590 }, { "epoch": 2.13, "grad_norm": 3.144287109375, "learning_rate": 5.455072463768116e-06, "loss": 1.6335, "step": 25600 }, { "epoch": 2.13, "grad_norm": 8.561944007873535, "learning_rate": 5.449275362318841e-06, "loss": 1.7007, "step": 25610 }, { "epoch": 2.13, "grad_norm": 8.11923599243164, "learning_rate": 5.443478260869566e-06, "loss": 1.5806, "step": 25620 }, { "epoch": 2.14, "grad_norm": 2.6465091705322266, "learning_rate": 5.4376811594202905e-06, "loss": 1.5382, "step": 25630 }, { "epoch": 2.14, "grad_norm": 5.644728183746338, "learning_rate": 5.431884057971015e-06, "loss": 1.6434, "step": 25640 }, { "epoch": 2.14, "grad_norm": 3.5903024673461914, "learning_rate": 5.42608695652174e-06, "loss": 1.7184, "step": 25650 }, { "epoch": 2.14, "grad_norm": 6.686862945556641, "learning_rate": 5.420289855072465e-06, "loss": 1.6501, "step": 25660 }, { "epoch": 2.14, "grad_norm": 4.454892158508301, "learning_rate": 5.414492753623188e-06, "loss": 1.4907, "step": 25670 }, { "epoch": 2.14, "grad_norm": 6.291064262390137, "learning_rate": 5.408695652173913e-06, "loss": 1.6432, "step": 25680 }, { "epoch": 2.14, "grad_norm": 0.9967716336250305, "learning_rate": 5.402898550724638e-06, "loss": 1.6613, "step": 25690 }, { "epoch": 2.14, "grad_norm": 10.572330474853516, "learning_rate": 5.397101449275363e-06, "loss": 1.6701, "step": 25700 }, { "epoch": 2.14, "grad_norm": 2.3549177646636963, "learning_rate": 5.391304347826088e-06, "loss": 1.5943, "step": 25710 }, { "epoch": 2.14, "grad_norm": 6.504870891571045, "learning_rate": 5.3855072463768125e-06, "loss": 1.5729, "step": 25720 }, { "epoch": 2.14, "grad_norm": 1.8512842655181885, "learning_rate": 5.3797101449275365e-06, "loss": 1.653, "step": 25730 }, { "epoch": 2.15, "grad_norm": 1.5188934803009033, "learning_rate": 5.373913043478261e-06, "loss": 1.6456, "step": 25740 }, { "epoch": 2.15, "grad_norm": 2.3979270458221436, "learning_rate": 5.368115942028986e-06, "loss": 1.7375, "step": 25750 }, { "epoch": 2.15, "grad_norm": 6.565988540649414, "learning_rate": 5.362318840579711e-06, "loss": 1.6154, "step": 25760 }, { "epoch": 2.15, "grad_norm": 5.995846271514893, "learning_rate": 5.356521739130435e-06, "loss": 1.6018, "step": 25770 }, { "epoch": 2.15, "grad_norm": 5.048307418823242, "learning_rate": 5.350724637681159e-06, "loss": 1.5733, "step": 25780 }, { "epoch": 2.15, "grad_norm": 3.6430954933166504, "learning_rate": 5.344927536231884e-06, "loss": 1.6623, "step": 25790 }, { "epoch": 2.15, "grad_norm": 5.271862983703613, "learning_rate": 5.339130434782609e-06, "loss": 1.7376, "step": 25800 }, { "epoch": 2.15, "grad_norm": 4.284571647644043, "learning_rate": 5.333333333333334e-06, "loss": 1.6058, "step": 25810 }, { "epoch": 2.15, "grad_norm": 3.8240787982940674, "learning_rate": 5.3275362318840585e-06, "loss": 1.6588, "step": 25820 }, { "epoch": 2.15, "grad_norm": 2.922074556350708, "learning_rate": 5.321739130434783e-06, "loss": 1.504, "step": 25830 }, { "epoch": 2.15, "grad_norm": 3.5163557529449463, "learning_rate": 5.315942028985508e-06, "loss": 1.7208, "step": 25840 }, { "epoch": 2.15, "grad_norm": 4.59926176071167, "learning_rate": 5.310144927536233e-06, "loss": 1.5188, "step": 25850 }, { "epoch": 2.15, "grad_norm": 7.922865867614746, "learning_rate": 5.304347826086957e-06, "loss": 1.7082, "step": 25860 }, { "epoch": 2.16, "grad_norm": 2.3443105220794678, "learning_rate": 5.298550724637681e-06, "loss": 1.5892, "step": 25870 }, { "epoch": 2.16, "grad_norm": 1.4057115316390991, "learning_rate": 5.292753623188406e-06, "loss": 1.5478, "step": 25880 }, { "epoch": 2.16, "grad_norm": 5.410768985748291, "learning_rate": 5.286956521739131e-06, "loss": 1.6926, "step": 25890 }, { "epoch": 2.16, "grad_norm": 3.0512008666992188, "learning_rate": 5.281159420289856e-06, "loss": 1.6573, "step": 25900 }, { "epoch": 2.16, "grad_norm": 4.070111274719238, "learning_rate": 5.27536231884058e-06, "loss": 1.6467, "step": 25910 }, { "epoch": 2.16, "grad_norm": 13.7697114944458, "learning_rate": 5.2695652173913045e-06, "loss": 1.4524, "step": 25920 }, { "epoch": 2.16, "grad_norm": 6.01429557800293, "learning_rate": 5.263768115942029e-06, "loss": 1.6422, "step": 25930 }, { "epoch": 2.16, "grad_norm": 1.9737797975540161, "learning_rate": 5.257971014492754e-06, "loss": 1.8127, "step": 25940 }, { "epoch": 2.16, "grad_norm": 4.113332271575928, "learning_rate": 5.252173913043479e-06, "loss": 1.6502, "step": 25950 }, { "epoch": 2.16, "grad_norm": 4.877610683441162, "learning_rate": 5.246376811594204e-06, "loss": 1.7218, "step": 25960 }, { "epoch": 2.16, "grad_norm": 4.620281219482422, "learning_rate": 5.240579710144929e-06, "loss": 1.7076, "step": 25970 }, { "epoch": 2.17, "grad_norm": 4.5956878662109375, "learning_rate": 5.234782608695652e-06, "loss": 1.5189, "step": 25980 }, { "epoch": 2.17, "grad_norm": 9.458510398864746, "learning_rate": 5.228985507246377e-06, "loss": 1.5011, "step": 25990 }, { "epoch": 2.17, "grad_norm": 3.703575372695923, "learning_rate": 5.223188405797102e-06, "loss": 1.6217, "step": 26000 }, { "epoch": 2.17, "eval_loss": 1.641511082649231, "eval_runtime": 107.5147, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 26000 }, { "epoch": 2.17, "grad_norm": 4.613325595855713, "learning_rate": 5.2173913043478265e-06, "loss": 1.5841, "step": 26010 }, { "epoch": 2.17, "grad_norm": 5.493524551391602, "learning_rate": 5.211594202898551e-06, "loss": 1.6712, "step": 26020 }, { "epoch": 2.17, "grad_norm": 4.726990699768066, "learning_rate": 5.205797101449276e-06, "loss": 1.562, "step": 26030 }, { "epoch": 2.17, "grad_norm": 4.116297721862793, "learning_rate": 5.2e-06, "loss": 1.6151, "step": 26040 }, { "epoch": 2.17, "grad_norm": 5.906617641448975, "learning_rate": 5.194202898550725e-06, "loss": 1.8011, "step": 26050 }, { "epoch": 2.17, "grad_norm": 5.110503196716309, "learning_rate": 5.18840579710145e-06, "loss": 1.5389, "step": 26060 }, { "epoch": 2.17, "grad_norm": 6.075992584228516, "learning_rate": 5.182608695652175e-06, "loss": 1.6278, "step": 26070 }, { "epoch": 2.17, "grad_norm": 4.633796215057373, "learning_rate": 5.176811594202899e-06, "loss": 1.6312, "step": 26080 }, { "epoch": 2.17, "grad_norm": 8.259605407714844, "learning_rate": 5.171014492753624e-06, "loss": 1.6943, "step": 26090 }, { "epoch": 2.17, "grad_norm": 5.173551082611084, "learning_rate": 5.165217391304348e-06, "loss": 1.6731, "step": 26100 }, { "epoch": 2.18, "grad_norm": 1.1462539434432983, "learning_rate": 5.1594202898550725e-06, "loss": 1.6857, "step": 26110 }, { "epoch": 2.18, "grad_norm": 1.7535749673843384, "learning_rate": 5.153623188405797e-06, "loss": 1.6063, "step": 26120 }, { "epoch": 2.18, "grad_norm": 0.7708317041397095, "learning_rate": 5.147826086956522e-06, "loss": 1.5231, "step": 26130 }, { "epoch": 2.18, "grad_norm": 5.939055919647217, "learning_rate": 5.142028985507247e-06, "loss": 1.5663, "step": 26140 }, { "epoch": 2.18, "grad_norm": 1.7642502784729004, "learning_rate": 5.136231884057972e-06, "loss": 1.4762, "step": 26150 }, { "epoch": 2.18, "grad_norm": 2.4374587535858154, "learning_rate": 5.130434782608697e-06, "loss": 1.5594, "step": 26160 }, { "epoch": 2.18, "grad_norm": 6.011373996734619, "learning_rate": 5.124637681159422e-06, "loss": 1.6385, "step": 26170 }, { "epoch": 2.18, "grad_norm": 6.366337299346924, "learning_rate": 5.118840579710145e-06, "loss": 1.6437, "step": 26180 }, { "epoch": 2.18, "grad_norm": 4.652083873748779, "learning_rate": 5.11304347826087e-06, "loss": 1.6124, "step": 26190 }, { "epoch": 2.18, "grad_norm": 7.6200690269470215, "learning_rate": 5.1072463768115944e-06, "loss": 1.6403, "step": 26200 }, { "epoch": 2.18, "grad_norm": 0.9343698620796204, "learning_rate": 5.101449275362319e-06, "loss": 1.5959, "step": 26210 }, { "epoch": 2.19, "grad_norm": 6.836380481719971, "learning_rate": 5.095652173913044e-06, "loss": 1.6459, "step": 26220 }, { "epoch": 2.19, "grad_norm": 0.8471882343292236, "learning_rate": 5.089855072463768e-06, "loss": 1.7146, "step": 26230 }, { "epoch": 2.19, "grad_norm": 1.5582016706466675, "learning_rate": 5.084057971014493e-06, "loss": 1.752, "step": 26240 }, { "epoch": 2.19, "grad_norm": 4.58816385269165, "learning_rate": 5.078260869565218e-06, "loss": 1.7379, "step": 26250 }, { "epoch": 2.19, "grad_norm": 12.132075309753418, "learning_rate": 5.072463768115943e-06, "loss": 1.7138, "step": 26260 }, { "epoch": 2.19, "grad_norm": 7.812301158905029, "learning_rate": 5.0666666666666676e-06, "loss": 1.522, "step": 26270 }, { "epoch": 2.19, "grad_norm": 8.638223648071289, "learning_rate": 5.060869565217392e-06, "loss": 1.4687, "step": 26280 }, { "epoch": 2.19, "grad_norm": 1.7802847623825073, "learning_rate": 5.055072463768116e-06, "loss": 1.6814, "step": 26290 }, { "epoch": 2.19, "grad_norm": 3.118692398071289, "learning_rate": 5.0492753623188404e-06, "loss": 1.5805, "step": 26300 }, { "epoch": 2.19, "grad_norm": 1.812072515487671, "learning_rate": 5.043478260869565e-06, "loss": 1.5966, "step": 26310 }, { "epoch": 2.19, "grad_norm": 3.6763594150543213, "learning_rate": 5.03768115942029e-06, "loss": 1.5277, "step": 26320 }, { "epoch": 2.19, "grad_norm": 5.058526992797852, "learning_rate": 5.031884057971015e-06, "loss": 1.5608, "step": 26330 }, { "epoch": 2.19, "grad_norm": 6.4017653465271, "learning_rate": 5.02608695652174e-06, "loss": 1.702, "step": 26340 }, { "epoch": 2.2, "grad_norm": 4.419919490814209, "learning_rate": 5.020289855072465e-06, "loss": 1.7232, "step": 26350 }, { "epoch": 2.2, "grad_norm": 3.014277458190918, "learning_rate": 5.014492753623189e-06, "loss": 1.7648, "step": 26360 }, { "epoch": 2.2, "grad_norm": 2.1701581478118896, "learning_rate": 5.0086956521739136e-06, "loss": 1.3927, "step": 26370 }, { "epoch": 2.2, "grad_norm": 4.6772942543029785, "learning_rate": 5.002898550724638e-06, "loss": 1.5294, "step": 26380 }, { "epoch": 2.2, "grad_norm": 6.949331283569336, "learning_rate": 4.997101449275362e-06, "loss": 1.7033, "step": 26390 }, { "epoch": 2.2, "grad_norm": 7.254538536071777, "learning_rate": 4.991304347826087e-06, "loss": 1.6756, "step": 26400 }, { "epoch": 2.2, "grad_norm": 2.6317763328552246, "learning_rate": 4.985507246376812e-06, "loss": 1.5334, "step": 26410 }, { "epoch": 2.2, "grad_norm": 6.098857402801514, "learning_rate": 4.979710144927536e-06, "loss": 1.5211, "step": 26420 }, { "epoch": 2.2, "grad_norm": 10.028970718383789, "learning_rate": 4.973913043478261e-06, "loss": 1.6545, "step": 26430 }, { "epoch": 2.2, "grad_norm": 1.9619600772857666, "learning_rate": 4.968115942028986e-06, "loss": 1.5913, "step": 26440 }, { "epoch": 2.2, "grad_norm": 5.192411422729492, "learning_rate": 4.962318840579711e-06, "loss": 1.5367, "step": 26450 }, { "epoch": 2.21, "grad_norm": 3.78745698928833, "learning_rate": 4.9565217391304355e-06, "loss": 1.6372, "step": 26460 }, { "epoch": 2.21, "grad_norm": 3.531749963760376, "learning_rate": 4.9507246376811595e-06, "loss": 1.6329, "step": 26470 }, { "epoch": 2.21, "grad_norm": 4.566132068634033, "learning_rate": 4.944927536231884e-06, "loss": 1.6664, "step": 26480 }, { "epoch": 2.21, "grad_norm": 11.072998046875, "learning_rate": 4.939130434782609e-06, "loss": 1.6001, "step": 26490 }, { "epoch": 2.21, "grad_norm": 14.129816055297852, "learning_rate": 4.933333333333334e-06, "loss": 1.6051, "step": 26500 }, { "epoch": 2.21, "eval_loss": 1.6546690464019775, "eval_runtime": 107.5057, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 26500 }, { "epoch": 2.21, "grad_norm": 7.938653945922852, "learning_rate": 4.927536231884059e-06, "loss": 1.549, "step": 26510 }, { "epoch": 2.21, "grad_norm": 3.820674419403076, "learning_rate": 4.921739130434783e-06, "loss": 1.5996, "step": 26520 }, { "epoch": 2.21, "grad_norm": 3.9461727142333984, "learning_rate": 4.915942028985508e-06, "loss": 1.5242, "step": 26530 }, { "epoch": 2.21, "grad_norm": 2.117321729660034, "learning_rate": 4.910144927536233e-06, "loss": 1.6593, "step": 26540 }, { "epoch": 2.21, "grad_norm": 6.951183795928955, "learning_rate": 4.904347826086957e-06, "loss": 1.6066, "step": 26550 }, { "epoch": 2.21, "grad_norm": 7.954775333404541, "learning_rate": 4.8985507246376815e-06, "loss": 1.6093, "step": 26560 }, { "epoch": 2.21, "grad_norm": 2.857492208480835, "learning_rate": 4.892753623188406e-06, "loss": 1.6337, "step": 26570 }, { "epoch": 2.21, "grad_norm": 6.1911091804504395, "learning_rate": 4.88695652173913e-06, "loss": 1.6241, "step": 26580 }, { "epoch": 2.22, "grad_norm": 5.39035701751709, "learning_rate": 4.881159420289855e-06, "loss": 1.4798, "step": 26590 }, { "epoch": 2.22, "grad_norm": 4.794005870819092, "learning_rate": 4.87536231884058e-06, "loss": 1.6761, "step": 26600 }, { "epoch": 2.22, "grad_norm": 3.688593626022339, "learning_rate": 4.869565217391305e-06, "loss": 1.7455, "step": 26610 }, { "epoch": 2.22, "grad_norm": 2.9753758907318115, "learning_rate": 4.863768115942029e-06, "loss": 1.6043, "step": 26620 }, { "epoch": 2.22, "grad_norm": 4.76633882522583, "learning_rate": 4.857971014492754e-06, "loss": 1.5899, "step": 26630 }, { "epoch": 2.22, "grad_norm": 9.436808586120605, "learning_rate": 4.852173913043479e-06, "loss": 1.5456, "step": 26640 }, { "epoch": 2.22, "grad_norm": 3.1801443099975586, "learning_rate": 4.8463768115942035e-06, "loss": 1.4973, "step": 26650 }, { "epoch": 2.22, "grad_norm": 3.6295244693756104, "learning_rate": 4.840579710144928e-06, "loss": 1.6056, "step": 26660 }, { "epoch": 2.22, "grad_norm": 2.5441739559173584, "learning_rate": 4.834782608695652e-06, "loss": 1.5993, "step": 26670 }, { "epoch": 2.22, "grad_norm": 5.536238193511963, "learning_rate": 4.828985507246377e-06, "loss": 1.7003, "step": 26680 }, { "epoch": 2.22, "grad_norm": 8.867695808410645, "learning_rate": 4.823188405797102e-06, "loss": 1.5816, "step": 26690 }, { "epoch": 2.23, "grad_norm": 2.4776854515075684, "learning_rate": 4.817391304347827e-06, "loss": 1.6089, "step": 26700 }, { "epoch": 2.23, "grad_norm": 3.4541239738464355, "learning_rate": 4.811594202898551e-06, "loss": 1.5486, "step": 26710 }, { "epoch": 2.23, "grad_norm": 5.7615251541137695, "learning_rate": 4.805797101449276e-06, "loss": 1.4767, "step": 26720 }, { "epoch": 2.23, "grad_norm": 5.553129196166992, "learning_rate": 4.800000000000001e-06, "loss": 1.6036, "step": 26730 }, { "epoch": 2.23, "grad_norm": 1.3022050857543945, "learning_rate": 4.794202898550725e-06, "loss": 1.5677, "step": 26740 }, { "epoch": 2.23, "grad_norm": 2.267906427383423, "learning_rate": 4.7884057971014495e-06, "loss": 1.5816, "step": 26750 }, { "epoch": 2.23, "grad_norm": 3.8131909370422363, "learning_rate": 4.782608695652174e-06, "loss": 1.7683, "step": 26760 }, { "epoch": 2.23, "grad_norm": 4.468419551849365, "learning_rate": 4.776811594202899e-06, "loss": 1.4521, "step": 26770 }, { "epoch": 2.23, "grad_norm": 5.726469039916992, "learning_rate": 4.771014492753623e-06, "loss": 1.6083, "step": 26780 }, { "epoch": 2.23, "grad_norm": 2.375027894973755, "learning_rate": 4.765217391304348e-06, "loss": 1.6481, "step": 26790 }, { "epoch": 2.23, "grad_norm": 8.728473663330078, "learning_rate": 4.759420289855073e-06, "loss": 1.6519, "step": 26800 }, { "epoch": 2.23, "grad_norm": 13.2547607421875, "learning_rate": 4.753623188405798e-06, "loss": 1.646, "step": 26810 }, { "epoch": 2.23, "grad_norm": 5.739991664886475, "learning_rate": 4.747826086956523e-06, "loss": 1.6067, "step": 26820 }, { "epoch": 2.24, "grad_norm": 5.9073076248168945, "learning_rate": 4.742028985507247e-06, "loss": 1.4971, "step": 26830 }, { "epoch": 2.24, "grad_norm": 8.832313537597656, "learning_rate": 4.7362318840579715e-06, "loss": 1.6724, "step": 26840 }, { "epoch": 2.24, "grad_norm": 1.5870254039764404, "learning_rate": 4.730434782608696e-06, "loss": 1.7331, "step": 26850 }, { "epoch": 2.24, "grad_norm": 7.8640055656433105, "learning_rate": 4.724637681159421e-06, "loss": 1.5264, "step": 26860 }, { "epoch": 2.24, "grad_norm": 6.005427360534668, "learning_rate": 4.718840579710145e-06, "loss": 1.5717, "step": 26870 }, { "epoch": 2.24, "grad_norm": 3.7698235511779785, "learning_rate": 4.71304347826087e-06, "loss": 1.7427, "step": 26880 }, { "epoch": 2.24, "grad_norm": 6.162291049957275, "learning_rate": 4.707246376811595e-06, "loss": 1.7923, "step": 26890 }, { "epoch": 2.24, "grad_norm": 1.3494528532028198, "learning_rate": 4.701449275362319e-06, "loss": 1.7313, "step": 26900 }, { "epoch": 2.24, "grad_norm": 3.9067726135253906, "learning_rate": 4.695652173913044e-06, "loss": 1.7465, "step": 26910 }, { "epoch": 2.24, "grad_norm": 2.578123092651367, "learning_rate": 4.689855072463769e-06, "loss": 1.5809, "step": 26920 }, { "epoch": 2.24, "grad_norm": 3.2738120555877686, "learning_rate": 4.684057971014493e-06, "loss": 1.662, "step": 26930 }, { "epoch": 2.25, "grad_norm": 2.7239420413970947, "learning_rate": 4.6782608695652175e-06, "loss": 1.5232, "step": 26940 }, { "epoch": 2.25, "grad_norm": 2.876070022583008, "learning_rate": 4.672463768115942e-06, "loss": 1.7607, "step": 26950 }, { "epoch": 2.25, "grad_norm": 5.976984977722168, "learning_rate": 4.666666666666667e-06, "loss": 1.6693, "step": 26960 }, { "epoch": 2.25, "grad_norm": 7.821426868438721, "learning_rate": 4.660869565217392e-06, "loss": 1.6281, "step": 26970 }, { "epoch": 2.25, "grad_norm": 4.057045936584473, "learning_rate": 4.655072463768116e-06, "loss": 1.6141, "step": 26980 }, { "epoch": 2.25, "grad_norm": 6.272705554962158, "learning_rate": 4.649275362318841e-06, "loss": 1.6509, "step": 26990 }, { "epoch": 2.25, "grad_norm": 2.8042986392974854, "learning_rate": 4.643478260869566e-06, "loss": 1.6283, "step": 27000 }, { "epoch": 2.25, "eval_loss": 1.599829912185669, "eval_runtime": 107.518, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 27000 }, { "epoch": 2.25, "grad_norm": 2.672848701477051, "learning_rate": 4.637681159420291e-06, "loss": 1.587, "step": 27010 }, { "epoch": 2.25, "grad_norm": 3.5648651123046875, "learning_rate": 4.6318840579710154e-06, "loss": 1.6535, "step": 27020 }, { "epoch": 2.25, "grad_norm": 5.619153022766113, "learning_rate": 4.6260869565217394e-06, "loss": 1.7095, "step": 27030 }, { "epoch": 2.25, "grad_norm": 3.661830425262451, "learning_rate": 4.620289855072464e-06, "loss": 1.5421, "step": 27040 }, { "epoch": 2.25, "grad_norm": 2.601088285446167, "learning_rate": 4.614492753623188e-06, "loss": 1.6169, "step": 27050 }, { "epoch": 2.25, "grad_norm": 1.6670879125595093, "learning_rate": 4.608695652173913e-06, "loss": 1.7808, "step": 27060 }, { "epoch": 2.26, "grad_norm": 5.285208702087402, "learning_rate": 4.602898550724638e-06, "loss": 1.6018, "step": 27070 }, { "epoch": 2.26, "grad_norm": 13.955137252807617, "learning_rate": 4.597101449275363e-06, "loss": 1.6071, "step": 27080 }, { "epoch": 2.26, "grad_norm": 3.246000289916992, "learning_rate": 4.591304347826087e-06, "loss": 1.74, "step": 27090 }, { "epoch": 2.26, "grad_norm": 4.297258377075195, "learning_rate": 4.585507246376812e-06, "loss": 1.6233, "step": 27100 }, { "epoch": 2.26, "grad_norm": 6.08992338180542, "learning_rate": 4.5797101449275366e-06, "loss": 1.6511, "step": 27110 }, { "epoch": 2.26, "grad_norm": 4.511499404907227, "learning_rate": 4.573913043478261e-06, "loss": 1.5344, "step": 27120 }, { "epoch": 2.26, "grad_norm": 2.989239454269409, "learning_rate": 4.568115942028986e-06, "loss": 1.6172, "step": 27130 }, { "epoch": 2.26, "grad_norm": 2.358445882797241, "learning_rate": 4.56231884057971e-06, "loss": 1.6004, "step": 27140 }, { "epoch": 2.26, "grad_norm": 2.533210039138794, "learning_rate": 4.556521739130435e-06, "loss": 1.6651, "step": 27150 }, { "epoch": 2.26, "grad_norm": 1.6375526189804077, "learning_rate": 4.55072463768116e-06, "loss": 1.603, "step": 27160 }, { "epoch": 2.26, "grad_norm": 16.163049697875977, "learning_rate": 4.544927536231885e-06, "loss": 1.7045, "step": 27170 }, { "epoch": 2.27, "grad_norm": 3.7427773475646973, "learning_rate": 4.53913043478261e-06, "loss": 1.7164, "step": 27180 }, { "epoch": 2.27, "grad_norm": 3.3884432315826416, "learning_rate": 4.533333333333334e-06, "loss": 1.5997, "step": 27190 }, { "epoch": 2.27, "grad_norm": 11.96393871307373, "learning_rate": 4.5275362318840585e-06, "loss": 1.6458, "step": 27200 }, { "epoch": 2.27, "grad_norm": 3.4181969165802, "learning_rate": 4.5217391304347826e-06, "loss": 1.5166, "step": 27210 }, { "epoch": 2.27, "grad_norm": 6.070342540740967, "learning_rate": 4.515942028985507e-06, "loss": 1.5037, "step": 27220 }, { "epoch": 2.27, "grad_norm": 4.330051898956299, "learning_rate": 4.510144927536232e-06, "loss": 1.6322, "step": 27230 }, { "epoch": 2.27, "grad_norm": 4.178266525268555, "learning_rate": 4.504347826086956e-06, "loss": 1.5712, "step": 27240 }, { "epoch": 2.27, "grad_norm": 10.042227745056152, "learning_rate": 4.498550724637681e-06, "loss": 1.5195, "step": 27250 }, { "epoch": 2.27, "grad_norm": 4.68449592590332, "learning_rate": 4.492753623188406e-06, "loss": 1.5767, "step": 27260 }, { "epoch": 2.27, "grad_norm": 6.098611354827881, "learning_rate": 4.486956521739131e-06, "loss": 1.5826, "step": 27270 }, { "epoch": 2.27, "grad_norm": 3.7514631748199463, "learning_rate": 4.481159420289856e-06, "loss": 1.6519, "step": 27280 }, { "epoch": 2.27, "grad_norm": 6.086874961853027, "learning_rate": 4.47536231884058e-06, "loss": 1.6356, "step": 27290 }, { "epoch": 2.27, "grad_norm": 2.969055414199829, "learning_rate": 4.4695652173913045e-06, "loss": 1.5372, "step": 27300 }, { "epoch": 2.28, "grad_norm": 3.0818963050842285, "learning_rate": 4.463768115942029e-06, "loss": 1.6195, "step": 27310 }, { "epoch": 2.28, "grad_norm": 2.355165481567383, "learning_rate": 4.457971014492754e-06, "loss": 1.5498, "step": 27320 }, { "epoch": 2.28, "grad_norm": 3.69412899017334, "learning_rate": 4.452173913043479e-06, "loss": 1.6927, "step": 27330 }, { "epoch": 2.28, "grad_norm": 4.266238212585449, "learning_rate": 4.446376811594204e-06, "loss": 1.5444, "step": 27340 }, { "epoch": 2.28, "grad_norm": 10.938898086547852, "learning_rate": 4.440579710144928e-06, "loss": 1.5767, "step": 27350 }, { "epoch": 2.28, "grad_norm": 3.8983266353607178, "learning_rate": 4.434782608695653e-06, "loss": 1.6385, "step": 27360 }, { "epoch": 2.28, "grad_norm": 6.039658069610596, "learning_rate": 4.428985507246377e-06, "loss": 1.6902, "step": 27370 }, { "epoch": 2.28, "grad_norm": 3.6886088848114014, "learning_rate": 4.423188405797102e-06, "loss": 1.5464, "step": 27380 }, { "epoch": 2.28, "grad_norm": 3.805041551589966, "learning_rate": 4.4173913043478265e-06, "loss": 1.5516, "step": 27390 }, { "epoch": 2.28, "grad_norm": 6.767727851867676, "learning_rate": 4.4115942028985505e-06, "loss": 1.6857, "step": 27400 }, { "epoch": 2.28, "grad_norm": 2.386350631713867, "learning_rate": 4.405797101449275e-06, "loss": 1.7113, "step": 27410 }, { "epoch": 2.29, "grad_norm": 1.8526190519332886, "learning_rate": 4.4e-06, "loss": 1.6934, "step": 27420 }, { "epoch": 2.29, "grad_norm": 6.962111949920654, "learning_rate": 4.394202898550725e-06, "loss": 1.5782, "step": 27430 }, { "epoch": 2.29, "grad_norm": 3.9486684799194336, "learning_rate": 4.38840579710145e-06, "loss": 1.5233, "step": 27440 }, { "epoch": 2.29, "grad_norm": 4.742575645446777, "learning_rate": 4.382608695652174e-06, "loss": 1.5672, "step": 27450 }, { "epoch": 2.29, "grad_norm": 7.8938422203063965, "learning_rate": 4.376811594202899e-06, "loss": 1.5662, "step": 27460 }, { "epoch": 2.29, "grad_norm": 4.794543743133545, "learning_rate": 4.371014492753624e-06, "loss": 1.6878, "step": 27470 }, { "epoch": 2.29, "grad_norm": 2.3197507858276367, "learning_rate": 4.3652173913043485e-06, "loss": 1.6168, "step": 27480 }, { "epoch": 2.29, "grad_norm": 3.5373730659484863, "learning_rate": 4.359420289855073e-06, "loss": 1.6601, "step": 27490 }, { "epoch": 2.29, "grad_norm": 1.5637263059616089, "learning_rate": 4.353623188405797e-06, "loss": 1.6436, "step": 27500 }, { "epoch": 2.29, "eval_loss": 1.637457251548767, "eval_runtime": 107.504, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 27500 }, { "epoch": 2.29, "grad_norm": 4.249775409698486, "learning_rate": 4.347826086956522e-06, "loss": 1.5759, "step": 27510 }, { "epoch": 2.29, "grad_norm": 9.582161903381348, "learning_rate": 4.342028985507247e-06, "loss": 1.5566, "step": 27520 }, { "epoch": 2.29, "grad_norm": 3.517526626586914, "learning_rate": 4.336231884057971e-06, "loss": 1.5944, "step": 27530 }, { "epoch": 2.29, "grad_norm": 8.537724494934082, "learning_rate": 4.330434782608696e-06, "loss": 1.5976, "step": 27540 }, { "epoch": 2.3, "grad_norm": 3.978813886642456, "learning_rate": 4.324637681159421e-06, "loss": 1.6186, "step": 27550 }, { "epoch": 2.3, "grad_norm": 2.7505924701690674, "learning_rate": 4.318840579710145e-06, "loss": 1.5513, "step": 27560 }, { "epoch": 2.3, "grad_norm": 2.5586419105529785, "learning_rate": 4.31304347826087e-06, "loss": 1.7299, "step": 27570 }, { "epoch": 2.3, "grad_norm": 4.848520278930664, "learning_rate": 4.3072463768115945e-06, "loss": 1.6996, "step": 27580 }, { "epoch": 2.3, "grad_norm": 10.454072952270508, "learning_rate": 4.301449275362319e-06, "loss": 1.748, "step": 27590 }, { "epoch": 2.3, "grad_norm": 5.605503559112549, "learning_rate": 4.295652173913043e-06, "loss": 1.6378, "step": 27600 }, { "epoch": 2.3, "grad_norm": 7.994022369384766, "learning_rate": 4.289855072463768e-06, "loss": 1.5866, "step": 27610 }, { "epoch": 2.3, "grad_norm": 3.65685772895813, "learning_rate": 4.284057971014493e-06, "loss": 1.5671, "step": 27620 }, { "epoch": 2.3, "grad_norm": 8.096505165100098, "learning_rate": 4.278260869565218e-06, "loss": 1.5329, "step": 27630 }, { "epoch": 2.3, "grad_norm": 4.837905406951904, "learning_rate": 4.272463768115943e-06, "loss": 1.7046, "step": 27640 }, { "epoch": 2.3, "grad_norm": 4.230941295623779, "learning_rate": 4.266666666666668e-06, "loss": 1.6219, "step": 27650 }, { "epoch": 2.31, "grad_norm": 6.498970031738281, "learning_rate": 4.260869565217392e-06, "loss": 1.5791, "step": 27660 }, { "epoch": 2.31, "grad_norm": 2.494067430496216, "learning_rate": 4.2550724637681165e-06, "loss": 1.5099, "step": 27670 }, { "epoch": 2.31, "grad_norm": 3.1975343227386475, "learning_rate": 4.249275362318841e-06, "loss": 1.5382, "step": 27680 }, { "epoch": 2.31, "grad_norm": 4.870430946350098, "learning_rate": 4.243478260869565e-06, "loss": 1.4734, "step": 27690 }, { "epoch": 2.31, "grad_norm": 3.4920201301574707, "learning_rate": 4.23768115942029e-06, "loss": 1.5764, "step": 27700 }, { "epoch": 2.31, "grad_norm": 2.826338052749634, "learning_rate": 4.231884057971015e-06, "loss": 1.766, "step": 27710 }, { "epoch": 2.31, "grad_norm": 9.284537315368652, "learning_rate": 4.226086956521739e-06, "loss": 1.5152, "step": 27720 }, { "epoch": 2.31, "grad_norm": 4.703794479370117, "learning_rate": 4.220289855072464e-06, "loss": 1.5506, "step": 27730 }, { "epoch": 2.31, "grad_norm": 4.4055094718933105, "learning_rate": 4.214492753623189e-06, "loss": 1.6593, "step": 27740 }, { "epoch": 2.31, "grad_norm": 2.2604644298553467, "learning_rate": 4.208695652173914e-06, "loss": 1.5203, "step": 27750 }, { "epoch": 2.31, "grad_norm": 2.278348445892334, "learning_rate": 4.202898550724638e-06, "loss": 1.6086, "step": 27760 }, { "epoch": 2.31, "grad_norm": 8.626958847045898, "learning_rate": 4.1971014492753624e-06, "loss": 1.569, "step": 27770 }, { "epoch": 2.31, "grad_norm": 6.3467583656311035, "learning_rate": 4.191304347826087e-06, "loss": 1.5253, "step": 27780 }, { "epoch": 2.32, "grad_norm": 2.055905818939209, "learning_rate": 4.185507246376812e-06, "loss": 1.6537, "step": 27790 }, { "epoch": 2.32, "grad_norm": 2.519817352294922, "learning_rate": 4.179710144927537e-06, "loss": 1.5329, "step": 27800 }, { "epoch": 2.32, "grad_norm": 3.1581406593322754, "learning_rate": 4.173913043478261e-06, "loss": 1.616, "step": 27810 }, { "epoch": 2.32, "grad_norm": 1.114212155342102, "learning_rate": 4.168115942028986e-06, "loss": 1.4248, "step": 27820 }, { "epoch": 2.32, "grad_norm": 4.742372989654541, "learning_rate": 4.162318840579711e-06, "loss": 1.7195, "step": 27830 }, { "epoch": 2.32, "grad_norm": 2.707331895828247, "learning_rate": 4.1565217391304356e-06, "loss": 1.5344, "step": 27840 }, { "epoch": 2.32, "grad_norm": 4.827779293060303, "learning_rate": 4.1507246376811596e-06, "loss": 1.6523, "step": 27850 }, { "epoch": 2.32, "grad_norm": 3.977830410003662, "learning_rate": 4.1449275362318844e-06, "loss": 1.6839, "step": 27860 }, { "epoch": 2.32, "grad_norm": 8.55379867553711, "learning_rate": 4.139130434782609e-06, "loss": 1.6007, "step": 27870 }, { "epoch": 2.32, "grad_norm": 4.7034173011779785, "learning_rate": 4.133333333333333e-06, "loss": 1.5704, "step": 27880 }, { "epoch": 2.32, "grad_norm": 2.772855520248413, "learning_rate": 4.127536231884058e-06, "loss": 1.7209, "step": 27890 }, { "epoch": 2.33, "grad_norm": 2.952080011367798, "learning_rate": 4.121739130434783e-06, "loss": 1.5282, "step": 27900 }, { "epoch": 2.33, "grad_norm": 2.7901113033294678, "learning_rate": 4.115942028985507e-06, "loss": 1.6545, "step": 27910 }, { "epoch": 2.33, "grad_norm": 8.063865661621094, "learning_rate": 4.110144927536232e-06, "loss": 1.4981, "step": 27920 }, { "epoch": 2.33, "grad_norm": 9.868477821350098, "learning_rate": 4.104347826086957e-06, "loss": 1.6642, "step": 27930 }, { "epoch": 2.33, "grad_norm": 5.399648666381836, "learning_rate": 4.0985507246376816e-06, "loss": 1.6422, "step": 27940 }, { "epoch": 2.33, "grad_norm": 3.0602734088897705, "learning_rate": 4.092753623188406e-06, "loss": 1.7643, "step": 27950 }, { "epoch": 2.33, "grad_norm": 5.221512794494629, "learning_rate": 4.086956521739131e-06, "loss": 1.5439, "step": 27960 }, { "epoch": 2.33, "grad_norm": 3.30639386177063, "learning_rate": 4.081159420289855e-06, "loss": 1.5125, "step": 27970 }, { "epoch": 2.33, "grad_norm": 4.630527973175049, "learning_rate": 4.07536231884058e-06, "loss": 1.5665, "step": 27980 }, { "epoch": 2.33, "grad_norm": 9.667155265808105, "learning_rate": 4.069565217391305e-06, "loss": 1.6917, "step": 27990 }, { "epoch": 2.33, "grad_norm": 2.4528236389160156, "learning_rate": 4.06376811594203e-06, "loss": 1.6995, "step": 28000 }, { "epoch": 2.33, "eval_loss": 1.644856333732605, "eval_runtime": 107.5008, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 28000 }, { "epoch": 2.33, "grad_norm": 3.805037260055542, "learning_rate": 4.057971014492754e-06, "loss": 1.5207, "step": 28010 }, { "epoch": 2.33, "grad_norm": 3.819047451019287, "learning_rate": 4.052173913043479e-06, "loss": 1.6518, "step": 28020 }, { "epoch": 2.34, "grad_norm": 8.75721263885498, "learning_rate": 4.0463768115942035e-06, "loss": 1.6758, "step": 28030 }, { "epoch": 2.34, "grad_norm": 5.432452201843262, "learning_rate": 4.0405797101449275e-06, "loss": 1.7066, "step": 28040 }, { "epoch": 2.34, "grad_norm": 3.6831490993499756, "learning_rate": 4.034782608695652e-06, "loss": 1.6379, "step": 28050 }, { "epoch": 2.34, "grad_norm": 6.150071620941162, "learning_rate": 4.028985507246377e-06, "loss": 1.5458, "step": 28060 }, { "epoch": 2.34, "grad_norm": 6.020603179931641, "learning_rate": 4.023188405797101e-06, "loss": 1.6909, "step": 28070 }, { "epoch": 2.34, "grad_norm": 11.10090446472168, "learning_rate": 4.017391304347826e-06, "loss": 1.7057, "step": 28080 }, { "epoch": 2.34, "grad_norm": 2.3139498233795166, "learning_rate": 4.011594202898551e-06, "loss": 1.6334, "step": 28090 }, { "epoch": 2.34, "grad_norm": 11.909808158874512, "learning_rate": 4.005797101449276e-06, "loss": 1.6384, "step": 28100 }, { "epoch": 2.34, "grad_norm": 2.950678586959839, "learning_rate": 4.000000000000001e-06, "loss": 1.6356, "step": 28110 }, { "epoch": 2.34, "grad_norm": 2.596022844314575, "learning_rate": 3.994202898550725e-06, "loss": 1.4085, "step": 28120 }, { "epoch": 2.34, "grad_norm": 9.143681526184082, "learning_rate": 3.9884057971014495e-06, "loss": 1.5866, "step": 28130 }, { "epoch": 2.34, "grad_norm": 2.4474549293518066, "learning_rate": 3.982608695652174e-06, "loss": 1.5971, "step": 28140 }, { "epoch": 2.35, "grad_norm": 3.5778965950012207, "learning_rate": 3.976811594202899e-06, "loss": 1.5654, "step": 28150 }, { "epoch": 2.35, "grad_norm": 1.3869068622589111, "learning_rate": 3.971014492753624e-06, "loss": 1.6479, "step": 28160 }, { "epoch": 2.35, "grad_norm": 6.334060192108154, "learning_rate": 3.965217391304348e-06, "loss": 1.4869, "step": 28170 }, { "epoch": 2.35, "grad_norm": 3.896120071411133, "learning_rate": 3.959420289855073e-06, "loss": 1.5473, "step": 28180 }, { "epoch": 2.35, "grad_norm": 4.137752532958984, "learning_rate": 3.953623188405798e-06, "loss": 1.5977, "step": 28190 }, { "epoch": 2.35, "grad_norm": 3.7057723999023438, "learning_rate": 3.947826086956522e-06, "loss": 1.6333, "step": 28200 }, { "epoch": 2.35, "grad_norm": 7.279668807983398, "learning_rate": 3.942028985507247e-06, "loss": 1.6963, "step": 28210 }, { "epoch": 2.35, "grad_norm": 4.471526145935059, "learning_rate": 3.936231884057971e-06, "loss": 1.6287, "step": 28220 }, { "epoch": 2.35, "grad_norm": 3.1095166206359863, "learning_rate": 3.9304347826086955e-06, "loss": 1.5774, "step": 28230 }, { "epoch": 2.35, "grad_norm": 5.295337677001953, "learning_rate": 3.92463768115942e-06, "loss": 1.6255, "step": 28240 }, { "epoch": 2.35, "grad_norm": 2.934601306915283, "learning_rate": 3.918840579710145e-06, "loss": 1.6194, "step": 28250 }, { "epoch": 2.35, "grad_norm": 3.069929361343384, "learning_rate": 3.91304347826087e-06, "loss": 1.7143, "step": 28260 }, { "epoch": 2.36, "grad_norm": 4.211582660675049, "learning_rate": 3.907246376811595e-06, "loss": 1.5541, "step": 28270 }, { "epoch": 2.36, "grad_norm": 8.216347694396973, "learning_rate": 3.901449275362319e-06, "loss": 1.5638, "step": 28280 }, { "epoch": 2.36, "grad_norm": 8.743631362915039, "learning_rate": 3.895652173913044e-06, "loss": 1.5687, "step": 28290 }, { "epoch": 2.36, "grad_norm": 4.041175842285156, "learning_rate": 3.889855072463769e-06, "loss": 1.5939, "step": 28300 }, { "epoch": 2.36, "grad_norm": 8.4274263381958, "learning_rate": 3.8840579710144935e-06, "loss": 1.7864, "step": 28310 }, { "epoch": 2.36, "grad_norm": 6.043384552001953, "learning_rate": 3.878260869565218e-06, "loss": 1.5686, "step": 28320 }, { "epoch": 2.36, "grad_norm": 2.899559736251831, "learning_rate": 3.872463768115942e-06, "loss": 1.7651, "step": 28330 }, { "epoch": 2.36, "grad_norm": 6.566895008087158, "learning_rate": 3.866666666666667e-06, "loss": 1.6548, "step": 28340 }, { "epoch": 2.36, "grad_norm": 4.242386341094971, "learning_rate": 3.860869565217392e-06, "loss": 1.5816, "step": 28350 }, { "epoch": 2.36, "grad_norm": 3.0503904819488525, "learning_rate": 3.855072463768116e-06, "loss": 1.7181, "step": 28360 }, { "epoch": 2.36, "grad_norm": 8.445423126220703, "learning_rate": 3.849275362318841e-06, "loss": 1.6133, "step": 28370 }, { "epoch": 2.37, "grad_norm": 4.893218040466309, "learning_rate": 3.843478260869565e-06, "loss": 1.6204, "step": 28380 }, { "epoch": 2.37, "grad_norm": 3.2958295345306396, "learning_rate": 3.83768115942029e-06, "loss": 1.524, "step": 28390 }, { "epoch": 2.37, "grad_norm": 3.6230056285858154, "learning_rate": 3.831884057971015e-06, "loss": 1.7141, "step": 28400 }, { "epoch": 2.37, "grad_norm": 4.484030246734619, "learning_rate": 3.8260869565217395e-06, "loss": 1.7142, "step": 28410 }, { "epoch": 2.37, "grad_norm": 5.462039470672607, "learning_rate": 3.820289855072464e-06, "loss": 1.6588, "step": 28420 }, { "epoch": 2.37, "grad_norm": 1.6934198141098022, "learning_rate": 3.8144927536231883e-06, "loss": 1.6298, "step": 28430 }, { "epoch": 2.37, "grad_norm": 4.632734298706055, "learning_rate": 3.808695652173913e-06, "loss": 1.523, "step": 28440 }, { "epoch": 2.37, "grad_norm": 1.3441051244735718, "learning_rate": 3.802898550724638e-06, "loss": 1.6716, "step": 28450 }, { "epoch": 2.37, "grad_norm": 14.253397941589355, "learning_rate": 3.797101449275363e-06, "loss": 1.6655, "step": 28460 }, { "epoch": 2.37, "grad_norm": 3.6277294158935547, "learning_rate": 3.7913043478260873e-06, "loss": 1.6434, "step": 28470 }, { "epoch": 2.37, "grad_norm": 3.960604429244995, "learning_rate": 3.7855072463768117e-06, "loss": 1.6302, "step": 28480 }, { "epoch": 2.37, "grad_norm": 8.853973388671875, "learning_rate": 3.7797101449275366e-06, "loss": 1.5669, "step": 28490 }, { "epoch": 2.38, "grad_norm": 6.044206619262695, "learning_rate": 3.773913043478261e-06, "loss": 1.6932, "step": 28500 }, { "epoch": 2.38, "eval_loss": 1.6286858320236206, "eval_runtime": 107.5089, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.325, "step": 28500 }, { "epoch": 2.38, "grad_norm": 2.0954174995422363, "learning_rate": 3.768115942028986e-06, "loss": 1.7568, "step": 28510 }, { "epoch": 2.38, "grad_norm": 7.709628105163574, "learning_rate": 3.7623188405797107e-06, "loss": 1.4997, "step": 28520 }, { "epoch": 2.38, "grad_norm": 3.7862284183502197, "learning_rate": 3.7565217391304347e-06, "loss": 1.6189, "step": 28530 }, { "epoch": 2.38, "grad_norm": 2.443877696990967, "learning_rate": 3.7507246376811596e-06, "loss": 1.6114, "step": 28540 }, { "epoch": 2.38, "grad_norm": 9.974430084228516, "learning_rate": 3.7449275362318844e-06, "loss": 1.7863, "step": 28550 }, { "epoch": 2.38, "grad_norm": 4.032052516937256, "learning_rate": 3.739130434782609e-06, "loss": 1.5454, "step": 28560 }, { "epoch": 2.38, "grad_norm": 15.427571296691895, "learning_rate": 3.7333333333333337e-06, "loss": 1.583, "step": 28570 }, { "epoch": 2.38, "grad_norm": 5.56199836730957, "learning_rate": 3.727536231884058e-06, "loss": 1.5224, "step": 28580 }, { "epoch": 2.38, "grad_norm": 2.051842212677002, "learning_rate": 3.7217391304347826e-06, "loss": 1.4863, "step": 28590 }, { "epoch": 2.38, "grad_norm": 2.057842493057251, "learning_rate": 3.7159420289855074e-06, "loss": 1.6266, "step": 28600 }, { "epoch": 2.38, "grad_norm": 2.723554849624634, "learning_rate": 3.7101449275362323e-06, "loss": 1.4752, "step": 28610 }, { "epoch": 2.38, "grad_norm": 2.5356404781341553, "learning_rate": 3.704347826086957e-06, "loss": 1.6376, "step": 28620 }, { "epoch": 2.39, "grad_norm": 2.806687593460083, "learning_rate": 3.6985507246376816e-06, "loss": 1.5874, "step": 28630 }, { "epoch": 2.39, "grad_norm": 2.9221794605255127, "learning_rate": 3.692753623188406e-06, "loss": 1.5031, "step": 28640 }, { "epoch": 2.39, "grad_norm": 4.143265724182129, "learning_rate": 3.686956521739131e-06, "loss": 1.5727, "step": 28650 }, { "epoch": 2.39, "grad_norm": 2.832207679748535, "learning_rate": 3.6811594202898553e-06, "loss": 1.6113, "step": 28660 }, { "epoch": 2.39, "grad_norm": 9.17288875579834, "learning_rate": 3.67536231884058e-06, "loss": 1.593, "step": 28670 }, { "epoch": 2.39, "grad_norm": 2.8944125175476074, "learning_rate": 3.669565217391305e-06, "loss": 1.6084, "step": 28680 }, { "epoch": 2.39, "grad_norm": 9.046699523925781, "learning_rate": 3.663768115942029e-06, "loss": 1.5391, "step": 28690 }, { "epoch": 2.39, "grad_norm": 6.482266902923584, "learning_rate": 3.657971014492754e-06, "loss": 1.6621, "step": 28700 }, { "epoch": 2.39, "grad_norm": 4.573151588439941, "learning_rate": 3.6521739130434787e-06, "loss": 1.6467, "step": 28710 }, { "epoch": 2.39, "grad_norm": 13.082232475280762, "learning_rate": 3.646376811594203e-06, "loss": 1.6243, "step": 28720 }, { "epoch": 2.39, "grad_norm": 5.799623012542725, "learning_rate": 3.640579710144928e-06, "loss": 1.5777, "step": 28730 }, { "epoch": 2.4, "grad_norm": 4.449925422668457, "learning_rate": 3.6347826086956524e-06, "loss": 1.7436, "step": 28740 }, { "epoch": 2.4, "grad_norm": 6.415322780609131, "learning_rate": 3.628985507246377e-06, "loss": 1.5015, "step": 28750 }, { "epoch": 2.4, "grad_norm": 3.3753433227539062, "learning_rate": 3.6231884057971017e-06, "loss": 1.696, "step": 28760 }, { "epoch": 2.4, "grad_norm": 3.1241776943206787, "learning_rate": 3.6173913043478265e-06, "loss": 1.5431, "step": 28770 }, { "epoch": 2.4, "grad_norm": 6.052039623260498, "learning_rate": 3.6115942028985514e-06, "loss": 1.5878, "step": 28780 }, { "epoch": 2.4, "grad_norm": 4.090257167816162, "learning_rate": 3.6057971014492754e-06, "loss": 1.5928, "step": 28790 }, { "epoch": 2.4, "grad_norm": 2.91934871673584, "learning_rate": 3.6000000000000003e-06, "loss": 1.6831, "step": 28800 }, { "epoch": 2.4, "grad_norm": 8.889466285705566, "learning_rate": 3.594202898550725e-06, "loss": 1.599, "step": 28810 }, { "epoch": 2.4, "grad_norm": 5.158334255218506, "learning_rate": 3.5884057971014495e-06, "loss": 1.6809, "step": 28820 }, { "epoch": 2.4, "grad_norm": 4.268106460571289, "learning_rate": 3.5826086956521744e-06, "loss": 1.6967, "step": 28830 }, { "epoch": 2.4, "grad_norm": 5.080434322357178, "learning_rate": 3.5768115942028984e-06, "loss": 1.7103, "step": 28840 }, { "epoch": 2.4, "grad_norm": 2.549793243408203, "learning_rate": 3.5710144927536233e-06, "loss": 1.5933, "step": 28850 }, { "epoch": 2.41, "grad_norm": 5.401309013366699, "learning_rate": 3.565217391304348e-06, "loss": 1.6199, "step": 28860 }, { "epoch": 2.41, "grad_norm": 1.3040978908538818, "learning_rate": 3.559420289855073e-06, "loss": 1.7081, "step": 28870 }, { "epoch": 2.41, "grad_norm": 2.473288059234619, "learning_rate": 3.5536231884057974e-06, "loss": 1.626, "step": 28880 }, { "epoch": 2.41, "grad_norm": 1.599831461906433, "learning_rate": 3.547826086956522e-06, "loss": 1.7197, "step": 28890 }, { "epoch": 2.41, "grad_norm": 3.284329414367676, "learning_rate": 3.5420289855072467e-06, "loss": 1.6332, "step": 28900 }, { "epoch": 2.41, "grad_norm": 8.579666137695312, "learning_rate": 3.536231884057971e-06, "loss": 1.7461, "step": 28910 }, { "epoch": 2.41, "grad_norm": 0.893390953540802, "learning_rate": 3.530434782608696e-06, "loss": 1.588, "step": 28920 }, { "epoch": 2.41, "grad_norm": 6.217680931091309, "learning_rate": 3.524637681159421e-06, "loss": 1.6019, "step": 28930 }, { "epoch": 2.41, "grad_norm": 3.5240437984466553, "learning_rate": 3.5188405797101457e-06, "loss": 1.5526, "step": 28940 }, { "epoch": 2.41, "grad_norm": 1.2045577764511108, "learning_rate": 3.5130434782608697e-06, "loss": 1.5679, "step": 28950 }, { "epoch": 2.41, "grad_norm": 7.309528350830078, "learning_rate": 3.5072463768115945e-06, "loss": 1.6655, "step": 28960 }, { "epoch": 2.41, "grad_norm": 7.580850601196289, "learning_rate": 3.501449275362319e-06, "loss": 1.6961, "step": 28970 }, { "epoch": 2.42, "grad_norm": 3.6850571632385254, "learning_rate": 3.495652173913044e-06, "loss": 1.6288, "step": 28980 }, { "epoch": 2.42, "grad_norm": 6.09687614440918, "learning_rate": 3.4898550724637687e-06, "loss": 1.7291, "step": 28990 }, { "epoch": 2.42, "grad_norm": 2.748039960861206, "learning_rate": 3.4840579710144927e-06, "loss": 1.5591, "step": 29000 }, { "epoch": 2.42, "eval_loss": 1.657044529914856, "eval_runtime": 107.5181, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 29000 }, { "epoch": 2.42, "grad_norm": 3.007520914077759, "learning_rate": 3.4782608695652175e-06, "loss": 1.6379, "step": 29010 }, { "epoch": 2.42, "grad_norm": 4.748697280883789, "learning_rate": 3.4724637681159424e-06, "loss": 1.7399, "step": 29020 }, { "epoch": 2.42, "grad_norm": 15.37971305847168, "learning_rate": 3.4666666666666672e-06, "loss": 1.5297, "step": 29030 }, { "epoch": 2.42, "grad_norm": 3.962806463241577, "learning_rate": 3.4608695652173916e-06, "loss": 1.6266, "step": 29040 }, { "epoch": 2.42, "grad_norm": 13.86978530883789, "learning_rate": 3.455072463768116e-06, "loss": 1.6452, "step": 29050 }, { "epoch": 2.42, "grad_norm": 3.6884708404541016, "learning_rate": 3.449275362318841e-06, "loss": 1.63, "step": 29060 }, { "epoch": 2.42, "grad_norm": 2.2268710136413574, "learning_rate": 3.4434782608695654e-06, "loss": 1.7602, "step": 29070 }, { "epoch": 2.42, "grad_norm": 8.4721040725708, "learning_rate": 3.43768115942029e-06, "loss": 1.7621, "step": 29080 }, { "epoch": 2.42, "grad_norm": 9.810726165771484, "learning_rate": 3.431884057971015e-06, "loss": 1.5289, "step": 29090 }, { "epoch": 2.42, "grad_norm": 4.837118625640869, "learning_rate": 3.426086956521739e-06, "loss": 1.6023, "step": 29100 }, { "epoch": 2.43, "grad_norm": 13.719472885131836, "learning_rate": 3.420289855072464e-06, "loss": 1.6571, "step": 29110 }, { "epoch": 2.43, "grad_norm": 1.3916150331497192, "learning_rate": 3.4144927536231888e-06, "loss": 1.6231, "step": 29120 }, { "epoch": 2.43, "grad_norm": 4.7587890625, "learning_rate": 3.408695652173913e-06, "loss": 1.6458, "step": 29130 }, { "epoch": 2.43, "grad_norm": 5.0899200439453125, "learning_rate": 3.402898550724638e-06, "loss": 1.681, "step": 29140 }, { "epoch": 2.43, "grad_norm": 3.8998475074768066, "learning_rate": 3.3971014492753625e-06, "loss": 1.5843, "step": 29150 }, { "epoch": 2.43, "grad_norm": 13.847112655639648, "learning_rate": 3.391304347826087e-06, "loss": 1.5932, "step": 29160 }, { "epoch": 2.43, "grad_norm": 16.733882904052734, "learning_rate": 3.3855072463768118e-06, "loss": 1.6131, "step": 29170 }, { "epoch": 2.43, "grad_norm": 3.589210033416748, "learning_rate": 3.3797101449275366e-06, "loss": 1.5347, "step": 29180 }, { "epoch": 2.43, "grad_norm": 2.7056937217712402, "learning_rate": 3.3739130434782615e-06, "loss": 1.6423, "step": 29190 }, { "epoch": 2.43, "grad_norm": 5.761937618255615, "learning_rate": 3.3681159420289855e-06, "loss": 1.5252, "step": 29200 }, { "epoch": 2.43, "grad_norm": 4.621154308319092, "learning_rate": 3.3623188405797103e-06, "loss": 1.4666, "step": 29210 }, { "epoch": 2.44, "grad_norm": 9.775867462158203, "learning_rate": 3.356521739130435e-06, "loss": 1.6301, "step": 29220 }, { "epoch": 2.44, "grad_norm": 4.965156078338623, "learning_rate": 3.3507246376811596e-06, "loss": 1.7043, "step": 29230 }, { "epoch": 2.44, "grad_norm": 11.6044282913208, "learning_rate": 3.3449275362318845e-06, "loss": 1.6566, "step": 29240 }, { "epoch": 2.44, "grad_norm": 2.9858810901641846, "learning_rate": 3.3391304347826093e-06, "loss": 1.5687, "step": 29250 }, { "epoch": 2.44, "grad_norm": 5.0615386962890625, "learning_rate": 3.3333333333333333e-06, "loss": 1.4778, "step": 29260 }, { "epoch": 2.44, "grad_norm": 8.376899719238281, "learning_rate": 3.327536231884058e-06, "loss": 1.6104, "step": 29270 }, { "epoch": 2.44, "grad_norm": 4.015613555908203, "learning_rate": 3.321739130434783e-06, "loss": 1.5788, "step": 29280 }, { "epoch": 2.44, "grad_norm": 2.4846994876861572, "learning_rate": 3.3159420289855075e-06, "loss": 1.5098, "step": 29290 }, { "epoch": 2.44, "grad_norm": 3.8240742683410645, "learning_rate": 3.3101449275362323e-06, "loss": 1.7382, "step": 29300 }, { "epoch": 2.44, "grad_norm": 3.34401798248291, "learning_rate": 3.3043478260869567e-06, "loss": 1.6224, "step": 29310 }, { "epoch": 2.44, "grad_norm": 4.242111682891846, "learning_rate": 3.298550724637681e-06, "loss": 1.7168, "step": 29320 }, { "epoch": 2.44, "grad_norm": 5.714269638061523, "learning_rate": 3.292753623188406e-06, "loss": 1.6101, "step": 29330 }, { "epoch": 2.44, "grad_norm": 2.2656619548797607, "learning_rate": 3.286956521739131e-06, "loss": 1.6284, "step": 29340 }, { "epoch": 2.45, "grad_norm": 8.77830696105957, "learning_rate": 3.2811594202898557e-06, "loss": 1.6186, "step": 29350 }, { "epoch": 2.45, "grad_norm": 3.6399919986724854, "learning_rate": 3.2753623188405797e-06, "loss": 1.5704, "step": 29360 }, { "epoch": 2.45, "grad_norm": 1.7771320343017578, "learning_rate": 3.2695652173913046e-06, "loss": 1.6302, "step": 29370 }, { "epoch": 2.45, "grad_norm": 5.038367748260498, "learning_rate": 3.2637681159420294e-06, "loss": 1.4699, "step": 29380 }, { "epoch": 2.45, "grad_norm": 1.5462366342544556, "learning_rate": 3.257971014492754e-06, "loss": 1.5656, "step": 29390 }, { "epoch": 2.45, "grad_norm": 8.673677444458008, "learning_rate": 3.252753623188406e-06, "loss": 1.5968, "step": 29400 }, { "epoch": 2.45, "grad_norm": 3.5238194465637207, "learning_rate": 3.246956521739131e-06, "loss": 1.7341, "step": 29410 }, { "epoch": 2.45, "grad_norm": 3.633085012435913, "learning_rate": 3.2411594202898557e-06, "loss": 1.6154, "step": 29420 }, { "epoch": 2.45, "grad_norm": 7.964749336242676, "learning_rate": 3.2353623188405797e-06, "loss": 1.4897, "step": 29430 }, { "epoch": 2.45, "grad_norm": 6.036418437957764, "learning_rate": 3.2295652173913045e-06, "loss": 1.74, "step": 29440 }, { "epoch": 2.45, "grad_norm": 5.448636054992676, "learning_rate": 3.2237681159420294e-06, "loss": 1.7668, "step": 29450 }, { "epoch": 2.46, "grad_norm": 5.758767604827881, "learning_rate": 3.217971014492754e-06, "loss": 1.5285, "step": 29460 }, { "epoch": 2.46, "grad_norm": 13.795671463012695, "learning_rate": 3.2121739130434787e-06, "loss": 1.5928, "step": 29470 }, { "epoch": 2.46, "grad_norm": 0.950065553188324, "learning_rate": 3.2063768115942027e-06, "loss": 1.6188, "step": 29480 }, { "epoch": 2.46, "grad_norm": 9.130352020263672, "learning_rate": 3.2005797101449275e-06, "loss": 1.5059, "step": 29490 }, { "epoch": 2.46, "grad_norm": 3.847170114517212, "learning_rate": 3.1947826086956524e-06, "loss": 1.6167, "step": 29500 }, { "epoch": 2.46, "eval_loss": 1.6451491117477417, "eval_runtime": 107.4982, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 29500 }, { "epoch": 2.46, "grad_norm": 3.0313570499420166, "learning_rate": 3.1889855072463772e-06, "loss": 1.6151, "step": 29510 }, { "epoch": 2.46, "grad_norm": 2.9217429161071777, "learning_rate": 3.1831884057971017e-06, "loss": 1.643, "step": 29520 }, { "epoch": 2.46, "grad_norm": 3.5794498920440674, "learning_rate": 3.1773913043478265e-06, "loss": 1.6273, "step": 29530 }, { "epoch": 2.46, "grad_norm": 2.837756395339966, "learning_rate": 3.171594202898551e-06, "loss": 1.5733, "step": 29540 }, { "epoch": 2.46, "grad_norm": 4.105989933013916, "learning_rate": 3.1657971014492754e-06, "loss": 1.5915, "step": 29550 }, { "epoch": 2.46, "grad_norm": 9.32128620147705, "learning_rate": 3.1600000000000002e-06, "loss": 1.5463, "step": 29560 }, { "epoch": 2.46, "grad_norm": 4.723803520202637, "learning_rate": 3.154202898550725e-06, "loss": 1.7413, "step": 29570 }, { "epoch": 2.46, "grad_norm": 8.316760063171387, "learning_rate": 3.14840579710145e-06, "loss": 1.6757, "step": 29580 }, { "epoch": 2.47, "grad_norm": 7.694721698760986, "learning_rate": 3.142608695652174e-06, "loss": 1.5203, "step": 29590 }, { "epoch": 2.47, "grad_norm": 3.9127633571624756, "learning_rate": 3.136811594202899e-06, "loss": 1.5992, "step": 29600 }, { "epoch": 2.47, "grad_norm": 4.171841144561768, "learning_rate": 3.1310144927536237e-06, "loss": 1.6754, "step": 29610 }, { "epoch": 2.47, "grad_norm": 6.929067134857178, "learning_rate": 3.125217391304348e-06, "loss": 1.6738, "step": 29620 }, { "epoch": 2.47, "grad_norm": 7.294098854064941, "learning_rate": 3.119420289855073e-06, "loss": 1.6663, "step": 29630 }, { "epoch": 2.47, "grad_norm": 3.76629376411438, "learning_rate": 3.113623188405797e-06, "loss": 1.6625, "step": 29640 }, { "epoch": 2.47, "grad_norm": 6.9984612464904785, "learning_rate": 3.107826086956522e-06, "loss": 1.5415, "step": 29650 }, { "epoch": 2.47, "grad_norm": 2.3784775733947754, "learning_rate": 3.1020289855072466e-06, "loss": 1.7637, "step": 29660 }, { "epoch": 2.47, "grad_norm": 5.289485454559326, "learning_rate": 3.0962318840579715e-06, "loss": 1.7363, "step": 29670 }, { "epoch": 2.47, "grad_norm": 3.363936424255371, "learning_rate": 3.090434782608696e-06, "loss": 1.7829, "step": 29680 }, { "epoch": 2.47, "grad_norm": 0.904037356376648, "learning_rate": 3.0846376811594204e-06, "loss": 1.6108, "step": 29690 }, { "epoch": 2.48, "grad_norm": 4.6849846839904785, "learning_rate": 3.0788405797101452e-06, "loss": 1.53, "step": 29700 }, { "epoch": 2.48, "grad_norm": 13.439961433410645, "learning_rate": 3.0730434782608696e-06, "loss": 1.6294, "step": 29710 }, { "epoch": 2.48, "grad_norm": 14.236470222473145, "learning_rate": 3.0672463768115945e-06, "loss": 1.5566, "step": 29720 }, { "epoch": 2.48, "grad_norm": 2.827075481414795, "learning_rate": 3.0614492753623193e-06, "loss": 1.6481, "step": 29730 }, { "epoch": 2.48, "grad_norm": 2.0929653644561768, "learning_rate": 3.0556521739130434e-06, "loss": 1.6433, "step": 29740 }, { "epoch": 2.48, "grad_norm": 6.4234700202941895, "learning_rate": 3.049855072463768e-06, "loss": 1.5093, "step": 29750 }, { "epoch": 2.48, "grad_norm": 3.105968952178955, "learning_rate": 3.044057971014493e-06, "loss": 1.5319, "step": 29760 }, { "epoch": 2.48, "grad_norm": 2.3987698554992676, "learning_rate": 3.038260869565218e-06, "loss": 1.5574, "step": 29770 }, { "epoch": 2.48, "grad_norm": 6.3514299392700195, "learning_rate": 3.0324637681159423e-06, "loss": 1.5673, "step": 29780 }, { "epoch": 2.48, "grad_norm": 10.782695770263672, "learning_rate": 3.0266666666666668e-06, "loss": 1.5443, "step": 29790 }, { "epoch": 2.48, "grad_norm": 5.090881824493408, "learning_rate": 3.020869565217391e-06, "loss": 1.5812, "step": 29800 }, { "epoch": 2.48, "grad_norm": 11.7606782913208, "learning_rate": 3.015072463768116e-06, "loss": 1.6019, "step": 29810 }, { "epoch": 2.48, "grad_norm": 10.391364097595215, "learning_rate": 3.009275362318841e-06, "loss": 1.5642, "step": 29820 }, { "epoch": 2.49, "grad_norm": 5.301925182342529, "learning_rate": 3.0034782608695658e-06, "loss": 1.575, "step": 29830 }, { "epoch": 2.49, "grad_norm": 4.747443675994873, "learning_rate": 2.99768115942029e-06, "loss": 1.7891, "step": 29840 }, { "epoch": 2.49, "grad_norm": 3.40224027633667, "learning_rate": 2.9918840579710146e-06, "loss": 1.7931, "step": 29850 }, { "epoch": 2.49, "grad_norm": 1.9095501899719238, "learning_rate": 2.9860869565217395e-06, "loss": 1.727, "step": 29860 }, { "epoch": 2.49, "grad_norm": 5.217514514923096, "learning_rate": 2.980289855072464e-06, "loss": 1.5056, "step": 29870 }, { "epoch": 2.49, "grad_norm": 3.045649766921997, "learning_rate": 2.9744927536231888e-06, "loss": 1.5775, "step": 29880 }, { "epoch": 2.49, "grad_norm": 3.6917686462402344, "learning_rate": 2.9686956521739136e-06, "loss": 1.6197, "step": 29890 }, { "epoch": 2.49, "grad_norm": 2.3253226280212402, "learning_rate": 2.9628985507246376e-06, "loss": 1.7079, "step": 29900 }, { "epoch": 2.49, "grad_norm": 2.5302798748016357, "learning_rate": 2.9571014492753625e-06, "loss": 1.5915, "step": 29910 }, { "epoch": 2.49, "grad_norm": 2.9401729106903076, "learning_rate": 2.9513043478260873e-06, "loss": 1.5172, "step": 29920 }, { "epoch": 2.49, "grad_norm": 4.63379430770874, "learning_rate": 2.945507246376812e-06, "loss": 1.6475, "step": 29930 }, { "epoch": 2.5, "grad_norm": 3.7005679607391357, "learning_rate": 2.9397101449275366e-06, "loss": 1.383, "step": 29940 }, { "epoch": 2.5, "grad_norm": 1.5383400917053223, "learning_rate": 2.933913043478261e-06, "loss": 1.6116, "step": 29950 }, { "epoch": 2.5, "grad_norm": 3.307799816131592, "learning_rate": 2.9281159420289855e-06, "loss": 1.7254, "step": 29960 }, { "epoch": 2.5, "grad_norm": 5.233972549438477, "learning_rate": 2.9223188405797103e-06, "loss": 1.5307, "step": 29970 }, { "epoch": 2.5, "grad_norm": 2.475644588470459, "learning_rate": 2.916521739130435e-06, "loss": 1.6835, "step": 29980 }, { "epoch": 2.5, "grad_norm": 3.9054574966430664, "learning_rate": 2.91072463768116e-06, "loss": 1.5383, "step": 29990 }, { "epoch": 2.5, "grad_norm": 2.8980517387390137, "learning_rate": 2.904927536231884e-06, "loss": 1.6759, "step": 30000 }, { "epoch": 2.5, "eval_loss": 1.6585332155227661, "eval_runtime": 107.501, "eval_samples_per_second": 9.302, "eval_steps_per_second": 2.326, "step": 30000 }, { "epoch": 2.5, "grad_norm": 1.9723654985427856, "learning_rate": 2.899130434782609e-06, "loss": 1.6399, "step": 30010 }, { "epoch": 2.5, "grad_norm": 7.082981586456299, "learning_rate": 2.8933333333333337e-06, "loss": 1.6246, "step": 30020 }, { "epoch": 2.5, "grad_norm": 5.984813690185547, "learning_rate": 2.887536231884058e-06, "loss": 1.6357, "step": 30030 }, { "epoch": 2.5, "grad_norm": 9.565591812133789, "learning_rate": 2.881739130434783e-06, "loss": 1.6206, "step": 30040 }, { "epoch": 2.5, "grad_norm": 13.118734359741211, "learning_rate": 2.8759420289855074e-06, "loss": 1.6352, "step": 30050 }, { "epoch": 2.5, "grad_norm": 2.744248867034912, "learning_rate": 2.870144927536232e-06, "loss": 1.4582, "step": 30060 }, { "epoch": 2.51, "grad_norm": 3.0195579528808594, "learning_rate": 2.8643478260869567e-06, "loss": 1.6441, "step": 30070 }, { "epoch": 2.51, "grad_norm": 3.9167885780334473, "learning_rate": 2.8585507246376816e-06, "loss": 1.7603, "step": 30080 }, { "epoch": 2.51, "grad_norm": 6.419768810272217, "learning_rate": 2.852753623188406e-06, "loss": 1.598, "step": 30090 }, { "epoch": 2.51, "grad_norm": 3.9617881774902344, "learning_rate": 2.8469565217391304e-06, "loss": 1.639, "step": 30100 }, { "epoch": 2.51, "grad_norm": 2.755096673965454, "learning_rate": 2.8411594202898553e-06, "loss": 1.6888, "step": 30110 }, { "epoch": 2.51, "grad_norm": 9.04216194152832, "learning_rate": 2.8353623188405797e-06, "loss": 1.6008, "step": 30120 }, { "epoch": 2.51, "grad_norm": 2.4999070167541504, "learning_rate": 2.8295652173913046e-06, "loss": 1.6094, "step": 30130 }, { "epoch": 2.51, "grad_norm": 10.484206199645996, "learning_rate": 2.8237681159420294e-06, "loss": 1.6802, "step": 30140 }, { "epoch": 2.51, "grad_norm": 7.458893299102783, "learning_rate": 2.8179710144927543e-06, "loss": 1.559, "step": 30150 }, { "epoch": 2.51, "grad_norm": 8.366874694824219, "learning_rate": 2.8121739130434783e-06, "loss": 1.5715, "step": 30160 }, { "epoch": 2.51, "grad_norm": 10.303510665893555, "learning_rate": 2.806376811594203e-06, "loss": 1.5516, "step": 30170 }, { "epoch": 2.52, "grad_norm": 4.16677713394165, "learning_rate": 2.800579710144928e-06, "loss": 1.5497, "step": 30180 }, { "epoch": 2.52, "grad_norm": 8.9423246383667, "learning_rate": 2.7947826086956524e-06, "loss": 1.7121, "step": 30190 }, { "epoch": 2.52, "grad_norm": 3.328890562057495, "learning_rate": 2.7889855072463773e-06, "loss": 1.5159, "step": 30200 }, { "epoch": 2.52, "grad_norm": 7.184723854064941, "learning_rate": 2.7831884057971013e-06, "loss": 1.6757, "step": 30210 }, { "epoch": 2.52, "grad_norm": 5.9649858474731445, "learning_rate": 2.777391304347826e-06, "loss": 1.4885, "step": 30220 }, { "epoch": 2.52, "grad_norm": 5.443271160125732, "learning_rate": 2.771594202898551e-06, "loss": 1.5998, "step": 30230 }, { "epoch": 2.52, "grad_norm": 2.292788028717041, "learning_rate": 2.765797101449276e-06, "loss": 1.5722, "step": 30240 }, { "epoch": 2.52, "grad_norm": 9.991342544555664, "learning_rate": 2.7600000000000003e-06, "loss": 1.6356, "step": 30250 }, { "epoch": 2.52, "grad_norm": 2.1068317890167236, "learning_rate": 2.7542028985507247e-06, "loss": 1.651, "step": 30260 }, { "epoch": 2.52, "grad_norm": 7.125411033630371, "learning_rate": 2.7484057971014495e-06, "loss": 1.5846, "step": 30270 }, { "epoch": 2.52, "grad_norm": 2.9197959899902344, "learning_rate": 2.742608695652174e-06, "loss": 1.6979, "step": 30280 }, { "epoch": 2.52, "grad_norm": 2.4163668155670166, "learning_rate": 2.736811594202899e-06, "loss": 1.6169, "step": 30290 }, { "epoch": 2.52, "grad_norm": 8.496438026428223, "learning_rate": 2.7310144927536237e-06, "loss": 1.6185, "step": 30300 }, { "epoch": 2.53, "grad_norm": 2.996656894683838, "learning_rate": 2.7252173913043477e-06, "loss": 1.747, "step": 30310 }, { "epoch": 2.53, "grad_norm": 2.3623204231262207, "learning_rate": 2.7194202898550725e-06, "loss": 1.6652, "step": 30320 }, { "epoch": 2.53, "grad_norm": 6.751153945922852, "learning_rate": 2.7136231884057974e-06, "loss": 1.5636, "step": 30330 }, { "epoch": 2.53, "grad_norm": 2.7132070064544678, "learning_rate": 2.7078260869565222e-06, "loss": 1.5571, "step": 30340 }, { "epoch": 2.53, "grad_norm": 12.558267593383789, "learning_rate": 2.7020289855072467e-06, "loss": 1.6319, "step": 30350 }, { "epoch": 2.53, "grad_norm": 4.379528522491455, "learning_rate": 2.696231884057971e-06, "loss": 1.6902, "step": 30360 }, { "epoch": 2.53, "grad_norm": 6.271668910980225, "learning_rate": 2.6904347826086955e-06, "loss": 1.7105, "step": 30370 }, { "epoch": 2.53, "grad_norm": 5.071247577667236, "learning_rate": 2.6846376811594204e-06, "loss": 1.5997, "step": 30380 }, { "epoch": 2.53, "grad_norm": 5.980905532836914, "learning_rate": 2.6788405797101452e-06, "loss": 1.6008, "step": 30390 }, { "epoch": 2.53, "grad_norm": 3.2732436656951904, "learning_rate": 2.67304347826087e-06, "loss": 1.6783, "step": 30400 }, { "epoch": 2.53, "grad_norm": 3.015439748764038, "learning_rate": 2.667246376811594e-06, "loss": 1.6118, "step": 30410 }, { "epoch": 2.54, "grad_norm": 3.7114615440368652, "learning_rate": 2.661449275362319e-06, "loss": 1.6513, "step": 30420 }, { "epoch": 2.54, "grad_norm": 4.484541893005371, "learning_rate": 2.655652173913044e-06, "loss": 1.6789, "step": 30430 }, { "epoch": 2.54, "grad_norm": 6.272510528564453, "learning_rate": 2.6498550724637682e-06, "loss": 1.5341, "step": 30440 }, { "epoch": 2.54, "grad_norm": 2.3156094551086426, "learning_rate": 2.644057971014493e-06, "loss": 1.7266, "step": 30450 }, { "epoch": 2.54, "grad_norm": 15.670822143554688, "learning_rate": 2.638260869565218e-06, "loss": 1.6492, "step": 30460 }, { "epoch": 2.54, "grad_norm": 10.792664527893066, "learning_rate": 2.632463768115942e-06, "loss": 1.7282, "step": 30470 }, { "epoch": 2.54, "grad_norm": 1.0802708864212036, "learning_rate": 2.6266666666666668e-06, "loss": 1.6272, "step": 30480 }, { "epoch": 2.54, "grad_norm": 1.8151583671569824, "learning_rate": 2.6208695652173916e-06, "loss": 1.6041, "step": 30490 }, { "epoch": 2.54, "grad_norm": 2.1548194885253906, "learning_rate": 2.6150724637681165e-06, "loss": 1.6457, "step": 30500 }, { "epoch": 2.54, "eval_loss": 1.642120599746704, "eval_runtime": 107.4944, "eval_samples_per_second": 9.303, "eval_steps_per_second": 2.326, "step": 30500 }, { "epoch": 2.54, "grad_norm": 1.8505381345748901, "learning_rate": 2.609275362318841e-06, "loss": 1.5916, "step": 30510 }, { "epoch": 2.54, "grad_norm": 4.077845096588135, "learning_rate": 2.6034782608695654e-06, "loss": 1.5464, "step": 30520 }, { "epoch": 2.54, "grad_norm": 3.143275022506714, "learning_rate": 2.5976811594202898e-06, "loss": 1.5341, "step": 30530 }, { "epoch": 2.54, "grad_norm": 4.190448760986328, "learning_rate": 2.5918840579710146e-06, "loss": 1.5817, "step": 30540 }, { "epoch": 2.55, "grad_norm": 2.009246587753296, "learning_rate": 2.5860869565217395e-06, "loss": 1.5787, "step": 30550 }, { "epoch": 2.55, "grad_norm": 6.875110149383545, "learning_rate": 2.5802898550724643e-06, "loss": 1.5827, "step": 30560 }, { "epoch": 2.55, "grad_norm": 8.125142097473145, "learning_rate": 2.5744927536231883e-06, "loss": 1.4877, "step": 30570 }, { "epoch": 2.55, "grad_norm": 6.384122848510742, "learning_rate": 2.568695652173913e-06, "loss": 1.6345, "step": 30580 }, { "epoch": 2.55, "grad_norm": 10.49345874786377, "learning_rate": 2.562898550724638e-06, "loss": 1.6619, "step": 30590 }, { "epoch": 2.55, "grad_norm": 1.870846152305603, "learning_rate": 2.5571014492753625e-06, "loss": 1.5736, "step": 30600 }, { "epoch": 2.55, "grad_norm": 3.0048367977142334, "learning_rate": 2.5513043478260873e-06, "loss": 1.6575, "step": 30610 }, { "epoch": 2.55, "grad_norm": 7.404007434844971, "learning_rate": 2.5455072463768118e-06, "loss": 1.6789, "step": 30620 }, { "epoch": 2.55, "grad_norm": 5.966566562652588, "learning_rate": 2.539710144927536e-06, "loss": 1.5581, "step": 30630 }, { "epoch": 2.55, "grad_norm": 2.988835096359253, "learning_rate": 2.533913043478261e-06, "loss": 1.4879, "step": 30640 }, { "epoch": 2.55, "grad_norm": 7.500990390777588, "learning_rate": 2.528115942028986e-06, "loss": 1.6214, "step": 30650 }, { "epoch": 2.56, "grad_norm": 6.991439342498779, "learning_rate": 2.5223188405797107e-06, "loss": 1.5002, "step": 30660 }, { "epoch": 2.56, "grad_norm": 2.588336706161499, "learning_rate": 2.5165217391304348e-06, "loss": 1.6576, "step": 30670 }, { "epoch": 2.56, "grad_norm": 2.5451226234436035, "learning_rate": 2.5107246376811596e-06, "loss": 1.6218, "step": 30680 }, { "epoch": 2.56, "grad_norm": 10.67144775390625, "learning_rate": 2.504927536231884e-06, "loss": 1.6439, "step": 30690 }, { "epoch": 2.56, "grad_norm": 14.395882606506348, "learning_rate": 2.499130434782609e-06, "loss": 1.5812, "step": 30700 }, { "epoch": 2.56, "grad_norm": 3.2323532104492188, "learning_rate": 2.4933333333333333e-06, "loss": 1.6072, "step": 30710 }, { "epoch": 2.56, "grad_norm": 2.6720938682556152, "learning_rate": 2.487536231884058e-06, "loss": 1.5967, "step": 30720 }, { "epoch": 2.56, "grad_norm": 2.9183871746063232, "learning_rate": 2.481739130434783e-06, "loss": 1.659, "step": 30730 }, { "epoch": 2.56, "grad_norm": 8.462002754211426, "learning_rate": 2.4759420289855075e-06, "loss": 1.6629, "step": 30740 }, { "epoch": 2.56, "grad_norm": 9.175357818603516, "learning_rate": 2.4701449275362323e-06, "loss": 1.6591, "step": 30750 }, { "epoch": 2.56, "grad_norm": 11.072102546691895, "learning_rate": 2.4643478260869567e-06, "loss": 1.5515, "step": 30760 }, { "epoch": 2.56, "grad_norm": 5.013851642608643, "learning_rate": 2.458550724637681e-06, "loss": 1.5813, "step": 30770 }, { "epoch": 2.56, "grad_norm": 1.7383862733840942, "learning_rate": 2.452753623188406e-06, "loss": 1.7057, "step": 30780 }, { "epoch": 2.57, "grad_norm": 5.308183670043945, "learning_rate": 2.4469565217391304e-06, "loss": 1.5989, "step": 30790 }, { "epoch": 2.57, "grad_norm": 4.486476898193359, "learning_rate": 2.4411594202898553e-06, "loss": 1.6127, "step": 30800 }, { "epoch": 2.57, "grad_norm": 4.1956095695495605, "learning_rate": 2.4353623188405797e-06, "loss": 1.515, "step": 30810 }, { "epoch": 2.57, "grad_norm": 6.074836730957031, "learning_rate": 2.4295652173913046e-06, "loss": 1.5636, "step": 30820 }, { "epoch": 2.57, "grad_norm": 3.5480659008026123, "learning_rate": 2.4237681159420294e-06, "loss": 1.6296, "step": 30830 }, { "epoch": 2.57, "grad_norm": 3.6157848834991455, "learning_rate": 2.417971014492754e-06, "loss": 1.7141, "step": 30840 }, { "epoch": 2.57, "grad_norm": 1.7036499977111816, "learning_rate": 2.4121739130434783e-06, "loss": 1.6024, "step": 30850 }, { "epoch": 2.57, "grad_norm": 9.184311866760254, "learning_rate": 2.406376811594203e-06, "loss": 1.6096, "step": 30860 }, { "epoch": 2.57, "grad_norm": 10.717764854431152, "learning_rate": 2.4005797101449276e-06, "loss": 1.6807, "step": 30870 }, { "epoch": 2.57, "grad_norm": 6.877434253692627, "learning_rate": 2.3947826086956524e-06, "loss": 1.6401, "step": 30880 }, { "epoch": 2.57, "grad_norm": 4.13861083984375, "learning_rate": 2.388985507246377e-06, "loss": 1.6329, "step": 30890 }, { "epoch": 2.58, "grad_norm": 3.996332883834839, "learning_rate": 2.3831884057971017e-06, "loss": 1.5384, "step": 30900 }, { "epoch": 2.58, "grad_norm": 1.9450478553771973, "learning_rate": 2.3773913043478266e-06, "loss": 1.6154, "step": 30910 }, { "epoch": 2.58, "grad_norm": 5.448505401611328, "learning_rate": 2.371594202898551e-06, "loss": 1.7094, "step": 30920 }, { "epoch": 2.58, "grad_norm": 8.060430526733398, "learning_rate": 2.3657971014492754e-06, "loss": 1.5228, "step": 30930 }, { "epoch": 2.58, "grad_norm": 11.698649406433105, "learning_rate": 2.3600000000000003e-06, "loss": 1.6043, "step": 30940 }, { "epoch": 2.58, "grad_norm": 3.175755262374878, "learning_rate": 2.3542028985507247e-06, "loss": 1.5744, "step": 30950 }, { "epoch": 2.58, "grad_norm": 0.9357259273529053, "learning_rate": 2.3484057971014496e-06, "loss": 1.5971, "step": 30960 }, { "epoch": 2.58, "grad_norm": 3.1497623920440674, "learning_rate": 2.342608695652174e-06, "loss": 1.5522, "step": 30970 }, { "epoch": 2.58, "grad_norm": 3.6525659561157227, "learning_rate": 2.336811594202899e-06, "loss": 1.6543, "step": 30980 }, { "epoch": 2.58, "grad_norm": 11.570157051086426, "learning_rate": 2.3310144927536237e-06, "loss": 1.5808, "step": 30990 }, { "epoch": 2.58, "grad_norm": 7.399476528167725, "learning_rate": 2.325217391304348e-06, "loss": 1.701, "step": 31000 }, { "epoch": 2.58, "eval_loss": 1.6337002515792847, "eval_runtime": 107.5309, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 31000 }, { "epoch": 2.58, "grad_norm": 4.558446407318115, "learning_rate": 2.3194202898550725e-06, "loss": 1.6179, "step": 31010 }, { "epoch": 2.58, "grad_norm": 2.1192896366119385, "learning_rate": 2.3142028985507247e-06, "loss": 1.4463, "step": 31020 }, { "epoch": 2.59, "grad_norm": 1.965945839881897, "learning_rate": 2.3084057971014495e-06, "loss": 1.6299, "step": 31030 }, { "epoch": 2.59, "grad_norm": 3.286752223968506, "learning_rate": 2.302608695652174e-06, "loss": 1.5018, "step": 31040 }, { "epoch": 2.59, "grad_norm": 1.3863645792007446, "learning_rate": 2.296811594202899e-06, "loss": 1.6844, "step": 31050 }, { "epoch": 2.59, "grad_norm": 13.46044921875, "learning_rate": 2.2910144927536237e-06, "loss": 1.6116, "step": 31060 }, { "epoch": 2.59, "grad_norm": 7.819109916687012, "learning_rate": 2.285217391304348e-06, "loss": 1.5284, "step": 31070 }, { "epoch": 2.59, "grad_norm": 3.452699899673462, "learning_rate": 2.2794202898550725e-06, "loss": 1.6716, "step": 31080 }, { "epoch": 2.59, "grad_norm": 4.367595195770264, "learning_rate": 2.2736231884057974e-06, "loss": 1.4822, "step": 31090 }, { "epoch": 2.59, "grad_norm": 9.822129249572754, "learning_rate": 2.267826086956522e-06, "loss": 1.6761, "step": 31100 }, { "epoch": 2.59, "grad_norm": 3.4133310317993164, "learning_rate": 2.2620289855072466e-06, "loss": 1.655, "step": 31110 }, { "epoch": 2.59, "grad_norm": 6.704021453857422, "learning_rate": 2.256231884057971e-06, "loss": 1.531, "step": 31120 }, { "epoch": 2.59, "grad_norm": 6.551307201385498, "learning_rate": 2.250434782608696e-06, "loss": 1.6892, "step": 31130 }, { "epoch": 2.59, "grad_norm": 4.172097682952881, "learning_rate": 2.2446376811594208e-06, "loss": 1.6129, "step": 31140 }, { "epoch": 2.6, "grad_norm": 7.622472286224365, "learning_rate": 2.238840579710145e-06, "loss": 1.5993, "step": 31150 }, { "epoch": 2.6, "grad_norm": 3.656231641769409, "learning_rate": 2.2330434782608696e-06, "loss": 1.6874, "step": 31160 }, { "epoch": 2.6, "grad_norm": 2.335791826248169, "learning_rate": 2.2272463768115945e-06, "loss": 1.5399, "step": 31170 }, { "epoch": 2.6, "grad_norm": 0.9562406539916992, "learning_rate": 2.221449275362319e-06, "loss": 1.7015, "step": 31180 }, { "epoch": 2.6, "grad_norm": 5.210826873779297, "learning_rate": 2.2156521739130438e-06, "loss": 1.5114, "step": 31190 }, { "epoch": 2.6, "grad_norm": 9.8193998336792, "learning_rate": 2.209855072463768e-06, "loss": 1.509, "step": 31200 }, { "epoch": 2.6, "grad_norm": 4.946864128112793, "learning_rate": 2.204057971014493e-06, "loss": 1.6648, "step": 31210 }, { "epoch": 2.6, "grad_norm": 4.112027168273926, "learning_rate": 2.1982608695652175e-06, "loss": 1.6629, "step": 31220 }, { "epoch": 2.6, "grad_norm": 1.7467082738876343, "learning_rate": 2.1924637681159423e-06, "loss": 1.637, "step": 31230 }, { "epoch": 2.6, "grad_norm": 5.126824855804443, "learning_rate": 2.1866666666666668e-06, "loss": 1.6365, "step": 31240 }, { "epoch": 2.6, "grad_norm": 7.7713398933410645, "learning_rate": 2.180869565217391e-06, "loss": 1.4703, "step": 31250 }, { "epoch": 2.6, "grad_norm": 5.900889873504639, "learning_rate": 2.175072463768116e-06, "loss": 1.5412, "step": 31260 }, { "epoch": 2.61, "grad_norm": 3.582507848739624, "learning_rate": 2.169275362318841e-06, "loss": 1.51, "step": 31270 }, { "epoch": 2.61, "grad_norm": 4.354831218719482, "learning_rate": 2.1634782608695653e-06, "loss": 1.7147, "step": 31280 }, { "epoch": 2.61, "grad_norm": 5.068592071533203, "learning_rate": 2.15768115942029e-06, "loss": 1.5849, "step": 31290 }, { "epoch": 2.61, "grad_norm": 3.196958065032959, "learning_rate": 2.1518840579710146e-06, "loss": 1.6493, "step": 31300 }, { "epoch": 2.61, "grad_norm": 6.429135322570801, "learning_rate": 2.1460869565217395e-06, "loss": 1.5347, "step": 31310 }, { "epoch": 2.61, "grad_norm": 5.728844165802002, "learning_rate": 2.140289855072464e-06, "loss": 1.5515, "step": 31320 }, { "epoch": 2.61, "grad_norm": 13.872916221618652, "learning_rate": 2.1344927536231883e-06, "loss": 1.602, "step": 31330 }, { "epoch": 2.61, "grad_norm": 4.392234802246094, "learning_rate": 2.128695652173913e-06, "loss": 1.7038, "step": 31340 }, { "epoch": 2.61, "grad_norm": 6.907491683959961, "learning_rate": 2.1228985507246376e-06, "loss": 1.5619, "step": 31350 }, { "epoch": 2.61, "grad_norm": 5.335078239440918, "learning_rate": 2.1171014492753625e-06, "loss": 1.5478, "step": 31360 }, { "epoch": 2.61, "grad_norm": 1.8956576585769653, "learning_rate": 2.1113043478260873e-06, "loss": 1.6199, "step": 31370 }, { "epoch": 2.62, "grad_norm": 12.366223335266113, "learning_rate": 2.1055072463768117e-06, "loss": 1.4663, "step": 31380 }, { "epoch": 2.62, "grad_norm": 5.7240118980407715, "learning_rate": 2.0997101449275366e-06, "loss": 1.7283, "step": 31390 }, { "epoch": 2.62, "grad_norm": 3.118774652481079, "learning_rate": 2.093913043478261e-06, "loss": 1.6955, "step": 31400 }, { "epoch": 2.62, "grad_norm": 2.2114875316619873, "learning_rate": 2.0881159420289855e-06, "loss": 1.5495, "step": 31410 }, { "epoch": 2.62, "grad_norm": 2.9266295433044434, "learning_rate": 2.0823188405797103e-06, "loss": 1.5095, "step": 31420 }, { "epoch": 2.62, "grad_norm": 2.1518819332122803, "learning_rate": 2.0765217391304347e-06, "loss": 1.6809, "step": 31430 }, { "epoch": 2.62, "grad_norm": 5.437044620513916, "learning_rate": 2.0707246376811596e-06, "loss": 1.6229, "step": 31440 }, { "epoch": 2.62, "grad_norm": 4.866549491882324, "learning_rate": 2.0649275362318844e-06, "loss": 1.6808, "step": 31450 }, { "epoch": 2.62, "grad_norm": 6.811148166656494, "learning_rate": 2.059130434782609e-06, "loss": 1.4217, "step": 31460 }, { "epoch": 2.62, "grad_norm": 5.14778470993042, "learning_rate": 2.0533333333333337e-06, "loss": 1.6761, "step": 31470 }, { "epoch": 2.62, "grad_norm": 2.273214340209961, "learning_rate": 2.047536231884058e-06, "loss": 1.5311, "step": 31480 }, { "epoch": 2.62, "grad_norm": 0.9031133055686951, "learning_rate": 2.0417391304347826e-06, "loss": 1.7013, "step": 31490 }, { "epoch": 2.62, "grad_norm": 10.49181079864502, "learning_rate": 2.0359420289855074e-06, "loss": 1.6723, "step": 31500 }, { "epoch": 2.62, "eval_loss": 1.6217410564422607, "eval_runtime": 107.5528, "eval_samples_per_second": 9.298, "eval_steps_per_second": 2.324, "step": 31500 }, { "epoch": 2.63, "grad_norm": 2.9454545974731445, "learning_rate": 2.030144927536232e-06, "loss": 1.6591, "step": 31510 }, { "epoch": 2.63, "grad_norm": 9.571122169494629, "learning_rate": 2.0243478260869567e-06, "loss": 1.6785, "step": 31520 }, { "epoch": 2.63, "grad_norm": 7.118087291717529, "learning_rate": 2.018550724637681e-06, "loss": 1.5377, "step": 31530 }, { "epoch": 2.63, "grad_norm": 16.46897315979004, "learning_rate": 2.012753623188406e-06, "loss": 1.5683, "step": 31540 }, { "epoch": 2.63, "grad_norm": 2.300513505935669, "learning_rate": 2.006956521739131e-06, "loss": 1.6441, "step": 31550 }, { "epoch": 2.63, "grad_norm": 1.414223551750183, "learning_rate": 2.0011594202898553e-06, "loss": 1.4831, "step": 31560 }, { "epoch": 2.63, "grad_norm": 2.0008349418640137, "learning_rate": 1.9953623188405797e-06, "loss": 1.6988, "step": 31570 }, { "epoch": 2.63, "grad_norm": 2.3853042125701904, "learning_rate": 1.9895652173913046e-06, "loss": 1.6968, "step": 31580 }, { "epoch": 2.63, "grad_norm": 5.788661956787109, "learning_rate": 1.983768115942029e-06, "loss": 1.7416, "step": 31590 }, { "epoch": 2.63, "grad_norm": 3.9878652095794678, "learning_rate": 1.977971014492754e-06, "loss": 1.733, "step": 31600 }, { "epoch": 2.63, "grad_norm": 2.0681121349334717, "learning_rate": 1.9721739130434783e-06, "loss": 1.5268, "step": 31610 }, { "epoch": 2.63, "grad_norm": 6.360716342926025, "learning_rate": 1.966376811594203e-06, "loss": 1.6611, "step": 31620 }, { "epoch": 2.64, "grad_norm": 1.8834378719329834, "learning_rate": 1.960579710144928e-06, "loss": 1.59, "step": 31630 }, { "epoch": 2.64, "grad_norm": 5.510819911956787, "learning_rate": 1.9547826086956524e-06, "loss": 1.6145, "step": 31640 }, { "epoch": 2.64, "grad_norm": 2.6274240016937256, "learning_rate": 1.948985507246377e-06, "loss": 1.5678, "step": 31650 }, { "epoch": 2.64, "grad_norm": 2.6434683799743652, "learning_rate": 1.9431884057971017e-06, "loss": 1.6534, "step": 31660 }, { "epoch": 2.64, "grad_norm": 5.2217583656311035, "learning_rate": 1.937391304347826e-06, "loss": 1.5893, "step": 31670 }, { "epoch": 2.64, "grad_norm": 6.410333156585693, "learning_rate": 1.931594202898551e-06, "loss": 1.7107, "step": 31680 }, { "epoch": 2.64, "grad_norm": 7.742589473724365, "learning_rate": 1.9257971014492754e-06, "loss": 1.7014, "step": 31690 }, { "epoch": 2.64, "grad_norm": 5.723726272583008, "learning_rate": 1.9200000000000003e-06, "loss": 1.4988, "step": 31700 }, { "epoch": 2.64, "grad_norm": 6.500274181365967, "learning_rate": 1.9142028985507247e-06, "loss": 1.5023, "step": 31710 }, { "epoch": 2.64, "grad_norm": 2.310852527618408, "learning_rate": 1.9084057971014495e-06, "loss": 1.65, "step": 31720 }, { "epoch": 2.64, "grad_norm": 8.708599090576172, "learning_rate": 1.9026086956521742e-06, "loss": 1.6035, "step": 31730 }, { "epoch": 2.65, "grad_norm": 7.076088905334473, "learning_rate": 1.8968115942028986e-06, "loss": 1.5787, "step": 31740 }, { "epoch": 2.65, "grad_norm": 4.023421287536621, "learning_rate": 1.8910144927536235e-06, "loss": 1.6103, "step": 31750 }, { "epoch": 2.65, "grad_norm": 10.380520820617676, "learning_rate": 1.885217391304348e-06, "loss": 1.5645, "step": 31760 }, { "epoch": 2.65, "grad_norm": 12.353821754455566, "learning_rate": 1.8794202898550725e-06, "loss": 1.487, "step": 31770 }, { "epoch": 2.65, "grad_norm": 10.361377716064453, "learning_rate": 1.8736231884057974e-06, "loss": 1.7336, "step": 31780 }, { "epoch": 2.65, "grad_norm": 2.4413888454437256, "learning_rate": 1.8678260869565218e-06, "loss": 1.6445, "step": 31790 }, { "epoch": 2.65, "grad_norm": 5.693152904510498, "learning_rate": 1.8620289855072465e-06, "loss": 1.5021, "step": 31800 }, { "epoch": 2.65, "grad_norm": 27.886554718017578, "learning_rate": 1.8562318840579713e-06, "loss": 1.5851, "step": 31810 }, { "epoch": 2.65, "grad_norm": 0.8801918029785156, "learning_rate": 1.8504347826086957e-06, "loss": 1.6223, "step": 31820 }, { "epoch": 2.65, "grad_norm": 3.8942666053771973, "learning_rate": 1.8446376811594206e-06, "loss": 1.7608, "step": 31830 }, { "epoch": 2.65, "grad_norm": 2.6183860301971436, "learning_rate": 1.838840579710145e-06, "loss": 1.644, "step": 31840 }, { "epoch": 2.65, "grad_norm": 2.7478418350219727, "learning_rate": 1.8330434782608697e-06, "loss": 1.4282, "step": 31850 }, { "epoch": 2.66, "grad_norm": 3.082428216934204, "learning_rate": 1.8272463768115945e-06, "loss": 1.6814, "step": 31860 }, { "epoch": 2.66, "grad_norm": 4.891972064971924, "learning_rate": 1.821449275362319e-06, "loss": 1.5985, "step": 31870 }, { "epoch": 2.66, "grad_norm": 10.067206382751465, "learning_rate": 1.8156521739130436e-06, "loss": 1.411, "step": 31880 }, { "epoch": 2.66, "grad_norm": 7.1874613761901855, "learning_rate": 1.8098550724637682e-06, "loss": 1.5593, "step": 31890 }, { "epoch": 2.66, "grad_norm": 2.310758590698242, "learning_rate": 1.8040579710144929e-06, "loss": 1.5513, "step": 31900 }, { "epoch": 2.66, "grad_norm": 3.2715983390808105, "learning_rate": 1.7982608695652177e-06, "loss": 1.7019, "step": 31910 }, { "epoch": 2.66, "grad_norm": 8.090458869934082, "learning_rate": 1.7924637681159421e-06, "loss": 1.6117, "step": 31920 }, { "epoch": 2.66, "grad_norm": 12.502359390258789, "learning_rate": 1.7866666666666668e-06, "loss": 1.4899, "step": 31930 }, { "epoch": 2.66, "grad_norm": 13.158916473388672, "learning_rate": 1.7808695652173916e-06, "loss": 1.7233, "step": 31940 }, { "epoch": 2.66, "grad_norm": 15.319182395935059, "learning_rate": 1.775072463768116e-06, "loss": 1.5413, "step": 31950 }, { "epoch": 2.66, "grad_norm": 4.916179180145264, "learning_rate": 1.7692753623188407e-06, "loss": 1.6006, "step": 31960 }, { "epoch": 2.66, "grad_norm": 8.421045303344727, "learning_rate": 1.7634782608695653e-06, "loss": 1.5972, "step": 31970 }, { "epoch": 2.67, "grad_norm": 2.742832660675049, "learning_rate": 1.75768115942029e-06, "loss": 1.6625, "step": 31980 }, { "epoch": 2.67, "grad_norm": 1.5600188970565796, "learning_rate": 1.7518840579710148e-06, "loss": 1.647, "step": 31990 }, { "epoch": 2.67, "grad_norm": 3.127601385116577, "learning_rate": 1.7460869565217393e-06, "loss": 1.5884, "step": 32000 }, { "epoch": 2.67, "eval_loss": 1.6017138957977295, "eval_runtime": 107.5185, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 32000 }, { "epoch": 2.67, "grad_norm": 8.60046672821045, "learning_rate": 1.740289855072464e-06, "loss": 1.5752, "step": 32010 }, { "epoch": 2.67, "grad_norm": 14.514521598815918, "learning_rate": 1.7344927536231883e-06, "loss": 1.6261, "step": 32020 }, { "epoch": 2.67, "grad_norm": 8.017278671264648, "learning_rate": 1.7286956521739132e-06, "loss": 1.6809, "step": 32030 }, { "epoch": 2.67, "grad_norm": 1.6506872177124023, "learning_rate": 1.7228985507246378e-06, "loss": 1.6235, "step": 32040 }, { "epoch": 2.67, "grad_norm": 5.44380521774292, "learning_rate": 1.7171014492753625e-06, "loss": 1.6831, "step": 32050 }, { "epoch": 2.67, "grad_norm": 6.318562984466553, "learning_rate": 1.7113043478260871e-06, "loss": 1.726, "step": 32060 }, { "epoch": 2.67, "grad_norm": 2.484785556793213, "learning_rate": 1.705507246376812e-06, "loss": 1.63, "step": 32070 }, { "epoch": 2.67, "grad_norm": 3.5618247985839844, "learning_rate": 1.6997101449275364e-06, "loss": 1.6426, "step": 32080 }, { "epoch": 2.67, "grad_norm": 1.5634952783584595, "learning_rate": 1.693913043478261e-06, "loss": 1.506, "step": 32090 }, { "epoch": 2.67, "grad_norm": 5.788817882537842, "learning_rate": 1.6881159420289855e-06, "loss": 1.6246, "step": 32100 }, { "epoch": 2.68, "grad_norm": 7.975259780883789, "learning_rate": 1.6823188405797103e-06, "loss": 1.5248, "step": 32110 }, { "epoch": 2.68, "grad_norm": 8.401017189025879, "learning_rate": 1.676521739130435e-06, "loss": 1.5491, "step": 32120 }, { "epoch": 2.68, "grad_norm": 1.3177763223648071, "learning_rate": 1.6707246376811596e-06, "loss": 1.6666, "step": 32130 }, { "epoch": 2.68, "grad_norm": 3.550060749053955, "learning_rate": 1.6649275362318842e-06, "loss": 1.4496, "step": 32140 }, { "epoch": 2.68, "grad_norm": 4.780557155609131, "learning_rate": 1.6591304347826087e-06, "loss": 1.5778, "step": 32150 }, { "epoch": 2.68, "grad_norm": 1.2793478965759277, "learning_rate": 1.6533333333333335e-06, "loss": 1.7047, "step": 32160 }, { "epoch": 2.68, "grad_norm": 2.5238006114959717, "learning_rate": 1.6475362318840582e-06, "loss": 1.6762, "step": 32170 }, { "epoch": 2.68, "grad_norm": 5.78254508972168, "learning_rate": 1.6417391304347826e-06, "loss": 1.473, "step": 32180 }, { "epoch": 2.68, "grad_norm": 3.1378421783447266, "learning_rate": 1.6359420289855074e-06, "loss": 1.5478, "step": 32190 }, { "epoch": 2.68, "grad_norm": 4.800199508666992, "learning_rate": 1.6301449275362319e-06, "loss": 1.7282, "step": 32200 }, { "epoch": 2.68, "grad_norm": 11.791644096374512, "learning_rate": 1.6243478260869565e-06, "loss": 1.6786, "step": 32210 }, { "epoch": 2.69, "grad_norm": 6.185345649719238, "learning_rate": 1.6185507246376814e-06, "loss": 1.5419, "step": 32220 }, { "epoch": 2.69, "grad_norm": 4.825332164764404, "learning_rate": 1.6127536231884058e-06, "loss": 1.7859, "step": 32230 }, { "epoch": 2.69, "grad_norm": 2.289442300796509, "learning_rate": 1.6069565217391307e-06, "loss": 1.7831, "step": 32240 }, { "epoch": 2.69, "grad_norm": 11.469537734985352, "learning_rate": 1.6011594202898553e-06, "loss": 1.591, "step": 32250 }, { "epoch": 2.69, "grad_norm": 2.8224618434906006, "learning_rate": 1.5953623188405797e-06, "loss": 1.7497, "step": 32260 }, { "epoch": 2.69, "grad_norm": 3.616290807723999, "learning_rate": 1.5895652173913046e-06, "loss": 1.6872, "step": 32270 }, { "epoch": 2.69, "grad_norm": 1.3341703414916992, "learning_rate": 1.583768115942029e-06, "loss": 1.593, "step": 32280 }, { "epoch": 2.69, "grad_norm": 8.595649719238281, "learning_rate": 1.5779710144927536e-06, "loss": 1.5657, "step": 32290 }, { "epoch": 2.69, "grad_norm": 2.221670389175415, "learning_rate": 1.5721739130434785e-06, "loss": 1.6259, "step": 32300 }, { "epoch": 2.69, "grad_norm": 2.5493180751800537, "learning_rate": 1.566376811594203e-06, "loss": 1.6628, "step": 32310 }, { "epoch": 2.69, "grad_norm": 1.4357565641403198, "learning_rate": 1.5605797101449278e-06, "loss": 1.5747, "step": 32320 }, { "epoch": 2.69, "grad_norm": 16.11993408203125, "learning_rate": 1.5547826086956522e-06, "loss": 1.4607, "step": 32330 }, { "epoch": 2.69, "grad_norm": 2.597362995147705, "learning_rate": 1.5489855072463769e-06, "loss": 1.5912, "step": 32340 }, { "epoch": 2.7, "grad_norm": 4.872979164123535, "learning_rate": 1.5431884057971017e-06, "loss": 1.4533, "step": 32350 }, { "epoch": 2.7, "grad_norm": 3.2023985385894775, "learning_rate": 1.5373913043478261e-06, "loss": 1.5058, "step": 32360 }, { "epoch": 2.7, "grad_norm": 4.435876369476318, "learning_rate": 1.5315942028985508e-06, "loss": 1.5454, "step": 32370 }, { "epoch": 2.7, "grad_norm": 2.7985036373138428, "learning_rate": 1.5257971014492756e-06, "loss": 1.6741, "step": 32380 }, { "epoch": 2.7, "grad_norm": 6.3253607749938965, "learning_rate": 1.52e-06, "loss": 1.7431, "step": 32390 }, { "epoch": 2.7, "grad_norm": 1.4724886417388916, "learning_rate": 1.514202898550725e-06, "loss": 1.6594, "step": 32400 }, { "epoch": 2.7, "grad_norm": 2.5811171531677246, "learning_rate": 1.5084057971014493e-06, "loss": 1.5847, "step": 32410 }, { "epoch": 2.7, "grad_norm": 9.18812084197998, "learning_rate": 1.502608695652174e-06, "loss": 1.6295, "step": 32420 }, { "epoch": 2.7, "grad_norm": 9.978080749511719, "learning_rate": 1.4968115942028988e-06, "loss": 1.5189, "step": 32430 }, { "epoch": 2.7, "grad_norm": 2.765017509460449, "learning_rate": 1.4910144927536233e-06, "loss": 1.6167, "step": 32440 }, { "epoch": 2.7, "grad_norm": 3.3332769870758057, "learning_rate": 1.485217391304348e-06, "loss": 1.6536, "step": 32450 }, { "epoch": 2.71, "grad_norm": 7.815036773681641, "learning_rate": 1.4794202898550725e-06, "loss": 1.6844, "step": 32460 }, { "epoch": 2.71, "grad_norm": 8.239996910095215, "learning_rate": 1.4736231884057972e-06, "loss": 1.6551, "step": 32470 }, { "epoch": 2.71, "grad_norm": 10.380647659301758, "learning_rate": 1.467826086956522e-06, "loss": 1.6455, "step": 32480 }, { "epoch": 2.71, "grad_norm": 14.243300437927246, "learning_rate": 1.4620289855072465e-06, "loss": 1.6603, "step": 32490 }, { "epoch": 2.71, "grad_norm": 2.9138994216918945, "learning_rate": 1.4562318840579711e-06, "loss": 1.56, "step": 32500 }, { "epoch": 2.71, "eval_loss": 1.6212100982666016, "eval_runtime": 107.5288, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 32500 }, { "epoch": 2.71, "grad_norm": 10.24506664276123, "learning_rate": 1.4504347826086955e-06, "loss": 1.6814, "step": 32510 }, { "epoch": 2.71, "grad_norm": 12.093935012817383, "learning_rate": 1.4446376811594204e-06, "loss": 1.611, "step": 32520 }, { "epoch": 2.71, "grad_norm": 5.0004658699035645, "learning_rate": 1.438840579710145e-06, "loss": 1.5318, "step": 32530 }, { "epoch": 2.71, "grad_norm": 3.960986614227295, "learning_rate": 1.4330434782608697e-06, "loss": 1.6179, "step": 32540 }, { "epoch": 2.71, "grad_norm": 3.640979290008545, "learning_rate": 1.4272463768115943e-06, "loss": 1.6522, "step": 32550 }, { "epoch": 2.71, "grad_norm": 3.157996892929077, "learning_rate": 1.4214492753623192e-06, "loss": 1.6118, "step": 32560 }, { "epoch": 2.71, "grad_norm": 7.887393474578857, "learning_rate": 1.4156521739130436e-06, "loss": 1.6207, "step": 32570 }, { "epoch": 2.71, "grad_norm": 11.028023719787598, "learning_rate": 1.4098550724637682e-06, "loss": 1.5071, "step": 32580 }, { "epoch": 2.72, "grad_norm": 2.2174723148345947, "learning_rate": 1.4040579710144927e-06, "loss": 1.6467, "step": 32590 }, { "epoch": 2.72, "grad_norm": 2.945136785507202, "learning_rate": 1.3982608695652175e-06, "loss": 1.6025, "step": 32600 }, { "epoch": 2.72, "grad_norm": 2.605086326599121, "learning_rate": 1.3924637681159422e-06, "loss": 1.5723, "step": 32610 }, { "epoch": 2.72, "grad_norm": 1.6031261682510376, "learning_rate": 1.3866666666666668e-06, "loss": 1.6667, "step": 32620 }, { "epoch": 2.72, "grad_norm": 10.576859474182129, "learning_rate": 1.3808695652173914e-06, "loss": 1.5214, "step": 32630 }, { "epoch": 2.72, "grad_norm": 3.951624870300293, "learning_rate": 1.3750724637681159e-06, "loss": 1.4939, "step": 32640 }, { "epoch": 2.72, "grad_norm": 7.097296714782715, "learning_rate": 1.3692753623188407e-06, "loss": 1.4366, "step": 32650 }, { "epoch": 2.72, "grad_norm": 2.388517379760742, "learning_rate": 1.3634782608695654e-06, "loss": 1.5237, "step": 32660 }, { "epoch": 2.72, "grad_norm": 7.370208263397217, "learning_rate": 1.3576811594202898e-06, "loss": 1.7699, "step": 32670 }, { "epoch": 2.72, "grad_norm": 3.332535982131958, "learning_rate": 1.3518840579710146e-06, "loss": 1.6375, "step": 32680 }, { "epoch": 2.72, "grad_norm": 1.344327449798584, "learning_rate": 1.3460869565217393e-06, "loss": 1.4128, "step": 32690 }, { "epoch": 2.73, "grad_norm": 9.12246036529541, "learning_rate": 1.340289855072464e-06, "loss": 1.4211, "step": 32700 }, { "epoch": 2.73, "grad_norm": 2.78950572013855, "learning_rate": 1.3344927536231886e-06, "loss": 1.6012, "step": 32710 }, { "epoch": 2.73, "grad_norm": 3.5925843715667725, "learning_rate": 1.328695652173913e-06, "loss": 1.5014, "step": 32720 }, { "epoch": 2.73, "grad_norm": 1.6471080780029297, "learning_rate": 1.3228985507246379e-06, "loss": 1.6523, "step": 32730 }, { "epoch": 2.73, "grad_norm": 12.4561128616333, "learning_rate": 1.3171014492753625e-06, "loss": 1.6558, "step": 32740 }, { "epoch": 2.73, "grad_norm": 2.051928997039795, "learning_rate": 1.311304347826087e-06, "loss": 1.5797, "step": 32750 }, { "epoch": 2.73, "grad_norm": 6.791071891784668, "learning_rate": 1.3055072463768118e-06, "loss": 1.5058, "step": 32760 }, { "epoch": 2.73, "grad_norm": 4.921985149383545, "learning_rate": 1.2997101449275362e-06, "loss": 1.727, "step": 32770 }, { "epoch": 2.73, "grad_norm": 3.559007406234741, "learning_rate": 1.293913043478261e-06, "loss": 1.5479, "step": 32780 }, { "epoch": 2.73, "grad_norm": 7.684724807739258, "learning_rate": 1.2881159420289857e-06, "loss": 1.5711, "step": 32790 }, { "epoch": 2.73, "grad_norm": 6.558244228363037, "learning_rate": 1.2823188405797101e-06, "loss": 1.5848, "step": 32800 }, { "epoch": 2.73, "grad_norm": 4.056134223937988, "learning_rate": 1.276521739130435e-06, "loss": 1.6693, "step": 32810 }, { "epoch": 2.73, "grad_norm": 16.4197998046875, "learning_rate": 1.2707246376811594e-06, "loss": 1.5769, "step": 32820 }, { "epoch": 2.74, "grad_norm": 4.05332612991333, "learning_rate": 1.264927536231884e-06, "loss": 1.6521, "step": 32830 }, { "epoch": 2.74, "grad_norm": 10.956727027893066, "learning_rate": 1.259130434782609e-06, "loss": 1.596, "step": 32840 }, { "epoch": 2.74, "grad_norm": 4.413317680358887, "learning_rate": 1.2533333333333333e-06, "loss": 1.496, "step": 32850 }, { "epoch": 2.74, "grad_norm": 3.767711639404297, "learning_rate": 1.2475362318840582e-06, "loss": 1.5371, "step": 32860 }, { "epoch": 2.74, "grad_norm": 5.680372714996338, "learning_rate": 1.2417391304347826e-06, "loss": 1.531, "step": 32870 }, { "epoch": 2.74, "grad_norm": 1.4916908740997314, "learning_rate": 1.2359420289855073e-06, "loss": 1.6898, "step": 32880 }, { "epoch": 2.74, "grad_norm": 9.154963493347168, "learning_rate": 1.2301449275362321e-06, "loss": 1.4688, "step": 32890 }, { "epoch": 2.74, "grad_norm": 2.380986213684082, "learning_rate": 1.2243478260869567e-06, "loss": 1.7119, "step": 32900 }, { "epoch": 2.74, "grad_norm": 10.677800178527832, "learning_rate": 1.2185507246376812e-06, "loss": 1.616, "step": 32910 }, { "epoch": 2.74, "grad_norm": 7.30240535736084, "learning_rate": 1.2127536231884058e-06, "loss": 1.5754, "step": 32920 }, { "epoch": 2.74, "grad_norm": 9.45409870147705, "learning_rate": 1.2069565217391305e-06, "loss": 1.6461, "step": 32930 }, { "epoch": 2.75, "grad_norm": 6.102117538452148, "learning_rate": 1.2011594202898553e-06, "loss": 1.4394, "step": 32940 }, { "epoch": 2.75, "grad_norm": 6.18507194519043, "learning_rate": 1.1953623188405797e-06, "loss": 1.6139, "step": 32950 }, { "epoch": 2.75, "grad_norm": 7.034895896911621, "learning_rate": 1.1895652173913044e-06, "loss": 1.571, "step": 32960 }, { "epoch": 2.75, "grad_norm": 1.3450069427490234, "learning_rate": 1.183768115942029e-06, "loss": 1.4893, "step": 32970 }, { "epoch": 2.75, "grad_norm": 4.201149940490723, "learning_rate": 1.1779710144927539e-06, "loss": 1.4029, "step": 32980 }, { "epoch": 2.75, "grad_norm": 1.9617148637771606, "learning_rate": 1.1721739130434783e-06, "loss": 1.6637, "step": 32990 }, { "epoch": 2.75, "grad_norm": 1.6870609521865845, "learning_rate": 1.166376811594203e-06, "loss": 1.5653, "step": 33000 }, { "epoch": 2.75, "eval_loss": 1.617643117904663, "eval_runtime": 107.5322, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 33000 }, { "epoch": 2.75, "grad_norm": 2.035808563232422, "learning_rate": 1.1605797101449276e-06, "loss": 1.5895, "step": 33010 }, { "epoch": 2.75, "grad_norm": 3.8778083324432373, "learning_rate": 1.1547826086956522e-06, "loss": 1.5143, "step": 33020 }, { "epoch": 2.75, "grad_norm": 2.143531322479248, "learning_rate": 1.1489855072463769e-06, "loss": 1.5878, "step": 33030 }, { "epoch": 2.75, "grad_norm": 2.5766236782073975, "learning_rate": 1.1431884057971015e-06, "loss": 1.5166, "step": 33040 }, { "epoch": 2.75, "grad_norm": 3.0880119800567627, "learning_rate": 1.1373913043478262e-06, "loss": 1.5383, "step": 33050 }, { "epoch": 2.75, "grad_norm": 6.4174723625183105, "learning_rate": 1.1315942028985508e-06, "loss": 1.6729, "step": 33060 }, { "epoch": 2.76, "grad_norm": 2.1581320762634277, "learning_rate": 1.1257971014492754e-06, "loss": 1.7279, "step": 33070 }, { "epoch": 2.76, "grad_norm": 4.984561443328857, "learning_rate": 1.12e-06, "loss": 1.651, "step": 33080 }, { "epoch": 2.76, "grad_norm": 7.201277732849121, "learning_rate": 1.1142028985507247e-06, "loss": 1.6384, "step": 33090 }, { "epoch": 2.76, "grad_norm": 2.131202459335327, "learning_rate": 1.1084057971014494e-06, "loss": 1.5721, "step": 33100 }, { "epoch": 2.76, "grad_norm": 10.104864120483398, "learning_rate": 1.102608695652174e-06, "loss": 1.7424, "step": 33110 }, { "epoch": 2.76, "grad_norm": 6.0055012702941895, "learning_rate": 1.0968115942028986e-06, "loss": 1.5323, "step": 33120 }, { "epoch": 2.76, "grad_norm": 5.302622318267822, "learning_rate": 1.0910144927536233e-06, "loss": 1.4631, "step": 33130 }, { "epoch": 2.76, "grad_norm": 7.973222732543945, "learning_rate": 1.085217391304348e-06, "loss": 1.7163, "step": 33140 }, { "epoch": 2.76, "grad_norm": 9.29218578338623, "learning_rate": 1.0794202898550726e-06, "loss": 1.5899, "step": 33150 }, { "epoch": 2.76, "grad_norm": 9.235881805419922, "learning_rate": 1.0736231884057972e-06, "loss": 1.612, "step": 33160 }, { "epoch": 2.76, "grad_norm": 11.965916633605957, "learning_rate": 1.0678260869565218e-06, "loss": 1.602, "step": 33170 }, { "epoch": 2.77, "grad_norm": 10.200899124145508, "learning_rate": 1.0620289855072465e-06, "loss": 1.5964, "step": 33180 }, { "epoch": 2.77, "grad_norm": 10.006152153015137, "learning_rate": 1.0562318840579711e-06, "loss": 1.541, "step": 33190 }, { "epoch": 2.77, "grad_norm": 2.9720447063446045, "learning_rate": 1.0504347826086958e-06, "loss": 1.5875, "step": 33200 }, { "epoch": 2.77, "grad_norm": 2.2912840843200684, "learning_rate": 1.0446376811594204e-06, "loss": 1.5263, "step": 33210 }, { "epoch": 2.77, "grad_norm": 2.361422538757324, "learning_rate": 1.038840579710145e-06, "loss": 1.6695, "step": 33220 }, { "epoch": 2.77, "grad_norm": 1.8953531980514526, "learning_rate": 1.0330434782608697e-06, "loss": 1.6833, "step": 33230 }, { "epoch": 2.77, "grad_norm": 4.783647537231445, "learning_rate": 1.0272463768115941e-06, "loss": 1.483, "step": 33240 }, { "epoch": 2.77, "grad_norm": 4.266748905181885, "learning_rate": 1.021449275362319e-06, "loss": 1.5473, "step": 33250 }, { "epoch": 2.77, "grad_norm": 5.104844093322754, "learning_rate": 1.0156521739130436e-06, "loss": 1.5915, "step": 33260 }, { "epoch": 2.77, "grad_norm": 2.9186527729034424, "learning_rate": 1.0098550724637683e-06, "loss": 1.6114, "step": 33270 }, { "epoch": 2.77, "grad_norm": 7.935939311981201, "learning_rate": 1.0040579710144927e-06, "loss": 1.4601, "step": 33280 }, { "epoch": 2.77, "grad_norm": 5.332785129547119, "learning_rate": 9.982608695652175e-07, "loss": 1.7318, "step": 33290 }, { "epoch": 2.77, "grad_norm": 7.0045485496521, "learning_rate": 9.924637681159422e-07, "loss": 1.5863, "step": 33300 }, { "epoch": 2.78, "grad_norm": 10.329780578613281, "learning_rate": 9.866666666666668e-07, "loss": 1.5792, "step": 33310 }, { "epoch": 2.78, "grad_norm": 3.1837329864501953, "learning_rate": 9.808695652173912e-07, "loss": 1.4957, "step": 33320 }, { "epoch": 2.78, "grad_norm": 14.454916954040527, "learning_rate": 9.750724637681159e-07, "loss": 1.6237, "step": 33330 }, { "epoch": 2.78, "grad_norm": 2.9528446197509766, "learning_rate": 9.692753623188407e-07, "loss": 1.6035, "step": 33340 }, { "epoch": 2.78, "grad_norm": 2.9662082195281982, "learning_rate": 9.634782608695654e-07, "loss": 1.5402, "step": 33350 }, { "epoch": 2.78, "grad_norm": 4.538402557373047, "learning_rate": 9.576811594202898e-07, "loss": 1.6336, "step": 33360 }, { "epoch": 2.78, "grad_norm": 5.312148571014404, "learning_rate": 9.518840579710146e-07, "loss": 1.5634, "step": 33370 }, { "epoch": 2.78, "grad_norm": 4.3434953689575195, "learning_rate": 9.460869565217393e-07, "loss": 1.587, "step": 33380 }, { "epoch": 2.78, "grad_norm": 4.069890975952148, "learning_rate": 9.402898550724638e-07, "loss": 1.5212, "step": 33390 }, { "epoch": 2.78, "grad_norm": 19.578651428222656, "learning_rate": 9.344927536231885e-07, "loss": 1.6608, "step": 33400 }, { "epoch": 2.78, "grad_norm": 5.462759494781494, "learning_rate": 9.286956521739131e-07, "loss": 1.6739, "step": 33410 }, { "epoch": 2.79, "grad_norm": 6.227196216583252, "learning_rate": 9.228985507246377e-07, "loss": 1.6326, "step": 33420 }, { "epoch": 2.79, "grad_norm": 2.722791910171509, "learning_rate": 9.171014492753624e-07, "loss": 1.6126, "step": 33430 }, { "epoch": 2.79, "grad_norm": 15.881592750549316, "learning_rate": 9.11304347826087e-07, "loss": 1.7239, "step": 33440 }, { "epoch": 2.79, "grad_norm": 6.390254020690918, "learning_rate": 9.055072463768117e-07, "loss": 1.6419, "step": 33450 }, { "epoch": 2.79, "grad_norm": 3.134054660797119, "learning_rate": 8.997101449275362e-07, "loss": 1.6066, "step": 33460 }, { "epoch": 2.79, "grad_norm": 5.273614406585693, "learning_rate": 8.93913043478261e-07, "loss": 1.6421, "step": 33470 }, { "epoch": 2.79, "grad_norm": 4.4810967445373535, "learning_rate": 8.881159420289856e-07, "loss": 1.705, "step": 33480 }, { "epoch": 2.79, "grad_norm": 7.226357460021973, "learning_rate": 8.823188405797103e-07, "loss": 1.5615, "step": 33490 }, { "epoch": 2.79, "grad_norm": 3.2726452350616455, "learning_rate": 8.765217391304348e-07, "loss": 1.713, "step": 33500 }, { "epoch": 2.79, "eval_loss": 1.6301745176315308, "eval_runtime": 107.5184, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 33500 }, { "epoch": 2.79, "grad_norm": 9.75317096710205, "learning_rate": 8.707246376811595e-07, "loss": 1.6305, "step": 33510 }, { "epoch": 2.79, "grad_norm": 8.24269962310791, "learning_rate": 8.649275362318842e-07, "loss": 1.6059, "step": 33520 }, { "epoch": 2.79, "grad_norm": 7.880953788757324, "learning_rate": 8.591304347826088e-07, "loss": 1.6878, "step": 33530 }, { "epoch": 2.79, "grad_norm": 3.4583754539489746, "learning_rate": 8.533333333333334e-07, "loss": 1.7179, "step": 33540 }, { "epoch": 2.8, "grad_norm": 5.8554558753967285, "learning_rate": 8.47536231884058e-07, "loss": 1.535, "step": 33550 }, { "epoch": 2.8, "grad_norm": 5.77476167678833, "learning_rate": 8.417391304347827e-07, "loss": 1.6241, "step": 33560 }, { "epoch": 2.8, "grad_norm": 1.786972999572754, "learning_rate": 8.359420289855074e-07, "loss": 1.6576, "step": 33570 }, { "epoch": 2.8, "grad_norm": 3.2922704219818115, "learning_rate": 8.301449275362319e-07, "loss": 1.7354, "step": 33580 }, { "epoch": 2.8, "grad_norm": 10.593795776367188, "learning_rate": 8.243478260869566e-07, "loss": 1.5815, "step": 33590 }, { "epoch": 2.8, "grad_norm": 3.0302324295043945, "learning_rate": 8.185507246376813e-07, "loss": 1.6433, "step": 33600 }, { "epoch": 2.8, "grad_norm": 10.300827026367188, "learning_rate": 8.127536231884059e-07, "loss": 1.613, "step": 33610 }, { "epoch": 2.8, "grad_norm": 4.340428352355957, "learning_rate": 8.069565217391305e-07, "loss": 1.5346, "step": 33620 }, { "epoch": 2.8, "grad_norm": 6.048191547393799, "learning_rate": 8.011594202898551e-07, "loss": 1.5197, "step": 33630 }, { "epoch": 2.8, "grad_norm": 1.3901071548461914, "learning_rate": 7.953623188405798e-07, "loss": 1.4772, "step": 33640 }, { "epoch": 2.8, "grad_norm": 2.204463481903076, "learning_rate": 7.895652173913045e-07, "loss": 1.6636, "step": 33650 }, { "epoch": 2.81, "grad_norm": 3.664523124694824, "learning_rate": 7.83768115942029e-07, "loss": 1.5472, "step": 33660 }, { "epoch": 2.81, "grad_norm": 6.5395827293396, "learning_rate": 7.779710144927537e-07, "loss": 1.6717, "step": 33670 }, { "epoch": 2.81, "grad_norm": 4.011569976806641, "learning_rate": 7.721739130434783e-07, "loss": 1.7751, "step": 33680 }, { "epoch": 2.81, "grad_norm": 6.7899556159973145, "learning_rate": 7.66376811594203e-07, "loss": 1.7148, "step": 33690 }, { "epoch": 2.81, "grad_norm": 5.4738054275512695, "learning_rate": 7.605797101449276e-07, "loss": 1.4787, "step": 33700 }, { "epoch": 2.81, "grad_norm": 8.042850494384766, "learning_rate": 7.547826086956522e-07, "loss": 1.675, "step": 33710 }, { "epoch": 2.81, "grad_norm": 13.439699172973633, "learning_rate": 7.489855072463768e-07, "loss": 1.6719, "step": 33720 }, { "epoch": 2.81, "grad_norm": 3.596892833709717, "learning_rate": 7.431884057971014e-07, "loss": 1.6386, "step": 33730 }, { "epoch": 2.81, "grad_norm": 4.695061683654785, "learning_rate": 7.373913043478262e-07, "loss": 1.6444, "step": 33740 }, { "epoch": 2.81, "grad_norm": 3.027362108230591, "learning_rate": 7.315942028985508e-07, "loss": 1.6984, "step": 33750 }, { "epoch": 2.81, "grad_norm": 1.8642385005950928, "learning_rate": 7.257971014492753e-07, "loss": 1.5937, "step": 33760 }, { "epoch": 2.81, "grad_norm": 2.800645112991333, "learning_rate": 7.2e-07, "loss": 1.5741, "step": 33770 }, { "epoch": 2.81, "grad_norm": 10.626157760620117, "learning_rate": 7.142028985507247e-07, "loss": 1.5336, "step": 33780 }, { "epoch": 2.82, "grad_norm": 9.157116889953613, "learning_rate": 7.084057971014494e-07, "loss": 1.5535, "step": 33790 }, { "epoch": 2.82, "grad_norm": 7.302629470825195, "learning_rate": 7.026086956521739e-07, "loss": 1.5688, "step": 33800 }, { "epoch": 2.82, "grad_norm": 3.5727436542510986, "learning_rate": 6.968115942028986e-07, "loss": 1.6339, "step": 33810 }, { "epoch": 2.82, "grad_norm": 16.979459762573242, "learning_rate": 6.910144927536233e-07, "loss": 1.5552, "step": 33820 }, { "epoch": 2.82, "grad_norm": 3.843989133834839, "learning_rate": 6.852173913043479e-07, "loss": 1.5359, "step": 33830 }, { "epoch": 2.82, "grad_norm": 2.522932767868042, "learning_rate": 6.794202898550725e-07, "loss": 1.7011, "step": 33840 }, { "epoch": 2.82, "grad_norm": 5.601334095001221, "learning_rate": 6.736231884057971e-07, "loss": 1.6746, "step": 33850 }, { "epoch": 2.82, "grad_norm": 3.354949712753296, "learning_rate": 6.678260869565218e-07, "loss": 1.589, "step": 33860 }, { "epoch": 2.82, "grad_norm": 1.3252222537994385, "learning_rate": 6.620289855072465e-07, "loss": 1.5887, "step": 33870 }, { "epoch": 2.82, "grad_norm": 3.7831883430480957, "learning_rate": 6.56231884057971e-07, "loss": 1.5805, "step": 33880 }, { "epoch": 2.82, "grad_norm": 8.868619918823242, "learning_rate": 6.504347826086957e-07, "loss": 1.6854, "step": 33890 }, { "epoch": 2.83, "grad_norm": 7.663196563720703, "learning_rate": 6.446376811594203e-07, "loss": 1.7296, "step": 33900 }, { "epoch": 2.83, "grad_norm": 5.567215919494629, "learning_rate": 6.388405797101451e-07, "loss": 1.5271, "step": 33910 }, { "epoch": 2.83, "grad_norm": 5.213868141174316, "learning_rate": 6.330434782608696e-07, "loss": 1.4958, "step": 33920 }, { "epoch": 2.83, "grad_norm": 2.9527573585510254, "learning_rate": 6.272463768115942e-07, "loss": 1.5046, "step": 33930 }, { "epoch": 2.83, "grad_norm": 5.8709187507629395, "learning_rate": 6.214492753623189e-07, "loss": 1.5993, "step": 33940 }, { "epoch": 2.83, "grad_norm": 17.194610595703125, "learning_rate": 6.156521739130435e-07, "loss": 1.6132, "step": 33950 }, { "epoch": 2.83, "grad_norm": 6.302470684051514, "learning_rate": 6.098550724637682e-07, "loss": 1.6321, "step": 33960 }, { "epoch": 2.83, "grad_norm": 1.782871127128601, "learning_rate": 6.040579710144928e-07, "loss": 1.4957, "step": 33970 }, { "epoch": 2.83, "grad_norm": 4.105221748352051, "learning_rate": 5.982608695652174e-07, "loss": 1.6038, "step": 33980 }, { "epoch": 2.83, "grad_norm": 3.2038047313690186, "learning_rate": 5.924637681159421e-07, "loss": 1.454, "step": 33990 }, { "epoch": 2.83, "grad_norm": 11.589030265808105, "learning_rate": 5.866666666666667e-07, "loss": 1.583, "step": 34000 }, { "epoch": 2.83, "eval_loss": 1.6165950298309326, "eval_runtime": 107.5234, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 34000 }, { "epoch": 2.83, "grad_norm": 5.653379917144775, "learning_rate": 5.808695652173914e-07, "loss": 1.5756, "step": 34010 }, { "epoch": 2.83, "grad_norm": 4.994700908660889, "learning_rate": 5.75072463768116e-07, "loss": 1.6044, "step": 34020 }, { "epoch": 2.84, "grad_norm": 5.27285099029541, "learning_rate": 5.692753623188407e-07, "loss": 1.5561, "step": 34030 }, { "epoch": 2.84, "grad_norm": 3.3184444904327393, "learning_rate": 5.634782608695653e-07, "loss": 1.5874, "step": 34040 }, { "epoch": 2.84, "grad_norm": 3.3386433124542236, "learning_rate": 5.576811594202898e-07, "loss": 1.6027, "step": 34050 }, { "epoch": 2.84, "grad_norm": 9.001840591430664, "learning_rate": 5.518840579710146e-07, "loss": 1.5911, "step": 34060 }, { "epoch": 2.84, "grad_norm": 11.12852954864502, "learning_rate": 5.460869565217391e-07, "loss": 1.5201, "step": 34070 }, { "epoch": 2.84, "grad_norm": 5.407654285430908, "learning_rate": 5.402898550724639e-07, "loss": 1.5665, "step": 34080 }, { "epoch": 2.84, "grad_norm": 4.4106831550598145, "learning_rate": 5.344927536231884e-07, "loss": 1.7836, "step": 34090 }, { "epoch": 2.84, "grad_norm": 8.831493377685547, "learning_rate": 5.286956521739131e-07, "loss": 1.5497, "step": 34100 }, { "epoch": 2.84, "grad_norm": 3.7488794326782227, "learning_rate": 5.228985507246377e-07, "loss": 1.6616, "step": 34110 }, { "epoch": 2.84, "grad_norm": 0.5988351702690125, "learning_rate": 5.171014492753624e-07, "loss": 1.4637, "step": 34120 }, { "epoch": 2.84, "grad_norm": 7.661751747131348, "learning_rate": 5.11304347826087e-07, "loss": 1.6349, "step": 34130 }, { "epoch": 2.84, "grad_norm": 1.7503852844238281, "learning_rate": 5.055072463768116e-07, "loss": 1.7236, "step": 34140 }, { "epoch": 2.85, "grad_norm": 4.78981876373291, "learning_rate": 4.997101449275362e-07, "loss": 1.6712, "step": 34150 }, { "epoch": 2.85, "grad_norm": 3.7767815589904785, "learning_rate": 4.939130434782609e-07, "loss": 1.5735, "step": 34160 }, { "epoch": 2.85, "grad_norm": 8.948434829711914, "learning_rate": 4.881159420289855e-07, "loss": 1.4895, "step": 34170 }, { "epoch": 2.85, "grad_norm": 5.406666278839111, "learning_rate": 4.823188405797102e-07, "loss": 1.5253, "step": 34180 }, { "epoch": 2.85, "grad_norm": 1.939326524734497, "learning_rate": 4.7652173913043486e-07, "loss": 1.6299, "step": 34190 }, { "epoch": 2.85, "grad_norm": 12.841042518615723, "learning_rate": 4.7072463768115945e-07, "loss": 1.6818, "step": 34200 }, { "epoch": 2.85, "grad_norm": 2.7751150131225586, "learning_rate": 4.6492753623188414e-07, "loss": 1.7415, "step": 34210 }, { "epoch": 2.85, "grad_norm": 5.986341953277588, "learning_rate": 4.5913043478260873e-07, "loss": 1.5771, "step": 34220 }, { "epoch": 2.85, "grad_norm": 7.021378517150879, "learning_rate": 4.533333333333334e-07, "loss": 1.4319, "step": 34230 }, { "epoch": 2.85, "grad_norm": 6.669992446899414, "learning_rate": 4.47536231884058e-07, "loss": 1.6399, "step": 34240 }, { "epoch": 2.85, "grad_norm": 1.224604845046997, "learning_rate": 4.417391304347826e-07, "loss": 1.7313, "step": 34250 }, { "epoch": 2.85, "grad_norm": 4.330286026000977, "learning_rate": 4.359420289855073e-07, "loss": 1.658, "step": 34260 }, { "epoch": 2.86, "grad_norm": 1.8469319343566895, "learning_rate": 4.301449275362319e-07, "loss": 1.5495, "step": 34270 }, { "epoch": 2.86, "grad_norm": 1.5525469779968262, "learning_rate": 4.243478260869566e-07, "loss": 1.5885, "step": 34280 }, { "epoch": 2.86, "grad_norm": 3.59576416015625, "learning_rate": 4.1855072463768116e-07, "loss": 1.6725, "step": 34290 }, { "epoch": 2.86, "grad_norm": 14.655567169189453, "learning_rate": 4.1275362318840586e-07, "loss": 1.4917, "step": 34300 }, { "epoch": 2.86, "grad_norm": 4.645742416381836, "learning_rate": 4.0695652173913044e-07, "loss": 1.5779, "step": 34310 }, { "epoch": 2.86, "grad_norm": 2.458324670791626, "learning_rate": 4.0115942028985514e-07, "loss": 1.6004, "step": 34320 }, { "epoch": 2.86, "grad_norm": 4.773531913757324, "learning_rate": 3.9536231884057973e-07, "loss": 1.504, "step": 34330 }, { "epoch": 2.86, "grad_norm": 4.06483268737793, "learning_rate": 3.8956521739130437e-07, "loss": 1.6038, "step": 34340 }, { "epoch": 2.86, "grad_norm": 3.4933018684387207, "learning_rate": 3.83768115942029e-07, "loss": 1.6455, "step": 34350 }, { "epoch": 2.86, "grad_norm": 2.6547389030456543, "learning_rate": 3.7797101449275365e-07, "loss": 1.5391, "step": 34360 }, { "epoch": 2.86, "grad_norm": 2.740145683288574, "learning_rate": 3.721739130434783e-07, "loss": 1.6146, "step": 34370 }, { "epoch": 2.87, "grad_norm": 2.028878688812256, "learning_rate": 3.6637681159420293e-07, "loss": 1.5921, "step": 34380 }, { "epoch": 2.87, "grad_norm": 6.614234447479248, "learning_rate": 3.6057971014492757e-07, "loss": 1.5552, "step": 34390 }, { "epoch": 2.87, "grad_norm": 2.7156872749328613, "learning_rate": 3.547826086956522e-07, "loss": 1.6545, "step": 34400 }, { "epoch": 2.87, "grad_norm": 2.235097646713257, "learning_rate": 3.4898550724637685e-07, "loss": 1.5333, "step": 34410 }, { "epoch": 2.87, "grad_norm": 4.5841474533081055, "learning_rate": 3.431884057971015e-07, "loss": 1.6009, "step": 34420 }, { "epoch": 2.87, "grad_norm": 3.2173547744750977, "learning_rate": 3.3739130434782614e-07, "loss": 1.5737, "step": 34430 }, { "epoch": 2.87, "grad_norm": 2.977945327758789, "learning_rate": 3.315942028985508e-07, "loss": 1.6099, "step": 34440 }, { "epoch": 2.87, "grad_norm": 4.717473983764648, "learning_rate": 3.2579710144927537e-07, "loss": 1.6938, "step": 34450 }, { "epoch": 2.87, "grad_norm": 3.8826589584350586, "learning_rate": 3.2e-07, "loss": 1.5333, "step": 34460 }, { "epoch": 2.87, "grad_norm": 1.7628260850906372, "learning_rate": 3.1420289855072465e-07, "loss": 1.4657, "step": 34470 }, { "epoch": 2.87, "grad_norm": 2.397793769836426, "learning_rate": 3.084057971014493e-07, "loss": 1.7186, "step": 34480 }, { "epoch": 2.87, "grad_norm": 13.669360160827637, "learning_rate": 3.0260869565217393e-07, "loss": 1.6144, "step": 34490 }, { "epoch": 2.88, "grad_norm": 1.7966532707214355, "learning_rate": 2.9681159420289857e-07, "loss": 1.7335, "step": 34500 }, { "epoch": 2.88, "eval_loss": 1.6453466415405273, "eval_runtime": 107.5244, "eval_samples_per_second": 9.3, "eval_steps_per_second": 2.325, "step": 34500 }, { "epoch": 2.88, "grad_norm": 5.828636646270752, "learning_rate": 2.910144927536232e-07, "loss": 1.4402, "step": 34510 }, { "epoch": 2.88, "grad_norm": 1.665623664855957, "learning_rate": 2.8521739130434785e-07, "loss": 1.5562, "step": 34520 }, { "epoch": 2.88, "grad_norm": 7.800908088684082, "learning_rate": 2.794202898550725e-07, "loss": 1.5813, "step": 34530 }, { "epoch": 2.88, "grad_norm": 6.734397888183594, "learning_rate": 2.7362318840579713e-07, "loss": 1.7721, "step": 34540 }, { "epoch": 2.88, "grad_norm": 2.1839168071746826, "learning_rate": 2.678260869565218e-07, "loss": 1.5236, "step": 34550 }, { "epoch": 2.88, "grad_norm": 3.9042229652404785, "learning_rate": 2.620289855072464e-07, "loss": 1.6205, "step": 34560 }, { "epoch": 2.88, "grad_norm": 6.442493438720703, "learning_rate": 2.5623188405797106e-07, "loss": 1.6334, "step": 34570 }, { "epoch": 2.88, "grad_norm": 11.60261058807373, "learning_rate": 2.504347826086957e-07, "loss": 1.6631, "step": 34580 }, { "epoch": 2.88, "grad_norm": 1.5220028162002563, "learning_rate": 2.4463768115942034e-07, "loss": 1.5903, "step": 34590 }, { "epoch": 2.88, "grad_norm": 6.463686466217041, "learning_rate": 2.3884057971014493e-07, "loss": 1.505, "step": 34600 }, { "epoch": 2.88, "grad_norm": 1.3802415132522583, "learning_rate": 2.3304347826086957e-07, "loss": 1.5574, "step": 34610 }, { "epoch": 2.88, "grad_norm": 5.931606292724609, "learning_rate": 2.272463768115942e-07, "loss": 1.6049, "step": 34620 }, { "epoch": 2.89, "grad_norm": 4.055948734283447, "learning_rate": 2.2144927536231885e-07, "loss": 1.7223, "step": 34630 }, { "epoch": 2.89, "grad_norm": 3.3893649578094482, "learning_rate": 2.156521739130435e-07, "loss": 1.6336, "step": 34640 }, { "epoch": 2.89, "grad_norm": 4.007665157318115, "learning_rate": 2.0985507246376813e-07, "loss": 1.5563, "step": 34650 }, { "epoch": 2.89, "grad_norm": 3.499795913696289, "learning_rate": 2.0405797101449277e-07, "loss": 1.6813, "step": 34660 }, { "epoch": 2.89, "grad_norm": 3.31160569190979, "learning_rate": 1.9826086956521742e-07, "loss": 1.5491, "step": 34670 }, { "epoch": 2.89, "grad_norm": 2.493201732635498, "learning_rate": 1.9246376811594206e-07, "loss": 1.7265, "step": 34680 }, { "epoch": 2.89, "grad_norm": 6.865158557891846, "learning_rate": 1.866666666666667e-07, "loss": 1.6564, "step": 34690 }, { "epoch": 2.89, "grad_norm": 2.9603042602539062, "learning_rate": 1.808695652173913e-07, "loss": 1.6632, "step": 34700 }, { "epoch": 2.89, "grad_norm": 10.82306957244873, "learning_rate": 1.7507246376811595e-07, "loss": 1.555, "step": 34710 }, { "epoch": 2.89, "grad_norm": 4.0846428871154785, "learning_rate": 1.692753623188406e-07, "loss": 1.6608, "step": 34720 }, { "epoch": 2.89, "grad_norm": 5.046102523803711, "learning_rate": 1.6347826086956523e-07, "loss": 1.6355, "step": 34730 }, { "epoch": 2.9, "grad_norm": 5.843890190124512, "learning_rate": 1.5768115942028988e-07, "loss": 1.7457, "step": 34740 }, { "epoch": 2.9, "grad_norm": 2.0817549228668213, "learning_rate": 1.5188405797101452e-07, "loss": 1.5093, "step": 34750 }, { "epoch": 2.9, "grad_norm": 3.4835991859436035, "learning_rate": 1.4608695652173916e-07, "loss": 1.5516, "step": 34760 }, { "epoch": 2.9, "grad_norm": 4.763504505157471, "learning_rate": 1.402898550724638e-07, "loss": 1.5014, "step": 34770 }, { "epoch": 2.9, "grad_norm": 7.537784576416016, "learning_rate": 1.344927536231884e-07, "loss": 1.5698, "step": 34780 }, { "epoch": 2.9, "grad_norm": 3.3694698810577393, "learning_rate": 1.2869565217391305e-07, "loss": 1.7388, "step": 34790 }, { "epoch": 2.9, "grad_norm": 3.1657655239105225, "learning_rate": 1.228985507246377e-07, "loss": 1.6657, "step": 34800 }, { "epoch": 2.9, "grad_norm": 3.10614013671875, "learning_rate": 1.1710144927536234e-07, "loss": 1.6406, "step": 34810 }, { "epoch": 2.9, "grad_norm": 8.151602745056152, "learning_rate": 1.1130434782608698e-07, "loss": 1.7237, "step": 34820 }, { "epoch": 2.9, "grad_norm": 2.7785089015960693, "learning_rate": 1.0550724637681159e-07, "loss": 1.6641, "step": 34830 }, { "epoch": 2.9, "grad_norm": 1.2298667430877686, "learning_rate": 9.971014492753623e-08, "loss": 1.659, "step": 34840 }, { "epoch": 2.9, "grad_norm": 4.855597972869873, "learning_rate": 9.391304347826087e-08, "loss": 1.5468, "step": 34850 }, { "epoch": 2.91, "grad_norm": 7.356712818145752, "learning_rate": 8.811594202898551e-08, "loss": 1.5559, "step": 34860 }, { "epoch": 2.91, "grad_norm": 6.330800533294678, "learning_rate": 8.231884057971016e-08, "loss": 1.5451, "step": 34870 }, { "epoch": 2.91, "grad_norm": 4.4921183586120605, "learning_rate": 7.65217391304348e-08, "loss": 1.6324, "step": 34880 }, { "epoch": 2.91, "grad_norm": 11.110346794128418, "learning_rate": 7.072463768115942e-08, "loss": 1.5751, "step": 34890 }, { "epoch": 2.91, "grad_norm": 5.403293609619141, "learning_rate": 6.492753623188407e-08, "loss": 1.6089, "step": 34900 }, { "epoch": 2.91, "grad_norm": 11.329370498657227, "learning_rate": 5.9130434782608707e-08, "loss": 1.7421, "step": 34910 }, { "epoch": 2.91, "grad_norm": 6.819499969482422, "learning_rate": 5.3333333333333334e-08, "loss": 1.6757, "step": 34920 }, { "epoch": 2.91, "grad_norm": 3.4054958820343018, "learning_rate": 4.7536231884057975e-08, "loss": 1.6065, "step": 34930 }, { "epoch": 2.91, "grad_norm": 6.741161823272705, "learning_rate": 4.173913043478261e-08, "loss": 1.705, "step": 34940 }, { "epoch": 2.91, "grad_norm": 12.431832313537598, "learning_rate": 3.594202898550725e-08, "loss": 1.637, "step": 34950 }, { "epoch": 2.91, "grad_norm": 3.2292697429656982, "learning_rate": 3.0144927536231885e-08, "loss": 1.6626, "step": 34960 }, { "epoch": 2.91, "grad_norm": 6.705583095550537, "learning_rate": 2.4347826086956523e-08, "loss": 1.7878, "step": 34970 }, { "epoch": 2.92, "grad_norm": 4.077023506164551, "learning_rate": 1.855072463768116e-08, "loss": 1.5233, "step": 34980 }, { "epoch": 2.92, "grad_norm": 3.2491979598999023, "learning_rate": 1.2753623188405798e-08, "loss": 1.6806, "step": 34990 }, { "epoch": 2.92, "grad_norm": 5.141303062438965, "learning_rate": 6.956521739130436e-09, "loss": 1.6406, "step": 35000 }, { "epoch": 2.92, "eval_loss": 1.6195024251937866, "eval_runtime": 107.5117, "eval_samples_per_second": 9.301, "eval_steps_per_second": 2.325, "step": 35000 } ], "logging_steps": 10, "max_steps": 35000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5.6357440978944e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }