diff --git "a/llama2_13b_lora_b/trainer_state.json" "b/llama2_13b_lora_b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/llama2_13b_lora_b/trainer_state.json" @@ -0,0 +1,9701 @@ +{ + "best_metric": 0.6174443364143372, + "best_model_checkpoint": "ckpt/llama2_13b_other/fuze_28_balance_no_sys/checkpoint-12000", + "epoch": 2.0, + "eval_steps": 3000, + "global_step": 13776, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014518002322880372, + "grad_norm": 0.5469616055488586, + "learning_rate": 2.5e-05, + "loss": 2.7374, + "step": 10 + }, + { + "epoch": 0.0029036004645760743, + "grad_norm": 3.591161012649536, + "learning_rate": 5e-05, + "loss": 2.639, + "step": 20 + }, + { + "epoch": 0.004355400696864111, + "grad_norm": 4.15903902053833, + "learning_rate": 4.9999934803356854e-05, + "loss": 1.9973, + "step": 30 + }, + { + "epoch": 0.005807200929152149, + "grad_norm": 0.7227652072906494, + "learning_rate": 4.999973921376744e-05, + "loss": 1.0062, + "step": 40 + }, + { + "epoch": 0.007259001161440186, + "grad_norm": 2.8751795291900635, + "learning_rate": 4.9999413232251924e-05, + "loss": 0.8956, + "step": 50 + }, + { + "epoch": 0.008710801393728223, + "grad_norm": 1.205117106437683, + "learning_rate": 4.9998956860510515e-05, + "loss": 0.6662, + "step": 60 + }, + { + "epoch": 0.01016260162601626, + "grad_norm": 2.018847703933716, + "learning_rate": 4.9998370100923546e-05, + "loss": 0.843, + "step": 70 + }, + { + "epoch": 0.011614401858304297, + "grad_norm": 1.5158370733261108, + "learning_rate": 4.9997652956551386e-05, + "loss": 0.6177, + "step": 80 + }, + { + "epoch": 0.013066202090592335, + "grad_norm": 2.3284151554107666, + "learning_rate": 4.999680543113447e-05, + "loss": 0.9499, + "step": 90 + }, + { + "epoch": 0.014518002322880372, + "grad_norm": 1.6479175090789795, + "learning_rate": 4.999582752909326e-05, + "loss": 0.94, + "step": 100 + }, + { + "epoch": 0.01596980255516841, + "grad_norm": 2.4836409091949463, + "learning_rate": 4.999471925552824e-05, + "loss": 1.0131, + "step": 110 + }, + { + "epoch": 0.017421602787456445, + "grad_norm": 0.8880699872970581, + "learning_rate": 4.999348061621986e-05, + "loss": 0.7342, + "step": 120 + }, + { + "epoch": 0.018873403019744484, + "grad_norm": 1.1543691158294678, + "learning_rate": 4.999211161762852e-05, + "loss": 0.9561, + "step": 130 + }, + { + "epoch": 0.02032520325203252, + "grad_norm": 1.358257532119751, + "learning_rate": 4.9990612266894574e-05, + "loss": 0.8214, + "step": 140 + }, + { + "epoch": 0.02177700348432056, + "grad_norm": 1.9943785667419434, + "learning_rate": 4.9988982571838214e-05, + "loss": 0.8319, + "step": 150 + }, + { + "epoch": 0.023228803716608595, + "grad_norm": 6.3814520835876465, + "learning_rate": 4.998722254095949e-05, + "loss": 0.9133, + "step": 160 + }, + { + "epoch": 0.02468060394889663, + "grad_norm": 5.650225639343262, + "learning_rate": 4.998533218343826e-05, + "loss": 0.8295, + "step": 170 + }, + { + "epoch": 0.02613240418118467, + "grad_norm": 0.8774411082267761, + "learning_rate": 4.998331150913412e-05, + "loss": 0.7959, + "step": 180 + }, + { + "epoch": 0.027584204413472705, + "grad_norm": 1.1295948028564453, + "learning_rate": 4.998116052858636e-05, + "loss": 0.6883, + "step": 190 + }, + { + "epoch": 0.029036004645760744, + "grad_norm": 1.1888564825057983, + "learning_rate": 4.9978879253013925e-05, + "loss": 0.9037, + "step": 200 + }, + { + "epoch": 0.03048780487804878, + "grad_norm": 2.754950761795044, + 
"learning_rate": 4.997646769431532e-05, + "loss": 0.9584, + "step": 210 + }, + { + "epoch": 0.03193960511033682, + "grad_norm": 0.6559054851531982, + "learning_rate": 4.9973925865068604e-05, + "loss": 0.9589, + "step": 220 + }, + { + "epoch": 0.03339140534262486, + "grad_norm": 0.49671778082847595, + "learning_rate": 4.997125377853127e-05, + "loss": 0.6646, + "step": 230 + }, + { + "epoch": 0.03484320557491289, + "grad_norm": 1.9896801710128784, + "learning_rate": 4.996845144864021e-05, + "loss": 0.9931, + "step": 240 + }, + { + "epoch": 0.03629500580720093, + "grad_norm": 5.797499179840088, + "learning_rate": 4.9965518890011606e-05, + "loss": 0.6066, + "step": 250 + }, + { + "epoch": 0.03774680603948897, + "grad_norm": 1.675417423248291, + "learning_rate": 4.996245611794091e-05, + "loss": 0.8694, + "step": 260 + }, + { + "epoch": 0.039198606271777, + "grad_norm": 1.7982059717178345, + "learning_rate": 4.9959263148402713e-05, + "loss": 0.699, + "step": 270 + }, + { + "epoch": 0.04065040650406504, + "grad_norm": 2.3844377994537354, + "learning_rate": 4.9955939998050696e-05, + "loss": 0.9412, + "step": 280 + }, + { + "epoch": 0.04210220673635308, + "grad_norm": 1.0922398567199707, + "learning_rate": 4.9952486684217516e-05, + "loss": 0.7914, + "step": 290 + }, + { + "epoch": 0.04355400696864112, + "grad_norm": 0.8432398438453674, + "learning_rate": 4.994890322491472e-05, + "loss": 0.8929, + "step": 300 + }, + { + "epoch": 0.04500580720092915, + "grad_norm": 0.7825923562049866, + "learning_rate": 4.9945189638832676e-05, + "loss": 0.9772, + "step": 310 + }, + { + "epoch": 0.04645760743321719, + "grad_norm": 3.1025567054748535, + "learning_rate": 4.994134594534046e-05, + "loss": 0.9957, + "step": 320 + }, + { + "epoch": 0.04790940766550523, + "grad_norm": 0.7623017430305481, + "learning_rate": 4.993737216448573e-05, + "loss": 0.5879, + "step": 330 + }, + { + "epoch": 0.04936120789779326, + "grad_norm": 0.5946126580238342, + "learning_rate": 4.9933268316994665e-05, + "loss": 0.7163, + "step": 340 + }, + { + "epoch": 0.0508130081300813, + "grad_norm": 2.404294490814209, + "learning_rate": 4.992903442427184e-05, + "loss": 0.7242, + "step": 350 + }, + { + "epoch": 0.05226480836236934, + "grad_norm": 2.895533323287964, + "learning_rate": 4.9924670508400096e-05, + "loss": 0.7379, + "step": 360 + }, + { + "epoch": 0.05371660859465738, + "grad_norm": 1.498542070388794, + "learning_rate": 4.992017659214044e-05, + "loss": 0.8413, + "step": 370 + }, + { + "epoch": 0.05516840882694541, + "grad_norm": 2.449810743331909, + "learning_rate": 4.991555269893194e-05, + "loss": 0.7442, + "step": 380 + }, + { + "epoch": 0.05662020905923345, + "grad_norm": 0.6948937773704529, + "learning_rate": 4.991079885289159e-05, + "loss": 0.7314, + "step": 390 + }, + { + "epoch": 0.05807200929152149, + "grad_norm": 1.1707820892333984, + "learning_rate": 4.990591507881416e-05, + "loss": 0.6596, + "step": 400 + }, + { + "epoch": 0.05952380952380952, + "grad_norm": 2.644362211227417, + "learning_rate": 4.99009014021721e-05, + "loss": 0.6841, + "step": 410 + }, + { + "epoch": 0.06097560975609756, + "grad_norm": 4.067262649536133, + "learning_rate": 4.9895757849115415e-05, + "loss": 0.9483, + "step": 420 + }, + { + "epoch": 0.0624274099883856, + "grad_norm": 0.9770334959030151, + "learning_rate": 4.989048444647149e-05, + "loss": 0.738, + "step": 430 + }, + { + "epoch": 0.06387921022067364, + "grad_norm": 0.6074944734573364, + "learning_rate": 4.988508122174498e-05, + "loss": 0.8884, + "step": 440 + }, + { + "epoch": 
0.06533101045296168, + "grad_norm": 1.6675050258636475, + "learning_rate": 4.9879548203117654e-05, + "loss": 0.713, + "step": 450 + }, + { + "epoch": 0.06678281068524972, + "grad_norm": 2.338547945022583, + "learning_rate": 4.987388541944824e-05, + "loss": 1.0344, + "step": 460 + }, + { + "epoch": 0.06823461091753774, + "grad_norm": 2.637160301208496, + "learning_rate": 4.986809290027231e-05, + "loss": 0.6869, + "step": 470 + }, + { + "epoch": 0.06968641114982578, + "grad_norm": 1.666285753250122, + "learning_rate": 4.986217067580209e-05, + "loss": 0.7925, + "step": 480 + }, + { + "epoch": 0.07113821138211382, + "grad_norm": 3.2277042865753174, + "learning_rate": 4.98561187769263e-05, + "loss": 0.4641, + "step": 490 + }, + { + "epoch": 0.07259001161440186, + "grad_norm": 2.3560967445373535, + "learning_rate": 4.984993723521003e-05, + "loss": 0.5966, + "step": 500 + }, + { + "epoch": 0.0740418118466899, + "grad_norm": 1.709749698638916, + "learning_rate": 4.984362608289454e-05, + "loss": 0.672, + "step": 510 + }, + { + "epoch": 0.07549361207897794, + "grad_norm": 0.9946199655532837, + "learning_rate": 4.98371853528971e-05, + "loss": 0.7569, + "step": 520 + }, + { + "epoch": 0.07694541231126598, + "grad_norm": 0.839363694190979, + "learning_rate": 4.983061507881083e-05, + "loss": 0.6376, + "step": 530 + }, + { + "epoch": 0.078397212543554, + "grad_norm": 1.063310146331787, + "learning_rate": 4.982391529490452e-05, + "loss": 0.9329, + "step": 540 + }, + { + "epoch": 0.07984901277584204, + "grad_norm": 1.1951221227645874, + "learning_rate": 4.981708603612244e-05, + "loss": 0.7935, + "step": 550 + }, + { + "epoch": 0.08130081300813008, + "grad_norm": 1.3913841247558594, + "learning_rate": 4.981012733808417e-05, + "loss": 0.7963, + "step": 560 + }, + { + "epoch": 0.08275261324041812, + "grad_norm": 1.2319680452346802, + "learning_rate": 4.980303923708441e-05, + "loss": 0.8177, + "step": 570 + }, + { + "epoch": 0.08420441347270616, + "grad_norm": 1.925199031829834, + "learning_rate": 4.979582177009279e-05, + "loss": 0.7387, + "step": 580 + }, + { + "epoch": 0.0856562137049942, + "grad_norm": 1.3763043880462646, + "learning_rate": 4.9788474974753686e-05, + "loss": 0.6866, + "step": 590 + }, + { + "epoch": 0.08710801393728224, + "grad_norm": 1.1121422052383423, + "learning_rate": 4.9780998889386e-05, + "loss": 1.0793, + "step": 600 + }, + { + "epoch": 0.08855981416957026, + "grad_norm": 4.042593479156494, + "learning_rate": 4.9773393552982994e-05, + "loss": 0.7474, + "step": 610 + }, + { + "epoch": 0.0900116144018583, + "grad_norm": 1.7447359561920166, + "learning_rate": 4.976565900521205e-05, + "loss": 0.6573, + "step": 620 + }, + { + "epoch": 0.09146341463414634, + "grad_norm": 2.484264373779297, + "learning_rate": 4.975779528641451e-05, + "loss": 0.7327, + "step": 630 + }, + { + "epoch": 0.09291521486643438, + "grad_norm": 1.6296310424804688, + "learning_rate": 4.97498024376054e-05, + "loss": 1.0403, + "step": 640 + }, + { + "epoch": 0.09436701509872242, + "grad_norm": 0.7024840712547302, + "learning_rate": 4.9741680500473276e-05, + "loss": 0.8121, + "step": 650 + }, + { + "epoch": 0.09581881533101046, + "grad_norm": 5.444680690765381, + "learning_rate": 4.973342951737999e-05, + "loss": 0.6586, + "step": 660 + }, + { + "epoch": 0.0972706155632985, + "grad_norm": 3.954261302947998, + "learning_rate": 4.9725049531360454e-05, + "loss": 1.0836, + "step": 670 + }, + { + "epoch": 0.09872241579558652, + "grad_norm": 0.8371906876564026, + "learning_rate": 4.9716540586122425e-05, + "loss": 0.7811, + 
"step": 680 + }, + { + "epoch": 0.10017421602787456, + "grad_norm": 1.8764899969100952, + "learning_rate": 4.970790272604626e-05, + "loss": 0.5407, + "step": 690 + }, + { + "epoch": 0.1016260162601626, + "grad_norm": 1.2408713102340698, + "learning_rate": 4.9699135996184745e-05, + "loss": 0.751, + "step": 700 + }, + { + "epoch": 0.10307781649245064, + "grad_norm": 1.968125343322754, + "learning_rate": 4.969024044226276e-05, + "loss": 0.7491, + "step": 710 + }, + { + "epoch": 0.10452961672473868, + "grad_norm": 2.832357168197632, + "learning_rate": 4.9681216110677145e-05, + "loss": 0.8841, + "step": 720 + }, + { + "epoch": 0.10598141695702672, + "grad_norm": 1.2367392778396606, + "learning_rate": 4.9672063048496384e-05, + "loss": 0.6756, + "step": 730 + }, + { + "epoch": 0.10743321718931476, + "grad_norm": 1.3740241527557373, + "learning_rate": 4.9662781303460385e-05, + "loss": 0.62, + "step": 740 + }, + { + "epoch": 0.10888501742160278, + "grad_norm": 1.3408523797988892, + "learning_rate": 4.9653370923980245e-05, + "loss": 1.1724, + "step": 750 + }, + { + "epoch": 0.11033681765389082, + "grad_norm": 0.5014351010322571, + "learning_rate": 4.964383195913798e-05, + "loss": 0.6388, + "step": 760 + }, + { + "epoch": 0.11178861788617886, + "grad_norm": 4.638858318328857, + "learning_rate": 4.963416445868626e-05, + "loss": 0.9585, + "step": 770 + }, + { + "epoch": 0.1132404181184669, + "grad_norm": 1.0444010496139526, + "learning_rate": 4.962436847304818e-05, + "loss": 0.8725, + "step": 780 + }, + { + "epoch": 0.11469221835075494, + "grad_norm": 0.930262565612793, + "learning_rate": 4.9614444053316954e-05, + "loss": 0.6572, + "step": 790 + }, + { + "epoch": 0.11614401858304298, + "grad_norm": 2.152299642562866, + "learning_rate": 4.960439125125571e-05, + "loss": 0.9154, + "step": 800 + }, + { + "epoch": 0.11759581881533102, + "grad_norm": 0.5455219745635986, + "learning_rate": 4.959421011929716e-05, + "loss": 0.6822, + "step": 810 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 2.061328172683716, + "learning_rate": 4.9583900710543344e-05, + "loss": 0.6367, + "step": 820 + }, + { + "epoch": 0.12049941927990708, + "grad_norm": 0.8445961475372314, + "learning_rate": 4.957346307876537e-05, + "loss": 0.7663, + "step": 830 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 1.552341341972351, + "learning_rate": 4.956289727840313e-05, + "loss": 0.8241, + "step": 840 + }, + { + "epoch": 0.12340301974448316, + "grad_norm": 1.2934677600860596, + "learning_rate": 4.9552203364565e-05, + "loss": 0.8214, + "step": 850 + }, + { + "epoch": 0.1248548199767712, + "grad_norm": 1.5896968841552734, + "learning_rate": 4.9541381393027564e-05, + "loss": 0.8306, + "step": 860 + }, + { + "epoch": 0.12630662020905922, + "grad_norm": 1.5255497694015503, + "learning_rate": 4.953043142023531e-05, + "loss": 1.0784, + "step": 870 + }, + { + "epoch": 0.12775842044134728, + "grad_norm": 1.1418253183364868, + "learning_rate": 4.951935350330037e-05, + "loss": 0.7049, + "step": 880 + }, + { + "epoch": 0.1292102206736353, + "grad_norm": 1.0762348175048828, + "learning_rate": 4.950814770000217e-05, + "loss": 0.7348, + "step": 890 + }, + { + "epoch": 0.13066202090592335, + "grad_norm": 2.7543962001800537, + "learning_rate": 4.949681406878718e-05, + "loss": 0.5945, + "step": 900 + }, + { + "epoch": 0.13211382113821138, + "grad_norm": 0.9333184361457825, + "learning_rate": 4.948535266876857e-05, + "loss": 0.5863, + "step": 910 + }, + { + "epoch": 0.13356562137049943, + "grad_norm": 0.7856501340866089, + "learning_rate": 
4.947376355972593e-05, + "loss": 0.8137, + "step": 920 + }, + { + "epoch": 0.13501742160278746, + "grad_norm": 2.296924352645874, + "learning_rate": 4.9462046802104945e-05, + "loss": 0.6391, + "step": 930 + }, + { + "epoch": 0.13646922183507548, + "grad_norm": 1.829867959022522, + "learning_rate": 4.9450202457017055e-05, + "loss": 0.763, + "step": 940 + }, + { + "epoch": 0.13792102206736354, + "grad_norm": 0.624662458896637, + "learning_rate": 4.9438230586239207e-05, + "loss": 0.789, + "step": 950 + }, + { + "epoch": 0.13937282229965156, + "grad_norm": 1.0507394075393677, + "learning_rate": 4.942613125221346e-05, + "loss": 0.7278, + "step": 960 + }, + { + "epoch": 0.14082462253193961, + "grad_norm": 0.9353327751159668, + "learning_rate": 4.9413904518046674e-05, + "loss": 0.6717, + "step": 970 + }, + { + "epoch": 0.14227642276422764, + "grad_norm": 4.606626510620117, + "learning_rate": 4.9401550447510235e-05, + "loss": 0.6505, + "step": 980 + }, + { + "epoch": 0.1437282229965157, + "grad_norm": 0.5310667753219604, + "learning_rate": 4.9389069105039634e-05, + "loss": 0.6163, + "step": 990 + }, + { + "epoch": 0.14518002322880372, + "grad_norm": 1.1119409799575806, + "learning_rate": 4.9376460555734225e-05, + "loss": 0.6708, + "step": 1000 + }, + { + "epoch": 0.14663182346109174, + "grad_norm": 0.934678852558136, + "learning_rate": 4.936372486535679e-05, + "loss": 0.63, + "step": 1010 + }, + { + "epoch": 0.1480836236933798, + "grad_norm": 0.9781250357627869, + "learning_rate": 4.9350862100333294e-05, + "loss": 0.7353, + "step": 1020 + }, + { + "epoch": 0.14953542392566782, + "grad_norm": 0.732448935508728, + "learning_rate": 4.9337872327752444e-05, + "loss": 0.6336, + "step": 1030 + }, + { + "epoch": 0.15098722415795587, + "grad_norm": 0.9106850624084473, + "learning_rate": 4.932475561536542e-05, + "loss": 0.5646, + "step": 1040 + }, + { + "epoch": 0.1524390243902439, + "grad_norm": 3.07547926902771, + "learning_rate": 4.931151203158547e-05, + "loss": 0.5629, + "step": 1050 + }, + { + "epoch": 0.15389082462253195, + "grad_norm": 2.424933433532715, + "learning_rate": 4.929814164548756e-05, + "loss": 0.9348, + "step": 1060 + }, + { + "epoch": 0.15534262485481998, + "grad_norm": 0.5797663927078247, + "learning_rate": 4.928464452680804e-05, + "loss": 0.7293, + "step": 1070 + }, + { + "epoch": 0.156794425087108, + "grad_norm": 4.099000453948975, + "learning_rate": 4.9271020745944265e-05, + "loss": 0.3943, + "step": 1080 + }, + { + "epoch": 0.15824622531939606, + "grad_norm": 2.4443376064300537, + "learning_rate": 4.92572703739542e-05, + "loss": 0.503, + "step": 1090 + }, + { + "epoch": 0.15969802555168408, + "grad_norm": 1.4358808994293213, + "learning_rate": 4.924339348255611e-05, + "loss": 0.7181, + "step": 1100 + }, + { + "epoch": 0.16114982578397213, + "grad_norm": 1.4664112329483032, + "learning_rate": 4.922939014412812e-05, + "loss": 0.7096, + "step": 1110 + }, + { + "epoch": 0.16260162601626016, + "grad_norm": 1.605031967163086, + "learning_rate": 4.9215260431707885e-05, + "loss": 0.7917, + "step": 1120 + }, + { + "epoch": 0.1640534262485482, + "grad_norm": 2.435290813446045, + "learning_rate": 4.92010044189922e-05, + "loss": 0.7983, + "step": 1130 + }, + { + "epoch": 0.16550522648083624, + "grad_norm": 3.4949209690093994, + "learning_rate": 4.9186622180336595e-05, + "loss": 0.811, + "step": 1140 + }, + { + "epoch": 0.16695702671312426, + "grad_norm": 0.8932238221168518, + "learning_rate": 4.917211379075496e-05, + "loss": 0.5875, + "step": 1150 + }, + { + "epoch": 0.16840882694541232, + 
"grad_norm": 1.7695764303207397, + "learning_rate": 4.9157479325919156e-05, + "loss": 0.8934, + "step": 1160 + }, + { + "epoch": 0.16986062717770034, + "grad_norm": 5.012516975402832, + "learning_rate": 4.9142718862158634e-05, + "loss": 0.6394, + "step": 1170 + }, + { + "epoch": 0.1713124274099884, + "grad_norm": 1.3308697938919067, + "learning_rate": 4.912783247646e-05, + "loss": 0.5884, + "step": 1180 + }, + { + "epoch": 0.17276422764227642, + "grad_norm": 0.9371745586395264, + "learning_rate": 4.911282024646664e-05, + "loss": 0.8007, + "step": 1190 + }, + { + "epoch": 0.17421602787456447, + "grad_norm": 3.8555784225463867, + "learning_rate": 4.909768225047833e-05, + "loss": 0.632, + "step": 1200 + }, + { + "epoch": 0.1756678281068525, + "grad_norm": 3.392313003540039, + "learning_rate": 4.908241856745077e-05, + "loss": 0.8346, + "step": 1210 + }, + { + "epoch": 0.17711962833914052, + "grad_norm": 1.3405215740203857, + "learning_rate": 4.906702927699525e-05, + "loss": 0.7455, + "step": 1220 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 1.5076225996017456, + "learning_rate": 4.905151445937817e-05, + "loss": 0.8539, + "step": 1230 + }, + { + "epoch": 0.1800232288037166, + "grad_norm": 1.722262978553772, + "learning_rate": 4.903587419552065e-05, + "loss": 0.641, + "step": 1240 + }, + { + "epoch": 0.18147502903600465, + "grad_norm": 0.5747788548469543, + "learning_rate": 4.902010856699811e-05, + "loss": 0.669, + "step": 1250 + }, + { + "epoch": 0.18292682926829268, + "grad_norm": 3.8853611946105957, + "learning_rate": 4.900421765603983e-05, + "loss": 0.598, + "step": 1260 + }, + { + "epoch": 0.18437862950058073, + "grad_norm": 0.7678889036178589, + "learning_rate": 4.8988201545528536e-05, + "loss": 0.57, + "step": 1270 + }, + { + "epoch": 0.18583042973286876, + "grad_norm": 0.890204906463623, + "learning_rate": 4.897206031899997e-05, + "loss": 0.52, + "step": 1280 + }, + { + "epoch": 0.18728222996515678, + "grad_norm": 1.0817334651947021, + "learning_rate": 4.8955794060642416e-05, + "loss": 0.5813, + "step": 1290 + }, + { + "epoch": 0.18873403019744484, + "grad_norm": 0.7758299112319946, + "learning_rate": 4.893940285529631e-05, + "loss": 0.8182, + "step": 1300 + }, + { + "epoch": 0.19018583042973286, + "grad_norm": 1.0594083070755005, + "learning_rate": 4.8922886788453796e-05, + "loss": 0.6143, + "step": 1310 + }, + { + "epoch": 0.1916376306620209, + "grad_norm": 0.9304606914520264, + "learning_rate": 4.8906245946258235e-05, + "loss": 0.7401, + "step": 1320 + }, + { + "epoch": 0.19308943089430894, + "grad_norm": 2.11362361907959, + "learning_rate": 4.8889480415503785e-05, + "loss": 0.4487, + "step": 1330 + }, + { + "epoch": 0.194541231126597, + "grad_norm": 2.9040818214416504, + "learning_rate": 4.8872590283634955e-05, + "loss": 0.7218, + "step": 1340 + }, + { + "epoch": 0.19599303135888502, + "grad_norm": 0.6021516919136047, + "learning_rate": 4.8855575638746135e-05, + "loss": 0.7179, + "step": 1350 + }, + { + "epoch": 0.19744483159117304, + "grad_norm": 3.067187786102295, + "learning_rate": 4.883843656958115e-05, + "loss": 0.9561, + "step": 1360 + }, + { + "epoch": 0.1988966318234611, + "grad_norm": 4.093753337860107, + "learning_rate": 4.882117316553278e-05, + "loss": 0.8025, + "step": 1370 + }, + { + "epoch": 0.20034843205574912, + "grad_norm": 1.0853984355926514, + "learning_rate": 4.88037855166423e-05, + "loss": 0.7298, + "step": 1380 + }, + { + "epoch": 0.20180023228803717, + "grad_norm": 1.4068083763122559, + "learning_rate": 4.878627371359902e-05, + "loss": 0.5038, + 
"step": 1390 + }, + { + "epoch": 0.2032520325203252, + "grad_norm": 1.063698649406433, + "learning_rate": 4.876863784773981e-05, + "loss": 0.8824, + "step": 1400 + }, + { + "epoch": 0.20470383275261325, + "grad_norm": 1.4493242502212524, + "learning_rate": 4.875087801104859e-05, + "loss": 0.8179, + "step": 1410 + }, + { + "epoch": 0.20615563298490128, + "grad_norm": 1.8046404123306274, + "learning_rate": 4.8732994296155915e-05, + "loss": 0.7289, + "step": 1420 + }, + { + "epoch": 0.2076074332171893, + "grad_norm": 1.531055212020874, + "learning_rate": 4.871498679633844e-05, + "loss": 0.9306, + "step": 1430 + }, + { + "epoch": 0.20905923344947736, + "grad_norm": 1.2926791906356812, + "learning_rate": 4.869685560551844e-05, + "loss": 0.7812, + "step": 1440 + }, + { + "epoch": 0.21051103368176538, + "grad_norm": 2.004673957824707, + "learning_rate": 4.867860081826334e-05, + "loss": 0.6344, + "step": 1450 + }, + { + "epoch": 0.21196283391405343, + "grad_norm": 0.8372285962104797, + "learning_rate": 4.866022252978521e-05, + "loss": 0.9279, + "step": 1460 + }, + { + "epoch": 0.21341463414634146, + "grad_norm": 3.9492061138153076, + "learning_rate": 4.8641720835940265e-05, + "loss": 0.6554, + "step": 1470 + }, + { + "epoch": 0.2148664343786295, + "grad_norm": 1.1838141679763794, + "learning_rate": 4.862309583322837e-05, + "loss": 0.35, + "step": 1480 + }, + { + "epoch": 0.21631823461091754, + "grad_norm": 0.5205928683280945, + "learning_rate": 4.860434761879255e-05, + "loss": 0.8758, + "step": 1490 + }, + { + "epoch": 0.21777003484320556, + "grad_norm": 1.2075397968292236, + "learning_rate": 4.858547629041844e-05, + "loss": 0.8463, + "step": 1500 + }, + { + "epoch": 0.21922183507549362, + "grad_norm": 0.9651175141334534, + "learning_rate": 4.8566481946533824e-05, + "loss": 0.5918, + "step": 1510 + }, + { + "epoch": 0.22067363530778164, + "grad_norm": 1.0648430585861206, + "learning_rate": 4.8547364686208106e-05, + "loss": 0.7321, + "step": 1520 + }, + { + "epoch": 0.2221254355400697, + "grad_norm": 1.3580704927444458, + "learning_rate": 4.852812460915178e-05, + "loss": 0.8827, + "step": 1530 + }, + { + "epoch": 0.22357723577235772, + "grad_norm": 1.9950529336929321, + "learning_rate": 4.850876181571592e-05, + "loss": 0.8698, + "step": 1540 + }, + { + "epoch": 0.22502903600464577, + "grad_norm": 0.6319971680641174, + "learning_rate": 4.848927640689165e-05, + "loss": 0.8824, + "step": 1550 + }, + { + "epoch": 0.2264808362369338, + "grad_norm": 0.40468019247055054, + "learning_rate": 4.846966848430964e-05, + "loss": 0.454, + "step": 1560 + }, + { + "epoch": 0.22793263646922182, + "grad_norm": 2.219438076019287, + "learning_rate": 4.8449938150239544e-05, + "loss": 0.7014, + "step": 1570 + }, + { + "epoch": 0.22938443670150988, + "grad_norm": 0.6382218599319458, + "learning_rate": 4.843008550758948e-05, + "loss": 0.7618, + "step": 1580 + }, + { + "epoch": 0.2308362369337979, + "grad_norm": 1.5169848203659058, + "learning_rate": 4.8410110659905514e-05, + "loss": 0.9599, + "step": 1590 + }, + { + "epoch": 0.23228803716608595, + "grad_norm": 1.203534483909607, + "learning_rate": 4.8390013711371085e-05, + "loss": 0.4722, + "step": 1600 + }, + { + "epoch": 0.23373983739837398, + "grad_norm": 1.456782341003418, + "learning_rate": 4.836979476680647e-05, + "loss": 0.8534, + "step": 1610 + }, + { + "epoch": 0.23519163763066203, + "grad_norm": 0.9215080142021179, + "learning_rate": 4.834945393166826e-05, + "loss": 0.8088, + "step": 1620 + }, + { + "epoch": 0.23664343786295006, + "grad_norm": 0.7815489768981934, 
+ "learning_rate": 4.832899131204879e-05, + "loss": 0.8544, + "step": 1630 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.947912871837616, + "learning_rate": 4.8308407014675577e-05, + "loss": 0.6289, + "step": 1640 + }, + { + "epoch": 0.23954703832752614, + "grad_norm": 0.6381635665893555, + "learning_rate": 4.82877011469108e-05, + "loss": 0.7655, + "step": 1650 + }, + { + "epoch": 0.24099883855981416, + "grad_norm": 1.064013957977295, + "learning_rate": 4.8266873816750716e-05, + "loss": 0.5693, + "step": 1660 + }, + { + "epoch": 0.2424506387921022, + "grad_norm": 2.0902225971221924, + "learning_rate": 4.824592513282505e-05, + "loss": 0.8012, + "step": 1670 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 6.462097644805908, + "learning_rate": 4.8224855204396555e-05, + "loss": 0.628, + "step": 1680 + }, + { + "epoch": 0.2453542392566783, + "grad_norm": 1.036537766456604, + "learning_rate": 4.820366414136028e-05, + "loss": 0.7784, + "step": 1690 + }, + { + "epoch": 0.24680603948896632, + "grad_norm": 1.232399582862854, + "learning_rate": 4.818235205424315e-05, + "loss": 0.7538, + "step": 1700 + }, + { + "epoch": 0.24825783972125434, + "grad_norm": 1.105141282081604, + "learning_rate": 4.816091905420327e-05, + "loss": 0.9042, + "step": 1710 + }, + { + "epoch": 0.2497096399535424, + "grad_norm": 0.6853220462799072, + "learning_rate": 4.813936525302942e-05, + "loss": 0.5936, + "step": 1720 + }, + { + "epoch": 0.2511614401858304, + "grad_norm": 0.4660559892654419, + "learning_rate": 4.811769076314044e-05, + "loss": 0.7323, + "step": 1730 + }, + { + "epoch": 0.25261324041811845, + "grad_norm": 1.0349425077438354, + "learning_rate": 4.809589569758464e-05, + "loss": 0.5865, + "step": 1740 + }, + { + "epoch": 0.2540650406504065, + "grad_norm": 0.4405325651168823, + "learning_rate": 4.8073980170039234e-05, + "loss": 0.7297, + "step": 1750 + }, + { + "epoch": 0.25551684088269455, + "grad_norm": 1.4432979822158813, + "learning_rate": 4.805194429480972e-05, + "loss": 0.6268, + "step": 1760 + }, + { + "epoch": 0.2569686411149826, + "grad_norm": 0.7807000279426575, + "learning_rate": 4.802978818682933e-05, + "loss": 0.7536, + "step": 1770 + }, + { + "epoch": 0.2584204413472706, + "grad_norm": 1.3717634677886963, + "learning_rate": 4.800751196165835e-05, + "loss": 0.908, + "step": 1780 + }, + { + "epoch": 0.2598722415795586, + "grad_norm": 1.9359996318817139, + "learning_rate": 4.79851157354836e-05, + "loss": 0.4698, + "step": 1790 + }, + { + "epoch": 0.2613240418118467, + "grad_norm": 2.113598346710205, + "learning_rate": 4.7962599625117773e-05, + "loss": 0.6629, + "step": 1800 + }, + { + "epoch": 0.26277584204413473, + "grad_norm": 0.7605477571487427, + "learning_rate": 4.7939963747998855e-05, + "loss": 0.727, + "step": 1810 + }, + { + "epoch": 0.26422764227642276, + "grad_norm": 0.6016331315040588, + "learning_rate": 4.7917208222189506e-05, + "loss": 0.8574, + "step": 1820 + }, + { + "epoch": 0.2656794425087108, + "grad_norm": 0.8621135950088501, + "learning_rate": 4.789433316637644e-05, + "loss": 0.7995, + "step": 1830 + }, + { + "epoch": 0.26713124274099886, + "grad_norm": 1.2249228954315186, + "learning_rate": 4.7871338699869796e-05, + "loss": 0.9538, + "step": 1840 + }, + { + "epoch": 0.2685830429732869, + "grad_norm": 3.5839085578918457, + "learning_rate": 4.784822494260255e-05, + "loss": 0.602, + "step": 1850 + }, + { + "epoch": 0.2700348432055749, + "grad_norm": 1.334702491760254, + "learning_rate": 4.782499201512983e-05, + "loss": 0.702, + "step": 1860 + }, + { + "epoch": 
0.27148664343786294, + "grad_norm": 0.8643277287483215, + "learning_rate": 4.780164003862838e-05, + "loss": 0.7837, + "step": 1870 + }, + { + "epoch": 0.27293844367015097, + "grad_norm": 0.9091192483901978, + "learning_rate": 4.777816913489581e-05, + "loss": 0.658, + "step": 1880 + }, + { + "epoch": 0.27439024390243905, + "grad_norm": 4.237992763519287, + "learning_rate": 4.775457942635006e-05, + "loss": 0.7956, + "step": 1890 + }, + { + "epoch": 0.27584204413472707, + "grad_norm": 0.5401553511619568, + "learning_rate": 4.773087103602871e-05, + "loss": 0.6637, + "step": 1900 + }, + { + "epoch": 0.2772938443670151, + "grad_norm": 2.7873334884643555, + "learning_rate": 4.770704408758837e-05, + "loss": 0.4589, + "step": 1910 + }, + { + "epoch": 0.2787456445993031, + "grad_norm": 0.6120592355728149, + "learning_rate": 4.7683098705304e-05, + "loss": 0.6523, + "step": 1920 + }, + { + "epoch": 0.28019744483159115, + "grad_norm": 0.4932442307472229, + "learning_rate": 4.765903501406826e-05, + "loss": 0.7068, + "step": 1930 + }, + { + "epoch": 0.28164924506387923, + "grad_norm": 1.102984070777893, + "learning_rate": 4.7634853139390945e-05, + "loss": 0.7414, + "step": 1940 + }, + { + "epoch": 0.28310104529616725, + "grad_norm": 0.7468515634536743, + "learning_rate": 4.7610553207398185e-05, + "loss": 0.8069, + "step": 1950 + }, + { + "epoch": 0.2845528455284553, + "grad_norm": 1.3317950963974, + "learning_rate": 4.758613534483191e-05, + "loss": 0.8219, + "step": 1960 + }, + { + "epoch": 0.2860046457607433, + "grad_norm": 1.7681723833084106, + "learning_rate": 4.7561599679049135e-05, + "loss": 0.5898, + "step": 1970 + }, + { + "epoch": 0.2874564459930314, + "grad_norm": 1.0765740871429443, + "learning_rate": 4.7536946338021306e-05, + "loss": 0.552, + "step": 1980 + }, + { + "epoch": 0.2889082462253194, + "grad_norm": 1.1886732578277588, + "learning_rate": 4.751217545033362e-05, + "loss": 0.5558, + "step": 1990 + }, + { + "epoch": 0.29036004645760743, + "grad_norm": 1.0681451559066772, + "learning_rate": 4.748728714518438e-05, + "loss": 0.6335, + "step": 2000 + }, + { + "epoch": 0.29181184668989546, + "grad_norm": 0.8771520256996155, + "learning_rate": 4.7462281552384306e-05, + "loss": 0.6354, + "step": 2010 + }, + { + "epoch": 0.2932636469221835, + "grad_norm": 1.085581660270691, + "learning_rate": 4.7437158802355854e-05, + "loss": 0.4697, + "step": 2020 + }, + { + "epoch": 0.29471544715447157, + "grad_norm": 1.2349504232406616, + "learning_rate": 4.7411919026132536e-05, + "loss": 0.5823, + "step": 2030 + }, + { + "epoch": 0.2961672473867596, + "grad_norm": 0.8741536736488342, + "learning_rate": 4.7386562355358254e-05, + "loss": 0.7622, + "step": 2040 + }, + { + "epoch": 0.2976190476190476, + "grad_norm": 3.957540273666382, + "learning_rate": 4.736108892228658e-05, + "loss": 0.696, + "step": 2050 + }, + { + "epoch": 0.29907084785133564, + "grad_norm": 1.2028242349624634, + "learning_rate": 4.733549885978012e-05, + "loss": 0.5248, + "step": 2060 + }, + { + "epoch": 0.30052264808362367, + "grad_norm": 2.623757839202881, + "learning_rate": 4.7309792301309755e-05, + "loss": 0.7899, + "step": 2070 + }, + { + "epoch": 0.30197444831591175, + "grad_norm": 0.8219063878059387, + "learning_rate": 4.728396938095399e-05, + "loss": 0.8088, + "step": 2080 + }, + { + "epoch": 0.3034262485481998, + "grad_norm": 2.02731990814209, + "learning_rate": 4.7258030233398244e-05, + "loss": 0.7673, + "step": 2090 + }, + { + "epoch": 0.3048780487804878, + "grad_norm": 1.400942087173462, + "learning_rate": 4.723197499393415e-05, 
+ "loss": 0.648, + "step": 2100 + }, + { + "epoch": 0.3063298490127758, + "grad_norm": 2.6127829551696777, + "learning_rate": 4.7205803798458836e-05, + "loss": 0.7408, + "step": 2110 + }, + { + "epoch": 0.3077816492450639, + "grad_norm": 2.252988338470459, + "learning_rate": 4.7179516783474226e-05, + "loss": 0.7625, + "step": 2120 + }, + { + "epoch": 0.30923344947735193, + "grad_norm": 1.4618316888809204, + "learning_rate": 4.7153114086086336e-05, + "loss": 0.9155, + "step": 2130 + }, + { + "epoch": 0.31068524970963995, + "grad_norm": 0.945075511932373, + "learning_rate": 4.712659584400454e-05, + "loss": 0.8939, + "step": 2140 + }, + { + "epoch": 0.312137049941928, + "grad_norm": 1.9799119234085083, + "learning_rate": 4.709996219554088e-05, + "loss": 0.7928, + "step": 2150 + }, + { + "epoch": 0.313588850174216, + "grad_norm": 3.0045998096466064, + "learning_rate": 4.7073213279609293e-05, + "loss": 0.7881, + "step": 2160 + }, + { + "epoch": 0.3150406504065041, + "grad_norm": 1.4035004377365112, + "learning_rate": 4.7046349235724964e-05, + "loss": 0.8062, + "step": 2170 + }, + { + "epoch": 0.3164924506387921, + "grad_norm": 1.9164339303970337, + "learning_rate": 4.701937020400352e-05, + "loss": 0.7617, + "step": 2180 + }, + { + "epoch": 0.31794425087108014, + "grad_norm": 1.0605820417404175, + "learning_rate": 4.699227632516034e-05, + "loss": 0.7231, + "step": 2190 + }, + { + "epoch": 0.31939605110336816, + "grad_norm": 0.9426791071891785, + "learning_rate": 4.6965067740509825e-05, + "loss": 0.6771, + "step": 2200 + }, + { + "epoch": 0.3208478513356562, + "grad_norm": 1.0823321342468262, + "learning_rate": 4.693774459196465e-05, + "loss": 0.8387, + "step": 2210 + }, + { + "epoch": 0.32229965156794427, + "grad_norm": 1.703384518623352, + "learning_rate": 4.691030702203502e-05, + "loss": 0.4302, + "step": 2220 + }, + { + "epoch": 0.3237514518002323, + "grad_norm": 1.2216838598251343, + "learning_rate": 4.6882755173827933e-05, + "loss": 0.5434, + "step": 2230 + }, + { + "epoch": 0.3252032520325203, + "grad_norm": 0.8944600820541382, + "learning_rate": 4.6855089191046406e-05, + "loss": 0.7718, + "step": 2240 + }, + { + "epoch": 0.32665505226480834, + "grad_norm": 1.4830057621002197, + "learning_rate": 4.682730921798881e-05, + "loss": 0.7067, + "step": 2250 + }, + { + "epoch": 0.3281068524970964, + "grad_norm": 1.1373881101608276, + "learning_rate": 4.679941539954801e-05, + "loss": 0.6134, + "step": 2260 + }, + { + "epoch": 0.32955865272938445, + "grad_norm": 3.311741352081299, + "learning_rate": 4.677140788121067e-05, + "loss": 0.5914, + "step": 2270 + }, + { + "epoch": 0.3310104529616725, + "grad_norm": 1.3183683156967163, + "learning_rate": 4.674328680905649e-05, + "loss": 0.6412, + "step": 2280 + }, + { + "epoch": 0.3324622531939605, + "grad_norm": 0.6239253282546997, + "learning_rate": 4.671505232975741e-05, + "loss": 0.8585, + "step": 2290 + }, + { + "epoch": 0.3339140534262485, + "grad_norm": 0.6019532680511475, + "learning_rate": 4.668670459057692e-05, + "loss": 0.6322, + "step": 2300 + }, + { + "epoch": 0.3353658536585366, + "grad_norm": 1.3409720659255981, + "learning_rate": 4.665824373936921e-05, + "loss": 0.8676, + "step": 2310 + }, + { + "epoch": 0.33681765389082463, + "grad_norm": 1.3901034593582153, + "learning_rate": 4.662966992457842e-05, + "loss": 0.6381, + "step": 2320 + }, + { + "epoch": 0.33826945412311266, + "grad_norm": 0.4752490818500519, + "learning_rate": 4.660098329523791e-05, + "loss": 0.7852, + "step": 2330 + }, + { + "epoch": 0.3397212543554007, + "grad_norm": 
0.8826183676719666, + "learning_rate": 4.657218400096942e-05, + "loss": 0.7941, + "step": 2340 + }, + { + "epoch": 0.3411730545876887, + "grad_norm": 1.8894481658935547, + "learning_rate": 4.654327219198235e-05, + "loss": 0.554, + "step": 2350 + }, + { + "epoch": 0.3426248548199768, + "grad_norm": 4.281989097595215, + "learning_rate": 4.6514248019072926e-05, + "loss": 0.6456, + "step": 2360 + }, + { + "epoch": 0.3440766550522648, + "grad_norm": 1.1848098039627075, + "learning_rate": 4.648511163362343e-05, + "loss": 0.8237, + "step": 2370 + }, + { + "epoch": 0.34552845528455284, + "grad_norm": 1.174756646156311, + "learning_rate": 4.645586318760145e-05, + "loss": 0.709, + "step": 2380 + }, + { + "epoch": 0.34698025551684086, + "grad_norm": 2.8332509994506836, + "learning_rate": 4.6426502833559e-05, + "loss": 0.6055, + "step": 2390 + }, + { + "epoch": 0.34843205574912894, + "grad_norm": 0.6192472577095032, + "learning_rate": 4.639703072463181e-05, + "loss": 0.8328, + "step": 2400 + }, + { + "epoch": 0.34988385598141697, + "grad_norm": 0.6660485863685608, + "learning_rate": 4.636744701453849e-05, + "loss": 0.92, + "step": 2410 + }, + { + "epoch": 0.351335656213705, + "grad_norm": 1.6284211874008179, + "learning_rate": 4.633775185757973e-05, + "loss": 0.7252, + "step": 2420 + }, + { + "epoch": 0.352787456445993, + "grad_norm": 0.7274760007858276, + "learning_rate": 4.630794540863747e-05, + "loss": 0.6107, + "step": 2430 + }, + { + "epoch": 0.35423925667828104, + "grad_norm": 2.6577463150024414, + "learning_rate": 4.627802782317417e-05, + "loss": 0.647, + "step": 2440 + }, + { + "epoch": 0.3556910569105691, + "grad_norm": 1.4532408714294434, + "learning_rate": 4.624799925723191e-05, + "loss": 0.435, + "step": 2450 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 2.7971816062927246, + "learning_rate": 4.621785986743163e-05, + "loss": 0.5866, + "step": 2460 + }, + { + "epoch": 0.3585946573751452, + "grad_norm": 1.4571512937545776, + "learning_rate": 4.61876098109723e-05, + "loss": 0.7796, + "step": 2470 + }, + { + "epoch": 0.3600464576074332, + "grad_norm": 2.3864150047302246, + "learning_rate": 4.6157249245630075e-05, + "loss": 0.9921, + "step": 2480 + }, + { + "epoch": 0.3614982578397213, + "grad_norm": 2.915992021560669, + "learning_rate": 4.6126778329757516e-05, + "loss": 0.7665, + "step": 2490 + }, + { + "epoch": 0.3629500580720093, + "grad_norm": 2.1495201587677, + "learning_rate": 4.609619722228274e-05, + "loss": 0.6569, + "step": 2500 + }, + { + "epoch": 0.36440185830429733, + "grad_norm": 2.9136157035827637, + "learning_rate": 4.606550608270859e-05, + "loss": 0.835, + "step": 2510 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 0.8638590574264526, + "learning_rate": 4.603470507111182e-05, + "loss": 0.7063, + "step": 2520 + }, + { + "epoch": 0.3673054587688734, + "grad_norm": 2.173835277557373, + "learning_rate": 4.600379434814221e-05, + "loss": 0.761, + "step": 2530 + }, + { + "epoch": 0.36875725900116146, + "grad_norm": 2.0101635456085205, + "learning_rate": 4.597277407502181e-05, + "loss": 0.5618, + "step": 2540 + }, + { + "epoch": 0.3702090592334495, + "grad_norm": 1.1493425369262695, + "learning_rate": 4.5941644413544024e-05, + "loss": 0.671, + "step": 2550 + }, + { + "epoch": 0.3716608594657375, + "grad_norm": 1.129114031791687, + "learning_rate": 4.591040552607281e-05, + "loss": 0.601, + "step": 2560 + }, + { + "epoch": 0.37311265969802554, + "grad_norm": 2.0701091289520264, + "learning_rate": 4.587905757554182e-05, + "loss": 0.8573, + "step": 2570 + }, + { + 
"epoch": 0.37456445993031356, + "grad_norm": 1.2713189125061035, + "learning_rate": 4.5847600725453536e-05, + "loss": 0.6449, + "step": 2580 + }, + { + "epoch": 0.37601626016260165, + "grad_norm": 1.8538284301757812, + "learning_rate": 4.581603513987845e-05, + "loss": 0.6038, + "step": 2590 + }, + { + "epoch": 0.37746806039488967, + "grad_norm": 1.350251317024231, + "learning_rate": 4.5784360983454175e-05, + "loss": 0.5973, + "step": 2600 + }, + { + "epoch": 0.3789198606271777, + "grad_norm": 0.7953972220420837, + "learning_rate": 4.5752578421384606e-05, + "loss": 0.9078, + "step": 2610 + }, + { + "epoch": 0.3803716608594657, + "grad_norm": 0.8986756205558777, + "learning_rate": 4.572068761943905e-05, + "loss": 0.6951, + "step": 2620 + }, + { + "epoch": 0.3818234610917538, + "grad_norm": 0.920846700668335, + "learning_rate": 4.568868874395137e-05, + "loss": 0.4939, + "step": 2630 + }, + { + "epoch": 0.3832752613240418, + "grad_norm": 1.8228408098220825, + "learning_rate": 4.565658196181909e-05, + "loss": 0.8694, + "step": 2640 + }, + { + "epoch": 0.38472706155632985, + "grad_norm": 1.1996351480484009, + "learning_rate": 4.5624367440502594e-05, + "loss": 0.6528, + "step": 2650 + }, + { + "epoch": 0.3861788617886179, + "grad_norm": 1.8428452014923096, + "learning_rate": 4.559204534802415e-05, + "loss": 0.6755, + "step": 2660 + }, + { + "epoch": 0.3876306620209059, + "grad_norm": 1.1987791061401367, + "learning_rate": 4.555961585296712e-05, + "loss": 0.5469, + "step": 2670 + }, + { + "epoch": 0.389082462253194, + "grad_norm": 1.405840277671814, + "learning_rate": 4.5527079124475045e-05, + "loss": 0.7443, + "step": 2680 + }, + { + "epoch": 0.390534262485482, + "grad_norm": 1.371089220046997, + "learning_rate": 4.549443533225075e-05, + "loss": 0.7145, + "step": 2690 + }, + { + "epoch": 0.39198606271777003, + "grad_norm": 1.3392704725265503, + "learning_rate": 4.546168464655551e-05, + "loss": 0.6241, + "step": 2700 + }, + { + "epoch": 0.39343786295005806, + "grad_norm": 1.8694888353347778, + "learning_rate": 4.542882723820809e-05, + "loss": 0.7412, + "step": 2710 + }, + { + "epoch": 0.3948896631823461, + "grad_norm": 0.8335723876953125, + "learning_rate": 4.5395863278583914e-05, + "loss": 0.5457, + "step": 2720 + }, + { + "epoch": 0.39634146341463417, + "grad_norm": 1.200954556465149, + "learning_rate": 4.5362792939614126e-05, + "loss": 0.8856, + "step": 2730 + }, + { + "epoch": 0.3977932636469222, + "grad_norm": 0.7490825057029724, + "learning_rate": 4.532961639378477e-05, + "loss": 0.7058, + "step": 2740 + }, + { + "epoch": 0.3992450638792102, + "grad_norm": 0.5736889839172363, + "learning_rate": 4.529633381413577e-05, + "loss": 0.8461, + "step": 2750 + }, + { + "epoch": 0.40069686411149824, + "grad_norm": 3.038465976715088, + "learning_rate": 4.526294537426013e-05, + "loss": 0.9319, + "step": 2760 + }, + { + "epoch": 0.4021486643437863, + "grad_norm": 3.3678839206695557, + "learning_rate": 4.5229451248302996e-05, + "loss": 0.7878, + "step": 2770 + }, + { + "epoch": 0.40360046457607435, + "grad_norm": 0.9918755292892456, + "learning_rate": 4.5195851610960716e-05, + "loss": 0.5738, + "step": 2780 + }, + { + "epoch": 0.40505226480836237, + "grad_norm": 0.45315515995025635, + "learning_rate": 4.516214663747999e-05, + "loss": 0.8513, + "step": 2790 + }, + { + "epoch": 0.4065040650406504, + "grad_norm": 3.0047781467437744, + "learning_rate": 4.512833650365691e-05, + "loss": 0.494, + "step": 2800 + }, + { + "epoch": 0.4079558652729384, + "grad_norm": 1.6291121244430542, + "learning_rate": 
4.509442138583604e-05, + "loss": 0.4759, + "step": 2810 + }, + { + "epoch": 0.4094076655052265, + "grad_norm": 1.279628038406372, + "learning_rate": 4.506040146090953e-05, + "loss": 0.75, + "step": 2820 + }, + { + "epoch": 0.41085946573751453, + "grad_norm": 0.6952537894248962, + "learning_rate": 4.502627690631618e-05, + "loss": 0.6722, + "step": 2830 + }, + { + "epoch": 0.41231126596980255, + "grad_norm": 6.771650791168213, + "learning_rate": 4.499204790004051e-05, + "loss": 0.6538, + "step": 2840 + }, + { + "epoch": 0.4137630662020906, + "grad_norm": 1.1350947618484497, + "learning_rate": 4.49577146206118e-05, + "loss": 0.651, + "step": 2850 + }, + { + "epoch": 0.4152148664343786, + "grad_norm": 1.379130482673645, + "learning_rate": 4.492327724710324e-05, + "loss": 0.8259, + "step": 2860 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.9285208582878113, + "learning_rate": 4.488873595913091e-05, + "loss": 0.5317, + "step": 2870 + }, + { + "epoch": 0.4181184668989547, + "grad_norm": 1.3536639213562012, + "learning_rate": 4.485409093685289e-05, + "loss": 0.9471, + "step": 2880 + }, + { + "epoch": 0.41957026713124274, + "grad_norm": 1.6582531929016113, + "learning_rate": 4.4819342360968316e-05, + "loss": 0.6531, + "step": 2890 + }, + { + "epoch": 0.42102206736353076, + "grad_norm": 0.5296352505683899, + "learning_rate": 4.478449041271644e-05, + "loss": 0.8268, + "step": 2900 + }, + { + "epoch": 0.42247386759581884, + "grad_norm": 1.2088879346847534, + "learning_rate": 4.474953527387564e-05, + "loss": 0.9049, + "step": 2910 + }, + { + "epoch": 0.42392566782810687, + "grad_norm": 0.5331336855888367, + "learning_rate": 4.471447712676256e-05, + "loss": 1.1198, + "step": 2920 + }, + { + "epoch": 0.4253774680603949, + "grad_norm": 1.4603538513183594, + "learning_rate": 4.4679316154231054e-05, + "loss": 0.5809, + "step": 2930 + }, + { + "epoch": 0.4268292682926829, + "grad_norm": 0.9502357840538025, + "learning_rate": 4.464405253967133e-05, + "loss": 0.5471, + "step": 2940 + }, + { + "epoch": 0.42828106852497094, + "grad_norm": 1.7284854650497437, + "learning_rate": 4.4608686467008926e-05, + "loss": 0.6076, + "step": 2950 + }, + { + "epoch": 0.429732868757259, + "grad_norm": 0.7732632160186768, + "learning_rate": 4.457321812070378e-05, + "loss": 0.8251, + "step": 2960 + }, + { + "epoch": 0.43118466898954705, + "grad_norm": 1.8970303535461426, + "learning_rate": 4.453764768574926e-05, + "loss": 0.6548, + "step": 2970 + }, + { + "epoch": 0.4326364692218351, + "grad_norm": 0.50247722864151, + "learning_rate": 4.450197534767121e-05, + "loss": 0.6137, + "step": 2980 + }, + { + "epoch": 0.4340882694541231, + "grad_norm": 1.2860316038131714, + "learning_rate": 4.4466201292526956e-05, + "loss": 0.5776, + "step": 2990 + }, + { + "epoch": 0.4355400696864111, + "grad_norm": 1.1598414182662964, + "learning_rate": 4.4430325706904366e-05, + "loss": 0.9806, + "step": 3000 + }, + { + "epoch": 0.4355400696864111, + "eval_loss": 0.6489894390106201, + "eval_runtime": 107.7493, + "eval_samples_per_second": 13.457, + "eval_steps_per_second": 3.369, + "step": 3000 + }, + { + "epoch": 0.4369918699186992, + "grad_norm": 0.9044310450553894, + "learning_rate": 4.439434877792086e-05, + "loss": 0.666, + "step": 3010 + }, + { + "epoch": 0.43844367015098723, + "grad_norm": 0.6645646095275879, + "learning_rate": 4.435827069322244e-05, + "loss": 0.5448, + "step": 3020 + }, + { + "epoch": 0.43989547038327526, + "grad_norm": 4.799647331237793, + "learning_rate": 4.4322091640982705e-05, + "loss": 0.6945, + "step": 3030 + }, 
+ { + "epoch": 0.4413472706155633, + "grad_norm": 1.6476815938949585, + "learning_rate": 4.428581180990188e-05, + "loss": 0.6551, + "step": 3040 + }, + { + "epoch": 0.44279907084785136, + "grad_norm": 0.8809843063354492, + "learning_rate": 4.424943138920581e-05, + "loss": 0.9209, + "step": 3050 + }, + { + "epoch": 0.4442508710801394, + "grad_norm": 2.0106568336486816, + "learning_rate": 4.4212950568645007e-05, + "loss": 0.6188, + "step": 3060 + }, + { + "epoch": 0.4457026713124274, + "grad_norm": 0.4192439913749695, + "learning_rate": 4.417636953849364e-05, + "loss": 0.7886, + "step": 3070 + }, + { + "epoch": 0.44715447154471544, + "grad_norm": 1.8189557790756226, + "learning_rate": 4.4139688489548534e-05, + "loss": 1.0636, + "step": 3080 + }, + { + "epoch": 0.44860627177700346, + "grad_norm": 1.1486669778823853, + "learning_rate": 4.410290761312818e-05, + "loss": 0.7989, + "step": 3090 + }, + { + "epoch": 0.45005807200929154, + "grad_norm": 0.6434163451194763, + "learning_rate": 4.406602710107177e-05, + "loss": 0.7368, + "step": 3100 + }, + { + "epoch": 0.45150987224157957, + "grad_norm": 1.370603084564209, + "learning_rate": 4.4029047145738134e-05, + "loss": 0.6113, + "step": 3110 + }, + { + "epoch": 0.4529616724738676, + "grad_norm": 1.5696393251419067, + "learning_rate": 4.39919679400048e-05, + "loss": 0.6274, + "step": 3120 + }, + { + "epoch": 0.4544134727061556, + "grad_norm": 21.59466552734375, + "learning_rate": 4.3954789677266936e-05, + "loss": 0.7229, + "step": 3130 + }, + { + "epoch": 0.45586527293844364, + "grad_norm": 0.975163459777832, + "learning_rate": 4.391751255143639e-05, + "loss": 0.7115, + "step": 3140 + }, + { + "epoch": 0.4573170731707317, + "grad_norm": 0.6678398251533508, + "learning_rate": 4.3880136756940624e-05, + "loss": 0.6668, + "step": 3150 + }, + { + "epoch": 0.45876887340301975, + "grad_norm": 0.9730459451675415, + "learning_rate": 4.384266248872176e-05, + "loss": 0.6139, + "step": 3160 + }, + { + "epoch": 0.4602206736353078, + "grad_norm": 0.7275809049606323, + "learning_rate": 4.380508994223551e-05, + "loss": 0.9358, + "step": 3170 + }, + { + "epoch": 0.4616724738675958, + "grad_norm": 4.506844520568848, + "learning_rate": 4.376741931345019e-05, + "loss": 0.5481, + "step": 3180 + }, + { + "epoch": 0.4631242740998839, + "grad_norm": 0.5535733699798584, + "learning_rate": 4.3729650798845676e-05, + "loss": 0.7074, + "step": 3190 + }, + { + "epoch": 0.4645760743321719, + "grad_norm": 0.7955453991889954, + "learning_rate": 4.36917845954124e-05, + "loss": 0.5912, + "step": 3200 + }, + { + "epoch": 0.46602787456445993, + "grad_norm": 1.144351601600647, + "learning_rate": 4.365382090065032e-05, + "loss": 0.893, + "step": 3210 + }, + { + "epoch": 0.46747967479674796, + "grad_norm": 2.5055947303771973, + "learning_rate": 4.3615759912567864e-05, + "loss": 0.7052, + "step": 3220 + }, + { + "epoch": 0.468931475029036, + "grad_norm": 2.367400884628296, + "learning_rate": 4.3577601829680925e-05, + "loss": 0.5374, + "step": 3230 + }, + { + "epoch": 0.47038327526132406, + "grad_norm": 2.6038706302642822, + "learning_rate": 4.353934685101181e-05, + "loss": 0.5551, + "step": 3240 + }, + { + "epoch": 0.4718350754936121, + "grad_norm": 1.4026364088058472, + "learning_rate": 4.350099517608823e-05, + "loss": 0.7855, + "step": 3250 + }, + { + "epoch": 0.4732868757259001, + "grad_norm": 1.1398979425430298, + "learning_rate": 4.346254700494221e-05, + "loss": 0.6862, + "step": 3260 + }, + { + "epoch": 0.47473867595818814, + "grad_norm": 0.881351888179779, + "learning_rate": 
4.3424002538109096e-05, + "loss": 0.7258, + "step": 3270 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 3.777125120162964, + "learning_rate": 4.338536197662646e-05, + "loss": 0.6882, + "step": 3280 + }, + { + "epoch": 0.47764227642276424, + "grad_norm": 1.4731556177139282, + "learning_rate": 4.3346625522033105e-05, + "loss": 0.8303, + "step": 3290 + }, + { + "epoch": 0.47909407665505227, + "grad_norm": 1.810880184173584, + "learning_rate": 4.330779337636798e-05, + "loss": 0.7837, + "step": 3300 + }, + { + "epoch": 0.4805458768873403, + "grad_norm": 1.3891079425811768, + "learning_rate": 4.326886574216911e-05, + "loss": 0.4782, + "step": 3310 + }, + { + "epoch": 0.4819976771196283, + "grad_norm": 3.7195885181427, + "learning_rate": 4.32298428224726e-05, + "loss": 0.6343, + "step": 3320 + }, + { + "epoch": 0.4834494773519164, + "grad_norm": 1.837262511253357, + "learning_rate": 4.319072482081151e-05, + "loss": 0.4242, + "step": 3330 + }, + { + "epoch": 0.4849012775842044, + "grad_norm": 0.9354246854782104, + "learning_rate": 4.315151194121484e-05, + "loss": 0.6616, + "step": 3340 + }, + { + "epoch": 0.48635307781649245, + "grad_norm": 5.568230152130127, + "learning_rate": 4.3112204388206436e-05, + "loss": 0.5538, + "step": 3350 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 1.4984145164489746, + "learning_rate": 4.307280236680393e-05, + "loss": 0.6217, + "step": 3360 + }, + { + "epoch": 0.4892566782810685, + "grad_norm": 1.278181552886963, + "learning_rate": 4.303330608251769e-05, + "loss": 0.6273, + "step": 3370 + }, + { + "epoch": 0.4907084785133566, + "grad_norm": 0.48235225677490234, + "learning_rate": 4.2993715741349726e-05, + "loss": 0.5814, + "step": 3380 + }, + { + "epoch": 0.4921602787456446, + "grad_norm": 0.9399949312210083, + "learning_rate": 4.2954031549792634e-05, + "loss": 0.869, + "step": 3390 + }, + { + "epoch": 0.49361207897793263, + "grad_norm": 1.9232203960418701, + "learning_rate": 4.291425371482849e-05, + "loss": 0.8627, + "step": 3400 + }, + { + "epoch": 0.49506387921022066, + "grad_norm": 0.5802033543586731, + "learning_rate": 4.287438244392781e-05, + "loss": 0.8384, + "step": 3410 + }, + { + "epoch": 0.4965156794425087, + "grad_norm": 0.3144931197166443, + "learning_rate": 4.283441794504842e-05, + "loss": 0.6346, + "step": 3420 + }, + { + "epoch": 0.49796747967479676, + "grad_norm": 5.040658473968506, + "learning_rate": 4.279436042663443e-05, + "loss": 0.6497, + "step": 3430 + }, + { + "epoch": 0.4994192799070848, + "grad_norm": 0.7379769682884216, + "learning_rate": 4.275421009761509e-05, + "loss": 0.6061, + "step": 3440 + }, + { + "epoch": 0.5008710801393729, + "grad_norm": 0.975500226020813, + "learning_rate": 4.271396716740374e-05, + "loss": 0.601, + "step": 3450 + }, + { + "epoch": 0.5023228803716608, + "grad_norm": 1.0296087265014648, + "learning_rate": 4.267363184589669e-05, + "loss": 0.5649, + "step": 3460 + }, + { + "epoch": 0.5037746806039489, + "grad_norm": 1.4896851778030396, + "learning_rate": 4.2633204343472146e-05, + "loss": 0.6021, + "step": 3470 + }, + { + "epoch": 0.5052264808362369, + "grad_norm": 1.235889196395874, + "learning_rate": 4.25926848709891e-05, + "loss": 0.4451, + "step": 3480 + }, + { + "epoch": 0.506678281068525, + "grad_norm": 0.9615374207496643, + "learning_rate": 4.255207363978625e-05, + "loss": 0.4711, + "step": 3490 + }, + { + "epoch": 0.508130081300813, + "grad_norm": 1.6776018142700195, + "learning_rate": 4.251137086168086e-05, + "loss": 0.7406, + "step": 3500 + }, + { + "epoch": 0.509581881533101, + 
"grad_norm": 1.5150796175003052, + "learning_rate": 4.247057674896771e-05, + "loss": 0.496, + "step": 3510 + }, + { + "epoch": 0.5110336817653891, + "grad_norm": 1.1669261455535889, + "learning_rate": 4.24296915144179e-05, + "loss": 0.8257, + "step": 3520 + }, + { + "epoch": 0.5124854819976771, + "grad_norm": 0.6701371073722839, + "learning_rate": 4.2388715371277875e-05, + "loss": 0.8408, + "step": 3530 + }, + { + "epoch": 0.5139372822299652, + "grad_norm": 1.5670065879821777, + "learning_rate": 4.234764853326817e-05, + "loss": 0.9285, + "step": 3540 + }, + { + "epoch": 0.5153890824622532, + "grad_norm": 0.589513897895813, + "learning_rate": 4.230649121458239e-05, + "loss": 0.7376, + "step": 3550 + }, + { + "epoch": 0.5168408826945412, + "grad_norm": 0.7740994095802307, + "learning_rate": 4.226524362988605e-05, + "loss": 0.5336, + "step": 3560 + }, + { + "epoch": 0.5182926829268293, + "grad_norm": 1.503607153892517, + "learning_rate": 4.222390599431549e-05, + "loss": 0.8121, + "step": 3570 + }, + { + "epoch": 0.5197444831591173, + "grad_norm": 1.1378567218780518, + "learning_rate": 4.21824785234767e-05, + "loss": 1.0838, + "step": 3580 + }, + { + "epoch": 0.5211962833914053, + "grad_norm": 0.8732675313949585, + "learning_rate": 4.214096143344425e-05, + "loss": 0.6242, + "step": 3590 + }, + { + "epoch": 0.5226480836236934, + "grad_norm": 1.4234071969985962, + "learning_rate": 4.2099354940760124e-05, + "loss": 0.7382, + "step": 3600 + }, + { + "epoch": 0.5240998838559814, + "grad_norm": 0.9399917721748352, + "learning_rate": 4.205765926243264e-05, + "loss": 0.6173, + "step": 3610 + }, + { + "epoch": 0.5255516840882695, + "grad_norm": 1.9771159887313843, + "learning_rate": 4.201587461593522e-05, + "loss": 0.9029, + "step": 3620 + }, + { + "epoch": 0.5270034843205574, + "grad_norm": 2.2527432441711426, + "learning_rate": 4.197400121920539e-05, + "loss": 0.624, + "step": 3630 + }, + { + "epoch": 0.5284552845528455, + "grad_norm": 1.7543494701385498, + "learning_rate": 4.193203929064353e-05, + "loss": 0.5714, + "step": 3640 + }, + { + "epoch": 0.5299070847851336, + "grad_norm": 0.9363800883293152, + "learning_rate": 4.1889989049111794e-05, + "loss": 0.5273, + "step": 3650 + }, + { + "epoch": 0.5313588850174216, + "grad_norm": 0.6811619400978088, + "learning_rate": 4.184785071393295e-05, + "loss": 0.634, + "step": 3660 + }, + { + "epoch": 0.5328106852497096, + "grad_norm": 1.3300182819366455, + "learning_rate": 4.180562450488923e-05, + "loss": 0.7374, + "step": 3670 + }, + { + "epoch": 0.5342624854819977, + "grad_norm": 5.183244228363037, + "learning_rate": 4.17633106422212e-05, + "loss": 0.6945, + "step": 3680 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 3.467090368270874, + "learning_rate": 4.1720909346626624e-05, + "loss": 0.52, + "step": 3690 + }, + { + "epoch": 0.5371660859465738, + "grad_norm": 0.5636081099510193, + "learning_rate": 4.167842083925926e-05, + "loss": 0.7019, + "step": 3700 + }, + { + "epoch": 0.5386178861788617, + "grad_norm": 0.8139100074768066, + "learning_rate": 4.163584534172774e-05, + "loss": 0.6844, + "step": 3710 + }, + { + "epoch": 0.5400696864111498, + "grad_norm": 0.3868808150291443, + "learning_rate": 4.1593183076094445e-05, + "loss": 0.4764, + "step": 3720 + }, + { + "epoch": 0.5415214866434379, + "grad_norm": 3.8870656490325928, + "learning_rate": 4.155043426487429e-05, + "loss": 0.6925, + "step": 3730 + }, + { + "epoch": 0.5429732868757259, + "grad_norm": 1.7030867338180542, + "learning_rate": 4.150759913103359e-05, + "loss": 0.5368, + "step": 3740 + 
}, + { + "epoch": 0.544425087108014, + "grad_norm": 1.52249276638031, + "learning_rate": 4.1464677897988904e-05, + "loss": 0.6469, + "step": 3750 + }, + { + "epoch": 0.5458768873403019, + "grad_norm": 1.3640564680099487, + "learning_rate": 4.1421670789605856e-05, + "loss": 0.6186, + "step": 3760 + }, + { + "epoch": 0.54732868757259, + "grad_norm": 0.9472920298576355, + "learning_rate": 4.137857803019797e-05, + "loss": 0.6701, + "step": 3770 + }, + { + "epoch": 0.5487804878048781, + "grad_norm": 2.9700679779052734, + "learning_rate": 4.1335399844525514e-05, + "loss": 0.6616, + "step": 3780 + }, + { + "epoch": 0.5502322880371661, + "grad_norm": 1.1544781923294067, + "learning_rate": 4.129213645779431e-05, + "loss": 0.6644, + "step": 3790 + }, + { + "epoch": 0.5516840882694541, + "grad_norm": 2.1192784309387207, + "learning_rate": 4.124878809565455e-05, + "loss": 0.5912, + "step": 3800 + }, + { + "epoch": 0.5531358885017421, + "grad_norm": 0.9204639196395874, + "learning_rate": 4.1205354984199665e-05, + "loss": 1.0158, + "step": 3810 + }, + { + "epoch": 0.5545876887340302, + "grad_norm": 1.1523475646972656, + "learning_rate": 4.116183734996509e-05, + "loss": 0.5879, + "step": 3820 + }, + { + "epoch": 0.5560394889663183, + "grad_norm": 1.5894629955291748, + "learning_rate": 4.1118235419927125e-05, + "loss": 0.5309, + "step": 3830 + }, + { + "epoch": 0.5574912891986062, + "grad_norm": 1.463646650314331, + "learning_rate": 4.107454942150173e-05, + "loss": 0.5955, + "step": 3840 + }, + { + "epoch": 0.5589430894308943, + "grad_norm": 0.8998947739601135, + "learning_rate": 4.103077958254334e-05, + "loss": 0.5999, + "step": 3850 + }, + { + "epoch": 0.5603948896631823, + "grad_norm": 1.8093136548995972, + "learning_rate": 4.098692613134367e-05, + "loss": 0.7605, + "step": 3860 + }, + { + "epoch": 0.5618466898954704, + "grad_norm": 1.070966124534607, + "learning_rate": 4.0942989296630566e-05, + "loss": 0.7076, + "step": 3870 + }, + { + "epoch": 0.5632984901277585, + "grad_norm": 1.424028754234314, + "learning_rate": 4.0898969307566734e-05, + "loss": 0.553, + "step": 3880 + }, + { + "epoch": 0.5647502903600464, + "grad_norm": 4.3886189460754395, + "learning_rate": 4.0854866393748633e-05, + "loss": 0.6369, + "step": 3890 + }, + { + "epoch": 0.5662020905923345, + "grad_norm": 0.7212158441543579, + "learning_rate": 4.081068078520521e-05, + "loss": 0.5729, + "step": 3900 + }, + { + "epoch": 0.5676538908246226, + "grad_norm": 1.5475590229034424, + "learning_rate": 4.076641271239674e-05, + "loss": 0.6781, + "step": 3910 + }, + { + "epoch": 0.5691056910569106, + "grad_norm": 2.9124624729156494, + "learning_rate": 4.072206240621359e-05, + "loss": 0.3627, + "step": 3920 + }, + { + "epoch": 0.5705574912891986, + "grad_norm": 3.567720651626587, + "learning_rate": 4.067763009797506e-05, + "loss": 0.6201, + "step": 3930 + }, + { + "epoch": 0.5720092915214866, + "grad_norm": 1.0543193817138672, + "learning_rate": 4.063311601942814e-05, + "loss": 0.8288, + "step": 3940 + }, + { + "epoch": 0.5734610917537747, + "grad_norm": 2.356640338897705, + "learning_rate": 4.058852040274629e-05, + "loss": 0.7107, + "step": 3950 + }, + { + "epoch": 0.5749128919860628, + "grad_norm": 1.225469946861267, + "learning_rate": 4.054384348052829e-05, + "loss": 0.7114, + "step": 3960 + }, + { + "epoch": 0.5763646922183507, + "grad_norm": 1.6612083911895752, + "learning_rate": 4.049908548579695e-05, + "loss": 0.6198, + "step": 3970 + }, + { + "epoch": 0.5778164924506388, + "grad_norm": 0.8432019352912903, + "learning_rate": 
4.0454246651997976e-05, + "loss": 0.641, + "step": 3980 + }, + { + "epoch": 0.5792682926829268, + "grad_norm": 1.41001296043396, + "learning_rate": 4.040932721299866e-05, + "loss": 0.6773, + "step": 3990 + }, + { + "epoch": 0.5807200929152149, + "grad_norm": 4.1915788650512695, + "learning_rate": 4.036432740308675e-05, + "loss": 0.708, + "step": 4000 + }, + { + "epoch": 0.582171893147503, + "grad_norm": 1.1455175876617432, + "learning_rate": 4.031924745696915e-05, + "loss": 0.687, + "step": 4010 + }, + { + "epoch": 0.5836236933797909, + "grad_norm": 0.27715983986854553, + "learning_rate": 4.027408760977078e-05, + "loss": 0.6192, + "step": 4020 + }, + { + "epoch": 0.585075493612079, + "grad_norm": 0.9823393821716309, + "learning_rate": 4.022884809703325e-05, + "loss": 0.7417, + "step": 4030 + }, + { + "epoch": 0.586527293844367, + "grad_norm": 1.0322932004928589, + "learning_rate": 4.018352915471373e-05, + "loss": 0.6031, + "step": 4040 + }, + { + "epoch": 0.587979094076655, + "grad_norm": 1.231325387954712, + "learning_rate": 4.0138131019183635e-05, + "loss": 0.6654, + "step": 4050 + }, + { + "epoch": 0.5894308943089431, + "grad_norm": 0.7293880581855774, + "learning_rate": 4.009265392722745e-05, + "loss": 0.7368, + "step": 4060 + }, + { + "epoch": 0.5908826945412311, + "grad_norm": 1.2683119773864746, + "learning_rate": 4.0047098116041494e-05, + "loss": 0.7025, + "step": 4070 + }, + { + "epoch": 0.5923344947735192, + "grad_norm": 3.7659318447113037, + "learning_rate": 4.000146382323262e-05, + "loss": 0.6851, + "step": 4080 + }, + { + "epoch": 0.5937862950058072, + "grad_norm": 0.5533025860786438, + "learning_rate": 3.995575128681706e-05, + "loss": 0.7296, + "step": 4090 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 1.3915671110153198, + "learning_rate": 3.990996074521912e-05, + "loss": 0.8556, + "step": 4100 + }, + { + "epoch": 0.5966898954703833, + "grad_norm": 1.290931224822998, + "learning_rate": 3.986409243726997e-05, + "loss": 0.6936, + "step": 4110 + }, + { + "epoch": 0.5981416957026713, + "grad_norm": 1.8250644207000732, + "learning_rate": 3.981814660220639e-05, + "loss": 0.48, + "step": 4120 + }, + { + "epoch": 0.5995934959349594, + "grad_norm": 5.125851631164551, + "learning_rate": 3.977212347966951e-05, + "loss": 0.6769, + "step": 4130 + }, + { + "epoch": 0.6010452961672473, + "grad_norm": 1.0293982028961182, + "learning_rate": 3.9726023309703586e-05, + "loss": 0.4873, + "step": 4140 + }, + { + "epoch": 0.6024970963995354, + "grad_norm": 1.5232713222503662, + "learning_rate": 3.9679846332754716e-05, + "loss": 0.5796, + "step": 4150 + }, + { + "epoch": 0.6039488966318235, + "grad_norm": 1.948309302330017, + "learning_rate": 3.963359278966962e-05, + "loss": 0.7975, + "step": 4160 + }, + { + "epoch": 0.6054006968641115, + "grad_norm": 4.971721649169922, + "learning_rate": 3.9587262921694343e-05, + "loss": 0.5604, + "step": 4170 + }, + { + "epoch": 0.6068524970963995, + "grad_norm": 0.7850014567375183, + "learning_rate": 3.954085697047305e-05, + "loss": 0.6898, + "step": 4180 + }, + { + "epoch": 0.6083042973286876, + "grad_norm": 0.5327876210212708, + "learning_rate": 3.949437517804672e-05, + "loss": 0.7244, + "step": 4190 + }, + { + "epoch": 0.6097560975609756, + "grad_norm": 2.595165729522705, + "learning_rate": 3.944781778685189e-05, + "loss": 0.6537, + "step": 4200 + }, + { + "epoch": 0.6112078977932637, + "grad_norm": 3.179577350616455, + "learning_rate": 3.940118503971941e-05, + "loss": 0.6315, + "step": 4210 + }, + { + "epoch": 0.6126596980255516, + "grad_norm": 
4.726830959320068, + "learning_rate": 3.935447717987318e-05, + "loss": 0.9359, + "step": 4220 + }, + { + "epoch": 0.6141114982578397, + "grad_norm": 0.4002162516117096, + "learning_rate": 3.930769445092883e-05, + "loss": 0.7475, + "step": 4230 + }, + { + "epoch": 0.6155632984901278, + "grad_norm": 1.5376918315887451, + "learning_rate": 3.9260837096892536e-05, + "loss": 0.8695, + "step": 4240 + }, + { + "epoch": 0.6170150987224158, + "grad_norm": 1.1458797454833984, + "learning_rate": 3.921390536215966e-05, + "loss": 0.5302, + "step": 4250 + }, + { + "epoch": 0.6184668989547039, + "grad_norm": 2.180319309234619, + "learning_rate": 3.916689949151352e-05, + "loss": 0.6508, + "step": 4260 + }, + { + "epoch": 0.6199186991869918, + "grad_norm": 0.7947795391082764, + "learning_rate": 3.911981973012413e-05, + "loss": 0.5396, + "step": 4270 + }, + { + "epoch": 0.6213704994192799, + "grad_norm": 2.065096616744995, + "learning_rate": 3.907266632354687e-05, + "loss": 0.6551, + "step": 4280 + }, + { + "epoch": 0.622822299651568, + "grad_norm": 0.585402250289917, + "learning_rate": 3.902543951772125e-05, + "loss": 0.8218, + "step": 4290 + }, + { + "epoch": 0.624274099883856, + "grad_norm": 0.9007218480110168, + "learning_rate": 3.897813955896961e-05, + "loss": 0.6261, + "step": 4300 + }, + { + "epoch": 0.625725900116144, + "grad_norm": 1.722657322883606, + "learning_rate": 3.8930766693995836e-05, + "loss": 0.6373, + "step": 4310 + }, + { + "epoch": 0.627177700348432, + "grad_norm": 2.8142952919006348, + "learning_rate": 3.888332116988405e-05, + "loss": 0.7586, + "step": 4320 + }, + { + "epoch": 0.6286295005807201, + "grad_norm": 0.6167258620262146, + "learning_rate": 3.883580323409739e-05, + "loss": 0.6376, + "step": 4330 + }, + { + "epoch": 0.6300813008130082, + "grad_norm": 1.2382534742355347, + "learning_rate": 3.878821313447662e-05, + "loss": 0.7507, + "step": 4340 + }, + { + "epoch": 0.6315331010452961, + "grad_norm": 1.4185280799865723, + "learning_rate": 3.874055111923895e-05, + "loss": 0.8366, + "step": 4350 + }, + { + "epoch": 0.6329849012775842, + "grad_norm": 1.5447771549224854, + "learning_rate": 3.869281743697664e-05, + "loss": 0.7417, + "step": 4360 + }, + { + "epoch": 0.6344367015098722, + "grad_norm": 0.8044071793556213, + "learning_rate": 3.864501233665574e-05, + "loss": 0.6307, + "step": 4370 + }, + { + "epoch": 0.6358885017421603, + "grad_norm": 1.0656015872955322, + "learning_rate": 3.8597136067614834e-05, + "loss": 0.8411, + "step": 4380 + }, + { + "epoch": 0.6373403019744484, + "grad_norm": 1.03560471534729, + "learning_rate": 3.854918887956369e-05, + "loss": 0.4866, + "step": 4390 + }, + { + "epoch": 0.6387921022067363, + "grad_norm": 3.3328843116760254, + "learning_rate": 3.850117102258194e-05, + "loss": 0.5966, + "step": 4400 + }, + { + "epoch": 0.6402439024390244, + "grad_norm": 0.6904016733169556, + "learning_rate": 3.8453082747117866e-05, + "loss": 0.7452, + "step": 4410 + }, + { + "epoch": 0.6416957026713124, + "grad_norm": 1.4979177713394165, + "learning_rate": 3.8404924303986966e-05, + "loss": 0.5983, + "step": 4420 + }, + { + "epoch": 0.6431475029036005, + "grad_norm": 0.5199301838874817, + "learning_rate": 3.8356695944370766e-05, + "loss": 0.6088, + "step": 4430 + }, + { + "epoch": 0.6445993031358885, + "grad_norm": 0.7011024355888367, + "learning_rate": 3.8308397919815425e-05, + "loss": 0.8235, + "step": 4440 + }, + { + "epoch": 0.6460511033681765, + "grad_norm": 0.6176084280014038, + "learning_rate": 3.826003048223048e-05, + "loss": 0.5582, + "step": 4450 + }, + { + 
"epoch": 0.6475029036004646, + "grad_norm": 0.8521440029144287, + "learning_rate": 3.8211593883887486e-05, + "loss": 0.608, + "step": 4460 + }, + { + "epoch": 0.6489547038327527, + "grad_norm": 1.2053148746490479, + "learning_rate": 3.816308837741875e-05, + "loss": 0.6533, + "step": 4470 + }, + { + "epoch": 0.6504065040650406, + "grad_norm": 1.701720952987671, + "learning_rate": 3.811451421581595e-05, + "loss": 0.6655, + "step": 4480 + }, + { + "epoch": 0.6518583042973287, + "grad_norm": 1.8435336351394653, + "learning_rate": 3.8065871652428874e-05, + "loss": 0.6773, + "step": 4490 + }, + { + "epoch": 0.6533101045296167, + "grad_norm": 3.5968480110168457, + "learning_rate": 3.801716094096407e-05, + "loss": 0.8139, + "step": 4500 + }, + { + "epoch": 0.6547619047619048, + "grad_norm": 0.776545524597168, + "learning_rate": 3.796838233548353e-05, + "loss": 0.758, + "step": 4510 + }, + { + "epoch": 0.6562137049941928, + "grad_norm": 1.1160175800323486, + "learning_rate": 3.7919536090403366e-05, + "loss": 0.4703, + "step": 4520 + }, + { + "epoch": 0.6576655052264808, + "grad_norm": 1.2551127672195435, + "learning_rate": 3.787062246049245e-05, + "loss": 0.8029, + "step": 4530 + }, + { + "epoch": 0.6591173054587689, + "grad_norm": 1.130473256111145, + "learning_rate": 3.7821641700871174e-05, + "loss": 0.6633, + "step": 4540 + }, + { + "epoch": 0.6605691056910569, + "grad_norm": 0.6870506405830383, + "learning_rate": 3.7772594067010005e-05, + "loss": 0.5136, + "step": 4550 + }, + { + "epoch": 0.662020905923345, + "grad_norm": 1.1664706468582153, + "learning_rate": 3.772347981472824e-05, + "loss": 0.7384, + "step": 4560 + }, + { + "epoch": 0.663472706155633, + "grad_norm": 1.849837303161621, + "learning_rate": 3.767429920019261e-05, + "loss": 0.6037, + "step": 4570 + }, + { + "epoch": 0.664924506387921, + "grad_norm": 1.2257493734359741, + "learning_rate": 3.7625052479916015e-05, + "loss": 0.7564, + "step": 4580 + }, + { + "epoch": 0.6663763066202091, + "grad_norm": 1.277335286140442, + "learning_rate": 3.7575739910756124e-05, + "loss": 0.6522, + "step": 4590 + }, + { + "epoch": 0.667828106852497, + "grad_norm": 0.8080965280532837, + "learning_rate": 3.752636174991403e-05, + "loss": 0.8077, + "step": 4600 + }, + { + "epoch": 0.6692799070847851, + "grad_norm": 1.9517686367034912, + "learning_rate": 3.747691825493298e-05, + "loss": 0.5579, + "step": 4610 + }, + { + "epoch": 0.6707317073170732, + "grad_norm": 1.0174436569213867, + "learning_rate": 3.742740968369697e-05, + "loss": 0.8038, + "step": 4620 + }, + { + "epoch": 0.6721835075493612, + "grad_norm": 0.6888383626937866, + "learning_rate": 3.73778362944294e-05, + "loss": 0.8365, + "step": 4630 + }, + { + "epoch": 0.6736353077816493, + "grad_norm": 2.7746047973632812, + "learning_rate": 3.732819834569176e-05, + "loss": 0.5363, + "step": 4640 + }, + { + "epoch": 0.6750871080139372, + "grad_norm": 0.43378978967666626, + "learning_rate": 3.7278496096382254e-05, + "loss": 0.5768, + "step": 4650 + }, + { + "epoch": 0.6765389082462253, + "grad_norm": 1.7999366521835327, + "learning_rate": 3.722872980573448e-05, + "loss": 0.7168, + "step": 4660 + }, + { + "epoch": 0.6779907084785134, + "grad_norm": 0.7228707075119019, + "learning_rate": 3.717889973331603e-05, + "loss": 0.8107, + "step": 4670 + }, + { + "epoch": 0.6794425087108014, + "grad_norm": 1.048464059829712, + "learning_rate": 3.7129006139027203e-05, + "loss": 0.6335, + "step": 4680 + }, + { + "epoch": 0.6808943089430894, + "grad_norm": 3.776031494140625, + "learning_rate": 3.707904928309956e-05, + 
"loss": 0.5367, + "step": 4690 + }, + { + "epoch": 0.6823461091753774, + "grad_norm": 4.042102336883545, + "learning_rate": 3.7029029426094666e-05, + "loss": 0.5869, + "step": 4700 + }, + { + "epoch": 0.6837979094076655, + "grad_norm": 2.6105918884277344, + "learning_rate": 3.6978946828902646e-05, + "loss": 0.4038, + "step": 4710 + }, + { + "epoch": 0.6852497096399536, + "grad_norm": 0.17694531381130219, + "learning_rate": 3.6928801752740895e-05, + "loss": 0.6876, + "step": 4720 + }, + { + "epoch": 0.6867015098722415, + "grad_norm": 1.4261376857757568, + "learning_rate": 3.687859445915265e-05, + "loss": 0.4988, + "step": 4730 + }, + { + "epoch": 0.6881533101045296, + "grad_norm": 3.2906527519226074, + "learning_rate": 3.682832521000568e-05, + "loss": 0.6203, + "step": 4740 + }, + { + "epoch": 0.6896051103368177, + "grad_norm": 0.8446171283721924, + "learning_rate": 3.677799426749088e-05, + "loss": 0.9472, + "step": 4750 + }, + { + "epoch": 0.6910569105691057, + "grad_norm": 1.2324299812316895, + "learning_rate": 3.6727601894120945e-05, + "loss": 0.6428, + "step": 4760 + }, + { + "epoch": 0.6925087108013938, + "grad_norm": 2.0250608921051025, + "learning_rate": 3.667714835272895e-05, + "loss": 0.55, + "step": 4770 + }, + { + "epoch": 0.6939605110336817, + "grad_norm": 1.788245677947998, + "learning_rate": 3.662663390646701e-05, + "loss": 0.672, + "step": 4780 + }, + { + "epoch": 0.6954123112659698, + "grad_norm": 2.5829572677612305, + "learning_rate": 3.657605881880493e-05, + "loss": 0.4385, + "step": 4790 + }, + { + "epoch": 0.6968641114982579, + "grad_norm": 0.9620968699455261, + "learning_rate": 3.652542335352878e-05, + "loss": 0.8065, + "step": 4800 + }, + { + "epoch": 0.6983159117305459, + "grad_norm": 1.38759183883667, + "learning_rate": 3.647472777473954e-05, + "loss": 0.7473, + "step": 4810 + }, + { + "epoch": 0.6997677119628339, + "grad_norm": 1.4988477230072021, + "learning_rate": 3.6423972346851744e-05, + "loss": 0.6581, + "step": 4820 + }, + { + "epoch": 0.7012195121951219, + "grad_norm": 1.095119595527649, + "learning_rate": 3.637315733459207e-05, + "loss": 0.5304, + "step": 4830 + }, + { + "epoch": 0.70267131242741, + "grad_norm": 0.6751285791397095, + "learning_rate": 3.6322283002997964e-05, + "loss": 0.7912, + "step": 4840 + }, + { + "epoch": 0.7041231126596981, + "grad_norm": 4.3074564933776855, + "learning_rate": 3.62713496174163e-05, + "loss": 0.545, + "step": 4850 + }, + { + "epoch": 0.705574912891986, + "grad_norm": 1.85584557056427, + "learning_rate": 3.622035744350192e-05, + "loss": 0.9848, + "step": 4860 + }, + { + "epoch": 0.7070267131242741, + "grad_norm": 1.2834818363189697, + "learning_rate": 3.6169306747216324e-05, + "loss": 0.7151, + "step": 4870 + }, + { + "epoch": 0.7084785133565621, + "grad_norm": 2.248262882232666, + "learning_rate": 3.611819779482623e-05, + "loss": 0.5322, + "step": 4880 + }, + { + "epoch": 0.7099303135888502, + "grad_norm": 2.055523633956909, + "learning_rate": 3.606703085290221e-05, + "loss": 0.6814, + "step": 4890 + }, + { + "epoch": 0.7113821138211383, + "grad_norm": 1.6206103563308716, + "learning_rate": 3.601580618831727e-05, + "loss": 0.8505, + "step": 4900 + }, + { + "epoch": 0.7128339140534262, + "grad_norm": 1.4901407957077026, + "learning_rate": 3.5964524068245536e-05, + "loss": 0.9409, + "step": 4910 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.2524491548538208, + "learning_rate": 3.591318476016076e-05, + "loss": 0.6961, + "step": 4920 + }, + { + "epoch": 0.7157375145180023, + "grad_norm": 1.2523133754730225, + 
"learning_rate": 3.586178853183498e-05, + "loss": 0.7585, + "step": 4930 + }, + { + "epoch": 0.7171893147502904, + "grad_norm": 1.0829603672027588, + "learning_rate": 3.581033565133713e-05, + "loss": 0.6737, + "step": 4940 + }, + { + "epoch": 0.7186411149825784, + "grad_norm": 2.06748628616333, + "learning_rate": 3.5758826387031626e-05, + "loss": 0.7715, + "step": 4950 + }, + { + "epoch": 0.7200929152148664, + "grad_norm": 0.8570627570152283, + "learning_rate": 3.570726100757693e-05, + "loss": 0.7153, + "step": 4960 + }, + { + "epoch": 0.7215447154471545, + "grad_norm": 4.75230073928833, + "learning_rate": 3.5655639781924247e-05, + "loss": 0.447, + "step": 4970 + }, + { + "epoch": 0.7229965156794426, + "grad_norm": 2.5116281509399414, + "learning_rate": 3.5603962979315996e-05, + "loss": 0.5853, + "step": 4980 + }, + { + "epoch": 0.7244483159117305, + "grad_norm": 1.00091552734375, + "learning_rate": 3.555223086928453e-05, + "loss": 0.8609, + "step": 4990 + }, + { + "epoch": 0.7259001161440186, + "grad_norm": 1.0202133655548096, + "learning_rate": 3.550044372165062e-05, + "loss": 0.879, + "step": 5000 + }, + { + "epoch": 0.7273519163763066, + "grad_norm": 1.4836984872817993, + "learning_rate": 3.5448601806522134e-05, + "loss": 0.3201, + "step": 5010 + }, + { + "epoch": 0.7288037166085947, + "grad_norm": 1.1861945390701294, + "learning_rate": 3.539670539429256e-05, + "loss": 0.4413, + "step": 5020 + }, + { + "epoch": 0.7302555168408827, + "grad_norm": 1.24436616897583, + "learning_rate": 3.534475475563967e-05, + "loss": 0.7143, + "step": 5030 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 1.162705421447754, + "learning_rate": 3.5292750161524045e-05, + "loss": 0.6185, + "step": 5040 + }, + { + "epoch": 0.7331591173054588, + "grad_norm": 1.116911768913269, + "learning_rate": 3.5240691883187666e-05, + "loss": 0.6876, + "step": 5050 + }, + { + "epoch": 0.7346109175377468, + "grad_norm": 0.6887683272361755, + "learning_rate": 3.5188580192152544e-05, + "loss": 0.5068, + "step": 5060 + }, + { + "epoch": 0.7360627177700348, + "grad_norm": 0.8753703832626343, + "learning_rate": 3.513641536021925e-05, + "loss": 0.8465, + "step": 5070 + }, + { + "epoch": 0.7375145180023229, + "grad_norm": 1.0913424491882324, + "learning_rate": 3.5084197659465555e-05, + "loss": 0.5948, + "step": 5080 + }, + { + "epoch": 0.7389663182346109, + "grad_norm": 4.28510856628418, + "learning_rate": 3.503192736224496e-05, + "loss": 0.6233, + "step": 5090 + }, + { + "epoch": 0.740418118466899, + "grad_norm": 1.444339632987976, + "learning_rate": 3.49796047411853e-05, + "loss": 0.4999, + "step": 5100 + }, + { + "epoch": 0.741869918699187, + "grad_norm": 1.1212478876113892, + "learning_rate": 3.4927230069187307e-05, + "loss": 0.5284, + "step": 5110 + }, + { + "epoch": 0.743321718931475, + "grad_norm": 0.00559547683224082, + "learning_rate": 3.487480361942321e-05, + "loss": 0.4229, + "step": 5120 + }, + { + "epoch": 0.7447735191637631, + "grad_norm": 13.444393157958984, + "learning_rate": 3.482232566533529e-05, + "loss": 0.7992, + "step": 5130 + }, + { + "epoch": 0.7462253193960511, + "grad_norm": 0.6348085403442383, + "learning_rate": 3.4769796480634456e-05, + "loss": 0.7238, + "step": 5140 + }, + { + "epoch": 0.7476771196283392, + "grad_norm": 1.069054126739502, + "learning_rate": 3.471721633929885e-05, + "loss": 0.4417, + "step": 5150 + }, + { + "epoch": 0.7491289198606271, + "grad_norm": 0.9457240104675293, + "learning_rate": 3.466458551557235e-05, + "loss": 0.7843, + "step": 5160 + }, + { + "epoch": 0.7505807200929152, 
+ "grad_norm": 5.595800399780273, + "learning_rate": 3.4611904283963205e-05, + "loss": 0.8307, + "step": 5170 + }, + { + "epoch": 0.7520325203252033, + "grad_norm": 0.6603794693946838, + "learning_rate": 3.455917291924256e-05, + "loss": 0.5221, + "step": 5180 + }, + { + "epoch": 0.7534843205574913, + "grad_norm": 1.487289309501648, + "learning_rate": 3.450639169644308e-05, + "loss": 0.6535, + "step": 5190 + }, + { + "epoch": 0.7549361207897793, + "grad_norm": 0.9417099952697754, + "learning_rate": 3.445356089085743e-05, + "loss": 0.7801, + "step": 5200 + }, + { + "epoch": 0.7563879210220673, + "grad_norm": 0.5838674306869507, + "learning_rate": 3.4400680778036906e-05, + "loss": 0.5079, + "step": 5210 + }, + { + "epoch": 0.7578397212543554, + "grad_norm": 1.297662377357483, + "learning_rate": 3.434775163378997e-05, + "loss": 0.6784, + "step": 5220 + }, + { + "epoch": 0.7592915214866435, + "grad_norm": 0.6394696235656738, + "learning_rate": 3.4294773734180825e-05, + "loss": 0.5856, + "step": 5230 + }, + { + "epoch": 0.7607433217189314, + "grad_norm": 3.172327756881714, + "learning_rate": 3.424174735552799e-05, + "loss": 0.7602, + "step": 5240 + }, + { + "epoch": 0.7621951219512195, + "grad_norm": 1.0046736001968384, + "learning_rate": 3.418867277440278e-05, + "loss": 0.8301, + "step": 5250 + }, + { + "epoch": 0.7636469221835076, + "grad_norm": 5.960042953491211, + "learning_rate": 3.413555026762799e-05, + "loss": 0.745, + "step": 5260 + }, + { + "epoch": 0.7650987224157956, + "grad_norm": 0.9394850730895996, + "learning_rate": 3.408238011227635e-05, + "loss": 0.7655, + "step": 5270 + }, + { + "epoch": 0.7665505226480837, + "grad_norm": 1.9447022676467896, + "learning_rate": 3.402916258566907e-05, + "loss": 0.909, + "step": 5280 + }, + { + "epoch": 0.7680023228803716, + "grad_norm": 1.3960545063018799, + "learning_rate": 3.3975897965374515e-05, + "loss": 1.0169, + "step": 5290 + }, + { + "epoch": 0.7694541231126597, + "grad_norm": 1.291868805885315, + "learning_rate": 3.392258652920664e-05, + "loss": 0.8068, + "step": 5300 + }, + { + "epoch": 0.7709059233449478, + "grad_norm": 0.8512223362922668, + "learning_rate": 3.386922855522356e-05, + "loss": 0.6296, + "step": 5310 + }, + { + "epoch": 0.7723577235772358, + "grad_norm": 1.03252112865448, + "learning_rate": 3.3815824321726154e-05, + "loss": 0.7254, + "step": 5320 + }, + { + "epoch": 0.7738095238095238, + "grad_norm": 0.5753119587898254, + "learning_rate": 3.376237410725655e-05, + "loss": 0.8159, + "step": 5330 + }, + { + "epoch": 0.7752613240418118, + "grad_norm": 0.9350941181182861, + "learning_rate": 3.370887819059672e-05, + "loss": 0.6446, + "step": 5340 + }, + { + "epoch": 0.7767131242740999, + "grad_norm": 1.6437619924545288, + "learning_rate": 3.3655336850767e-05, + "loss": 0.891, + "step": 5350 + }, + { + "epoch": 0.778164924506388, + "grad_norm": 2.669983386993408, + "learning_rate": 3.3601750367024645e-05, + "loss": 0.8369, + "step": 5360 + }, + { + "epoch": 0.7796167247386759, + "grad_norm": 1.661522388458252, + "learning_rate": 3.354811901886234e-05, + "loss": 0.7392, + "step": 5370 + }, + { + "epoch": 0.781068524970964, + "grad_norm": 0.7996639609336853, + "learning_rate": 3.3494443086006824e-05, + "loss": 0.745, + "step": 5380 + }, + { + "epoch": 0.782520325203252, + "grad_norm": 0.6470725536346436, + "learning_rate": 3.344072284841734e-05, + "loss": 0.7941, + "step": 5390 + }, + { + "epoch": 0.7839721254355401, + "grad_norm": 1.523929476737976, + "learning_rate": 3.3386958586284204e-05, + "loss": 0.5812, + "step": 5400 + }, + 
{ + "epoch": 0.7854239256678281, + "grad_norm": 0.7597313523292542, + "learning_rate": 3.333315058002739e-05, + "loss": 0.4126, + "step": 5410 + }, + { + "epoch": 0.7868757259001161, + "grad_norm": 2.064470052719116, + "learning_rate": 3.3279299110295e-05, + "loss": 0.7855, + "step": 5420 + }, + { + "epoch": 0.7883275261324042, + "grad_norm": 0.6145796179771423, + "learning_rate": 3.3225404457961834e-05, + "loss": 0.6219, + "step": 5430 + }, + { + "epoch": 0.7897793263646922, + "grad_norm": 3.158587694168091, + "learning_rate": 3.317146690412793e-05, + "loss": 0.7321, + "step": 5440 + }, + { + "epoch": 0.7912311265969802, + "grad_norm": 4.978558540344238, + "learning_rate": 3.311748673011709e-05, + "loss": 0.5758, + "step": 5450 + }, + { + "epoch": 0.7926829268292683, + "grad_norm": 1.3039811849594116, + "learning_rate": 3.306346421747539e-05, + "loss": 0.7172, + "step": 5460 + }, + { + "epoch": 0.7941347270615563, + "grad_norm": 0.47538790106773376, + "learning_rate": 3.300939964796977e-05, + "loss": 0.5409, + "step": 5470 + }, + { + "epoch": 0.7955865272938444, + "grad_norm": 1.0770827531814575, + "learning_rate": 3.295529330358649e-05, + "loss": 0.4414, + "step": 5480 + }, + { + "epoch": 0.7970383275261324, + "grad_norm": 0.7383883595466614, + "learning_rate": 3.290114546652971e-05, + "loss": 0.5318, + "step": 5490 + }, + { + "epoch": 0.7984901277584204, + "grad_norm": 0.9000987410545349, + "learning_rate": 3.284695641922e-05, + "loss": 0.5446, + "step": 5500 + }, + { + "epoch": 0.7999419279907085, + "grad_norm": 2.9022693634033203, + "learning_rate": 3.279272644429291e-05, + "loss": 0.725, + "step": 5510 + }, + { + "epoch": 0.8013937282229965, + "grad_norm": 1.3384835720062256, + "learning_rate": 3.2738455824597405e-05, + "loss": 0.6995, + "step": 5520 + }, + { + "epoch": 0.8028455284552846, + "grad_norm": 0.9091627597808838, + "learning_rate": 3.268414484319445e-05, + "loss": 0.5134, + "step": 5530 + }, + { + "epoch": 0.8042973286875726, + "grad_norm": 3.8653523921966553, + "learning_rate": 3.262979378335557e-05, + "loss": 0.7161, + "step": 5540 + }, + { + "epoch": 0.8057491289198606, + "grad_norm": 0.8096335530281067, + "learning_rate": 3.257540292856126e-05, + "loss": 0.5652, + "step": 5550 + }, + { + "epoch": 0.8072009291521487, + "grad_norm": 1.397865653038025, + "learning_rate": 3.252097256249965e-05, + "loss": 0.6965, + "step": 5560 + }, + { + "epoch": 0.8086527293844367, + "grad_norm": 2.277859926223755, + "learning_rate": 3.246650296906489e-05, + "loss": 0.6531, + "step": 5570 + }, + { + "epoch": 0.8101045296167247, + "grad_norm": 2.0666253566741943, + "learning_rate": 3.241199443235576e-05, + "loss": 0.4249, + "step": 5580 + }, + { + "epoch": 0.8115563298490128, + "grad_norm": 1.2161462306976318, + "learning_rate": 3.2357447236674136e-05, + "loss": 0.4259, + "step": 5590 + }, + { + "epoch": 0.8130081300813008, + "grad_norm": 2.8734538555145264, + "learning_rate": 3.2302861666523564e-05, + "loss": 0.4658, + "step": 5600 + }, + { + "epoch": 0.8144599303135889, + "grad_norm": 0.739331841468811, + "learning_rate": 3.22482380066077e-05, + "loss": 0.6863, + "step": 5610 + }, + { + "epoch": 0.8159117305458768, + "grad_norm": 0.8823861479759216, + "learning_rate": 3.2193576541828894e-05, + "loss": 0.6399, + "step": 5620 + }, + { + "epoch": 0.8173635307781649, + "grad_norm": 1.240403175354004, + "learning_rate": 3.2138877557286675e-05, + "loss": 0.8784, + "step": 5630 + }, + { + "epoch": 0.818815331010453, + "grad_norm": 1.1647741794586182, + "learning_rate": 3.208414133827623e-05, + 
"loss": 0.9796, + "step": 5640 + }, + { + "epoch": 0.820267131242741, + "grad_norm": 1.0195775032043457, + "learning_rate": 3.2029368170287e-05, + "loss": 0.4319, + "step": 5650 + }, + { + "epoch": 0.8217189314750291, + "grad_norm": 1.4524924755096436, + "learning_rate": 3.197455833900112e-05, + "loss": 0.7408, + "step": 5660 + }, + { + "epoch": 0.823170731707317, + "grad_norm": 0.5133039355278015, + "learning_rate": 3.191971213029195e-05, + "loss": 0.4198, + "step": 5670 + }, + { + "epoch": 0.8246225319396051, + "grad_norm": 1.205497145652771, + "learning_rate": 3.186482983022257e-05, + "loss": 0.4425, + "step": 5680 + }, + { + "epoch": 0.8260743321718932, + "grad_norm": 0.6108511090278625, + "learning_rate": 3.180991172504434e-05, + "loss": 0.6768, + "step": 5690 + }, + { + "epoch": 0.8275261324041812, + "grad_norm": 1.1527341604232788, + "learning_rate": 3.175495810119533e-05, + "loss": 0.5248, + "step": 5700 + }, + { + "epoch": 0.8289779326364692, + "grad_norm": 1.3975361585617065, + "learning_rate": 3.16999692452989e-05, + "loss": 0.8838, + "step": 5710 + }, + { + "epoch": 0.8304297328687572, + "grad_norm": 4.7035603523254395, + "learning_rate": 3.164494544416215e-05, + "loss": 1.0907, + "step": 5720 + }, + { + "epoch": 0.8318815331010453, + "grad_norm": 1.6571784019470215, + "learning_rate": 3.158988698477445e-05, + "loss": 0.732, + "step": 5730 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.6325092315673828, + "learning_rate": 3.1534794154305935e-05, + "loss": 0.8245, + "step": 5740 + }, + { + "epoch": 0.8347851335656213, + "grad_norm": 0.9876767992973328, + "learning_rate": 3.1479667240106016e-05, + "loss": 0.7428, + "step": 5750 + }, + { + "epoch": 0.8362369337979094, + "grad_norm": 0.37352874875068665, + "learning_rate": 3.142450652970187e-05, + "loss": 0.5489, + "step": 5760 + }, + { + "epoch": 0.8376887340301974, + "grad_norm": 1.924856424331665, + "learning_rate": 3.136931231079696e-05, + "loss": 0.7834, + "step": 5770 + }, + { + "epoch": 0.8391405342624855, + "grad_norm": 1.8147573471069336, + "learning_rate": 3.1314084871269496e-05, + "loss": 0.6688, + "step": 5780 + }, + { + "epoch": 0.8405923344947736, + "grad_norm": 0.680001437664032, + "learning_rate": 3.1258824499170975e-05, + "loss": 0.6193, + "step": 5790 + }, + { + "epoch": 0.8420441347270615, + "grad_norm": 1.3029786348342896, + "learning_rate": 3.1203531482724665e-05, + "loss": 0.694, + "step": 5800 + }, + { + "epoch": 0.8434959349593496, + "grad_norm": 1.4556697607040405, + "learning_rate": 3.114820611032408e-05, + "loss": 0.7933, + "step": 5810 + }, + { + "epoch": 0.8449477351916377, + "grad_norm": 1.7649941444396973, + "learning_rate": 3.1092848670531514e-05, + "loss": 0.4818, + "step": 5820 + }, + { + "epoch": 0.8463995354239257, + "grad_norm": 0.8985478281974792, + "learning_rate": 3.1037459452076504e-05, + "loss": 0.6992, + "step": 5830 + }, + { + "epoch": 0.8478513356562137, + "grad_norm": 1.0186079740524292, + "learning_rate": 3.0982038743854346e-05, + "loss": 0.2927, + "step": 5840 + }, + { + "epoch": 0.8493031358885017, + "grad_norm": 1.724191427230835, + "learning_rate": 3.0926586834924555e-05, + "loss": 0.6936, + "step": 5850 + }, + { + "epoch": 0.8507549361207898, + "grad_norm": 0.25378018617630005, + "learning_rate": 3.087110401450941e-05, + "loss": 0.6692, + "step": 5860 + }, + { + "epoch": 0.8522067363530779, + "grad_norm": 0.5603824853897095, + "learning_rate": 3.0815590571992394e-05, + "loss": 0.4975, + "step": 5870 + }, + { + "epoch": 0.8536585365853658, + "grad_norm": 
0.4890209138393402, + "learning_rate": 3.076004679691672e-05, + "loss": 0.832, + "step": 5880 + }, + { + "epoch": 0.8551103368176539, + "grad_norm": 0.6338871121406555, + "learning_rate": 3.0704472978983795e-05, + "loss": 0.6447, + "step": 5890 + }, + { + "epoch": 0.8565621370499419, + "grad_norm": 0.9809471964836121, + "learning_rate": 3.064886940805174e-05, + "loss": 0.6176, + "step": 5900 + }, + { + "epoch": 0.85801393728223, + "grad_norm": 0.7329034209251404, + "learning_rate": 3.059323637413385e-05, + "loss": 0.4022, + "step": 5910 + }, + { + "epoch": 0.859465737514518, + "grad_norm": 1.2352603673934937, + "learning_rate": 3.053757416739708e-05, + "loss": 0.9392, + "step": 5920 + }, + { + "epoch": 0.860917537746806, + "grad_norm": 1.056897759437561, + "learning_rate": 3.0481883078160555e-05, + "loss": 0.616, + "step": 5930 + }, + { + "epoch": 0.8623693379790941, + "grad_norm": 0.6841446757316589, + "learning_rate": 3.042616339689404e-05, + "loss": 0.5995, + "step": 5940 + }, + { + "epoch": 0.8638211382113821, + "grad_norm": 1.3766181468963623, + "learning_rate": 3.0370415414216436e-05, + "loss": 0.6945, + "step": 5950 + }, + { + "epoch": 0.8652729384436701, + "grad_norm": 0.960422694683075, + "learning_rate": 3.0314639420894242e-05, + "loss": 0.6205, + "step": 5960 + }, + { + "epoch": 0.8667247386759582, + "grad_norm": 2.2252063751220703, + "learning_rate": 3.0258835707840062e-05, + "loss": 0.67, + "step": 5970 + }, + { + "epoch": 0.8681765389082462, + "grad_norm": 4.834002494812012, + "learning_rate": 3.020300456611109e-05, + "loss": 0.5169, + "step": 5980 + }, + { + "epoch": 0.8696283391405343, + "grad_norm": 0.3208721876144409, + "learning_rate": 3.0147146286907546e-05, + "loss": 0.7802, + "step": 5990 + }, + { + "epoch": 0.8710801393728222, + "grad_norm": 0.6140812039375305, + "learning_rate": 3.0091261161571227e-05, + "loss": 0.753, + "step": 6000 + }, + { + "epoch": 0.8710801393728222, + "eval_loss": 0.6239650249481201, + "eval_runtime": 107.7332, + "eval_samples_per_second": 13.459, + "eval_steps_per_second": 3.369, + "step": 6000 + }, + { + "epoch": 0.8725319396051103, + "grad_norm": 0.7981186509132385, + "learning_rate": 3.003534948158393e-05, + "loss": 0.581, + "step": 6010 + }, + { + "epoch": 0.8739837398373984, + "grad_norm": 1.1279065608978271, + "learning_rate": 2.9979411538565977e-05, + "loss": 0.5993, + "step": 6020 + }, + { + "epoch": 0.8754355400696864, + "grad_norm": 0.7594296336174011, + "learning_rate": 2.9923447624274647e-05, + "loss": 0.7433, + "step": 6030 + }, + { + "epoch": 0.8768873403019745, + "grad_norm": 4.225851058959961, + "learning_rate": 2.9867458030602684e-05, + "loss": 0.5974, + "step": 6040 + }, + { + "epoch": 0.8783391405342624, + "grad_norm": 1.2313289642333984, + "learning_rate": 2.9811443049576793e-05, + "loss": 0.5609, + "step": 6050 + }, + { + "epoch": 0.8797909407665505, + "grad_norm": 2.6386501789093018, + "learning_rate": 2.9755402973356045e-05, + "loss": 0.9846, + "step": 6060 + }, + { + "epoch": 0.8812427409988386, + "grad_norm": 1.1028252840042114, + "learning_rate": 2.969933809423045e-05, + "loss": 0.5933, + "step": 6070 + }, + { + "epoch": 0.8826945412311266, + "grad_norm": 1.0655920505523682, + "learning_rate": 2.964324870461935e-05, + "loss": 0.8486, + "step": 6080 + }, + { + "epoch": 0.8841463414634146, + "grad_norm": 2.2200887203216553, + "learning_rate": 2.9587135097069934e-05, + "loss": 0.3357, + "step": 6090 + }, + { + "epoch": 0.8855981416957027, + "grad_norm": 8.945457458496094, + "learning_rate": 2.9530997564255725e-05, + 
"loss": 0.7661, + "step": 6100 + }, + { + "epoch": 0.8870499419279907, + "grad_norm": 0.8916497230529785, + "learning_rate": 2.9474836398975005e-05, + "loss": 0.3096, + "step": 6110 + }, + { + "epoch": 0.8885017421602788, + "grad_norm": 1.2500933408737183, + "learning_rate": 2.9418651894149334e-05, + "loss": 0.7636, + "step": 6120 + }, + { + "epoch": 0.8899535423925667, + "grad_norm": 1.3231313228607178, + "learning_rate": 2.9362444342822015e-05, + "loss": 0.8473, + "step": 6130 + }, + { + "epoch": 0.8914053426248548, + "grad_norm": 1.0085506439208984, + "learning_rate": 2.9306214038156516e-05, + "loss": 0.6876, + "step": 6140 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 0.7650404572486877, + "learning_rate": 2.924996127343502e-05, + "loss": 0.4889, + "step": 6150 + }, + { + "epoch": 0.8943089430894309, + "grad_norm": 0.7335465550422668, + "learning_rate": 2.9193686342056847e-05, + "loss": 0.6647, + "step": 6160 + }, + { + "epoch": 0.895760743321719, + "grad_norm": 0.5137434005737305, + "learning_rate": 2.9137389537536913e-05, + "loss": 0.6737, + "step": 6170 + }, + { + "epoch": 0.8972125435540069, + "grad_norm": 0.9400390386581421, + "learning_rate": 2.9081071153504236e-05, + "loss": 0.6747, + "step": 6180 + }, + { + "epoch": 0.898664343786295, + "grad_norm": 0.660967230796814, + "learning_rate": 2.9024731483700396e-05, + "loss": 0.4432, + "step": 6190 + }, + { + "epoch": 0.9001161440185831, + "grad_norm": 2.423039197921753, + "learning_rate": 2.8968370821977963e-05, + "loss": 0.6982, + "step": 6200 + }, + { + "epoch": 0.9015679442508711, + "grad_norm": 3.0828261375427246, + "learning_rate": 2.8911989462299016e-05, + "loss": 0.5868, + "step": 6210 + }, + { + "epoch": 0.9030197444831591, + "grad_norm": 2.1633851528167725, + "learning_rate": 2.8855587698733595e-05, + "loss": 0.5404, + "step": 6220 + }, + { + "epoch": 0.9044715447154471, + "grad_norm": 5.27179479598999, + "learning_rate": 2.8799165825458145e-05, + "loss": 0.7313, + "step": 6230 + }, + { + "epoch": 0.9059233449477352, + "grad_norm": 0.805304229259491, + "learning_rate": 2.8742724136754005e-05, + "loss": 0.5804, + "step": 6240 + }, + { + "epoch": 0.9073751451800233, + "grad_norm": 2.6164822578430176, + "learning_rate": 2.868626292700588e-05, + "loss": 0.6612, + "step": 6250 + }, + { + "epoch": 0.9088269454123112, + "grad_norm": 1.3976331949234009, + "learning_rate": 2.8629782490700253e-05, + "loss": 0.5746, + "step": 6260 + }, + { + "epoch": 0.9102787456445993, + "grad_norm": 1.42573881149292, + "learning_rate": 2.857328312242392e-05, + "loss": 0.576, + "step": 6270 + }, + { + "epoch": 0.9117305458768873, + "grad_norm": 2.0388023853302, + "learning_rate": 2.851676511686243e-05, + "loss": 0.7672, + "step": 6280 + }, + { + "epoch": 0.9131823461091754, + "grad_norm": 1.3161983489990234, + "learning_rate": 2.8460228768798506e-05, + "loss": 0.6011, + "step": 6290 + }, + { + "epoch": 0.9146341463414634, + "grad_norm": 1.2606275081634521, + "learning_rate": 2.8403674373110562e-05, + "loss": 0.6017, + "step": 6300 + }, + { + "epoch": 0.9160859465737514, + "grad_norm": 2.2314658164978027, + "learning_rate": 2.8347102224771144e-05, + "loss": 0.6201, + "step": 6310 + }, + { + "epoch": 0.9175377468060395, + "grad_norm": 1.990546703338623, + "learning_rate": 2.8290512618845367e-05, + "loss": 0.6775, + "step": 6320 + }, + { + "epoch": 0.9189895470383276, + "grad_norm": 1.7261875867843628, + "learning_rate": 2.823390585048943e-05, + "loss": 0.6419, + "step": 6330 + }, + { + "epoch": 0.9204413472706156, + "grad_norm": 
2.2154932022094727, + "learning_rate": 2.8177282214949047e-05, + "loss": 0.8979, + "step": 6340 + }, + { + "epoch": 0.9218931475029036, + "grad_norm": 6.259598731994629, + "learning_rate": 2.8120642007557873e-05, + "loss": 0.767, + "step": 6350 + }, + { + "epoch": 0.9233449477351916, + "grad_norm": 1.4923880100250244, + "learning_rate": 2.806398552373603e-05, + "loss": 0.7091, + "step": 6360 + }, + { + "epoch": 0.9247967479674797, + "grad_norm": 0.6974102258682251, + "learning_rate": 2.8007313058988527e-05, + "loss": 0.6863, + "step": 6370 + }, + { + "epoch": 0.9262485481997678, + "grad_norm": 1.9238085746765137, + "learning_rate": 2.7950624908903705e-05, + "loss": 0.555, + "step": 6380 + }, + { + "epoch": 0.9277003484320557, + "grad_norm": 0.496724933385849, + "learning_rate": 2.789392136915175e-05, + "loss": 0.9554, + "step": 6390 + }, + { + "epoch": 0.9291521486643438, + "grad_norm": 1.349373459815979, + "learning_rate": 2.7837202735483093e-05, + "loss": 0.8156, + "step": 6400 + }, + { + "epoch": 0.9306039488966318, + "grad_norm": 1.377130150794983, + "learning_rate": 2.778046930372689e-05, + "loss": 0.7222, + "step": 6410 + }, + { + "epoch": 0.9320557491289199, + "grad_norm": 1.0762406587600708, + "learning_rate": 2.7723721369789486e-05, + "loss": 0.6956, + "step": 6420 + }, + { + "epoch": 0.9335075493612079, + "grad_norm": 1.7975473403930664, + "learning_rate": 2.7666959229652867e-05, + "loss": 0.7824, + "step": 6430 + }, + { + "epoch": 0.9349593495934959, + "grad_norm": 1.836282730102539, + "learning_rate": 2.761018317937311e-05, + "loss": 0.6559, + "step": 6440 + }, + { + "epoch": 0.936411149825784, + "grad_norm": 1.9735631942749023, + "learning_rate": 2.7553393515078852e-05, + "loss": 0.578, + "step": 6450 + }, + { + "epoch": 0.937862950058072, + "grad_norm": 1.7507141828536987, + "learning_rate": 2.749659053296973e-05, + "loss": 0.897, + "step": 6460 + }, + { + "epoch": 0.93931475029036, + "grad_norm": 1.1130051612854004, + "learning_rate": 2.743977452931484e-05, + "loss": 0.5654, + "step": 6470 + }, + { + "epoch": 0.9407665505226481, + "grad_norm": 0.851780354976654, + "learning_rate": 2.738294580045119e-05, + "loss": 0.5722, + "step": 6480 + }, + { + "epoch": 0.9422183507549361, + "grad_norm": 0.6273514628410339, + "learning_rate": 2.732610464278219e-05, + "loss": 0.6938, + "step": 6490 + }, + { + "epoch": 0.9436701509872242, + "grad_norm": 1.4148989915847778, + "learning_rate": 2.7269251352776042e-05, + "loss": 0.5636, + "step": 6500 + }, + { + "epoch": 0.9451219512195121, + "grad_norm": 0.9783958792686462, + "learning_rate": 2.7212386226964242e-05, + "loss": 0.5425, + "step": 6510 + }, + { + "epoch": 0.9465737514518002, + "grad_norm": 0.860564649105072, + "learning_rate": 2.7155509561940017e-05, + "loss": 0.6981, + "step": 6520 + }, + { + "epoch": 0.9480255516840883, + "grad_norm": 1.0383031368255615, + "learning_rate": 2.7098621654356766e-05, + "loss": 0.7683, + "step": 6530 + }, + { + "epoch": 0.9494773519163763, + "grad_norm": 0.6206135153770447, + "learning_rate": 2.704172280092655e-05, + "loss": 0.5571, + "step": 6540 + }, + { + "epoch": 0.9509291521486644, + "grad_norm": 1.0526723861694336, + "learning_rate": 2.698481329841851e-05, + "loss": 0.9023, + "step": 6550 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.7944720983505249, + "learning_rate": 2.6927893443657316e-05, + "loss": 0.5719, + "step": 6560 + }, + { + "epoch": 0.9538327526132404, + "grad_norm": 0.16957837343215942, + "learning_rate": 2.6870963533521655e-05, + "loss": 0.641, + "step": 6570 + }, + { + 
"epoch": 0.9552845528455285, + "grad_norm": 0.8909958004951477, + "learning_rate": 2.681402386494264e-05, + "loss": 0.5357, + "step": 6580 + }, + { + "epoch": 0.9567363530778165, + "grad_norm": 0.8061552047729492, + "learning_rate": 2.6757074734902303e-05, + "loss": 0.8705, + "step": 6590 + }, + { + "epoch": 0.9581881533101045, + "grad_norm": 0.7766616940498352, + "learning_rate": 2.6700116440432005e-05, + "loss": 0.6641, + "step": 6600 + }, + { + "epoch": 0.9596399535423926, + "grad_norm": 4.805869102478027, + "learning_rate": 2.6643149278610925e-05, + "loss": 0.4838, + "step": 6610 + }, + { + "epoch": 0.9610917537746806, + "grad_norm": 1.1826244592666626, + "learning_rate": 2.6586173546564465e-05, + "loss": 0.8335, + "step": 6620 + }, + { + "epoch": 0.9625435540069687, + "grad_norm": 4.609352111816406, + "learning_rate": 2.6529189541462745e-05, + "loss": 0.5172, + "step": 6630 + }, + { + "epoch": 0.9639953542392566, + "grad_norm": 1.5737910270690918, + "learning_rate": 2.647219756051904e-05, + "loss": 0.4788, + "step": 6640 + }, + { + "epoch": 0.9654471544715447, + "grad_norm": 4.146353244781494, + "learning_rate": 2.6415197900988213e-05, + "loss": 0.7194, + "step": 6650 + }, + { + "epoch": 0.9668989547038328, + "grad_norm": 0.5611397624015808, + "learning_rate": 2.6358190860165187e-05, + "loss": 0.489, + "step": 6660 + }, + { + "epoch": 0.9683507549361208, + "grad_norm": 2.0827231407165527, + "learning_rate": 2.6301176735383382e-05, + "loss": 0.5859, + "step": 6670 + }, + { + "epoch": 0.9698025551684089, + "grad_norm": 2.0396342277526855, + "learning_rate": 2.624415582401314e-05, + "loss": 0.7885, + "step": 6680 + }, + { + "epoch": 0.9712543554006968, + "grad_norm": 2.5447700023651123, + "learning_rate": 2.6187128423460233e-05, + "loss": 0.722, + "step": 6690 + }, + { + "epoch": 0.9727061556329849, + "grad_norm": 4.586677551269531, + "learning_rate": 2.6130094831164282e-05, + "loss": 0.5383, + "step": 6700 + }, + { + "epoch": 0.974157955865273, + "grad_norm": 2.4895076751708984, + "learning_rate": 2.607305534459717e-05, + "loss": 0.6993, + "step": 6710 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.8436042666435242, + "learning_rate": 2.6016010261261546e-05, + "loss": 0.6571, + "step": 6720 + }, + { + "epoch": 0.977061556329849, + "grad_norm": 0.6883308291435242, + "learning_rate": 2.5958959878689253e-05, + "loss": 0.5514, + "step": 6730 + }, + { + "epoch": 0.978513356562137, + "grad_norm": 2.935514211654663, + "learning_rate": 2.590190449443975e-05, + "loss": 0.6725, + "step": 6740 + }, + { + "epoch": 0.9799651567944251, + "grad_norm": 2.491732597351074, + "learning_rate": 2.584484440609861e-05, + "loss": 0.6864, + "step": 6750 + }, + { + "epoch": 0.9814169570267132, + "grad_norm": 2.6545393466949463, + "learning_rate": 2.5787779911275937e-05, + "loss": 0.6371, + "step": 6760 + }, + { + "epoch": 0.9828687572590011, + "grad_norm": 0.4963870942592621, + "learning_rate": 2.57307113076048e-05, + "loss": 0.6246, + "step": 6770 + }, + { + "epoch": 0.9843205574912892, + "grad_norm": 0.5385538339614868, + "learning_rate": 2.567363889273971e-05, + "loss": 0.8436, + "step": 6780 + }, + { + "epoch": 0.9857723577235772, + "grad_norm": 7.346200466156006, + "learning_rate": 2.561656296435506e-05, + "loss": 0.65, + "step": 6790 + }, + { + "epoch": 0.9872241579558653, + "grad_norm": 1.658962368965149, + "learning_rate": 2.555948382014357e-05, + "loss": 0.6879, + "step": 6800 + }, + { + "epoch": 0.9886759581881533, + "grad_norm": 1.5802571773529053, + "learning_rate": 2.5502401757814714e-05, + 
"loss": 0.7704, + "step": 6810 + }, + { + "epoch": 0.9901277584204413, + "grad_norm": 2.8722903728485107, + "learning_rate": 2.5445317075093223e-05, + "loss": 0.4583, + "step": 6820 + }, + { + "epoch": 0.9915795586527294, + "grad_norm": 0.8566171526908875, + "learning_rate": 2.5388230069717446e-05, + "loss": 0.8975, + "step": 6830 + }, + { + "epoch": 0.9930313588850174, + "grad_norm": 0.7352342009544373, + "learning_rate": 2.5331141039437882e-05, + "loss": 0.7039, + "step": 6840 + }, + { + "epoch": 0.9944831591173054, + "grad_norm": 1.2166061401367188, + "learning_rate": 2.5274050282015587e-05, + "loss": 0.6728, + "step": 6850 + }, + { + "epoch": 0.9959349593495935, + "grad_norm": 1.2508012056350708, + "learning_rate": 2.521695809522061e-05, + "loss": 0.7019, + "step": 6860 + }, + { + "epoch": 0.9973867595818815, + "grad_norm": 1.4797356128692627, + "learning_rate": 2.515986477683048e-05, + "loss": 0.5035, + "step": 6870 + }, + { + "epoch": 0.9988385598141696, + "grad_norm": 0.6893177628517151, + "learning_rate": 2.510277062462861e-05, + "loss": 0.6175, + "step": 6880 + }, + { + "epoch": 1.0002903600464577, + "grad_norm": 3.4936206340789795, + "learning_rate": 2.504567593640275e-05, + "loss": 0.7674, + "step": 6890 + }, + { + "epoch": 1.0017421602787457, + "grad_norm": 1.3632289171218872, + "learning_rate": 2.4988581009943477e-05, + "loss": 0.3736, + "step": 6900 + }, + { + "epoch": 1.0031939605110336, + "grad_norm": 0.46680641174316406, + "learning_rate": 2.4931486143042586e-05, + "loss": 0.4425, + "step": 6910 + }, + { + "epoch": 1.0046457607433217, + "grad_norm": 0.8818445801734924, + "learning_rate": 2.4874391633491576e-05, + "loss": 0.6905, + "step": 6920 + }, + { + "epoch": 1.0060975609756098, + "grad_norm": 0.5474444031715393, + "learning_rate": 2.4817297779080073e-05, + "loss": 0.7923, + "step": 6930 + }, + { + "epoch": 1.0075493612078978, + "grad_norm": 0.8076862096786499, + "learning_rate": 2.4760204877594297e-05, + "loss": 0.6344, + "step": 6940 + }, + { + "epoch": 1.009001161440186, + "grad_norm": 1.0539401769638062, + "learning_rate": 2.4703113226815474e-05, + "loss": 0.7762, + "step": 6950 + }, + { + "epoch": 1.0104529616724738, + "grad_norm": 1.6129015684127808, + "learning_rate": 2.4646023124518336e-05, + "loss": 0.5475, + "step": 6960 + }, + { + "epoch": 1.0119047619047619, + "grad_norm": 3.270751476287842, + "learning_rate": 2.4588934868469522e-05, + "loss": 0.7106, + "step": 6970 + }, + { + "epoch": 1.01335656213705, + "grad_norm": 1.1488885879516602, + "learning_rate": 2.4531848756426032e-05, + "loss": 0.6126, + "step": 6980 + }, + { + "epoch": 1.014808362369338, + "grad_norm": 0.7339674234390259, + "learning_rate": 2.447476508613372e-05, + "loss": 0.5384, + "step": 6990 + }, + { + "epoch": 1.016260162601626, + "grad_norm": 0.2865770161151886, + "learning_rate": 2.4417684155325664e-05, + "loss": 0.6378, + "step": 7000 + }, + { + "epoch": 1.017711962833914, + "grad_norm": 1.421481728553772, + "learning_rate": 2.4360606261720673e-05, + "loss": 0.6757, + "step": 7010 + }, + { + "epoch": 1.019163763066202, + "grad_norm": 0.846333384513855, + "learning_rate": 2.430353170302172e-05, + "loss": 0.517, + "step": 7020 + }, + { + "epoch": 1.0206155632984901, + "grad_norm": 0.3524300158023834, + "learning_rate": 2.4246460776914363e-05, + "loss": 0.6129, + "step": 7030 + }, + { + "epoch": 1.0220673635307782, + "grad_norm": 0.7928240299224854, + "learning_rate": 2.4189393781065232e-05, + "loss": 0.4327, + "step": 7040 + }, + { + "epoch": 1.0235191637630663, + "grad_norm": 
0.9376094341278076, + "learning_rate": 2.4132331013120453e-05, + "loss": 0.6137, + "step": 7050 + }, + { + "epoch": 1.0249709639953541, + "grad_norm": 1.046407699584961, + "learning_rate": 2.4075272770704104e-05, + "loss": 0.6877, + "step": 7060 + }, + { + "epoch": 1.0264227642276422, + "grad_norm": 2.0462183952331543, + "learning_rate": 2.4018219351416645e-05, + "loss": 0.4539, + "step": 7070 + }, + { + "epoch": 1.0278745644599303, + "grad_norm": 0.4574951231479645, + "learning_rate": 2.3961171052833386e-05, + "loss": 0.9033, + "step": 7080 + }, + { + "epoch": 1.0293263646922184, + "grad_norm": 3.518298864364624, + "learning_rate": 2.3904128172502946e-05, + "loss": 0.5817, + "step": 7090 + }, + { + "epoch": 1.0307781649245065, + "grad_norm": 0.598048985004425, + "learning_rate": 2.3847091007945667e-05, + "loss": 0.4244, + "step": 7100 + }, + { + "epoch": 1.0322299651567943, + "grad_norm": 1.5225111246109009, + "learning_rate": 2.3790059856652083e-05, + "loss": 0.9356, + "step": 7110 + }, + { + "epoch": 1.0336817653890824, + "grad_norm": 0.9001873135566711, + "learning_rate": 2.3733035016081355e-05, + "loss": 0.4678, + "step": 7120 + }, + { + "epoch": 1.0351335656213705, + "grad_norm": 2.5215003490448, + "learning_rate": 2.367601678365974e-05, + "loss": 0.5787, + "step": 7130 + }, + { + "epoch": 1.0365853658536586, + "grad_norm": 0.9304032325744629, + "learning_rate": 2.361900545677903e-05, + "loss": 0.3138, + "step": 7140 + }, + { + "epoch": 1.0380371660859466, + "grad_norm": 0.9305661916732788, + "learning_rate": 2.3562001332795e-05, + "loss": 0.5626, + "step": 7150 + }, + { + "epoch": 1.0394889663182345, + "grad_norm": 1.5378453731536865, + "learning_rate": 2.3505004709025842e-05, + "loss": 0.7586, + "step": 7160 + }, + { + "epoch": 1.0409407665505226, + "grad_norm": 0.8000249266624451, + "learning_rate": 2.3448015882750647e-05, + "loss": 0.4352, + "step": 7170 + }, + { + "epoch": 1.0423925667828107, + "grad_norm": 0.8322232365608215, + "learning_rate": 2.339103515120783e-05, + "loss": 0.7357, + "step": 7180 + }, + { + "epoch": 1.0438443670150988, + "grad_norm": 0.9948438405990601, + "learning_rate": 2.3334062811593556e-05, + "loss": 0.657, + "step": 7190 + }, + { + "epoch": 1.0452961672473868, + "grad_norm": 1.071321725845337, + "learning_rate": 2.3277099161060298e-05, + "loss": 0.5158, + "step": 7200 + }, + { + "epoch": 1.0467479674796747, + "grad_norm": 0.7249424457550049, + "learning_rate": 2.3220144496715125e-05, + "loss": 0.606, + "step": 7210 + }, + { + "epoch": 1.0481997677119628, + "grad_norm": 1.2231613397598267, + "learning_rate": 2.3163199115618282e-05, + "loss": 0.4094, + "step": 7220 + }, + { + "epoch": 1.0496515679442509, + "grad_norm": 1.2972086668014526, + "learning_rate": 2.310626331478159e-05, + "loss": 0.4112, + "step": 7230 + }, + { + "epoch": 1.051103368176539, + "grad_norm": 1.0579259395599365, + "learning_rate": 2.304933739116688e-05, + "loss": 0.6859, + "step": 7240 + }, + { + "epoch": 1.052555168408827, + "grad_norm": 1.3413074016571045, + "learning_rate": 2.2992421641684494e-05, + "loss": 0.4698, + "step": 7250 + }, + { + "epoch": 1.054006968641115, + "grad_norm": 1.203018069267273, + "learning_rate": 2.2935516363191693e-05, + "loss": 0.4366, + "step": 7260 + }, + { + "epoch": 1.055458768873403, + "grad_norm": 1.540850281715393, + "learning_rate": 2.2878621852491135e-05, + "loss": 0.5985, + "step": 7270 + }, + { + "epoch": 1.056910569105691, + "grad_norm": 0.8544327616691589, + "learning_rate": 2.28217384063293e-05, + "loss": 0.6348, + "step": 7280 + }, + { + 
"epoch": 1.0583623693379791, + "grad_norm": 0.9405458569526672, + "learning_rate": 2.2764866321394963e-05, + "loss": 0.5561, + "step": 7290 + }, + { + "epoch": 1.0598141695702672, + "grad_norm": 0.6483383178710938, + "learning_rate": 2.2708005894317657e-05, + "loss": 0.6295, + "step": 7300 + }, + { + "epoch": 1.0612659698025553, + "grad_norm": 1.3376249074935913, + "learning_rate": 2.2651157421666096e-05, + "loss": 0.6177, + "step": 7310 + }, + { + "epoch": 1.0627177700348431, + "grad_norm": 2.9725067615509033, + "learning_rate": 2.2594321199946656e-05, + "loss": 0.4115, + "step": 7320 + }, + { + "epoch": 1.0641695702671312, + "grad_norm": 1.1227383613586426, + "learning_rate": 2.253749752560179e-05, + "loss": 0.7575, + "step": 7330 + }, + { + "epoch": 1.0656213704994193, + "grad_norm": 7.148159027099609, + "learning_rate": 2.248068669500853e-05, + "loss": 0.6736, + "step": 7340 + }, + { + "epoch": 1.0670731707317074, + "grad_norm": 1.4029227495193481, + "learning_rate": 2.2423889004476915e-05, + "loss": 0.5547, + "step": 7350 + }, + { + "epoch": 1.0685249709639955, + "grad_norm": 0.13588035106658936, + "learning_rate": 2.2367104750248444e-05, + "loss": 0.5272, + "step": 7360 + }, + { + "epoch": 1.0699767711962833, + "grad_norm": 1.2609344720840454, + "learning_rate": 2.2310334228494536e-05, + "loss": 0.6262, + "step": 7370 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 1.735031008720398, + "learning_rate": 2.2253577735314987e-05, + "loss": 0.4278, + "step": 7380 + }, + { + "epoch": 1.0728803716608595, + "grad_norm": 5.371007919311523, + "learning_rate": 2.219683556673642e-05, + "loss": 0.6081, + "step": 7390 + }, + { + "epoch": 1.0743321718931476, + "grad_norm": 2.175072431564331, + "learning_rate": 2.2140108018710758e-05, + "loss": 0.7055, + "step": 7400 + }, + { + "epoch": 1.0757839721254356, + "grad_norm": 0.49367207288742065, + "learning_rate": 2.208339538711366e-05, + "loss": 0.3842, + "step": 7410 + }, + { + "epoch": 1.0772357723577235, + "grad_norm": 1.9475051164627075, + "learning_rate": 2.2026697967742968e-05, + "loss": 0.4956, + "step": 7420 + }, + { + "epoch": 1.0786875725900116, + "grad_norm": 1.74053955078125, + "learning_rate": 2.1970016056317203e-05, + "loss": 0.6627, + "step": 7430 + }, + { + "epoch": 1.0801393728222997, + "grad_norm": 1.1123576164245605, + "learning_rate": 2.1913349948473996e-05, + "loss": 0.4789, + "step": 7440 + }, + { + "epoch": 1.0815911730545877, + "grad_norm": 1.6125507354736328, + "learning_rate": 2.1856699939768545e-05, + "loss": 0.4892, + "step": 7450 + }, + { + "epoch": 1.0830429732868758, + "grad_norm": 1.4963864088058472, + "learning_rate": 2.1800066325672074e-05, + "loss": 0.4966, + "step": 7460 + }, + { + "epoch": 1.0844947735191637, + "grad_norm": 1.2943956851959229, + "learning_rate": 2.1743449401570324e-05, + "loss": 0.7522, + "step": 7470 + }, + { + "epoch": 1.0859465737514518, + "grad_norm": 0.6681497097015381, + "learning_rate": 2.1686849462761947e-05, + "loss": 0.5014, + "step": 7480 + }, + { + "epoch": 1.0873983739837398, + "grad_norm": 1.1527822017669678, + "learning_rate": 2.1630266804457035e-05, + "loss": 0.4268, + "step": 7490 + }, + { + "epoch": 1.088850174216028, + "grad_norm": 1.0493078231811523, + "learning_rate": 2.157370172177553e-05, + "loss": 0.6676, + "step": 7500 + }, + { + "epoch": 1.090301974448316, + "grad_norm": 0.7843257784843445, + "learning_rate": 2.1517154509745724e-05, + "loss": 0.4035, + "step": 7510 + }, + { + "epoch": 1.0917537746806039, + "grad_norm": 1.5716508626937866, + "learning_rate": 
2.1460625463302686e-05, + "loss": 0.4774, + "step": 7520 + }, + { + "epoch": 1.093205574912892, + "grad_norm": 0.881391704082489, + "learning_rate": 2.1404114877286747e-05, + "loss": 0.6217, + "step": 7530 + }, + { + "epoch": 1.09465737514518, + "grad_norm": 0.4978386461734772, + "learning_rate": 2.134762304644193e-05, + "loss": 0.7448, + "step": 7540 + }, + { + "epoch": 1.096109175377468, + "grad_norm": 1.047534465789795, + "learning_rate": 2.129115026541447e-05, + "loss": 0.7455, + "step": 7550 + }, + { + "epoch": 1.0975609756097562, + "grad_norm": 3.125924825668335, + "learning_rate": 2.1234696828751226e-05, + "loss": 0.3793, + "step": 7560 + }, + { + "epoch": 1.099012775842044, + "grad_norm": 4.937119960784912, + "learning_rate": 2.1178263030898155e-05, + "loss": 0.6671, + "step": 7570 + }, + { + "epoch": 1.1004645760743321, + "grad_norm": 0.9988604187965393, + "learning_rate": 2.1121849166198793e-05, + "loss": 0.6868, + "step": 7580 + }, + { + "epoch": 1.1019163763066202, + "grad_norm": 1.7846256494522095, + "learning_rate": 2.106545552889272e-05, + "loss": 0.7165, + "step": 7590 + }, + { + "epoch": 1.1033681765389083, + "grad_norm": 1.7793424129486084, + "learning_rate": 2.1009082413113973e-05, + "loss": 0.6098, + "step": 7600 + }, + { + "epoch": 1.1048199767711964, + "grad_norm": 0.6615446209907532, + "learning_rate": 2.095273011288963e-05, + "loss": 0.5701, + "step": 7610 + }, + { + "epoch": 1.1062717770034842, + "grad_norm": 1.3341655731201172, + "learning_rate": 2.0896398922138122e-05, + "loss": 0.676, + "step": 7620 + }, + { + "epoch": 1.1077235772357723, + "grad_norm": 1.0205527544021606, + "learning_rate": 2.0840089134667824e-05, + "loss": 0.5475, + "step": 7630 + }, + { + "epoch": 1.1091753774680604, + "grad_norm": 1.5262418985366821, + "learning_rate": 2.0783801044175467e-05, + "loss": 0.582, + "step": 7640 + }, + { + "epoch": 1.1106271777003485, + "grad_norm": 2.5063817501068115, + "learning_rate": 2.0727534944244615e-05, + "loss": 0.7552, + "step": 7650 + }, + { + "epoch": 1.1120789779326365, + "grad_norm": 3.6351969242095947, + "learning_rate": 2.067129112834413e-05, + "loss": 0.6419, + "step": 7660 + }, + { + "epoch": 1.1135307781649244, + "grad_norm": 0.8957704305648804, + "learning_rate": 2.061506988982665e-05, + "loss": 0.4333, + "step": 7670 + }, + { + "epoch": 1.1149825783972125, + "grad_norm": 1.9803669452667236, + "learning_rate": 2.0558871521927073e-05, + "loss": 0.4656, + "step": 7680 + }, + { + "epoch": 1.1164343786295006, + "grad_norm": 0.8719884157180786, + "learning_rate": 2.0502696317760973e-05, + "loss": 0.4252, + "step": 7690 + }, + { + "epoch": 1.1178861788617886, + "grad_norm": 1.6916320323944092, + "learning_rate": 2.044654457032314e-05, + "loss": 0.7204, + "step": 7700 + }, + { + "epoch": 1.1193379790940767, + "grad_norm": 1.6074903011322021, + "learning_rate": 2.0390416572486e-05, + "loss": 0.4984, + "step": 7710 + }, + { + "epoch": 1.1207897793263646, + "grad_norm": 0.2988170087337494, + "learning_rate": 2.033431261699813e-05, + "loss": 0.4557, + "step": 7720 + }, + { + "epoch": 1.1222415795586527, + "grad_norm": 15.167128562927246, + "learning_rate": 2.0278232996482688e-05, + "loss": 0.551, + "step": 7730 + }, + { + "epoch": 1.1236933797909407, + "grad_norm": 0.8808531761169434, + "learning_rate": 2.0222178003435926e-05, + "loss": 0.434, + "step": 7740 + }, + { + "epoch": 1.1251451800232288, + "grad_norm": 0.7921860814094543, + "learning_rate": 2.0166147930225615e-05, + "loss": 0.4803, + "step": 7750 + }, + { + "epoch": 1.126596980255517, + 
"grad_norm": 1.9591280221939087, + "learning_rate": 2.011014306908958e-05, + "loss": 0.786, + "step": 7760 + }, + { + "epoch": 1.1280487804878048, + "grad_norm": 1.479054570198059, + "learning_rate": 2.0054163712134145e-05, + "loss": 0.655, + "step": 7770 + }, + { + "epoch": 1.1295005807200929, + "grad_norm": 3.091681480407715, + "learning_rate": 1.9998210151332585e-05, + "loss": 0.7444, + "step": 7780 + }, + { + "epoch": 1.130952380952381, + "grad_norm": 2.064387321472168, + "learning_rate": 1.994228267852366e-05, + "loss": 0.4337, + "step": 7790 + }, + { + "epoch": 1.132404181184669, + "grad_norm": 1.0761544704437256, + "learning_rate": 1.9886381585410045e-05, + "loss": 0.5395, + "step": 7800 + }, + { + "epoch": 1.133855981416957, + "grad_norm": 1.1305792331695557, + "learning_rate": 1.9830507163556816e-05, + "loss": 0.6013, + "step": 7810 + }, + { + "epoch": 1.135307781649245, + "grad_norm": 3.304077386856079, + "learning_rate": 1.977465970438998e-05, + "loss": 0.8103, + "step": 7820 + }, + { + "epoch": 1.136759581881533, + "grad_norm": 0.8400141596794128, + "learning_rate": 1.9718839499194868e-05, + "loss": 0.5292, + "step": 7830 + }, + { + "epoch": 1.1382113821138211, + "grad_norm": 5.679340839385986, + "learning_rate": 1.9663046839114684e-05, + "loss": 0.5317, + "step": 7840 + }, + { + "epoch": 1.1396631823461092, + "grad_norm": 2.914165496826172, + "learning_rate": 1.960728201514896e-05, + "loss": 0.7501, + "step": 7850 + }, + { + "epoch": 1.1411149825783973, + "grad_norm": 3.093472957611084, + "learning_rate": 1.9551545318152047e-05, + "loss": 0.5741, + "step": 7860 + }, + { + "epoch": 1.1425667828106851, + "grad_norm": 1.7415759563446045, + "learning_rate": 1.949583703883158e-05, + "loss": 0.5044, + "step": 7870 + }, + { + "epoch": 1.1440185830429732, + "grad_norm": 4.8877668380737305, + "learning_rate": 1.9440157467746985e-05, + "loss": 0.786, + "step": 7880 + }, + { + "epoch": 1.1454703832752613, + "grad_norm": 1.9730969667434692, + "learning_rate": 1.9384506895307964e-05, + "loss": 0.7195, + "step": 7890 + }, + { + "epoch": 1.1469221835075494, + "grad_norm": 12.92557430267334, + "learning_rate": 1.932888561177294e-05, + "loss": 0.5679, + "step": 7900 + }, + { + "epoch": 1.1483739837398375, + "grad_norm": 2.283071517944336, + "learning_rate": 1.92732939072476e-05, + "loss": 0.5129, + "step": 7910 + }, + { + "epoch": 1.1498257839721253, + "grad_norm": 0.8420314788818359, + "learning_rate": 1.9217732071683343e-05, + "loss": 0.6232, + "step": 7920 + }, + { + "epoch": 1.1512775842044134, + "grad_norm": 1.523573637008667, + "learning_rate": 1.9162200394875783e-05, + "loss": 0.6329, + "step": 7930 + }, + { + "epoch": 1.1527293844367015, + "grad_norm": 3.2268831729888916, + "learning_rate": 1.9106699166463247e-05, + "loss": 0.5248, + "step": 7940 + }, + { + "epoch": 1.1541811846689896, + "grad_norm": 2.4383325576782227, + "learning_rate": 1.905122867592522e-05, + "loss": 0.725, + "step": 7950 + }, + { + "epoch": 1.1556329849012776, + "grad_norm": 7.215484142303467, + "learning_rate": 1.8995789212580884e-05, + "loss": 0.4331, + "step": 7960 + }, + { + "epoch": 1.1570847851335655, + "grad_norm": 1.5999699831008911, + "learning_rate": 1.89403810655876e-05, + "loss": 0.421, + "step": 7970 + }, + { + "epoch": 1.1585365853658536, + "grad_norm": 0.6313633918762207, + "learning_rate": 1.8885004523939386e-05, + "loss": 0.3322, + "step": 7980 + }, + { + "epoch": 1.1599883855981417, + "grad_norm": 1.2481117248535156, + "learning_rate": 1.8829659876465406e-05, + "loss": 0.4594, + "step": 7990 + }, 
+ { + "epoch": 1.1614401858304297, + "grad_norm": 1.2827537059783936, + "learning_rate": 1.8774347411828472e-05, + "loss": 0.603, + "step": 8000 + }, + { + "epoch": 1.1628919860627178, + "grad_norm": 0.5014917254447937, + "learning_rate": 1.871906741852356e-05, + "loss": 0.3013, + "step": 8010 + }, + { + "epoch": 1.164343786295006, + "grad_norm": 1.5885872840881348, + "learning_rate": 1.8663820184876247e-05, + "loss": 0.5299, + "step": 8020 + }, + { + "epoch": 1.1657955865272938, + "grad_norm": 0.0035865483805537224, + "learning_rate": 1.8608605999041297e-05, + "loss": 0.5274, + "step": 8030 + }, + { + "epoch": 1.1672473867595818, + "grad_norm": 1.4468114376068115, + "learning_rate": 1.8553425149001057e-05, + "loss": 0.4781, + "step": 8040 + }, + { + "epoch": 1.16869918699187, + "grad_norm": 2.688275098800659, + "learning_rate": 1.8498277922564026e-05, + "loss": 0.4668, + "step": 8050 + }, + { + "epoch": 1.170150987224158, + "grad_norm": 1.6105045080184937, + "learning_rate": 1.8443164607363333e-05, + "loss": 0.6738, + "step": 8060 + }, + { + "epoch": 1.171602787456446, + "grad_norm": 1.46797513961792, + "learning_rate": 1.8388085490855217e-05, + "loss": 0.552, + "step": 8070 + }, + { + "epoch": 1.173054587688734, + "grad_norm": 1.1859605312347412, + "learning_rate": 1.833304086031757e-05, + "loss": 0.4247, + "step": 8080 + }, + { + "epoch": 1.174506387921022, + "grad_norm": 1.4056955575942993, + "learning_rate": 1.8278031002848394e-05, + "loss": 0.4875, + "step": 8090 + }, + { + "epoch": 1.17595818815331, + "grad_norm": 1.6861822605133057, + "learning_rate": 1.8223056205364342e-05, + "loss": 0.5837, + "step": 8100 + }, + { + "epoch": 1.1774099883855982, + "grad_norm": 1.9432148933410645, + "learning_rate": 1.8168116754599186e-05, + "loss": 0.6512, + "step": 8110 + }, + { + "epoch": 1.1788617886178863, + "grad_norm": 1.438887119293213, + "learning_rate": 1.811321293710235e-05, + "loss": 0.5249, + "step": 8120 + }, + { + "epoch": 1.1803135888501741, + "grad_norm": 4.159003734588623, + "learning_rate": 1.8058345039237395e-05, + "loss": 0.4055, + "step": 8130 + }, + { + "epoch": 1.1817653890824622, + "grad_norm": 1.9116485118865967, + "learning_rate": 1.8003513347180557e-05, + "loss": 0.6732, + "step": 8140 + }, + { + "epoch": 1.1832171893147503, + "grad_norm": 0.8615849614143372, + "learning_rate": 1.7948718146919212e-05, + "loss": 0.4732, + "step": 8150 + }, + { + "epoch": 1.1846689895470384, + "grad_norm": 1.812454342842102, + "learning_rate": 1.7893959724250402e-05, + "loss": 0.4385, + "step": 8160 + }, + { + "epoch": 1.1861207897793264, + "grad_norm": 1.0954737663269043, + "learning_rate": 1.7839238364779358e-05, + "loss": 0.4728, + "step": 8170 + }, + { + "epoch": 1.1875725900116145, + "grad_norm": 3.3820154666900635, + "learning_rate": 1.7784554353918002e-05, + "loss": 0.4665, + "step": 8180 + }, + { + "epoch": 1.1890243902439024, + "grad_norm": 1.3054349422454834, + "learning_rate": 1.772990797688344e-05, + "loss": 0.8027, + "step": 8190 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.9854569435119629, + "learning_rate": 1.7675299518696503e-05, + "loss": 0.6728, + "step": 8200 + }, + { + "epoch": 1.1919279907084785, + "grad_norm": 1.3417751789093018, + "learning_rate": 1.7620729264180244e-05, + "loss": 0.6094, + "step": 8210 + }, + { + "epoch": 1.1933797909407666, + "grad_norm": 1.8543522357940674, + "learning_rate": 1.756619749795846e-05, + "loss": 0.3593, + "step": 8220 + }, + { + "epoch": 1.1948315911730547, + "grad_norm": 4.067511081695557, + "learning_rate": 
1.751170450445418e-05, + "loss": 0.4437, + "step": 8230 + }, + { + "epoch": 1.1962833914053426, + "grad_norm": 0.7072954773902893, + "learning_rate": 1.7457250567888255e-05, + "loss": 0.4523, + "step": 8240 + }, + { + "epoch": 1.1977351916376306, + "grad_norm": 2.400019884109497, + "learning_rate": 1.7402835972277774e-05, + "loss": 0.7181, + "step": 8250 + }, + { + "epoch": 1.1991869918699187, + "grad_norm": 0.7188956141471863, + "learning_rate": 1.734846100143466e-05, + "loss": 0.6106, + "step": 8260 + }, + { + "epoch": 1.2006387921022068, + "grad_norm": 0.9549878835678101, + "learning_rate": 1.7294125938964163e-05, + "loss": 0.6636, + "step": 8270 + }, + { + "epoch": 1.202090592334495, + "grad_norm": 1.26228928565979, + "learning_rate": 1.7239831068263366e-05, + "loss": 0.3134, + "step": 8280 + }, + { + "epoch": 1.2035423925667827, + "grad_norm": 1.7492179870605469, + "learning_rate": 1.718557667251974e-05, + "loss": 0.7868, + "step": 8290 + }, + { + "epoch": 1.2049941927990708, + "grad_norm": 5.789414405822754, + "learning_rate": 1.7131363034709647e-05, + "loss": 0.3828, + "step": 8300 + }, + { + "epoch": 1.206445993031359, + "grad_norm": 5.111294746398926, + "learning_rate": 1.7077190437596864e-05, + "loss": 0.5902, + "step": 8310 + }, + { + "epoch": 1.207897793263647, + "grad_norm": 1.8779693841934204, + "learning_rate": 1.7023059163731097e-05, + "loss": 0.5968, + "step": 8320 + }, + { + "epoch": 1.209349593495935, + "grad_norm": 4.733475685119629, + "learning_rate": 1.696896949544654e-05, + "loss": 0.6245, + "step": 8330 + }, + { + "epoch": 1.210801393728223, + "grad_norm": 0.9428911805152893, + "learning_rate": 1.6914921714860378e-05, + "loss": 0.537, + "step": 8340 + }, + { + "epoch": 1.212253193960511, + "grad_norm": 0.8777297735214233, + "learning_rate": 1.686091610387133e-05, + "loss": 0.5012, + "step": 8350 + }, + { + "epoch": 1.213704994192799, + "grad_norm": 4.631138801574707, + "learning_rate": 1.680695294415815e-05, + "loss": 0.6156, + "step": 8360 + }, + { + "epoch": 1.2151567944250872, + "grad_norm": 0.6276788711547852, + "learning_rate": 1.6753032517178187e-05, + "loss": 0.6097, + "step": 8370 + }, + { + "epoch": 1.2166085946573753, + "grad_norm": 0.7549428939819336, + "learning_rate": 1.6699155104165904e-05, + "loss": 0.7467, + "step": 8380 + }, + { + "epoch": 1.2180603948896631, + "grad_norm": 0.9138199687004089, + "learning_rate": 1.6645320986131433e-05, + "loss": 0.5846, + "step": 8390 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 1.1513859033584595, + "learning_rate": 1.659153044385906e-05, + "loss": 0.4798, + "step": 8400 + }, + { + "epoch": 1.2209639953542393, + "grad_norm": 1.6771997213363647, + "learning_rate": 1.6537783757905816e-05, + "loss": 0.8278, + "step": 8410 + }, + { + "epoch": 1.2224157955865274, + "grad_norm": 1.2027699947357178, + "learning_rate": 1.648408120859998e-05, + "loss": 0.7619, + "step": 8420 + }, + { + "epoch": 1.2238675958188154, + "grad_norm": 2.257286310195923, + "learning_rate": 1.643042307603964e-05, + "loss": 0.7877, + "step": 8430 + }, + { + "epoch": 1.2253193960511033, + "grad_norm": 0.687853217124939, + "learning_rate": 1.6376809640091174e-05, + "loss": 0.6319, + "step": 8440 + }, + { + "epoch": 1.2267711962833914, + "grad_norm": 1.3753950595855713, + "learning_rate": 1.63232411803879e-05, + "loss": 0.5018, + "step": 8450 + }, + { + "epoch": 1.2282229965156795, + "grad_norm": 3.117898464202881, + "learning_rate": 1.6269717976328503e-05, + "loss": 0.6428, + "step": 8460 + }, + { + "epoch": 1.2296747967479675, + 
"grad_norm": 1.2253605127334595, + "learning_rate": 1.6216240307075642e-05, + "loss": 0.6265, + "step": 8470 + }, + { + "epoch": 1.2311265969802556, + "grad_norm": 1.9370412826538086, + "learning_rate": 1.6162808451554483e-05, + "loss": 0.6584, + "step": 8480 + }, + { + "epoch": 1.2325783972125435, + "grad_norm": 4.468973636627197, + "learning_rate": 1.6109422688451224e-05, + "loss": 0.6343, + "step": 8490 + }, + { + "epoch": 1.2340301974448316, + "grad_norm": 6.738311290740967, + "learning_rate": 1.605608329621168e-05, + "loss": 0.6665, + "step": 8500 + }, + { + "epoch": 1.2354819976771196, + "grad_norm": 1.266482949256897, + "learning_rate": 1.6002790553039803e-05, + "loss": 0.7137, + "step": 8510 + }, + { + "epoch": 1.2369337979094077, + "grad_norm": 0.7233752012252808, + "learning_rate": 1.594954473689621e-05, + "loss": 0.5351, + "step": 8520 + }, + { + "epoch": 1.2383855981416958, + "grad_norm": 3.379714012145996, + "learning_rate": 1.5896346125496793e-05, + "loss": 0.5488, + "step": 8530 + }, + { + "epoch": 1.2398373983739837, + "grad_norm": 2.4713003635406494, + "learning_rate": 1.5843194996311213e-05, + "loss": 0.7367, + "step": 8540 + }, + { + "epoch": 1.2412891986062717, + "grad_norm": 0.4656989574432373, + "learning_rate": 1.5790091626561494e-05, + "loss": 0.3323, + "step": 8550 + }, + { + "epoch": 1.2427409988385598, + "grad_norm": 1.3530571460723877, + "learning_rate": 1.5737036293220554e-05, + "loss": 0.5089, + "step": 8560 + }, + { + "epoch": 1.244192799070848, + "grad_norm": 1.5478246212005615, + "learning_rate": 1.568402927301076e-05, + "loss": 0.6737, + "step": 8570 + }, + { + "epoch": 1.245644599303136, + "grad_norm": 1.6007646322250366, + "learning_rate": 1.5631070842402494e-05, + "loss": 0.5032, + "step": 8580 + }, + { + "epoch": 1.2470963995354238, + "grad_norm": 1.9949185848236084, + "learning_rate": 1.5578161277612707e-05, + "loss": 0.746, + "step": 8590 + }, + { + "epoch": 1.248548199767712, + "grad_norm": 1.552194595336914, + "learning_rate": 1.5525300854603486e-05, + "loss": 0.4807, + "step": 8600 + }, + { + "epoch": 1.25, + "grad_norm": 6.406808376312256, + "learning_rate": 1.547248984908059e-05, + "loss": 0.5125, + "step": 8610 + }, + { + "epoch": 1.251451800232288, + "grad_norm": 0.6398019790649414, + "learning_rate": 1.5419728536492055e-05, + "loss": 0.3386, + "step": 8620 + }, + { + "epoch": 1.2529036004645762, + "grad_norm": 1.874664306640625, + "learning_rate": 1.5367017192026713e-05, + "loss": 0.5268, + "step": 8630 + }, + { + "epoch": 1.254355400696864, + "grad_norm": 2.999232053756714, + "learning_rate": 1.5314356090612776e-05, + "loss": 0.5744, + "step": 8640 + }, + { + "epoch": 1.255807200929152, + "grad_norm": 9.338212966918945, + "learning_rate": 1.5261745506916408e-05, + "loss": 0.6682, + "step": 8650 + }, + { + "epoch": 1.2572590011614402, + "grad_norm": 3.1387779712677, + "learning_rate": 1.5209185715340294e-05, + "loss": 0.4691, + "step": 8660 + }, + { + "epoch": 1.2587108013937283, + "grad_norm": 0.6614925861358643, + "learning_rate": 1.5156676990022184e-05, + "loss": 0.4255, + "step": 8670 + }, + { + "epoch": 1.2601626016260163, + "grad_norm": 0.9042619466781616, + "learning_rate": 1.5104219604833494e-05, + "loss": 0.3958, + "step": 8680 + }, + { + "epoch": 1.2616144018583042, + "grad_norm": 0.8313902020454407, + "learning_rate": 1.5051813833377859e-05, + "loss": 0.5207, + "step": 8690 + }, + { + "epoch": 1.2630662020905923, + "grad_norm": 1.0558016300201416, + "learning_rate": 1.4999459948989702e-05, + "loss": 0.3235, + "step": 8700 + }, + { 
+ "epoch": 1.2645180023228804, + "grad_norm": 1.1987258195877075, + "learning_rate": 1.4947158224732827e-05, + "loss": 0.4936, + "step": 8710 + }, + { + "epoch": 1.2659698025551684, + "grad_norm": 1.0946906805038452, + "learning_rate": 1.4894908933398989e-05, + "loss": 0.6256, + "step": 8720 + }, + { + "epoch": 1.2674216027874565, + "grad_norm": 1.2409650087356567, + "learning_rate": 1.4842712347506443e-05, + "loss": 0.793, + "step": 8730 + }, + { + "epoch": 1.2688734030197444, + "grad_norm": 0.7660655379295349, + "learning_rate": 1.4790568739298582e-05, + "loss": 0.5611, + "step": 8740 + }, + { + "epoch": 1.2703252032520325, + "grad_norm": 0.7420207262039185, + "learning_rate": 1.473847838074245e-05, + "loss": 0.6045, + "step": 8750 + }, + { + "epoch": 1.2717770034843205, + "grad_norm": 0.743302047252655, + "learning_rate": 1.4686441543527374e-05, + "loss": 0.7294, + "step": 8760 + }, + { + "epoch": 1.2732288037166086, + "grad_norm": 1.441884160041809, + "learning_rate": 1.4634458499063536e-05, + "loss": 0.6125, + "step": 8770 + }, + { + "epoch": 1.2746806039488967, + "grad_norm": 0.13829253613948822, + "learning_rate": 1.458252951848051e-05, + "loss": 0.4259, + "step": 8780 + }, + { + "epoch": 1.2761324041811846, + "grad_norm": 1.4842077493667603, + "learning_rate": 1.4530654872625935e-05, + "loss": 0.5568, + "step": 8790 + }, + { + "epoch": 1.2775842044134726, + "grad_norm": 1.0749858617782593, + "learning_rate": 1.4478834832064026e-05, + "loss": 0.5374, + "step": 8800 + }, + { + "epoch": 1.2790360046457607, + "grad_norm": 8.395951271057129, + "learning_rate": 1.4427069667074184e-05, + "loss": 0.4693, + "step": 8810 + }, + { + "epoch": 1.2804878048780488, + "grad_norm": 0.30601173639297485, + "learning_rate": 1.4375359647649634e-05, + "loss": 0.3597, + "step": 8820 + }, + { + "epoch": 1.2819396051103369, + "grad_norm": 1.7369287014007568, + "learning_rate": 1.4323705043495938e-05, + "loss": 0.4448, + "step": 8830 + }, + { + "epoch": 1.2833914053426247, + "grad_norm": 1.2835052013397217, + "learning_rate": 1.4272106124029627e-05, + "loss": 0.7685, + "step": 8840 + }, + { + "epoch": 1.2848432055749128, + "grad_norm": 3.1556379795074463, + "learning_rate": 1.4220563158376832e-05, + "loss": 0.5719, + "step": 8850 + }, + { + "epoch": 1.286295005807201, + "grad_norm": 2.188831090927124, + "learning_rate": 1.4169076415371802e-05, + "loss": 0.5663, + "step": 8860 + }, + { + "epoch": 1.287746806039489, + "grad_norm": 2.645719051361084, + "learning_rate": 1.4117646163555565e-05, + "loss": 0.653, + "step": 8870 + }, + { + "epoch": 1.289198606271777, + "grad_norm": 1.1469491720199585, + "learning_rate": 1.4066272671174512e-05, + "loss": 0.5314, + "step": 8880 + }, + { + "epoch": 1.290650406504065, + "grad_norm": 1.8526806831359863, + "learning_rate": 1.4014956206178987e-05, + "loss": 0.4409, + "step": 8890 + }, + { + "epoch": 1.292102206736353, + "grad_norm": 0.9696226716041565, + "learning_rate": 1.3963697036221863e-05, + "loss": 0.7264, + "step": 8900 + }, + { + "epoch": 1.293554006968641, + "grad_norm": 2.522721529006958, + "learning_rate": 1.3912495428657236e-05, + "loss": 0.7832, + "step": 8910 + }, + { + "epoch": 1.2950058072009292, + "grad_norm": 1.7248927354812622, + "learning_rate": 1.3861351650538929e-05, + "loss": 0.548, + "step": 8920 + }, + { + "epoch": 1.2964576074332173, + "grad_norm": 0.5419870018959045, + "learning_rate": 1.3810265968619141e-05, + "loss": 0.6291, + "step": 8930 + }, + { + "epoch": 1.297909407665505, + "grad_norm": 0.7840960025787354, + "learning_rate": 
1.3759238649347091e-05, + "loss": 0.5772, + "step": 8940 + }, + { + "epoch": 1.2993612078977932, + "grad_norm": 1.4585460424423218, + "learning_rate": 1.3708269958867565e-05, + "loss": 0.8735, + "step": 8950 + }, + { + "epoch": 1.3008130081300813, + "grad_norm": 3.5455801486968994, + "learning_rate": 1.3657360163019544e-05, + "loss": 0.6392, + "step": 8960 + }, + { + "epoch": 1.3022648083623694, + "grad_norm": 0.890296220779419, + "learning_rate": 1.3606509527334894e-05, + "loss": 0.853, + "step": 8970 + }, + { + "epoch": 1.3037166085946574, + "grad_norm": 2.1235806941986084, + "learning_rate": 1.3555718317036847e-05, + "loss": 0.6268, + "step": 8980 + }, + { + "epoch": 1.3051684088269453, + "grad_norm": 1.9171247482299805, + "learning_rate": 1.3504986797038715e-05, + "loss": 0.5688, + "step": 8990 + }, + { + "epoch": 1.3066202090592334, + "grad_norm": 2.7999086380004883, + "learning_rate": 1.3454315231942499e-05, + "loss": 0.5062, + "step": 9000 + }, + { + "epoch": 1.3066202090592334, + "eval_loss": 0.6196444034576416, + "eval_runtime": 107.7639, + "eval_samples_per_second": 13.455, + "eval_steps_per_second": 3.368, + "step": 9000 + }, + { + "epoch": 1.3080720092915215, + "grad_norm": 1.666410207748413, + "learning_rate": 1.3403703886037466e-05, + "loss": 0.7899, + "step": 9010 + }, + { + "epoch": 1.3095238095238095, + "grad_norm": 1.47067129611969, + "learning_rate": 1.3353153023298789e-05, + "loss": 0.5773, + "step": 9020 + }, + { + "epoch": 1.3109756097560976, + "grad_norm": 1.4645687341690063, + "learning_rate": 1.3302662907386222e-05, + "loss": 0.6352, + "step": 9030 + }, + { + "epoch": 1.3124274099883855, + "grad_norm": 1.135907530784607, + "learning_rate": 1.325223380164263e-05, + "loss": 0.5388, + "step": 9040 + }, + { + "epoch": 1.3138792102206736, + "grad_norm": 0.8413094282150269, + "learning_rate": 1.3201865969092686e-05, + "loss": 0.7493, + "step": 9050 + }, + { + "epoch": 1.3153310104529616, + "grad_norm": 1.01530921459198, + "learning_rate": 1.315155967244149e-05, + "loss": 0.4492, + "step": 9060 + }, + { + "epoch": 1.3167828106852497, + "grad_norm": 2.6221423149108887, + "learning_rate": 1.3101315174073162e-05, + "loss": 0.5208, + "step": 9070 + }, + { + "epoch": 1.3182346109175378, + "grad_norm": 5.264577865600586, + "learning_rate": 1.305113273604952e-05, + "loss": 0.4573, + "step": 9080 + }, + { + "epoch": 1.3196864111498257, + "grad_norm": 1.6519479751586914, + "learning_rate": 1.3001012620108693e-05, + "loss": 0.5216, + "step": 9090 + }, + { + "epoch": 1.321138211382114, + "grad_norm": 1.1643894910812378, + "learning_rate": 1.2950955087663741e-05, + "loss": 0.4458, + "step": 9100 + }, + { + "epoch": 1.3225900116144018, + "grad_norm": 1.967511534690857, + "learning_rate": 1.2900960399801292e-05, + "loss": 0.7898, + "step": 9110 + }, + { + "epoch": 1.32404181184669, + "grad_norm": 1.269264578819275, + "learning_rate": 1.2851028817280242e-05, + "loss": 0.5747, + "step": 9120 + }, + { + "epoch": 1.325493612078978, + "grad_norm": 1.0032755136489868, + "learning_rate": 1.2801160600530299e-05, + "loss": 0.5245, + "step": 9130 + }, + { + "epoch": 1.3269454123112658, + "grad_norm": 4.5418925285339355, + "learning_rate": 1.2751356009650681e-05, + "loss": 0.6442, + "step": 9140 + }, + { + "epoch": 1.3283972125435541, + "grad_norm": 1.1265850067138672, + "learning_rate": 1.270161530440878e-05, + "loss": 0.4234, + "step": 9150 + }, + { + "epoch": 1.329849012775842, + "grad_norm": 0.029596175998449326, + "learning_rate": 1.2651938744238745e-05, + "loss": 0.4876, + "step": 9160 + }, 
+ { + "epoch": 1.33130081300813, + "grad_norm": 4.938312530517578, + "learning_rate": 1.2602326588240168e-05, + "loss": 0.5431, + "step": 9170 + }, + { + "epoch": 1.3327526132404182, + "grad_norm": 1.1647799015045166, + "learning_rate": 1.2552779095176737e-05, + "loss": 0.5084, + "step": 9180 + }, + { + "epoch": 1.334204413472706, + "grad_norm": 0.8059009313583374, + "learning_rate": 1.2503296523474883e-05, + "loss": 0.7431, + "step": 9190 + }, + { + "epoch": 1.3356562137049943, + "grad_norm": 1.5592460632324219, + "learning_rate": 1.245387913122239e-05, + "loss": 0.312, + "step": 9200 + }, + { + "epoch": 1.3371080139372822, + "grad_norm": 1.1873098611831665, + "learning_rate": 1.2404527176167124e-05, + "loss": 0.7229, + "step": 9210 + }, + { + "epoch": 1.3385598141695703, + "grad_norm": 1.4901853799819946, + "learning_rate": 1.2355240915715618e-05, + "loss": 0.538, + "step": 9220 + }, + { + "epoch": 1.3400116144018583, + "grad_norm": 1.5089656114578247, + "learning_rate": 1.2306020606931767e-05, + "loss": 0.5226, + "step": 9230 + }, + { + "epoch": 1.3414634146341464, + "grad_norm": 0.9845458269119263, + "learning_rate": 1.2256866506535497e-05, + "loss": 0.61, + "step": 9240 + }, + { + "epoch": 1.3429152148664345, + "grad_norm": 0.9404434561729431, + "learning_rate": 1.220777887090139e-05, + "loss": 0.5815, + "step": 9250 + }, + { + "epoch": 1.3443670150987224, + "grad_norm": 1.297400712966919, + "learning_rate": 1.2158757956057357e-05, + "loss": 0.5703, + "step": 9260 + }, + { + "epoch": 1.3458188153310104, + "grad_norm": 5.133298397064209, + "learning_rate": 1.2109804017683349e-05, + "loss": 0.3776, + "step": 9270 + }, + { + "epoch": 1.3472706155632985, + "grad_norm": 0.2664077579975128, + "learning_rate": 1.206091731110994e-05, + "loss": 0.4978, + "step": 9280 + }, + { + "epoch": 1.3487224157955866, + "grad_norm": 0.8112949132919312, + "learning_rate": 1.2012098091317083e-05, + "loss": 0.4887, + "step": 9290 + }, + { + "epoch": 1.3501742160278747, + "grad_norm": 1.9871488809585571, + "learning_rate": 1.1963346612932702e-05, + "loss": 0.7117, + "step": 9300 + }, + { + "epoch": 1.3516260162601625, + "grad_norm": 3.5719833374023438, + "learning_rate": 1.191466313023143e-05, + "loss": 0.568, + "step": 9310 + }, + { + "epoch": 1.3530778164924506, + "grad_norm": 1.1161819696426392, + "learning_rate": 1.1866047897133223e-05, + "loss": 0.4455, + "step": 9320 + }, + { + "epoch": 1.3545296167247387, + "grad_norm": 1.2592240571975708, + "learning_rate": 1.1817501167202099e-05, + "loss": 0.5396, + "step": 9330 + }, + { + "epoch": 1.3559814169570268, + "grad_norm": 8.793890953063965, + "learning_rate": 1.1769023193644757e-05, + "loss": 0.5515, + "step": 9340 + }, + { + "epoch": 1.3574332171893149, + "grad_norm": 1.0319164991378784, + "learning_rate": 1.1720614229309277e-05, + "loss": 0.62, + "step": 9350 + }, + { + "epoch": 1.3588850174216027, + "grad_norm": 1.9891750812530518, + "learning_rate": 1.1672274526683835e-05, + "loss": 0.5769, + "step": 9360 + }, + { + "epoch": 1.3603368176538908, + "grad_norm": 3.4943082332611084, + "learning_rate": 1.162400433789533e-05, + "loss": 0.6463, + "step": 9370 + }, + { + "epoch": 1.3617886178861789, + "grad_norm": 1.8810696601867676, + "learning_rate": 1.1575803914708096e-05, + "loss": 0.7964, + "step": 9380 + }, + { + "epoch": 1.363240418118467, + "grad_norm": 1.418583869934082, + "learning_rate": 1.1527673508522604e-05, + "loss": 0.428, + "step": 9390 + }, + { + "epoch": 1.364692218350755, + "grad_norm": 4.40504264831543, + "learning_rate": 
1.1479613370374136e-05, + "loss": 0.6119, + "step": 9400 + }, + { + "epoch": 1.366144018583043, + "grad_norm": 2.4559905529022217, + "learning_rate": 1.143162375093145e-05, + "loss": 0.5134, + "step": 9410 + }, + { + "epoch": 1.367595818815331, + "grad_norm": 3.8561477661132812, + "learning_rate": 1.1383704900495529e-05, + "loss": 0.4626, + "step": 9420 + }, + { + "epoch": 1.369047619047619, + "grad_norm": 1.6356045007705688, + "learning_rate": 1.1335857068998221e-05, + "loss": 0.5223, + "step": 9430 + }, + { + "epoch": 1.3704994192799071, + "grad_norm": 1.7519195079803467, + "learning_rate": 1.1288080506000955e-05, + "loss": 0.641, + "step": 9440 + }, + { + "epoch": 1.3719512195121952, + "grad_norm": 0.4097733199596405, + "learning_rate": 1.1240375460693475e-05, + "loss": 0.5781, + "step": 9450 + }, + { + "epoch": 1.373403019744483, + "grad_norm": 2.5884532928466797, + "learning_rate": 1.119274218189247e-05, + "loss": 0.5514, + "step": 9460 + }, + { + "epoch": 1.3748548199767712, + "grad_norm": 1.4594874382019043, + "learning_rate": 1.1145180918040332e-05, + "loss": 0.7619, + "step": 9470 + }, + { + "epoch": 1.3763066202090593, + "grad_norm": 7.807918548583984, + "learning_rate": 1.109769191720384e-05, + "loss": 0.3226, + "step": 9480 + }, + { + "epoch": 1.3777584204413473, + "grad_norm": 0.6364027261734009, + "learning_rate": 1.1050275427072884e-05, + "loss": 0.5776, + "step": 9490 + }, + { + "epoch": 1.3792102206736354, + "grad_norm": 0.4011842608451843, + "learning_rate": 1.1002931694959131e-05, + "loss": 0.4091, + "step": 9500 + }, + { + "epoch": 1.3806620209059233, + "grad_norm": 5.032822132110596, + "learning_rate": 1.0955660967794768e-05, + "loss": 0.5523, + "step": 9510 + }, + { + "epoch": 1.3821138211382114, + "grad_norm": 3.3209786415100098, + "learning_rate": 1.0908463492131227e-05, + "loss": 0.5782, + "step": 9520 + }, + { + "epoch": 1.3835656213704994, + "grad_norm": 0.4670596718788147, + "learning_rate": 1.086133951413785e-05, + "loss": 0.6112, + "step": 9530 + }, + { + "epoch": 1.3850174216027875, + "grad_norm": 6.041258335113525, + "learning_rate": 1.081428927960067e-05, + "loss": 0.6415, + "step": 9540 + }, + { + "epoch": 1.3864692218350756, + "grad_norm": 2.76751446723938, + "learning_rate": 1.0767313033921067e-05, + "loss": 0.3524, + "step": 9550 + }, + { + "epoch": 1.3879210220673635, + "grad_norm": 2.8424673080444336, + "learning_rate": 1.0720411022114512e-05, + "loss": 0.6496, + "step": 9560 + }, + { + "epoch": 1.3893728222996515, + "grad_norm": 1.2790861129760742, + "learning_rate": 1.0673583488809321e-05, + "loss": 0.6281, + "step": 9570 + }, + { + "epoch": 1.3908246225319396, + "grad_norm": 2.5029184818267822, + "learning_rate": 1.0626830678245329e-05, + "loss": 0.6078, + "step": 9580 + }, + { + "epoch": 1.3922764227642277, + "grad_norm": 1.0946515798568726, + "learning_rate": 1.0580152834272622e-05, + "loss": 0.5256, + "step": 9590 + }, + { + "epoch": 1.3937282229965158, + "grad_norm": 1.5489437580108643, + "learning_rate": 1.0533550200350314e-05, + "loss": 0.6867, + "step": 9600 + }, + { + "epoch": 1.3951800232288036, + "grad_norm": 1.4204350709915161, + "learning_rate": 1.0487023019545235e-05, + "loss": 0.6683, + "step": 9610 + }, + { + "epoch": 1.3966318234610917, + "grad_norm": 1.674791932106018, + "learning_rate": 1.044057153453066e-05, + "loss": 0.7691, + "step": 9620 + }, + { + "epoch": 1.3980836236933798, + "grad_norm": 2.3372557163238525, + "learning_rate": 1.039419598758505e-05, + "loss": 0.5875, + "step": 9630 + }, + { + "epoch": 1.3995354239256679, + 
"grad_norm": 4.951801300048828, + "learning_rate": 1.0347896620590819e-05, + "loss": 0.4327, + "step": 9640 + }, + { + "epoch": 1.400987224157956, + "grad_norm": 1.4369560480117798, + "learning_rate": 1.0301673675033017e-05, + "loss": 0.4592, + "step": 9650 + }, + { + "epoch": 1.4024390243902438, + "grad_norm": 1.2974849939346313, + "learning_rate": 1.025552739199813e-05, + "loss": 0.5833, + "step": 9660 + }, + { + "epoch": 1.403890824622532, + "grad_norm": 1.037194848060608, + "learning_rate": 1.0209458012172768e-05, + "loss": 0.4698, + "step": 9670 + }, + { + "epoch": 1.40534262485482, + "grad_norm": 2.5829808712005615, + "learning_rate": 1.016346577584244e-05, + "loss": 0.5585, + "step": 9680 + }, + { + "epoch": 1.406794425087108, + "grad_norm": 2.260946273803711, + "learning_rate": 1.0117550922890307e-05, + "loss": 0.6017, + "step": 9690 + }, + { + "epoch": 1.4082462253193961, + "grad_norm": 1.81033194065094, + "learning_rate": 1.0071713692795918e-05, + "loss": 0.6426, + "step": 9700 + }, + { + "epoch": 1.409698025551684, + "grad_norm": 1.978293776512146, + "learning_rate": 1.0025954324633948e-05, + "loss": 0.4709, + "step": 9710 + }, + { + "epoch": 1.411149825783972, + "grad_norm": 1.209401249885559, + "learning_rate": 9.980273057072968e-06, + "loss": 0.4459, + "step": 9720 + }, + { + "epoch": 1.4126016260162602, + "grad_norm": 1.3207520246505737, + "learning_rate": 9.934670128374212e-06, + "loss": 0.3628, + "step": 9730 + }, + { + "epoch": 1.4140534262485482, + "grad_norm": 0.9167854189872742, + "learning_rate": 9.889145776390308e-06, + "loss": 0.5037, + "step": 9740 + }, + { + "epoch": 1.4155052264808363, + "grad_norm": 2.53662109375, + "learning_rate": 9.843700238564035e-06, + "loss": 0.4758, + "step": 9750 + }, + { + "epoch": 1.4169570267131242, + "grad_norm": 2.7502434253692627, + "learning_rate": 9.798333751927139e-06, + "loss": 0.6707, + "step": 9760 + }, + { + "epoch": 1.4184088269454123, + "grad_norm": 1.7120157480239868, + "learning_rate": 9.753046553099007e-06, + "loss": 0.7902, + "step": 9770 + }, + { + "epoch": 1.4198606271777003, + "grad_norm": 2.2297070026397705, + "learning_rate": 9.707838878285527e-06, + "loss": 0.7242, + "step": 9780 + }, + { + "epoch": 1.4213124274099884, + "grad_norm": 2.1308581829071045, + "learning_rate": 9.662710963277783e-06, + "loss": 0.5492, + "step": 9790 + }, + { + "epoch": 1.4227642276422765, + "grad_norm": 7.506939888000488, + "learning_rate": 9.617663043450847e-06, + "loss": 0.469, + "step": 9800 + }, + { + "epoch": 1.4242160278745644, + "grad_norm": 7.771796703338623, + "learning_rate": 9.572695353762584e-06, + "loss": 0.4342, + "step": 9810 + }, + { + "epoch": 1.4256678281068524, + "grad_norm": 5.498221397399902, + "learning_rate": 9.527808128752397e-06, + "loss": 0.6446, + "step": 9820 + }, + { + "epoch": 1.4271196283391405, + "grad_norm": 3.744574785232544, + "learning_rate": 9.483001602539984e-06, + "loss": 0.4798, + "step": 9830 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.2909547090530396, + "learning_rate": 9.43827600882415e-06, + "loss": 0.4513, + "step": 9840 + }, + { + "epoch": 1.4300232288037167, + "grad_norm": 1.5674768686294556, + "learning_rate": 9.393631580881596e-06, + "loss": 0.3784, + "step": 9850 + }, + { + "epoch": 1.4314750290360045, + "grad_norm": 0.9246693253517151, + "learning_rate": 9.349068551565649e-06, + "loss": 0.3879, + "step": 9860 + }, + { + "epoch": 1.4329268292682926, + "grad_norm": 4.146523952484131, + "learning_rate": 9.304587153305122e-06, + "loss": 0.4375, + "step": 9870 + }, + { + 
"epoch": 1.4343786295005807, + "grad_norm": 0.4749496579170227, + "learning_rate": 9.260187618103036e-06, + "loss": 0.6098, + "step": 9880 + }, + { + "epoch": 1.4358304297328688, + "grad_norm": 0.9822236895561218, + "learning_rate": 9.215870177535433e-06, + "loss": 0.6339, + "step": 9890 + }, + { + "epoch": 1.4372822299651569, + "grad_norm": 1.2814334630966187, + "learning_rate": 9.171635062750189e-06, + "loss": 0.8344, + "step": 9900 + }, + { + "epoch": 1.4387340301974447, + "grad_norm": 2.548846483230591, + "learning_rate": 9.127482504465792e-06, + "loss": 0.5309, + "step": 9910 + }, + { + "epoch": 1.4401858304297328, + "grad_norm": 2.3129475116729736, + "learning_rate": 9.083412732970123e-06, + "loss": 0.6082, + "step": 9920 + }, + { + "epoch": 1.4416376306620209, + "grad_norm": 1.459028959274292, + "learning_rate": 9.039425978119267e-06, + "loss": 0.5144, + "step": 9930 + }, + { + "epoch": 1.443089430894309, + "grad_norm": 2.7794339656829834, + "learning_rate": 8.995522469336337e-06, + "loss": 0.6107, + "step": 9940 + }, + { + "epoch": 1.444541231126597, + "grad_norm": 1.3913737535476685, + "learning_rate": 8.951702435610244e-06, + "loss": 0.5444, + "step": 9950 + }, + { + "epoch": 1.445993031358885, + "grad_norm": 1.145751714706421, + "learning_rate": 8.907966105494498e-06, + "loss": 0.5478, + "step": 9960 + }, + { + "epoch": 1.447444831591173, + "grad_norm": 1.6435590982437134, + "learning_rate": 8.864313707106075e-06, + "loss": 0.5803, + "step": 9970 + }, + { + "epoch": 1.448896631823461, + "grad_norm": 2.8100969791412354, + "learning_rate": 8.820745468124144e-06, + "loss": 0.6449, + "step": 9980 + }, + { + "epoch": 1.4503484320557491, + "grad_norm": 0.4345937967300415, + "learning_rate": 8.777261615788956e-06, + "loss": 0.7335, + "step": 9990 + }, + { + "epoch": 1.4518002322880372, + "grad_norm": 3.7761106491088867, + "learning_rate": 8.733862376900597e-06, + "loss": 0.4368, + "step": 10000 + }, + { + "epoch": 1.453252032520325, + "grad_norm": 0.9145069718360901, + "learning_rate": 8.690547977817839e-06, + "loss": 0.6349, + "step": 10010 + }, + { + "epoch": 1.4547038327526132, + "grad_norm": 0.9794019460678101, + "learning_rate": 8.64731864445696e-06, + "loss": 0.5048, + "step": 10020 + }, + { + "epoch": 1.4561556329849012, + "grad_norm": 2.5523462295532227, + "learning_rate": 8.604174602290563e-06, + "loss": 0.5025, + "step": 10030 + }, + { + "epoch": 1.4576074332171893, + "grad_norm": 1.2542840242385864, + "learning_rate": 8.561116076346377e-06, + "loss": 0.342, + "step": 10040 + }, + { + "epoch": 1.4590592334494774, + "grad_norm": 4.584123611450195, + "learning_rate": 8.518143291206099e-06, + "loss": 0.5593, + "step": 10050 + }, + { + "epoch": 1.4605110336817653, + "grad_norm": 2.7633087635040283, + "learning_rate": 8.475256471004259e-06, + "loss": 0.5616, + "step": 10060 + }, + { + "epoch": 1.4619628339140534, + "grad_norm": 4.123738765716553, + "learning_rate": 8.43245583942698e-06, + "loss": 0.5572, + "step": 10070 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 6.2447428703308105, + "learning_rate": 8.389741619710855e-06, + "loss": 0.3971, + "step": 10080 + }, + { + "epoch": 1.4648664343786295, + "grad_norm": 1.1899082660675049, + "learning_rate": 8.347114034641806e-06, + "loss": 0.333, + "step": 10090 + }, + { + "epoch": 1.4663182346109176, + "grad_norm": 5.325255393981934, + "learning_rate": 8.304573306553846e-06, + "loss": 0.4626, + "step": 10100 + }, + { + "epoch": 1.4677700348432055, + "grad_norm": 2.715012788772583, + "learning_rate": 8.262119657327996e-06, + 
"loss": 0.4834, + "step": 10110 + }, + { + "epoch": 1.4692218350754935, + "grad_norm": 1.1475021839141846, + "learning_rate": 8.219753308391101e-06, + "loss": 0.5551, + "step": 10120 + }, + { + "epoch": 1.4706736353077816, + "grad_norm": 7.364482402801514, + "learning_rate": 8.17747448071465e-06, + "loss": 0.5282, + "step": 10130 + }, + { + "epoch": 1.4721254355400697, + "grad_norm": 1.1067121028900146, + "learning_rate": 8.135283394813651e-06, + "loss": 0.569, + "step": 10140 + }, + { + "epoch": 1.4735772357723578, + "grad_norm": 1.5818873643875122, + "learning_rate": 8.093180270745485e-06, + "loss": 0.5892, + "step": 10150 + }, + { + "epoch": 1.4750290360046456, + "grad_norm": 2.0473148822784424, + "learning_rate": 8.05116532810874e-06, + "loss": 0.8704, + "step": 10160 + }, + { + "epoch": 1.476480836236934, + "grad_norm": 1.3639038801193237, + "learning_rate": 8.009238786042062e-06, + "loss": 0.517, + "step": 10170 + }, + { + "epoch": 1.4779326364692218, + "grad_norm": 1.7621372938156128, + "learning_rate": 7.967400863223051e-06, + "loss": 0.469, + "step": 10180 + }, + { + "epoch": 1.4793844367015099, + "grad_norm": 1.9453188180923462, + "learning_rate": 7.925651777867068e-06, + "loss": 0.5911, + "step": 10190 + }, + { + "epoch": 1.480836236933798, + "grad_norm": 1.6738969087600708, + "learning_rate": 7.883991747726127e-06, + "loss": 0.5271, + "step": 10200 + }, + { + "epoch": 1.4822880371660858, + "grad_norm": 1.6197839975357056, + "learning_rate": 7.842420990087774e-06, + "loss": 0.5143, + "step": 10210 + }, + { + "epoch": 1.4837398373983741, + "grad_norm": 1.3254222869873047, + "learning_rate": 7.800939721773893e-06, + "loss": 0.5526, + "step": 10220 + }, + { + "epoch": 1.485191637630662, + "grad_norm": 2.3349244594573975, + "learning_rate": 7.759548159139654e-06, + "loss": 0.557, + "step": 10230 + }, + { + "epoch": 1.48664343786295, + "grad_norm": 1.9867162704467773, + "learning_rate": 7.718246518072341e-06, + "loss": 0.4553, + "step": 10240 + }, + { + "epoch": 1.4880952380952381, + "grad_norm": 0.5736078023910522, + "learning_rate": 7.677035013990211e-06, + "loss": 0.6118, + "step": 10250 + }, + { + "epoch": 1.489547038327526, + "grad_norm": 2.2709176540374756, + "learning_rate": 7.635913861841395e-06, + "loss": 0.7102, + "step": 10260 + }, + { + "epoch": 1.4909988385598143, + "grad_norm": 0.5769612789154053, + "learning_rate": 7.594883276102799e-06, + "loss": 0.639, + "step": 10270 + }, + { + "epoch": 1.4924506387921022, + "grad_norm": 2.3885700702667236, + "learning_rate": 7.5539434707789266e-06, + "loss": 0.767, + "step": 10280 + }, + { + "epoch": 1.4939024390243902, + "grad_norm": 2.5631144046783447, + "learning_rate": 7.513094659400802e-06, + "loss": 0.557, + "step": 10290 + }, + { + "epoch": 1.4953542392566783, + "grad_norm": 1.6625310182571411, + "learning_rate": 7.47233705502487e-06, + "loss": 0.419, + "step": 10300 + }, + { + "epoch": 1.4968060394889664, + "grad_norm": 3.3970789909362793, + "learning_rate": 7.431670870231844e-06, + "loss": 0.4773, + "step": 10310 + }, + { + "epoch": 1.4982578397212545, + "grad_norm": 2.158837080001831, + "learning_rate": 7.391096317125607e-06, + "loss": 0.5095, + "step": 10320 + }, + { + "epoch": 1.4997096399535423, + "grad_norm": 2.132723569869995, + "learning_rate": 7.350613607332163e-06, + "loss": 0.582, + "step": 10330 + }, + { + "epoch": 1.5011614401858304, + "grad_norm": 3.694959878921509, + "learning_rate": 7.310222951998438e-06, + "loss": 0.3228, + "step": 10340 + }, + { + "epoch": 1.5026132404181185, + "grad_norm": 
3.7945165634155273, + "learning_rate": 7.269924561791236e-06, + "loss": 0.5246, + "step": 10350 + }, + { + "epoch": 1.5040650406504064, + "grad_norm": 1.9424091577529907, + "learning_rate": 7.2297186468961554e-06, + "loss": 0.6539, + "step": 10360 + }, + { + "epoch": 1.5055168408826947, + "grad_norm": 1.384211540222168, + "learning_rate": 7.189605417016443e-06, + "loss": 0.5089, + "step": 10370 + }, + { + "epoch": 1.5069686411149825, + "grad_norm": 1.18372642993927, + "learning_rate": 7.149585081371923e-06, + "loss": 0.624, + "step": 10380 + }, + { + "epoch": 1.5084204413472706, + "grad_norm": 2.478210926055908, + "learning_rate": 7.109657848697937e-06, + "loss": 0.5944, + "step": 10390 + }, + { + "epoch": 1.5098722415795587, + "grad_norm": 1.7582294940948486, + "learning_rate": 7.0698239272441985e-06, + "loss": 0.3679, + "step": 10400 + }, + { + "epoch": 1.5113240418118465, + "grad_norm": 3.0840678215026855, + "learning_rate": 7.03008352477374e-06, + "loss": 0.6691, + "step": 10410 + }, + { + "epoch": 1.5127758420441348, + "grad_norm": 5.244002342224121, + "learning_rate": 6.99043684856184e-06, + "loss": 0.5773, + "step": 10420 + }, + { + "epoch": 1.5142276422764227, + "grad_norm": 2.812211513519287, + "learning_rate": 6.950884105394903e-06, + "loss": 0.4341, + "step": 10430 + }, + { + "epoch": 1.5156794425087108, + "grad_norm": 0.9920812845230103, + "learning_rate": 6.911425501569418e-06, + "loss": 0.5441, + "step": 10440 + }, + { + "epoch": 1.5171312427409989, + "grad_norm": 0.8474797606468201, + "learning_rate": 6.872061242890882e-06, + "loss": 0.7427, + "step": 10450 + }, + { + "epoch": 1.5185830429732867, + "grad_norm": 1.2484221458435059, + "learning_rate": 6.8327915346726806e-06, + "loss": 0.5319, + "step": 10460 + }, + { + "epoch": 1.520034843205575, + "grad_norm": 2.2322065830230713, + "learning_rate": 6.793616581735062e-06, + "loss": 0.7047, + "step": 10470 + }, + { + "epoch": 1.5214866434378629, + "grad_norm": 3.255192756652832, + "learning_rate": 6.754536588404078e-06, + "loss": 0.5605, + "step": 10480 + }, + { + "epoch": 1.522938443670151, + "grad_norm": 2.065782308578491, + "learning_rate": 6.715551758510469e-06, + "loss": 0.609, + "step": 10490 + }, + { + "epoch": 1.524390243902439, + "grad_norm": 1.5074211359024048, + "learning_rate": 6.676662295388631e-06, + "loss": 0.4149, + "step": 10500 + }, + { + "epoch": 1.525842044134727, + "grad_norm": 1.3542487621307373, + "learning_rate": 6.637868401875577e-06, + "loss": 0.4952, + "step": 10510 + }, + { + "epoch": 1.5272938443670152, + "grad_norm": 6.184685230255127, + "learning_rate": 6.599170280309824e-06, + "loss": 0.6942, + "step": 10520 + }, + { + "epoch": 1.528745644599303, + "grad_norm": 1.49580979347229, + "learning_rate": 6.560568132530376e-06, + "loss": 0.5696, + "step": 10530 + }, + { + "epoch": 1.5301974448315911, + "grad_norm": 1.4806469678878784, + "learning_rate": 6.522062159875692e-06, + "loss": 0.6504, + "step": 10540 + }, + { + "epoch": 1.5316492450638792, + "grad_norm": 2.461064577102661, + "learning_rate": 6.4836525631825714e-06, + "loss": 0.5862, + "step": 10550 + }, + { + "epoch": 1.533101045296167, + "grad_norm": 1.635206937789917, + "learning_rate": 6.4453395427851475e-06, + "loss": 0.5664, + "step": 10560 + }, + { + "epoch": 1.5345528455284554, + "grad_norm": 2.978720188140869, + "learning_rate": 6.407123298513865e-06, + "loss": 0.6014, + "step": 10570 + }, + { + "epoch": 1.5360046457607432, + "grad_norm": 3.055194854736328, + "learning_rate": 6.369004029694378e-06, + "loss": 0.5824, + "step": 10580 + }, 
+ { + "epoch": 1.5374564459930313, + "grad_norm": 2.0090768337249756, + "learning_rate": 6.330981935146555e-06, + "loss": 0.7431, + "step": 10590 + }, + { + "epoch": 1.5389082462253194, + "grad_norm": 3.167921781539917, + "learning_rate": 6.29305721318344e-06, + "loss": 0.4873, + "step": 10600 + }, + { + "epoch": 1.5403600464576073, + "grad_norm": 2.447772264480591, + "learning_rate": 6.25523006161019e-06, + "loss": 0.6174, + "step": 10610 + }, + { + "epoch": 1.5418118466898956, + "grad_norm": 1.068217396736145, + "learning_rate": 6.217500677723065e-06, + "loss": 0.6131, + "step": 10620 + }, + { + "epoch": 1.5432636469221834, + "grad_norm": 3.059321403503418, + "learning_rate": 6.179869258308407e-06, + "loss": 0.5651, + "step": 10630 + }, + { + "epoch": 1.5447154471544715, + "grad_norm": 1.0179533958435059, + "learning_rate": 6.142335999641599e-06, + "loss": 0.8561, + "step": 10640 + }, + { + "epoch": 1.5461672473867596, + "grad_norm": 3.0670573711395264, + "learning_rate": 6.104901097486024e-06, + "loss": 0.4205, + "step": 10650 + }, + { + "epoch": 1.5476190476190477, + "grad_norm": 1.0629135370254517, + "learning_rate": 6.067564747092094e-06, + "loss": 0.7445, + "step": 10660 + }, + { + "epoch": 1.5490708478513358, + "grad_norm": 1.5961717367172241, + "learning_rate": 6.030327143196179e-06, + "loss": 0.6035, + "step": 10670 + }, + { + "epoch": 1.5505226480836236, + "grad_norm": 2.1358516216278076, + "learning_rate": 5.993188480019615e-06, + "loss": 0.3647, + "step": 10680 + }, + { + "epoch": 1.5519744483159117, + "grad_norm": 3.7955398559570312, + "learning_rate": 5.956148951267706e-06, + "loss": 0.4885, + "step": 10690 + }, + { + "epoch": 1.5534262485481998, + "grad_norm": 1.8019474744796753, + "learning_rate": 5.919208750128685e-06, + "loss": 0.4086, + "step": 10700 + }, + { + "epoch": 1.5548780487804879, + "grad_norm": 0.8065319061279297, + "learning_rate": 5.882368069272709e-06, + "loss": 0.6092, + "step": 10710 + }, + { + "epoch": 1.556329849012776, + "grad_norm": 1.8280988931655884, + "learning_rate": 5.8456271008508955e-06, + "loss": 0.583, + "step": 10720 + }, + { + "epoch": 1.5577816492450638, + "grad_norm": 2.872685670852661, + "learning_rate": 5.808986036494254e-06, + "loss": 0.3497, + "step": 10730 + }, + { + "epoch": 1.5592334494773519, + "grad_norm": 2.1516687870025635, + "learning_rate": 5.772445067312729e-06, + "loss": 0.4461, + "step": 10740 + }, + { + "epoch": 1.56068524970964, + "grad_norm": 0.9675107598304749, + "learning_rate": 5.736004383894231e-06, + "loss": 0.8109, + "step": 10750 + }, + { + "epoch": 1.562137049941928, + "grad_norm": 3.2056965827941895, + "learning_rate": 5.69966417630356e-06, + "loss": 0.7312, + "step": 10760 + }, + { + "epoch": 1.5635888501742161, + "grad_norm": 1.4558873176574707, + "learning_rate": 5.663424634081474e-06, + "loss": 0.5516, + "step": 10770 + }, + { + "epoch": 1.565040650406504, + "grad_norm": 1.376585602760315, + "learning_rate": 5.62728594624371e-06, + "loss": 0.3938, + "step": 10780 + }, + { + "epoch": 1.566492450638792, + "grad_norm": 1.4354243278503418, + "learning_rate": 5.59124830127995e-06, + "loss": 0.6572, + "step": 10790 + }, + { + "epoch": 1.5679442508710801, + "grad_norm": 2.6437528133392334, + "learning_rate": 5.555311887152867e-06, + "loss": 0.4434, + "step": 10800 + }, + { + "epoch": 1.5693960511033682, + "grad_norm": 2.039637327194214, + "learning_rate": 5.5194768912971565e-06, + "loss": 0.4561, + "step": 10810 + }, + { + "epoch": 1.5708478513356563, + "grad_norm": 1.7926547527313232, + "learning_rate": 
5.483743500618529e-06, + "loss": 0.7296, + "step": 10820 + }, + { + "epoch": 1.5722996515679442, + "grad_norm": 2.8525867462158203, + "learning_rate": 5.448111901492747e-06, + "loss": 0.5546, + "step": 10830 + }, + { + "epoch": 1.5737514518002322, + "grad_norm": 1.2368862628936768, + "learning_rate": 5.412582279764669e-06, + "loss": 0.5491, + "step": 10840 + }, + { + "epoch": 1.5752032520325203, + "grad_norm": 2.139909505844116, + "learning_rate": 5.377154820747271e-06, + "loss": 0.5243, + "step": 10850 + }, + { + "epoch": 1.5766550522648084, + "grad_norm": 1.4064335823059082, + "learning_rate": 5.341829709220647e-06, + "loss": 0.9336, + "step": 10860 + }, + { + "epoch": 1.5781068524970965, + "grad_norm": 3.3268957138061523, + "learning_rate": 5.306607129431107e-06, + "loss": 0.57, + "step": 10870 + }, + { + "epoch": 1.5795586527293843, + "grad_norm": 5.089993000030518, + "learning_rate": 5.271487265090163e-06, + "loss": 0.5028, + "step": 10880 + }, + { + "epoch": 1.5810104529616724, + "grad_norm": 1.5383248329162598, + "learning_rate": 5.236470299373589e-06, + "loss": 0.4664, + "step": 10890 + }, + { + "epoch": 1.5824622531939605, + "grad_norm": 1.7498929500579834, + "learning_rate": 5.201556414920486e-06, + "loss": 0.8543, + "step": 10900 + }, + { + "epoch": 1.5839140534262486, + "grad_norm": 1.0989433526992798, + "learning_rate": 5.1667457938322925e-06, + "loss": 0.4634, + "step": 10910 + }, + { + "epoch": 1.5853658536585367, + "grad_norm": 1.1718672513961792, + "learning_rate": 5.1320386176718555e-06, + "loss": 0.2643, + "step": 10920 + }, + { + "epoch": 1.5868176538908245, + "grad_norm": 2.5529532432556152, + "learning_rate": 5.097435067462497e-06, + "loss": 0.7085, + "step": 10930 + }, + { + "epoch": 1.5882694541231128, + "grad_norm": 0.7183501124382019, + "learning_rate": 5.0629353236870375e-06, + "loss": 0.6274, + "step": 10940 + }, + { + "epoch": 1.5897212543554007, + "grad_norm": 0.3472674787044525, + "learning_rate": 5.02853956628686e-06, + "loss": 0.6934, + "step": 10950 + }, + { + "epoch": 1.5911730545876888, + "grad_norm": 1.4065948724746704, + "learning_rate": 4.994247974661026e-06, + "loss": 0.7115, + "step": 10960 + }, + { + "epoch": 1.5926248548199768, + "grad_norm": 0.9510209560394287, + "learning_rate": 4.960060727665255e-06, + "loss": 0.5962, + "step": 10970 + }, + { + "epoch": 1.5940766550522647, + "grad_norm": 3.3892316818237305, + "learning_rate": 4.92597800361104e-06, + "loss": 0.4962, + "step": 10980 + }, + { + "epoch": 1.595528455284553, + "grad_norm": 3.7970123291015625, + "learning_rate": 4.891999980264728e-06, + "loss": 0.537, + "step": 10990 + }, + { + "epoch": 1.5969802555168409, + "grad_norm": 1.8874022960662842, + "learning_rate": 4.85812683484656e-06, + "loss": 0.6591, + "step": 11000 + }, + { + "epoch": 1.598432055749129, + "grad_norm": 3.3695411682128906, + "learning_rate": 4.824358744029761e-06, + "loss": 0.4808, + "step": 11010 + }, + { + "epoch": 1.599883855981417, + "grad_norm": 4.303611755371094, + "learning_rate": 4.790695883939633e-06, + "loss": 0.4313, + "step": 11020 + }, + { + "epoch": 1.6013356562137049, + "grad_norm": 2.4233243465423584, + "learning_rate": 4.757138430152608e-06, + "loss": 0.4927, + "step": 11030 + }, + { + "epoch": 1.6027874564459932, + "grad_norm": 1.4356447458267212, + "learning_rate": 4.72368655769535e-06, + "loss": 0.4185, + "step": 11040 + }, + { + "epoch": 1.604239256678281, + "grad_norm": 5.6396636962890625, + "learning_rate": 4.690340441043847e-06, + "loss": 0.5059, + "step": 11050 + }, + { + "epoch": 
1.6056910569105691, + "grad_norm": 0.8661177754402161, + "learning_rate": 4.6571002541224955e-06, + "loss": 0.6568, + "step": 11060 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 8.649468421936035, + "learning_rate": 4.623966170303171e-06, + "loss": 0.4749, + "step": 11070 + }, + { + "epoch": 1.608594657375145, + "grad_norm": 0.2540145516395569, + "learning_rate": 4.590938362404368e-06, + "loss": 0.6654, + "step": 11080 + }, + { + "epoch": 1.6100464576074334, + "grad_norm": 2.6379213333129883, + "learning_rate": 4.558017002690257e-06, + "loss": 0.5673, + "step": 11090 + }, + { + "epoch": 1.6114982578397212, + "grad_norm": 0.4164746403694153, + "learning_rate": 4.525202262869804e-06, + "loss": 0.4536, + "step": 11100 + }, + { + "epoch": 1.6129500580720093, + "grad_norm": 0.745841920375824, + "learning_rate": 4.492494314095891e-06, + "loss": 0.5186, + "step": 11110 + }, + { + "epoch": 1.6144018583042974, + "grad_norm": 2.1661605834960938, + "learning_rate": 4.45989332696439e-06, + "loss": 0.67, + "step": 11120 + }, + { + "epoch": 1.6158536585365852, + "grad_norm": 1.8644750118255615, + "learning_rate": 4.427399471513288e-06, + "loss": 0.6665, + "step": 11130 + }, + { + "epoch": 1.6173054587688735, + "grad_norm": 0.8033472299575806, + "learning_rate": 4.395012917221825e-06, + "loss": 0.6176, + "step": 11140 + }, + { + "epoch": 1.6187572590011614, + "grad_norm": 1.7303358316421509, + "learning_rate": 4.362733833009558e-06, + "loss": 0.4351, + "step": 11150 + }, + { + "epoch": 1.6202090592334495, + "grad_norm": 5.510407447814941, + "learning_rate": 4.330562387235512e-06, + "loss": 0.7516, + "step": 11160 + }, + { + "epoch": 1.6216608594657376, + "grad_norm": 0.6413615942001343, + "learning_rate": 4.298498747697335e-06, + "loss": 0.3923, + "step": 11170 + }, + { + "epoch": 1.6231126596980254, + "grad_norm": 1.7095063924789429, + "learning_rate": 4.266543081630347e-06, + "loss": 0.3482, + "step": 11180 + }, + { + "epoch": 1.6245644599303137, + "grad_norm": 0.7411553859710693, + "learning_rate": 4.234695555706714e-06, + "loss": 0.3467, + "step": 11190 + }, + { + "epoch": 1.6260162601626016, + "grad_norm": 6.189662456512451, + "learning_rate": 4.202956336034591e-06, + "loss": 0.6474, + "step": 11200 + }, + { + "epoch": 1.6274680603948897, + "grad_norm": 0.5904057025909424, + "learning_rate": 4.171325588157218e-06, + "loss": 0.3935, + "step": 11210 + }, + { + "epoch": 1.6289198606271778, + "grad_norm": 1.5648072957992554, + "learning_rate": 4.139803477052076e-06, + "loss": 0.6161, + "step": 11220 + }, + { + "epoch": 1.6303716608594656, + "grad_norm": 4.8867597579956055, + "learning_rate": 4.108390167130044e-06, + "loss": 0.5963, + "step": 11230 + }, + { + "epoch": 1.631823461091754, + "grad_norm": 0.865047037601471, + "learning_rate": 4.077085822234503e-06, + "loss": 0.4213, + "step": 11240 + }, + { + "epoch": 1.6332752613240418, + "grad_norm": 1.2489089965820312, + "learning_rate": 4.045890605640504e-06, + "loss": 0.4975, + "step": 11250 + }, + { + "epoch": 1.6347270615563299, + "grad_norm": 0.8895522952079773, + "learning_rate": 4.0148046800539265e-06, + "loss": 0.5152, + "step": 11260 + }, + { + "epoch": 1.636178861788618, + "grad_norm": 7.4556121826171875, + "learning_rate": 3.983828207610615e-06, + "loss": 0.8086, + "step": 11270 + }, + { + "epoch": 1.6376306620209058, + "grad_norm": 2.2906975746154785, + "learning_rate": 3.9529613498755165e-06, + "loss": 0.4963, + "step": 11280 + }, + { + "epoch": 1.639082462253194, + "grad_norm": 1.934874415397644, + "learning_rate": 
3.922204267841889e-06, + "loss": 0.4317, + "step": 11290 + }, + { + "epoch": 1.640534262485482, + "grad_norm": 1.597822666168213, + "learning_rate": 3.8915571219304055e-06, + "loss": 0.5763, + "step": 11300 + }, + { + "epoch": 1.64198606271777, + "grad_norm": 1.507169485092163, + "learning_rate": 3.861020071988339e-06, + "loss": 0.4695, + "step": 11310 + }, + { + "epoch": 1.6434378629500581, + "grad_norm": 0.8798676133155823, + "learning_rate": 3.830593277288757e-06, + "loss": 0.4347, + "step": 11320 + }, + { + "epoch": 1.644889663182346, + "grad_norm": 2.6623973846435547, + "learning_rate": 3.800276896529642e-06, + "loss": 0.4887, + "step": 11330 + }, + { + "epoch": 1.6463414634146343, + "grad_norm": 1.0119774341583252, + "learning_rate": 3.7700710878330907e-06, + "loss": 0.4776, + "step": 11340 + }, + { + "epoch": 1.6477932636469221, + "grad_norm": 1.744946002960205, + "learning_rate": 3.7399760087444975e-06, + "loss": 0.3542, + "step": 11350 + }, + { + "epoch": 1.6492450638792102, + "grad_norm": 0.9083417057991028, + "learning_rate": 3.7099918162317114e-06, + "loss": 0.5441, + "step": 11360 + }, + { + "epoch": 1.6506968641114983, + "grad_norm": 1.0866427421569824, + "learning_rate": 3.680118666684218e-06, + "loss": 0.6087, + "step": 11370 + }, + { + "epoch": 1.6521486643437862, + "grad_norm": 1.4837919473648071, + "learning_rate": 3.6503567159123536e-06, + "loss": 0.5775, + "step": 11380 + }, + { + "epoch": 1.6536004645760745, + "grad_norm": 1.312999963760376, + "learning_rate": 3.6207061191464636e-06, + "loss": 0.6444, + "step": 11390 + }, + { + "epoch": 1.6550522648083623, + "grad_norm": 1.7816232442855835, + "learning_rate": 3.5911670310360882e-06, + "loss": 0.7579, + "step": 11400 + }, + { + "epoch": 1.6565040650406504, + "grad_norm": 5.434678077697754, + "learning_rate": 3.561739605649189e-06, + "loss": 0.5099, + "step": 11410 + }, + { + "epoch": 1.6579558652729385, + "grad_norm": 3.44694185256958, + "learning_rate": 3.532423996471307e-06, + "loss": 0.8014, + "step": 11420 + }, + { + "epoch": 1.6594076655052263, + "grad_norm": 1.121071219444275, + "learning_rate": 3.503220356404785e-06, + "loss": 0.7484, + "step": 11430 + }, + { + "epoch": 1.6608594657375146, + "grad_norm": 1.6010463237762451, + "learning_rate": 3.4741288377679732e-06, + "loss": 0.6689, + "step": 11440 + }, + { + "epoch": 1.6623112659698025, + "grad_norm": 2.2549779415130615, + "learning_rate": 3.4451495922944195e-06, + "loss": 0.5535, + "step": 11450 + }, + { + "epoch": 1.6637630662020906, + "grad_norm": 0.8929911255836487, + "learning_rate": 3.4162827711320788e-06, + "loss": 0.6548, + "step": 11460 + }, + { + "epoch": 1.6652148664343787, + "grad_norm": 0.8602511286735535, + "learning_rate": 3.3875285248425427e-06, + "loss": 0.4342, + "step": 11470 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.5482707023620605, + "learning_rate": 3.358887003400246e-06, + "loss": 0.5578, + "step": 11480 + }, + { + "epoch": 1.6681184668989548, + "grad_norm": 2.0349433422088623, + "learning_rate": 3.3303583561916624e-06, + "loss": 0.4982, + "step": 11490 + }, + { + "epoch": 1.6695702671312427, + "grad_norm": 0.9225724339485168, + "learning_rate": 3.3019427320145542e-06, + "loss": 0.4313, + "step": 11500 + }, + { + "epoch": 1.6710220673635308, + "grad_norm": 1.9111758470535278, + "learning_rate": 3.2736402790771948e-06, + "loss": 0.4434, + "step": 11510 + }, + { + "epoch": 1.6724738675958188, + "grad_norm": 3.7338778972625732, + "learning_rate": 3.245451144997569e-06, + "loss": 0.6636, + "step": 11520 + }, + { + "epoch": 
1.6739256678281067, + "grad_norm": 1.3860654830932617, + "learning_rate": 3.2173754768026394e-06, + "loss": 0.5516, + "step": 11530 + }, + { + "epoch": 1.675377468060395, + "grad_norm": 1.1442302465438843, + "learning_rate": 3.189413420927545e-06, + "loss": 0.3753, + "step": 11540 + }, + { + "epoch": 1.6768292682926829, + "grad_norm": 1.216781497001648, + "learning_rate": 3.1615651232148547e-06, + "loss": 0.478, + "step": 11550 + }, + { + "epoch": 1.678281068524971, + "grad_norm": 1.9530525207519531, + "learning_rate": 3.1338307289138254e-06, + "loss": 0.5791, + "step": 11560 + }, + { + "epoch": 1.679732868757259, + "grad_norm": 1.5259203910827637, + "learning_rate": 3.1062103826796e-06, + "loss": 0.8218, + "step": 11570 + }, + { + "epoch": 1.6811846689895469, + "grad_norm": 1.0574325323104858, + "learning_rate": 3.078704228572485e-06, + "loss": 0.4432, + "step": 11580 + }, + { + "epoch": 1.6826364692218352, + "grad_norm": 1.1627357006072998, + "learning_rate": 3.0513124100571944e-06, + "loss": 0.4513, + "step": 11590 + }, + { + "epoch": 1.684088269454123, + "grad_norm": 1.7360894680023193, + "learning_rate": 3.0240350700021097e-06, + "loss": 0.4008, + "step": 11600 + }, + { + "epoch": 1.6855400696864111, + "grad_norm": 1.3201311826705933, + "learning_rate": 2.9968723506784953e-06, + "loss": 0.7546, + "step": 11610 + }, + { + "epoch": 1.6869918699186992, + "grad_norm": 2.6173791885375977, + "learning_rate": 2.9698243937598125e-06, + "loss": 0.587, + "step": 11620 + }, + { + "epoch": 1.688443670150987, + "grad_norm": 1.6192914247512817, + "learning_rate": 2.942891340320936e-06, + "loss": 0.5349, + "step": 11630 + }, + { + "epoch": 1.6898954703832754, + "grad_norm": 1.5096668004989624, + "learning_rate": 2.9160733308374347e-06, + "loss": 0.5358, + "step": 11640 + }, + { + "epoch": 1.6913472706155632, + "grad_norm": 2.530461072921753, + "learning_rate": 2.8893705051848546e-06, + "loss": 0.4036, + "step": 11650 + }, + { + "epoch": 1.6927990708478513, + "grad_norm": 1.12082040309906, + "learning_rate": 2.862783002637959e-06, + "loss": 0.6056, + "step": 11660 + }, + { + "epoch": 1.6942508710801394, + "grad_norm": 2.175119400024414, + "learning_rate": 2.836310961870012e-06, + "loss": 0.5726, + "step": 11670 + }, + { + "epoch": 1.6957026713124272, + "grad_norm": 1.1249805688858032, + "learning_rate": 2.8099545209520794e-06, + "loss": 0.6046, + "step": 11680 + }, + { + "epoch": 1.6971544715447155, + "grad_norm": 1.3748245239257812, + "learning_rate": 2.783713817352282e-06, + "loss": 0.5619, + "step": 11690 + }, + { + "epoch": 1.6986062717770034, + "grad_norm": 1.3081672191619873, + "learning_rate": 2.757588987935078e-06, + "loss": 0.4904, + "step": 11700 + }, + { + "epoch": 1.7000580720092915, + "grad_norm": 1.0695126056671143, + "learning_rate": 2.731580168960557e-06, + "loss": 0.4588, + "step": 11710 + }, + { + "epoch": 1.7015098722415796, + "grad_norm": 0.9099944829940796, + "learning_rate": 2.705687496083742e-06, + "loss": 0.6389, + "step": 11720 + }, + { + "epoch": 1.7029616724738676, + "grad_norm": 3.102823495864868, + "learning_rate": 2.679911104353855e-06, + "loss": 0.4546, + "step": 11730 + }, + { + "epoch": 1.7044134727061557, + "grad_norm": 2.0113303661346436, + "learning_rate": 2.654251128213642e-06, + "loss": 0.5193, + "step": 11740 + }, + { + "epoch": 1.7058652729384436, + "grad_norm": 5.120201110839844, + "learning_rate": 2.6287077014986396e-06, + "loss": 0.3837, + "step": 11750 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 1.8609308004379272, + "learning_rate": 
2.603280957436499e-06, + "loss": 0.6775, + "step": 11760 + }, + { + "epoch": 1.7087688734030198, + "grad_norm": 0.5862835049629211, + "learning_rate": 2.5779710286463006e-06, + "loss": 0.4387, + "step": 11770 + }, + { + "epoch": 1.7102206736353078, + "grad_norm": 1.4512196779251099, + "learning_rate": 2.552778047137824e-06, + "loss": 0.551, + "step": 11780 + }, + { + "epoch": 1.711672473867596, + "grad_norm": 1.8097496032714844, + "learning_rate": 2.527702144310909e-06, + "loss": 0.4015, + "step": 11790 + }, + { + "epoch": 1.7131242740998838, + "grad_norm": 1.6561768054962158, + "learning_rate": 2.502743450954714e-06, + "loss": 0.6328, + "step": 11800 + }, + { + "epoch": 1.7145760743321719, + "grad_norm": 3.0371804237365723, + "learning_rate": 2.477902097247095e-06, + "loss": 0.5383, + "step": 11810 + }, + { + "epoch": 1.71602787456446, + "grad_norm": 1.1968135833740234, + "learning_rate": 2.453178212753876e-06, + "loss": 0.4626, + "step": 11820 + }, + { + "epoch": 1.717479674796748, + "grad_norm": 1.953162431716919, + "learning_rate": 2.428571926428194e-06, + "loss": 0.416, + "step": 11830 + }, + { + "epoch": 1.718931475029036, + "grad_norm": 5.2054443359375, + "learning_rate": 2.4040833666098413e-06, + "loss": 0.5872, + "step": 11840 + }, + { + "epoch": 1.720383275261324, + "grad_norm": 0.8436479568481445, + "learning_rate": 2.3797126610245605e-06, + "loss": 0.6168, + "step": 11850 + }, + { + "epoch": 1.721835075493612, + "grad_norm": 1.6038563251495361, + "learning_rate": 2.3554599367834137e-06, + "loss": 0.4696, + "step": 11860 + }, + { + "epoch": 1.7232868757259001, + "grad_norm": 1.6510204076766968, + "learning_rate": 2.3313253203820965e-06, + "loss": 0.5301, + "step": 11870 + }, + { + "epoch": 1.7247386759581882, + "grad_norm": 1.8987841606140137, + "learning_rate": 2.307308937700278e-06, + "loss": 0.4072, + "step": 11880 + }, + { + "epoch": 1.7261904761904763, + "grad_norm": 1.4687321186065674, + "learning_rate": 2.283410914000969e-06, + "loss": 0.6519, + "step": 11890 + }, + { + "epoch": 1.7276422764227641, + "grad_norm": 1.3027409315109253, + "learning_rate": 2.2596313739298462e-06, + "loss": 0.5261, + "step": 11900 + }, + { + "epoch": 1.7290940766550522, + "grad_norm": 1.6218777894973755, + "learning_rate": 2.235970441514598e-06, + "loss": 0.5705, + "step": 11910 + }, + { + "epoch": 1.7305458768873403, + "grad_norm": 1.294359803199768, + "learning_rate": 2.2124282401642936e-06, + "loss": 0.5077, + "step": 11920 + }, + { + "epoch": 1.7319976771196284, + "grad_norm": 3.3764312267303467, + "learning_rate": 2.189004892668742e-06, + "loss": 0.5721, + "step": 11930 + }, + { + "epoch": 1.7334494773519165, + "grad_norm": 2.6626949310302734, + "learning_rate": 2.165700521197825e-06, + "loss": 0.4905, + "step": 11940 + }, + { + "epoch": 1.7349012775842043, + "grad_norm": 1.1768718957901, + "learning_rate": 2.1425152473008832e-06, + "loss": 0.6437, + "step": 11950 + }, + { + "epoch": 1.7363530778164924, + "grad_norm": 3.1630823612213135, + "learning_rate": 2.119449191906089e-06, + "loss": 0.5341, + "step": 11960 + }, + { + "epoch": 1.7378048780487805, + "grad_norm": 6.051577568054199, + "learning_rate": 2.096502475319781e-06, + "loss": 0.4468, + "step": 11970 + }, + { + "epoch": 1.7392566782810686, + "grad_norm": 1.5965017080307007, + "learning_rate": 2.0736752172258846e-06, + "loss": 0.8102, + "step": 11980 + }, + { + "epoch": 1.7407084785133566, + "grad_norm": 1.8911795616149902, + "learning_rate": 2.050967536685233e-06, + "loss": 0.7823, + "step": 11990 + }, + { + "epoch": 
1.7421602787456445, + "grad_norm": 1.6226707696914673, + "learning_rate": 2.0283795521350042e-06, + "loss": 0.7913, + "step": 12000 + }, + { + "epoch": 1.7421602787456445, + "eval_loss": 0.6174443364143372, + "eval_runtime": 107.8262, + "eval_samples_per_second": 13.448, + "eval_steps_per_second": 3.367, + "step": 12000 + }, + { + "epoch": 1.7436120789779328, + "grad_norm": 1.1205623149871826, + "learning_rate": 2.005911381388048e-06, + "loss": 0.6055, + "step": 12010 + }, + { + "epoch": 1.7450638792102207, + "grad_norm": 1.6299880743026733, + "learning_rate": 1.9835631416323164e-06, + "loss": 0.599, + "step": 12020 + }, + { + "epoch": 1.7465156794425087, + "grad_norm": 1.34322190284729, + "learning_rate": 1.961334949430227e-06, + "loss": 0.5602, + "step": 12030 + }, + { + "epoch": 1.7479674796747968, + "grad_norm": 0.7588962912559509, + "learning_rate": 1.9392269207180512e-06, + "loss": 0.4644, + "step": 12040 + }, + { + "epoch": 1.7494192799070847, + "grad_norm": 3.5957090854644775, + "learning_rate": 1.9172391708053408e-06, + "loss": 1.0411, + "step": 12050 + }, + { + "epoch": 1.750871080139373, + "grad_norm": 1.6750432252883911, + "learning_rate": 1.895371814374286e-06, + "loss": 0.5805, + "step": 12060 + }, + { + "epoch": 1.7523228803716608, + "grad_norm": 1.9335286617279053, + "learning_rate": 1.8736249654791538e-06, + "loss": 0.7541, + "step": 12070 + }, + { + "epoch": 1.753774680603949, + "grad_norm": 1.4929413795471191, + "learning_rate": 1.8519987375456654e-06, + "loss": 0.5656, + "step": 12080 + }, + { + "epoch": 1.755226480836237, + "grad_norm": 1.5122934579849243, + "learning_rate": 1.8304932433704097e-06, + "loss": 0.5031, + "step": 12090 + }, + { + "epoch": 1.7566782810685249, + "grad_norm": 1.0562947988510132, + "learning_rate": 1.809108595120279e-06, + "loss": 0.5551, + "step": 12100 + }, + { + "epoch": 1.7581300813008132, + "grad_norm": 1.0522669553756714, + "learning_rate": 1.7878449043318534e-06, + "loss": 0.4314, + "step": 12110 + }, + { + "epoch": 1.759581881533101, + "grad_norm": 0.8575490117073059, + "learning_rate": 1.766702281910837e-06, + "loss": 0.4565, + "step": 12120 + }, + { + "epoch": 1.761033681765389, + "grad_norm": 1.6525681018829346, + "learning_rate": 1.7456808381314583e-06, + "loss": 0.4301, + "step": 12130 + }, + { + "epoch": 1.7624854819976772, + "grad_norm": 1.931264042854309, + "learning_rate": 1.7247806826359375e-06, + "loss": 0.4871, + "step": 12140 + }, + { + "epoch": 1.763937282229965, + "grad_norm": 1.8501496315002441, + "learning_rate": 1.704001924433865e-06, + "loss": 0.657, + "step": 12150 + }, + { + "epoch": 1.7653890824622533, + "grad_norm": 1.272760272026062, + "learning_rate": 1.6833446719016627e-06, + "loss": 0.606, + "step": 12160 + }, + { + "epoch": 1.7668408826945412, + "grad_norm": 0.9544461369514465, + "learning_rate": 1.6628090327820172e-06, + "loss": 0.6067, + "step": 12170 + }, + { + "epoch": 1.7682926829268293, + "grad_norm": 9.855928421020508, + "learning_rate": 1.6423951141833011e-06, + "loss": 0.3548, + "step": 12180 + }, + { + "epoch": 1.7697444831591174, + "grad_norm": 3.282947540283203, + "learning_rate": 1.6221030225790413e-06, + "loss": 0.3999, + "step": 12190 + }, + { + "epoch": 1.7711962833914052, + "grad_norm": 3.6573681831359863, + "learning_rate": 1.6019328638073261e-06, + "loss": 0.5574, + "step": 12200 + }, + { + "epoch": 1.7726480836236935, + "grad_norm": 3.0135483741760254, + "learning_rate": 1.581884743070297e-06, + "loss": 0.7062, + "step": 12210 + }, + { + "epoch": 1.7740998838559814, + "grad_norm": 
1.4390877485275269, + "learning_rate": 1.5619587649335605e-06, + "loss": 0.5718, + "step": 12220 + }, + { + "epoch": 1.7755516840882695, + "grad_norm": 0.5463725328445435, + "learning_rate": 1.5421550333256734e-06, + "loss": 0.609, + "step": 12230 + }, + { + "epoch": 1.7770034843205575, + "grad_norm": 2.245574712753296, + "learning_rate": 1.5224736515375814e-06, + "loss": 0.5087, + "step": 12240 + }, + { + "epoch": 1.7784552845528454, + "grad_norm": 1.2518837451934814, + "learning_rate": 1.502914722222079e-06, + "loss": 0.6448, + "step": 12250 + }, + { + "epoch": 1.7799070847851337, + "grad_norm": 1.8152894973754883, + "learning_rate": 1.4834783473932994e-06, + "loss": 0.6077, + "step": 12260 + }, + { + "epoch": 1.7813588850174216, + "grad_norm": 1.3485125303268433, + "learning_rate": 1.4641646284261485e-06, + "loss": 0.5192, + "step": 12270 + }, + { + "epoch": 1.7828106852497096, + "grad_norm": 4.338469982147217, + "learning_rate": 1.444973666055796e-06, + "loss": 0.6732, + "step": 12280 + }, + { + "epoch": 1.7842624854819977, + "grad_norm": 0.5736151933670044, + "learning_rate": 1.4259055603771527e-06, + "loss": 0.4268, + "step": 12290 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 2.7274398803710938, + "learning_rate": 1.4069604108443296e-06, + "loss": 0.6137, + "step": 12300 + }, + { + "epoch": 1.787166085946574, + "grad_norm": 1.3027504682540894, + "learning_rate": 1.3881383162701433e-06, + "loss": 0.48, + "step": 12310 + }, + { + "epoch": 1.7886178861788617, + "grad_norm": 1.979504942893982, + "learning_rate": 1.3694393748255902e-06, + "loss": 0.3862, + "step": 12320 + }, + { + "epoch": 1.7900696864111498, + "grad_norm": 2.1074235439300537, + "learning_rate": 1.3508636840393246e-06, + "loss": 0.5215, + "step": 12330 + }, + { + "epoch": 1.791521486643438, + "grad_norm": 2.5477986335754395, + "learning_rate": 1.3324113407971516e-06, + "loss": 0.4583, + "step": 12340 + }, + { + "epoch": 1.7929732868757258, + "grad_norm": 0.5263664126396179, + "learning_rate": 1.314082441341552e-06, + "loss": 0.6051, + "step": 12350 + }, + { + "epoch": 1.794425087108014, + "grad_norm": 1.8962557315826416, + "learning_rate": 1.2958770812711352e-06, + "loss": 0.6069, + "step": 12360 + }, + { + "epoch": 1.795876887340302, + "grad_norm": 2.079145908355713, + "learning_rate": 1.2777953555401678e-06, + "loss": 0.7225, + "step": 12370 + }, + { + "epoch": 1.79732868757259, + "grad_norm": 0.8144702315330505, + "learning_rate": 1.2598373584580824e-06, + "loss": 0.474, + "step": 12380 + }, + { + "epoch": 1.798780487804878, + "grad_norm": 6.011617660522461, + "learning_rate": 1.2420031836889668e-06, + "loss": 0.4614, + "step": 12390 + }, + { + "epoch": 1.800232288037166, + "grad_norm": 2.0123348236083984, + "learning_rate": 1.224292924251083e-06, + "loss": 0.4744, + "step": 12400 + }, + { + "epoch": 1.8016840882694543, + "grad_norm": 1.1453293561935425, + "learning_rate": 1.2067066725163946e-06, + "loss": 0.7232, + "step": 12410 + }, + { + "epoch": 1.8031358885017421, + "grad_norm": 3.011850357055664, + "learning_rate": 1.1892445202100643e-06, + "loss": 0.6242, + "step": 12420 + }, + { + "epoch": 1.8045876887340302, + "grad_norm": 1.3641666173934937, + "learning_rate": 1.1719065584099881e-06, + "loss": 0.6855, + "step": 12430 + }, + { + "epoch": 1.8060394889663183, + "grad_norm": 2.096034288406372, + "learning_rate": 1.1546928775463234e-06, + "loss": 0.5658, + "step": 12440 + }, + { + "epoch": 1.8074912891986061, + "grad_norm": 1.173338770866394, + "learning_rate": 1.137603567401005e-06, + "loss": 0.6926, + 
"step": 12450 + }, + { + "epoch": 1.8089430894308944, + "grad_norm": 7.650896072387695, + "learning_rate": 1.1206387171072808e-06, + "loss": 0.5958, + "step": 12460 + }, + { + "epoch": 1.8103948896631823, + "grad_norm": 4.46699857711792, + "learning_rate": 1.1037984151492624e-06, + "loss": 0.4605, + "step": 12470 + }, + { + "epoch": 1.8118466898954704, + "grad_norm": 2.164135217666626, + "learning_rate": 1.0870827493614344e-06, + "loss": 0.665, + "step": 12480 + }, + { + "epoch": 1.8132984901277585, + "grad_norm": 1.531969428062439, + "learning_rate": 1.0704918069282226e-06, + "loss": 0.4462, + "step": 12490 + }, + { + "epoch": 1.8147502903600463, + "grad_norm": 1.4021626710891724, + "learning_rate": 1.0540256743835297e-06, + "loss": 0.5399, + "step": 12500 + }, + { + "epoch": 1.8162020905923346, + "grad_norm": 1.4886596202850342, + "learning_rate": 1.0376844376102784e-06, + "loss": 0.5748, + "step": 12510 + }, + { + "epoch": 1.8176538908246225, + "grad_norm": 2.1300623416900635, + "learning_rate": 1.0214681818399712e-06, + "loss": 0.6041, + "step": 12520 + }, + { + "epoch": 1.8191056910569106, + "grad_norm": 2.5872268676757812, + "learning_rate": 1.0053769916522488e-06, + "loss": 0.6594, + "step": 12530 + }, + { + "epoch": 1.8205574912891986, + "grad_norm": 0.8167919516563416, + "learning_rate": 9.894109509744342e-07, + "loss": 0.5516, + "step": 12540 + }, + { + "epoch": 1.8220092915214865, + "grad_norm": 1.7698092460632324, + "learning_rate": 9.735701430811067e-07, + "loss": 0.4946, + "step": 12550 + }, + { + "epoch": 1.8234610917537748, + "grad_norm": 1.8498523235321045, + "learning_rate": 9.578546505936676e-07, + "loss": 0.6975, + "step": 12560 + }, + { + "epoch": 1.8249128919860627, + "grad_norm": 2.3681557178497314, + "learning_rate": 9.422645554799048e-07, + "loss": 0.7246, + "step": 12570 + }, + { + "epoch": 1.8263646922183507, + "grad_norm": 1.6103743314743042, + "learning_rate": 9.267999390535659e-07, + "loss": 0.597, + "step": 12580 + }, + { + "epoch": 1.8278164924506388, + "grad_norm": 1.0685875415802002, + "learning_rate": 9.11460881973944e-07, + "loss": 0.4749, + "step": 12590 + }, + { + "epoch": 1.8292682926829267, + "grad_norm": 1.7253142595291138, + "learning_rate": 8.962474642454338e-07, + "loss": 0.7401, + "step": 12600 + }, + { + "epoch": 1.830720092915215, + "grad_norm": 3.835946798324585, + "learning_rate": 8.811597652171377e-07, + "loss": 0.5107, + "step": 12610 + }, + { + "epoch": 1.8321718931475028, + "grad_norm": 3.366118907928467, + "learning_rate": 8.661978635824464e-07, + "loss": 0.5523, + "step": 12620 + }, + { + "epoch": 1.833623693379791, + "grad_norm": 1.4780124425888062, + "learning_rate": 8.513618373786198e-07, + "loss": 0.4592, + "step": 12630 + }, + { + "epoch": 1.835075493612079, + "grad_norm": 1.5837668180465698, + "learning_rate": 8.366517639863819e-07, + "loss": 0.5838, + "step": 12640 + }, + { + "epoch": 1.8365272938443669, + "grad_norm": 5.799704551696777, + "learning_rate": 8.220677201295296e-07, + "loss": 0.7116, + "step": 12650 + }, + { + "epoch": 1.8379790940766552, + "grad_norm": 1.668204665184021, + "learning_rate": 8.076097818745188e-07, + "loss": 0.5013, + "step": 12660 + }, + { + "epoch": 1.839430894308943, + "grad_norm": 2.408761501312256, + "learning_rate": 7.932780246300703e-07, + "loss": 0.4475, + "step": 12670 + }, + { + "epoch": 1.840882694541231, + "grad_norm": 1.6833887100219727, + "learning_rate": 7.790725231467844e-07, + "loss": 0.3637, + "step": 12680 + }, + { + "epoch": 1.8423344947735192, + "grad_norm": 2.4150564670562744, + 
"learning_rate": 7.649933515167407e-07, + "loss": 0.5217, + "step": 12690 + }, + { + "epoch": 1.843786295005807, + "grad_norm": 0.23005646467208862, + "learning_rate": 7.510405831731155e-07, + "loss": 0.7733, + "step": 12700 + }, + { + "epoch": 1.8452380952380953, + "grad_norm": 5.178878307342529, + "learning_rate": 7.372142908898038e-07, + "loss": 0.5562, + "step": 12710 + }, + { + "epoch": 1.8466898954703832, + "grad_norm": 1.1063512563705444, + "learning_rate": 7.235145467810344e-07, + "loss": 0.6543, + "step": 12720 + }, + { + "epoch": 1.8481416957026713, + "grad_norm": 1.4870764017105103, + "learning_rate": 7.099414223009859e-07, + "loss": 0.5468, + "step": 12730 + }, + { + "epoch": 1.8495934959349594, + "grad_norm": 0.8903436660766602, + "learning_rate": 6.964949882434402e-07, + "loss": 0.441, + "step": 12740 + }, + { + "epoch": 1.8510452961672472, + "grad_norm": 2.586010694503784, + "learning_rate": 6.831753147413827e-07, + "loss": 0.7283, + "step": 12750 + }, + { + "epoch": 1.8524970963995355, + "grad_norm": 7.29203987121582, + "learning_rate": 6.699824712666503e-07, + "loss": 0.5616, + "step": 12760 + }, + { + "epoch": 1.8539488966318234, + "grad_norm": 2.853286027908325, + "learning_rate": 6.569165266295779e-07, + "loss": 0.6829, + "step": 12770 + }, + { + "epoch": 1.8554006968641115, + "grad_norm": 1.3794944286346436, + "learning_rate": 6.439775489786193e-07, + "loss": 0.6023, + "step": 12780 + }, + { + "epoch": 1.8568524970963995, + "grad_norm": 1.2747677564620972, + "learning_rate": 6.311656058000076e-07, + "loss": 0.5941, + "step": 12790 + }, + { + "epoch": 1.8583042973286876, + "grad_norm": 1.2692632675170898, + "learning_rate": 6.184807639173979e-07, + "loss": 0.54, + "step": 12800 + }, + { + "epoch": 1.8597560975609757, + "grad_norm": 3.867017984390259, + "learning_rate": 6.059230894915224e-07, + "loss": 0.4035, + "step": 12810 + }, + { + "epoch": 1.8612078977932636, + "grad_norm": 5.028682708740234, + "learning_rate": 5.934926480198333e-07, + "loss": 0.6283, + "step": 12820 + }, + { + "epoch": 1.8626596980255516, + "grad_norm": 1.321096658706665, + "learning_rate": 5.811895043361742e-07, + "loss": 0.3401, + "step": 12830 + }, + { + "epoch": 1.8641114982578397, + "grad_norm": 1.8228753805160522, + "learning_rate": 5.690137226104481e-07, + "loss": 0.6275, + "step": 12840 + }, + { + "epoch": 1.8655632984901278, + "grad_norm": 1.2849879264831543, + "learning_rate": 5.569653663482527e-07, + "loss": 0.5017, + "step": 12850 + }, + { + "epoch": 1.8670150987224159, + "grad_norm": 1.4431346654891968, + "learning_rate": 5.450444983905845e-07, + "loss": 0.4334, + "step": 12860 + }, + { + "epoch": 1.8684668989547037, + "grad_norm": 3.2771754264831543, + "learning_rate": 5.332511809134883e-07, + "loss": 0.5051, + "step": 12870 + }, + { + "epoch": 1.8699186991869918, + "grad_norm": 0.42705005407333374, + "learning_rate": 5.215854754277382e-07, + "loss": 0.5255, + "step": 12880 + }, + { + "epoch": 1.87137049941928, + "grad_norm": 1.7732504606246948, + "learning_rate": 5.100474427785245e-07, + "loss": 0.5235, + "step": 12890 + }, + { + "epoch": 1.872822299651568, + "grad_norm": 1.3279632329940796, + "learning_rate": 4.986371431451254e-07, + "loss": 0.7319, + "step": 12900 + }, + { + "epoch": 1.874274099883856, + "grad_norm": 3.910167694091797, + "learning_rate": 4.87354636040599e-07, + "loss": 0.4989, + "step": 12910 + }, + { + "epoch": 1.875725900116144, + "grad_norm": 4.26170015335083, + "learning_rate": 4.7619998031147304e-07, + "loss": 0.3566, + "step": 12920 + }, + { + "epoch": 
1.877177700348432, + "grad_norm": 1.6784484386444092, + "learning_rate": 4.651732341374365e-07, + "loss": 0.4187, + "step": 12930 + }, + { + "epoch": 1.87862950058072, + "grad_norm": 2.036226511001587, + "learning_rate": 4.5427445503103684e-07, + "loss": 0.504, + "step": 12940 + }, + { + "epoch": 1.8800813008130082, + "grad_norm": 1.2957289218902588, + "learning_rate": 4.435036998373776e-07, + "loss": 0.419, + "step": 12950 + }, + { + "epoch": 1.8815331010452963, + "grad_norm": 1.349066972732544, + "learning_rate": 4.3286102473382994e-07, + "loss": 0.3789, + "step": 12960 + }, + { + "epoch": 1.8829849012775841, + "grad_norm": 8.302398681640625, + "learning_rate": 4.2234648522972156e-07, + "loss": 0.3884, + "step": 12970 + }, + { + "epoch": 1.8844367015098722, + "grad_norm": 4.297000885009766, + "learning_rate": 4.11960136166073e-07, + "loss": 0.4167, + "step": 12980 + }, + { + "epoch": 1.8858885017421603, + "grad_norm": 0.8533451557159424, + "learning_rate": 4.0170203171528974e-07, + "loss": 0.3217, + "step": 12990 + }, + { + "epoch": 1.8873403019744484, + "grad_norm": 1.4826991558074951, + "learning_rate": 3.9157222538088454e-07, + "loss": 0.3191, + "step": 13000 + }, + { + "epoch": 1.8887921022067364, + "grad_norm": 2.1450116634368896, + "learning_rate": 3.815707699972165e-07, + "loss": 0.3731, + "step": 13010 + }, + { + "epoch": 1.8902439024390243, + "grad_norm": 1.522496223449707, + "learning_rate": 3.716977177291886e-07, + "loss": 0.5099, + "step": 13020 + }, + { + "epoch": 1.8916957026713124, + "grad_norm": 0.7258571982383728, + "learning_rate": 3.619531200719839e-07, + "loss": 0.5385, + "step": 13030 + }, + { + "epoch": 1.8931475029036005, + "grad_norm": 3.8711488246917725, + "learning_rate": 3.5233702785081035e-07, + "loss": 0.3283, + "step": 13040 + }, + { + "epoch": 1.8945993031358885, + "grad_norm": 1.6500895023345947, + "learning_rate": 3.428494912206259e-07, + "loss": 0.2962, + "step": 13050 + }, + { + "epoch": 1.8960511033681766, + "grad_norm": 1.1146255731582642, + "learning_rate": 3.334905596658666e-07, + "loss": 0.7563, + "step": 13060 + }, + { + "epoch": 1.8975029036004645, + "grad_norm": 1.4949523210525513, + "learning_rate": 3.242602820002161e-07, + "loss": 0.4394, + "step": 13070 + }, + { + "epoch": 1.8989547038327528, + "grad_norm": 3.016923189163208, + "learning_rate": 3.1515870636631696e-07, + "loss": 0.5093, + "step": 13080 + }, + { + "epoch": 1.9004065040650406, + "grad_norm": 1.2064356803894043, + "learning_rate": 3.061858802355433e-07, + "loss": 0.5408, + "step": 13090 + }, + { + "epoch": 1.9018583042973287, + "grad_norm": 2.9825875759124756, + "learning_rate": 2.97341850407748e-07, + "loss": 0.4946, + "step": 13100 + }, + { + "epoch": 1.9033101045296168, + "grad_norm": 2.217625617980957, + "learning_rate": 2.886266630110185e-07, + "loss": 0.5713, + "step": 13110 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 1.1467139720916748, + "learning_rate": 2.8004036350142705e-07, + "loss": 0.4261, + "step": 13120 + }, + { + "epoch": 1.906213704994193, + "grad_norm": 2.285097599029541, + "learning_rate": 2.7158299666280864e-07, + "loss": 0.53, + "step": 13130 + }, + { + "epoch": 1.9076655052264808, + "grad_norm": 3.382395029067993, + "learning_rate": 2.6325460660651393e-07, + "loss": 0.761, + "step": 13140 + }, + { + "epoch": 1.909117305458769, + "grad_norm": 2.245380163192749, + "learning_rate": 2.550552367711956e-07, + "loss": 0.6742, + "step": 13150 + }, + { + "epoch": 1.910569105691057, + "grad_norm": 2.25600528717041, + "learning_rate": 2.469849299225585e-07, 
+ "loss": 0.5978, + "step": 13160 + }, + { + "epoch": 1.9120209059233448, + "grad_norm": 1.8853719234466553, + "learning_rate": 2.390437281531627e-07, + "loss": 0.422, + "step": 13170 + }, + { + "epoch": 1.9134727061556331, + "grad_norm": 1.3422927856445312, + "learning_rate": 2.3123167288217618e-07, + "loss": 0.6992, + "step": 13180 + }, + { + "epoch": 1.914924506387921, + "grad_norm": 1.3293629884719849, + "learning_rate": 2.2354880485518648e-07, + "loss": 0.3887, + "step": 13190 + }, + { + "epoch": 1.916376306620209, + "grad_norm": 1.4933987855911255, + "learning_rate": 2.1599516414396726e-07, + "loss": 0.7679, + "step": 13200 + }, + { + "epoch": 1.9178281068524972, + "grad_norm": 2.126613140106201, + "learning_rate": 2.0857079014628135e-07, + "loss": 0.5869, + "step": 13210 + }, + { + "epoch": 1.919279907084785, + "grad_norm": 1.1330955028533936, + "learning_rate": 2.0127572158566976e-07, + "loss": 0.6385, + "step": 13220 + }, + { + "epoch": 1.9207317073170733, + "grad_norm": 3.0028634071350098, + "learning_rate": 1.9410999651125196e-07, + "loss": 0.3487, + "step": 13230 + }, + { + "epoch": 1.9221835075493612, + "grad_norm": 3.6127331256866455, + "learning_rate": 1.8707365229752306e-07, + "loss": 0.3929, + "step": 13240 + }, + { + "epoch": 1.9236353077816493, + "grad_norm": 3.5471410751342773, + "learning_rate": 1.8016672564416526e-07, + "loss": 0.4829, + "step": 13250 + }, + { + "epoch": 1.9250871080139373, + "grad_norm": 2.57403564453125, + "learning_rate": 1.7338925257585626e-07, + "loss": 0.4579, + "step": 13260 + }, + { + "epoch": 1.9265389082462252, + "grad_norm": 0.892292320728302, + "learning_rate": 1.6674126844207215e-07, + "loss": 0.6123, + "step": 13270 + }, + { + "epoch": 1.9279907084785135, + "grad_norm": 5.541355609893799, + "learning_rate": 1.6022280791691547e-07, + "loss": 0.5871, + "step": 13280 + }, + { + "epoch": 1.9294425087108014, + "grad_norm": 4.043185234069824, + "learning_rate": 1.5383390499892625e-07, + "loss": 0.7962, + "step": 13290 + }, + { + "epoch": 1.9308943089430894, + "grad_norm": 3.4358088970184326, + "learning_rate": 1.4757459301089904e-07, + "loss": 0.8971, + "step": 13300 + }, + { + "epoch": 1.9323461091753775, + "grad_norm": 2.0731537342071533, + "learning_rate": 1.414449045997357e-07, + "loss": 0.587, + "step": 13310 + }, + { + "epoch": 1.9337979094076654, + "grad_norm": 2.296551465988159, + "learning_rate": 1.3544487173623443e-07, + "loss": 0.6924, + "step": 13320 + }, + { + "epoch": 1.9352497096399537, + "grad_norm": 1.4577088356018066, + "learning_rate": 1.295745257149622e-07, + "loss": 0.607, + "step": 13330 + }, + { + "epoch": 1.9367015098722415, + "grad_norm": 2.300415277481079, + "learning_rate": 1.2383389715406592e-07, + "loss": 0.584, + "step": 13340 + }, + { + "epoch": 1.9381533101045296, + "grad_norm": 5.345014572143555, + "learning_rate": 1.1822301599511976e-07, + "loss": 0.5331, + "step": 13350 + }, + { + "epoch": 1.9396051103368177, + "grad_norm": 1.9530677795410156, + "learning_rate": 1.1274191150297542e-07, + "loss": 0.4817, + "step": 13360 + }, + { + "epoch": 1.9410569105691056, + "grad_norm": 2.270688533782959, + "learning_rate": 1.0739061226560099e-07, + "loss": 0.3805, + "step": 13370 + }, + { + "epoch": 1.9425087108013939, + "grad_norm": 4.100897312164307, + "learning_rate": 1.021691461939367e-07, + "loss": 0.5722, + "step": 13380 + }, + { + "epoch": 1.9439605110336817, + "grad_norm": 1.5650062561035156, + "learning_rate": 9.707754052174777e-08, + "loss": 0.4488, + "step": 13390 + }, + { + "epoch": 1.9454123112659698, + 
"grad_norm": 3.1894681453704834, + "learning_rate": 9.211582180548295e-08, + "loss": 0.8613, + "step": 13400 + }, + { + "epoch": 1.9468641114982579, + "grad_norm": 4.314057350158691, + "learning_rate": 8.728401592413283e-08, + "loss": 0.5325, + "step": 13410 + }, + { + "epoch": 1.9483159117305457, + "grad_norm": 1.2972936630249023, + "learning_rate": 8.258214807909947e-08, + "loss": 0.3854, + "step": 13420 + }, + { + "epoch": 1.949767711962834, + "grad_norm": 1.0964834690093994, + "learning_rate": 7.801024279406599e-08, + "loss": 0.5104, + "step": 13430 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 2.3164939880371094, + "learning_rate": 7.356832391485769e-08, + "loss": 0.5469, + "step": 13440 + }, + { + "epoch": 1.95267131242741, + "grad_norm": 0.8079650402069092, + "learning_rate": 6.925641460933107e-08, + "loss": 0.5622, + "step": 13450 + }, + { + "epoch": 1.954123112659698, + "grad_norm": 0.8280888795852661, + "learning_rate": 6.5074537367249e-08, + "loss": 0.4257, + "step": 13460 + }, + { + "epoch": 1.955574912891986, + "grad_norm": 1.360197901725769, + "learning_rate": 6.102271400016124e-08, + "loss": 0.4915, + "step": 13470 + }, + { + "epoch": 1.9570267131242742, + "grad_norm": 4.283596992492676, + "learning_rate": 5.710096564128797e-08, + "loss": 0.4378, + "step": 13480 + }, + { + "epoch": 1.958478513356562, + "grad_norm": 1.3778785467147827, + "learning_rate": 5.3309312745419835e-08, + "loss": 0.5003, + "step": 13490 + }, + { + "epoch": 1.9599303135888502, + "grad_norm": 2.252000093460083, + "learning_rate": 4.9647775088793035e-08, + "loss": 0.5867, + "step": 13500 + }, + { + "epoch": 1.9613821138211383, + "grad_norm": 2.1115314960479736, + "learning_rate": 4.611637176901162e-08, + "loss": 0.6936, + "step": 13510 + }, + { + "epoch": 1.962833914053426, + "grad_norm": 0.7023373246192932, + "learning_rate": 4.2715121204922606e-08, + "loss": 0.4358, + "step": 13520 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 2.6030709743499756, + "learning_rate": 3.944404113653544e-08, + "loss": 0.6004, + "step": 13530 + }, + { + "epoch": 1.9657375145180023, + "grad_norm": 1.5096830129623413, + "learning_rate": 3.630314862492767e-08, + "loss": 0.66, + "step": 13540 + }, + { + "epoch": 1.9671893147502904, + "grad_norm": 2.45739483833313, + "learning_rate": 3.3292460052147814e-08, + "loss": 0.5751, + "step": 13550 + }, + { + "epoch": 1.9686411149825784, + "grad_norm": 1.5947012901306152, + "learning_rate": 3.0411991121143124e-08, + "loss": 0.4873, + "step": 13560 + }, + { + "epoch": 1.9700929152148663, + "grad_norm": 4.944673538208008, + "learning_rate": 2.76617568556653e-08, + "loss": 0.5386, + "step": 13570 + }, + { + "epoch": 1.9715447154471546, + "grad_norm": 1.3134737014770508, + "learning_rate": 2.5041771600195496e-08, + "loss": 0.596, + "step": 13580 + }, + { + "epoch": 1.9729965156794425, + "grad_norm": 1.0245544910430908, + "learning_rate": 2.2552049019874955e-08, + "loss": 0.7618, + "step": 13590 + }, + { + "epoch": 1.9744483159117305, + "grad_norm": 1.1660908460617065, + "learning_rate": 2.0192602100424507e-08, + "loss": 0.5421, + "step": 13600 + }, + { + "epoch": 1.9759001161440186, + "grad_norm": 2.883108377456665, + "learning_rate": 1.796344314809184e-08, + "loss": 0.7773, + "step": 13610 + }, + { + "epoch": 1.9773519163763065, + "grad_norm": 1.4589025974273682, + "learning_rate": 1.5864583789565457e-08, + "loss": 0.7103, + "step": 13620 + }, + { + "epoch": 1.9788037166085948, + "grad_norm": 0.33630794286727905, + "learning_rate": 1.3896034971935812e-08, + "loss": 0.5487, 
+ "step": 13630 + }, + { + "epoch": 1.9802555168408826, + "grad_norm": 8.537004470825195, + "learning_rate": 1.2057806962625928e-08, + "loss": 0.7031, + "step": 13640 + }, + { + "epoch": 1.9817073170731707, + "grad_norm": 2.483400583267212, + "learning_rate": 1.0349909349333109e-08, + "loss": 0.5642, + "step": 13650 + }, + { + "epoch": 1.9831591173054588, + "grad_norm": 3.4803388118743896, + "learning_rate": 8.77235104000118e-09, + "loss": 0.3511, + "step": 13660 + }, + { + "epoch": 1.9846109175377467, + "grad_norm": 3.3463430404663086, + "learning_rate": 7.3251402627427805e-09, + "loss": 0.3859, + "step": 13670 + }, + { + "epoch": 1.986062717770035, + "grad_norm": 2.710141897201538, + "learning_rate": 6.008284565825473e-09, + "loss": 0.5433, + "step": 13680 + }, + { + "epoch": 1.9875145180023228, + "grad_norm": 1.5595346689224243, + "learning_rate": 4.82179081761347e-09, + "loss": 0.5447, + "step": 13690 + }, + { + "epoch": 1.988966318234611, + "grad_norm": 2.3480842113494873, + "learning_rate": 3.76566520653987e-09, + "loss": 0.4713, + "step": 13700 + }, + { + "epoch": 1.990418118466899, + "grad_norm": 0.828123152256012, + "learning_rate": 2.8399132410733553e-09, + "loss": 0.5635, + "step": 13710 + }, + { + "epoch": 1.9918699186991868, + "grad_norm": 2.631500720977783, + "learning_rate": 2.044539749684882e-09, + "loss": 0.5184, + "step": 13720 + }, + { + "epoch": 1.9933217189314751, + "grad_norm": 2.2084085941314697, + "learning_rate": 1.3795488808310274e-09, + "loss": 0.5075, + "step": 13730 + }, + { + "epoch": 1.994773519163763, + "grad_norm": 2.0961720943450928, + "learning_rate": 8.449441029234617e-10, + "loss": 0.9044, + "step": 13740 + }, + { + "epoch": 1.996225319396051, + "grad_norm": 1.8734853267669678, + "learning_rate": 4.4072820432061733e-10, + "loss": 0.4804, + "step": 13750 + }, + { + "epoch": 1.9976771196283392, + "grad_norm": 1.1035456657409668, + "learning_rate": 1.6690329330271147e-10, + "loss": 0.4869, + "step": 13760 + }, + { + "epoch": 1.999128919860627, + "grad_norm": 3.4342620372772217, + "learning_rate": 2.3470798063418564e-11, + "loss": 0.4943, + "step": 13770 + }, + { + "epoch": 2.0, + "step": 13776, + "total_flos": 2.442921933399982e+18, + "train_loss": 0.63349506684712, + "train_runtime": 13338.7431, + "train_samples_per_second": 4.131, + "train_steps_per_second": 1.033 + } + ], + "logging_steps": 10, + "max_steps": 13776, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3000, + "total_flos": 2.442921933399982e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}