{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.981640146878824, "eval_steps": 500, "global_step": 1530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009791921664626682, "grad_norm": 4.1982927322387695, "learning_rate": 4.9999947298042775e-06, "loss": 1.1086, "step": 1 }, { "epoch": 0.019583843329253364, "grad_norm": 3.8332724571228027, "learning_rate": 4.99997891923933e-06, "loss": 0.889, "step": 2 }, { "epoch": 0.02937576499388005, "grad_norm": 3.872532844543457, "learning_rate": 4.999952568371817e-06, "loss": 1.1263, "step": 3 }, { "epoch": 0.03916768665850673, "grad_norm": 3.594716787338257, "learning_rate": 4.999915677312839e-06, "loss": 1.043, "step": 4 }, { "epoch": 0.04895960832313342, "grad_norm": 3.4819302558898926, "learning_rate": 4.9998682462179335e-06, "loss": 1.1027, "step": 5 }, { "epoch": 0.0587515299877601, "grad_norm": 3.312889337539673, "learning_rate": 4.999810275287077e-06, "loss": 0.9266, "step": 6 }, { "epoch": 0.06854345165238677, "grad_norm": 2.689910888671875, "learning_rate": 4.9997417647646845e-06, "loss": 0.9915, "step": 7 }, { "epoch": 0.07833537331701346, "grad_norm": 2.9629459381103516, "learning_rate": 4.9996627149396075e-06, "loss": 0.9476, "step": 8 }, { "epoch": 0.08812729498164015, "grad_norm": 3.5845913887023926, "learning_rate": 4.999573126145132e-06, "loss": 0.9684, "step": 9 }, { "epoch": 0.09791921664626684, "grad_norm": 2.85638689994812, "learning_rate": 4.999472998758979e-06, "loss": 1.1772, "step": 10 }, { "epoch": 0.10771113831089352, "grad_norm": 3.0446324348449707, "learning_rate": 4.9993623332033e-06, "loss": 1.0668, "step": 11 }, { "epoch": 0.1175030599755202, "grad_norm": 2.287930965423584, "learning_rate": 4.99924112994468e-06, "loss": 1.0948, "step": 12 }, { "epoch": 0.12729498164014688, "grad_norm": 2.3001797199249268, "learning_rate": 4.999109389494129e-06, "loss": 0.8348, "step": 13 }, { "epoch": 0.13708690330477355, "grad_norm": 2.1713125705718994, "learning_rate": 4.998967112407087e-06, "loss": 0.8579, "step": 14 }, { "epoch": 0.14687882496940025, "grad_norm": 2.4115211963653564, "learning_rate": 4.998814299283415e-06, "loss": 1.0831, "step": 15 }, { "epoch": 0.15667074663402691, "grad_norm": 2.2919187545776367, "learning_rate": 4.9986509507673986e-06, "loss": 0.8667, "step": 16 }, { "epoch": 0.1664626682986536, "grad_norm": 2.3826684951782227, "learning_rate": 4.99847706754774e-06, "loss": 0.8568, "step": 17 }, { "epoch": 0.1762545899632803, "grad_norm": 2.4488134384155273, "learning_rate": 4.998292650357558e-06, "loss": 1.1822, "step": 18 }, { "epoch": 0.18604651162790697, "grad_norm": 2.498718023300171, "learning_rate": 4.998097699974383e-06, "loss": 0.9499, "step": 19 }, { "epoch": 0.19583843329253367, "grad_norm": 2.5751450061798096, "learning_rate": 4.99789221722016e-06, "loss": 1.0277, "step": 20 }, { "epoch": 0.20563035495716034, "grad_norm": 1.9550775289535522, "learning_rate": 4.997676202961234e-06, "loss": 0.9181, "step": 21 }, { "epoch": 0.21542227662178703, "grad_norm": 2.3730952739715576, "learning_rate": 4.997449658108354e-06, "loss": 0.8664, "step": 22 }, { "epoch": 0.2252141982864137, "grad_norm": 2.253598690032959, "learning_rate": 4.99721258361667e-06, "loss": 0.8962, "step": 23 }, { "epoch": 0.2350061199510404, "grad_norm": 2.281116247177124, "learning_rate": 4.996964980485725e-06, "loss": 1.1838, "step": 24 }, { "epoch": 0.24479804161566707, "grad_norm": 2.0912861824035645, "learning_rate": 4.996706849759453e-06, "loss": 1.1451, "step": 25 }, { "epoch": 0.25458996328029376, "grad_norm": 2.2522265911102295, "learning_rate": 4.996438192526173e-06, "loss": 1.0119, "step": 26 }, { "epoch": 0.26438188494492043, "grad_norm": 1.9836186170578003, "learning_rate": 4.996159009918586e-06, "loss": 1.0591, "step": 27 }, { "epoch": 0.2741738066095471, "grad_norm": 2.303683280944824, "learning_rate": 4.995869303113768e-06, "loss": 0.8296, "step": 28 }, { "epoch": 0.2839657282741738, "grad_norm": 1.872109055519104, "learning_rate": 4.995569073333172e-06, "loss": 0.904, "step": 29 }, { "epoch": 0.2937576499388005, "grad_norm": 2.2063632011413574, "learning_rate": 4.995258321842611e-06, "loss": 0.8231, "step": 30 }, { "epoch": 0.30354957160342716, "grad_norm": 2.0881803035736084, "learning_rate": 4.994937049952262e-06, "loss": 0.8387, "step": 31 }, { "epoch": 0.31334149326805383, "grad_norm": 2.151073932647705, "learning_rate": 4.994605259016658e-06, "loss": 0.8802, "step": 32 }, { "epoch": 0.32313341493268055, "grad_norm": 1.7886265516281128, "learning_rate": 4.994262950434683e-06, "loss": 0.9016, "step": 33 }, { "epoch": 0.3329253365973072, "grad_norm": 1.8912667036056519, "learning_rate": 4.993910125649561e-06, "loss": 0.991, "step": 34 }, { "epoch": 0.3427172582619339, "grad_norm": 2.0813636779785156, "learning_rate": 4.993546786148858e-06, "loss": 0.8743, "step": 35 }, { "epoch": 0.3525091799265606, "grad_norm": 1.942328691482544, "learning_rate": 4.99317293346447e-06, "loss": 0.8598, "step": 36 }, { "epoch": 0.3623011015911873, "grad_norm": 2.016279935836792, "learning_rate": 4.99278856917262e-06, "loss": 1.075, "step": 37 }, { "epoch": 0.37209302325581395, "grad_norm": 1.8362637758255005, "learning_rate": 4.992393694893844e-06, "loss": 1.0846, "step": 38 }, { "epoch": 0.3818849449204406, "grad_norm": 1.7136290073394775, "learning_rate": 4.991988312292998e-06, "loss": 0.9098, "step": 39 }, { "epoch": 0.39167686658506734, "grad_norm": 2.099512815475464, "learning_rate": 4.991572423079236e-06, "loss": 1.221, "step": 40 }, { "epoch": 0.401468788249694, "grad_norm": 1.7864044904708862, "learning_rate": 4.9911460290060135e-06, "loss": 0.957, "step": 41 }, { "epoch": 0.4112607099143207, "grad_norm": 2.0868992805480957, "learning_rate": 4.990709131871074e-06, "loss": 0.8932, "step": 42 }, { "epoch": 0.42105263157894735, "grad_norm": 1.911013126373291, "learning_rate": 4.990261733516445e-06, "loss": 0.9521, "step": 43 }, { "epoch": 0.43084455324357407, "grad_norm": 1.721886157989502, "learning_rate": 4.989803835828426e-06, "loss": 0.9122, "step": 44 }, { "epoch": 0.44063647490820074, "grad_norm": 1.626144289970398, "learning_rate": 4.989335440737587e-06, "loss": 0.788, "step": 45 }, { "epoch": 0.4504283965728274, "grad_norm": 1.6793444156646729, "learning_rate": 4.988856550218755e-06, "loss": 0.7455, "step": 46 }, { "epoch": 0.4602203182374541, "grad_norm": 1.8586649894714355, "learning_rate": 4.988367166291006e-06, "loss": 0.7887, "step": 47 }, { "epoch": 0.4700122399020808, "grad_norm": 2.406543254852295, "learning_rate": 4.987867291017662e-06, "loss": 0.9364, "step": 48 }, { "epoch": 0.47980416156670747, "grad_norm": 1.8706172704696655, "learning_rate": 4.987356926506273e-06, "loss": 0.8364, "step": 49 }, { "epoch": 0.48959608323133413, "grad_norm": 1.8031095266342163, "learning_rate": 4.986836074908616e-06, "loss": 0.9586, "step": 50 }, { "epoch": 0.49938800489596086, "grad_norm": 1.9483214616775513, "learning_rate": 4.986304738420684e-06, "loss": 0.7822, "step": 51 }, { "epoch": 0.5091799265605875, "grad_norm": 1.886414885520935, "learning_rate": 4.985762919282674e-06, "loss": 0.9397, "step": 52 }, { "epoch": 0.5189718482252142, "grad_norm": 2.3802127838134766, "learning_rate": 4.9852106197789804e-06, "loss": 1.151, "step": 53 }, { "epoch": 0.5287637698898409, "grad_norm": 1.756941318511963, "learning_rate": 4.984647842238185e-06, "loss": 0.7922, "step": 54 }, { "epoch": 0.5385556915544676, "grad_norm": 1.743428111076355, "learning_rate": 4.984074589033045e-06, "loss": 0.8244, "step": 55 }, { "epoch": 0.5483476132190942, "grad_norm": 1.809975266456604, "learning_rate": 4.983490862580486e-06, "loss": 0.7307, "step": 56 }, { "epoch": 0.5581395348837209, "grad_norm": 1.9696334600448608, "learning_rate": 4.982896665341591e-06, "loss": 0.9661, "step": 57 }, { "epoch": 0.5679314565483476, "grad_norm": 1.771151065826416, "learning_rate": 4.982291999821587e-06, "loss": 0.9017, "step": 58 }, { "epoch": 0.5777233782129743, "grad_norm": 1.6412746906280518, "learning_rate": 4.98167686856984e-06, "loss": 0.9265, "step": 59 }, { "epoch": 0.587515299877601, "grad_norm": 1.8410847187042236, "learning_rate": 4.98105127417984e-06, "loss": 0.9575, "step": 60 }, { "epoch": 0.5973072215422277, "grad_norm": 2.031231641769409, "learning_rate": 4.980415219289189e-06, "loss": 0.8345, "step": 61 }, { "epoch": 0.6070991432068543, "grad_norm": 2.061760425567627, "learning_rate": 4.979768706579595e-06, "loss": 1.0223, "step": 62 }, { "epoch": 0.616891064871481, "grad_norm": 1.8148959875106812, "learning_rate": 4.9791117387768575e-06, "loss": 1.0181, "step": 63 }, { "epoch": 0.6266829865361077, "grad_norm": 1.7690637111663818, "learning_rate": 4.978444318650855e-06, "loss": 0.921, "step": 64 }, { "epoch": 0.6364749082007344, "grad_norm": 1.9219954013824463, "learning_rate": 4.977766449015534e-06, "loss": 1.0737, "step": 65 }, { "epoch": 0.6462668298653611, "grad_norm": 1.9013108015060425, "learning_rate": 4.977078132728901e-06, "loss": 0.922, "step": 66 }, { "epoch": 0.6560587515299877, "grad_norm": 2.0774574279785156, "learning_rate": 4.976379372693005e-06, "loss": 1.1406, "step": 67 }, { "epoch": 0.6658506731946144, "grad_norm": 1.7290092706680298, "learning_rate": 4.975670171853926e-06, "loss": 0.971, "step": 68 }, { "epoch": 0.6756425948592412, "grad_norm": 1.8541407585144043, "learning_rate": 4.974950533201768e-06, "loss": 0.8993, "step": 69 }, { "epoch": 0.6854345165238678, "grad_norm": 1.751027226448059, "learning_rate": 4.9742204597706386e-06, "loss": 0.8674, "step": 70 }, { "epoch": 0.6952264381884945, "grad_norm": 1.9312092065811157, "learning_rate": 4.973479954638642e-06, "loss": 1.1153, "step": 71 }, { "epoch": 0.7050183598531212, "grad_norm": 1.777396321296692, "learning_rate": 4.972729020927866e-06, "loss": 0.8642, "step": 72 }, { "epoch": 0.7148102815177478, "grad_norm": 1.988424301147461, "learning_rate": 4.9719676618043614e-06, "loss": 1.1362, "step": 73 }, { "epoch": 0.7246022031823746, "grad_norm": 2.0535950660705566, "learning_rate": 4.9711958804781385e-06, "loss": 0.989, "step": 74 }, { "epoch": 0.7343941248470012, "grad_norm": 1.7406589984893799, "learning_rate": 4.9704136802031485e-06, "loss": 0.7403, "step": 75 }, { "epoch": 0.7441860465116279, "grad_norm": 1.9246405363082886, "learning_rate": 4.969621064277271e-06, "loss": 0.9888, "step": 76 }, { "epoch": 0.7539779681762546, "grad_norm": 1.8370853662490845, "learning_rate": 4.968818036042299e-06, "loss": 0.884, "step": 77 }, { "epoch": 0.7637698898408812, "grad_norm": 1.6906005144119263, "learning_rate": 4.968004598883923e-06, "loss": 0.9272, "step": 78 }, { "epoch": 0.773561811505508, "grad_norm": 2.2373621463775635, "learning_rate": 4.967180756231723e-06, "loss": 0.9989, "step": 79 }, { "epoch": 0.7833537331701347, "grad_norm": 1.780800223350525, "learning_rate": 4.966346511559149e-06, "loss": 0.7982, "step": 80 }, { "epoch": 0.7931456548347613, "grad_norm": 2.09855055809021, "learning_rate": 4.965501868383507e-06, "loss": 1.1236, "step": 81 }, { "epoch": 0.802937576499388, "grad_norm": 1.6766303777694702, "learning_rate": 4.964646830265944e-06, "loss": 0.8876, "step": 82 }, { "epoch": 0.8127294981640147, "grad_norm": 1.6568055152893066, "learning_rate": 4.963781400811435e-06, "loss": 1.0442, "step": 83 }, { "epoch": 0.8225214198286414, "grad_norm": 1.9741536378860474, "learning_rate": 4.962905583668766e-06, "loss": 1.0185, "step": 84 }, { "epoch": 0.8323133414932681, "grad_norm": 1.6732988357543945, "learning_rate": 4.962019382530521e-06, "loss": 0.8511, "step": 85 }, { "epoch": 0.8421052631578947, "grad_norm": 2.0651164054870605, "learning_rate": 4.961122801133059e-06, "loss": 0.8455, "step": 86 }, { "epoch": 0.8518971848225214, "grad_norm": 1.9031686782836914, "learning_rate": 4.960215843256512e-06, "loss": 0.8027, "step": 87 }, { "epoch": 0.8616891064871481, "grad_norm": 1.6613521575927734, "learning_rate": 4.9592985127247525e-06, "loss": 1.0015, "step": 88 }, { "epoch": 0.8714810281517748, "grad_norm": 1.9487308263778687, "learning_rate": 4.958370813405392e-06, "loss": 0.8869, "step": 89 }, { "epoch": 0.8812729498164015, "grad_norm": 1.6250264644622803, "learning_rate": 4.957432749209755e-06, "loss": 0.7025, "step": 90 }, { "epoch": 0.8910648714810282, "grad_norm": 1.8711061477661133, "learning_rate": 4.956484324092867e-06, "loss": 0.8518, "step": 91 }, { "epoch": 0.9008567931456548, "grad_norm": 1.721203088760376, "learning_rate": 4.955525542053438e-06, "loss": 0.9342, "step": 92 }, { "epoch": 0.9106487148102815, "grad_norm": 1.9115077257156372, "learning_rate": 4.954556407133843e-06, "loss": 0.959, "step": 93 }, { "epoch": 0.9204406364749081, "grad_norm": 1.940711498260498, "learning_rate": 4.953576923420105e-06, "loss": 0.7522, "step": 94 }, { "epoch": 0.9302325581395349, "grad_norm": 2.0221927165985107, "learning_rate": 4.952587095041882e-06, "loss": 1.0285, "step": 95 }, { "epoch": 0.9400244798041616, "grad_norm": 1.9498896598815918, "learning_rate": 4.9515869261724444e-06, "loss": 0.8966, "step": 96 }, { "epoch": 0.9498164014687882, "grad_norm": 1.8775895833969116, "learning_rate": 4.950576421028662e-06, "loss": 0.7945, "step": 97 }, { "epoch": 0.9596083231334149, "grad_norm": 1.9037103652954102, "learning_rate": 4.949555583870983e-06, "loss": 0.9877, "step": 98 }, { "epoch": 0.9694002447980417, "grad_norm": 1.7415046691894531, "learning_rate": 4.948524419003415e-06, "loss": 0.7034, "step": 99 }, { "epoch": 0.9791921664626683, "grad_norm": 1.7894679307937622, "learning_rate": 4.9474829307735115e-06, "loss": 0.7072, "step": 100 }, { "epoch": 0.988984088127295, "grad_norm": 1.84097421169281, "learning_rate": 4.9464311235723504e-06, "loss": 1.1331, "step": 101 }, { "epoch": 0.9987760097919217, "grad_norm": 1.9686108827590942, "learning_rate": 4.9453690018345144e-06, "loss": 0.935, "step": 102 }, { "epoch": 1.0085679314565483, "grad_norm": 3.9489552974700928, "learning_rate": 4.944296570038076e-06, "loss": 1.3136, "step": 103 }, { "epoch": 1.018359853121175, "grad_norm": 1.61304771900177, "learning_rate": 4.943213832704575e-06, "loss": 0.9136, "step": 104 }, { "epoch": 1.0281517747858018, "grad_norm": 1.6916468143463135, "learning_rate": 4.942120794399002e-06, "loss": 0.8965, "step": 105 }, { "epoch": 1.0379436964504285, "grad_norm": 1.5819220542907715, "learning_rate": 4.941017459729778e-06, "loss": 0.883, "step": 106 }, { "epoch": 1.047735618115055, "grad_norm": 1.928499698638916, "learning_rate": 4.939903833348733e-06, "loss": 0.9184, "step": 107 }, { "epoch": 1.0575275397796817, "grad_norm": 1.7820097208023071, "learning_rate": 4.938779919951092e-06, "loss": 0.7877, "step": 108 }, { "epoch": 1.0673194614443084, "grad_norm": 4.7082390785217285, "learning_rate": 4.937645724275449e-06, "loss": 1.0572, "step": 109 }, { "epoch": 1.0771113831089352, "grad_norm": 1.7655494213104248, "learning_rate": 4.936501251103751e-06, "loss": 0.8118, "step": 110 }, { "epoch": 1.086903304773562, "grad_norm": 1.7132837772369385, "learning_rate": 4.935346505261276e-06, "loss": 0.7269, "step": 111 }, { "epoch": 1.0966952264381884, "grad_norm": 1.7379621267318726, "learning_rate": 4.934181491616613e-06, "loss": 0.805, "step": 112 }, { "epoch": 1.1064871481028151, "grad_norm": 1.9942940473556519, "learning_rate": 4.9330062150816415e-06, "loss": 0.8552, "step": 113 }, { "epoch": 1.1162790697674418, "grad_norm": 1.7253119945526123, "learning_rate": 4.9318206806115125e-06, "loss": 0.9615, "step": 114 }, { "epoch": 1.1260709914320686, "grad_norm": 1.7357773780822754, "learning_rate": 4.930624893204624e-06, "loss": 0.9264, "step": 115 }, { "epoch": 1.1358629130966953, "grad_norm": 1.9663028717041016, "learning_rate": 4.929418857902603e-06, "loss": 0.6953, "step": 116 }, { "epoch": 1.1456548347613218, "grad_norm": 1.8109073638916016, "learning_rate": 4.928202579790285e-06, "loss": 0.8415, "step": 117 }, { "epoch": 1.1554467564259485, "grad_norm": 1.5971161127090454, "learning_rate": 4.926976063995687e-06, "loss": 0.8338, "step": 118 }, { "epoch": 1.1652386780905752, "grad_norm": 1.6034120321273804, "learning_rate": 4.925739315689991e-06, "loss": 0.8423, "step": 119 }, { "epoch": 1.175030599755202, "grad_norm": 1.6467894315719604, "learning_rate": 4.9244923400875245e-06, "loss": 0.8094, "step": 120 }, { "epoch": 1.1848225214198287, "grad_norm": 1.7149252891540527, "learning_rate": 4.9232351424457286e-06, "loss": 0.823, "step": 121 }, { "epoch": 1.1946144430844554, "grad_norm": 2.067819356918335, "learning_rate": 4.921967728065147e-06, "loss": 1.0936, "step": 122 }, { "epoch": 1.204406364749082, "grad_norm": 1.9850021600723267, "learning_rate": 4.920690102289397e-06, "loss": 0.9659, "step": 123 }, { "epoch": 1.2141982864137086, "grad_norm": 1.5845577716827393, "learning_rate": 4.91940227050515e-06, "loss": 0.7686, "step": 124 }, { "epoch": 1.2239902080783354, "grad_norm": 1.726210117340088, "learning_rate": 4.918104238142104e-06, "loss": 0.959, "step": 125 }, { "epoch": 1.233782129742962, "grad_norm": 1.597510814666748, "learning_rate": 4.916796010672969e-06, "loss": 0.8165, "step": 126 }, { "epoch": 1.2435740514075888, "grad_norm": 1.8032078742980957, "learning_rate": 4.915477593613436e-06, "loss": 0.8521, "step": 127 }, { "epoch": 1.2533659730722153, "grad_norm": 1.7745392322540283, "learning_rate": 4.914148992522157e-06, "loss": 0.8301, "step": 128 }, { "epoch": 1.263157894736842, "grad_norm": 2.1708357334136963, "learning_rate": 4.912810213000723e-06, "loss": 1.1565, "step": 129 }, { "epoch": 1.2729498164014688, "grad_norm": 1.4237957000732422, "learning_rate": 4.911461260693639e-06, "loss": 0.6328, "step": 130 }, { "epoch": 1.2827417380660955, "grad_norm": 1.7967597246170044, "learning_rate": 4.910102141288297e-06, "loss": 0.922, "step": 131 }, { "epoch": 1.2925336597307222, "grad_norm": 1.9335849285125732, "learning_rate": 4.908732860514958e-06, "loss": 1.0408, "step": 132 }, { "epoch": 1.302325581395349, "grad_norm": 1.637567400932312, "learning_rate": 4.907353424146726e-06, "loss": 0.6581, "step": 133 }, { "epoch": 1.3121175030599757, "grad_norm": 1.6415040493011475, "learning_rate": 4.905963837999518e-06, "loss": 0.7543, "step": 134 }, { "epoch": 1.3219094247246022, "grad_norm": 1.7031927108764648, "learning_rate": 4.904564107932048e-06, "loss": 0.7776, "step": 135 }, { "epoch": 1.3317013463892289, "grad_norm": 1.883571743965149, "learning_rate": 4.903154239845798e-06, "loss": 0.8697, "step": 136 }, { "epoch": 1.3414932680538556, "grad_norm": 2.000694990158081, "learning_rate": 4.901734239684991e-06, "loss": 0.9421, "step": 137 }, { "epoch": 1.3512851897184823, "grad_norm": 1.9366508722305298, "learning_rate": 4.900304113436571e-06, "loss": 1.2923, "step": 138 }, { "epoch": 1.3610771113831088, "grad_norm": 1.8254188299179077, "learning_rate": 4.898863867130174e-06, "loss": 0.6678, "step": 139 }, { "epoch": 1.3708690330477356, "grad_norm": 1.9203460216522217, "learning_rate": 4.897413506838103e-06, "loss": 1.0549, "step": 140 }, { "epoch": 1.3806609547123623, "grad_norm": 1.6104964017868042, "learning_rate": 4.895953038675307e-06, "loss": 0.6941, "step": 141 }, { "epoch": 1.390452876376989, "grad_norm": 1.8984593152999878, "learning_rate": 4.894482468799344e-06, "loss": 0.8085, "step": 142 }, { "epoch": 1.4002447980416157, "grad_norm": 1.9625005722045898, "learning_rate": 4.893001803410371e-06, "loss": 0.8622, "step": 143 }, { "epoch": 1.4100367197062424, "grad_norm": 1.6875929832458496, "learning_rate": 4.891511048751102e-06, "loss": 0.7046, "step": 144 }, { "epoch": 1.4198286413708692, "grad_norm": 1.88526451587677, "learning_rate": 4.890010211106795e-06, "loss": 0.879, "step": 145 }, { "epoch": 1.4296205630354957, "grad_norm": 1.8440868854522705, "learning_rate": 4.888499296805214e-06, "loss": 0.8519, "step": 146 }, { "epoch": 1.4394124847001224, "grad_norm": 1.7341837882995605, "learning_rate": 4.886978312216612e-06, "loss": 0.6981, "step": 147 }, { "epoch": 1.4492044063647491, "grad_norm": 1.8317904472351074, "learning_rate": 4.8854472637536966e-06, "loss": 0.7863, "step": 148 }, { "epoch": 1.4589963280293758, "grad_norm": 1.9017943143844604, "learning_rate": 4.883906157871609e-06, "loss": 0.7964, "step": 149 }, { "epoch": 1.4687882496940023, "grad_norm": 1.7514961957931519, "learning_rate": 4.882355001067892e-06, "loss": 0.7671, "step": 150 }, { "epoch": 1.478580171358629, "grad_norm": 1.8960765600204468, "learning_rate": 4.880793799882466e-06, "loss": 0.7808, "step": 151 }, { "epoch": 1.4883720930232558, "grad_norm": 1.575915813446045, "learning_rate": 4.8792225608976e-06, "loss": 0.7577, "step": 152 }, { "epoch": 1.4981640146878825, "grad_norm": 2.0890085697174072, "learning_rate": 4.8776412907378845e-06, "loss": 1.0504, "step": 153 }, { "epoch": 1.5079559363525092, "grad_norm": 1.4662138223648071, "learning_rate": 4.8760499960702005e-06, "loss": 0.7073, "step": 154 }, { "epoch": 1.517747858017136, "grad_norm": 2.029181718826294, "learning_rate": 4.874448683603696e-06, "loss": 0.9269, "step": 155 }, { "epoch": 1.5275397796817627, "grad_norm": 1.755637288093567, "learning_rate": 4.8728373600897535e-06, "loss": 0.6771, "step": 156 }, { "epoch": 1.5373317013463892, "grad_norm": 1.9037798643112183, "learning_rate": 4.871216032321968e-06, "loss": 1.1156, "step": 157 }, { "epoch": 1.547123623011016, "grad_norm": 1.6457839012145996, "learning_rate": 4.869584707136109e-06, "loss": 0.8398, "step": 158 }, { "epoch": 1.5569155446756426, "grad_norm": 2.157588243484497, "learning_rate": 4.867943391410101e-06, "loss": 0.9196, "step": 159 }, { "epoch": 1.5667074663402691, "grad_norm": 2.1161298751831055, "learning_rate": 4.8662920920639866e-06, "loss": 1.067, "step": 160 }, { "epoch": 1.5764993880048959, "grad_norm": 1.5604486465454102, "learning_rate": 4.864630816059903e-06, "loss": 0.5783, "step": 161 }, { "epoch": 1.5862913096695226, "grad_norm": 1.9163925647735596, "learning_rate": 4.86295957040205e-06, "loss": 0.9226, "step": 162 }, { "epoch": 1.5960832313341493, "grad_norm": 1.614378809928894, "learning_rate": 4.861278362136659e-06, "loss": 0.6903, "step": 163 }, { "epoch": 1.605875152998776, "grad_norm": 2.001657485961914, "learning_rate": 4.8595871983519705e-06, "loss": 0.7037, "step": 164 }, { "epoch": 1.6156670746634028, "grad_norm": 1.819227933883667, "learning_rate": 4.857886086178194e-06, "loss": 0.7028, "step": 165 }, { "epoch": 1.6254589963280295, "grad_norm": 2.138237953186035, "learning_rate": 4.856175032787485e-06, "loss": 1.0315, "step": 166 }, { "epoch": 1.6352509179926562, "grad_norm": 1.8790464401245117, "learning_rate": 4.854454045393913e-06, "loss": 0.7351, "step": 167 }, { "epoch": 1.6450428396572827, "grad_norm": 2.102705240249634, "learning_rate": 4.852723131253429e-06, "loss": 1.1448, "step": 168 }, { "epoch": 1.6548347613219094, "grad_norm": 1.6158536672592163, "learning_rate": 4.8509822976638395e-06, "loss": 0.6702, "step": 169 }, { "epoch": 1.6646266829865362, "grad_norm": 1.6575067043304443, "learning_rate": 4.849231551964771e-06, "loss": 0.7302, "step": 170 }, { "epoch": 1.6744186046511627, "grad_norm": 2.056703567504883, "learning_rate": 4.847470901537642e-06, "loss": 0.9211, "step": 171 }, { "epoch": 1.6842105263157894, "grad_norm": 1.9917559623718262, "learning_rate": 4.845700353805629e-06, "loss": 0.7579, "step": 172 }, { "epoch": 1.694002447980416, "grad_norm": 1.4190120697021484, "learning_rate": 4.843919916233639e-06, "loss": 0.6226, "step": 173 }, { "epoch": 1.7037943696450428, "grad_norm": 2.011537551879883, "learning_rate": 4.842129596328277e-06, "loss": 1.1584, "step": 174 }, { "epoch": 1.7135862913096696, "grad_norm": 1.489752173423767, "learning_rate": 4.84032940163781e-06, "loss": 0.6577, "step": 175 }, { "epoch": 1.7233782129742963, "grad_norm": 1.8804810047149658, "learning_rate": 4.838519339752143e-06, "loss": 0.7293, "step": 176 }, { "epoch": 1.733170134638923, "grad_norm": 1.7899119853973389, "learning_rate": 4.836699418302777e-06, "loss": 0.7031, "step": 177 }, { "epoch": 1.7429620563035497, "grad_norm": 1.805040717124939, "learning_rate": 4.834869644962789e-06, "loss": 0.5648, "step": 178 }, { "epoch": 1.7527539779681762, "grad_norm": 2.0442566871643066, "learning_rate": 4.833030027446788e-06, "loss": 1.1236, "step": 179 }, { "epoch": 1.762545899632803, "grad_norm": 1.9622224569320679, "learning_rate": 4.83118057351089e-06, "loss": 0.898, "step": 180 }, { "epoch": 1.7723378212974297, "grad_norm": 1.6069920063018799, "learning_rate": 4.829321290952683e-06, "loss": 0.8197, "step": 181 }, { "epoch": 1.7821297429620562, "grad_norm": 1.7059009075164795, "learning_rate": 4.827452187611192e-06, "loss": 0.8255, "step": 182 }, { "epoch": 1.791921664626683, "grad_norm": 1.9465808868408203, "learning_rate": 4.825573271366851e-06, "loss": 0.8021, "step": 183 }, { "epoch": 1.8017135862913096, "grad_norm": 1.7264560461044312, "learning_rate": 4.823684550141464e-06, "loss": 0.9922, "step": 184 }, { "epoch": 1.8115055079559363, "grad_norm": 1.8085198402404785, "learning_rate": 4.821786031898176e-06, "loss": 0.5722, "step": 185 }, { "epoch": 1.821297429620563, "grad_norm": 1.9642200469970703, "learning_rate": 4.819877724641437e-06, "loss": 0.8131, "step": 186 }, { "epoch": 1.8310893512851898, "grad_norm": 1.7268058061599731, "learning_rate": 4.817959636416969e-06, "loss": 0.8849, "step": 187 }, { "epoch": 1.8408812729498165, "grad_norm": 1.7369985580444336, "learning_rate": 4.8160317753117326e-06, "loss": 0.8433, "step": 188 }, { "epoch": 1.8506731946144432, "grad_norm": 1.894836664199829, "learning_rate": 4.814094149453891e-06, "loss": 0.8243, "step": 189 }, { "epoch": 1.8604651162790697, "grad_norm": 1.8126766681671143, "learning_rate": 4.81214676701278e-06, "loss": 0.9671, "step": 190 }, { "epoch": 1.8702570379436965, "grad_norm": 1.8832120895385742, "learning_rate": 4.8101896361988675e-06, "loss": 0.8657, "step": 191 }, { "epoch": 1.880048959608323, "grad_norm": 1.7154321670532227, "learning_rate": 4.808222765263724e-06, "loss": 1.0075, "step": 192 }, { "epoch": 1.8898408812729497, "grad_norm": 1.9785068035125732, "learning_rate": 4.806246162499985e-06, "loss": 0.8027, "step": 193 }, { "epoch": 1.8996328029375764, "grad_norm": 1.825987696647644, "learning_rate": 4.8042598362413175e-06, "loss": 0.9135, "step": 194 }, { "epoch": 1.9094247246022031, "grad_norm": 1.9241256713867188, "learning_rate": 4.802263794862385e-06, "loss": 0.8403, "step": 195 }, { "epoch": 1.9192166462668299, "grad_norm": 1.920577883720398, "learning_rate": 4.800258046778809e-06, "loss": 0.7258, "step": 196 }, { "epoch": 1.9290085679314566, "grad_norm": 1.66743803024292, "learning_rate": 4.798242600447137e-06, "loss": 0.709, "step": 197 }, { "epoch": 1.9388004895960833, "grad_norm": 1.778154969215393, "learning_rate": 4.796217464364808e-06, "loss": 0.8337, "step": 198 }, { "epoch": 1.94859241126071, "grad_norm": 1.703848958015442, "learning_rate": 4.794182647070112e-06, "loss": 0.6687, "step": 199 }, { "epoch": 1.9583843329253368, "grad_norm": 1.9317318201065063, "learning_rate": 4.792138157142158e-06, "loss": 0.9651, "step": 200 }, { "epoch": 1.9681762545899633, "grad_norm": 1.9100357294082642, "learning_rate": 4.790084003200835e-06, "loss": 0.793, "step": 201 }, { "epoch": 1.97796817625459, "grad_norm": 1.706403136253357, "learning_rate": 4.788020193906776e-06, "loss": 0.8398, "step": 202 }, { "epoch": 1.9877600979192165, "grad_norm": 1.6666045188903809, "learning_rate": 4.785946737961328e-06, "loss": 0.801, "step": 203 }, { "epoch": 1.9975520195838432, "grad_norm": 1.7287180423736572, "learning_rate": 4.783863644106502e-06, "loss": 1.0011, "step": 204 }, { "epoch": 2.00734394124847, "grad_norm": 5.877979755401611, "learning_rate": 4.781770921124951e-06, "loss": 2.0292, "step": 205 }, { "epoch": 2.0171358629130967, "grad_norm": 1.9121031761169434, "learning_rate": 4.779668577839921e-06, "loss": 0.7222, "step": 206 }, { "epoch": 2.0269277845777234, "grad_norm": 1.684372901916504, "learning_rate": 4.7775566231152216e-06, "loss": 0.8345, "step": 207 }, { "epoch": 2.03671970624235, "grad_norm": 1.6510744094848633, "learning_rate": 4.775435065855183e-06, "loss": 0.9588, "step": 208 }, { "epoch": 2.046511627906977, "grad_norm": 1.4488738775253296, "learning_rate": 4.7733039150046235e-06, "loss": 0.667, "step": 209 }, { "epoch": 2.0563035495716036, "grad_norm": 1.8589884042739868, "learning_rate": 4.771163179548809e-06, "loss": 1.127, "step": 210 }, { "epoch": 2.0660954712362303, "grad_norm": 1.8021223545074463, "learning_rate": 4.769012868513416e-06, "loss": 0.6917, "step": 211 }, { "epoch": 2.075887392900857, "grad_norm": 1.5799764394760132, "learning_rate": 4.766852990964492e-06, "loss": 0.7346, "step": 212 }, { "epoch": 2.0856793145654833, "grad_norm": 1.56980299949646, "learning_rate": 4.764683556008418e-06, "loss": 0.7308, "step": 213 }, { "epoch": 2.09547123623011, "grad_norm": 1.8797266483306885, "learning_rate": 4.762504572791873e-06, "loss": 0.9757, "step": 214 }, { "epoch": 2.1052631578947367, "grad_norm": 1.7201037406921387, "learning_rate": 4.7603160505017895e-06, "loss": 0.8619, "step": 215 }, { "epoch": 2.1150550795593634, "grad_norm": 1.9809595346450806, "learning_rate": 4.7581179983653224e-06, "loss": 1.1416, "step": 216 }, { "epoch": 2.12484700122399, "grad_norm": 1.9553593397140503, "learning_rate": 4.755910425649803e-06, "loss": 0.6144, "step": 217 }, { "epoch": 2.134638922888617, "grad_norm": 1.5125792026519775, "learning_rate": 4.753693341662702e-06, "loss": 0.6474, "step": 218 }, { "epoch": 2.1444308445532436, "grad_norm": 1.6196978092193604, "learning_rate": 4.7514667557515935e-06, "loss": 0.6626, "step": 219 }, { "epoch": 2.1542227662178703, "grad_norm": 2.045452833175659, "learning_rate": 4.749230677304114e-06, "loss": 0.9574, "step": 220 }, { "epoch": 2.164014687882497, "grad_norm": 2.0072288513183594, "learning_rate": 4.746985115747918e-06, "loss": 0.7487, "step": 221 }, { "epoch": 2.173806609547124, "grad_norm": 1.8324187994003296, "learning_rate": 4.7447300805506455e-06, "loss": 0.7926, "step": 222 }, { "epoch": 2.18359853121175, "grad_norm": 1.844360113143921, "learning_rate": 4.742465581219878e-06, "loss": 0.7883, "step": 223 }, { "epoch": 2.193390452876377, "grad_norm": 1.8216134309768677, "learning_rate": 4.7401916273031e-06, "loss": 0.8582, "step": 224 }, { "epoch": 2.2031823745410035, "grad_norm": 1.668815016746521, "learning_rate": 4.737908228387656e-06, "loss": 0.6368, "step": 225 }, { "epoch": 2.2129742962056302, "grad_norm": 1.7899885177612305, "learning_rate": 4.7356153941007145e-06, "loss": 0.7993, "step": 226 }, { "epoch": 2.222766217870257, "grad_norm": 1.679237723350525, "learning_rate": 4.733313134109223e-06, "loss": 0.8697, "step": 227 }, { "epoch": 2.2325581395348837, "grad_norm": 1.8128347396850586, "learning_rate": 4.73100145811987e-06, "loss": 0.6255, "step": 228 }, { "epoch": 2.2423500611995104, "grad_norm": 1.8245306015014648, "learning_rate": 4.728680375879045e-06, "loss": 0.6866, "step": 229 }, { "epoch": 2.252141982864137, "grad_norm": 1.6740870475769043, "learning_rate": 4.726349897172791e-06, "loss": 0.7194, "step": 230 }, { "epoch": 2.261933904528764, "grad_norm": 1.6936622858047485, "learning_rate": 4.724010031826775e-06, "loss": 0.6239, "step": 231 }, { "epoch": 2.2717258261933906, "grad_norm": 1.84676194190979, "learning_rate": 4.721660789706232e-06, "loss": 0.78, "step": 232 }, { "epoch": 2.2815177478580173, "grad_norm": 2.130429983139038, "learning_rate": 4.719302180715938e-06, "loss": 0.9668, "step": 233 }, { "epoch": 2.2913096695226436, "grad_norm": 1.5900758504867554, "learning_rate": 4.716934214800155e-06, "loss": 0.8592, "step": 234 }, { "epoch": 2.3011015911872703, "grad_norm": 1.7055306434631348, "learning_rate": 4.714556901942599e-06, "loss": 0.9039, "step": 235 }, { "epoch": 2.310893512851897, "grad_norm": 1.7725728750228882, "learning_rate": 4.712170252166395e-06, "loss": 0.8064, "step": 236 }, { "epoch": 2.3206854345165238, "grad_norm": 1.6738322973251343, "learning_rate": 4.709774275534031e-06, "loss": 0.7163, "step": 237 }, { "epoch": 2.3304773561811505, "grad_norm": 1.9001731872558594, "learning_rate": 4.707368982147318e-06, "loss": 0.8895, "step": 238 }, { "epoch": 2.340269277845777, "grad_norm": 1.6682621240615845, "learning_rate": 4.704954382147351e-06, "loss": 0.8012, "step": 239 }, { "epoch": 2.350061199510404, "grad_norm": 1.7350693941116333, "learning_rate": 4.702530485714462e-06, "loss": 0.6117, "step": 240 }, { "epoch": 2.3598531211750307, "grad_norm": 1.809665322303772, "learning_rate": 4.700097303068177e-06, "loss": 0.7626, "step": 241 }, { "epoch": 2.3696450428396574, "grad_norm": 1.9229344129562378, "learning_rate": 4.697654844467175e-06, "loss": 0.8459, "step": 242 }, { "epoch": 2.379436964504284, "grad_norm": 1.554150938987732, "learning_rate": 4.695203120209245e-06, "loss": 0.6786, "step": 243 }, { "epoch": 2.389228886168911, "grad_norm": 1.9067550897598267, "learning_rate": 4.69274214063124e-06, "loss": 0.9324, "step": 244 }, { "epoch": 2.399020807833537, "grad_norm": 1.9316915273666382, "learning_rate": 4.690271916109034e-06, "loss": 1.0489, "step": 245 }, { "epoch": 2.408812729498164, "grad_norm": 1.7435996532440186, "learning_rate": 4.687792457057482e-06, "loss": 0.6831, "step": 246 }, { "epoch": 2.4186046511627906, "grad_norm": 1.4588803052902222, "learning_rate": 4.685303773930371e-06, "loss": 0.7357, "step": 247 }, { "epoch": 2.4283965728274173, "grad_norm": 1.5775786638259888, "learning_rate": 4.682805877220378e-06, "loss": 0.7255, "step": 248 }, { "epoch": 2.438188494492044, "grad_norm": 1.6537951231002808, "learning_rate": 4.6802987774590275e-06, "loss": 0.8292, "step": 249 }, { "epoch": 2.4479804161566707, "grad_norm": 1.8435249328613281, "learning_rate": 4.677782485216644e-06, "loss": 0.8582, "step": 250 }, { "epoch": 2.4577723378212974, "grad_norm": 1.7312040328979492, "learning_rate": 4.675257011102311e-06, "loss": 0.7965, "step": 251 }, { "epoch": 2.467564259485924, "grad_norm": 1.6767661571502686, "learning_rate": 4.672722365763821e-06, "loss": 0.68, "step": 252 }, { "epoch": 2.477356181150551, "grad_norm": 1.5255998373031616, "learning_rate": 4.670178559887637e-06, "loss": 0.5391, "step": 253 }, { "epoch": 2.4871481028151776, "grad_norm": 1.6863305568695068, "learning_rate": 4.667625604198842e-06, "loss": 0.7528, "step": 254 }, { "epoch": 2.4969400244798043, "grad_norm": 1.9714897871017456, "learning_rate": 4.665063509461098e-06, "loss": 1.0592, "step": 255 }, { "epoch": 2.5067319461444306, "grad_norm": 2.3776144981384277, "learning_rate": 4.662492286476595e-06, "loss": 0.9442, "step": 256 }, { "epoch": 2.516523867809058, "grad_norm": 1.8653812408447266, "learning_rate": 4.6599119460860145e-06, "loss": 0.973, "step": 257 }, { "epoch": 2.526315789473684, "grad_norm": 1.7013781070709229, "learning_rate": 4.657322499168475e-06, "loss": 0.6615, "step": 258 }, { "epoch": 2.536107711138311, "grad_norm": 1.7495367527008057, "learning_rate": 4.654723956641489e-06, "loss": 0.9595, "step": 259 }, { "epoch": 2.5458996328029375, "grad_norm": 1.709252119064331, "learning_rate": 4.65211632946092e-06, "loss": 0.6111, "step": 260 }, { "epoch": 2.5556915544675642, "grad_norm": 2.163742780685425, "learning_rate": 4.649499628620931e-06, "loss": 0.9475, "step": 261 }, { "epoch": 2.565483476132191, "grad_norm": 1.8249528408050537, "learning_rate": 4.646873865153945e-06, "loss": 0.7806, "step": 262 }, { "epoch": 2.5752753977968177, "grad_norm": 1.5504190921783447, "learning_rate": 4.644239050130589e-06, "loss": 0.5171, "step": 263 }, { "epoch": 2.5850673194614444, "grad_norm": 2.1261394023895264, "learning_rate": 4.641595194659657e-06, "loss": 0.8785, "step": 264 }, { "epoch": 2.594859241126071, "grad_norm": 1.6256144046783447, "learning_rate": 4.638942309888058e-06, "loss": 0.7413, "step": 265 }, { "epoch": 2.604651162790698, "grad_norm": 2.0446391105651855, "learning_rate": 4.63628040700077e-06, "loss": 1.3574, "step": 266 }, { "epoch": 2.614443084455324, "grad_norm": 1.6148625612258911, "learning_rate": 4.63360949722079e-06, "loss": 0.6135, "step": 267 }, { "epoch": 2.6242350061199513, "grad_norm": 1.766831874847412, "learning_rate": 4.630929591809095e-06, "loss": 0.8657, "step": 268 }, { "epoch": 2.6340269277845776, "grad_norm": 1.9170904159545898, "learning_rate": 4.6282407020645825e-06, "loss": 0.8043, "step": 269 }, { "epoch": 2.6438188494492043, "grad_norm": 1.6428509950637817, "learning_rate": 4.625542839324036e-06, "loss": 0.7712, "step": 270 }, { "epoch": 2.653610771113831, "grad_norm": 1.6599236726760864, "learning_rate": 4.622836014962065e-06, "loss": 0.8619, "step": 271 }, { "epoch": 2.6634026927784578, "grad_norm": 1.7766387462615967, "learning_rate": 4.620120240391065e-06, "loss": 0.823, "step": 272 }, { "epoch": 2.6731946144430845, "grad_norm": 1.8489127159118652, "learning_rate": 4.617395527061168e-06, "loss": 0.8095, "step": 273 }, { "epoch": 2.682986536107711, "grad_norm": 1.8188955783843994, "learning_rate": 4.614661886460191e-06, "loss": 0.79, "step": 274 }, { "epoch": 2.692778457772338, "grad_norm": 1.9311672449111938, "learning_rate": 4.611919330113592e-06, "loss": 0.9358, "step": 275 }, { "epoch": 2.7025703794369647, "grad_norm": 1.779396414756775, "learning_rate": 4.609167869584416e-06, "loss": 0.5877, "step": 276 }, { "epoch": 2.7123623011015914, "grad_norm": 1.7271040678024292, "learning_rate": 4.606407516473254e-06, "loss": 0.821, "step": 277 }, { "epoch": 2.7221542227662177, "grad_norm": 1.746903419494629, "learning_rate": 4.6036382824181836e-06, "loss": 0.8218, "step": 278 }, { "epoch": 2.731946144430845, "grad_norm": 1.6061373949050903, "learning_rate": 4.600860179094732e-06, "loss": 0.8719, "step": 279 }, { "epoch": 2.741738066095471, "grad_norm": 1.6927648782730103, "learning_rate": 4.598073218215817e-06, "loss": 0.7247, "step": 280 }, { "epoch": 2.751529987760098, "grad_norm": 1.6876628398895264, "learning_rate": 4.595277411531701e-06, "loss": 0.8858, "step": 281 }, { "epoch": 2.7613219094247246, "grad_norm": 1.7462552785873413, "learning_rate": 4.592472770829945e-06, "loss": 0.8907, "step": 282 }, { "epoch": 2.7711138310893513, "grad_norm": 1.5235626697540283, "learning_rate": 4.5896593079353515e-06, "loss": 0.557, "step": 283 }, { "epoch": 2.780905752753978, "grad_norm": 1.8148753643035889, "learning_rate": 4.586837034709921e-06, "loss": 0.842, "step": 284 }, { "epoch": 2.7906976744186047, "grad_norm": 1.7743046283721924, "learning_rate": 4.584005963052799e-06, "loss": 0.9448, "step": 285 }, { "epoch": 2.8004895960832314, "grad_norm": 2.061232805252075, "learning_rate": 4.581166104900228e-06, "loss": 0.9421, "step": 286 }, { "epoch": 2.810281517747858, "grad_norm": 1.5784261226654053, "learning_rate": 4.5783174722254934e-06, "loss": 0.8144, "step": 287 }, { "epoch": 2.820073439412485, "grad_norm": 1.7882845401763916, "learning_rate": 4.575460077038877e-06, "loss": 0.8148, "step": 288 }, { "epoch": 2.829865361077111, "grad_norm": 1.7912652492523193, "learning_rate": 4.572593931387604e-06, "loss": 0.8074, "step": 289 }, { "epoch": 2.8396572827417383, "grad_norm": 1.6410728693008423, "learning_rate": 4.569719047355795e-06, "loss": 0.6716, "step": 290 }, { "epoch": 2.8494492044063646, "grad_norm": 1.6893397569656372, "learning_rate": 4.566835437064409e-06, "loss": 0.777, "step": 291 }, { "epoch": 2.8592411260709913, "grad_norm": 1.8589907884597778, "learning_rate": 4.5639431126712e-06, "loss": 0.7802, "step": 292 }, { "epoch": 2.869033047735618, "grad_norm": 1.8061766624450684, "learning_rate": 4.561042086370659e-06, "loss": 0.6876, "step": 293 }, { "epoch": 2.878824969400245, "grad_norm": 1.781959891319275, "learning_rate": 4.5581323703939685e-06, "loss": 0.6102, "step": 294 }, { "epoch": 2.8886168910648715, "grad_norm": 1.786219596862793, "learning_rate": 4.555213977008946e-06, "loss": 0.7686, "step": 295 }, { "epoch": 2.8984088127294982, "grad_norm": 1.6375802755355835, "learning_rate": 4.552286918519996e-06, "loss": 0.7579, "step": 296 }, { "epoch": 2.908200734394125, "grad_norm": 1.6881898641586304, "learning_rate": 4.5493512072680535e-06, "loss": 0.8043, "step": 297 }, { "epoch": 2.9179926560587517, "grad_norm": 1.639721155166626, "learning_rate": 4.5464068556305375e-06, "loss": 0.6017, "step": 298 }, { "epoch": 2.9277845777233784, "grad_norm": 1.8131664991378784, "learning_rate": 4.543453876021297e-06, "loss": 0.9044, "step": 299 }, { "epoch": 2.9375764993880047, "grad_norm": 2.314148187637329, "learning_rate": 4.540492280890555e-06, "loss": 0.9388, "step": 300 }, { "epoch": 2.9473684210526314, "grad_norm": 1.8061414957046509, "learning_rate": 4.53752208272486e-06, "loss": 0.7033, "step": 301 }, { "epoch": 2.957160342717258, "grad_norm": 1.9006153345108032, "learning_rate": 4.534543294047033e-06, "loss": 0.9874, "step": 302 }, { "epoch": 2.966952264381885, "grad_norm": 1.6536442041397095, "learning_rate": 4.531555927416115e-06, "loss": 0.6166, "step": 303 }, { "epoch": 2.9767441860465116, "grad_norm": 1.7328367233276367, "learning_rate": 4.528559995427309e-06, "loss": 0.6354, "step": 304 }, { "epoch": 2.9865361077111383, "grad_norm": 2.011812448501587, "learning_rate": 4.525555510711934e-06, "loss": 0.9448, "step": 305 }, { "epoch": 2.996328029375765, "grad_norm": 1.5000935792922974, "learning_rate": 4.522542485937369e-06, "loss": 0.6131, "step": 306 }, { "epoch": 3.0061199510403918, "grad_norm": 4.0604681968688965, "learning_rate": 4.519520933806997e-06, "loss": 1.0448, "step": 307 }, { "epoch": 3.0159118727050185, "grad_norm": 1.7402530908584595, "learning_rate": 4.516490867060156e-06, "loss": 0.7478, "step": 308 }, { "epoch": 3.025703794369645, "grad_norm": 1.6993818283081055, "learning_rate": 4.5134522984720816e-06, "loss": 0.5536, "step": 309 }, { "epoch": 3.035495716034272, "grad_norm": 1.845446228981018, "learning_rate": 4.5104052408538545e-06, "loss": 0.8718, "step": 310 }, { "epoch": 3.045287637698898, "grad_norm": 1.6233938932418823, "learning_rate": 4.507349707052347e-06, "loss": 0.6232, "step": 311 }, { "epoch": 3.055079559363525, "grad_norm": 1.780224323272705, "learning_rate": 4.504285709950167e-06, "loss": 0.6192, "step": 312 }, { "epoch": 3.0648714810281517, "grad_norm": 2.497929334640503, "learning_rate": 4.501213262465607e-06, "loss": 0.8197, "step": 313 }, { "epoch": 3.0746634026927784, "grad_norm": 1.9052263498306274, "learning_rate": 4.498132377552587e-06, "loss": 0.7437, "step": 314 }, { "epoch": 3.084455324357405, "grad_norm": 1.5467990636825562, "learning_rate": 4.4950430682005995e-06, "loss": 0.5957, "step": 315 }, { "epoch": 3.094247246022032, "grad_norm": 1.8469765186309814, "learning_rate": 4.491945347434656e-06, "loss": 0.7685, "step": 316 }, { "epoch": 3.1040391676866586, "grad_norm": 1.7806025743484497, "learning_rate": 4.488839228315233e-06, "loss": 0.604, "step": 317 }, { "epoch": 3.1138310893512853, "grad_norm": 1.7567436695098877, "learning_rate": 4.485724723938215e-06, "loss": 0.6989, "step": 318 }, { "epoch": 3.123623011015912, "grad_norm": 1.504656195640564, "learning_rate": 4.482601847434841e-06, "loss": 0.6728, "step": 319 }, { "epoch": 3.1334149326805387, "grad_norm": 1.8254302740097046, "learning_rate": 4.479470611971646e-06, "loss": 0.7716, "step": 320 }, { "epoch": 3.1432068543451654, "grad_norm": 1.8780500888824463, "learning_rate": 4.4763310307504084e-06, "loss": 1.1413, "step": 321 }, { "epoch": 3.1529987760097917, "grad_norm": 1.7721192836761475, "learning_rate": 4.473183117008096e-06, "loss": 0.6591, "step": 322 }, { "epoch": 3.1627906976744184, "grad_norm": 1.982852816581726, "learning_rate": 4.470026884016805e-06, "loss": 0.9408, "step": 323 }, { "epoch": 3.172582619339045, "grad_norm": 1.592284917831421, "learning_rate": 4.4668623450837085e-06, "loss": 0.6208, "step": 324 }, { "epoch": 3.182374541003672, "grad_norm": 1.6603188514709473, "learning_rate": 4.463689513550997e-06, "loss": 0.6565, "step": 325 }, { "epoch": 3.1921664626682986, "grad_norm": 1.9736286401748657, "learning_rate": 4.460508402795827e-06, "loss": 1.0639, "step": 326 }, { "epoch": 3.2019583843329253, "grad_norm": 1.7832359075546265, "learning_rate": 4.457319026230257e-06, "loss": 0.6116, "step": 327 }, { "epoch": 3.211750305997552, "grad_norm": 1.7322028875350952, "learning_rate": 4.4541213973012005e-06, "loss": 0.8676, "step": 328 }, { "epoch": 3.221542227662179, "grad_norm": 1.4758862257003784, "learning_rate": 4.450915529490359e-06, "loss": 0.5178, "step": 329 }, { "epoch": 3.2313341493268055, "grad_norm": 1.759240984916687, "learning_rate": 4.447701436314176e-06, "loss": 0.724, "step": 330 }, { "epoch": 3.2411260709914322, "grad_norm": 1.8031115531921387, "learning_rate": 4.44447913132377e-06, "loss": 0.687, "step": 331 }, { "epoch": 3.250917992656059, "grad_norm": 1.791666030883789, "learning_rate": 4.441248628104884e-06, "loss": 0.8308, "step": 332 }, { "epoch": 3.2607099143206852, "grad_norm": 1.654770851135254, "learning_rate": 4.438009940277825e-06, "loss": 0.7024, "step": 333 }, { "epoch": 3.270501835985312, "grad_norm": 1.7612643241882324, "learning_rate": 4.434763081497407e-06, "loss": 0.7022, "step": 334 }, { "epoch": 3.2802937576499387, "grad_norm": 1.7003636360168457, "learning_rate": 4.431508065452897e-06, "loss": 0.6483, "step": 335 }, { "epoch": 3.2900856793145654, "grad_norm": 2.109245777130127, "learning_rate": 4.428244905867952e-06, "loss": 0.7754, "step": 336 }, { "epoch": 3.299877600979192, "grad_norm": 1.938667893409729, "learning_rate": 4.424973616500563e-06, "loss": 0.5672, "step": 337 }, { "epoch": 3.309669522643819, "grad_norm": 1.660105586051941, "learning_rate": 4.421694211142998e-06, "loss": 0.6269, "step": 338 }, { "epoch": 3.3194614443084456, "grad_norm": 1.6938822269439697, "learning_rate": 4.418406703621743e-06, "loss": 0.6615, "step": 339 }, { "epoch": 3.3292533659730723, "grad_norm": 1.8447024822235107, "learning_rate": 4.415111107797445e-06, "loss": 0.7797, "step": 340 }, { "epoch": 3.339045287637699, "grad_norm": 1.7716082334518433, "learning_rate": 4.411807437564854e-06, "loss": 0.7762, "step": 341 }, { "epoch": 3.3488372093023258, "grad_norm": 1.9670720100402832, "learning_rate": 4.408495706852758e-06, "loss": 0.8167, "step": 342 }, { "epoch": 3.3586291309669525, "grad_norm": 1.7168461084365845, "learning_rate": 4.405175929623934e-06, "loss": 0.9784, "step": 343 }, { "epoch": 3.3684210526315788, "grad_norm": 1.6868672370910645, "learning_rate": 4.401848119875081e-06, "loss": 0.7769, "step": 344 }, { "epoch": 3.3782129742962055, "grad_norm": 1.82831871509552, "learning_rate": 4.398512291636768e-06, "loss": 0.8146, "step": 345 }, { "epoch": 3.388004895960832, "grad_norm": 1.8755537271499634, "learning_rate": 4.395168458973368e-06, "loss": 0.8937, "step": 346 }, { "epoch": 3.397796817625459, "grad_norm": 2.1765365600585938, "learning_rate": 4.391816635983004e-06, "loss": 0.9958, "step": 347 }, { "epoch": 3.4075887392900857, "grad_norm": 1.6079312562942505, "learning_rate": 4.388456836797484e-06, "loss": 0.652, "step": 348 }, { "epoch": 3.4173806609547124, "grad_norm": 1.8513658046722412, "learning_rate": 4.385089075582251e-06, "loss": 0.7091, "step": 349 }, { "epoch": 3.427172582619339, "grad_norm": 1.6403636932373047, "learning_rate": 4.381713366536312e-06, "loss": 0.7441, "step": 350 }, { "epoch": 3.436964504283966, "grad_norm": 1.7154061794281006, "learning_rate": 4.378329723892184e-06, "loss": 0.79, "step": 351 }, { "epoch": 3.4467564259485926, "grad_norm": 1.9436532258987427, "learning_rate": 4.374938161915835e-06, "loss": 0.7786, "step": 352 }, { "epoch": 3.4565483476132193, "grad_norm": 1.8954344987869263, "learning_rate": 4.37153869490662e-06, "loss": 0.7084, "step": 353 }, { "epoch": 3.466340269277846, "grad_norm": 1.741155982017517, "learning_rate": 4.368131337197228e-06, "loss": 0.7567, "step": 354 }, { "epoch": 3.4761321909424723, "grad_norm": 1.6095000505447388, "learning_rate": 4.364716103153609e-06, "loss": 0.5588, "step": 355 }, { "epoch": 3.485924112607099, "grad_norm": 1.8443665504455566, "learning_rate": 4.361293007174926e-06, "loss": 0.9721, "step": 356 }, { "epoch": 3.4957160342717257, "grad_norm": 1.6049314737319946, "learning_rate": 4.357862063693486e-06, "loss": 0.6667, "step": 357 }, { "epoch": 3.5055079559363524, "grad_norm": 1.7836921215057373, "learning_rate": 4.354423287174686e-06, "loss": 0.804, "step": 358 }, { "epoch": 3.515299877600979, "grad_norm": 1.6569099426269531, "learning_rate": 4.350976692116945e-06, "loss": 0.9696, "step": 359 }, { "epoch": 3.525091799265606, "grad_norm": 1.857596755027771, "learning_rate": 4.3475222930516484e-06, "loss": 0.9303, "step": 360 }, { "epoch": 3.5348837209302326, "grad_norm": 1.9433345794677734, "learning_rate": 4.3440601045430825e-06, "loss": 0.9228, "step": 361 }, { "epoch": 3.5446756425948593, "grad_norm": 1.7486381530761719, "learning_rate": 4.340590141188377e-06, "loss": 0.8053, "step": 362 }, { "epoch": 3.554467564259486, "grad_norm": 1.555833101272583, "learning_rate": 4.3371124176174396e-06, "loss": 0.7447, "step": 363 }, { "epoch": 3.5642594859241123, "grad_norm": 2.2484242916107178, "learning_rate": 4.333626948492898e-06, "loss": 0.8457, "step": 364 }, { "epoch": 3.5740514075887395, "grad_norm": 1.8998305797576904, "learning_rate": 4.330133748510037e-06, "loss": 0.8809, "step": 365 }, { "epoch": 3.583843329253366, "grad_norm": 1.615239143371582, "learning_rate": 4.326632832396733e-06, "loss": 0.6692, "step": 366 }, { "epoch": 3.5936352509179925, "grad_norm": 1.727641224861145, "learning_rate": 4.323124214913397e-06, "loss": 0.5472, "step": 367 }, { "epoch": 3.6034271725826192, "grad_norm": 1.9414492845535278, "learning_rate": 4.319607910852911e-06, "loss": 0.6891, "step": 368 }, { "epoch": 3.613219094247246, "grad_norm": 1.8788846731185913, "learning_rate": 4.316083935040561e-06, "loss": 0.6575, "step": 369 }, { "epoch": 3.6230110159118727, "grad_norm": 1.8999978303909302, "learning_rate": 4.3125523023339825e-06, "loss": 0.7525, "step": 370 }, { "epoch": 3.6328029375764994, "grad_norm": 1.689702033996582, "learning_rate": 4.3090130276230915e-06, "loss": 0.5294, "step": 371 }, { "epoch": 3.642594859241126, "grad_norm": 1.7235329151153564, "learning_rate": 4.305466125830023e-06, "loss": 0.7487, "step": 372 }, { "epoch": 3.652386780905753, "grad_norm": 1.699202299118042, "learning_rate": 4.301911611909074e-06, "loss": 0.6459, "step": 373 }, { "epoch": 3.6621787025703796, "grad_norm": 2.258392095565796, "learning_rate": 4.2983495008466285e-06, "loss": 1.0021, "step": 374 }, { "epoch": 3.671970624235006, "grad_norm": 1.7750481367111206, "learning_rate": 4.294779807661105e-06, "loss": 0.6392, "step": 375 }, { "epoch": 3.681762545899633, "grad_norm": 1.7278869152069092, "learning_rate": 4.29120254740289e-06, "loss": 0.7892, "step": 376 }, { "epoch": 3.6915544675642593, "grad_norm": 1.7488670349121094, "learning_rate": 4.287617735154272e-06, "loss": 0.6689, "step": 377 }, { "epoch": 3.701346389228886, "grad_norm": 1.7195801734924316, "learning_rate": 4.284025386029381e-06, "loss": 0.6903, "step": 378 }, { "epoch": 3.7111383108935128, "grad_norm": 1.9059380292892456, "learning_rate": 4.280425515174124e-06, "loss": 0.7048, "step": 379 }, { "epoch": 3.7209302325581395, "grad_norm": 1.9076306819915771, "learning_rate": 4.276818137766118e-06, "loss": 0.869, "step": 380 }, { "epoch": 3.730722154222766, "grad_norm": 1.8751521110534668, "learning_rate": 4.273203269014634e-06, "loss": 0.9298, "step": 381 }, { "epoch": 3.740514075887393, "grad_norm": 1.671371579170227, "learning_rate": 4.269580924160523e-06, "loss": 0.8358, "step": 382 }, { "epoch": 3.7503059975520197, "grad_norm": 2.1627748012542725, "learning_rate": 4.265951118476158e-06, "loss": 1.1296, "step": 383 }, { "epoch": 3.7600979192166464, "grad_norm": 1.7722114324569702, "learning_rate": 4.262313867265369e-06, "loss": 0.5577, "step": 384 }, { "epoch": 3.769889840881273, "grad_norm": 1.788717269897461, "learning_rate": 4.258669185863375e-06, "loss": 0.6594, "step": 385 }, { "epoch": 3.7796817625458994, "grad_norm": 1.737066626548767, "learning_rate": 4.255017089636725e-06, "loss": 0.6728, "step": 386 }, { "epoch": 3.7894736842105265, "grad_norm": 2.048884630203247, "learning_rate": 4.251357593983228e-06, "loss": 0.9587, "step": 387 }, { "epoch": 3.799265605875153, "grad_norm": 1.3372879028320312, "learning_rate": 4.24769071433189e-06, "loss": 0.3893, "step": 388 }, { "epoch": 3.8090575275397796, "grad_norm": 1.9370672702789307, "learning_rate": 4.244016466142852e-06, "loss": 0.8633, "step": 389 }, { "epoch": 3.8188494492044063, "grad_norm": 1.9244333505630493, "learning_rate": 4.240334864907317e-06, "loss": 0.7675, "step": 390 }, { "epoch": 3.828641370869033, "grad_norm": 1.8911683559417725, "learning_rate": 4.236645926147493e-06, "loss": 0.7063, "step": 391 }, { "epoch": 3.8384332925336597, "grad_norm": 1.3820501565933228, "learning_rate": 4.232949665416526e-06, "loss": 0.6153, "step": 392 }, { "epoch": 3.8482252141982864, "grad_norm": 1.8441942930221558, "learning_rate": 4.229246098298426e-06, "loss": 0.9155, "step": 393 }, { "epoch": 3.858017135862913, "grad_norm": 1.848370909690857, "learning_rate": 4.225535240408014e-06, "loss": 0.7187, "step": 394 }, { "epoch": 3.86780905752754, "grad_norm": 1.825411319732666, "learning_rate": 4.221817107390847e-06, "loss": 0.7561, "step": 395 }, { "epoch": 3.8776009791921666, "grad_norm": 1.918476939201355, "learning_rate": 4.218091714923157e-06, "loss": 0.7431, "step": 396 }, { "epoch": 3.887392900856793, "grad_norm": 1.5590591430664062, "learning_rate": 4.214359078711782e-06, "loss": 0.6068, "step": 397 }, { "epoch": 3.89718482252142, "grad_norm": 1.538544774055481, "learning_rate": 4.210619214494099e-06, "loss": 0.7647, "step": 398 }, { "epoch": 3.9069767441860463, "grad_norm": 1.9497493505477905, "learning_rate": 4.206872138037964e-06, "loss": 0.9843, "step": 399 }, { "epoch": 3.916768665850673, "grad_norm": 2.0706608295440674, "learning_rate": 4.203117865141635e-06, "loss": 0.8955, "step": 400 }, { "epoch": 3.9265605875153, "grad_norm": 1.7848016023635864, "learning_rate": 4.199356411633717e-06, "loss": 0.6951, "step": 401 }, { "epoch": 3.9363525091799265, "grad_norm": 1.8751904964447021, "learning_rate": 4.195587793373085e-06, "loss": 0.6471, "step": 402 }, { "epoch": 3.9461444308445532, "grad_norm": 1.6563018560409546, "learning_rate": 4.191812026248825e-06, "loss": 0.737, "step": 403 }, { "epoch": 3.95593635250918, "grad_norm": 1.8174633979797363, "learning_rate": 4.188029126180161e-06, "loss": 0.8826, "step": 404 }, { "epoch": 3.9657282741738067, "grad_norm": 1.4947586059570312, "learning_rate": 4.184239109116393e-06, "loss": 0.4843, "step": 405 }, { "epoch": 3.9755201958384334, "grad_norm": 1.8430842161178589, "learning_rate": 4.180441991036827e-06, "loss": 0.8971, "step": 406 }, { "epoch": 3.98531211750306, "grad_norm": 1.5086185932159424, "learning_rate": 4.1766377879507055e-06, "loss": 0.4906, "step": 407 }, { "epoch": 3.9951040391676864, "grad_norm": 2.0418710708618164, "learning_rate": 4.172826515897146e-06, "loss": 0.8966, "step": 408 }, { "epoch": 4.004895960832314, "grad_norm": 8.560981750488281, "learning_rate": 4.169008190945067e-06, "loss": 1.2406, "step": 409 }, { "epoch": 4.01468788249694, "grad_norm": 1.7079252004623413, "learning_rate": 4.165182829193126e-06, "loss": 0.6482, "step": 410 }, { "epoch": 4.024479804161567, "grad_norm": 1.6790159940719604, "learning_rate": 4.161350446769645e-06, "loss": 0.6079, "step": 411 }, { "epoch": 4.034271725826193, "grad_norm": 1.5673798322677612, "learning_rate": 4.15751105983255e-06, "loss": 0.5845, "step": 412 }, { "epoch": 4.0440636474908205, "grad_norm": 1.6145811080932617, "learning_rate": 4.1536646845692976e-06, "loss": 0.681, "step": 413 }, { "epoch": 4.053855569155447, "grad_norm": 1.803058385848999, "learning_rate": 4.149811337196808e-06, "loss": 0.9443, "step": 414 }, { "epoch": 4.063647490820073, "grad_norm": 1.5080933570861816, "learning_rate": 4.1459510339613954e-06, "loss": 0.5508, "step": 415 }, { "epoch": 4.0734394124847, "grad_norm": 1.8413206338882446, "learning_rate": 4.142083791138703e-06, "loss": 0.9034, "step": 416 }, { "epoch": 4.0832313341493265, "grad_norm": 1.6403236389160156, "learning_rate": 4.138209625033635e-06, "loss": 0.6793, "step": 417 }, { "epoch": 4.093023255813954, "grad_norm": 1.6847971677780151, "learning_rate": 4.134328551980279e-06, "loss": 0.5222, "step": 418 }, { "epoch": 4.10281517747858, "grad_norm": 1.5513584613800049, "learning_rate": 4.130440588341848e-06, "loss": 0.7794, "step": 419 }, { "epoch": 4.112607099143207, "grad_norm": 1.8495330810546875, "learning_rate": 4.126545750510605e-06, "loss": 0.7235, "step": 420 }, { "epoch": 4.122399020807833, "grad_norm": 1.6706733703613281, "learning_rate": 4.1226440549077975e-06, "loss": 0.5705, "step": 421 }, { "epoch": 4.1321909424724605, "grad_norm": 1.760111927986145, "learning_rate": 4.118735517983584e-06, "loss": 0.8207, "step": 422 }, { "epoch": 4.141982864137087, "grad_norm": 1.528247594833374, "learning_rate": 4.114820156216969e-06, "loss": 0.5261, "step": 423 }, { "epoch": 4.151774785801714, "grad_norm": 2.1555042266845703, "learning_rate": 4.110897986115729e-06, "loss": 1.1471, "step": 424 }, { "epoch": 4.16156670746634, "grad_norm": 2.004366874694824, "learning_rate": 4.106969024216348e-06, "loss": 0.7685, "step": 425 }, { "epoch": 4.1713586291309666, "grad_norm": 1.8953263759613037, "learning_rate": 4.1030332870839466e-06, "loss": 0.6383, "step": 426 }, { "epoch": 4.181150550795594, "grad_norm": 1.7345049381256104, "learning_rate": 4.099090791312206e-06, "loss": 0.6935, "step": 427 }, { "epoch": 4.19094247246022, "grad_norm": 1.8631129264831543, "learning_rate": 4.0951415535233065e-06, "loss": 0.8442, "step": 428 }, { "epoch": 4.200734394124847, "grad_norm": 1.7572860717773438, "learning_rate": 4.091185590367854e-06, "loss": 0.684, "step": 429 }, { "epoch": 4.2105263157894735, "grad_norm": 1.7677788734436035, "learning_rate": 4.087222918524807e-06, "loss": 0.5606, "step": 430 }, { "epoch": 4.220318237454101, "grad_norm": 1.859362244606018, "learning_rate": 4.083253554701412e-06, "loss": 0.6481, "step": 431 }, { "epoch": 4.230110159118727, "grad_norm": 2.28190279006958, "learning_rate": 4.079277515633127e-06, "loss": 0.7928, "step": 432 }, { "epoch": 4.239902080783354, "grad_norm": 1.742475986480713, "learning_rate": 4.0752948180835575e-06, "loss": 0.563, "step": 433 }, { "epoch": 4.24969400244798, "grad_norm": 1.6631873846054077, "learning_rate": 4.0713054788443776e-06, "loss": 0.8565, "step": 434 }, { "epoch": 4.2594859241126075, "grad_norm": 1.5401971340179443, "learning_rate": 4.067309514735267e-06, "loss": 0.545, "step": 435 }, { "epoch": 4.269277845777234, "grad_norm": 1.7012211084365845, "learning_rate": 4.063306942603835e-06, "loss": 0.6122, "step": 436 }, { "epoch": 4.27906976744186, "grad_norm": 1.555208444595337, "learning_rate": 4.059297779325555e-06, "loss": 0.4767, "step": 437 }, { "epoch": 4.288861689106487, "grad_norm": 1.7196720838546753, "learning_rate": 4.0552820418036855e-06, "loss": 0.6713, "step": 438 }, { "epoch": 4.2986536107711135, "grad_norm": 1.794687032699585, "learning_rate": 4.051259746969204e-06, "loss": 0.6342, "step": 439 }, { "epoch": 4.308445532435741, "grad_norm": 1.8412281274795532, "learning_rate": 4.0472309117807365e-06, "loss": 0.8008, "step": 440 }, { "epoch": 4.318237454100367, "grad_norm": 2.0671744346618652, "learning_rate": 4.043195553224482e-06, "loss": 0.6764, "step": 441 }, { "epoch": 4.328029375764994, "grad_norm": 1.7817986011505127, "learning_rate": 4.039153688314146e-06, "loss": 0.6126, "step": 442 }, { "epoch": 4.33782129742962, "grad_norm": 2.020407199859619, "learning_rate": 4.035105334090862e-06, "loss": 0.926, "step": 443 }, { "epoch": 4.347613219094248, "grad_norm": 1.8513925075531006, "learning_rate": 4.031050507623125e-06, "loss": 0.7044, "step": 444 }, { "epoch": 4.357405140758874, "grad_norm": 1.8840636014938354, "learning_rate": 4.02698922600672e-06, "loss": 0.6591, "step": 445 }, { "epoch": 4.3671970624235, "grad_norm": 1.8405572175979614, "learning_rate": 4.022921506364644e-06, "loss": 0.5722, "step": 446 }, { "epoch": 4.376988984088127, "grad_norm": 1.7552632093429565, "learning_rate": 4.018847365847042e-06, "loss": 0.646, "step": 447 }, { "epoch": 4.386780905752754, "grad_norm": 1.7341976165771484, "learning_rate": 4.014766821631128e-06, "loss": 0.6496, "step": 448 }, { "epoch": 4.396572827417381, "grad_norm": 2.0633487701416016, "learning_rate": 4.0106798909211145e-06, "loss": 0.7044, "step": 449 }, { "epoch": 4.406364749082007, "grad_norm": 1.9197129011154175, "learning_rate": 4.006586590948141e-06, "loss": 0.6673, "step": 450 }, { "epoch": 4.416156670746634, "grad_norm": 1.7765311002731323, "learning_rate": 4.002486938970203e-06, "loss": 0.6857, "step": 451 }, { "epoch": 4.4259485924112605, "grad_norm": 2.218404769897461, "learning_rate": 3.998380952272073e-06, "loss": 0.8925, "step": 452 }, { "epoch": 4.435740514075888, "grad_norm": 1.9908126592636108, "learning_rate": 3.994268648165234e-06, "loss": 0.7822, "step": 453 }, { "epoch": 4.445532435740514, "grad_norm": 1.6664211750030518, "learning_rate": 3.990150043987806e-06, "loss": 0.7229, "step": 454 }, { "epoch": 4.455324357405141, "grad_norm": 1.9056819677352905, "learning_rate": 3.986025157104467e-06, "loss": 0.6447, "step": 455 }, { "epoch": 4.465116279069767, "grad_norm": 1.6858128309249878, "learning_rate": 3.981894004906388e-06, "loss": 0.6967, "step": 456 }, { "epoch": 4.474908200734394, "grad_norm": 1.9926295280456543, "learning_rate": 3.977756604811152e-06, "loss": 1.1001, "step": 457 }, { "epoch": 4.484700122399021, "grad_norm": 1.5066274404525757, "learning_rate": 3.973612974262685e-06, "loss": 0.4915, "step": 458 }, { "epoch": 4.494492044063647, "grad_norm": 1.7402639389038086, "learning_rate": 3.969463130731183e-06, "loss": 0.8178, "step": 459 }, { "epoch": 4.504283965728274, "grad_norm": 1.7749882936477661, "learning_rate": 3.965307091713037e-06, "loss": 0.6181, "step": 460 }, { "epoch": 4.5140758873929006, "grad_norm": 1.6733134984970093, "learning_rate": 3.961144874730758e-06, "loss": 0.5797, "step": 461 }, { "epoch": 4.523867809057528, "grad_norm": 2.030430555343628, "learning_rate": 3.956976497332903e-06, "loss": 0.7794, "step": 462 }, { "epoch": 4.533659730722154, "grad_norm": 1.7787001132965088, "learning_rate": 3.952801977094005e-06, "loss": 0.8961, "step": 463 }, { "epoch": 4.543451652386781, "grad_norm": 1.870334506034851, "learning_rate": 3.948621331614495e-06, "loss": 0.7167, "step": 464 }, { "epoch": 4.5532435740514074, "grad_norm": 1.7778104543685913, "learning_rate": 3.9444345785206285e-06, "loss": 0.6932, "step": 465 }, { "epoch": 4.563035495716035, "grad_norm": 1.7349879741668701, "learning_rate": 3.9402417354644115e-06, "loss": 0.7703, "step": 466 }, { "epoch": 4.572827417380661, "grad_norm": 1.8538790941238403, "learning_rate": 3.936042820123529e-06, "loss": 0.6718, "step": 467 }, { "epoch": 4.582619339045287, "grad_norm": 2.262875556945801, "learning_rate": 3.9318378502012636e-06, "loss": 0.6639, "step": 468 }, { "epoch": 4.592411260709914, "grad_norm": 1.6901624202728271, "learning_rate": 3.927626843426427e-06, "loss": 0.6917, "step": 469 }, { "epoch": 4.602203182374541, "grad_norm": 1.8226820230484009, "learning_rate": 3.923409817553284e-06, "loss": 0.7469, "step": 470 }, { "epoch": 4.611995104039168, "grad_norm": 1.8534526824951172, "learning_rate": 3.919186790361476e-06, "loss": 0.5711, "step": 471 }, { "epoch": 4.621787025703794, "grad_norm": 1.6683892011642456, "learning_rate": 3.914957779655946e-06, "loss": 0.7549, "step": 472 }, { "epoch": 4.631578947368421, "grad_norm": 1.848334789276123, "learning_rate": 3.910722803266866e-06, "loss": 0.81, "step": 473 }, { "epoch": 4.6413708690330475, "grad_norm": 1.7016183137893677, "learning_rate": 3.906481879049559e-06, "loss": 0.5973, "step": 474 }, { "epoch": 4.651162790697675, "grad_norm": 1.9261906147003174, "learning_rate": 3.902235024884425e-06, "loss": 0.9308, "step": 475 }, { "epoch": 4.660954712362301, "grad_norm": 1.7943637371063232, "learning_rate": 3.897982258676867e-06, "loss": 0.7557, "step": 476 }, { "epoch": 4.670746634026928, "grad_norm": 2.117396593093872, "learning_rate": 3.893723598357214e-06, "loss": 1.0249, "step": 477 }, { "epoch": 4.680538555691554, "grad_norm": 2.9921984672546387, "learning_rate": 3.8894590618806435e-06, "loss": 0.5656, "step": 478 }, { "epoch": 4.690330477356181, "grad_norm": 1.8793911933898926, "learning_rate": 3.88518866722711e-06, "loss": 0.646, "step": 479 }, { "epoch": 4.700122399020808, "grad_norm": 1.8031724691390991, "learning_rate": 3.880912432401265e-06, "loss": 0.6982, "step": 480 }, { "epoch": 4.709914320685434, "grad_norm": 1.7483805418014526, "learning_rate": 3.876630375432384e-06, "loss": 0.6289, "step": 481 }, { "epoch": 4.719706242350061, "grad_norm": 1.9166884422302246, "learning_rate": 3.872342514374291e-06, "loss": 0.7317, "step": 482 }, { "epoch": 4.729498164014688, "grad_norm": 1.7474361658096313, "learning_rate": 3.868048867305279e-06, "loss": 0.5564, "step": 483 }, { "epoch": 4.739290085679315, "grad_norm": 1.754001498222351, "learning_rate": 3.863749452328035e-06, "loss": 0.728, "step": 484 }, { "epoch": 4.749082007343941, "grad_norm": 1.705482006072998, "learning_rate": 3.859444287569567e-06, "loss": 0.6904, "step": 485 }, { "epoch": 4.758873929008568, "grad_norm": 1.713342547416687, "learning_rate": 3.855133391181124e-06, "loss": 0.6774, "step": 486 }, { "epoch": 4.7686658506731945, "grad_norm": 2.002656936645508, "learning_rate": 3.850816781338119e-06, "loss": 0.925, "step": 487 }, { "epoch": 4.778457772337822, "grad_norm": 1.7748180627822876, "learning_rate": 3.846494476240057e-06, "loss": 0.567, "step": 488 }, { "epoch": 4.788249694002448, "grad_norm": 1.7066618204116821, "learning_rate": 3.842166494110451e-06, "loss": 0.7358, "step": 489 }, { "epoch": 4.798041615667074, "grad_norm": 1.854140043258667, "learning_rate": 3.837832853196751e-06, "loss": 0.6995, "step": 490 }, { "epoch": 4.807833537331701, "grad_norm": 1.6693692207336426, "learning_rate": 3.833493571770268e-06, "loss": 0.5997, "step": 491 }, { "epoch": 4.817625458996328, "grad_norm": 2.0006518363952637, "learning_rate": 3.8291486681260904e-06, "loss": 1.0332, "step": 492 }, { "epoch": 4.827417380660955, "grad_norm": 1.701348066329956, "learning_rate": 3.824798160583012e-06, "loss": 0.6327, "step": 493 }, { "epoch": 4.837209302325581, "grad_norm": 1.8359907865524292, "learning_rate": 3.820442067483455e-06, "loss": 0.8476, "step": 494 }, { "epoch": 4.847001223990208, "grad_norm": 1.716478943824768, "learning_rate": 3.81608040719339e-06, "loss": 0.683, "step": 495 }, { "epoch": 4.8567931456548346, "grad_norm": 1.8071199655532837, "learning_rate": 3.811713198102258e-06, "loss": 0.6236, "step": 496 }, { "epoch": 4.866585067319462, "grad_norm": 1.9766230583190918, "learning_rate": 3.807340458622898e-06, "loss": 0.7792, "step": 497 }, { "epoch": 4.876376988984088, "grad_norm": 1.922031044960022, "learning_rate": 3.802962207191463e-06, "loss": 1.1315, "step": 498 }, { "epoch": 4.886168910648715, "grad_norm": 1.9953148365020752, "learning_rate": 3.7985784622673473e-06, "loss": 0.7975, "step": 499 }, { "epoch": 4.8959608323133414, "grad_norm": 1.8715909719467163, "learning_rate": 3.794189242333107e-06, "loss": 0.6275, "step": 500 }, { "epoch": 4.905752753977968, "grad_norm": 1.7319326400756836, "learning_rate": 3.789794565894378e-06, "loss": 0.544, "step": 501 }, { "epoch": 4.915544675642595, "grad_norm": 2.048226833343506, "learning_rate": 3.785394451479806e-06, "loss": 0.7647, "step": 502 }, { "epoch": 4.925336597307221, "grad_norm": 2.080871105194092, "learning_rate": 3.780988917640963e-06, "loss": 0.7746, "step": 503 }, { "epoch": 4.935128518971848, "grad_norm": 1.7307814359664917, "learning_rate": 3.7765779829522674e-06, "loss": 0.4275, "step": 504 }, { "epoch": 4.944920440636475, "grad_norm": 1.9921857118606567, "learning_rate": 3.772161666010913e-06, "loss": 0.9656, "step": 505 }, { "epoch": 4.954712362301102, "grad_norm": 1.5920534133911133, "learning_rate": 3.7677399854367815e-06, "loss": 0.5459, "step": 506 }, { "epoch": 4.964504283965728, "grad_norm": 1.6851868629455566, "learning_rate": 3.7633129598723704e-06, "loss": 0.5938, "step": 507 }, { "epoch": 4.974296205630355, "grad_norm": 1.7891894578933716, "learning_rate": 3.7588806079827147e-06, "loss": 0.8512, "step": 508 }, { "epoch": 4.9840881272949815, "grad_norm": 1.8974699974060059, "learning_rate": 3.7544429484553026e-06, "loss": 0.707, "step": 509 }, { "epoch": 4.993880048959609, "grad_norm": 1.8479547500610352, "learning_rate": 3.7500000000000005e-06, "loss": 0.7338, "step": 510 }, { "epoch": 5.003671970624235, "grad_norm": 6.809603691101074, "learning_rate": 3.745551781348977e-06, "loss": 0.9602, "step": 511 }, { "epoch": 5.013463892288861, "grad_norm": 1.5952311754226685, "learning_rate": 3.7410983112566166e-06, "loss": 0.6745, "step": 512 }, { "epoch": 5.023255813953488, "grad_norm": 1.7409144639968872, "learning_rate": 3.736639608499448e-06, "loss": 0.6986, "step": 513 }, { "epoch": 5.033047735618115, "grad_norm": 2.1375889778137207, "learning_rate": 3.7321756918760587e-06, "loss": 0.9074, "step": 514 }, { "epoch": 5.042839657282742, "grad_norm": 1.723294734954834, "learning_rate": 3.7277065802070213e-06, "loss": 0.6184, "step": 515 }, { "epoch": 5.052631578947368, "grad_norm": 1.7434897422790527, "learning_rate": 3.7232322923348093e-06, "loss": 0.572, "step": 516 }, { "epoch": 5.062423500611995, "grad_norm": 1.889726996421814, "learning_rate": 3.7187528471237223e-06, "loss": 0.9935, "step": 517 }, { "epoch": 5.072215422276622, "grad_norm": 1.6148978471755981, "learning_rate": 3.7142682634598016e-06, "loss": 0.5632, "step": 518 }, { "epoch": 5.082007343941249, "grad_norm": 1.9944149255752563, "learning_rate": 3.709778560250754e-06, "loss": 0.8933, "step": 519 }, { "epoch": 5.091799265605875, "grad_norm": 1.6916334629058838, "learning_rate": 3.7052837564258728e-06, "loss": 0.6407, "step": 520 }, { "epoch": 5.101591187270502, "grad_norm": 2.108482837677002, "learning_rate": 3.7007838709359527e-06, "loss": 0.828, "step": 521 }, { "epoch": 5.1113831089351285, "grad_norm": 1.8837676048278809, "learning_rate": 3.6962789227532165e-06, "loss": 0.7684, "step": 522 }, { "epoch": 5.121175030599755, "grad_norm": 2.0093040466308594, "learning_rate": 3.691768930871232e-06, "loss": 0.7647, "step": 523 }, { "epoch": 5.130966952264382, "grad_norm": 1.9251562356948853, "learning_rate": 3.6872539143048287e-06, "loss": 0.7135, "step": 524 }, { "epoch": 5.140758873929008, "grad_norm": 1.743512749671936, "learning_rate": 3.6827338920900257e-06, "loss": 0.6222, "step": 525 }, { "epoch": 5.150550795593635, "grad_norm": 1.9517323970794678, "learning_rate": 3.6782088832839436e-06, "loss": 0.7861, "step": 526 }, { "epoch": 5.160342717258262, "grad_norm": 1.4948291778564453, "learning_rate": 3.6736789069647273e-06, "loss": 0.4798, "step": 527 }, { "epoch": 5.170134638922889, "grad_norm": 1.6748100519180298, "learning_rate": 3.6691439822314672e-06, "loss": 0.6667, "step": 528 }, { "epoch": 5.179926560587515, "grad_norm": 1.8798792362213135, "learning_rate": 3.664604128204117e-06, "loss": 0.8504, "step": 529 }, { "epoch": 5.189718482252142, "grad_norm": 1.8752387762069702, "learning_rate": 3.660059364023409e-06, "loss": 0.6961, "step": 530 }, { "epoch": 5.1995104039167686, "grad_norm": 1.607374668121338, "learning_rate": 3.6555097088507837e-06, "loss": 0.5009, "step": 531 }, { "epoch": 5.209302325581396, "grad_norm": 1.7791218757629395, "learning_rate": 3.650955181868298e-06, "loss": 0.5917, "step": 532 }, { "epoch": 5.219094247246022, "grad_norm": 1.9009809494018555, "learning_rate": 3.646395802278551e-06, "loss": 0.7433, "step": 533 }, { "epoch": 5.228886168910648, "grad_norm": 1.8437162637710571, "learning_rate": 3.641831589304602e-06, "loss": 0.6631, "step": 534 }, { "epoch": 5.2386780905752754, "grad_norm": 1.9826735258102417, "learning_rate": 3.6372625621898864e-06, "loss": 0.8492, "step": 535 }, { "epoch": 5.248470012239902, "grad_norm": 1.8516113758087158, "learning_rate": 3.6326887401981386e-06, "loss": 0.692, "step": 536 }, { "epoch": 5.258261933904529, "grad_norm": 1.6150197982788086, "learning_rate": 3.6281101426133075e-06, "loss": 0.6246, "step": 537 }, { "epoch": 5.268053855569155, "grad_norm": 1.9633769989013672, "learning_rate": 3.6235267887394774e-06, "loss": 1.1106, "step": 538 }, { "epoch": 5.277845777233782, "grad_norm": 1.8161698579788208, "learning_rate": 3.618938697900788e-06, "loss": 0.5598, "step": 539 }, { "epoch": 5.287637698898409, "grad_norm": 1.7449780702590942, "learning_rate": 3.6143458894413463e-06, "loss": 0.3777, "step": 540 }, { "epoch": 5.297429620563036, "grad_norm": 1.8650290966033936, "learning_rate": 3.6097483827251524e-06, "loss": 0.7008, "step": 541 }, { "epoch": 5.307221542227662, "grad_norm": 1.7794548273086548, "learning_rate": 3.6051461971360146e-06, "loss": 0.6069, "step": 542 }, { "epoch": 5.317013463892289, "grad_norm": 1.8387482166290283, "learning_rate": 3.600539352077469e-06, "loss": 0.5256, "step": 543 }, { "epoch": 5.3268053855569155, "grad_norm": 1.775593638420105, "learning_rate": 3.595927866972694e-06, "loss": 0.5254, "step": 544 }, { "epoch": 5.336597307221542, "grad_norm": 2.6975016593933105, "learning_rate": 3.591311761264433e-06, "loss": 0.6563, "step": 545 }, { "epoch": 5.346389228886169, "grad_norm": 1.9159234762191772, "learning_rate": 3.586691054414913e-06, "loss": 0.6886, "step": 546 }, { "epoch": 5.356181150550795, "grad_norm": 1.706493616104126, "learning_rate": 3.5820657659057545e-06, "loss": 0.5446, "step": 547 }, { "epoch": 5.365973072215422, "grad_norm": 1.8246746063232422, "learning_rate": 3.577435915237899e-06, "loss": 0.6459, "step": 548 }, { "epoch": 5.375764993880049, "grad_norm": 1.55630624294281, "learning_rate": 3.5728015219315226e-06, "loss": 0.4515, "step": 549 }, { "epoch": 5.385556915544676, "grad_norm": 2.1207566261291504, "learning_rate": 3.5681626055259526e-06, "loss": 0.8468, "step": 550 }, { "epoch": 5.395348837209302, "grad_norm": 1.6563208103179932, "learning_rate": 3.563519185579587e-06, "loss": 0.5321, "step": 551 }, { "epoch": 5.405140758873929, "grad_norm": 2.334747552871704, "learning_rate": 3.558871281669811e-06, "loss": 1.1206, "step": 552 }, { "epoch": 5.414932680538556, "grad_norm": 1.776879906654358, "learning_rate": 3.5542189133929162e-06, "loss": 0.553, "step": 553 }, { "epoch": 5.424724602203183, "grad_norm": 1.8452098369598389, "learning_rate": 3.549562100364014e-06, "loss": 0.5915, "step": 554 }, { "epoch": 5.434516523867809, "grad_norm": 2.118831157684326, "learning_rate": 3.544900862216959e-06, "loss": 0.9604, "step": 555 }, { "epoch": 5.444308445532435, "grad_norm": 1.8282262086868286, "learning_rate": 3.5402352186042602e-06, "loss": 0.831, "step": 556 }, { "epoch": 5.4541003671970625, "grad_norm": 1.8505369424819946, "learning_rate": 3.5355651891970016e-06, "loss": 0.9521, "step": 557 }, { "epoch": 5.463892288861689, "grad_norm": 1.8349502086639404, "learning_rate": 3.530890793684759e-06, "loss": 0.7363, "step": 558 }, { "epoch": 5.473684210526316, "grad_norm": 1.9025176763534546, "learning_rate": 3.5262120517755154e-06, "loss": 0.6494, "step": 559 }, { "epoch": 5.483476132190942, "grad_norm": 1.7090250253677368, "learning_rate": 3.521528983195579e-06, "loss": 0.6525, "step": 560 }, { "epoch": 5.493268053855569, "grad_norm": 1.8467003107070923, "learning_rate": 3.516841607689501e-06, "loss": 0.6574, "step": 561 }, { "epoch": 5.503059975520196, "grad_norm": 1.7567086219787598, "learning_rate": 3.512149945019989e-06, "loss": 0.481, "step": 562 }, { "epoch": 5.512851897184823, "grad_norm": 1.6873767375946045, "learning_rate": 3.5074540149678293e-06, "loss": 0.5429, "step": 563 }, { "epoch": 5.522643818849449, "grad_norm": 1.9199461936950684, "learning_rate": 3.502753837331797e-06, "loss": 0.5444, "step": 564 }, { "epoch": 5.532435740514076, "grad_norm": 1.7763882875442505, "learning_rate": 3.4980494319285773e-06, "loss": 0.7194, "step": 565 }, { "epoch": 5.5422276621787026, "grad_norm": 1.7657161951065063, "learning_rate": 3.4933408185926805e-06, "loss": 0.5012, "step": 566 }, { "epoch": 5.552019583843329, "grad_norm": 1.6161220073699951, "learning_rate": 3.4886280171763563e-06, "loss": 0.6407, "step": 567 }, { "epoch": 5.561811505507956, "grad_norm": 1.6048427820205688, "learning_rate": 3.4839110475495153e-06, "loss": 0.6836, "step": 568 }, { "epoch": 5.571603427172582, "grad_norm": 1.9500138759613037, "learning_rate": 3.4791899295996386e-06, "loss": 0.7854, "step": 569 }, { "epoch": 5.5813953488372094, "grad_norm": 1.5950162410736084, "learning_rate": 3.4744646832316985e-06, "loss": 0.4106, "step": 570 }, { "epoch": 5.591187270501836, "grad_norm": 1.9633638858795166, "learning_rate": 3.4697353283680746e-06, "loss": 0.694, "step": 571 }, { "epoch": 5.600979192166463, "grad_norm": 1.7308915853500366, "learning_rate": 3.465001884948468e-06, "loss": 0.4694, "step": 572 }, { "epoch": 5.610771113831089, "grad_norm": 1.6681874990463257, "learning_rate": 3.4602643729298152e-06, "loss": 0.5633, "step": 573 }, { "epoch": 5.620563035495716, "grad_norm": 1.8871471881866455, "learning_rate": 3.45552281228621e-06, "loss": 0.8596, "step": 574 }, { "epoch": 5.630354957160343, "grad_norm": 1.6353410482406616, "learning_rate": 3.4507772230088148e-06, "loss": 0.6651, "step": 575 }, { "epoch": 5.64014687882497, "grad_norm": 2.659740686416626, "learning_rate": 3.446027625105776e-06, "loss": 0.7023, "step": 576 }, { "epoch": 5.649938800489596, "grad_norm": 1.9393025636672974, "learning_rate": 3.4412740386021426e-06, "loss": 0.7573, "step": 577 }, { "epoch": 5.659730722154222, "grad_norm": 1.874972939491272, "learning_rate": 3.436516483539781e-06, "loss": 0.5654, "step": 578 }, { "epoch": 5.6695226438188495, "grad_norm": 1.7418142557144165, "learning_rate": 3.431754979977285e-06, "loss": 0.5691, "step": 579 }, { "epoch": 5.679314565483476, "grad_norm": 1.6752581596374512, "learning_rate": 3.4269895479899023e-06, "loss": 0.5488, "step": 580 }, { "epoch": 5.689106487148103, "grad_norm": 2.0247418880462646, "learning_rate": 3.4222202076694395e-06, "loss": 0.6508, "step": 581 }, { "epoch": 5.698898408812729, "grad_norm": 1.7557880878448486, "learning_rate": 3.4174469791241805e-06, "loss": 0.5115, "step": 582 }, { "epoch": 5.708690330477356, "grad_norm": 2.1636388301849365, "learning_rate": 3.4126698824788063e-06, "loss": 0.7812, "step": 583 }, { "epoch": 5.718482252141983, "grad_norm": 1.8486337661743164, "learning_rate": 3.4078889378743036e-06, "loss": 0.5463, "step": 584 }, { "epoch": 5.72827417380661, "grad_norm": 2.0236499309539795, "learning_rate": 3.403104165467883e-06, "loss": 0.7061, "step": 585 }, { "epoch": 5.738066095471236, "grad_norm": 1.781389832496643, "learning_rate": 3.3983155854328942e-06, "loss": 0.6559, "step": 586 }, { "epoch": 5.747858017135863, "grad_norm": 1.8835967779159546, "learning_rate": 3.3935232179587414e-06, "loss": 0.7244, "step": 587 }, { "epoch": 5.75764993880049, "grad_norm": 2.0303163528442383, "learning_rate": 3.388727083250795e-06, "loss": 0.6869, "step": 588 }, { "epoch": 5.767441860465116, "grad_norm": 1.717504858970642, "learning_rate": 3.3839272015303115e-06, "loss": 0.6378, "step": 589 }, { "epoch": 5.777233782129743, "grad_norm": 1.7656553983688354, "learning_rate": 3.379123593034342e-06, "loss": 0.588, "step": 590 }, { "epoch": 5.787025703794369, "grad_norm": 1.6479922533035278, "learning_rate": 3.374316278015653e-06, "loss": 0.4838, "step": 591 }, { "epoch": 5.7968176254589965, "grad_norm": 2.0165750980377197, "learning_rate": 3.369505276742638e-06, "loss": 0.7007, "step": 592 }, { "epoch": 5.806609547123623, "grad_norm": 1.724360466003418, "learning_rate": 3.36469060949923e-06, "loss": 0.7533, "step": 593 }, { "epoch": 5.81640146878825, "grad_norm": 1.7533793449401855, "learning_rate": 3.359872296584821e-06, "loss": 0.6194, "step": 594 }, { "epoch": 5.826193390452876, "grad_norm": 2.1136512756347656, "learning_rate": 3.3550503583141726e-06, "loss": 0.7046, "step": 595 }, { "epoch": 5.835985312117503, "grad_norm": 1.760472059249878, "learning_rate": 3.350224815017331e-06, "loss": 0.6399, "step": 596 }, { "epoch": 5.84577723378213, "grad_norm": 1.7681163549423218, "learning_rate": 3.345395687039543e-06, "loss": 0.8128, "step": 597 }, { "epoch": 5.855569155446757, "grad_norm": 1.9811427593231201, "learning_rate": 3.3405629947411687e-06, "loss": 0.6606, "step": 598 }, { "epoch": 5.865361077111383, "grad_norm": 2.087017774581909, "learning_rate": 3.3357267584975937e-06, "loss": 0.6253, "step": 599 }, { "epoch": 5.875152998776009, "grad_norm": 2.0523016452789307, "learning_rate": 3.3308869986991493e-06, "loss": 0.7782, "step": 600 }, { "epoch": 5.8849449204406366, "grad_norm": 2.1251156330108643, "learning_rate": 3.32604373575102e-06, "loss": 0.6583, "step": 601 }, { "epoch": 5.894736842105263, "grad_norm": 1.8005659580230713, "learning_rate": 3.32119699007316e-06, "loss": 0.7314, "step": 602 }, { "epoch": 5.90452876376989, "grad_norm": 2.0085527896881104, "learning_rate": 3.3163467821002082e-06, "loss": 0.7311, "step": 603 }, { "epoch": 5.914320685434516, "grad_norm": 1.8307207822799683, "learning_rate": 3.311493132281402e-06, "loss": 0.8017, "step": 604 }, { "epoch": 5.9241126070991434, "grad_norm": 1.8823357820510864, "learning_rate": 3.3066360610804877e-06, "loss": 0.8413, "step": 605 }, { "epoch": 5.93390452876377, "grad_norm": 1.816819190979004, "learning_rate": 3.3017755889756382e-06, "loss": 0.5875, "step": 606 }, { "epoch": 5.943696450428397, "grad_norm": 1.8413772583007812, "learning_rate": 3.2969117364593654e-06, "loss": 0.7483, "step": 607 }, { "epoch": 5.953488372093023, "grad_norm": 1.8138740062713623, "learning_rate": 3.292044524038433e-06, "loss": 0.6296, "step": 608 }, { "epoch": 5.96328029375765, "grad_norm": 1.745700478553772, "learning_rate": 3.28717397223377e-06, "loss": 0.5437, "step": 609 }, { "epoch": 5.973072215422277, "grad_norm": 2.1375620365142822, "learning_rate": 3.2823001015803863e-06, "loss": 0.8765, "step": 610 }, { "epoch": 5.982864137086903, "grad_norm": 1.4280636310577393, "learning_rate": 3.277422932627283e-06, "loss": 0.3366, "step": 611 }, { "epoch": 5.99265605875153, "grad_norm": 1.8823065757751465, "learning_rate": 3.272542485937369e-06, "loss": 0.6679, "step": 612 }, { "epoch": 6.002447980416156, "grad_norm": 6.602539539337158, "learning_rate": 3.2676587820873704e-06, "loss": 1.2411, "step": 613 }, { "epoch": 6.0122399020807835, "grad_norm": 1.9242345094680786, "learning_rate": 3.2627718416677484e-06, "loss": 0.6409, "step": 614 }, { "epoch": 6.02203182374541, "grad_norm": 2.036929130554199, "learning_rate": 3.257881685282609e-06, "loss": 0.6876, "step": 615 }, { "epoch": 6.031823745410037, "grad_norm": 2.0596063137054443, "learning_rate": 3.2529883335496163e-06, "loss": 0.6769, "step": 616 }, { "epoch": 6.041615667074663, "grad_norm": 2.060333013534546, "learning_rate": 3.2480918070999083e-06, "loss": 0.8818, "step": 617 }, { "epoch": 6.05140758873929, "grad_norm": 1.5693408250808716, "learning_rate": 3.243192126578007e-06, "loss": 0.5675, "step": 618 }, { "epoch": 6.061199510403917, "grad_norm": 1.7919774055480957, "learning_rate": 3.2382893126417327e-06, "loss": 0.7305, "step": 619 }, { "epoch": 6.070991432068544, "grad_norm": 1.9924007654190063, "learning_rate": 3.2333833859621155e-06, "loss": 0.5647, "step": 620 }, { "epoch": 6.08078335373317, "grad_norm": 1.7289621829986572, "learning_rate": 3.228474367223312e-06, "loss": 0.5715, "step": 621 }, { "epoch": 6.090575275397796, "grad_norm": 1.9013326168060303, "learning_rate": 3.223562277122513e-06, "loss": 0.4505, "step": 622 }, { "epoch": 6.100367197062424, "grad_norm": 2.131302833557129, "learning_rate": 3.2186471363698598e-06, "loss": 0.4895, "step": 623 }, { "epoch": 6.11015911872705, "grad_norm": 1.766001582145691, "learning_rate": 3.213728965688356e-06, "loss": 0.4299, "step": 624 }, { "epoch": 6.119951040391677, "grad_norm": 1.99663507938385, "learning_rate": 3.2088077858137774e-06, "loss": 0.852, "step": 625 }, { "epoch": 6.129742962056303, "grad_norm": 1.6072460412979126, "learning_rate": 3.2038836174945907e-06, "loss": 0.5558, "step": 626 }, { "epoch": 6.1395348837209305, "grad_norm": 2.0335426330566406, "learning_rate": 3.19895648149186e-06, "loss": 0.7967, "step": 627 }, { "epoch": 6.149326805385557, "grad_norm": 1.7607651948928833, "learning_rate": 3.194026398579162e-06, "loss": 0.6873, "step": 628 }, { "epoch": 6.159118727050184, "grad_norm": 1.880065679550171, "learning_rate": 3.189093389542498e-06, "loss": 0.6644, "step": 629 }, { "epoch": 6.16891064871481, "grad_norm": 1.810813307762146, "learning_rate": 3.184157475180208e-06, "loss": 0.4958, "step": 630 }, { "epoch": 6.178702570379437, "grad_norm": 2.134848117828369, "learning_rate": 3.1792186763028778e-06, "loss": 0.8276, "step": 631 }, { "epoch": 6.188494492044064, "grad_norm": 1.6647247076034546, "learning_rate": 3.1742770137332567e-06, "loss": 0.6495, "step": 632 }, { "epoch": 6.19828641370869, "grad_norm": 1.657850742340088, "learning_rate": 3.1693325083061703e-06, "loss": 0.7569, "step": 633 }, { "epoch": 6.208078335373317, "grad_norm": 1.6627212762832642, "learning_rate": 3.164385180868425e-06, "loss": 0.6776, "step": 634 }, { "epoch": 6.217870257037943, "grad_norm": 1.5389106273651123, "learning_rate": 3.1594350522787296e-06, "loss": 0.4122, "step": 635 }, { "epoch": 6.2276621787025706, "grad_norm": 1.9366390705108643, "learning_rate": 3.1544821434076013e-06, "loss": 0.6484, "step": 636 }, { "epoch": 6.237454100367197, "grad_norm": 1.7447491884231567, "learning_rate": 3.149526475137278e-06, "loss": 0.7365, "step": 637 }, { "epoch": 6.247246022031824, "grad_norm": 1.8330599069595337, "learning_rate": 3.144568068361634e-06, "loss": 0.8781, "step": 638 }, { "epoch": 6.25703794369645, "grad_norm": 1.7635221481323242, "learning_rate": 3.1396069439860894e-06, "loss": 0.4673, "step": 639 }, { "epoch": 6.2668298653610774, "grad_norm": 1.7916160821914673, "learning_rate": 3.1346431229275197e-06, "loss": 0.558, "step": 640 }, { "epoch": 6.276621787025704, "grad_norm": 1.9943921566009521, "learning_rate": 3.1296766261141733e-06, "loss": 0.3981, "step": 641 }, { "epoch": 6.286413708690331, "grad_norm": 2.011319637298584, "learning_rate": 3.124707474485577e-06, "loss": 0.6408, "step": 642 }, { "epoch": 6.296205630354957, "grad_norm": 1.7006263732910156, "learning_rate": 3.1197356889924535e-06, "loss": 0.6055, "step": 643 }, { "epoch": 6.3059975520195835, "grad_norm": 1.7150357961654663, "learning_rate": 3.1147612905966286e-06, "loss": 0.7216, "step": 644 }, { "epoch": 6.315789473684211, "grad_norm": 1.6759999990463257, "learning_rate": 3.109784300270943e-06, "loss": 0.5874, "step": 645 }, { "epoch": 6.325581395348837, "grad_norm": 1.782863736152649, "learning_rate": 3.1048047389991693e-06, "loss": 0.6606, "step": 646 }, { "epoch": 6.335373317013464, "grad_norm": 1.6150490045547485, "learning_rate": 3.0998226277759173e-06, "loss": 0.6275, "step": 647 }, { "epoch": 6.34516523867809, "grad_norm": 2.192833423614502, "learning_rate": 3.094837987606547e-06, "loss": 0.7894, "step": 648 }, { "epoch": 6.3549571603427175, "grad_norm": 1.9683672189712524, "learning_rate": 3.0898508395070825e-06, "loss": 0.7329, "step": 649 }, { "epoch": 6.364749082007344, "grad_norm": 1.8401882648468018, "learning_rate": 3.084861204504122e-06, "loss": 0.523, "step": 650 }, { "epoch": 6.374541003671971, "grad_norm": 1.7800284624099731, "learning_rate": 3.0798691036347455e-06, "loss": 0.5544, "step": 651 }, { "epoch": 6.384332925336597, "grad_norm": 1.6933841705322266, "learning_rate": 3.0748745579464347e-06, "loss": 0.575, "step": 652 }, { "epoch": 6.394124847001224, "grad_norm": 1.978541612625122, "learning_rate": 3.0698775884969738e-06, "loss": 0.6253, "step": 653 }, { "epoch": 6.403916768665851, "grad_norm": 2.2512764930725098, "learning_rate": 3.0648782163543696e-06, "loss": 0.5724, "step": 654 }, { "epoch": 6.413708690330477, "grad_norm": 1.9642492532730103, "learning_rate": 3.059876462596758e-06, "loss": 1.0377, "step": 655 }, { "epoch": 6.423500611995104, "grad_norm": 1.80538010597229, "learning_rate": 3.0548723483123157e-06, "loss": 0.5214, "step": 656 }, { "epoch": 6.43329253365973, "grad_norm": 1.8782613277435303, "learning_rate": 3.049865894599172e-06, "loss": 0.5537, "step": 657 }, { "epoch": 6.443084455324358, "grad_norm": 1.924204707145691, "learning_rate": 3.0448571225653195e-06, "loss": 0.6087, "step": 658 }, { "epoch": 6.452876376988984, "grad_norm": 1.6280007362365723, "learning_rate": 3.039846053328526e-06, "loss": 0.5624, "step": 659 }, { "epoch": 6.462668298653611, "grad_norm": 1.8631033897399902, "learning_rate": 3.0348327080162438e-06, "loss": 0.7005, "step": 660 }, { "epoch": 6.472460220318237, "grad_norm": 1.8842097520828247, "learning_rate": 3.0298171077655214e-06, "loss": 0.6265, "step": 661 }, { "epoch": 6.4822521419828645, "grad_norm": 1.9385743141174316, "learning_rate": 3.0247992737229147e-06, "loss": 0.8134, "step": 662 }, { "epoch": 6.492044063647491, "grad_norm": 1.865721583366394, "learning_rate": 3.019779227044398e-06, "loss": 0.703, "step": 663 }, { "epoch": 6.501835985312118, "grad_norm": 1.7327815294265747, "learning_rate": 3.014756988895275e-06, "loss": 0.5076, "step": 664 }, { "epoch": 6.511627906976744, "grad_norm": 2.0660793781280518, "learning_rate": 3.0097325804500864e-06, "loss": 0.6871, "step": 665 }, { "epoch": 6.5214198286413705, "grad_norm": 2.079118490219116, "learning_rate": 3.0047060228925256e-06, "loss": 0.6912, "step": 666 }, { "epoch": 6.531211750305998, "grad_norm": 1.8385956287384033, "learning_rate": 2.999677337415347e-06, "loss": 0.6342, "step": 667 }, { "epoch": 6.541003671970624, "grad_norm": 1.9039627313613892, "learning_rate": 2.994646545220275e-06, "loss": 0.5804, "step": 668 }, { "epoch": 6.550795593635251, "grad_norm": 1.8720134496688843, "learning_rate": 2.9896136675179177e-06, "loss": 0.6485, "step": 669 }, { "epoch": 6.560587515299877, "grad_norm": 1.7179137468338013, "learning_rate": 2.9845787255276753e-06, "loss": 0.6173, "step": 670 }, { "epoch": 6.5703794369645045, "grad_norm": 1.690687656402588, "learning_rate": 2.979541740477652e-06, "loss": 0.7013, "step": 671 }, { "epoch": 6.580171358629131, "grad_norm": 2.0336570739746094, "learning_rate": 2.9745027336045652e-06, "loss": 0.7872, "step": 672 }, { "epoch": 6.589963280293758, "grad_norm": 1.6374056339263916, "learning_rate": 2.9694617261536564e-06, "loss": 0.4962, "step": 673 }, { "epoch": 6.599755201958384, "grad_norm": 1.7813180685043335, "learning_rate": 2.964418739378603e-06, "loss": 0.5496, "step": 674 }, { "epoch": 6.6095471236230114, "grad_norm": 1.7465476989746094, "learning_rate": 2.9593737945414264e-06, "loss": 0.4812, "step": 675 }, { "epoch": 6.619339045287638, "grad_norm": 1.694322943687439, "learning_rate": 2.954326912912404e-06, "loss": 0.5466, "step": 676 }, { "epoch": 6.629130966952264, "grad_norm": 2.09624981880188, "learning_rate": 2.9492781157699794e-06, "loss": 0.7137, "step": 677 }, { "epoch": 6.638922888616891, "grad_norm": 1.7662960290908813, "learning_rate": 2.9442274244006725e-06, "loss": 0.3653, "step": 678 }, { "epoch": 6.6487148102815175, "grad_norm": 1.9928301572799683, "learning_rate": 2.939174860098987e-06, "loss": 0.8323, "step": 679 }, { "epoch": 6.658506731946145, "grad_norm": 2.4069645404815674, "learning_rate": 2.9341204441673267e-06, "loss": 0.7601, "step": 680 }, { "epoch": 6.668298653610771, "grad_norm": 1.888750433921814, "learning_rate": 2.9290641979158998e-06, "loss": 0.6114, "step": 681 }, { "epoch": 6.678090575275398, "grad_norm": 1.9744417667388916, "learning_rate": 2.924006142662632e-06, "loss": 0.6849, "step": 682 }, { "epoch": 6.687882496940024, "grad_norm": 1.67307710647583, "learning_rate": 2.918946299733077e-06, "loss": 0.6376, "step": 683 }, { "epoch": 6.6976744186046515, "grad_norm": 1.9306859970092773, "learning_rate": 2.913884690460325e-06, "loss": 0.6066, "step": 684 }, { "epoch": 6.707466340269278, "grad_norm": 1.6338527202606201, "learning_rate": 2.9088213361849127e-06, "loss": 0.6716, "step": 685 }, { "epoch": 6.717258261933905, "grad_norm": 1.5537396669387817, "learning_rate": 2.903756258254734e-06, "loss": 0.6085, "step": 686 }, { "epoch": 6.727050183598531, "grad_norm": 1.964795470237732, "learning_rate": 2.8986894780249524e-06, "loss": 0.8045, "step": 687 }, { "epoch": 6.7368421052631575, "grad_norm": 1.771888256072998, "learning_rate": 2.8936210168579043e-06, "loss": 0.5249, "step": 688 }, { "epoch": 6.746634026927785, "grad_norm": 1.6270524263381958, "learning_rate": 2.888550896123018e-06, "loss": 0.4082, "step": 689 }, { "epoch": 6.756425948592411, "grad_norm": 2.190917730331421, "learning_rate": 2.883479137196714e-06, "loss": 1.0732, "step": 690 }, { "epoch": 6.766217870257038, "grad_norm": 2.018854856491089, "learning_rate": 2.8784057614623227e-06, "loss": 0.6135, "step": 691 }, { "epoch": 6.776009791921664, "grad_norm": 1.9348071813583374, "learning_rate": 2.8733307903099926e-06, "loss": 0.5462, "step": 692 }, { "epoch": 6.785801713586292, "grad_norm": 2.208258628845215, "learning_rate": 2.8682542451365943e-06, "loss": 0.7406, "step": 693 }, { "epoch": 6.795593635250918, "grad_norm": 1.6801481246948242, "learning_rate": 2.8631761473456377e-06, "loss": 0.4516, "step": 694 }, { "epoch": 6.805385556915545, "grad_norm": 1.924139142036438, "learning_rate": 2.8580965183471794e-06, "loss": 0.7314, "step": 695 }, { "epoch": 6.815177478580171, "grad_norm": 1.861464500427246, "learning_rate": 2.853015379557729e-06, "loss": 0.8131, "step": 696 }, { "epoch": 6.8249694002447985, "grad_norm": 2.1084578037261963, "learning_rate": 2.847932752400164e-06, "loss": 0.7231, "step": 697 }, { "epoch": 6.834761321909425, "grad_norm": 1.868126630783081, "learning_rate": 2.842848658303637e-06, "loss": 0.7356, "step": 698 }, { "epoch": 6.844553243574051, "grad_norm": 1.8570178747177124, "learning_rate": 2.8377631187034825e-06, "loss": 0.6783, "step": 699 }, { "epoch": 6.854345165238678, "grad_norm": 1.925726056098938, "learning_rate": 2.832676155041135e-06, "loss": 0.6231, "step": 700 }, { "epoch": 6.8641370869033045, "grad_norm": 1.8818365335464478, "learning_rate": 2.8275877887640267e-06, "loss": 0.599, "step": 701 }, { "epoch": 6.873929008567932, "grad_norm": 1.5818636417388916, "learning_rate": 2.822498041325509e-06, "loss": 0.3679, "step": 702 }, { "epoch": 6.883720930232558, "grad_norm": 1.7425843477249146, "learning_rate": 2.817406934184752e-06, "loss": 0.6365, "step": 703 }, { "epoch": 6.893512851897185, "grad_norm": 1.8083233833312988, "learning_rate": 2.8123144888066623e-06, "loss": 0.5843, "step": 704 }, { "epoch": 6.903304773561811, "grad_norm": 1.962203860282898, "learning_rate": 2.8072207266617856e-06, "loss": 0.7329, "step": 705 }, { "epoch": 6.9130966952264385, "grad_norm": 1.8940486907958984, "learning_rate": 2.802125669226222e-06, "loss": 0.6155, "step": 706 }, { "epoch": 6.922888616891065, "grad_norm": 1.7155791521072388, "learning_rate": 2.79702933798153e-06, "loss": 0.6855, "step": 707 }, { "epoch": 6.932680538555692, "grad_norm": 2.0107507705688477, "learning_rate": 2.7919317544146405e-06, "loss": 0.5591, "step": 708 }, { "epoch": 6.942472460220318, "grad_norm": 1.5478901863098145, "learning_rate": 2.786832940017766e-06, "loss": 0.6073, "step": 709 }, { "epoch": 6.9522643818849446, "grad_norm": 1.9494494199752808, "learning_rate": 2.7817329162883033e-06, "loss": 0.7065, "step": 710 }, { "epoch": 6.962056303549572, "grad_norm": 1.562627911567688, "learning_rate": 2.776631704728752e-06, "loss": 0.6088, "step": 711 }, { "epoch": 6.971848225214198, "grad_norm": 1.920892596244812, "learning_rate": 2.7715293268466204e-06, "loss": 0.7302, "step": 712 }, { "epoch": 6.981640146878825, "grad_norm": 1.8191511631011963, "learning_rate": 2.7664258041543306e-06, "loss": 0.6296, "step": 713 }, { "epoch": 6.9914320685434515, "grad_norm": 1.7295955419540405, "learning_rate": 2.761321158169134e-06, "loss": 0.4896, "step": 714 }, { "epoch": 7.001223990208079, "grad_norm": 4.760815620422363, "learning_rate": 2.7562154104130177e-06, "loss": 1.2699, "step": 715 }, { "epoch": 7.011015911872705, "grad_norm": 1.8637394905090332, "learning_rate": 2.7511085824126133e-06, "loss": 0.6226, "step": 716 }, { "epoch": 7.020807833537332, "grad_norm": 1.7756836414337158, "learning_rate": 2.746000695699107e-06, "loss": 0.8752, "step": 717 }, { "epoch": 7.030599755201958, "grad_norm": 1.726749062538147, "learning_rate": 2.74089177180815e-06, "loss": 0.5424, "step": 718 }, { "epoch": 7.0403916768665855, "grad_norm": 1.7784234285354614, "learning_rate": 2.7357818322797643e-06, "loss": 0.5583, "step": 719 }, { "epoch": 7.050183598531212, "grad_norm": 2.115105152130127, "learning_rate": 2.730670898658255e-06, "loss": 0.6348, "step": 720 }, { "epoch": 7.059975520195838, "grad_norm": 1.9012866020202637, "learning_rate": 2.72555899249212e-06, "loss": 0.5347, "step": 721 }, { "epoch": 7.069767441860465, "grad_norm": 1.9426422119140625, "learning_rate": 2.7204461353339546e-06, "loss": 0.7656, "step": 722 }, { "epoch": 7.0795593635250915, "grad_norm": 1.6899685859680176, "learning_rate": 2.7153323487403653e-06, "loss": 0.5921, "step": 723 }, { "epoch": 7.089351285189719, "grad_norm": 2.0799875259399414, "learning_rate": 2.7102176542718783e-06, "loss": 0.4787, "step": 724 }, { "epoch": 7.099143206854345, "grad_norm": 1.8492670059204102, "learning_rate": 2.705102073492845e-06, "loss": 0.6167, "step": 725 }, { "epoch": 7.108935128518972, "grad_norm": 1.7344064712524414, "learning_rate": 2.699985627971354e-06, "loss": 0.5123, "step": 726 }, { "epoch": 7.118727050183598, "grad_norm": 2.190241813659668, "learning_rate": 2.694868339279144e-06, "loss": 0.5343, "step": 727 }, { "epoch": 7.128518971848226, "grad_norm": 2.0244648456573486, "learning_rate": 2.689750228991503e-06, "loss": 0.5417, "step": 728 }, { "epoch": 7.138310893512852, "grad_norm": 2.1997575759887695, "learning_rate": 2.684631318687185e-06, "loss": 0.8798, "step": 729 }, { "epoch": 7.148102815177479, "grad_norm": 2.041642427444458, "learning_rate": 2.679511629948319e-06, "loss": 0.6131, "step": 730 }, { "epoch": 7.157894736842105, "grad_norm": 2.021385669708252, "learning_rate": 2.6743911843603134e-06, "loss": 0.4983, "step": 731 }, { "epoch": 7.167686658506732, "grad_norm": 2.0443131923675537, "learning_rate": 2.669270003511769e-06, "loss": 0.6613, "step": 732 }, { "epoch": 7.177478580171359, "grad_norm": 1.804967999458313, "learning_rate": 2.6641481089943854e-06, "loss": 0.5718, "step": 733 }, { "epoch": 7.187270501835985, "grad_norm": 2.022339105606079, "learning_rate": 2.6590255224028725e-06, "loss": 0.6902, "step": 734 }, { "epoch": 7.197062423500612, "grad_norm": 1.7102774381637573, "learning_rate": 2.6539022653348577e-06, "loss": 0.6214, "step": 735 }, { "epoch": 7.2068543451652385, "grad_norm": 1.6640324592590332, "learning_rate": 2.648778359390794e-06, "loss": 0.5834, "step": 736 }, { "epoch": 7.216646266829866, "grad_norm": 1.8334426879882812, "learning_rate": 2.6436538261738715e-06, "loss": 0.6683, "step": 737 }, { "epoch": 7.226438188494492, "grad_norm": 1.5923433303833008, "learning_rate": 2.638528687289925e-06, "loss": 0.4182, "step": 738 }, { "epoch": 7.236230110159119, "grad_norm": 1.8124734163284302, "learning_rate": 2.6334029643473426e-06, "loss": 0.7483, "step": 739 }, { "epoch": 7.246022031823745, "grad_norm": 1.8725506067276, "learning_rate": 2.6282766789569742e-06, "loss": 0.6136, "step": 740 }, { "epoch": 7.2558139534883725, "grad_norm": 1.7662770748138428, "learning_rate": 2.6231498527320426e-06, "loss": 0.6287, "step": 741 }, { "epoch": 7.265605875152999, "grad_norm": 1.8419501781463623, "learning_rate": 2.618022507288049e-06, "loss": 0.7921, "step": 742 }, { "epoch": 7.275397796817625, "grad_norm": 1.912237524986267, "learning_rate": 2.6128946642426844e-06, "loss": 0.7325, "step": 743 }, { "epoch": 7.285189718482252, "grad_norm": 1.9951177835464478, "learning_rate": 2.6077663452157398e-06, "loss": 0.5097, "step": 744 }, { "epoch": 7.2949816401468786, "grad_norm": 1.8831316232681274, "learning_rate": 2.602637571829009e-06, "loss": 0.5866, "step": 745 }, { "epoch": 7.304773561811506, "grad_norm": 1.633870244026184, "learning_rate": 2.5975083657062043e-06, "loss": 0.4931, "step": 746 }, { "epoch": 7.314565483476132, "grad_norm": 1.7254648208618164, "learning_rate": 2.592378748472863e-06, "loss": 0.465, "step": 747 }, { "epoch": 7.324357405140759, "grad_norm": 1.7281690835952759, "learning_rate": 2.587248741756253e-06, "loss": 0.4458, "step": 748 }, { "epoch": 7.3341493268053854, "grad_norm": 1.8751955032348633, "learning_rate": 2.582118367185287e-06, "loss": 0.6052, "step": 749 }, { "epoch": 7.343941248470013, "grad_norm": 1.701616883277893, "learning_rate": 2.576987646390426e-06, "loss": 0.5438, "step": 750 }, { "epoch": 7.353733170134639, "grad_norm": 1.8297240734100342, "learning_rate": 2.5718566010035943e-06, "loss": 0.6359, "step": 751 }, { "epoch": 7.363525091799266, "grad_norm": 2.0527989864349365, "learning_rate": 2.566725252658081e-06, "loss": 0.8555, "step": 752 }, { "epoch": 7.373317013463892, "grad_norm": 2.0145673751831055, "learning_rate": 2.561593622988456e-06, "loss": 0.6158, "step": 753 }, { "epoch": 7.383108935128519, "grad_norm": 1.6125619411468506, "learning_rate": 2.5564617336304703e-06, "loss": 0.4014, "step": 754 }, { "epoch": 7.392900856793146, "grad_norm": 2.2305614948272705, "learning_rate": 2.5513296062209766e-06, "loss": 0.5481, "step": 755 }, { "epoch": 7.402692778457772, "grad_norm": 2.106675386428833, "learning_rate": 2.546197262397825e-06, "loss": 0.5079, "step": 756 }, { "epoch": 7.412484700122399, "grad_norm": 1.9670437574386597, "learning_rate": 2.5410647237997822e-06, "loss": 0.5576, "step": 757 }, { "epoch": 7.4222766217870255, "grad_norm": 1.8804761171340942, "learning_rate": 2.535932012066434e-06, "loss": 0.5802, "step": 758 }, { "epoch": 7.432068543451653, "grad_norm": 1.876996397972107, "learning_rate": 2.530799148838096e-06, "loss": 0.7875, "step": 759 }, { "epoch": 7.441860465116279, "grad_norm": 1.9186457395553589, "learning_rate": 2.525666155755725e-06, "loss": 0.5284, "step": 760 }, { "epoch": 7.451652386780906, "grad_norm": 1.8182743787765503, "learning_rate": 2.5205330544608224e-06, "loss": 0.5017, "step": 761 }, { "epoch": 7.461444308445532, "grad_norm": 2.0103161334991455, "learning_rate": 2.515399866595347e-06, "loss": 0.7079, "step": 762 }, { "epoch": 7.47123623011016, "grad_norm": 1.818121314048767, "learning_rate": 2.5102666138016225e-06, "loss": 0.824, "step": 763 }, { "epoch": 7.481028151774786, "grad_norm": 1.7930558919906616, "learning_rate": 2.5051333177222476e-06, "loss": 0.5519, "step": 764 }, { "epoch": 7.490820073439412, "grad_norm": 1.8278942108154297, "learning_rate": 2.5e-06, "loss": 0.4696, "step": 765 }, { "epoch": 7.500611995104039, "grad_norm": 1.8447648286819458, "learning_rate": 2.4948666822777536e-06, "loss": 0.5293, "step": 766 }, { "epoch": 7.510403916768666, "grad_norm": 1.9343880414962769, "learning_rate": 2.4897333861983783e-06, "loss": 0.7387, "step": 767 }, { "epoch": 7.520195838433293, "grad_norm": 1.809899926185608, "learning_rate": 2.4846001334046537e-06, "loss": 0.6534, "step": 768 }, { "epoch": 7.529987760097919, "grad_norm": 1.6889994144439697, "learning_rate": 2.479466945539178e-06, "loss": 0.6182, "step": 769 }, { "epoch": 7.539779681762546, "grad_norm": 1.8474067449569702, "learning_rate": 2.474333844244276e-06, "loss": 0.5644, "step": 770 }, { "epoch": 7.5495716034271725, "grad_norm": 1.719081163406372, "learning_rate": 2.4692008511619042e-06, "loss": 0.4232, "step": 771 }, { "epoch": 7.5593635250918, "grad_norm": 2.1085290908813477, "learning_rate": 2.464067987933567e-06, "loss": 0.4834, "step": 772 }, { "epoch": 7.569155446756426, "grad_norm": 1.8854507207870483, "learning_rate": 2.458935276200219e-06, "loss": 0.6594, "step": 773 }, { "epoch": 7.578947368421053, "grad_norm": 1.779068112373352, "learning_rate": 2.453802737602176e-06, "loss": 0.5707, "step": 774 }, { "epoch": 7.588739290085679, "grad_norm": 1.771462082862854, "learning_rate": 2.4486703937790243e-06, "loss": 0.6949, "step": 775 }, { "epoch": 7.598531211750306, "grad_norm": 1.7813142538070679, "learning_rate": 2.4435382663695305e-06, "loss": 0.5743, "step": 776 }, { "epoch": 7.608323133414933, "grad_norm": 2.15559720993042, "learning_rate": 2.438406377011546e-06, "loss": 0.7055, "step": 777 }, { "epoch": 7.618115055079559, "grad_norm": 1.6817296743392944, "learning_rate": 2.4332747473419193e-06, "loss": 0.4256, "step": 778 }, { "epoch": 7.627906976744186, "grad_norm": 1.9564720392227173, "learning_rate": 2.428143398996407e-06, "loss": 0.7568, "step": 779 }, { "epoch": 7.6376988984088126, "grad_norm": 2.0500922203063965, "learning_rate": 2.4230123536095746e-06, "loss": 0.7748, "step": 780 }, { "epoch": 7.64749082007344, "grad_norm": 1.9276511669158936, "learning_rate": 2.417881632814714e-06, "loss": 0.8587, "step": 781 }, { "epoch": 7.657282741738066, "grad_norm": 2.074822425842285, "learning_rate": 2.4127512582437486e-06, "loss": 0.3384, "step": 782 }, { "epoch": 7.667074663402692, "grad_norm": 1.8807878494262695, "learning_rate": 2.4076212515271384e-06, "loss": 0.6684, "step": 783 }, { "epoch": 7.6768665850673194, "grad_norm": 1.837026596069336, "learning_rate": 2.4024916342937966e-06, "loss": 0.5543, "step": 784 }, { "epoch": 7.686658506731947, "grad_norm": 1.8804985284805298, "learning_rate": 2.3973624281709924e-06, "loss": 0.6623, "step": 785 }, { "epoch": 7.696450428396573, "grad_norm": 1.693087100982666, "learning_rate": 2.392233654784262e-06, "loss": 0.4938, "step": 786 }, { "epoch": 7.706242350061199, "grad_norm": 2.094351053237915, "learning_rate": 2.387105335757316e-06, "loss": 0.5744, "step": 787 }, { "epoch": 7.716034271725826, "grad_norm": 2.0713577270507812, "learning_rate": 2.3819774927119523e-06, "loss": 0.5779, "step": 788 }, { "epoch": 7.725826193390453, "grad_norm": 1.81289803981781, "learning_rate": 2.376850147267958e-06, "loss": 0.5983, "step": 789 }, { "epoch": 7.73561811505508, "grad_norm": 2.0912883281707764, "learning_rate": 2.3717233210430258e-06, "loss": 0.7409, "step": 790 }, { "epoch": 7.745410036719706, "grad_norm": 2.0038065910339355, "learning_rate": 2.3665970356526574e-06, "loss": 0.6263, "step": 791 }, { "epoch": 7.755201958384333, "grad_norm": 2.0850038528442383, "learning_rate": 2.3614713127100752e-06, "loss": 0.5604, "step": 792 }, { "epoch": 7.7649938800489595, "grad_norm": 1.7946621179580688, "learning_rate": 2.3563461738261285e-06, "loss": 0.5541, "step": 793 }, { "epoch": 7.774785801713586, "grad_norm": 1.7942308187484741, "learning_rate": 2.3512216406092066e-06, "loss": 0.6226, "step": 794 }, { "epoch": 7.784577723378213, "grad_norm": 1.7821414470672607, "learning_rate": 2.346097734665143e-06, "loss": 0.6163, "step": 795 }, { "epoch": 7.79436964504284, "grad_norm": 1.9595634937286377, "learning_rate": 2.340974477597128e-06, "loss": 0.5784, "step": 796 }, { "epoch": 7.804161566707466, "grad_norm": 1.9986389875411987, "learning_rate": 2.3358518910056146e-06, "loss": 0.7621, "step": 797 }, { "epoch": 7.813953488372093, "grad_norm": 1.8178908824920654, "learning_rate": 2.3307299964882314e-06, "loss": 0.4573, "step": 798 }, { "epoch": 7.82374541003672, "grad_norm": 2.3029026985168457, "learning_rate": 2.325608815639687e-06, "loss": 0.5455, "step": 799 }, { "epoch": 7.833537331701346, "grad_norm": 1.98384428024292, "learning_rate": 2.3204883700516813e-06, "loss": 0.6872, "step": 800 }, { "epoch": 7.843329253365973, "grad_norm": 1.94217050075531, "learning_rate": 2.3153686813128153e-06, "loss": 0.6273, "step": 801 }, { "epoch": 7.8531211750306, "grad_norm": 1.629888653755188, "learning_rate": 2.310249771008498e-06, "loss": 0.5389, "step": 802 }, { "epoch": 7.862913096695227, "grad_norm": 1.826710820198059, "learning_rate": 2.3051316607208566e-06, "loss": 0.6932, "step": 803 }, { "epoch": 7.872705018359853, "grad_norm": 2.059861183166504, "learning_rate": 2.3000143720286463e-06, "loss": 0.759, "step": 804 }, { "epoch": 7.882496940024479, "grad_norm": 1.761783242225647, "learning_rate": 2.2948979265071565e-06, "loss": 0.5921, "step": 805 }, { "epoch": 7.8922888616891065, "grad_norm": 2.003918170928955, "learning_rate": 2.2897823457281225e-06, "loss": 0.5106, "step": 806 }, { "epoch": 7.902080783353734, "grad_norm": 1.9686771631240845, "learning_rate": 2.2846676512596355e-06, "loss": 0.6705, "step": 807 }, { "epoch": 7.91187270501836, "grad_norm": 2.0167222023010254, "learning_rate": 2.2795538646660462e-06, "loss": 0.4546, "step": 808 }, { "epoch": 7.921664626682986, "grad_norm": 1.9226226806640625, "learning_rate": 2.2744410075078805e-06, "loss": 0.6759, "step": 809 }, { "epoch": 7.931456548347613, "grad_norm": 1.9938815832138062, "learning_rate": 2.269329101341745e-06, "loss": 0.8693, "step": 810 }, { "epoch": 7.94124847001224, "grad_norm": 1.6467808485031128, "learning_rate": 2.2642181677202366e-06, "loss": 0.532, "step": 811 }, { "epoch": 7.951040391676867, "grad_norm": 1.8960227966308594, "learning_rate": 2.2591082281918507e-06, "loss": 0.5151, "step": 812 }, { "epoch": 7.960832313341493, "grad_norm": 1.9753961563110352, "learning_rate": 2.2539993043008936e-06, "loss": 0.6028, "step": 813 }, { "epoch": 7.97062423500612, "grad_norm": 1.9453445672988892, "learning_rate": 2.2488914175873876e-06, "loss": 0.6545, "step": 814 }, { "epoch": 7.9804161566707466, "grad_norm": 2.058900833129883, "learning_rate": 2.2437845895869827e-06, "loss": 0.3995, "step": 815 }, { "epoch": 7.990208078335373, "grad_norm": 1.9905152320861816, "learning_rate": 2.238678841830867e-06, "loss": 0.5753, "step": 816 }, { "epoch": 8.0, "grad_norm": 4.995258331298828, "learning_rate": 2.2335741958456703e-06, "loss": 1.2088, "step": 817 }, { "epoch": 8.009791921664627, "grad_norm": 2.1932873725891113, "learning_rate": 2.2284706731533805e-06, "loss": 0.5763, "step": 818 }, { "epoch": 8.019583843329253, "grad_norm": 1.9897724390029907, "learning_rate": 2.2233682952712484e-06, "loss": 0.5859, "step": 819 }, { "epoch": 8.02937576499388, "grad_norm": 1.7052136659622192, "learning_rate": 2.2182670837116975e-06, "loss": 0.5236, "step": 820 }, { "epoch": 8.039167686658507, "grad_norm": 1.6906954050064087, "learning_rate": 2.213167059982235e-06, "loss": 0.6403, "step": 821 }, { "epoch": 8.048959608323134, "grad_norm": 1.9218356609344482, "learning_rate": 2.20806824558536e-06, "loss": 0.3448, "step": 822 }, { "epoch": 8.05875152998776, "grad_norm": 1.807294249534607, "learning_rate": 2.202970662018471e-06, "loss": 0.7681, "step": 823 }, { "epoch": 8.068543451652387, "grad_norm": 2.0794694423675537, "learning_rate": 2.197874330773779e-06, "loss": 0.8907, "step": 824 }, { "epoch": 8.078335373317014, "grad_norm": 1.7886254787445068, "learning_rate": 2.1927792733382153e-06, "loss": 0.3702, "step": 825 }, { "epoch": 8.088127294981641, "grad_norm": 1.7085249423980713, "learning_rate": 2.1876855111933385e-06, "loss": 0.5034, "step": 826 }, { "epoch": 8.097919216646266, "grad_norm": 2.4030823707580566, "learning_rate": 2.182593065815249e-06, "loss": 0.5923, "step": 827 }, { "epoch": 8.107711138310894, "grad_norm": 1.7696202993392944, "learning_rate": 2.1775019586744924e-06, "loss": 0.3931, "step": 828 }, { "epoch": 8.11750305997552, "grad_norm": 1.8768678903579712, "learning_rate": 2.172412211235974e-06, "loss": 0.5783, "step": 829 }, { "epoch": 8.127294981640146, "grad_norm": 2.108625888824463, "learning_rate": 2.167323844958867e-06, "loss": 0.5142, "step": 830 }, { "epoch": 8.137086903304773, "grad_norm": 1.8129931688308716, "learning_rate": 2.1622368812965184e-06, "loss": 0.4732, "step": 831 }, { "epoch": 8.1468788249694, "grad_norm": 1.9590007066726685, "learning_rate": 2.1571513416963647e-06, "loss": 0.8152, "step": 832 }, { "epoch": 8.156670746634028, "grad_norm": 1.7841365337371826, "learning_rate": 2.1520672475998374e-06, "loss": 0.845, "step": 833 }, { "epoch": 8.166462668298653, "grad_norm": 1.9791157245635986, "learning_rate": 2.1469846204422724e-06, "loss": 0.6278, "step": 834 }, { "epoch": 8.17625458996328, "grad_norm": 2.1543097496032715, "learning_rate": 2.141903481652822e-06, "loss": 0.4332, "step": 835 }, { "epoch": 8.186046511627907, "grad_norm": 2.0894935131073, "learning_rate": 2.136823852654363e-06, "loss": 0.4529, "step": 836 }, { "epoch": 8.195838433292534, "grad_norm": 2.095534086227417, "learning_rate": 2.131745754863406e-06, "loss": 0.4827, "step": 837 }, { "epoch": 8.20563035495716, "grad_norm": 2.0144567489624023, "learning_rate": 2.126669209690008e-06, "loss": 0.5765, "step": 838 }, { "epoch": 8.215422276621787, "grad_norm": 1.7944990396499634, "learning_rate": 2.121594238537677e-06, "loss": 0.5062, "step": 839 }, { "epoch": 8.225214198286414, "grad_norm": 1.9306622743606567, "learning_rate": 2.1165208628032863e-06, "loss": 0.5132, "step": 840 }, { "epoch": 8.23500611995104, "grad_norm": 2.0294885635375977, "learning_rate": 2.111449103876983e-06, "loss": 0.5609, "step": 841 }, { "epoch": 8.244798041615667, "grad_norm": 1.8263746500015259, "learning_rate": 2.1063789831420957e-06, "loss": 0.6118, "step": 842 }, { "epoch": 8.254589963280294, "grad_norm": 1.961182951927185, "learning_rate": 2.101310521975048e-06, "loss": 0.6572, "step": 843 }, { "epoch": 8.264381884944921, "grad_norm": 2.1316161155700684, "learning_rate": 2.096243741745266e-06, "loss": 0.6411, "step": 844 }, { "epoch": 8.274173806609546, "grad_norm": 1.720635175704956, "learning_rate": 2.0911786638150873e-06, "loss": 0.6301, "step": 845 }, { "epoch": 8.283965728274174, "grad_norm": 1.7557703256607056, "learning_rate": 2.086115309539675e-06, "loss": 0.4931, "step": 846 }, { "epoch": 8.2937576499388, "grad_norm": 2.0891196727752686, "learning_rate": 2.081053700266923e-06, "loss": 0.6566, "step": 847 }, { "epoch": 8.303549571603428, "grad_norm": 2.166632890701294, "learning_rate": 2.0759938573373683e-06, "loss": 0.5108, "step": 848 }, { "epoch": 8.313341493268053, "grad_norm": 2.049032211303711, "learning_rate": 2.0709358020841006e-06, "loss": 0.6702, "step": 849 }, { "epoch": 8.32313341493268, "grad_norm": 1.6838774681091309, "learning_rate": 2.0658795558326745e-06, "loss": 0.6044, "step": 850 }, { "epoch": 8.332925336597308, "grad_norm": 1.7359877824783325, "learning_rate": 2.0608251399010136e-06, "loss": 0.5029, "step": 851 }, { "epoch": 8.342717258261933, "grad_norm": 1.6999928951263428, "learning_rate": 2.0557725755993283e-06, "loss": 0.4285, "step": 852 }, { "epoch": 8.35250917992656, "grad_norm": 2.0318007469177246, "learning_rate": 2.050721884230021e-06, "loss": 0.5602, "step": 853 }, { "epoch": 8.362301101591187, "grad_norm": 1.677319049835205, "learning_rate": 2.0456730870875964e-06, "loss": 0.7356, "step": 854 }, { "epoch": 8.372093023255815, "grad_norm": 1.8247665166854858, "learning_rate": 2.040626205458574e-06, "loss": 0.6258, "step": 855 }, { "epoch": 8.38188494492044, "grad_norm": 1.9809565544128418, "learning_rate": 2.035581260621398e-06, "loss": 0.3715, "step": 856 }, { "epoch": 8.391676866585067, "grad_norm": 1.8528735637664795, "learning_rate": 2.0305382738463444e-06, "loss": 0.4909, "step": 857 }, { "epoch": 8.401468788249694, "grad_norm": 2.332023859024048, "learning_rate": 2.0254972663954356e-06, "loss": 0.5047, "step": 858 }, { "epoch": 8.411260709914322, "grad_norm": 1.862711787223816, "learning_rate": 2.0204582595223486e-06, "loss": 0.701, "step": 859 }, { "epoch": 8.421052631578947, "grad_norm": 1.8858460187911987, "learning_rate": 2.015421274472325e-06, "loss": 0.5339, "step": 860 }, { "epoch": 8.430844553243574, "grad_norm": 1.9136649370193481, "learning_rate": 2.010386332482083e-06, "loss": 0.5349, "step": 861 }, { "epoch": 8.440636474908201, "grad_norm": 2.065195322036743, "learning_rate": 2.005353454779726e-06, "loss": 0.6783, "step": 862 }, { "epoch": 8.450428396572827, "grad_norm": 1.888999581336975, "learning_rate": 2.0003226625846535e-06, "loss": 0.6943, "step": 863 }, { "epoch": 8.460220318237454, "grad_norm": 1.8780450820922852, "learning_rate": 1.995293977107475e-06, "loss": 0.7216, "step": 864 }, { "epoch": 8.470012239902081, "grad_norm": 1.819907546043396, "learning_rate": 1.9902674195499144e-06, "loss": 0.576, "step": 865 }, { "epoch": 8.479804161566708, "grad_norm": 2.0120952129364014, "learning_rate": 1.9852430111047254e-06, "loss": 0.5428, "step": 866 }, { "epoch": 8.489596083231334, "grad_norm": 1.8431098461151123, "learning_rate": 1.9802207729556023e-06, "loss": 0.6222, "step": 867 }, { "epoch": 8.49938800489596, "grad_norm": 1.797861099243164, "learning_rate": 1.9752007262770857e-06, "loss": 0.6064, "step": 868 }, { "epoch": 8.509179926560588, "grad_norm": 1.8799742460250854, "learning_rate": 1.970182892234479e-06, "loss": 0.5887, "step": 869 }, { "epoch": 8.518971848225215, "grad_norm": 1.888797640800476, "learning_rate": 1.965167291983757e-06, "loss": 0.8209, "step": 870 }, { "epoch": 8.52876376988984, "grad_norm": 2.052819013595581, "learning_rate": 1.9601539466714747e-06, "loss": 0.563, "step": 871 }, { "epoch": 8.538555691554468, "grad_norm": 1.9499998092651367, "learning_rate": 1.955142877434681e-06, "loss": 0.6843, "step": 872 }, { "epoch": 8.548347613219095, "grad_norm": 1.7014323472976685, "learning_rate": 1.9501341054008292e-06, "loss": 0.5014, "step": 873 }, { "epoch": 8.55813953488372, "grad_norm": 1.8305422067642212, "learning_rate": 1.9451276516876856e-06, "loss": 0.6123, "step": 874 }, { "epoch": 8.567931456548347, "grad_norm": 1.91391921043396, "learning_rate": 1.9401235374032427e-06, "loss": 0.4914, "step": 875 }, { "epoch": 8.577723378212974, "grad_norm": 1.8856854438781738, "learning_rate": 1.9351217836456316e-06, "loss": 0.5066, "step": 876 }, { "epoch": 8.587515299877602, "grad_norm": 1.7935293912887573, "learning_rate": 1.9301224115030275e-06, "loss": 0.6096, "step": 877 }, { "epoch": 8.597307221542227, "grad_norm": 1.7831039428710938, "learning_rate": 1.9251254420535665e-06, "loss": 0.4746, "step": 878 }, { "epoch": 8.607099143206854, "grad_norm": 2.0652408599853516, "learning_rate": 1.9201308963652553e-06, "loss": 0.4253, "step": 879 }, { "epoch": 8.616891064871481, "grad_norm": 2.050835371017456, "learning_rate": 1.9151387954958792e-06, "loss": 0.6455, "step": 880 }, { "epoch": 8.626682986536107, "grad_norm": 2.0856056213378906, "learning_rate": 1.910149160492918e-06, "loss": 0.6813, "step": 881 }, { "epoch": 8.636474908200734, "grad_norm": 1.8683456182479858, "learning_rate": 1.9051620123934538e-06, "loss": 0.6174, "step": 882 }, { "epoch": 8.646266829865361, "grad_norm": 1.859810471534729, "learning_rate": 1.9001773722240837e-06, "loss": 0.5656, "step": 883 }, { "epoch": 8.656058751529988, "grad_norm": 1.8673434257507324, "learning_rate": 1.895195261000831e-06, "loss": 0.5782, "step": 884 }, { "epoch": 8.665850673194614, "grad_norm": 1.845432162284851, "learning_rate": 1.8902156997290571e-06, "loss": 0.547, "step": 885 }, { "epoch": 8.67564259485924, "grad_norm": 2.135610342025757, "learning_rate": 1.885238709403372e-06, "loss": 0.5115, "step": 886 }, { "epoch": 8.685434516523868, "grad_norm": 1.976205825805664, "learning_rate": 1.880264311007547e-06, "loss": 0.5706, "step": 887 }, { "epoch": 8.695226438188495, "grad_norm": 1.9482977390289307, "learning_rate": 1.8752925255144228e-06, "loss": 0.4883, "step": 888 }, { "epoch": 8.70501835985312, "grad_norm": 1.7764558792114258, "learning_rate": 1.8703233738858267e-06, "loss": 0.4413, "step": 889 }, { "epoch": 8.714810281517748, "grad_norm": 1.7509212493896484, "learning_rate": 1.8653568770724805e-06, "loss": 0.626, "step": 890 }, { "epoch": 8.724602203182375, "grad_norm": 1.6893430948257446, "learning_rate": 1.860393056013911e-06, "loss": 0.5212, "step": 891 }, { "epoch": 8.734394124847, "grad_norm": 1.8409041166305542, "learning_rate": 1.8554319316383657e-06, "loss": 0.5825, "step": 892 }, { "epoch": 8.744186046511627, "grad_norm": 1.6444921493530273, "learning_rate": 1.8504735248627223e-06, "loss": 0.4588, "step": 893 }, { "epoch": 8.753977968176255, "grad_norm": 1.9400769472122192, "learning_rate": 1.8455178565923993e-06, "loss": 0.53, "step": 894 }, { "epoch": 8.763769889840882, "grad_norm": 1.9518778324127197, "learning_rate": 1.84056494772127e-06, "loss": 0.6626, "step": 895 }, { "epoch": 8.773561811505507, "grad_norm": 1.8867392539978027, "learning_rate": 1.8356148191315753e-06, "loss": 0.3985, "step": 896 }, { "epoch": 8.783353733170134, "grad_norm": 1.8398773670196533, "learning_rate": 1.8306674916938303e-06, "loss": 0.5727, "step": 897 }, { "epoch": 8.793145654834762, "grad_norm": 1.9605438709259033, "learning_rate": 1.8257229862667437e-06, "loss": 0.5975, "step": 898 }, { "epoch": 8.802937576499389, "grad_norm": 2.0399532318115234, "learning_rate": 1.8207813236971233e-06, "loss": 0.6591, "step": 899 }, { "epoch": 8.812729498164014, "grad_norm": 1.8264209032058716, "learning_rate": 1.8158425248197931e-06, "loss": 0.5893, "step": 900 }, { "epoch": 8.822521419828641, "grad_norm": 2.2628707885742188, "learning_rate": 1.8109066104575023e-06, "loss": 0.5725, "step": 901 }, { "epoch": 8.832313341493268, "grad_norm": 1.8494503498077393, "learning_rate": 1.8059736014208388e-06, "loss": 0.6098, "step": 902 }, { "epoch": 8.842105263157894, "grad_norm": 1.7301713228225708, "learning_rate": 1.8010435185081405e-06, "loss": 0.672, "step": 903 }, { "epoch": 8.851897184822521, "grad_norm": 2.034219980239868, "learning_rate": 1.7961163825054101e-06, "loss": 0.4025, "step": 904 }, { "epoch": 8.861689106487148, "grad_norm": 2.381079912185669, "learning_rate": 1.7911922141862232e-06, "loss": 0.6344, "step": 905 }, { "epoch": 8.871481028151775, "grad_norm": 1.8997502326965332, "learning_rate": 1.7862710343116451e-06, "loss": 0.6304, "step": 906 }, { "epoch": 8.8812729498164, "grad_norm": 1.959622859954834, "learning_rate": 1.7813528636301409e-06, "loss": 0.6578, "step": 907 }, { "epoch": 8.891064871481028, "grad_norm": 1.8533662557601929, "learning_rate": 1.7764377228774877e-06, "loss": 0.6258, "step": 908 }, { "epoch": 8.900856793145655, "grad_norm": 1.7809977531433105, "learning_rate": 1.7715256327766887e-06, "loss": 0.5881, "step": 909 }, { "epoch": 8.910648714810282, "grad_norm": 1.9531484842300415, "learning_rate": 1.7666166140378853e-06, "loss": 0.6586, "step": 910 }, { "epoch": 8.920440636474908, "grad_norm": 1.8694995641708374, "learning_rate": 1.7617106873582684e-06, "loss": 0.4534, "step": 911 }, { "epoch": 8.930232558139535, "grad_norm": 1.9488515853881836, "learning_rate": 1.7568078734219934e-06, "loss": 0.7551, "step": 912 }, { "epoch": 8.940024479804162, "grad_norm": 2.363982915878296, "learning_rate": 1.7519081929000925e-06, "loss": 0.6141, "step": 913 }, { "epoch": 8.949816401468787, "grad_norm": 2.1547443866729736, "learning_rate": 1.747011666450384e-06, "loss": 0.5707, "step": 914 }, { "epoch": 8.959608323133414, "grad_norm": 1.878010630607605, "learning_rate": 1.7421183147173915e-06, "loss": 0.4958, "step": 915 }, { "epoch": 8.969400244798042, "grad_norm": 2.0991098880767822, "learning_rate": 1.737228158332252e-06, "loss": 0.4644, "step": 916 }, { "epoch": 8.979192166462669, "grad_norm": 2.0460169315338135, "learning_rate": 1.73234121791263e-06, "loss": 0.6929, "step": 917 }, { "epoch": 8.988984088127294, "grad_norm": 1.9267401695251465, "learning_rate": 1.7274575140626318e-06, "loss": 0.5198, "step": 918 }, { "epoch": 8.998776009791921, "grad_norm": 2.072077512741089, "learning_rate": 1.7225770673727177e-06, "loss": 0.6584, "step": 919 }, { "epoch": 9.008567931456549, "grad_norm": 6.799975872039795, "learning_rate": 1.7176998984196148e-06, "loss": 1.0149, "step": 920 }, { "epoch": 9.018359853121176, "grad_norm": 1.866452932357788, "learning_rate": 1.7128260277662307e-06, "loss": 0.5354, "step": 921 }, { "epoch": 9.028151774785801, "grad_norm": 1.7898658514022827, "learning_rate": 1.7079554759615685e-06, "loss": 0.4961, "step": 922 }, { "epoch": 9.037943696450428, "grad_norm": 1.9190596342086792, "learning_rate": 1.7030882635406354e-06, "loss": 0.5135, "step": 923 }, { "epoch": 9.047735618115055, "grad_norm": 1.8479288816452026, "learning_rate": 1.6982244110243626e-06, "loss": 0.6033, "step": 924 }, { "epoch": 9.057527539779683, "grad_norm": 1.8580877780914307, "learning_rate": 1.6933639389195136e-06, "loss": 0.6775, "step": 925 }, { "epoch": 9.067319461444308, "grad_norm": 1.9282350540161133, "learning_rate": 1.6885068677185989e-06, "loss": 0.5372, "step": 926 }, { "epoch": 9.077111383108935, "grad_norm": 2.0182504653930664, "learning_rate": 1.6836532178997922e-06, "loss": 0.4403, "step": 927 }, { "epoch": 9.086903304773562, "grad_norm": 2.062225818634033, "learning_rate": 1.678803009926841e-06, "loss": 0.6456, "step": 928 }, { "epoch": 9.096695226438188, "grad_norm": 1.6223728656768799, "learning_rate": 1.673956264248981e-06, "loss": 0.5212, "step": 929 }, { "epoch": 9.106487148102815, "grad_norm": 2.0617520809173584, "learning_rate": 1.6691130013008514e-06, "loss": 0.5263, "step": 930 }, { "epoch": 9.116279069767442, "grad_norm": 2.2465288639068604, "learning_rate": 1.6642732415024071e-06, "loss": 0.4477, "step": 931 }, { "epoch": 9.12607099143207, "grad_norm": 2.156426191329956, "learning_rate": 1.6594370052588328e-06, "loss": 0.8745, "step": 932 }, { "epoch": 9.135862913096695, "grad_norm": 2.1958324909210205, "learning_rate": 1.6546043129604572e-06, "loss": 0.6182, "step": 933 }, { "epoch": 9.145654834761322, "grad_norm": 1.792380452156067, "learning_rate": 1.6497751849826692e-06, "loss": 0.5505, "step": 934 }, { "epoch": 9.155446756425949, "grad_norm": 1.9903302192687988, "learning_rate": 1.6449496416858285e-06, "loss": 0.7519, "step": 935 }, { "epoch": 9.165238678090576, "grad_norm": 1.8692671060562134, "learning_rate": 1.6401277034151798e-06, "loss": 0.5706, "step": 936 }, { "epoch": 9.175030599755202, "grad_norm": 2.029550790786743, "learning_rate": 1.6353093905007705e-06, "loss": 0.7219, "step": 937 }, { "epoch": 9.184822521419829, "grad_norm": 1.969567894935608, "learning_rate": 1.630494723257363e-06, "loss": 0.55, "step": 938 }, { "epoch": 9.194614443084456, "grad_norm": 1.8979378938674927, "learning_rate": 1.6256837219843472e-06, "loss": 0.5077, "step": 939 }, { "epoch": 9.204406364749081, "grad_norm": 1.7967567443847656, "learning_rate": 1.620876406965658e-06, "loss": 0.4541, "step": 940 }, { "epoch": 9.214198286413708, "grad_norm": 2.0161335468292236, "learning_rate": 1.6160727984696892e-06, "loss": 0.5423, "step": 941 }, { "epoch": 9.223990208078336, "grad_norm": 2.0905377864837646, "learning_rate": 1.611272916749205e-06, "loss": 0.6819, "step": 942 }, { "epoch": 9.233782129742963, "grad_norm": 1.900076150894165, "learning_rate": 1.6064767820412588e-06, "loss": 0.5631, "step": 943 }, { "epoch": 9.243574051407588, "grad_norm": 1.6789194345474243, "learning_rate": 1.6016844145671062e-06, "loss": 0.3642, "step": 944 }, { "epoch": 9.253365973072215, "grad_norm": 1.8091360330581665, "learning_rate": 1.5968958345321178e-06, "loss": 0.5297, "step": 945 }, { "epoch": 9.263157894736842, "grad_norm": 2.4467523097991943, "learning_rate": 1.5921110621256972e-06, "loss": 0.5278, "step": 946 }, { "epoch": 9.27294981640147, "grad_norm": 2.0257585048675537, "learning_rate": 1.5873301175211947e-06, "loss": 0.6498, "step": 947 }, { "epoch": 9.282741738066095, "grad_norm": 1.9079017639160156, "learning_rate": 1.58255302087582e-06, "loss": 0.562, "step": 948 }, { "epoch": 9.292533659730722, "grad_norm": 1.917777180671692, "learning_rate": 1.5777797923305615e-06, "loss": 0.3274, "step": 949 }, { "epoch": 9.30232558139535, "grad_norm": 2.0385780334472656, "learning_rate": 1.5730104520100984e-06, "loss": 0.5055, "step": 950 }, { "epoch": 9.312117503059975, "grad_norm": 1.996443748474121, "learning_rate": 1.568245020022715e-06, "loss": 0.6829, "step": 951 }, { "epoch": 9.321909424724602, "grad_norm": 1.9922800064086914, "learning_rate": 1.56348351646022e-06, "loss": 0.4672, "step": 952 }, { "epoch": 9.331701346389229, "grad_norm": 1.686010479927063, "learning_rate": 1.5587259613978578e-06, "loss": 0.4116, "step": 953 }, { "epoch": 9.341493268053856, "grad_norm": 1.9642030000686646, "learning_rate": 1.5539723748942246e-06, "loss": 0.5485, "step": 954 }, { "epoch": 9.351285189718482, "grad_norm": 2.0619044303894043, "learning_rate": 1.549222776991186e-06, "loss": 0.7304, "step": 955 }, { "epoch": 9.361077111383109, "grad_norm": 1.949583888053894, "learning_rate": 1.544477187713791e-06, "loss": 0.5475, "step": 956 }, { "epoch": 9.370869033047736, "grad_norm": 1.776412010192871, "learning_rate": 1.5397356270701858e-06, "loss": 0.4137, "step": 957 }, { "epoch": 9.380660954712361, "grad_norm": 1.9808849096298218, "learning_rate": 1.534998115051533e-06, "loss": 0.6534, "step": 958 }, { "epoch": 9.390452876376989, "grad_norm": 1.964613676071167, "learning_rate": 1.5302646716319258e-06, "loss": 0.4895, "step": 959 }, { "epoch": 9.400244798041616, "grad_norm": 1.955569863319397, "learning_rate": 1.5255353167683017e-06, "loss": 0.3451, "step": 960 }, { "epoch": 9.410036719706243, "grad_norm": 2.470039129257202, "learning_rate": 1.520810070400362e-06, "loss": 0.7049, "step": 961 }, { "epoch": 9.419828641370868, "grad_norm": 1.8120123147964478, "learning_rate": 1.5160889524504857e-06, "loss": 0.3711, "step": 962 }, { "epoch": 9.429620563035495, "grad_norm": 1.8435657024383545, "learning_rate": 1.5113719828236439e-06, "loss": 0.472, "step": 963 }, { "epoch": 9.439412484700123, "grad_norm": 2.0470046997070312, "learning_rate": 1.50665918140732e-06, "loss": 0.4892, "step": 964 }, { "epoch": 9.44920440636475, "grad_norm": 1.95719575881958, "learning_rate": 1.5019505680714233e-06, "loss": 0.5366, "step": 965 }, { "epoch": 9.458996328029375, "grad_norm": 2.057894706726074, "learning_rate": 1.4972461626682033e-06, "loss": 0.4905, "step": 966 }, { "epoch": 9.468788249694002, "grad_norm": 1.8695260286331177, "learning_rate": 1.4925459850321711e-06, "loss": 0.3814, "step": 967 }, { "epoch": 9.47858017135863, "grad_norm": 1.9245821237564087, "learning_rate": 1.4878500549800115e-06, "loss": 0.4923, "step": 968 }, { "epoch": 9.488372093023255, "grad_norm": 1.8848599195480347, "learning_rate": 1.4831583923105e-06, "loss": 0.4738, "step": 969 }, { "epoch": 9.498164014687882, "grad_norm": 2.0027384757995605, "learning_rate": 1.4784710168044215e-06, "loss": 0.6102, "step": 970 }, { "epoch": 9.50795593635251, "grad_norm": 1.8755180835723877, "learning_rate": 1.4737879482244854e-06, "loss": 0.5308, "step": 971 }, { "epoch": 9.517747858017136, "grad_norm": 1.9568339586257935, "learning_rate": 1.4691092063152417e-06, "loss": 0.6971, "step": 972 }, { "epoch": 9.527539779681762, "grad_norm": 1.8696670532226562, "learning_rate": 1.4644348108029989e-06, "loss": 0.4938, "step": 973 }, { "epoch": 9.537331701346389, "grad_norm": 1.9223078489303589, "learning_rate": 1.459764781395741e-06, "loss": 0.624, "step": 974 }, { "epoch": 9.547123623011016, "grad_norm": 2.089118480682373, "learning_rate": 1.4550991377830426e-06, "loss": 0.4148, "step": 975 }, { "epoch": 9.556915544675643, "grad_norm": 1.9852639436721802, "learning_rate": 1.4504378996359867e-06, "loss": 0.8297, "step": 976 }, { "epoch": 9.566707466340269, "grad_norm": 2.072979688644409, "learning_rate": 1.4457810866070854e-06, "loss": 0.7347, "step": 977 }, { "epoch": 9.576499388004896, "grad_norm": 1.8752530813217163, "learning_rate": 1.4411287183301902e-06, "loss": 0.7494, "step": 978 }, { "epoch": 9.586291309669523, "grad_norm": 1.8769545555114746, "learning_rate": 1.436480814420414e-06, "loss": 0.6057, "step": 979 }, { "epoch": 9.596083231334148, "grad_norm": 2.123978614807129, "learning_rate": 1.4318373944740485e-06, "loss": 0.6354, "step": 980 }, { "epoch": 9.605875152998776, "grad_norm": 2.0652873516082764, "learning_rate": 1.4271984780684778e-06, "loss": 0.5237, "step": 981 }, { "epoch": 9.615667074663403, "grad_norm": 1.8793216943740845, "learning_rate": 1.4225640847621006e-06, "loss": 0.4843, "step": 982 }, { "epoch": 9.62545899632803, "grad_norm": 1.909644603729248, "learning_rate": 1.4179342340942459e-06, "loss": 0.467, "step": 983 }, { "epoch": 9.635250917992655, "grad_norm": 1.7923080921173096, "learning_rate": 1.4133089455850878e-06, "loss": 0.5029, "step": 984 }, { "epoch": 9.645042839657282, "grad_norm": 1.8393352031707764, "learning_rate": 1.408688238735566e-06, "loss": 0.6465, "step": 985 }, { "epoch": 9.65483476132191, "grad_norm": 1.8239715099334717, "learning_rate": 1.4040721330273063e-06, "loss": 0.4163, "step": 986 }, { "epoch": 9.664626682986537, "grad_norm": 1.833296537399292, "learning_rate": 1.399460647922532e-06, "loss": 0.4749, "step": 987 }, { "epoch": 9.674418604651162, "grad_norm": 2.0324273109436035, "learning_rate": 1.3948538028639851e-06, "loss": 0.625, "step": 988 }, { "epoch": 9.68421052631579, "grad_norm": 1.8538012504577637, "learning_rate": 1.3902516172748478e-06, "loss": 0.4561, "step": 989 }, { "epoch": 9.694002447980417, "grad_norm": 1.909273386001587, "learning_rate": 1.3856541105586545e-06, "loss": 0.5704, "step": 990 }, { "epoch": 9.703794369645042, "grad_norm": 1.9033586978912354, "learning_rate": 1.381061302099212e-06, "loss": 0.6575, "step": 991 }, { "epoch": 9.713586291309669, "grad_norm": 1.591498613357544, "learning_rate": 1.3764732112605223e-06, "loss": 0.3622, "step": 992 }, { "epoch": 9.723378212974296, "grad_norm": 1.9659687280654907, "learning_rate": 1.371889857386693e-06, "loss": 0.5569, "step": 993 }, { "epoch": 9.733170134638923, "grad_norm": 1.9424748420715332, "learning_rate": 1.367311259801863e-06, "loss": 0.5434, "step": 994 }, { "epoch": 9.742962056303549, "grad_norm": 2.0628254413604736, "learning_rate": 1.362737437810114e-06, "loss": 0.5471, "step": 995 }, { "epoch": 9.752753977968176, "grad_norm": 2.2389705181121826, "learning_rate": 1.3581684106953987e-06, "loss": 0.6072, "step": 996 }, { "epoch": 9.762545899632803, "grad_norm": 1.970963478088379, "learning_rate": 1.3536041977214499e-06, "loss": 0.5926, "step": 997 }, { "epoch": 9.77233782129743, "grad_norm": 2.271240234375, "learning_rate": 1.3490448181317025e-06, "loss": 0.8875, "step": 998 }, { "epoch": 9.782129742962056, "grad_norm": 2.1531565189361572, "learning_rate": 1.3444902911492174e-06, "loss": 0.5067, "step": 999 }, { "epoch": 9.791921664626683, "grad_norm": 1.9256696701049805, "learning_rate": 1.3399406359765921e-06, "loss": 0.4627, "step": 1000 }, { "epoch": 9.80171358629131, "grad_norm": 2.0190601348876953, "learning_rate": 1.3353958717958843e-06, "loss": 0.5713, "step": 1001 }, { "epoch": 9.811505507955935, "grad_norm": 2.2044289112091064, "learning_rate": 1.3308560177685334e-06, "loss": 0.4462, "step": 1002 }, { "epoch": 9.821297429620563, "grad_norm": 1.7944589853286743, "learning_rate": 1.3263210930352737e-06, "loss": 0.4628, "step": 1003 }, { "epoch": 9.83108935128519, "grad_norm": 2.0486531257629395, "learning_rate": 1.3217911167160575e-06, "loss": 0.7955, "step": 1004 }, { "epoch": 9.840881272949817, "grad_norm": 1.992489218711853, "learning_rate": 1.3172661079099752e-06, "loss": 0.6992, "step": 1005 }, { "epoch": 9.850673194614442, "grad_norm": 2.057983160018921, "learning_rate": 1.3127460856951724e-06, "loss": 0.5205, "step": 1006 }, { "epoch": 9.86046511627907, "grad_norm": 1.7972743511199951, "learning_rate": 1.308231069128769e-06, "loss": 0.6072, "step": 1007 }, { "epoch": 9.870257037943697, "grad_norm": 1.8492416143417358, "learning_rate": 1.303721077246784e-06, "loss": 0.4504, "step": 1008 }, { "epoch": 9.880048959608324, "grad_norm": 1.8589346408843994, "learning_rate": 1.2992161290640483e-06, "loss": 0.5712, "step": 1009 }, { "epoch": 9.88984088127295, "grad_norm": 1.9458105564117432, "learning_rate": 1.2947162435741278e-06, "loss": 0.5041, "step": 1010 }, { "epoch": 9.899632802937576, "grad_norm": 1.954971432685852, "learning_rate": 1.2902214397492463e-06, "loss": 0.6039, "step": 1011 }, { "epoch": 9.909424724602204, "grad_norm": 1.843713641166687, "learning_rate": 1.2857317365401997e-06, "loss": 0.4874, "step": 1012 }, { "epoch": 9.919216646266829, "grad_norm": 2.0728683471679688, "learning_rate": 1.2812471528762785e-06, "loss": 0.606, "step": 1013 }, { "epoch": 9.929008567931456, "grad_norm": 1.9876255989074707, "learning_rate": 1.2767677076651913e-06, "loss": 0.4764, "step": 1014 }, { "epoch": 9.938800489596083, "grad_norm": 1.9659452438354492, "learning_rate": 1.2722934197929804e-06, "loss": 0.659, "step": 1015 }, { "epoch": 9.94859241126071, "grad_norm": 1.8326551914215088, "learning_rate": 1.2678243081239421e-06, "loss": 0.6383, "step": 1016 }, { "epoch": 9.958384332925336, "grad_norm": 2.4267449378967285, "learning_rate": 1.2633603915005535e-06, "loss": 0.5836, "step": 1017 }, { "epoch": 9.968176254589963, "grad_norm": 1.933519721031189, "learning_rate": 1.2589016887433846e-06, "loss": 0.4109, "step": 1018 }, { "epoch": 9.97796817625459, "grad_norm": 2.025700330734253, "learning_rate": 1.2544482186510242e-06, "loss": 0.4638, "step": 1019 }, { "epoch": 9.987760097919217, "grad_norm": 1.805585265159607, "learning_rate": 1.2500000000000007e-06, "loss": 0.73, "step": 1020 }, { "epoch": 9.997552019583843, "grad_norm": 2.159691333770752, "learning_rate": 1.2455570515446997e-06, "loss": 0.5863, "step": 1021 }, { "epoch": 10.00734394124847, "grad_norm": 7.259884834289551, "learning_rate": 1.2411193920172866e-06, "loss": 0.7361, "step": 1022 }, { "epoch": 10.017135862913097, "grad_norm": 1.8734283447265625, "learning_rate": 1.2366870401276304e-06, "loss": 0.5293, "step": 1023 }, { "epoch": 10.026927784577722, "grad_norm": 2.1296513080596924, "learning_rate": 1.2322600145632204e-06, "loss": 0.8705, "step": 1024 }, { "epoch": 10.03671970624235, "grad_norm": 1.6430468559265137, "learning_rate": 1.2278383339890882e-06, "loss": 0.4106, "step": 1025 }, { "epoch": 10.046511627906977, "grad_norm": 2.0129048824310303, "learning_rate": 1.2234220170477332e-06, "loss": 0.3838, "step": 1026 }, { "epoch": 10.056303549571604, "grad_norm": 1.7515485286712646, "learning_rate": 1.2190110823590385e-06, "loss": 0.5751, "step": 1027 }, { "epoch": 10.06609547123623, "grad_norm": 2.072009801864624, "learning_rate": 1.2146055485201943e-06, "loss": 0.6412, "step": 1028 }, { "epoch": 10.075887392900857, "grad_norm": 2.0166397094726562, "learning_rate": 1.2102054341056221e-06, "loss": 0.507, "step": 1029 }, { "epoch": 10.085679314565484, "grad_norm": 2.1298539638519287, "learning_rate": 1.205810757666894e-06, "loss": 0.7368, "step": 1030 }, { "epoch": 10.095471236230111, "grad_norm": 2.041167974472046, "learning_rate": 1.2014215377326531e-06, "loss": 0.4772, "step": 1031 }, { "epoch": 10.105263157894736, "grad_norm": 2.2893929481506348, "learning_rate": 1.1970377928085372e-06, "loss": 0.6376, "step": 1032 }, { "epoch": 10.115055079559363, "grad_norm": 1.902682900428772, "learning_rate": 1.1926595413771028e-06, "loss": 0.4395, "step": 1033 }, { "epoch": 10.12484700122399, "grad_norm": 1.9623550176620483, "learning_rate": 1.188286801897743e-06, "loss": 0.4276, "step": 1034 }, { "epoch": 10.134638922888616, "grad_norm": 2.0566391944885254, "learning_rate": 1.1839195928066101e-06, "loss": 0.3541, "step": 1035 }, { "epoch": 10.144430844553243, "grad_norm": 1.9980727434158325, "learning_rate": 1.1795579325165448e-06, "loss": 0.471, "step": 1036 }, { "epoch": 10.15422276621787, "grad_norm": 2.398865222930908, "learning_rate": 1.1752018394169882e-06, "loss": 0.5276, "step": 1037 }, { "epoch": 10.164014687882498, "grad_norm": 1.887375831604004, "learning_rate": 1.1708513318739096e-06, "loss": 0.4296, "step": 1038 }, { "epoch": 10.173806609547123, "grad_norm": 2.0189316272735596, "learning_rate": 1.1665064282297323e-06, "loss": 0.4812, "step": 1039 }, { "epoch": 10.18359853121175, "grad_norm": 2.2055790424346924, "learning_rate": 1.1621671468032495e-06, "loss": 0.6845, "step": 1040 }, { "epoch": 10.193390452876377, "grad_norm": 1.6477124691009521, "learning_rate": 1.1578335058895498e-06, "loss": 0.4933, "step": 1041 }, { "epoch": 10.203182374541004, "grad_norm": 1.9674268960952759, "learning_rate": 1.153505523759944e-06, "loss": 0.737, "step": 1042 }, { "epoch": 10.21297429620563, "grad_norm": 1.8057160377502441, "learning_rate": 1.1491832186618818e-06, "loss": 0.6951, "step": 1043 }, { "epoch": 10.222766217870257, "grad_norm": 1.8128598928451538, "learning_rate": 1.1448666088188766e-06, "loss": 0.5325, "step": 1044 }, { "epoch": 10.232558139534884, "grad_norm": 1.8670271635055542, "learning_rate": 1.1405557124304338e-06, "loss": 0.4504, "step": 1045 }, { "epoch": 10.24235006119951, "grad_norm": 1.8077363967895508, "learning_rate": 1.1362505476719662e-06, "loss": 0.6132, "step": 1046 }, { "epoch": 10.252141982864137, "grad_norm": 1.8012129068374634, "learning_rate": 1.1319511326947221e-06, "loss": 0.369, "step": 1047 }, { "epoch": 10.261933904528764, "grad_norm": 2.0068607330322266, "learning_rate": 1.1276574856257097e-06, "loss": 0.6003, "step": 1048 }, { "epoch": 10.271725826193391, "grad_norm": 2.00742769241333, "learning_rate": 1.1233696245676167e-06, "loss": 0.5576, "step": 1049 }, { "epoch": 10.281517747858016, "grad_norm": 2.014188528060913, "learning_rate": 1.1190875675987355e-06, "loss": 0.3914, "step": 1050 }, { "epoch": 10.291309669522644, "grad_norm": 2.058257579803467, "learning_rate": 1.1148113327728908e-06, "loss": 0.6235, "step": 1051 }, { "epoch": 10.30110159118727, "grad_norm": 1.9059385061264038, "learning_rate": 1.1105409381193572e-06, "loss": 0.4915, "step": 1052 }, { "epoch": 10.310893512851898, "grad_norm": 1.8118654489517212, "learning_rate": 1.1062764016427864e-06, "loss": 0.5164, "step": 1053 }, { "epoch": 10.320685434516523, "grad_norm": 1.8032629489898682, "learning_rate": 1.1020177413231334e-06, "loss": 0.4088, "step": 1054 }, { "epoch": 10.33047735618115, "grad_norm": 1.9700795412063599, "learning_rate": 1.0977649751155762e-06, "loss": 0.379, "step": 1055 }, { "epoch": 10.340269277845778, "grad_norm": 2.4769201278686523, "learning_rate": 1.0935181209504422e-06, "loss": 0.4937, "step": 1056 }, { "epoch": 10.350061199510403, "grad_norm": 2.1627488136291504, "learning_rate": 1.0892771967331356e-06, "loss": 0.7721, "step": 1057 }, { "epoch": 10.35985312117503, "grad_norm": 1.9900619983673096, "learning_rate": 1.0850422203440555e-06, "loss": 0.5129, "step": 1058 }, { "epoch": 10.369645042839657, "grad_norm": 1.6130856275558472, "learning_rate": 1.0808132096385248e-06, "loss": 0.4525, "step": 1059 }, { "epoch": 10.379436964504285, "grad_norm": 3.118593692779541, "learning_rate": 1.0765901824467167e-06, "loss": 0.5611, "step": 1060 }, { "epoch": 10.38922888616891, "grad_norm": 2.377380132675171, "learning_rate": 1.072373156573574e-06, "loss": 0.4784, "step": 1061 }, { "epoch": 10.399020807833537, "grad_norm": 2.0620832443237305, "learning_rate": 1.068162149798737e-06, "loss": 0.5762, "step": 1062 }, { "epoch": 10.408812729498164, "grad_norm": 1.8769875764846802, "learning_rate": 1.0639571798764719e-06, "loss": 0.3553, "step": 1063 }, { "epoch": 10.418604651162791, "grad_norm": 1.797479271888733, "learning_rate": 1.0597582645355891e-06, "loss": 0.3987, "step": 1064 }, { "epoch": 10.428396572827417, "grad_norm": 2.1158134937286377, "learning_rate": 1.0555654214793723e-06, "loss": 0.501, "step": 1065 }, { "epoch": 10.438188494492044, "grad_norm": 1.9618295431137085, "learning_rate": 1.0513786683855062e-06, "loss": 0.6083, "step": 1066 }, { "epoch": 10.447980416156671, "grad_norm": 1.8006033897399902, "learning_rate": 1.0471980229059963e-06, "loss": 0.513, "step": 1067 }, { "epoch": 10.457772337821297, "grad_norm": 1.8421907424926758, "learning_rate": 1.0430235026670979e-06, "loss": 0.5236, "step": 1068 }, { "epoch": 10.467564259485924, "grad_norm": 1.9178632497787476, "learning_rate": 1.0388551252692437e-06, "loss": 0.5196, "step": 1069 }, { "epoch": 10.477356181150551, "grad_norm": 2.248973846435547, "learning_rate": 1.034692908286964e-06, "loss": 0.5458, "step": 1070 }, { "epoch": 10.487148102815178, "grad_norm": 1.8714607954025269, "learning_rate": 1.0305368692688175e-06, "loss": 0.5154, "step": 1071 }, { "epoch": 10.496940024479803, "grad_norm": 2.2324085235595703, "learning_rate": 1.0263870257373162e-06, "loss": 0.8552, "step": 1072 }, { "epoch": 10.50673194614443, "grad_norm": 1.8638917207717896, "learning_rate": 1.0222433951888503e-06, "loss": 0.4067, "step": 1073 }, { "epoch": 10.516523867809058, "grad_norm": 1.9835755825042725, "learning_rate": 1.0181059950936131e-06, "loss": 0.4522, "step": 1074 }, { "epoch": 10.526315789473685, "grad_norm": 1.9755423069000244, "learning_rate": 1.0139748428955334e-06, "loss": 0.6428, "step": 1075 }, { "epoch": 10.53610771113831, "grad_norm": 1.9253593683242798, "learning_rate": 1.0098499560121943e-06, "loss": 0.5563, "step": 1076 }, { "epoch": 10.545899632802938, "grad_norm": 1.7793023586273193, "learning_rate": 1.005731351834766e-06, "loss": 0.4943, "step": 1077 }, { "epoch": 10.555691554467565, "grad_norm": 1.8162922859191895, "learning_rate": 1.0016190477279274e-06, "loss": 0.3662, "step": 1078 }, { "epoch": 10.56548347613219, "grad_norm": 2.344635486602783, "learning_rate": 9.975130610297978e-07, "loss": 1.0261, "step": 1079 }, { "epoch": 10.575275397796817, "grad_norm": 1.6713858842849731, "learning_rate": 9.934134090518593e-07, "loss": 0.4447, "step": 1080 }, { "epoch": 10.585067319461444, "grad_norm": 1.7141319513320923, "learning_rate": 9.893201090788857e-07, "loss": 0.5825, "step": 1081 }, { "epoch": 10.594859241126072, "grad_norm": 1.8361754417419434, "learning_rate": 9.852331783688722e-07, "loss": 0.4302, "step": 1082 }, { "epoch": 10.604651162790697, "grad_norm": 2.0883617401123047, "learning_rate": 9.811526341529582e-07, "loss": 0.6973, "step": 1083 }, { "epoch": 10.614443084455324, "grad_norm": 1.849515438079834, "learning_rate": 9.770784936353555e-07, "loss": 0.5281, "step": 1084 }, { "epoch": 10.624235006119951, "grad_norm": 1.8748483657836914, "learning_rate": 9.730107739932807e-07, "loss": 0.4827, "step": 1085 }, { "epoch": 10.634026927784578, "grad_norm": 1.967962384223938, "learning_rate": 9.689494923768756e-07, "loss": 0.4855, "step": 1086 }, { "epoch": 10.643818849449204, "grad_norm": 1.8471128940582275, "learning_rate": 9.648946659091386e-07, "loss": 0.3885, "step": 1087 }, { "epoch": 10.653610771113831, "grad_norm": 1.9718245267868042, "learning_rate": 9.608463116858544e-07, "loss": 0.6159, "step": 1088 }, { "epoch": 10.663402692778458, "grad_norm": 1.8125596046447754, "learning_rate": 9.56804446775518e-07, "loss": 0.6373, "step": 1089 }, { "epoch": 10.673194614443084, "grad_norm": 2.0702903270721436, "learning_rate": 9.527690882192636e-07, "loss": 0.6322, "step": 1090 }, { "epoch": 10.68298653610771, "grad_norm": 1.7800555229187012, "learning_rate": 9.487402530307966e-07, "loss": 0.398, "step": 1091 }, { "epoch": 10.692778457772338, "grad_norm": 2.2512807846069336, "learning_rate": 9.447179581963156e-07, "loss": 0.8287, "step": 1092 }, { "epoch": 10.702570379436965, "grad_norm": 1.6735938787460327, "learning_rate": 9.407022206744454e-07, "loss": 0.3781, "step": 1093 }, { "epoch": 10.71236230110159, "grad_norm": 2.040163516998291, "learning_rate": 9.366930573961649e-07, "loss": 0.4053, "step": 1094 }, { "epoch": 10.722154222766218, "grad_norm": 1.8567315340042114, "learning_rate": 9.326904852647345e-07, "loss": 0.3717, "step": 1095 }, { "epoch": 10.731946144430845, "grad_norm": 2.1457667350769043, "learning_rate": 9.286945211556231e-07, "loss": 0.6674, "step": 1096 }, { "epoch": 10.741738066095472, "grad_norm": 2.147291660308838, "learning_rate": 9.247051819164432e-07, "loss": 0.6432, "step": 1097 }, { "epoch": 10.751529987760097, "grad_norm": 2.0626306533813477, "learning_rate": 9.207224843668733e-07, "loss": 0.5218, "step": 1098 }, { "epoch": 10.761321909424725, "grad_norm": 2.007749319076538, "learning_rate": 9.16746445298588e-07, "loss": 0.388, "step": 1099 }, { "epoch": 10.771113831089352, "grad_norm": 1.953169584274292, "learning_rate": 9.127770814751933e-07, "loss": 0.495, "step": 1100 }, { "epoch": 10.780905752753977, "grad_norm": 1.9562841653823853, "learning_rate": 9.088144096321472e-07, "loss": 0.5594, "step": 1101 }, { "epoch": 10.790697674418604, "grad_norm": 2.199822425842285, "learning_rate": 9.048584464766938e-07, "loss": 0.6411, "step": 1102 }, { "epoch": 10.800489596083231, "grad_norm": 2.076439619064331, "learning_rate": 9.00909208687795e-07, "loss": 0.4917, "step": 1103 }, { "epoch": 10.810281517747859, "grad_norm": 1.7562271356582642, "learning_rate": 8.969667129160547e-07, "loss": 0.4123, "step": 1104 }, { "epoch": 10.820073439412484, "grad_norm": 1.9027916193008423, "learning_rate": 8.930309757836517e-07, "loss": 0.6119, "step": 1105 }, { "epoch": 10.829865361077111, "grad_norm": 1.779505729675293, "learning_rate": 8.891020138842718e-07, "loss": 0.3611, "step": 1106 }, { "epoch": 10.839657282741738, "grad_norm": 2.037295341491699, "learning_rate": 8.851798437830323e-07, "loss": 0.7884, "step": 1107 }, { "epoch": 10.849449204406366, "grad_norm": 1.8404275178909302, "learning_rate": 8.81264482016416e-07, "loss": 0.4405, "step": 1108 }, { "epoch": 10.859241126070991, "grad_norm": 1.978280782699585, "learning_rate": 8.773559450922028e-07, "loss": 0.4878, "step": 1109 }, { "epoch": 10.869033047735618, "grad_norm": 2.1189818382263184, "learning_rate": 8.734542494893955e-07, "loss": 0.4429, "step": 1110 }, { "epoch": 10.878824969400245, "grad_norm": 2.166550397872925, "learning_rate": 8.695594116581527e-07, "loss": 0.742, "step": 1111 }, { "epoch": 10.88861689106487, "grad_norm": 1.7087887525558472, "learning_rate": 8.65671448019722e-07, "loss": 0.5863, "step": 1112 }, { "epoch": 10.898408812729498, "grad_norm": 1.867903709411621, "learning_rate": 8.617903749663667e-07, "loss": 0.519, "step": 1113 }, { "epoch": 10.908200734394125, "grad_norm": 1.9154669046401978, "learning_rate": 8.579162088612974e-07, "loss": 0.6231, "step": 1114 }, { "epoch": 10.917992656058752, "grad_norm": 1.6995677947998047, "learning_rate": 8.540489660386064e-07, "loss": 0.4943, "step": 1115 }, { "epoch": 10.927784577723378, "grad_norm": 1.991498351097107, "learning_rate": 8.501886628031941e-07, "loss": 0.614, "step": 1116 }, { "epoch": 10.937576499388005, "grad_norm": 1.5336772203445435, "learning_rate": 8.463353154307031e-07, "loss": 0.2928, "step": 1117 }, { "epoch": 10.947368421052632, "grad_norm": 1.9852231740951538, "learning_rate": 8.424889401674505e-07, "loss": 0.554, "step": 1118 }, { "epoch": 10.957160342717259, "grad_norm": 2.0256094932556152, "learning_rate": 8.386495532303557e-07, "loss": 0.5243, "step": 1119 }, { "epoch": 10.966952264381884, "grad_norm": 2.042241334915161, "learning_rate": 8.348171708068748e-07, "loss": 0.7404, "step": 1120 }, { "epoch": 10.976744186046512, "grad_norm": 1.9857434034347534, "learning_rate": 8.309918090549335e-07, "loss": 0.5904, "step": 1121 }, { "epoch": 10.986536107711139, "grad_norm": 1.8304249048233032, "learning_rate": 8.271734841028553e-07, "loss": 0.4639, "step": 1122 }, { "epoch": 10.996328029375764, "grad_norm": 2.2533512115478516, "learning_rate": 8.233622120492952e-07, "loss": 0.7298, "step": 1123 }, { "epoch": 11.006119951040391, "grad_norm": 4.955414295196533, "learning_rate": 8.195580089631733e-07, "loss": 1.0181, "step": 1124 }, { "epoch": 11.015911872705018, "grad_norm": 1.7299411296844482, "learning_rate": 8.157608908836071e-07, "loss": 0.376, "step": 1125 }, { "epoch": 11.025703794369646, "grad_norm": 1.9476960897445679, "learning_rate": 8.119708738198395e-07, "loss": 0.5202, "step": 1126 }, { "epoch": 11.035495716034271, "grad_norm": 1.8122228384017944, "learning_rate": 8.081879737511752e-07, "loss": 0.4078, "step": 1127 }, { "epoch": 11.045287637698898, "grad_norm": 2.2311043739318848, "learning_rate": 8.04412206626915e-07, "loss": 0.5647, "step": 1128 }, { "epoch": 11.055079559363525, "grad_norm": 1.8042285442352295, "learning_rate": 8.006435883662836e-07, "loss": 0.4296, "step": 1129 }, { "epoch": 11.064871481028153, "grad_norm": 1.8421496152877808, "learning_rate": 7.968821348583644e-07, "loss": 0.6665, "step": 1130 }, { "epoch": 11.074663402692778, "grad_norm": 1.885489821434021, "learning_rate": 7.931278619620364e-07, "loss": 0.4247, "step": 1131 }, { "epoch": 11.084455324357405, "grad_norm": 1.6915487051010132, "learning_rate": 7.89380785505901e-07, "loss": 0.2823, "step": 1132 }, { "epoch": 11.094247246022032, "grad_norm": 2.111504554748535, "learning_rate": 7.856409212882183e-07, "loss": 0.4172, "step": 1133 }, { "epoch": 11.104039167686658, "grad_norm": 1.7155784368515015, "learning_rate": 7.819082850768433e-07, "loss": 0.3659, "step": 1134 }, { "epoch": 11.113831089351285, "grad_norm": 1.890836477279663, "learning_rate": 7.781828926091536e-07, "loss": 0.5909, "step": 1135 }, { "epoch": 11.123623011015912, "grad_norm": 1.7993725538253784, "learning_rate": 7.744647595919869e-07, "loss": 0.5241, "step": 1136 }, { "epoch": 11.13341493268054, "grad_norm": 2.067066192626953, "learning_rate": 7.70753901701575e-07, "loss": 0.5252, "step": 1137 }, { "epoch": 11.143206854345165, "grad_norm": 2.010432481765747, "learning_rate": 7.670503345834757e-07, "loss": 0.5711, "step": 1138 }, { "epoch": 11.152998776009792, "grad_norm": 1.7126110792160034, "learning_rate": 7.633540738525066e-07, "loss": 0.3409, "step": 1139 }, { "epoch": 11.162790697674419, "grad_norm": 1.8957414627075195, "learning_rate": 7.596651350926837e-07, "loss": 0.5221, "step": 1140 }, { "epoch": 11.172582619339046, "grad_norm": 1.8879555463790894, "learning_rate": 7.559835338571492e-07, "loss": 0.5823, "step": 1141 }, { "epoch": 11.182374541003671, "grad_norm": 2.180203437805176, "learning_rate": 7.523092856681099e-07, "loss": 0.5022, "step": 1142 }, { "epoch": 11.192166462668299, "grad_norm": 1.959574580192566, "learning_rate": 7.486424060167726e-07, "loss": 0.3336, "step": 1143 }, { "epoch": 11.201958384332926, "grad_norm": 1.8028064966201782, "learning_rate": 7.44982910363276e-07, "loss": 0.3675, "step": 1144 }, { "epoch": 11.211750305997551, "grad_norm": 1.8317159414291382, "learning_rate": 7.413308141366254e-07, "loss": 0.4083, "step": 1145 }, { "epoch": 11.221542227662178, "grad_norm": 2.40229868888855, "learning_rate": 7.376861327346325e-07, "loss": 0.6124, "step": 1146 }, { "epoch": 11.231334149326806, "grad_norm": 2.018671989440918, "learning_rate": 7.34048881523843e-07, "loss": 0.4744, "step": 1147 }, { "epoch": 11.241126070991433, "grad_norm": 1.9953429698944092, "learning_rate": 7.304190758394775e-07, "loss": 0.7773, "step": 1148 }, { "epoch": 11.250917992656058, "grad_norm": 1.8805975914001465, "learning_rate": 7.267967309853665e-07, "loss": 0.5842, "step": 1149 }, { "epoch": 11.260709914320685, "grad_norm": 2.0895066261291504, "learning_rate": 7.231818622338824e-07, "loss": 0.7958, "step": 1150 }, { "epoch": 11.270501835985312, "grad_norm": 1.9781478643417358, "learning_rate": 7.195744848258768e-07, "loss": 0.4084, "step": 1151 }, { "epoch": 11.28029375764994, "grad_norm": 2.188415765762329, "learning_rate": 7.159746139706194e-07, "loss": 0.4485, "step": 1152 }, { "epoch": 11.290085679314565, "grad_norm": 1.835811734199524, "learning_rate": 7.123822648457287e-07, "loss": 0.6944, "step": 1153 }, { "epoch": 11.299877600979192, "grad_norm": 1.8257427215576172, "learning_rate": 7.087974525971103e-07, "loss": 0.5128, "step": 1154 }, { "epoch": 11.30966952264382, "grad_norm": 1.9624758958816528, "learning_rate": 7.052201923388955e-07, "loss": 0.3883, "step": 1155 }, { "epoch": 11.319461444308445, "grad_norm": 1.7039844989776611, "learning_rate": 7.016504991533727e-07, "loss": 0.4141, "step": 1156 }, { "epoch": 11.329253365973072, "grad_norm": 1.8658639192581177, "learning_rate": 6.980883880909267e-07, "loss": 0.5017, "step": 1157 }, { "epoch": 11.339045287637699, "grad_norm": 2.162304401397705, "learning_rate": 6.94533874169977e-07, "loss": 0.5116, "step": 1158 }, { "epoch": 11.348837209302326, "grad_norm": 1.9891886711120605, "learning_rate": 6.9098697237691e-07, "loss": 0.6457, "step": 1159 }, { "epoch": 11.358629130966952, "grad_norm": 1.9596093893051147, "learning_rate": 6.874476976660185e-07, "loss": 0.5894, "step": 1160 }, { "epoch": 11.368421052631579, "grad_norm": 1.9216418266296387, "learning_rate": 6.839160649594401e-07, "loss": 0.5233, "step": 1161 }, { "epoch": 11.378212974296206, "grad_norm": 1.8161520957946777, "learning_rate": 6.803920891470905e-07, "loss": 0.3194, "step": 1162 }, { "epoch": 11.388004895960833, "grad_norm": 1.837099313735962, "learning_rate": 6.768757850866032e-07, "loss": 0.2996, "step": 1163 }, { "epoch": 11.397796817625458, "grad_norm": 2.0498571395874023, "learning_rate": 6.733671676032674e-07, "loss": 0.6927, "step": 1164 }, { "epoch": 11.407588739290086, "grad_norm": 2.131337881088257, "learning_rate": 6.69866251489964e-07, "loss": 0.52, "step": 1165 }, { "epoch": 11.417380660954713, "grad_norm": 2.134185314178467, "learning_rate": 6.663730515071019e-07, "loss": 0.6614, "step": 1166 }, { "epoch": 11.427172582619338, "grad_norm": 1.7154711484909058, "learning_rate": 6.628875823825612e-07, "loss": 0.4149, "step": 1167 }, { "epoch": 11.436964504283965, "grad_norm": 2.099763870239258, "learning_rate": 6.594098588116243e-07, "loss": 0.6353, "step": 1168 }, { "epoch": 11.446756425948593, "grad_norm": 1.9347047805786133, "learning_rate": 6.559398954569182e-07, "loss": 0.3909, "step": 1169 }, { "epoch": 11.45654834761322, "grad_norm": 1.78669273853302, "learning_rate": 6.524777069483526e-07, "loss": 0.3754, "step": 1170 }, { "epoch": 11.466340269277845, "grad_norm": 2.071563243865967, "learning_rate": 6.49023307883056e-07, "loss": 0.4965, "step": 1171 }, { "epoch": 11.476132190942472, "grad_norm": 2.076728105545044, "learning_rate": 6.455767128253148e-07, "loss": 0.469, "step": 1172 }, { "epoch": 11.4859241126071, "grad_norm": 2.0857369899749756, "learning_rate": 6.421379363065142e-07, "loss": 0.6349, "step": 1173 }, { "epoch": 11.495716034271727, "grad_norm": 1.644263505935669, "learning_rate": 6.38706992825075e-07, "loss": 0.2912, "step": 1174 }, { "epoch": 11.505507955936352, "grad_norm": 2.0650486946105957, "learning_rate": 6.352838968463921e-07, "loss": 0.6588, "step": 1175 }, { "epoch": 11.51529987760098, "grad_norm": 1.6472177505493164, "learning_rate": 6.318686628027723e-07, "loss": 0.487, "step": 1176 }, { "epoch": 11.525091799265606, "grad_norm": 2.0165016651153564, "learning_rate": 6.284613050933794e-07, "loss": 0.6354, "step": 1177 }, { "epoch": 11.534883720930232, "grad_norm": 1.9155522584915161, "learning_rate": 6.250618380841661e-07, "loss": 0.4094, "step": 1178 }, { "epoch": 11.544675642594859, "grad_norm": 2.129122257232666, "learning_rate": 6.216702761078167e-07, "loss": 0.7578, "step": 1179 }, { "epoch": 11.554467564259486, "grad_norm": 1.8486154079437256, "learning_rate": 6.182866334636889e-07, "loss": 0.6053, "step": 1180 }, { "epoch": 11.564259485924113, "grad_norm": 1.856162667274475, "learning_rate": 6.149109244177495e-07, "loss": 0.525, "step": 1181 }, { "epoch": 11.574051407588739, "grad_norm": 2.0944762229919434, "learning_rate": 6.115431632025154e-07, "loss": 0.6114, "step": 1182 }, { "epoch": 11.583843329253366, "grad_norm": 1.8472967147827148, "learning_rate": 6.081833640169968e-07, "loss": 0.5054, "step": 1183 }, { "epoch": 11.593635250917993, "grad_norm": 2.068192958831787, "learning_rate": 6.048315410266326e-07, "loss": 0.6684, "step": 1184 }, { "epoch": 11.60342717258262, "grad_norm": 2.0607235431671143, "learning_rate": 6.01487708363232e-07, "loss": 0.5961, "step": 1185 }, { "epoch": 11.613219094247246, "grad_norm": 1.8114880323410034, "learning_rate": 5.981518801249192e-07, "loss": 0.3904, "step": 1186 }, { "epoch": 11.623011015911873, "grad_norm": 1.9627492427825928, "learning_rate": 5.948240703760671e-07, "loss": 0.4132, "step": 1187 }, { "epoch": 11.6328029375765, "grad_norm": 2.1180152893066406, "learning_rate": 5.915042931472426e-07, "loss": 0.3478, "step": 1188 }, { "epoch": 11.642594859241125, "grad_norm": 2.4858107566833496, "learning_rate": 5.88192562435147e-07, "loss": 0.6308, "step": 1189 }, { "epoch": 11.652386780905752, "grad_norm": 2.283162832260132, "learning_rate": 5.848888922025553e-07, "loss": 0.5324, "step": 1190 }, { "epoch": 11.66217870257038, "grad_norm": 1.9822237491607666, "learning_rate": 5.815932963782575e-07, "loss": 0.6608, "step": 1191 }, { "epoch": 11.671970624235007, "grad_norm": 2.039518356323242, "learning_rate": 5.783057888570034e-07, "loss": 0.5624, "step": 1192 }, { "epoch": 11.681762545899632, "grad_norm": 2.007932662963867, "learning_rate": 5.750263834994383e-07, "loss": 0.4852, "step": 1193 }, { "epoch": 11.69155446756426, "grad_norm": 1.7244541645050049, "learning_rate": 5.717550941320482e-07, "loss": 0.4532, "step": 1194 }, { "epoch": 11.701346389228886, "grad_norm": 1.8952066898345947, "learning_rate": 5.684919345471029e-07, "loss": 0.544, "step": 1195 }, { "epoch": 11.711138310893514, "grad_norm": 2.0001256465911865, "learning_rate": 5.65236918502593e-07, "loss": 0.5132, "step": 1196 }, { "epoch": 11.720930232558139, "grad_norm": 2.057631492614746, "learning_rate": 5.619900597221753e-07, "loss": 0.657, "step": 1197 }, { "epoch": 11.730722154222766, "grad_norm": 2.041717767715454, "learning_rate": 5.587513718951165e-07, "loss": 0.6528, "step": 1198 }, { "epoch": 11.740514075887393, "grad_norm": 1.6682372093200684, "learning_rate": 5.555208686762304e-07, "loss": 0.3696, "step": 1199 }, { "epoch": 11.750305997552019, "grad_norm": 1.899059772491455, "learning_rate": 5.522985636858238e-07, "loss": 0.5276, "step": 1200 }, { "epoch": 11.760097919216646, "grad_norm": 2.058868169784546, "learning_rate": 5.490844705096407e-07, "loss": 0.5492, "step": 1201 }, { "epoch": 11.769889840881273, "grad_norm": 1.8956670761108398, "learning_rate": 5.458786026988005e-07, "loss": 0.347, "step": 1202 }, { "epoch": 11.7796817625459, "grad_norm": 2.0858352184295654, "learning_rate": 5.42680973769743e-07, "loss": 0.5909, "step": 1203 }, { "epoch": 11.789473684210526, "grad_norm": 1.7983345985412598, "learning_rate": 5.394915972041739e-07, "loss": 0.5332, "step": 1204 }, { "epoch": 11.799265605875153, "grad_norm": 1.9118212461471558, "learning_rate": 5.363104864490035e-07, "loss": 0.5436, "step": 1205 }, { "epoch": 11.80905752753978, "grad_norm": 2.0349478721618652, "learning_rate": 5.33137654916292e-07, "loss": 0.7181, "step": 1206 }, { "epoch": 11.818849449204407, "grad_norm": 1.7118622064590454, "learning_rate": 5.299731159831953e-07, "loss": 0.4565, "step": 1207 }, { "epoch": 11.828641370869033, "grad_norm": 1.957894206047058, "learning_rate": 5.268168829919046e-07, "loss": 0.4774, "step": 1208 }, { "epoch": 11.83843329253366, "grad_norm": 1.8915562629699707, "learning_rate": 5.236689692495916e-07, "loss": 0.5685, "step": 1209 }, { "epoch": 11.848225214198287, "grad_norm": 2.3093106746673584, "learning_rate": 5.205293880283552e-07, "loss": 0.5969, "step": 1210 }, { "epoch": 11.858017135862912, "grad_norm": 2.045757532119751, "learning_rate": 5.173981525651601e-07, "loss": 0.6257, "step": 1211 }, { "epoch": 11.86780905752754, "grad_norm": 1.7559894323349, "learning_rate": 5.14275276061785e-07, "loss": 0.4219, "step": 1212 }, { "epoch": 11.877600979192167, "grad_norm": 1.8561455011367798, "learning_rate": 5.111607716847675e-07, "loss": 0.6551, "step": 1213 }, { "epoch": 11.887392900856794, "grad_norm": 1.6830816268920898, "learning_rate": 5.080546525653448e-07, "loss": 0.543, "step": 1214 }, { "epoch": 11.89718482252142, "grad_norm": 1.806162714958191, "learning_rate": 5.049569317994013e-07, "loss": 0.4515, "step": 1215 }, { "epoch": 11.906976744186046, "grad_norm": 2.0484213829040527, "learning_rate": 5.018676224474139e-07, "loss": 0.5381, "step": 1216 }, { "epoch": 11.916768665850674, "grad_norm": 1.9797966480255127, "learning_rate": 4.987867375343938e-07, "loss": 0.429, "step": 1217 }, { "epoch": 11.9265605875153, "grad_norm": 1.913184404373169, "learning_rate": 4.957142900498335e-07, "loss": 0.5483, "step": 1218 }, { "epoch": 11.936352509179926, "grad_norm": 1.6935491561889648, "learning_rate": 4.92650292947654e-07, "loss": 0.4675, "step": 1219 }, { "epoch": 11.946144430844553, "grad_norm": 2.0470242500305176, "learning_rate": 4.895947591461456e-07, "loss": 0.4662, "step": 1220 }, { "epoch": 11.95593635250918, "grad_norm": 1.9798767566680908, "learning_rate": 4.865477015279185e-07, "loss": 0.4666, "step": 1221 }, { "epoch": 11.965728274173806, "grad_norm": 2.3621938228607178, "learning_rate": 4.835091329398436e-07, "loss": 0.9118, "step": 1222 }, { "epoch": 11.975520195838433, "grad_norm": 1.8317756652832031, "learning_rate": 4.804790661930028e-07, "loss": 0.5941, "step": 1223 }, { "epoch": 11.98531211750306, "grad_norm": 1.9772015810012817, "learning_rate": 4.774575140626317e-07, "loss": 0.423, "step": 1224 }, { "epoch": 11.995104039167687, "grad_norm": 2.137575626373291, "learning_rate": 4.744444892880662e-07, "loss": 0.4814, "step": 1225 }, { "epoch": 12.004895960832313, "grad_norm": 5.899466514587402, "learning_rate": 4.714400045726919e-07, "loss": 1.7449, "step": 1226 }, { "epoch": 12.01468788249694, "grad_norm": 1.867515206336975, "learning_rate": 4.6844407258388626e-07, "loss": 0.4916, "step": 1227 }, { "epoch": 12.024479804161567, "grad_norm": 1.9251153469085693, "learning_rate": 4.6545670595296686e-07, "loss": 0.5093, "step": 1228 }, { "epoch": 12.034271725826194, "grad_norm": 1.9238775968551636, "learning_rate": 4.6247791727514035e-07, "loss": 0.4919, "step": 1229 }, { "epoch": 12.04406364749082, "grad_norm": 1.911022424697876, "learning_rate": 4.5950771910944603e-07, "loss": 0.5174, "step": 1230 }, { "epoch": 12.053855569155447, "grad_norm": 1.920501470565796, "learning_rate": 4.5654612397870333e-07, "loss": 0.421, "step": 1231 }, { "epoch": 12.063647490820074, "grad_norm": 2.1917028427124023, "learning_rate": 4.5359314436946275e-07, "loss": 0.5259, "step": 1232 }, { "epoch": 12.0734394124847, "grad_norm": 1.8999507427215576, "learning_rate": 4.506487927319475e-07, "loss": 0.4541, "step": 1233 }, { "epoch": 12.083231334149326, "grad_norm": 1.9580881595611572, "learning_rate": 4.4771308148000487e-07, "loss": 0.4817, "step": 1234 }, { "epoch": 12.093023255813954, "grad_norm": 1.7205885648727417, "learning_rate": 4.4478602299105446e-07, "loss": 0.5076, "step": 1235 }, { "epoch": 12.10281517747858, "grad_norm": 2.0340042114257812, "learning_rate": 4.418676296060323e-07, "loss": 0.7615, "step": 1236 }, { "epoch": 12.112607099143206, "grad_norm": 1.6397504806518555, "learning_rate": 4.389579136293412e-07, "loss": 0.3314, "step": 1237 }, { "epoch": 12.122399020807833, "grad_norm": 1.9061412811279297, "learning_rate": 4.3605688732880097e-07, "loss": 0.4512, "step": 1238 }, { "epoch": 12.13219094247246, "grad_norm": 1.734278917312622, "learning_rate": 4.3316456293559154e-07, "loss": 0.4046, "step": 1239 }, { "epoch": 12.141982864137088, "grad_norm": 1.862695574760437, "learning_rate": 4.302809526442053e-07, "loss": 0.4874, "step": 1240 }, { "epoch": 12.151774785801713, "grad_norm": 1.9570109844207764, "learning_rate": 4.27406068612396e-07, "loss": 0.6334, "step": 1241 }, { "epoch": 12.16156670746634, "grad_norm": 1.887754201889038, "learning_rate": 4.2453992296112384e-07, "loss": 0.5852, "step": 1242 }, { "epoch": 12.171358629130967, "grad_norm": 2.04457950592041, "learning_rate": 4.216825277745071e-07, "loss": 0.6924, "step": 1243 }, { "epoch": 12.181150550795593, "grad_norm": 2.101046323776245, "learning_rate": 4.188338950997728e-07, "loss": 0.5397, "step": 1244 }, { "epoch": 12.19094247246022, "grad_norm": 2.055476188659668, "learning_rate": 4.159940369472015e-07, "loss": 0.4906, "step": 1245 }, { "epoch": 12.200734394124847, "grad_norm": 1.8816996812820435, "learning_rate": 4.1316296529007955e-07, "loss": 0.4771, "step": 1246 }, { "epoch": 12.210526315789474, "grad_norm": 1.9904950857162476, "learning_rate": 4.1034069206464913e-07, "loss": 0.5954, "step": 1247 }, { "epoch": 12.2203182374541, "grad_norm": 1.7235548496246338, "learning_rate": 4.075272291700558e-07, "loss": 0.4704, "step": 1248 }, { "epoch": 12.230110159118727, "grad_norm": 1.8656024932861328, "learning_rate": 4.047225884682987e-07, "loss": 0.4673, "step": 1249 }, { "epoch": 12.239902080783354, "grad_norm": 2.3303070068359375, "learning_rate": 4.019267817841835e-07, "loss": 0.6601, "step": 1250 }, { "epoch": 12.249694002447981, "grad_norm": 1.9218095541000366, "learning_rate": 3.991398209052685e-07, "loss": 0.5379, "step": 1251 }, { "epoch": 12.259485924112607, "grad_norm": 1.8027405738830566, "learning_rate": 3.9636171758181657e-07, "loss": 0.3934, "step": 1252 }, { "epoch": 12.269277845777234, "grad_norm": 2.0258567333221436, "learning_rate": 3.9359248352674737e-07, "loss": 0.6078, "step": 1253 }, { "epoch": 12.279069767441861, "grad_norm": 1.9668664932250977, "learning_rate": 3.908321304155846e-07, "loss": 0.601, "step": 1254 }, { "epoch": 12.288861689106486, "grad_norm": 1.920562505722046, "learning_rate": 3.880806698864087e-07, "loss": 0.5523, "step": 1255 }, { "epoch": 12.298653610771114, "grad_norm": 1.9591246843338013, "learning_rate": 3.853381135398093e-07, "loss": 0.4438, "step": 1256 }, { "epoch": 12.30844553243574, "grad_norm": 1.8171638250350952, "learning_rate": 3.8260447293883255e-07, "loss": 0.3835, "step": 1257 }, { "epoch": 12.318237454100368, "grad_norm": 2.0839426517486572, "learning_rate": 3.798797596089351e-07, "loss": 0.4999, "step": 1258 }, { "epoch": 12.328029375764993, "grad_norm": 1.8789358139038086, "learning_rate": 3.771639850379358e-07, "loss": 0.4171, "step": 1259 }, { "epoch": 12.33782129742962, "grad_norm": 2.1004509925842285, "learning_rate": 3.7445716067596506e-07, "loss": 0.4527, "step": 1260 }, { "epoch": 12.347613219094248, "grad_norm": 1.9021142721176147, "learning_rate": 3.717592979354176e-07, "loss": 0.3917, "step": 1261 }, { "epoch": 12.357405140758875, "grad_norm": 1.9369584321975708, "learning_rate": 3.6907040819090604e-07, "loss": 0.5497, "step": 1262 }, { "epoch": 12.3671970624235, "grad_norm": 1.8855191469192505, "learning_rate": 3.6639050277921055e-07, "loss": 0.482, "step": 1263 }, { "epoch": 12.376988984088127, "grad_norm": 1.9581860303878784, "learning_rate": 3.63719592999231e-07, "loss": 0.648, "step": 1264 }, { "epoch": 12.386780905752754, "grad_norm": 2.041123628616333, "learning_rate": 3.6105769011194225e-07, "loss": 0.6284, "step": 1265 }, { "epoch": 12.39657282741738, "grad_norm": 1.9144151210784912, "learning_rate": 3.5840480534034355e-07, "loss": 0.4635, "step": 1266 }, { "epoch": 12.406364749082007, "grad_norm": 1.9708912372589111, "learning_rate": 3.5576094986941146e-07, "loss": 0.5839, "step": 1267 }, { "epoch": 12.416156670746634, "grad_norm": 1.825844168663025, "learning_rate": 3.5312613484605546e-07, "loss": 0.6101, "step": 1268 }, { "epoch": 12.425948592411261, "grad_norm": 1.91981840133667, "learning_rate": 3.505003713790689e-07, "loss": 0.7387, "step": 1269 }, { "epoch": 12.435740514075887, "grad_norm": 1.867347002029419, "learning_rate": 3.4788367053908087e-07, "loss": 0.5045, "step": 1270 }, { "epoch": 12.445532435740514, "grad_norm": 1.8076897859573364, "learning_rate": 3.4527604335851117e-07, "loss": 0.5408, "step": 1271 }, { "epoch": 12.455324357405141, "grad_norm": 1.9808920621871948, "learning_rate": 3.4267750083152587e-07, "loss": 0.4968, "step": 1272 }, { "epoch": 12.465116279069768, "grad_norm": 2.0484859943389893, "learning_rate": 3.4008805391398614e-07, "loss": 0.6171, "step": 1273 }, { "epoch": 12.474908200734394, "grad_norm": 1.8254646062850952, "learning_rate": 3.375077135234051e-07, "loss": 0.4376, "step": 1274 }, { "epoch": 12.48470012239902, "grad_norm": 1.9746294021606445, "learning_rate": 3.3493649053890325e-07, "loss": 0.5943, "step": 1275 }, { "epoch": 12.494492044063648, "grad_norm": 2.043105125427246, "learning_rate": 3.323743958011588e-07, "loss": 0.6165, "step": 1276 }, { "epoch": 12.504283965728273, "grad_norm": 2.184504508972168, "learning_rate": 3.298214401123634e-07, "loss": 0.4594, "step": 1277 }, { "epoch": 12.5140758873929, "grad_norm": 1.975448489189148, "learning_rate": 3.2727763423617915e-07, "loss": 0.4594, "step": 1278 }, { "epoch": 12.523867809057528, "grad_norm": 1.9909656047821045, "learning_rate": 3.2474298889768967e-07, "loss": 0.623, "step": 1279 }, { "epoch": 12.533659730722155, "grad_norm": 2.0538201332092285, "learning_rate": 3.222175147833556e-07, "loss": 0.5231, "step": 1280 }, { "epoch": 12.54345165238678, "grad_norm": 1.8169667720794678, "learning_rate": 3.197012225409729e-07, "loss": 0.3419, "step": 1281 }, { "epoch": 12.553243574051407, "grad_norm": 1.7477304935455322, "learning_rate": 3.171941227796227e-07, "loss": 0.4297, "step": 1282 }, { "epoch": 12.563035495716035, "grad_norm": 1.827813982963562, "learning_rate": 3.1469622606962955e-07, "loss": 0.5072, "step": 1283 }, { "epoch": 12.572827417380662, "grad_norm": 1.8867334127426147, "learning_rate": 3.122075429425184e-07, "loss": 0.473, "step": 1284 }, { "epoch": 12.582619339045287, "grad_norm": 1.9495500326156616, "learning_rate": 3.0972808389096636e-07, "loss": 0.4184, "step": 1285 }, { "epoch": 12.592411260709914, "grad_norm": 1.8422271013259888, "learning_rate": 3.072578593687606e-07, "loss": 0.3199, "step": 1286 }, { "epoch": 12.602203182374542, "grad_norm": 1.8673765659332275, "learning_rate": 3.047968797907552e-07, "loss": 0.6013, "step": 1287 }, { "epoch": 12.611995104039167, "grad_norm": 1.789392352104187, "learning_rate": 3.0234515553282523e-07, "loss": 0.5454, "step": 1288 }, { "epoch": 12.621787025703794, "grad_norm": 2.0017011165618896, "learning_rate": 2.999026969318236e-07, "loss": 0.531, "step": 1289 }, { "epoch": 12.631578947368421, "grad_norm": 2.092477321624756, "learning_rate": 2.9746951428553884e-07, "loss": 0.5699, "step": 1290 }, { "epoch": 12.641370869033048, "grad_norm": 2.0250766277313232, "learning_rate": 2.950456178526498e-07, "loss": 0.5838, "step": 1291 }, { "epoch": 12.651162790697674, "grad_norm": 1.9144033193588257, "learning_rate": 2.9263101785268253e-07, "loss": 0.4428, "step": 1292 }, { "epoch": 12.660954712362301, "grad_norm": 2.1199519634246826, "learning_rate": 2.902257244659701e-07, "loss": 0.4508, "step": 1293 }, { "epoch": 12.670746634026928, "grad_norm": 1.888130784034729, "learning_rate": 2.8782974783360534e-07, "loss": 0.4375, "step": 1294 }, { "epoch": 12.680538555691555, "grad_norm": 2.2820069789886475, "learning_rate": 2.854430980574002e-07, "loss": 0.5079, "step": 1295 }, { "epoch": 12.69033047735618, "grad_norm": 1.8801189661026, "learning_rate": 2.8306578519984526e-07, "loss": 0.3016, "step": 1296 }, { "epoch": 12.700122399020808, "grad_norm": 2.2378287315368652, "learning_rate": 2.8069781928406313e-07, "loss": 0.5653, "step": 1297 }, { "epoch": 12.709914320685435, "grad_norm": 1.8046623468399048, "learning_rate": 2.783392102937682e-07, "loss": 0.28, "step": 1298 }, { "epoch": 12.71970624235006, "grad_norm": 1.8635425567626953, "learning_rate": 2.7598996817322614e-07, "loss": 0.354, "step": 1299 }, { "epoch": 12.729498164014688, "grad_norm": 1.8970017433166504, "learning_rate": 2.7365010282720954e-07, "loss": 0.3553, "step": 1300 }, { "epoch": 12.739290085679315, "grad_norm": 1.8114985227584839, "learning_rate": 2.713196241209562e-07, "loss": 0.32, "step": 1301 }, { "epoch": 12.749082007343942, "grad_norm": 1.9324902296066284, "learning_rate": 2.6899854188013054e-07, "loss": 0.6203, "step": 1302 }, { "epoch": 12.758873929008567, "grad_norm": 1.9595811367034912, "learning_rate": 2.6668686589077815e-07, "loss": 0.449, "step": 1303 }, { "epoch": 12.768665850673194, "grad_norm": 2.0151216983795166, "learning_rate": 2.643846058992866e-07, "loss": 0.4417, "step": 1304 }, { "epoch": 12.778457772337822, "grad_norm": 2.1633002758026123, "learning_rate": 2.620917716123444e-07, "loss": 0.7243, "step": 1305 }, { "epoch": 12.788249694002449, "grad_norm": 2.058457374572754, "learning_rate": 2.5980837269690056e-07, "loss": 0.7341, "step": 1306 }, { "epoch": 12.798041615667074, "grad_norm": 1.598192572593689, "learning_rate": 2.5753441878012236e-07, "loss": 0.302, "step": 1307 }, { "epoch": 12.807833537331701, "grad_norm": 2.1217687129974365, "learning_rate": 2.552699194493549e-07, "loss": 0.6068, "step": 1308 }, { "epoch": 12.817625458996329, "grad_norm": 1.8791109323501587, "learning_rate": 2.53014884252083e-07, "loss": 0.4228, "step": 1309 }, { "epoch": 12.827417380660954, "grad_norm": 1.7703254222869873, "learning_rate": 2.507693226958871e-07, "loss": 0.4075, "step": 1310 }, { "epoch": 12.837209302325581, "grad_norm": 2.177889108657837, "learning_rate": 2.4853324424840674e-07, "loss": 0.6143, "step": 1311 }, { "epoch": 12.847001223990208, "grad_norm": 1.8406115770339966, "learning_rate": 2.463066583372989e-07, "loss": 0.6163, "step": 1312 }, { "epoch": 12.856793145654835, "grad_norm": 1.698757529258728, "learning_rate": 2.4408957435019836e-07, "loss": 0.3683, "step": 1313 }, { "epoch": 12.86658506731946, "grad_norm": 1.8872238397598267, "learning_rate": 2.418820016346779e-07, "loss": 0.4721, "step": 1314 }, { "epoch": 12.876376988984088, "grad_norm": 2.2248075008392334, "learning_rate": 2.3968394949821034e-07, "loss": 0.6698, "step": 1315 }, { "epoch": 12.886168910648715, "grad_norm": 1.768311858177185, "learning_rate": 2.3749542720812757e-07, "loss": 0.4077, "step": 1316 }, { "epoch": 12.895960832313342, "grad_norm": 2.0064873695373535, "learning_rate": 2.3531644399158198e-07, "loss": 0.4331, "step": 1317 }, { "epoch": 12.905752753977968, "grad_norm": 1.9465358257293701, "learning_rate": 2.331470090355084e-07, "loss": 0.5159, "step": 1318 }, { "epoch": 12.915544675642595, "grad_norm": 1.8796097040176392, "learning_rate": 2.3098713148658414e-07, "loss": 0.637, "step": 1319 }, { "epoch": 12.925336597307222, "grad_norm": 1.8773866891860962, "learning_rate": 2.2883682045119066e-07, "loss": 0.4701, "step": 1320 }, { "epoch": 12.935128518971847, "grad_norm": 2.0348799228668213, "learning_rate": 2.266960849953767e-07, "loss": 0.3891, "step": 1321 }, { "epoch": 12.944920440636475, "grad_norm": 2.386507511138916, "learning_rate": 2.2456493414481778e-07, "loss": 0.6821, "step": 1322 }, { "epoch": 12.954712362301102, "grad_norm": 1.843592882156372, "learning_rate": 2.224433768847789e-07, "loss": 0.4941, "step": 1323 }, { "epoch": 12.964504283965729, "grad_norm": 2.0456154346466064, "learning_rate": 2.2033142216007913e-07, "loss": 0.7217, "step": 1324 }, { "epoch": 12.974296205630354, "grad_norm": 1.9596956968307495, "learning_rate": 2.1822907887504936e-07, "loss": 0.6575, "step": 1325 }, { "epoch": 12.984088127294982, "grad_norm": 1.9567314386367798, "learning_rate": 2.1613635589349756e-07, "loss": 0.5005, "step": 1326 }, { "epoch": 12.993880048959609, "grad_norm": 1.9836480617523193, "learning_rate": 2.140532620386726e-07, "loss": 0.5636, "step": 1327 }, { "epoch": 13.003671970624236, "grad_norm": 5.507444381713867, "learning_rate": 2.1197980609322406e-07, "loss": 1.7195, "step": 1328 }, { "epoch": 13.013463892288861, "grad_norm": 1.674594521522522, "learning_rate": 2.0991599679916624e-07, "loss": 0.3194, "step": 1329 }, { "epoch": 13.023255813953488, "grad_norm": 1.959998607635498, "learning_rate": 2.07861842857843e-07, "loss": 0.7446, "step": 1330 }, { "epoch": 13.033047735618116, "grad_norm": 2.142037868499756, "learning_rate": 2.0581735292988847e-07, "loss": 0.4909, "step": 1331 }, { "epoch": 13.042839657282741, "grad_norm": 1.7994287014007568, "learning_rate": 2.0378253563519247e-07, "loss": 0.4113, "step": 1332 }, { "epoch": 13.052631578947368, "grad_norm": 2.0098366737365723, "learning_rate": 2.017573995528635e-07, "loss": 0.627, "step": 1333 }, { "epoch": 13.062423500611995, "grad_norm": 2.370952606201172, "learning_rate": 1.997419532211925e-07, "loss": 0.3698, "step": 1334 }, { "epoch": 13.072215422276622, "grad_norm": 2.1613762378692627, "learning_rate": 1.9773620513761582e-07, "loss": 0.3042, "step": 1335 }, { "epoch": 13.082007343941248, "grad_norm": 1.9969666004180908, "learning_rate": 1.9574016375868282e-07, "loss": 0.5605, "step": 1336 }, { "epoch": 13.091799265605875, "grad_norm": 2.193392753601074, "learning_rate": 1.9375383750001559e-07, "loss": 0.8394, "step": 1337 }, { "epoch": 13.101591187270502, "grad_norm": 2.0694093704223633, "learning_rate": 1.9177723473627647e-07, "loss": 0.4901, "step": 1338 }, { "epoch": 13.11138310893513, "grad_norm": 1.890702486038208, "learning_rate": 1.898103638011331e-07, "loss": 0.4428, "step": 1339 }, { "epoch": 13.121175030599755, "grad_norm": 1.6986807584762573, "learning_rate": 1.8785323298722098e-07, "loss": 0.296, "step": 1340 }, { "epoch": 13.130966952264382, "grad_norm": 1.7923239469528198, "learning_rate": 1.859058505461095e-07, "loss": 0.5187, "step": 1341 }, { "epoch": 13.140758873929009, "grad_norm": 1.9225521087646484, "learning_rate": 1.839682246882682e-07, "loss": 0.6645, "step": 1342 }, { "epoch": 13.150550795593634, "grad_norm": 1.9675309658050537, "learning_rate": 1.8204036358303173e-07, "loss": 0.5809, "step": 1343 }, { "epoch": 13.160342717258262, "grad_norm": 2.035162925720215, "learning_rate": 1.801222753585638e-07, "loss": 0.45, "step": 1344 }, { "epoch": 13.170134638922889, "grad_norm": 2.291029453277588, "learning_rate": 1.782139681018244e-07, "loss": 0.5717, "step": 1345 }, { "epoch": 13.179926560587516, "grad_norm": 1.8917964696884155, "learning_rate": 1.7631544985853623e-07, "loss": 0.3383, "step": 1346 }, { "epoch": 13.189718482252141, "grad_norm": 1.7345136404037476, "learning_rate": 1.744267286331497e-07, "loss": 0.3889, "step": 1347 }, { "epoch": 13.199510403916769, "grad_norm": 1.8893895149230957, "learning_rate": 1.725478123888083e-07, "loss": 0.5418, "step": 1348 }, { "epoch": 13.209302325581396, "grad_norm": 2.04268479347229, "learning_rate": 1.7067870904731803e-07, "loss": 0.4571, "step": 1349 }, { "epoch": 13.219094247246023, "grad_norm": 1.7676559686660767, "learning_rate": 1.6881942648911077e-07, "loss": 0.3639, "step": 1350 }, { "epoch": 13.228886168910648, "grad_norm": 1.8710107803344727, "learning_rate": 1.6696997255321283e-07, "loss": 0.4669, "step": 1351 }, { "epoch": 13.238678090575275, "grad_norm": 1.8840008974075317, "learning_rate": 1.6513035503721213e-07, "loss": 0.6166, "step": 1352 }, { "epoch": 13.248470012239903, "grad_norm": 2.0223827362060547, "learning_rate": 1.6330058169722384e-07, "loss": 0.5979, "step": 1353 }, { "epoch": 13.258261933904528, "grad_norm": 1.7062898874282837, "learning_rate": 1.614806602478583e-07, "loss": 0.3199, "step": 1354 }, { "epoch": 13.268053855569155, "grad_norm": 1.9188346862792969, "learning_rate": 1.5967059836219047e-07, "loss": 0.6318, "step": 1355 }, { "epoch": 13.277845777233782, "grad_norm": 1.776623010635376, "learning_rate": 1.5787040367172379e-07, "loss": 0.3405, "step": 1356 }, { "epoch": 13.28763769889841, "grad_norm": 2.0769689083099365, "learning_rate": 1.560800837663609e-07, "loss": 0.4649, "step": 1357 }, { "epoch": 13.297429620563035, "grad_norm": 1.83968186378479, "learning_rate": 1.542996461943716e-07, "loss": 0.5239, "step": 1358 }, { "epoch": 13.307221542227662, "grad_norm": 1.7961751222610474, "learning_rate": 1.5252909846235898e-07, "loss": 0.468, "step": 1359 }, { "epoch": 13.31701346389229, "grad_norm": 2.1870229244232178, "learning_rate": 1.507684480352292e-07, "loss": 0.735, "step": 1360 }, { "epoch": 13.326805385556916, "grad_norm": 1.9340951442718506, "learning_rate": 1.490177023361608e-07, "loss": 0.538, "step": 1361 }, { "epoch": 13.336597307221542, "grad_norm": 1.8332093954086304, "learning_rate": 1.4727686874657143e-07, "loss": 0.4368, "step": 1362 }, { "epoch": 13.346389228886169, "grad_norm": 1.9275071620941162, "learning_rate": 1.455459546060875e-07, "loss": 0.4635, "step": 1363 }, { "epoch": 13.356181150550796, "grad_norm": 2.0469605922698975, "learning_rate": 1.4382496721251526e-07, "loss": 0.6578, "step": 1364 }, { "epoch": 13.365973072215422, "grad_norm": 2.023045778274536, "learning_rate": 1.421139138218064e-07, "loss": 0.5658, "step": 1365 }, { "epoch": 13.375764993880049, "grad_norm": 1.727044939994812, "learning_rate": 1.4041280164802967e-07, "loss": 0.3066, "step": 1366 }, { "epoch": 13.385556915544676, "grad_norm": 2.4948127269744873, "learning_rate": 1.3872163786334103e-07, "loss": 0.9658, "step": 1367 }, { "epoch": 13.395348837209303, "grad_norm": 2.1113333702087402, "learning_rate": 1.3704042959795132e-07, "loss": 0.5048, "step": 1368 }, { "epoch": 13.405140758873928, "grad_norm": 2.171971559524536, "learning_rate": 1.3536918394009752e-07, "loss": 0.5133, "step": 1369 }, { "epoch": 13.414932680538556, "grad_norm": 1.9796112775802612, "learning_rate": 1.3370790793601373e-07, "loss": 0.5596, "step": 1370 }, { "epoch": 13.424724602203183, "grad_norm": 1.8083933591842651, "learning_rate": 1.3205660858989943e-07, "loss": 0.4685, "step": 1371 }, { "epoch": 13.43451652386781, "grad_norm": 2.1228833198547363, "learning_rate": 1.3041529286389078e-07, "loss": 0.7031, "step": 1372 }, { "epoch": 13.444308445532435, "grad_norm": 1.9004195928573608, "learning_rate": 1.2878396767803252e-07, "loss": 0.5195, "step": 1373 }, { "epoch": 13.454100367197062, "grad_norm": 1.8698636293411255, "learning_rate": 1.2716263991024712e-07, "loss": 0.4356, "step": 1374 }, { "epoch": 13.46389228886169, "grad_norm": 2.235520601272583, "learning_rate": 1.255513163963057e-07, "loss": 0.7666, "step": 1375 }, { "epoch": 13.473684210526315, "grad_norm": 1.7209237813949585, "learning_rate": 1.2395000392980057e-07, "loss": 0.3454, "step": 1376 }, { "epoch": 13.483476132190942, "grad_norm": 1.8723653554916382, "learning_rate": 1.223587092621162e-07, "loss": 0.3125, "step": 1377 }, { "epoch": 13.49326805385557, "grad_norm": 2.075610399246216, "learning_rate": 1.2077743910239998e-07, "loss": 0.4442, "step": 1378 }, { "epoch": 13.503059975520197, "grad_norm": 1.7446602582931519, "learning_rate": 1.1920620011753397e-07, "loss": 0.406, "step": 1379 }, { "epoch": 13.512851897184822, "grad_norm": 1.8240257501602173, "learning_rate": 1.1764499893210879e-07, "loss": 0.3662, "step": 1380 }, { "epoch": 13.522643818849449, "grad_norm": 1.9492757320404053, "learning_rate": 1.160938421283922e-07, "loss": 0.5193, "step": 1381 }, { "epoch": 13.532435740514076, "grad_norm": 2.241713523864746, "learning_rate": 1.145527362463042e-07, "loss": 0.5315, "step": 1382 }, { "epoch": 13.542227662178703, "grad_norm": 2.0243585109710693, "learning_rate": 1.1302168778338923e-07, "loss": 0.5548, "step": 1383 }, { "epoch": 13.552019583843329, "grad_norm": 1.9987409114837646, "learning_rate": 1.1150070319478679e-07, "loss": 0.4261, "step": 1384 }, { "epoch": 13.561811505507956, "grad_norm": 1.814024567604065, "learning_rate": 1.0998978889320583e-07, "loss": 0.3669, "step": 1385 }, { "epoch": 13.571603427172583, "grad_norm": 1.914786458015442, "learning_rate": 1.0848895124889819e-07, "loss": 0.6041, "step": 1386 }, { "epoch": 13.581395348837209, "grad_norm": 1.670328974723816, "learning_rate": 1.0699819658962996e-07, "loss": 0.3073, "step": 1387 }, { "epoch": 13.591187270501836, "grad_norm": 2.48321270942688, "learning_rate": 1.0551753120065621e-07, "loss": 0.5374, "step": 1388 }, { "epoch": 13.600979192166463, "grad_norm": 2.292393207550049, "learning_rate": 1.0404696132469445e-07, "loss": 0.6152, "step": 1389 }, { "epoch": 13.61077111383109, "grad_norm": 2.3069007396698, "learning_rate": 1.0258649316189722e-07, "loss": 0.8122, "step": 1390 }, { "epoch": 13.620563035495715, "grad_norm": 1.9227274656295776, "learning_rate": 1.0113613286982682e-07, "loss": 0.6842, "step": 1391 }, { "epoch": 13.630354957160343, "grad_norm": 1.9147167205810547, "learning_rate": 9.969588656342982e-08, "loss": 0.4876, "step": 1392 }, { "epoch": 13.64014687882497, "grad_norm": 2.272982120513916, "learning_rate": 9.826576031501001e-08, "loss": 0.4608, "step": 1393 }, { "epoch": 13.649938800489597, "grad_norm": 1.9352953433990479, "learning_rate": 9.684576015420277e-08, "loss": 0.4135, "step": 1394 }, { "epoch": 13.659730722154222, "grad_norm": 2.049243688583374, "learning_rate": 9.54358920679524e-08, "loss": 0.6044, "step": 1395 }, { "epoch": 13.66952264381885, "grad_norm": 1.8770604133605957, "learning_rate": 9.403616200048288e-08, "loss": 0.5958, "step": 1396 }, { "epoch": 13.679314565483477, "grad_norm": 1.7168536186218262, "learning_rate": 9.264657585327519e-08, "loss": 0.3728, "step": 1397 }, { "epoch": 13.689106487148102, "grad_norm": 2.173676013946533, "learning_rate": 9.12671394850423e-08, "loss": 0.6132, "step": 1398 }, { "epoch": 13.69889840881273, "grad_norm": 1.8513484001159668, "learning_rate": 8.989785871170415e-08, "loss": 0.3971, "step": 1399 }, { "epoch": 13.708690330477356, "grad_norm": 1.895801305770874, "learning_rate": 8.85387393063622e-08, "loss": 0.5569, "step": 1400 }, { "epoch": 13.718482252141984, "grad_norm": 1.9521677494049072, "learning_rate": 8.718978699927744e-08, "loss": 0.5179, "step": 1401 }, { "epoch": 13.728274173806609, "grad_norm": 1.9037365913391113, "learning_rate": 8.585100747784376e-08, "loss": 0.4201, "step": 1402 }, { "epoch": 13.738066095471236, "grad_norm": 1.8467025756835938, "learning_rate": 8.45224063865649e-08, "loss": 0.4806, "step": 1403 }, { "epoch": 13.747858017135863, "grad_norm": 1.7039233446121216, "learning_rate": 8.320398932703145e-08, "loss": 0.5024, "step": 1404 }, { "epoch": 13.75764993880049, "grad_norm": 1.9479820728302002, "learning_rate": 8.189576185789638e-08, "loss": 0.5498, "step": 1405 }, { "epoch": 13.767441860465116, "grad_norm": 1.8009456396102905, "learning_rate": 8.059772949485068e-08, "loss": 0.449, "step": 1406 }, { "epoch": 13.777233782129743, "grad_norm": 1.9916868209838867, "learning_rate": 7.930989771060299e-08, "loss": 0.515, "step": 1407 }, { "epoch": 13.78702570379437, "grad_norm": 1.88126540184021, "learning_rate": 7.803227193485336e-08, "loss": 0.4845, "step": 1408 }, { "epoch": 13.796817625458996, "grad_norm": 1.864071011543274, "learning_rate": 7.676485755427177e-08, "loss": 0.5492, "step": 1409 }, { "epoch": 13.806609547123623, "grad_norm": 1.840491533279419, "learning_rate": 7.550765991247655e-08, "loss": 0.3873, "step": 1410 }, { "epoch": 13.81640146878825, "grad_norm": 1.8186510801315308, "learning_rate": 7.426068431000883e-08, "loss": 0.5462, "step": 1411 }, { "epoch": 13.826193390452877, "grad_norm": 1.8547239303588867, "learning_rate": 7.30239360043139e-08, "loss": 0.4139, "step": 1412 }, { "epoch": 13.835985312117502, "grad_norm": 2.010782480239868, "learning_rate": 7.17974202097152e-08, "loss": 0.5298, "step": 1413 }, { "epoch": 13.84577723378213, "grad_norm": 1.8667088747024536, "learning_rate": 7.058114209739675e-08, "loss": 0.406, "step": 1414 }, { "epoch": 13.855569155446757, "grad_norm": 1.8373886346817017, "learning_rate": 6.937510679537628e-08, "loss": 0.5954, "step": 1415 }, { "epoch": 13.865361077111384, "grad_norm": 1.708153486251831, "learning_rate": 6.817931938848805e-08, "loss": 0.3425, "step": 1416 }, { "epoch": 13.87515299877601, "grad_norm": 1.838835597038269, "learning_rate": 6.699378491835911e-08, "loss": 0.5723, "step": 1417 }, { "epoch": 13.884944920440637, "grad_norm": 1.976016640663147, "learning_rate": 6.581850838338816e-08, "loss": 0.7409, "step": 1418 }, { "epoch": 13.894736842105264, "grad_norm": 1.791101336479187, "learning_rate": 6.465349473872479e-08, "loss": 0.5263, "step": 1419 }, { "epoch": 13.904528763769889, "grad_norm": 1.8388831615447998, "learning_rate": 6.349874889624963e-08, "loss": 0.4407, "step": 1420 }, { "epoch": 13.914320685434516, "grad_norm": 1.9698755741119385, "learning_rate": 6.235427572455155e-08, "loss": 0.5674, "step": 1421 }, { "epoch": 13.924112607099143, "grad_norm": 1.7518919706344604, "learning_rate": 6.12200800489085e-08, "loss": 0.466, "step": 1422 }, { "epoch": 13.93390452876377, "grad_norm": 1.619680643081665, "learning_rate": 6.009616665126728e-08, "loss": 0.3003, "step": 1423 }, { "epoch": 13.943696450428396, "grad_norm": 1.962293267250061, "learning_rate": 5.898254027022293e-08, "loss": 0.6451, "step": 1424 }, { "epoch": 13.953488372093023, "grad_norm": 1.9984534978866577, "learning_rate": 5.78792056009983e-08, "loss": 0.6049, "step": 1425 }, { "epoch": 13.96328029375765, "grad_norm": 2.17364501953125, "learning_rate": 5.678616729542535e-08, "loss": 0.5014, "step": 1426 }, { "epoch": 13.973072215422278, "grad_norm": 1.91876220703125, "learning_rate": 5.5703429961924404e-08, "loss": 0.3848, "step": 1427 }, { "epoch": 13.982864137086903, "grad_norm": 1.971498966217041, "learning_rate": 5.463099816548578e-08, "loss": 0.5689, "step": 1428 }, { "epoch": 13.99265605875153, "grad_norm": 1.9278484582901, "learning_rate": 5.356887642765013e-08, "loss": 0.6432, "step": 1429 }, { "epoch": 14.002447980416157, "grad_norm": 8.710877418518066, "learning_rate": 5.2517069226488694e-08, "loss": 0.8635, "step": 1430 }, { "epoch": 14.012239902080783, "grad_norm": 1.8645914793014526, "learning_rate": 5.1475580996585285e-08, "loss": 0.4164, "step": 1431 }, { "epoch": 14.02203182374541, "grad_norm": 1.8691827058792114, "learning_rate": 5.044441612901768e-08, "loss": 0.7011, "step": 1432 }, { "epoch": 14.031823745410037, "grad_norm": 1.900776982307434, "learning_rate": 4.942357897133793e-08, "loss": 0.469, "step": 1433 }, { "epoch": 14.041615667074664, "grad_norm": 1.997868537902832, "learning_rate": 4.841307382755567e-08, "loss": 0.7734, "step": 1434 }, { "epoch": 14.05140758873929, "grad_norm": 1.656363844871521, "learning_rate": 4.7412904958118736e-08, "loss": 0.3789, "step": 1435 }, { "epoch": 14.061199510403917, "grad_norm": 1.8283662796020508, "learning_rate": 4.6423076579895646e-08, "loss": 0.5724, "step": 1436 }, { "epoch": 14.070991432068544, "grad_norm": 2.093168020248413, "learning_rate": 4.544359286615785e-08, "loss": 0.6101, "step": 1437 }, { "epoch": 14.080783353733171, "grad_norm": 1.943001627922058, "learning_rate": 4.4474457946562245e-08, "loss": 0.5572, "step": 1438 }, { "epoch": 14.090575275397796, "grad_norm": 1.8186061382293701, "learning_rate": 4.351567590713313e-08, "loss": 0.5179, "step": 1439 }, { "epoch": 14.100367197062424, "grad_norm": 2.003988742828369, "learning_rate": 4.256725079024554e-08, "loss": 0.4297, "step": 1440 }, { "epoch": 14.11015911872705, "grad_norm": 2.312518835067749, "learning_rate": 4.1629186594608905e-08, "loss": 0.6148, "step": 1441 }, { "epoch": 14.119951040391676, "grad_norm": 1.821373462677002, "learning_rate": 4.070148727524814e-08, "loss": 0.2952, "step": 1442 }, { "epoch": 14.129742962056303, "grad_norm": 1.9149075746536255, "learning_rate": 3.978415674348896e-08, "loss": 0.491, "step": 1443 }, { "epoch": 14.13953488372093, "grad_norm": 1.8513234853744507, "learning_rate": 3.887719886694091e-08, "loss": 0.5945, "step": 1444 }, { "epoch": 14.149326805385558, "grad_norm": 1.966253638267517, "learning_rate": 3.798061746947995e-08, "loss": 0.6106, "step": 1445 }, { "epoch": 14.159118727050183, "grad_norm": 1.9442884922027588, "learning_rate": 3.709441633123367e-08, "loss": 0.5533, "step": 1446 }, { "epoch": 14.16891064871481, "grad_norm": 2.1292994022369385, "learning_rate": 3.621859918856524e-08, "loss": 0.7585, "step": 1447 }, { "epoch": 14.178702570379437, "grad_norm": 1.8493951559066772, "learning_rate": 3.535316973405672e-08, "loss": 0.5246, "step": 1448 }, { "epoch": 14.188494492044065, "grad_norm": 2.0847296714782715, "learning_rate": 3.449813161649357e-08, "loss": 0.6878, "step": 1449 }, { "epoch": 14.19828641370869, "grad_norm": 1.8896414041519165, "learning_rate": 3.3653488440851255e-08, "loss": 0.7654, "step": 1450 }, { "epoch": 14.208078335373317, "grad_norm": 1.963158130645752, "learning_rate": 3.281924376827728e-08, "loss": 0.6161, "step": 1451 }, { "epoch": 14.217870257037944, "grad_norm": 1.780059576034546, "learning_rate": 3.1995401116077516e-08, "loss": 0.401, "step": 1452 }, { "epoch": 14.22766217870257, "grad_norm": 1.869331955909729, "learning_rate": 3.118196395770212e-08, "loss": 0.4771, "step": 1453 }, { "epoch": 14.237454100367197, "grad_norm": 1.9949331283569336, "learning_rate": 3.037893572272937e-08, "loss": 0.4554, "step": 1454 }, { "epoch": 14.247246022031824, "grad_norm": 2.11059308052063, "learning_rate": 2.958631979685156e-08, "loss": 0.452, "step": 1455 }, { "epoch": 14.257037943696451, "grad_norm": 1.846139907836914, "learning_rate": 2.8804119521862183e-08, "loss": 0.4716, "step": 1456 }, { "epoch": 14.266829865361077, "grad_norm": 1.9983474016189575, "learning_rate": 2.8032338195639886e-08, "loss": 0.5966, "step": 1457 }, { "epoch": 14.276621787025704, "grad_norm": 1.8684366941452026, "learning_rate": 2.7270979072135106e-08, "loss": 0.3852, "step": 1458 }, { "epoch": 14.286413708690331, "grad_norm": 1.9761377573013306, "learning_rate": 2.6520045361357606e-08, "loss": 0.4229, "step": 1459 }, { "epoch": 14.296205630354958, "grad_norm": 1.5237106084823608, "learning_rate": 2.5779540229361744e-08, "loss": 0.2809, "step": 1460 }, { "epoch": 14.305997552019583, "grad_norm": 1.8286893367767334, "learning_rate": 2.5049466798232614e-08, "loss": 0.5039, "step": 1461 }, { "epoch": 14.31578947368421, "grad_norm": 1.9930884838104248, "learning_rate": 2.4329828146074096e-08, "loss": 0.4821, "step": 1462 }, { "epoch": 14.325581395348838, "grad_norm": 2.0325517654418945, "learning_rate": 2.3620627306995826e-08, "loss": 0.5246, "step": 1463 }, { "epoch": 14.335373317013463, "grad_norm": 1.8594211339950562, "learning_rate": 2.2921867271099296e-08, "loss": 0.5429, "step": 1464 }, { "epoch": 14.34516523867809, "grad_norm": 1.7734907865524292, "learning_rate": 2.2233550984466225e-08, "loss": 0.4919, "step": 1465 }, { "epoch": 14.354957160342718, "grad_norm": 2.08024525642395, "learning_rate": 2.155568134914604e-08, "loss": 0.4798, "step": 1466 }, { "epoch": 14.364749082007345, "grad_norm": 1.9115285873413086, "learning_rate": 2.0888261223143136e-08, "loss": 0.4909, "step": 1467 }, { "epoch": 14.37454100367197, "grad_norm": 1.8302574157714844, "learning_rate": 2.0231293420405194e-08, "loss": 0.439, "step": 1468 }, { "epoch": 14.384332925336597, "grad_norm": 1.8212227821350098, "learning_rate": 1.9584780710811546e-08, "loss": 0.4185, "step": 1469 }, { "epoch": 14.394124847001224, "grad_norm": 1.8805993795394897, "learning_rate": 1.8948725820160663e-08, "loss": 0.4541, "step": 1470 }, { "epoch": 14.403916768665852, "grad_norm": 1.6841789484024048, "learning_rate": 1.8323131430159902e-08, "loss": 0.3968, "step": 1471 }, { "epoch": 14.413708690330477, "grad_norm": 1.9470292329788208, "learning_rate": 1.770800017841301e-08, "loss": 0.5892, "step": 1472 }, { "epoch": 14.423500611995104, "grad_norm": 1.8566615581512451, "learning_rate": 1.7103334658409577e-08, "loss": 0.4822, "step": 1473 }, { "epoch": 14.433292533659731, "grad_norm": 1.796751618385315, "learning_rate": 1.650913741951421e-08, "loss": 0.5016, "step": 1474 }, { "epoch": 14.443084455324357, "grad_norm": 2.07592511177063, "learning_rate": 1.5925410966955713e-08, "loss": 0.5869, "step": 1475 }, { "epoch": 14.452876376988984, "grad_norm": 2.1160025596618652, "learning_rate": 1.5352157761815978e-08, "loss": 0.4181, "step": 1476 }, { "epoch": 14.462668298653611, "grad_norm": 1.8969182968139648, "learning_rate": 1.4789380221019999e-08, "loss": 0.519, "step": 1477 }, { "epoch": 14.472460220318238, "grad_norm": 1.8452991247177124, "learning_rate": 1.4237080717326712e-08, "loss": 0.3495, "step": 1478 }, { "epoch": 14.482252141982864, "grad_norm": 1.5435068607330322, "learning_rate": 1.3695261579316776e-08, "loss": 0.4068, "step": 1479 }, { "epoch": 14.49204406364749, "grad_norm": 2.0963876247406006, "learning_rate": 1.3163925091384532e-08, "loss": 0.5224, "step": 1480 }, { "epoch": 14.501835985312118, "grad_norm": 1.6774734258651733, "learning_rate": 1.2643073493728009e-08, "loss": 0.4128, "step": 1481 }, { "epoch": 14.511627906976745, "grad_norm": 1.9852231740951538, "learning_rate": 1.2132708982338925e-08, "loss": 0.7033, "step": 1482 }, { "epoch": 14.52141982864137, "grad_norm": 1.7852762937545776, "learning_rate": 1.1632833708994096e-08, "loss": 0.589, "step": 1483 }, { "epoch": 14.531211750305998, "grad_norm": 1.8648505210876465, "learning_rate": 1.1143449781245985e-08, "loss": 0.5603, "step": 1484 }, { "epoch": 14.541003671970625, "grad_norm": 2.0609068870544434, "learning_rate": 1.0664559262413831e-08, "loss": 0.4305, "step": 1485 }, { "epoch": 14.55079559363525, "grad_norm": 2.3634092807769775, "learning_rate": 1.0196164171574762e-08, "loss": 0.5454, "step": 1486 }, { "epoch": 14.560587515299877, "grad_norm": 2.0448837280273438, "learning_rate": 9.738266483556303e-09, "loss": 0.5194, "step": 1487 }, { "epoch": 14.570379436964505, "grad_norm": 2.3015999794006348, "learning_rate": 9.290868128926378e-09, "loss": 0.609, "step": 1488 }, { "epoch": 14.580171358629132, "grad_norm": 2.249049663543701, "learning_rate": 8.85397099398666e-09, "loss": 0.7069, "step": 1489 }, { "epoch": 14.589963280293757, "grad_norm": 2.2199394702911377, "learning_rate": 8.427576920763957e-09, "loss": 0.7729, "step": 1490 }, { "epoch": 14.599755201958384, "grad_norm": 2.180298328399658, "learning_rate": 8.011687707002447e-09, "loss": 0.5823, "step": 1491 }, { "epoch": 14.609547123623011, "grad_norm": 1.9999346733093262, "learning_rate": 7.606305106155898e-09, "loss": 0.3654, "step": 1492 }, { "epoch": 14.619339045287639, "grad_norm": 1.6423143148422241, "learning_rate": 7.211430827381294e-09, "loss": 0.4234, "step": 1493 }, { "epoch": 14.629130966952264, "grad_norm": 1.7439075708389282, "learning_rate": 6.827066535529947e-09, "loss": 0.357, "step": 1494 }, { "epoch": 14.638922888616891, "grad_norm": 1.9694961309432983, "learning_rate": 6.453213851142226e-09, "loss": 0.6283, "step": 1495 }, { "epoch": 14.648714810281518, "grad_norm": 1.8617464303970337, "learning_rate": 6.089874350439507e-09, "loss": 0.5938, "step": 1496 }, { "epoch": 14.658506731946144, "grad_norm": 2.1271812915802, "learning_rate": 5.737049565317787e-09, "loss": 0.5709, "step": 1497 }, { "epoch": 14.668298653610771, "grad_norm": 1.8519381284713745, "learning_rate": 5.394740983341862e-09, "loss": 0.536, "step": 1498 }, { "epoch": 14.678090575275398, "grad_norm": 1.8103715181350708, "learning_rate": 5.06295004773838e-09, "loss": 0.5833, "step": 1499 }, { "epoch": 14.687882496940025, "grad_norm": 1.8526721000671387, "learning_rate": 4.74167815738974e-09, "loss": 0.4403, "step": 1500 }, { "epoch": 14.69767441860465, "grad_norm": 2.00471568107605, "learning_rate": 4.430926666828539e-09, "loss": 0.3999, "step": 1501 }, { "epoch": 14.707466340269278, "grad_norm": 1.8998697996139526, "learning_rate": 4.130696886231744e-09, "loss": 0.5023, "step": 1502 }, { "epoch": 14.717258261933905, "grad_norm": 1.7947837114334106, "learning_rate": 3.840990081415141e-09, "loss": 0.4021, "step": 1503 }, { "epoch": 14.727050183598532, "grad_norm": 1.7356659173965454, "learning_rate": 3.561807473827783e-09, "loss": 0.3916, "step": 1504 }, { "epoch": 14.736842105263158, "grad_norm": 1.7205500602722168, "learning_rate": 3.293150240547549e-09, "loss": 0.4758, "step": 1505 }, { "epoch": 14.746634026927785, "grad_norm": 1.6489745378494263, "learning_rate": 3.035019514275317e-09, "loss": 0.3939, "step": 1506 }, { "epoch": 14.756425948592412, "grad_norm": 2.1436476707458496, "learning_rate": 2.7874163833302435e-09, "loss": 0.5457, "step": 1507 }, { "epoch": 14.766217870257037, "grad_norm": 1.8804194927215576, "learning_rate": 2.5503418916464352e-09, "loss": 0.4147, "step": 1508 }, { "epoch": 14.776009791921664, "grad_norm": 1.7551625967025757, "learning_rate": 2.3237970387671173e-09, "loss": 0.5315, "step": 1509 }, { "epoch": 14.785801713586292, "grad_norm": 2.0606727600097656, "learning_rate": 2.1077827798404728e-09, "loss": 0.7363, "step": 1510 }, { "epoch": 14.795593635250919, "grad_norm": 2.0211243629455566, "learning_rate": 1.9023000256163103e-09, "loss": 0.4193, "step": 1511 }, { "epoch": 14.805385556915544, "grad_norm": 1.7291994094848633, "learning_rate": 1.707349642442735e-09, "loss": 0.3526, "step": 1512 }, { "epoch": 14.815177478580171, "grad_norm": 1.830012321472168, "learning_rate": 1.5229324522605949e-09, "loss": 0.5169, "step": 1513 }, { "epoch": 14.824969400244798, "grad_norm": 1.8500601053237915, "learning_rate": 1.349049232601818e-09, "loss": 0.4349, "step": 1514 }, { "epoch": 14.834761321909426, "grad_norm": 1.6950221061706543, "learning_rate": 1.1857007165852475e-09, "loss": 0.5129, "step": 1515 }, { "epoch": 14.844553243574051, "grad_norm": 2.0525310039520264, "learning_rate": 1.0328875929138671e-09, "loss": 0.5669, "step": 1516 }, { "epoch": 14.854345165238678, "grad_norm": 2.2973618507385254, "learning_rate": 8.906105058714698e-10, "loss": 0.5553, "step": 1517 }, { "epoch": 14.864137086903305, "grad_norm": 1.7915773391723633, "learning_rate": 7.588700553209926e-10, "loss": 0.3236, "step": 1518 }, { "epoch": 14.87392900856793, "grad_norm": 2.0079126358032227, "learning_rate": 6.376667967003536e-10, "loss": 0.5682, "step": 1519 }, { "epoch": 14.883720930232558, "grad_norm": 1.92130708694458, "learning_rate": 5.270012410216185e-10, "loss": 0.4189, "step": 1520 }, { "epoch": 14.893512851897185, "grad_norm": 2.1490514278411865, "learning_rate": 4.268738548682261e-10, "loss": 0.536, "step": 1521 }, { "epoch": 14.903304773561812, "grad_norm": 1.790059208869934, "learning_rate": 3.3728506039276686e-10, "loss": 0.4361, "step": 1522 }, { "epoch": 14.913096695226438, "grad_norm": 1.9256318807601929, "learning_rate": 2.5823523531587344e-10, "loss": 0.4007, "step": 1523 }, { "epoch": 14.922888616891065, "grad_norm": 2.059610605239868, "learning_rate": 1.8972471292344474e-10, "loss": 0.476, "step": 1524 }, { "epoch": 14.932680538555692, "grad_norm": 2.0634407997131348, "learning_rate": 1.3175378206720102e-10, "loss": 0.5522, "step": 1525 }, { "epoch": 14.94247246022032, "grad_norm": 2.2826569080352783, "learning_rate": 8.432268716135338e-11, "loss": 0.5548, "step": 1526 }, { "epoch": 14.952264381884945, "grad_norm": 1.8803513050079346, "learning_rate": 4.743162818288127e-11, "loss": 0.3558, "step": 1527 }, { "epoch": 14.962056303549572, "grad_norm": 1.9702091217041016, "learning_rate": 2.108076067014464e-11, "loss": 0.4017, "step": 1528 }, { "epoch": 14.971848225214199, "grad_norm": 2.4966163635253906, "learning_rate": 5.2701957226064526e-12, "loss": 0.5141, "step": 1529 }, { "epoch": 14.981640146878824, "grad_norm": 1.9435852766036987, "learning_rate": 0.0, "loss": 0.5115, "step": 1530 }, { "epoch": 14.981640146878824, "step": 1530, "total_flos": 1.5105633831878656e+17, "train_loss": 0.6510952122460782, "train_runtime": 5624.8673, "train_samples_per_second": 2.179, "train_steps_per_second": 0.272 } ], "logging_steps": 1, "max_steps": 1530, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5105633831878656e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }