{ "best_metric": 1.2795084714889526, "best_model_checkpoint": "saved_model/c2s_jun2024/checkpoint-9692", "epoch": 2.0, "eval_steps": 500, "global_step": 9692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": NaN, "learning_rate": 0.0, "loss": 72.6113, "step": 1 }, { "epoch": 0.0, "grad_norm": 18.23219108581543, "learning_rate": 2.5e-06, "loss": 74.5495, "step": 10 }, { "epoch": 0.0, "grad_norm": 16.582162857055664, "learning_rate": 7.5e-06, "loss": 73.7367, "step": 20 }, { "epoch": 0.01, "grad_norm": 14.804972648620605, "learning_rate": 1.2e-05, "loss": 72.8853, "step": 30 }, { "epoch": 0.01, "grad_norm": 13.634269714355469, "learning_rate": 1.7000000000000003e-05, "loss": 70.9592, "step": 40 }, { "epoch": 0.01, "grad_norm": 13.762855529785156, "learning_rate": 2.2000000000000003e-05, "loss": 66.9603, "step": 50 }, { "epoch": 0.01, "grad_norm": 16.27646827697754, "learning_rate": 2.7000000000000002e-05, "loss": 61.4318, "step": 60 }, { "epoch": 0.01, "grad_norm": 27.16312026977539, "learning_rate": 3.15e-05, "loss": 53.3651, "step": 70 }, { "epoch": 0.02, "grad_norm": 28.43309783935547, "learning_rate": 3.65e-05, "loss": 33.9745, "step": 80 }, { "epoch": 0.02, "grad_norm": 12.292057991027832, "learning_rate": 4.15e-05, "loss": 13.4627, "step": 90 }, { "epoch": 0.02, "grad_norm": 9.148832321166992, "learning_rate": 4.6500000000000005e-05, "loss": 6.8387, "step": 100 }, { "epoch": 0.02, "grad_norm": 6.579999923706055, "learning_rate": 5.1500000000000005e-05, "loss": 4.7847, "step": 110 }, { "epoch": 0.02, "grad_norm": 6.650771141052246, "learning_rate": 5.65e-05, "loss": 4.1684, "step": 120 }, { "epoch": 0.03, "grad_norm": 6.5379791259765625, "learning_rate": 6.15e-05, "loss": 3.8221, "step": 130 }, { "epoch": 0.03, "grad_norm": 6.095062732696533, "learning_rate": 6.65e-05, "loss": 3.5635, "step": 140 }, { "epoch": 0.03, "grad_norm": 7.0580973625183105, "learning_rate": 7.15e-05, "loss": 3.4446, "step": 150 }, { "epoch": 0.03, "grad_norm": 6.517209053039551, "learning_rate": 7.65e-05, "loss": 3.2972, "step": 160 }, { "epoch": 0.04, "grad_norm": 5.954787731170654, "learning_rate": 8.15e-05, "loss": 3.2621, "step": 170 }, { "epoch": 0.04, "grad_norm": 6.085761547088623, "learning_rate": 8.65e-05, "loss": 3.2072, "step": 180 }, { "epoch": 0.04, "grad_norm": 6.4346442222595215, "learning_rate": 9.15e-05, "loss": 3.0868, "step": 190 }, { "epoch": 0.04, "grad_norm": 6.535578727722168, "learning_rate": 9.65e-05, "loss": 3.0201, "step": 200 }, { "epoch": 0.04, "grad_norm": 5.239222526550293, "learning_rate": 9.999378367177788e-05, "loss": 2.9792, "step": 210 }, { "epoch": 0.05, "grad_norm": 5.576033592224121, "learning_rate": 9.997306257770411e-05, "loss": 3.0079, "step": 220 }, { "epoch": 0.05, "grad_norm": 5.455887794494629, "learning_rate": 9.995234148363033e-05, "loss": 2.8296, "step": 230 }, { "epoch": 0.05, "grad_norm": 4.566660404205322, "learning_rate": 9.993162038955657e-05, "loss": 2.7655, "step": 240 }, { "epoch": 0.05, "grad_norm": 4.954742908477783, "learning_rate": 9.99108992954828e-05, "loss": 2.5655, "step": 250 }, { "epoch": 0.05, "grad_norm": 3.5510752201080322, "learning_rate": 9.989017820140904e-05, "loss": 2.4527, "step": 260 }, { "epoch": 0.06, "grad_norm": 3.358351230621338, "learning_rate": 9.986945710733528e-05, "loss": 2.2679, "step": 270 }, { "epoch": 0.06, "grad_norm": 2.9349524974823, "learning_rate": 9.98487360132615e-05, "loss": 2.1456, "step": 280 }, { "epoch": 0.06, "grad_norm": 3.3249402046203613, "learning_rate": 9.982801491918775e-05, "loss": 2.0943, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.92372989654541, "learning_rate": 9.980729382511397e-05, "loss": 2.0194, "step": 300 }, { "epoch": 0.06, "grad_norm": 3.124359130859375, "learning_rate": 9.97865727310402e-05, "loss": 1.9523, "step": 310 }, { "epoch": 0.07, "grad_norm": 2.372561454772949, "learning_rate": 9.976585163696644e-05, "loss": 1.905, "step": 320 }, { "epoch": 0.07, "grad_norm": 2.5799174308776855, "learning_rate": 9.974513054289267e-05, "loss": 1.9159, "step": 330 }, { "epoch": 0.07, "grad_norm": 2.1826956272125244, "learning_rate": 9.97244094488189e-05, "loss": 1.8362, "step": 340 }, { "epoch": 0.07, "grad_norm": 2.1002371311187744, "learning_rate": 9.970368835474514e-05, "loss": 1.844, "step": 350 }, { "epoch": 0.07, "grad_norm": 3.1345527172088623, "learning_rate": 9.968296726067136e-05, "loss": 1.8084, "step": 360 }, { "epoch": 0.08, "grad_norm": 1.9457321166992188, "learning_rate": 9.96622461665976e-05, "loss": 1.7775, "step": 370 }, { "epoch": 0.08, "grad_norm": 1.9511795043945312, "learning_rate": 9.964152507252383e-05, "loss": 1.7872, "step": 380 }, { "epoch": 0.08, "grad_norm": 1.9775121212005615, "learning_rate": 9.962080397845007e-05, "loss": 1.7665, "step": 390 }, { "epoch": 0.08, "grad_norm": 1.9561394453048706, "learning_rate": 9.96000828843763e-05, "loss": 1.7664, "step": 400 }, { "epoch": 0.08, "grad_norm": 2.7436013221740723, "learning_rate": 9.957936179030253e-05, "loss": 1.7016, "step": 410 }, { "epoch": 0.09, "grad_norm": 1.6739649772644043, "learning_rate": 9.955864069622876e-05, "loss": 1.7219, "step": 420 }, { "epoch": 0.09, "grad_norm": 1.940246343612671, "learning_rate": 9.9537919602155e-05, "loss": 1.7174, "step": 430 }, { "epoch": 0.09, "grad_norm": 1.8286395072937012, "learning_rate": 9.951719850808123e-05, "loss": 1.6698, "step": 440 }, { "epoch": 0.09, "grad_norm": 2.0042293071746826, "learning_rate": 9.949647741400747e-05, "loss": 1.6908, "step": 450 }, { "epoch": 0.09, "grad_norm": 1.6445887088775635, "learning_rate": 9.94757563199337e-05, "loss": 1.6796, "step": 460 }, { "epoch": 0.1, "grad_norm": 2.068713903427124, "learning_rate": 9.945503522585992e-05, "loss": 1.6685, "step": 470 }, { "epoch": 0.1, "grad_norm": 1.8053257465362549, "learning_rate": 9.943431413178617e-05, "loss": 1.6522, "step": 480 }, { "epoch": 0.1, "grad_norm": 1.580461859703064, "learning_rate": 9.94135930377124e-05, "loss": 1.6425, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.607007384300232, "learning_rate": 9.939287194363863e-05, "loss": 1.632, "step": 500 }, { "epoch": 0.11, "grad_norm": 1.649885654449463, "learning_rate": 9.937215084956486e-05, "loss": 1.5966, "step": 510 }, { "epoch": 0.11, "grad_norm": 1.7667235136032104, "learning_rate": 9.93514297554911e-05, "loss": 1.5942, "step": 520 }, { "epoch": 0.11, "grad_norm": 1.595691442489624, "learning_rate": 9.933070866141732e-05, "loss": 1.609, "step": 530 }, { "epoch": 0.11, "grad_norm": 1.5232254266738892, "learning_rate": 9.930998756734357e-05, "loss": 1.5614, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.4872910976409912, "learning_rate": 9.928926647326979e-05, "loss": 1.5657, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.609491229057312, "learning_rate": 9.926854537919603e-05, "loss": 1.5935, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.6403166055679321, "learning_rate": 9.924782428512226e-05, "loss": 1.6159, "step": 570 }, { "epoch": 0.12, "grad_norm": 1.6648396253585815, "learning_rate": 9.922710319104848e-05, "loss": 1.6012, "step": 580 }, { "epoch": 0.12, "grad_norm": 1.6322458982467651, "learning_rate": 9.920638209697473e-05, "loss": 1.5541, "step": 590 }, { "epoch": 0.12, "grad_norm": 1.5503164529800415, "learning_rate": 9.918566100290095e-05, "loss": 1.5733, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.6093209981918335, "learning_rate": 9.916493990882719e-05, "loss": 1.5144, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.6871626377105713, "learning_rate": 9.914421881475342e-05, "loss": 1.573, "step": 620 }, { "epoch": 0.13, "grad_norm": 1.7600977420806885, "learning_rate": 9.912349772067966e-05, "loss": 1.5577, "step": 630 }, { "epoch": 0.13, "grad_norm": 1.4892425537109375, "learning_rate": 9.910277662660588e-05, "loss": 1.5751, "step": 640 }, { "epoch": 0.13, "grad_norm": 1.5667476654052734, "learning_rate": 9.908205553253213e-05, "loss": 1.5298, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.3411659002304077, "learning_rate": 9.906133443845835e-05, "loss": 1.5409, "step": 660 }, { "epoch": 0.14, "grad_norm": 1.5329233407974243, "learning_rate": 9.904061334438459e-05, "loss": 1.5165, "step": 670 }, { "epoch": 0.14, "grad_norm": 1.5168925523757935, "learning_rate": 9.901989225031082e-05, "loss": 1.5222, "step": 680 }, { "epoch": 0.14, "grad_norm": 1.6860578060150146, "learning_rate": 9.899917115623706e-05, "loss": 1.5179, "step": 690 }, { "epoch": 0.14, "grad_norm": 1.4629698991775513, "learning_rate": 9.897845006216329e-05, "loss": 1.5593, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.3701924085617065, "learning_rate": 9.895772896808953e-05, "loss": 1.52, "step": 710 }, { "epoch": 0.15, "grad_norm": 1.4276106357574463, "learning_rate": 9.893700787401575e-05, "loss": 1.5546, "step": 720 }, { "epoch": 0.15, "grad_norm": 1.5609627962112427, "learning_rate": 9.8916286779942e-05, "loss": 1.5071, "step": 730 }, { "epoch": 0.15, "grad_norm": 1.9602493047714233, "learning_rate": 9.889556568586822e-05, "loss": 1.5192, "step": 740 }, { "epoch": 0.15, "grad_norm": 1.4681726694107056, "learning_rate": 9.887484459179444e-05, "loss": 1.5065, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.547143816947937, "learning_rate": 9.885412349772069e-05, "loss": 1.5303, "step": 760 }, { "epoch": 0.16, "grad_norm": 1.7585084438323975, "learning_rate": 9.883340240364691e-05, "loss": 1.5412, "step": 770 }, { "epoch": 0.16, "grad_norm": 1.4589301347732544, "learning_rate": 9.881268130957315e-05, "loss": 1.5008, "step": 780 }, { "epoch": 0.16, "grad_norm": 1.5748664140701294, "learning_rate": 9.879196021549938e-05, "loss": 1.4856, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.4392333030700684, "learning_rate": 9.877123912142562e-05, "loss": 1.4593, "step": 800 }, { "epoch": 0.17, "grad_norm": 1.439276933670044, "learning_rate": 9.875051802735185e-05, "loss": 1.4565, "step": 810 }, { "epoch": 0.17, "grad_norm": 1.5028575658798218, "learning_rate": 9.872979693327809e-05, "loss": 1.5106, "step": 820 }, { "epoch": 0.17, "grad_norm": 1.5902388095855713, "learning_rate": 9.870907583920431e-05, "loss": 1.459, "step": 830 }, { "epoch": 0.17, "grad_norm": 1.5270620584487915, "learning_rate": 9.868835474513056e-05, "loss": 1.4705, "step": 840 }, { "epoch": 0.18, "grad_norm": 1.354683518409729, "learning_rate": 9.866763365105678e-05, "loss": 1.4468, "step": 850 }, { "epoch": 0.18, "grad_norm": 1.3697203397750854, "learning_rate": 9.864691255698301e-05, "loss": 1.4669, "step": 860 }, { "epoch": 0.18, "grad_norm": 1.5006585121154785, "learning_rate": 9.862619146290925e-05, "loss": 1.4641, "step": 870 }, { "epoch": 0.18, "grad_norm": 1.3566001653671265, "learning_rate": 9.860547036883548e-05, "loss": 1.4545, "step": 880 }, { "epoch": 0.18, "grad_norm": 1.3500274419784546, "learning_rate": 9.85847492747617e-05, "loss": 1.477, "step": 890 }, { "epoch": 0.19, "grad_norm": 1.3306142091751099, "learning_rate": 9.856402818068794e-05, "loss": 1.4469, "step": 900 }, { "epoch": 0.19, "grad_norm": 1.2983628511428833, "learning_rate": 9.854330708661418e-05, "loss": 1.4603, "step": 910 }, { "epoch": 0.19, "grad_norm": 1.3828344345092773, "learning_rate": 9.852258599254041e-05, "loss": 1.4686, "step": 920 }, { "epoch": 0.19, "grad_norm": 1.427741527557373, "learning_rate": 9.850186489846665e-05, "loss": 1.4756, "step": 930 }, { "epoch": 0.19, "grad_norm": 1.267404556274414, "learning_rate": 9.848114380439287e-05, "loss": 1.4777, "step": 940 }, { "epoch": 0.2, "grad_norm": 1.3213374614715576, "learning_rate": 9.846042271031912e-05, "loss": 1.4526, "step": 950 }, { "epoch": 0.2, "grad_norm": 1.6813840866088867, "learning_rate": 9.843970161624534e-05, "loss": 1.49, "step": 960 }, { "epoch": 0.2, "grad_norm": 1.2110322713851929, "learning_rate": 9.841898052217157e-05, "loss": 1.4796, "step": 970 }, { "epoch": 0.2, "grad_norm": 1.3316526412963867, "learning_rate": 9.839825942809781e-05, "loss": 1.4523, "step": 980 }, { "epoch": 0.2, "grad_norm": 1.313766598701477, "learning_rate": 9.837753833402404e-05, "loss": 1.4195, "step": 990 }, { "epoch": 0.21, "grad_norm": 1.4528905153274536, "learning_rate": 9.835681723995028e-05, "loss": 1.4433, "step": 1000 }, { "epoch": 0.21, "grad_norm": 1.3782751560211182, "learning_rate": 9.833609614587651e-05, "loss": 1.4673, "step": 1010 }, { "epoch": 0.21, "grad_norm": 1.5674275159835815, "learning_rate": 9.831537505180273e-05, "loss": 1.4296, "step": 1020 }, { "epoch": 0.21, "grad_norm": 1.3901402950286865, "learning_rate": 9.829465395772898e-05, "loss": 1.4516, "step": 1030 }, { "epoch": 0.21, "grad_norm": 1.1594748497009277, "learning_rate": 9.82739328636552e-05, "loss": 1.4225, "step": 1040 }, { "epoch": 0.22, "grad_norm": 1.2297048568725586, "learning_rate": 9.825321176958144e-05, "loss": 1.4416, "step": 1050 }, { "epoch": 0.22, "grad_norm": 1.1866023540496826, "learning_rate": 9.823249067550768e-05, "loss": 1.444, "step": 1060 }, { "epoch": 0.22, "grad_norm": 1.407461404800415, "learning_rate": 9.82117695814339e-05, "loss": 1.4415, "step": 1070 }, { "epoch": 0.22, "grad_norm": 1.302164912223816, "learning_rate": 9.819104848736013e-05, "loss": 1.4405, "step": 1080 }, { "epoch": 0.22, "grad_norm": 1.3704490661621094, "learning_rate": 9.817032739328637e-05, "loss": 1.4408, "step": 1090 }, { "epoch": 0.23, "grad_norm": 1.2673710584640503, "learning_rate": 9.81496062992126e-05, "loss": 1.4221, "step": 1100 }, { "epoch": 0.23, "grad_norm": 1.3337206840515137, "learning_rate": 9.812888520513884e-05, "loss": 1.4193, "step": 1110 }, { "epoch": 0.23, "grad_norm": 1.3280502557754517, "learning_rate": 9.810816411106507e-05, "loss": 1.4736, "step": 1120 }, { "epoch": 0.23, "grad_norm": 1.2532864809036255, "learning_rate": 9.80874430169913e-05, "loss": 1.4665, "step": 1130 }, { "epoch": 0.24, "grad_norm": 1.2475242614746094, "learning_rate": 9.806672192291754e-05, "loss": 1.426, "step": 1140 }, { "epoch": 0.24, "grad_norm": 1.7034567594528198, "learning_rate": 9.804600082884376e-05, "loss": 1.4473, "step": 1150 }, { "epoch": 0.24, "grad_norm": 1.3586080074310303, "learning_rate": 9.802527973477e-05, "loss": 1.3959, "step": 1160 }, { "epoch": 0.24, "grad_norm": 1.2611415386199951, "learning_rate": 9.800455864069623e-05, "loss": 1.4401, "step": 1170 }, { "epoch": 0.24, "grad_norm": 1.3101681470870972, "learning_rate": 9.798383754662247e-05, "loss": 1.4431, "step": 1180 }, { "epoch": 0.25, "grad_norm": 1.1770988702774048, "learning_rate": 9.796311645254869e-05, "loss": 1.4108, "step": 1190 }, { "epoch": 0.25, "grad_norm": 1.2325702905654907, "learning_rate": 9.794239535847494e-05, "loss": 1.4141, "step": 1200 }, { "epoch": 0.25, "grad_norm": 1.2543164491653442, "learning_rate": 9.792167426440116e-05, "loss": 1.4133, "step": 1210 }, { "epoch": 0.25, "grad_norm": 1.1258199214935303, "learning_rate": 9.79009531703274e-05, "loss": 1.4041, "step": 1220 }, { "epoch": 0.25, "grad_norm": 1.3423455953598022, "learning_rate": 9.788023207625363e-05, "loss": 1.4144, "step": 1230 }, { "epoch": 0.26, "grad_norm": 1.248947024345398, "learning_rate": 9.785951098217985e-05, "loss": 1.4043, "step": 1240 }, { "epoch": 0.26, "grad_norm": 1.129650354385376, "learning_rate": 9.78387898881061e-05, "loss": 1.4216, "step": 1250 }, { "epoch": 0.26, "grad_norm": 1.2218910455703735, "learning_rate": 9.781806879403232e-05, "loss": 1.3976, "step": 1260 }, { "epoch": 0.26, "grad_norm": 1.153981328010559, "learning_rate": 9.779734769995856e-05, "loss": 1.4304, "step": 1270 }, { "epoch": 0.26, "grad_norm": 1.1724766492843628, "learning_rate": 9.77766266058848e-05, "loss": 1.43, "step": 1280 }, { "epoch": 0.27, "grad_norm": 1.2830730676651, "learning_rate": 9.775590551181103e-05, "loss": 1.4429, "step": 1290 }, { "epoch": 0.27, "grad_norm": 1.2320913076400757, "learning_rate": 9.773518441773725e-05, "loss": 1.3898, "step": 1300 }, { "epoch": 0.27, "grad_norm": 1.2313491106033325, "learning_rate": 9.77144633236635e-05, "loss": 1.4273, "step": 1310 }, { "epoch": 0.27, "grad_norm": 1.1946086883544922, "learning_rate": 9.769374222958972e-05, "loss": 1.4234, "step": 1320 }, { "epoch": 0.27, "grad_norm": 1.127300500869751, "learning_rate": 9.767302113551596e-05, "loss": 1.4144, "step": 1330 }, { "epoch": 0.28, "grad_norm": 1.4888228178024292, "learning_rate": 9.765230004144219e-05, "loss": 1.4092, "step": 1340 }, { "epoch": 0.28, "grad_norm": 1.3795928955078125, "learning_rate": 9.763157894736843e-05, "loss": 1.3647, "step": 1350 }, { "epoch": 0.28, "grad_norm": 1.1433610916137695, "learning_rate": 9.761085785329466e-05, "loss": 1.415, "step": 1360 }, { "epoch": 0.28, "grad_norm": 1.040281891822815, "learning_rate": 9.75901367592209e-05, "loss": 1.4244, "step": 1370 }, { "epoch": 0.28, "grad_norm": 1.1311726570129395, "learning_rate": 9.756941566514712e-05, "loss": 1.3852, "step": 1380 }, { "epoch": 0.29, "grad_norm": 1.2847346067428589, "learning_rate": 9.754869457107337e-05, "loss": 1.4225, "step": 1390 }, { "epoch": 0.29, "grad_norm": 1.2235894203186035, "learning_rate": 9.752797347699959e-05, "loss": 1.3973, "step": 1400 }, { "epoch": 0.29, "grad_norm": 1.1802481412887573, "learning_rate": 9.750725238292582e-05, "loss": 1.3923, "step": 1410 }, { "epoch": 0.29, "grad_norm": 1.141739010810852, "learning_rate": 9.748653128885206e-05, "loss": 1.4049, "step": 1420 }, { "epoch": 0.3, "grad_norm": 1.2155243158340454, "learning_rate": 9.746581019477828e-05, "loss": 1.3866, "step": 1430 }, { "epoch": 0.3, "grad_norm": 1.4717819690704346, "learning_rate": 9.744508910070453e-05, "loss": 1.4264, "step": 1440 }, { "epoch": 0.3, "grad_norm": 1.1440094709396362, "learning_rate": 9.742436800663075e-05, "loss": 1.4291, "step": 1450 }, { "epoch": 0.3, "grad_norm": 1.3254936933517456, "learning_rate": 9.740364691255699e-05, "loss": 1.3973, "step": 1460 }, { "epoch": 0.3, "grad_norm": 1.2041431665420532, "learning_rate": 9.738292581848322e-05, "loss": 1.3779, "step": 1470 }, { "epoch": 0.31, "grad_norm": 1.1422394514083862, "learning_rate": 9.736220472440946e-05, "loss": 1.3918, "step": 1480 }, { "epoch": 0.31, "grad_norm": 1.2341557741165161, "learning_rate": 9.734148363033568e-05, "loss": 1.4065, "step": 1490 }, { "epoch": 0.31, "grad_norm": 1.1723967790603638, "learning_rate": 9.732076253626193e-05, "loss": 1.4003, "step": 1500 }, { "epoch": 0.31, "grad_norm": 1.279010534286499, "learning_rate": 9.730004144218815e-05, "loss": 1.3762, "step": 1510 }, { "epoch": 0.31, "grad_norm": 1.2639541625976562, "learning_rate": 9.727932034811438e-05, "loss": 1.3932, "step": 1520 }, { "epoch": 0.32, "grad_norm": 1.1406500339508057, "learning_rate": 9.725859925404062e-05, "loss": 1.4318, "step": 1530 }, { "epoch": 0.32, "grad_norm": 1.1991297006607056, "learning_rate": 9.723787815996685e-05, "loss": 1.3742, "step": 1540 }, { "epoch": 0.32, "grad_norm": 1.1058017015457153, "learning_rate": 9.721715706589309e-05, "loss": 1.3975, "step": 1550 }, { "epoch": 0.32, "grad_norm": 1.3658838272094727, "learning_rate": 9.719643597181932e-05, "loss": 1.4245, "step": 1560 }, { "epoch": 0.32, "grad_norm": 1.0663561820983887, "learning_rate": 9.717571487774555e-05, "loss": 1.3779, "step": 1570 }, { "epoch": 0.33, "grad_norm": 1.1523654460906982, "learning_rate": 9.715499378367178e-05, "loss": 1.4306, "step": 1580 }, { "epoch": 0.33, "grad_norm": 1.223913311958313, "learning_rate": 9.713427268959802e-05, "loss": 1.3748, "step": 1590 }, { "epoch": 0.33, "grad_norm": 1.0876872539520264, "learning_rate": 9.711355159552424e-05, "loss": 1.3806, "step": 1600 }, { "epoch": 0.33, "grad_norm": 1.3317033052444458, "learning_rate": 9.709283050145049e-05, "loss": 1.3586, "step": 1610 }, { "epoch": 0.33, "grad_norm": 1.2402222156524658, "learning_rate": 9.707210940737671e-05, "loss": 1.3886, "step": 1620 }, { "epoch": 0.34, "grad_norm": 1.1467841863632202, "learning_rate": 9.705138831330294e-05, "loss": 1.3634, "step": 1630 }, { "epoch": 0.34, "grad_norm": 1.1589218378067017, "learning_rate": 9.703066721922918e-05, "loss": 1.3466, "step": 1640 }, { "epoch": 0.34, "grad_norm": 0.9369345307350159, "learning_rate": 9.700994612515541e-05, "loss": 1.3819, "step": 1650 }, { "epoch": 0.34, "grad_norm": 1.0450528860092163, "learning_rate": 9.698922503108165e-05, "loss": 1.3482, "step": 1660 }, { "epoch": 0.34, "grad_norm": 1.0236886739730835, "learning_rate": 9.696850393700788e-05, "loss": 1.3468, "step": 1670 }, { "epoch": 0.35, "grad_norm": 1.0324066877365112, "learning_rate": 9.69477828429341e-05, "loss": 1.3926, "step": 1680 }, { "epoch": 0.35, "grad_norm": 1.1705087423324585, "learning_rate": 9.692706174886035e-05, "loss": 1.3547, "step": 1690 }, { "epoch": 0.35, "grad_norm": 1.1479854583740234, "learning_rate": 9.690634065478658e-05, "loss": 1.3517, "step": 1700 }, { "epoch": 0.35, "grad_norm": 1.1700282096862793, "learning_rate": 9.688561956071281e-05, "loss": 1.3635, "step": 1710 }, { "epoch": 0.35, "grad_norm": 1.079822301864624, "learning_rate": 9.686489846663905e-05, "loss": 1.3878, "step": 1720 }, { "epoch": 0.36, "grad_norm": 1.188466191291809, "learning_rate": 9.684417737256528e-05, "loss": 1.36, "step": 1730 }, { "epoch": 0.36, "grad_norm": 1.1050995588302612, "learning_rate": 9.68234562784915e-05, "loss": 1.3513, "step": 1740 }, { "epoch": 0.36, "grad_norm": 1.2480050325393677, "learning_rate": 9.680273518441774e-05, "loss": 1.362, "step": 1750 }, { "epoch": 0.36, "grad_norm": 1.1782851219177246, "learning_rate": 9.678201409034397e-05, "loss": 1.378, "step": 1760 }, { "epoch": 0.37, "grad_norm": 1.1327308416366577, "learning_rate": 9.676129299627021e-05, "loss": 1.3836, "step": 1770 }, { "epoch": 0.37, "grad_norm": 1.0974417924880981, "learning_rate": 9.674057190219644e-05, "loss": 1.3589, "step": 1780 }, { "epoch": 0.37, "grad_norm": 1.1006550788879395, "learning_rate": 9.671985080812266e-05, "loss": 1.3734, "step": 1790 }, { "epoch": 0.37, "grad_norm": 1.0745372772216797, "learning_rate": 9.669912971404891e-05, "loss": 1.4078, "step": 1800 }, { "epoch": 0.37, "grad_norm": 1.2572031021118164, "learning_rate": 9.667840861997513e-05, "loss": 1.3535, "step": 1810 }, { "epoch": 0.38, "grad_norm": 1.065767526626587, "learning_rate": 9.665768752590137e-05, "loss": 1.3657, "step": 1820 }, { "epoch": 0.38, "grad_norm": 1.2773245573043823, "learning_rate": 9.66369664318276e-05, "loss": 1.3813, "step": 1830 }, { "epoch": 0.38, "grad_norm": 1.0642096996307373, "learning_rate": 9.661624533775384e-05, "loss": 1.3829, "step": 1840 }, { "epoch": 0.38, "grad_norm": 1.1348739862442017, "learning_rate": 9.659552424368008e-05, "loss": 1.3864, "step": 1850 }, { "epoch": 0.38, "grad_norm": 1.136107087135315, "learning_rate": 9.657480314960631e-05, "loss": 1.3523, "step": 1860 }, { "epoch": 0.39, "grad_norm": 1.1533474922180176, "learning_rate": 9.655408205553253e-05, "loss": 1.3669, "step": 1870 }, { "epoch": 0.39, "grad_norm": 1.1027289628982544, "learning_rate": 9.653336096145878e-05, "loss": 1.3256, "step": 1880 }, { "epoch": 0.39, "grad_norm": 0.9988449811935425, "learning_rate": 9.6512639867385e-05, "loss": 1.4024, "step": 1890 }, { "epoch": 0.39, "grad_norm": 1.2975176572799683, "learning_rate": 9.649191877331124e-05, "loss": 1.3751, "step": 1900 }, { "epoch": 0.39, "grad_norm": 1.2186543941497803, "learning_rate": 9.647119767923747e-05, "loss": 1.3444, "step": 1910 }, { "epoch": 0.4, "grad_norm": 1.1342490911483765, "learning_rate": 9.64504765851637e-05, "loss": 1.3449, "step": 1920 }, { "epoch": 0.4, "grad_norm": 1.148695707321167, "learning_rate": 9.642975549108993e-05, "loss": 1.3325, "step": 1930 }, { "epoch": 0.4, "grad_norm": 0.9545331001281738, "learning_rate": 9.640903439701616e-05, "loss": 1.3375, "step": 1940 }, { "epoch": 0.4, "grad_norm": 1.0941437482833862, "learning_rate": 9.63883133029424e-05, "loss": 1.3671, "step": 1950 }, { "epoch": 0.4, "grad_norm": 1.0803030729293823, "learning_rate": 9.636759220886863e-05, "loss": 1.3648, "step": 1960 }, { "epoch": 0.41, "grad_norm": 1.0937373638153076, "learning_rate": 9.634687111479487e-05, "loss": 1.3518, "step": 1970 }, { "epoch": 0.41, "grad_norm": 1.1884483098983765, "learning_rate": 9.632615002072109e-05, "loss": 1.3461, "step": 1980 }, { "epoch": 0.41, "grad_norm": 1.1179327964782715, "learning_rate": 9.630542892664734e-05, "loss": 1.3765, "step": 1990 }, { "epoch": 0.41, "grad_norm": 0.9843894839286804, "learning_rate": 9.628470783257356e-05, "loss": 1.3379, "step": 2000 }, { "epoch": 0.41, "grad_norm": 1.0279515981674194, "learning_rate": 9.62639867384998e-05, "loss": 1.3389, "step": 2010 }, { "epoch": 0.42, "grad_norm": 1.0797231197357178, "learning_rate": 9.624326564442603e-05, "loss": 1.346, "step": 2020 }, { "epoch": 0.42, "grad_norm": 1.1976298093795776, "learning_rate": 9.622254455035227e-05, "loss": 1.3366, "step": 2030 }, { "epoch": 0.42, "grad_norm": 1.057880163192749, "learning_rate": 9.620182345627849e-05, "loss": 1.3264, "step": 2040 }, { "epoch": 0.42, "grad_norm": 1.1059492826461792, "learning_rate": 9.618110236220474e-05, "loss": 1.3446, "step": 2050 }, { "epoch": 0.43, "grad_norm": 1.0970298051834106, "learning_rate": 9.616038126813096e-05, "loss": 1.3521, "step": 2060 }, { "epoch": 0.43, "grad_norm": 1.0951462984085083, "learning_rate": 9.61396601740572e-05, "loss": 1.3669, "step": 2070 }, { "epoch": 0.43, "grad_norm": 1.0926049947738647, "learning_rate": 9.611893907998343e-05, "loss": 1.354, "step": 2080 }, { "epoch": 0.43, "grad_norm": 1.0136979818344116, "learning_rate": 9.609821798590965e-05, "loss": 1.3321, "step": 2090 }, { "epoch": 0.43, "grad_norm": 1.129214882850647, "learning_rate": 9.60774968918359e-05, "loss": 1.382, "step": 2100 }, { "epoch": 0.44, "grad_norm": 1.1166954040527344, "learning_rate": 9.605677579776212e-05, "loss": 1.3337, "step": 2110 }, { "epoch": 0.44, "grad_norm": 1.204231858253479, "learning_rate": 9.603605470368836e-05, "loss": 1.3642, "step": 2120 }, { "epoch": 0.44, "grad_norm": 1.0265048742294312, "learning_rate": 9.601533360961459e-05, "loss": 1.3662, "step": 2130 }, { "epoch": 0.44, "grad_norm": 1.0513389110565186, "learning_rate": 9.599461251554083e-05, "loss": 1.3395, "step": 2140 }, { "epoch": 0.44, "grad_norm": 1.17727792263031, "learning_rate": 9.597389142146705e-05, "loss": 1.3738, "step": 2150 }, { "epoch": 0.45, "grad_norm": 1.0676214694976807, "learning_rate": 9.59531703273933e-05, "loss": 1.3383, "step": 2160 }, { "epoch": 0.45, "grad_norm": 0.9273681640625, "learning_rate": 9.593244923331952e-05, "loss": 1.367, "step": 2170 }, { "epoch": 0.45, "grad_norm": 1.0774747133255005, "learning_rate": 9.591172813924575e-05, "loss": 1.3369, "step": 2180 }, { "epoch": 0.45, "grad_norm": 1.131264090538025, "learning_rate": 9.589100704517199e-05, "loss": 1.3457, "step": 2190 }, { "epoch": 0.45, "grad_norm": 1.106242060661316, "learning_rate": 9.587028595109822e-05, "loss": 1.321, "step": 2200 }, { "epoch": 0.46, "grad_norm": 1.054598331451416, "learning_rate": 9.584956485702446e-05, "loss": 1.3424, "step": 2210 }, { "epoch": 0.46, "grad_norm": 1.0380080938339233, "learning_rate": 9.58288437629507e-05, "loss": 1.3425, "step": 2220 }, { "epoch": 0.46, "grad_norm": 1.1068315505981445, "learning_rate": 9.580812266887692e-05, "loss": 1.321, "step": 2230 }, { "epoch": 0.46, "grad_norm": 1.1228212118148804, "learning_rate": 9.578740157480316e-05, "loss": 1.3301, "step": 2240 }, { "epoch": 0.46, "grad_norm": 0.9643247127532959, "learning_rate": 9.576668048072939e-05, "loss": 1.3403, "step": 2250 }, { "epoch": 0.47, "grad_norm": 0.9587458372116089, "learning_rate": 9.574595938665562e-05, "loss": 1.3402, "step": 2260 }, { "epoch": 0.47, "grad_norm": 1.0192015171051025, "learning_rate": 9.572523829258186e-05, "loss": 1.3595, "step": 2270 }, { "epoch": 0.47, "grad_norm": 1.1033486127853394, "learning_rate": 9.570451719850808e-05, "loss": 1.3515, "step": 2280 }, { "epoch": 0.47, "grad_norm": 1.244828462600708, "learning_rate": 9.568379610443431e-05, "loss": 1.3148, "step": 2290 }, { "epoch": 0.47, "grad_norm": 1.031778335571289, "learning_rate": 9.566307501036055e-05, "loss": 1.3343, "step": 2300 }, { "epoch": 0.48, "grad_norm": 1.0581692457199097, "learning_rate": 9.564235391628678e-05, "loss": 1.3352, "step": 2310 }, { "epoch": 0.48, "grad_norm": 0.9989519119262695, "learning_rate": 9.562163282221302e-05, "loss": 1.3206, "step": 2320 }, { "epoch": 0.48, "grad_norm": 1.1149669885635376, "learning_rate": 9.560091172813925e-05, "loss": 1.3355, "step": 2330 }, { "epoch": 0.48, "grad_norm": 1.1359626054763794, "learning_rate": 9.558019063406548e-05, "loss": 1.3233, "step": 2340 }, { "epoch": 0.48, "grad_norm": 1.1091575622558594, "learning_rate": 9.555946953999172e-05, "loss": 1.3678, "step": 2350 }, { "epoch": 0.49, "grad_norm": 1.0405771732330322, "learning_rate": 9.553874844591795e-05, "loss": 1.3555, "step": 2360 }, { "epoch": 0.49, "grad_norm": 1.068385124206543, "learning_rate": 9.551802735184418e-05, "loss": 1.346, "step": 2370 }, { "epoch": 0.49, "grad_norm": 1.0115128755569458, "learning_rate": 9.549730625777042e-05, "loss": 1.3448, "step": 2380 }, { "epoch": 0.49, "grad_norm": 1.026138424873352, "learning_rate": 9.547658516369665e-05, "loss": 1.3286, "step": 2390 }, { "epoch": 0.5, "grad_norm": 1.375127911567688, "learning_rate": 9.545586406962289e-05, "loss": 1.3931, "step": 2400 }, { "epoch": 0.5, "grad_norm": 1.2297391891479492, "learning_rate": 9.543514297554912e-05, "loss": 1.3223, "step": 2410 }, { "epoch": 0.5, "grad_norm": 1.143249750137329, "learning_rate": 9.541442188147534e-05, "loss": 1.3142, "step": 2420 }, { "epoch": 0.5, "grad_norm": 1.1182348728179932, "learning_rate": 9.539370078740158e-05, "loss": 1.3414, "step": 2430 }, { "epoch": 0.5, "grad_norm": 1.0450687408447266, "learning_rate": 9.537297969332781e-05, "loss": 1.3119, "step": 2440 }, { "epoch": 0.51, "grad_norm": 1.105624794960022, "learning_rate": 9.535225859925403e-05, "loss": 1.3275, "step": 2450 }, { "epoch": 0.51, "grad_norm": 1.1117305755615234, "learning_rate": 9.533153750518028e-05, "loss": 1.3384, "step": 2460 }, { "epoch": 0.51, "grad_norm": 1.122660756111145, "learning_rate": 9.53108164111065e-05, "loss": 1.3509, "step": 2470 }, { "epoch": 0.51, "grad_norm": 0.994361937046051, "learning_rate": 9.529009531703274e-05, "loss": 1.3638, "step": 2480 }, { "epoch": 0.51, "grad_norm": 1.1339287757873535, "learning_rate": 9.526937422295898e-05, "loss": 1.3282, "step": 2490 }, { "epoch": 0.52, "grad_norm": 1.0273313522338867, "learning_rate": 9.524865312888521e-05, "loss": 1.3261, "step": 2500 }, { "epoch": 0.52, "grad_norm": 1.067122220993042, "learning_rate": 9.522793203481145e-05, "loss": 1.3502, "step": 2510 }, { "epoch": 0.52, "grad_norm": 0.9780186414718628, "learning_rate": 9.520721094073768e-05, "loss": 1.3209, "step": 2520 }, { "epoch": 0.52, "grad_norm": 1.0634074211120605, "learning_rate": 9.51864898466639e-05, "loss": 1.3508, "step": 2530 }, { "epoch": 0.52, "grad_norm": 1.0088226795196533, "learning_rate": 9.516576875259015e-05, "loss": 1.2848, "step": 2540 }, { "epoch": 0.53, "grad_norm": 1.156569242477417, "learning_rate": 9.514504765851637e-05, "loss": 1.3336, "step": 2550 }, { "epoch": 0.53, "grad_norm": 0.9981438517570496, "learning_rate": 9.512432656444261e-05, "loss": 1.3237, "step": 2560 }, { "epoch": 0.53, "grad_norm": 1.0465401411056519, "learning_rate": 9.510360547036884e-05, "loss": 1.3347, "step": 2570 }, { "epoch": 0.53, "grad_norm": 3.282174825668335, "learning_rate": 9.508288437629508e-05, "loss": 1.3234, "step": 2580 }, { "epoch": 0.53, "grad_norm": 1.0925480127334595, "learning_rate": 9.50621632822213e-05, "loss": 1.3604, "step": 2590 }, { "epoch": 0.54, "grad_norm": 1.0559757947921753, "learning_rate": 9.504144218814753e-05, "loss": 1.3411, "step": 2600 }, { "epoch": 0.54, "grad_norm": 1.0160987377166748, "learning_rate": 9.502072109407377e-05, "loss": 1.3299, "step": 2610 }, { "epoch": 0.54, "grad_norm": 1.0814076662063599, "learning_rate": 9.5e-05, "loss": 1.3053, "step": 2620 }, { "epoch": 0.54, "grad_norm": 1.1541906595230103, "learning_rate": 9.497927890592624e-05, "loss": 1.3368, "step": 2630 }, { "epoch": 0.54, "grad_norm": 1.0476430654525757, "learning_rate": 9.495855781185246e-05, "loss": 1.3266, "step": 2640 }, { "epoch": 0.55, "grad_norm": 1.0859614610671997, "learning_rate": 9.493783671777871e-05, "loss": 1.3077, "step": 2650 }, { "epoch": 0.55, "grad_norm": 1.047561526298523, "learning_rate": 9.491711562370493e-05, "loss": 1.301, "step": 2660 }, { "epoch": 0.55, "grad_norm": 1.1071749925613403, "learning_rate": 9.489639452963117e-05, "loss": 1.3069, "step": 2670 }, { "epoch": 0.55, "grad_norm": 1.0207133293151855, "learning_rate": 9.48756734355574e-05, "loss": 1.3128, "step": 2680 }, { "epoch": 0.56, "grad_norm": 1.1883114576339722, "learning_rate": 9.485495234148364e-05, "loss": 1.2987, "step": 2690 }, { "epoch": 0.56, "grad_norm": 1.1708128452301025, "learning_rate": 9.483423124740986e-05, "loss": 1.3386, "step": 2700 }, { "epoch": 0.56, "grad_norm": 1.0731940269470215, "learning_rate": 9.481351015333611e-05, "loss": 1.3165, "step": 2710 }, { "epoch": 0.56, "grad_norm": 1.02231764793396, "learning_rate": 9.479278905926233e-05, "loss": 1.3364, "step": 2720 }, { "epoch": 0.56, "grad_norm": 0.9825921654701233, "learning_rate": 9.477206796518856e-05, "loss": 1.3078, "step": 2730 }, { "epoch": 0.57, "grad_norm": 1.1280665397644043, "learning_rate": 9.47513468711148e-05, "loss": 1.3359, "step": 2740 }, { "epoch": 0.57, "grad_norm": 0.9910861253738403, "learning_rate": 9.473062577704103e-05, "loss": 1.3361, "step": 2750 }, { "epoch": 0.57, "grad_norm": 1.0153850317001343, "learning_rate": 9.470990468296727e-05, "loss": 1.3059, "step": 2760 }, { "epoch": 0.57, "grad_norm": 1.01111900806427, "learning_rate": 9.468918358889349e-05, "loss": 1.3226, "step": 2770 }, { "epoch": 0.57, "grad_norm": 1.0714573860168457, "learning_rate": 9.466846249481973e-05, "loss": 1.3195, "step": 2780 }, { "epoch": 0.58, "grad_norm": 1.0012733936309814, "learning_rate": 9.464774140074596e-05, "loss": 1.2948, "step": 2790 }, { "epoch": 0.58, "grad_norm": 0.9637882709503174, "learning_rate": 9.46270203066722e-05, "loss": 1.3208, "step": 2800 }, { "epoch": 0.58, "grad_norm": 1.0453296899795532, "learning_rate": 9.460629921259843e-05, "loss": 1.3095, "step": 2810 }, { "epoch": 0.58, "grad_norm": 1.0107698440551758, "learning_rate": 9.458557811852467e-05, "loss": 1.3164, "step": 2820 }, { "epoch": 0.58, "grad_norm": 1.1132638454437256, "learning_rate": 9.456485702445089e-05, "loss": 1.3162, "step": 2830 }, { "epoch": 0.59, "grad_norm": 1.0389189720153809, "learning_rate": 9.454413593037714e-05, "loss": 1.317, "step": 2840 }, { "epoch": 0.59, "grad_norm": 1.0654906034469604, "learning_rate": 9.452341483630336e-05, "loss": 1.305, "step": 2850 }, { "epoch": 0.59, "grad_norm": 1.2564867734909058, "learning_rate": 9.45026937422296e-05, "loss": 1.3301, "step": 2860 }, { "epoch": 0.59, "grad_norm": 1.0308964252471924, "learning_rate": 9.448197264815583e-05, "loss": 1.334, "step": 2870 }, { "epoch": 0.59, "grad_norm": 1.0542854070663452, "learning_rate": 9.446125155408206e-05, "loss": 1.3001, "step": 2880 }, { "epoch": 0.6, "grad_norm": 1.2161365747451782, "learning_rate": 9.444053046000829e-05, "loss": 1.2985, "step": 2890 }, { "epoch": 0.6, "grad_norm": 1.206581473350525, "learning_rate": 9.441980936593454e-05, "loss": 1.3177, "step": 2900 }, { "epoch": 0.6, "grad_norm": 1.1631922721862793, "learning_rate": 9.439908827186076e-05, "loss": 1.3269, "step": 2910 }, { "epoch": 0.6, "grad_norm": 0.9827607274055481, "learning_rate": 9.437836717778699e-05, "loss": 1.3228, "step": 2920 }, { "epoch": 0.6, "grad_norm": 1.0078628063201904, "learning_rate": 9.435764608371323e-05, "loss": 1.3047, "step": 2930 }, { "epoch": 0.61, "grad_norm": 1.1704260110855103, "learning_rate": 9.433692498963945e-05, "loss": 1.3074, "step": 2940 }, { "epoch": 0.61, "grad_norm": 1.075964093208313, "learning_rate": 9.43162038955657e-05, "loss": 1.3252, "step": 2950 }, { "epoch": 0.61, "grad_norm": 0.9463378190994263, "learning_rate": 9.429548280149192e-05, "loss": 1.3201, "step": 2960 }, { "epoch": 0.61, "grad_norm": 1.01523756980896, "learning_rate": 9.427476170741815e-05, "loss": 1.3127, "step": 2970 }, { "epoch": 0.61, "grad_norm": 0.9392449259757996, "learning_rate": 9.425404061334439e-05, "loss": 1.3254, "step": 2980 }, { "epoch": 0.62, "grad_norm": 1.01919424533844, "learning_rate": 9.423331951927062e-05, "loss": 1.3021, "step": 2990 }, { "epoch": 0.62, "grad_norm": 1.1243764162063599, "learning_rate": 9.421259842519685e-05, "loss": 1.3112, "step": 3000 }, { "epoch": 0.62, "grad_norm": 1.0084974765777588, "learning_rate": 9.41918773311231e-05, "loss": 1.3173, "step": 3010 }, { "epoch": 0.62, "grad_norm": 0.9945486783981323, "learning_rate": 9.417115623704932e-05, "loss": 1.3114, "step": 3020 }, { "epoch": 0.63, "grad_norm": 1.1148301362991333, "learning_rate": 9.415043514297555e-05, "loss": 1.3275, "step": 3030 }, { "epoch": 0.63, "grad_norm": 1.2701823711395264, "learning_rate": 9.412971404890179e-05, "loss": 1.3094, "step": 3040 }, { "epoch": 0.63, "grad_norm": 1.1923747062683105, "learning_rate": 9.410899295482802e-05, "loss": 1.2812, "step": 3050 }, { "epoch": 0.63, "grad_norm": 1.2106274366378784, "learning_rate": 9.408827186075426e-05, "loss": 1.3011, "step": 3060 }, { "epoch": 0.63, "grad_norm": 1.0127681493759155, "learning_rate": 9.406755076668049e-05, "loss": 1.3059, "step": 3070 }, { "epoch": 0.64, "grad_norm": 1.042222499847412, "learning_rate": 9.404682967260671e-05, "loss": 1.2961, "step": 3080 }, { "epoch": 0.64, "grad_norm": 0.9650092124938965, "learning_rate": 9.402610857853296e-05, "loss": 1.3264, "step": 3090 }, { "epoch": 0.64, "grad_norm": 1.0504155158996582, "learning_rate": 9.400538748445918e-05, "loss": 1.2853, "step": 3100 }, { "epoch": 0.64, "grad_norm": 1.0501419305801392, "learning_rate": 9.39846663903854e-05, "loss": 1.3179, "step": 3110 }, { "epoch": 0.64, "grad_norm": 1.056299090385437, "learning_rate": 9.396394529631165e-05, "loss": 1.2962, "step": 3120 }, { "epoch": 0.65, "grad_norm": 1.0278836488723755, "learning_rate": 9.394322420223788e-05, "loss": 1.2828, "step": 3130 }, { "epoch": 0.65, "grad_norm": 0.9813990592956543, "learning_rate": 9.392250310816411e-05, "loss": 1.2986, "step": 3140 }, { "epoch": 0.65, "grad_norm": 1.0665332078933716, "learning_rate": 9.390178201409035e-05, "loss": 1.2891, "step": 3150 }, { "epoch": 0.65, "grad_norm": 1.0281347036361694, "learning_rate": 9.388106092001658e-05, "loss": 1.299, "step": 3160 }, { "epoch": 0.65, "grad_norm": 1.0530226230621338, "learning_rate": 9.386033982594282e-05, "loss": 1.2887, "step": 3170 }, { "epoch": 0.66, "grad_norm": 1.0053261518478394, "learning_rate": 9.383961873186905e-05, "loss": 1.327, "step": 3180 }, { "epoch": 0.66, "grad_norm": 1.1362097263336182, "learning_rate": 9.381889763779527e-05, "loss": 1.3001, "step": 3190 }, { "epoch": 0.66, "grad_norm": 1.0610814094543457, "learning_rate": 9.379817654372152e-05, "loss": 1.2535, "step": 3200 }, { "epoch": 0.66, "grad_norm": 0.9906120300292969, "learning_rate": 9.377745544964774e-05, "loss": 1.291, "step": 3210 }, { "epoch": 0.66, "grad_norm": 1.0676803588867188, "learning_rate": 9.375673435557398e-05, "loss": 1.3032, "step": 3220 }, { "epoch": 0.67, "grad_norm": 1.056851863861084, "learning_rate": 9.373601326150021e-05, "loss": 1.2879, "step": 3230 }, { "epoch": 0.67, "grad_norm": 1.048841118812561, "learning_rate": 9.371529216742645e-05, "loss": 1.2798, "step": 3240 }, { "epoch": 0.67, "grad_norm": 1.047361969947815, "learning_rate": 9.369457107335268e-05, "loss": 1.3195, "step": 3250 }, { "epoch": 0.67, "grad_norm": 1.076904296875, "learning_rate": 9.367384997927892e-05, "loss": 1.3013, "step": 3260 }, { "epoch": 0.67, "grad_norm": 1.0863533020019531, "learning_rate": 9.365312888520514e-05, "loss": 1.2971, "step": 3270 }, { "epoch": 0.68, "grad_norm": 1.0460786819458008, "learning_rate": 9.363240779113138e-05, "loss": 1.3023, "step": 3280 }, { "epoch": 0.68, "grad_norm": 0.906493604183197, "learning_rate": 9.361168669705761e-05, "loss": 1.3053, "step": 3290 }, { "epoch": 0.68, "grad_norm": 1.1181541681289673, "learning_rate": 9.359096560298383e-05, "loss": 1.3142, "step": 3300 }, { "epoch": 0.68, "grad_norm": 1.0198432207107544, "learning_rate": 9.357024450891008e-05, "loss": 1.293, "step": 3310 }, { "epoch": 0.69, "grad_norm": 1.0075292587280273, "learning_rate": 9.35495234148363e-05, "loss": 1.299, "step": 3320 }, { "epoch": 0.69, "grad_norm": 1.024592399597168, "learning_rate": 9.352880232076254e-05, "loss": 1.2983, "step": 3330 }, { "epoch": 0.69, "grad_norm": 0.9931455254554749, "learning_rate": 9.350808122668877e-05, "loss": 1.279, "step": 3340 }, { "epoch": 0.69, "grad_norm": 1.0673152208328247, "learning_rate": 9.348736013261501e-05, "loss": 1.2816, "step": 3350 }, { "epoch": 0.69, "grad_norm": 1.068587303161621, "learning_rate": 9.346663903854124e-05, "loss": 1.2934, "step": 3360 }, { "epoch": 0.7, "grad_norm": 0.9838789701461792, "learning_rate": 9.344591794446748e-05, "loss": 1.2917, "step": 3370 }, { "epoch": 0.7, "grad_norm": 1.0613404512405396, "learning_rate": 9.34251968503937e-05, "loss": 1.2879, "step": 3380 }, { "epoch": 0.7, "grad_norm": 1.0173070430755615, "learning_rate": 9.340447575631995e-05, "loss": 1.2966, "step": 3390 }, { "epoch": 0.7, "grad_norm": 1.1227622032165527, "learning_rate": 9.338375466224617e-05, "loss": 1.2554, "step": 3400 }, { "epoch": 0.7, "grad_norm": 1.007338523864746, "learning_rate": 9.33630335681724e-05, "loss": 1.3115, "step": 3410 }, { "epoch": 0.71, "grad_norm": 1.0479813814163208, "learning_rate": 9.334231247409864e-05, "loss": 1.3048, "step": 3420 }, { "epoch": 0.71, "grad_norm": 1.0560479164123535, "learning_rate": 9.332159138002486e-05, "loss": 1.2919, "step": 3430 }, { "epoch": 0.71, "grad_norm": 1.1081204414367676, "learning_rate": 9.33008702859511e-05, "loss": 1.2967, "step": 3440 }, { "epoch": 0.71, "grad_norm": 1.0260145664215088, "learning_rate": 9.328014919187733e-05, "loss": 1.3195, "step": 3450 }, { "epoch": 0.71, "grad_norm": 1.057966947555542, "learning_rate": 9.325942809780357e-05, "loss": 1.2896, "step": 3460 }, { "epoch": 0.72, "grad_norm": 1.0711556673049927, "learning_rate": 9.32387070037298e-05, "loss": 1.2817, "step": 3470 }, { "epoch": 0.72, "grad_norm": 1.0118924379348755, "learning_rate": 9.321798590965604e-05, "loss": 1.3052, "step": 3480 }, { "epoch": 0.72, "grad_norm": 1.0227614641189575, "learning_rate": 9.319726481558226e-05, "loss": 1.3186, "step": 3490 }, { "epoch": 0.72, "grad_norm": 1.0655134916305542, "learning_rate": 9.317654372150851e-05, "loss": 1.3087, "step": 3500 }, { "epoch": 0.72, "grad_norm": 1.1255359649658203, "learning_rate": 9.315582262743473e-05, "loss": 1.2749, "step": 3510 }, { "epoch": 0.73, "grad_norm": 1.0832923650741577, "learning_rate": 9.313510153336096e-05, "loss": 1.2892, "step": 3520 }, { "epoch": 0.73, "grad_norm": 1.067236304283142, "learning_rate": 9.31143804392872e-05, "loss": 1.284, "step": 3530 }, { "epoch": 0.73, "grad_norm": 1.1556322574615479, "learning_rate": 9.309365934521344e-05, "loss": 1.2604, "step": 3540 }, { "epoch": 0.73, "grad_norm": 1.151723861694336, "learning_rate": 9.307293825113966e-05, "loss": 1.3045, "step": 3550 }, { "epoch": 0.73, "grad_norm": 1.0258938074111938, "learning_rate": 9.30522171570659e-05, "loss": 1.2859, "step": 3560 }, { "epoch": 0.74, "grad_norm": 1.0165237188339233, "learning_rate": 9.303149606299213e-05, "loss": 1.3212, "step": 3570 }, { "epoch": 0.74, "grad_norm": 0.9969586133956909, "learning_rate": 9.301077496891836e-05, "loss": 1.3038, "step": 3580 }, { "epoch": 0.74, "grad_norm": 1.1335457563400269, "learning_rate": 9.29900538748446e-05, "loss": 1.2747, "step": 3590 }, { "epoch": 0.74, "grad_norm": 1.0744903087615967, "learning_rate": 9.296933278077082e-05, "loss": 1.3078, "step": 3600 }, { "epoch": 0.74, "grad_norm": 1.2294646501541138, "learning_rate": 9.294861168669707e-05, "loss": 1.2631, "step": 3610 }, { "epoch": 0.75, "grad_norm": 1.0542582273483276, "learning_rate": 9.292789059262329e-05, "loss": 1.2778, "step": 3620 }, { "epoch": 0.75, "grad_norm": 1.0787122249603271, "learning_rate": 9.290716949854952e-05, "loss": 1.2952, "step": 3630 }, { "epoch": 0.75, "grad_norm": 1.182387351989746, "learning_rate": 9.288644840447576e-05, "loss": 1.2955, "step": 3640 }, { "epoch": 0.75, "grad_norm": 1.0466411113739014, "learning_rate": 9.2865727310402e-05, "loss": 1.3085, "step": 3650 }, { "epoch": 0.76, "grad_norm": 1.0271363258361816, "learning_rate": 9.284500621632823e-05, "loss": 1.2881, "step": 3660 }, { "epoch": 0.76, "grad_norm": 1.1320871114730835, "learning_rate": 9.282428512225446e-05, "loss": 1.2671, "step": 3670 }, { "epoch": 0.76, "grad_norm": 1.1176432371139526, "learning_rate": 9.280356402818069e-05, "loss": 1.299, "step": 3680 }, { "epoch": 0.76, "grad_norm": 0.9895033240318298, "learning_rate": 9.278284293410694e-05, "loss": 1.2984, "step": 3690 }, { "epoch": 0.76, "grad_norm": 1.191007137298584, "learning_rate": 9.276212184003316e-05, "loss": 1.2808, "step": 3700 }, { "epoch": 0.77, "grad_norm": 1.0878729820251465, "learning_rate": 9.274140074595939e-05, "loss": 1.2864, "step": 3710 }, { "epoch": 0.77, "grad_norm": 1.1144053936004639, "learning_rate": 9.272067965188563e-05, "loss": 1.3175, "step": 3720 }, { "epoch": 0.77, "grad_norm": 1.128405213356018, "learning_rate": 9.269995855781186e-05, "loss": 1.3147, "step": 3730 }, { "epoch": 0.77, "grad_norm": 1.0539438724517822, "learning_rate": 9.267923746373808e-05, "loss": 1.2927, "step": 3740 }, { "epoch": 0.77, "grad_norm": 1.0515836477279663, "learning_rate": 9.265851636966433e-05, "loss": 1.2743, "step": 3750 }, { "epoch": 0.78, "grad_norm": 1.1526955366134644, "learning_rate": 9.263779527559055e-05, "loss": 1.2911, "step": 3760 }, { "epoch": 0.78, "grad_norm": 1.010903000831604, "learning_rate": 9.261707418151679e-05, "loss": 1.2735, "step": 3770 }, { "epoch": 0.78, "grad_norm": 1.1373246908187866, "learning_rate": 9.259635308744302e-05, "loss": 1.2952, "step": 3780 }, { "epoch": 0.78, "grad_norm": 0.9458179473876953, "learning_rate": 9.257563199336925e-05, "loss": 1.2936, "step": 3790 }, { "epoch": 0.78, "grad_norm": 0.949252724647522, "learning_rate": 9.25549108992955e-05, "loss": 1.287, "step": 3800 }, { "epoch": 0.79, "grad_norm": 1.1074215173721313, "learning_rate": 9.253418980522172e-05, "loss": 1.2937, "step": 3810 }, { "epoch": 0.79, "grad_norm": 0.995959460735321, "learning_rate": 9.251346871114795e-05, "loss": 1.265, "step": 3820 }, { "epoch": 0.79, "grad_norm": 1.0461138486862183, "learning_rate": 9.249274761707419e-05, "loss": 1.2822, "step": 3830 }, { "epoch": 0.79, "grad_norm": 1.0449700355529785, "learning_rate": 9.247202652300042e-05, "loss": 1.2896, "step": 3840 }, { "epoch": 0.79, "grad_norm": 1.0590384006500244, "learning_rate": 9.245130542892664e-05, "loss": 1.2923, "step": 3850 }, { "epoch": 0.8, "grad_norm": 1.178272008895874, "learning_rate": 9.243058433485289e-05, "loss": 1.2797, "step": 3860 }, { "epoch": 0.8, "grad_norm": 1.0651668310165405, "learning_rate": 9.240986324077911e-05, "loss": 1.2632, "step": 3870 }, { "epoch": 0.8, "grad_norm": 1.0944633483886719, "learning_rate": 9.238914214670535e-05, "loss": 1.2853, "step": 3880 }, { "epoch": 0.8, "grad_norm": 1.042576551437378, "learning_rate": 9.236842105263158e-05, "loss": 1.2884, "step": 3890 }, { "epoch": 0.8, "grad_norm": 1.1282780170440674, "learning_rate": 9.234769995855782e-05, "loss": 1.2937, "step": 3900 }, { "epoch": 0.81, "grad_norm": 0.9996076822280884, "learning_rate": 9.232697886448405e-05, "loss": 1.2657, "step": 3910 }, { "epoch": 0.81, "grad_norm": 0.9630957245826721, "learning_rate": 9.230625777041029e-05, "loss": 1.2679, "step": 3920 }, { "epoch": 0.81, "grad_norm": 1.0428036451339722, "learning_rate": 9.228553667633651e-05, "loss": 1.2921, "step": 3930 }, { "epoch": 0.81, "grad_norm": 1.1940115690231323, "learning_rate": 9.226481558226275e-05, "loss": 1.2759, "step": 3940 }, { "epoch": 0.82, "grad_norm": 1.1081199645996094, "learning_rate": 9.224409448818898e-05, "loss": 1.2636, "step": 3950 }, { "epoch": 0.82, "grad_norm": 0.989032506942749, "learning_rate": 9.22233733941152e-05, "loss": 1.2489, "step": 3960 }, { "epoch": 0.82, "grad_norm": 1.0575727224349976, "learning_rate": 9.220265230004145e-05, "loss": 1.2777, "step": 3970 }, { "epoch": 0.82, "grad_norm": 1.0007938146591187, "learning_rate": 9.218193120596767e-05, "loss": 1.286, "step": 3980 }, { "epoch": 0.82, "grad_norm": 1.0560977458953857, "learning_rate": 9.216121011189391e-05, "loss": 1.2939, "step": 3990 }, { "epoch": 0.83, "grad_norm": 1.308013916015625, "learning_rate": 9.214048901782014e-05, "loss": 1.2728, "step": 4000 }, { "epoch": 0.83, "grad_norm": 1.2011935710906982, "learning_rate": 9.211976792374638e-05, "loss": 1.2853, "step": 4010 }, { "epoch": 0.83, "grad_norm": 1.0514850616455078, "learning_rate": 9.209904682967261e-05, "loss": 1.3102, "step": 4020 }, { "epoch": 0.83, "grad_norm": 1.0865683555603027, "learning_rate": 9.207832573559885e-05, "loss": 1.2835, "step": 4030 }, { "epoch": 0.83, "grad_norm": 1.2012451887130737, "learning_rate": 9.205760464152507e-05, "loss": 1.2801, "step": 4040 }, { "epoch": 0.84, "grad_norm": 1.0710102319717407, "learning_rate": 9.203688354745132e-05, "loss": 1.2745, "step": 4050 }, { "epoch": 0.84, "grad_norm": 1.0107301473617554, "learning_rate": 9.201616245337754e-05, "loss": 1.2928, "step": 4060 }, { "epoch": 0.84, "grad_norm": 1.0026335716247559, "learning_rate": 9.199544135930378e-05, "loss": 1.2916, "step": 4070 }, { "epoch": 0.84, "grad_norm": 0.9443692564964294, "learning_rate": 9.197472026523001e-05, "loss": 1.2956, "step": 4080 }, { "epoch": 0.84, "grad_norm": 0.9472268223762512, "learning_rate": 9.195399917115625e-05, "loss": 1.2875, "step": 4090 }, { "epoch": 0.85, "grad_norm": 1.0817506313323975, "learning_rate": 9.193327807708247e-05, "loss": 1.2779, "step": 4100 }, { "epoch": 0.85, "grad_norm": 1.0539813041687012, "learning_rate": 9.19125569830087e-05, "loss": 1.2661, "step": 4110 }, { "epoch": 0.85, "grad_norm": 0.9975004196166992, "learning_rate": 9.189183588893494e-05, "loss": 1.2499, "step": 4120 }, { "epoch": 0.85, "grad_norm": 1.0313421487808228, "learning_rate": 9.187111479486117e-05, "loss": 1.2907, "step": 4130 }, { "epoch": 0.85, "grad_norm": 1.0273147821426392, "learning_rate": 9.185039370078741e-05, "loss": 1.2929, "step": 4140 }, { "epoch": 0.86, "grad_norm": 0.9810879230499268, "learning_rate": 9.182967260671363e-05, "loss": 1.2974, "step": 4150 }, { "epoch": 0.86, "grad_norm": 1.0243279933929443, "learning_rate": 9.180895151263988e-05, "loss": 1.2406, "step": 4160 }, { "epoch": 0.86, "grad_norm": 1.0115349292755127, "learning_rate": 9.17882304185661e-05, "loss": 1.2765, "step": 4170 }, { "epoch": 0.86, "grad_norm": 1.1206727027893066, "learning_rate": 9.176750932449234e-05, "loss": 1.2956, "step": 4180 }, { "epoch": 0.86, "grad_norm": 0.99837327003479, "learning_rate": 9.174678823041857e-05, "loss": 1.2614, "step": 4190 }, { "epoch": 0.87, "grad_norm": 1.0117827653884888, "learning_rate": 9.17260671363448e-05, "loss": 1.2611, "step": 4200 }, { "epoch": 0.87, "grad_norm": 1.1119413375854492, "learning_rate": 9.170534604227104e-05, "loss": 1.3006, "step": 4210 }, { "epoch": 0.87, "grad_norm": 1.0567435026168823, "learning_rate": 9.168462494819728e-05, "loss": 1.2544, "step": 4220 }, { "epoch": 0.87, "grad_norm": 1.1326267719268799, "learning_rate": 9.16639038541235e-05, "loss": 1.2895, "step": 4230 }, { "epoch": 0.87, "grad_norm": 1.0858250856399536, "learning_rate": 9.164318276004975e-05, "loss": 1.2672, "step": 4240 }, { "epoch": 0.88, "grad_norm": 1.1111207008361816, "learning_rate": 9.162246166597597e-05, "loss": 1.255, "step": 4250 }, { "epoch": 0.88, "grad_norm": 1.0791577100753784, "learning_rate": 9.16017405719022e-05, "loss": 1.2865, "step": 4260 }, { "epoch": 0.88, "grad_norm": 1.1741734743118286, "learning_rate": 9.158101947782844e-05, "loss": 1.2898, "step": 4270 }, { "epoch": 0.88, "grad_norm": 0.9900088310241699, "learning_rate": 9.156029838375466e-05, "loss": 1.26, "step": 4280 }, { "epoch": 0.89, "grad_norm": 1.2227307558059692, "learning_rate": 9.15395772896809e-05, "loss": 1.2818, "step": 4290 }, { "epoch": 0.89, "grad_norm": 1.0696605443954468, "learning_rate": 9.151885619560713e-05, "loss": 1.2739, "step": 4300 }, { "epoch": 0.89, "grad_norm": 1.145063042640686, "learning_rate": 9.149813510153336e-05, "loss": 1.2886, "step": 4310 }, { "epoch": 0.89, "grad_norm": 1.0415133237838745, "learning_rate": 9.14774140074596e-05, "loss": 1.2999, "step": 4320 }, { "epoch": 0.89, "grad_norm": 0.9151254296302795, "learning_rate": 9.145669291338584e-05, "loss": 1.2883, "step": 4330 }, { "epoch": 0.9, "grad_norm": 1.1220918893814087, "learning_rate": 9.143597181931206e-05, "loss": 1.2751, "step": 4340 }, { "epoch": 0.9, "grad_norm": 1.0417348146438599, "learning_rate": 9.14152507252383e-05, "loss": 1.2675, "step": 4350 }, { "epoch": 0.9, "grad_norm": 1.1090322732925415, "learning_rate": 9.139452963116453e-05, "loss": 1.2521, "step": 4360 }, { "epoch": 0.9, "grad_norm": 1.003110408782959, "learning_rate": 9.137380853709076e-05, "loss": 1.2461, "step": 4370 }, { "epoch": 0.9, "grad_norm": 1.0214240550994873, "learning_rate": 9.1353087443017e-05, "loss": 1.3003, "step": 4380 }, { "epoch": 0.91, "grad_norm": 1.0389635562896729, "learning_rate": 9.133236634894323e-05, "loss": 1.2642, "step": 4390 }, { "epoch": 0.91, "grad_norm": 1.1644842624664307, "learning_rate": 9.131164525486945e-05, "loss": 1.247, "step": 4400 }, { "epoch": 0.91, "grad_norm": 1.0494420528411865, "learning_rate": 9.12909241607957e-05, "loss": 1.2727, "step": 4410 }, { "epoch": 0.91, "grad_norm": 1.1759871244430542, "learning_rate": 9.127020306672192e-05, "loss": 1.2569, "step": 4420 }, { "epoch": 0.91, "grad_norm": 1.0006252527236938, "learning_rate": 9.124948197264816e-05, "loss": 1.2761, "step": 4430 }, { "epoch": 0.92, "grad_norm": 1.0942422151565552, "learning_rate": 9.12287608785744e-05, "loss": 1.2778, "step": 4440 }, { "epoch": 0.92, "grad_norm": 1.1273877620697021, "learning_rate": 9.120803978450062e-05, "loss": 1.2738, "step": 4450 }, { "epoch": 0.92, "grad_norm": 1.0103455781936646, "learning_rate": 9.118731869042686e-05, "loss": 1.2559, "step": 4460 }, { "epoch": 0.92, "grad_norm": 1.0412319898605347, "learning_rate": 9.116659759635309e-05, "loss": 1.2839, "step": 4470 }, { "epoch": 0.92, "grad_norm": 1.1623831987380981, "learning_rate": 9.114587650227932e-05, "loss": 1.2684, "step": 4480 }, { "epoch": 0.93, "grad_norm": 1.213977336883545, "learning_rate": 9.112515540820556e-05, "loss": 1.2587, "step": 4490 }, { "epoch": 0.93, "grad_norm": 1.1630234718322754, "learning_rate": 9.110443431413179e-05, "loss": 1.2557, "step": 4500 }, { "epoch": 0.93, "grad_norm": 1.0047425031661987, "learning_rate": 9.108371322005801e-05, "loss": 1.2785, "step": 4510 }, { "epoch": 0.93, "grad_norm": 1.0434467792510986, "learning_rate": 9.106299212598426e-05, "loss": 1.2916, "step": 4520 }, { "epoch": 0.93, "grad_norm": 1.0278736352920532, "learning_rate": 9.104227103191048e-05, "loss": 1.2494, "step": 4530 }, { "epoch": 0.94, "grad_norm": 0.9865803122520447, "learning_rate": 9.102154993783672e-05, "loss": 1.2487, "step": 4540 }, { "epoch": 0.94, "grad_norm": 1.0419522523880005, "learning_rate": 9.100082884376295e-05, "loss": 1.2618, "step": 4550 }, { "epoch": 0.94, "grad_norm": 1.1537505388259888, "learning_rate": 9.098010774968919e-05, "loss": 1.2613, "step": 4560 }, { "epoch": 0.94, "grad_norm": 1.0565922260284424, "learning_rate": 9.095938665561542e-05, "loss": 1.2592, "step": 4570 }, { "epoch": 0.95, "grad_norm": 1.106313705444336, "learning_rate": 9.093866556154166e-05, "loss": 1.2585, "step": 4580 }, { "epoch": 0.95, "grad_norm": 1.0392132997512817, "learning_rate": 9.091794446746788e-05, "loss": 1.2626, "step": 4590 }, { "epoch": 0.95, "grad_norm": 1.1017506122589111, "learning_rate": 9.089722337339413e-05, "loss": 1.2696, "step": 4600 }, { "epoch": 0.95, "grad_norm": 1.134838581085205, "learning_rate": 9.087650227932035e-05, "loss": 1.2626, "step": 4610 }, { "epoch": 0.95, "grad_norm": 1.3448967933654785, "learning_rate": 9.085578118524659e-05, "loss": 1.2728, "step": 4620 }, { "epoch": 0.96, "grad_norm": 0.9399920701980591, "learning_rate": 9.083506009117282e-05, "loss": 1.253, "step": 4630 }, { "epoch": 0.96, "grad_norm": 1.0530354976654053, "learning_rate": 9.081433899709904e-05, "loss": 1.2501, "step": 4640 }, { "epoch": 0.96, "grad_norm": 1.0609047412872314, "learning_rate": 9.079361790302529e-05, "loss": 1.2599, "step": 4650 }, { "epoch": 0.96, "grad_norm": 1.120044469833374, "learning_rate": 9.077289680895151e-05, "loss": 1.2511, "step": 4660 }, { "epoch": 0.96, "grad_norm": 1.068034291267395, "learning_rate": 9.075217571487775e-05, "loss": 1.2623, "step": 4670 }, { "epoch": 0.97, "grad_norm": 0.9914749264717102, "learning_rate": 9.073145462080398e-05, "loss": 1.2754, "step": 4680 }, { "epoch": 0.97, "grad_norm": 1.084227204322815, "learning_rate": 9.071073352673022e-05, "loss": 1.2861, "step": 4690 }, { "epoch": 0.97, "grad_norm": 0.9811500906944275, "learning_rate": 9.069001243265644e-05, "loss": 1.2361, "step": 4700 }, { "epoch": 0.97, "grad_norm": 1.0811760425567627, "learning_rate": 9.066929133858269e-05, "loss": 1.247, "step": 4710 }, { "epoch": 0.97, "grad_norm": 1.0787228345870972, "learning_rate": 9.064857024450891e-05, "loss": 1.2609, "step": 4720 }, { "epoch": 0.98, "grad_norm": 1.0164875984191895, "learning_rate": 9.062784915043515e-05, "loss": 1.289, "step": 4730 }, { "epoch": 0.98, "grad_norm": 1.1449209451675415, "learning_rate": 9.060712805636138e-05, "loss": 1.269, "step": 4740 }, { "epoch": 0.98, "grad_norm": 1.2805284261703491, "learning_rate": 9.058640696228762e-05, "loss": 1.2803, "step": 4750 }, { "epoch": 0.98, "grad_norm": 1.03864586353302, "learning_rate": 9.056568586821385e-05, "loss": 1.2635, "step": 4760 }, { "epoch": 0.98, "grad_norm": 0.9715671539306641, "learning_rate": 9.054496477414009e-05, "loss": 1.2232, "step": 4770 }, { "epoch": 0.99, "grad_norm": 1.1221860647201538, "learning_rate": 9.052424368006631e-05, "loss": 1.2536, "step": 4780 }, { "epoch": 0.99, "grad_norm": 0.9467464089393616, "learning_rate": 9.050352258599254e-05, "loss": 1.2529, "step": 4790 }, { "epoch": 0.99, "grad_norm": 1.1306428909301758, "learning_rate": 9.048280149191878e-05, "loss": 1.2621, "step": 4800 }, { "epoch": 0.99, "grad_norm": 1.038275122642517, "learning_rate": 9.0462080397845e-05, "loss": 1.2785, "step": 4810 }, { "epoch": 0.99, "grad_norm": 1.1947646141052246, "learning_rate": 9.044135930377125e-05, "loss": 1.2643, "step": 4820 }, { "epoch": 1.0, "grad_norm": 1.0585174560546875, "learning_rate": 9.042063820969747e-05, "loss": 1.248, "step": 4830 }, { "epoch": 1.0, "grad_norm": 1.1931108236312866, "learning_rate": 9.03999171156237e-05, "loss": 1.2884, "step": 4840 }, { "epoch": 1.0, "eval_loss": 1.280229926109314, "eval_runtime": 1606.712, "eval_samples_per_second": 262.53, "eval_steps_per_second": 4.102, "step": 4846 }, { "epoch": 1.0, "grad_norm": 1.0185681581497192, "learning_rate": 9.037919602154994e-05, "loss": 1.2701, "step": 4850 }, { "epoch": 1.0, "grad_norm": 1.2571942806243896, "learning_rate": 9.035847492747618e-05, "loss": 1.2309, "step": 4860 }, { "epoch": 1.0, "grad_norm": 1.1584192514419556, "learning_rate": 9.033775383340241e-05, "loss": 1.2859, "step": 4870 }, { "epoch": 1.01, "grad_norm": 1.05573570728302, "learning_rate": 9.031703273932865e-05, "loss": 1.2502, "step": 4880 }, { "epoch": 1.01, "grad_norm": 1.0500950813293457, "learning_rate": 9.029631164525487e-05, "loss": 1.2288, "step": 4890 }, { "epoch": 1.01, "grad_norm": 1.0603646039962769, "learning_rate": 9.027559055118112e-05, "loss": 1.2609, "step": 4900 }, { "epoch": 1.01, "grad_norm": 1.085644245147705, "learning_rate": 9.025486945710734e-05, "loss": 1.2392, "step": 4910 }, { "epoch": 1.02, "grad_norm": 1.0088363885879517, "learning_rate": 9.023414836303357e-05, "loss": 1.2361, "step": 4920 }, { "epoch": 1.02, "grad_norm": 1.0100585222244263, "learning_rate": 9.021342726895981e-05, "loss": 1.2382, "step": 4930 }, { "epoch": 1.02, "grad_norm": 1.067159652709961, "learning_rate": 9.019270617488604e-05, "loss": 1.2433, "step": 4940 }, { "epoch": 1.02, "grad_norm": 1.0431448221206665, "learning_rate": 9.017198508081226e-05, "loss": 1.2461, "step": 4950 }, { "epoch": 1.02, "grad_norm": 1.3945410251617432, "learning_rate": 9.01512639867385e-05, "loss": 1.2406, "step": 4960 }, { "epoch": 1.03, "grad_norm": 1.0631392002105713, "learning_rate": 9.013054289266474e-05, "loss": 1.2265, "step": 4970 }, { "epoch": 1.03, "grad_norm": 1.0823907852172852, "learning_rate": 9.010982179859097e-05, "loss": 1.2388, "step": 4980 }, { "epoch": 1.03, "grad_norm": 1.1125047206878662, "learning_rate": 9.00891007045172e-05, "loss": 1.2553, "step": 4990 }, { "epoch": 1.03, "grad_norm": 1.0436159372329712, "learning_rate": 9.006837961044343e-05, "loss": 1.2362, "step": 5000 }, { "epoch": 1.03, "grad_norm": 1.2972134351730347, "learning_rate": 9.004765851636968e-05, "loss": 1.2387, "step": 5010 }, { "epoch": 1.04, "grad_norm": 1.0587375164031982, "learning_rate": 9.00269374222959e-05, "loss": 1.2348, "step": 5020 }, { "epoch": 1.04, "grad_norm": 1.0204670429229736, "learning_rate": 9.000621632822213e-05, "loss": 1.2391, "step": 5030 }, { "epoch": 1.04, "grad_norm": 1.0182541608810425, "learning_rate": 8.998549523414837e-05, "loss": 1.2337, "step": 5040 }, { "epoch": 1.04, "grad_norm": 0.9534347057342529, "learning_rate": 8.99647741400746e-05, "loss": 1.2403, "step": 5050 }, { "epoch": 1.04, "grad_norm": 1.1534489393234253, "learning_rate": 8.994405304600084e-05, "loss": 1.251, "step": 5060 }, { "epoch": 1.05, "grad_norm": 1.0630913972854614, "learning_rate": 8.992333195192707e-05, "loss": 1.2741, "step": 5070 }, { "epoch": 1.05, "grad_norm": 1.0464857816696167, "learning_rate": 8.99026108578533e-05, "loss": 1.27, "step": 5080 }, { "epoch": 1.05, "grad_norm": 1.1541072130203247, "learning_rate": 8.988188976377954e-05, "loss": 1.2183, "step": 5090 }, { "epoch": 1.05, "grad_norm": 1.007450819015503, "learning_rate": 8.986116866970576e-05, "loss": 1.2345, "step": 5100 }, { "epoch": 1.05, "grad_norm": 0.984767496585846, "learning_rate": 8.9840447575632e-05, "loss": 1.2661, "step": 5110 }, { "epoch": 1.06, "grad_norm": 1.2972489595413208, "learning_rate": 8.981972648155824e-05, "loss": 1.2577, "step": 5120 }, { "epoch": 1.06, "grad_norm": 1.1882805824279785, "learning_rate": 8.979900538748446e-05, "loss": 1.2685, "step": 5130 }, { "epoch": 1.06, "grad_norm": 1.1580913066864014, "learning_rate": 8.977828429341069e-05, "loss": 1.2269, "step": 5140 }, { "epoch": 1.06, "grad_norm": 1.0898735523223877, "learning_rate": 8.975756319933693e-05, "loss": 1.2315, "step": 5150 }, { "epoch": 1.06, "grad_norm": 1.1261813640594482, "learning_rate": 8.973684210526316e-05, "loss": 1.2617, "step": 5160 }, { "epoch": 1.07, "grad_norm": 1.0266163349151611, "learning_rate": 8.97161210111894e-05, "loss": 1.2379, "step": 5170 }, { "epoch": 1.07, "grad_norm": 1.0973056554794312, "learning_rate": 8.969539991711563e-05, "loss": 1.2527, "step": 5180 }, { "epoch": 1.07, "grad_norm": 1.0927435159683228, "learning_rate": 8.967467882304185e-05, "loss": 1.2433, "step": 5190 }, { "epoch": 1.07, "grad_norm": 1.1209070682525635, "learning_rate": 8.96539577289681e-05, "loss": 1.2512, "step": 5200 }, { "epoch": 1.08, "grad_norm": 1.222163200378418, "learning_rate": 8.963323663489432e-05, "loss": 1.2377, "step": 5210 }, { "epoch": 1.08, "grad_norm": 1.1234538555145264, "learning_rate": 8.961251554082056e-05, "loss": 1.2409, "step": 5220 }, { "epoch": 1.08, "grad_norm": 1.1121318340301514, "learning_rate": 8.95917944467468e-05, "loss": 1.2402, "step": 5230 }, { "epoch": 1.08, "grad_norm": 1.0124129056930542, "learning_rate": 8.957107335267303e-05, "loss": 1.2609, "step": 5240 }, { "epoch": 1.08, "grad_norm": 1.0647163391113281, "learning_rate": 8.955035225859925e-05, "loss": 1.2144, "step": 5250 }, { "epoch": 1.09, "grad_norm": 1.0653977394104004, "learning_rate": 8.95296311645255e-05, "loss": 1.2337, "step": 5260 }, { "epoch": 1.09, "grad_norm": 1.0555377006530762, "learning_rate": 8.950891007045172e-05, "loss": 1.2121, "step": 5270 }, { "epoch": 1.09, "grad_norm": 1.0859911441802979, "learning_rate": 8.948818897637796e-05, "loss": 1.232, "step": 5280 }, { "epoch": 1.09, "grad_norm": 1.0167226791381836, "learning_rate": 8.946746788230419e-05, "loss": 1.2252, "step": 5290 }, { "epoch": 1.09, "grad_norm": 1.0780484676361084, "learning_rate": 8.944674678823041e-05, "loss": 1.2537, "step": 5300 }, { "epoch": 1.1, "grad_norm": 1.0674623250961304, "learning_rate": 8.942602569415666e-05, "loss": 1.2511, "step": 5310 }, { "epoch": 1.1, "grad_norm": 1.1867153644561768, "learning_rate": 8.940530460008288e-05, "loss": 1.2406, "step": 5320 }, { "epoch": 1.1, "grad_norm": 1.0617753267288208, "learning_rate": 8.938458350600912e-05, "loss": 1.2648, "step": 5330 }, { "epoch": 1.1, "grad_norm": 1.1268622875213623, "learning_rate": 8.936386241193535e-05, "loss": 1.2461, "step": 5340 }, { "epoch": 1.1, "grad_norm": 1.06646728515625, "learning_rate": 8.934314131786159e-05, "loss": 1.235, "step": 5350 }, { "epoch": 1.11, "grad_norm": 1.0061641931533813, "learning_rate": 8.932242022378781e-05, "loss": 1.2256, "step": 5360 }, { "epoch": 1.11, "grad_norm": 1.1114619970321655, "learning_rate": 8.930169912971406e-05, "loss": 1.2481, "step": 5370 }, { "epoch": 1.11, "grad_norm": 1.121626615524292, "learning_rate": 8.928097803564028e-05, "loss": 1.2592, "step": 5380 }, { "epoch": 1.11, "grad_norm": 1.0986332893371582, "learning_rate": 8.926025694156652e-05, "loss": 1.2397, "step": 5390 }, { "epoch": 1.11, "grad_norm": 1.055759072303772, "learning_rate": 8.923953584749275e-05, "loss": 1.2468, "step": 5400 }, { "epoch": 1.12, "grad_norm": 1.035019874572754, "learning_rate": 8.921881475341899e-05, "loss": 1.2455, "step": 5410 }, { "epoch": 1.12, "grad_norm": 1.0355908870697021, "learning_rate": 8.919809365934522e-05, "loss": 1.2584, "step": 5420 }, { "epoch": 1.12, "grad_norm": 1.0616044998168945, "learning_rate": 8.917737256527146e-05, "loss": 1.2472, "step": 5430 }, { "epoch": 1.12, "grad_norm": 1.0461037158966064, "learning_rate": 8.915665147119768e-05, "loss": 1.2531, "step": 5440 }, { "epoch": 1.12, "grad_norm": 1.2218214273452759, "learning_rate": 8.913593037712393e-05, "loss": 1.2486, "step": 5450 }, { "epoch": 1.13, "grad_norm": 1.0007187128067017, "learning_rate": 8.911520928305015e-05, "loss": 1.2464, "step": 5460 }, { "epoch": 1.13, "grad_norm": 1.056276798248291, "learning_rate": 8.909448818897638e-05, "loss": 1.2375, "step": 5470 }, { "epoch": 1.13, "grad_norm": 1.0595409870147705, "learning_rate": 8.907376709490262e-05, "loss": 1.2645, "step": 5480 }, { "epoch": 1.13, "grad_norm": 1.1598786115646362, "learning_rate": 8.905304600082884e-05, "loss": 1.2502, "step": 5490 }, { "epoch": 1.13, "grad_norm": 1.0664838552474976, "learning_rate": 8.903232490675508e-05, "loss": 1.2712, "step": 5500 }, { "epoch": 1.14, "grad_norm": 1.0016567707061768, "learning_rate": 8.901160381268131e-05, "loss": 1.2386, "step": 5510 }, { "epoch": 1.14, "grad_norm": 1.0359711647033691, "learning_rate": 8.899088271860755e-05, "loss": 1.238, "step": 5520 }, { "epoch": 1.14, "grad_norm": 1.1633154153823853, "learning_rate": 8.897016162453378e-05, "loss": 1.2139, "step": 5530 }, { "epoch": 1.14, "grad_norm": 1.085390567779541, "learning_rate": 8.894944053046002e-05, "loss": 1.2564, "step": 5540 }, { "epoch": 1.15, "grad_norm": 1.041222333908081, "learning_rate": 8.892871943638624e-05, "loss": 1.2361, "step": 5550 }, { "epoch": 1.15, "grad_norm": 1.0352637767791748, "learning_rate": 8.890799834231249e-05, "loss": 1.2104, "step": 5560 }, { "epoch": 1.15, "grad_norm": 1.0879154205322266, "learning_rate": 8.888727724823871e-05, "loss": 1.2408, "step": 5570 }, { "epoch": 1.15, "grad_norm": 1.177937388420105, "learning_rate": 8.886655615416494e-05, "loss": 1.221, "step": 5580 }, { "epoch": 1.15, "grad_norm": 1.0638147592544556, "learning_rate": 8.884583506009118e-05, "loss": 1.2309, "step": 5590 }, { "epoch": 1.16, "grad_norm": 1.0461276769638062, "learning_rate": 8.882511396601741e-05, "loss": 1.2498, "step": 5600 }, { "epoch": 1.16, "grad_norm": 0.9356592297554016, "learning_rate": 8.880439287194365e-05, "loss": 1.2584, "step": 5610 }, { "epoch": 1.16, "grad_norm": 0.9423561096191406, "learning_rate": 8.878367177786988e-05, "loss": 1.2094, "step": 5620 }, { "epoch": 1.16, "grad_norm": 1.1970158815383911, "learning_rate": 8.87629506837961e-05, "loss": 1.2134, "step": 5630 }, { "epoch": 1.16, "grad_norm": 1.1081819534301758, "learning_rate": 8.874222958972234e-05, "loss": 1.2134, "step": 5640 }, { "epoch": 1.17, "grad_norm": 1.1248120069503784, "learning_rate": 8.872150849564858e-05, "loss": 1.2288, "step": 5650 }, { "epoch": 1.17, "grad_norm": 1.0750600099563599, "learning_rate": 8.87007874015748e-05, "loss": 1.2191, "step": 5660 }, { "epoch": 1.17, "grad_norm": 1.058366060256958, "learning_rate": 8.868006630750105e-05, "loss": 1.2274, "step": 5670 }, { "epoch": 1.17, "grad_norm": 0.9436173439025879, "learning_rate": 8.865934521342727e-05, "loss": 1.2556, "step": 5680 }, { "epoch": 1.17, "grad_norm": 1.0624874830245972, "learning_rate": 8.86386241193535e-05, "loss": 1.2383, "step": 5690 }, { "epoch": 1.18, "grad_norm": 1.0870168209075928, "learning_rate": 8.861790302527974e-05, "loss": 1.2432, "step": 5700 }, { "epoch": 1.18, "grad_norm": 1.0561186075210571, "learning_rate": 8.859718193120597e-05, "loss": 1.2329, "step": 5710 }, { "epoch": 1.18, "grad_norm": 1.2139157056808472, "learning_rate": 8.857646083713221e-05, "loss": 1.2183, "step": 5720 }, { "epoch": 1.18, "grad_norm": 1.0662713050842285, "learning_rate": 8.855573974305844e-05, "loss": 1.2371, "step": 5730 }, { "epoch": 1.18, "grad_norm": 1.1198335886001587, "learning_rate": 8.853501864898466e-05, "loss": 1.2396, "step": 5740 }, { "epoch": 1.19, "grad_norm": 1.2191115617752075, "learning_rate": 8.851429755491091e-05, "loss": 1.2355, "step": 5750 }, { "epoch": 1.19, "grad_norm": 1.039201259613037, "learning_rate": 8.849357646083714e-05, "loss": 1.2251, "step": 5760 }, { "epoch": 1.19, "grad_norm": 0.9540196061134338, "learning_rate": 8.847285536676337e-05, "loss": 1.2424, "step": 5770 }, { "epoch": 1.19, "grad_norm": 1.0640240907669067, "learning_rate": 8.84521342726896e-05, "loss": 1.2414, "step": 5780 }, { "epoch": 1.19, "grad_norm": 1.0465424060821533, "learning_rate": 8.843141317861584e-05, "loss": 1.2241, "step": 5790 }, { "epoch": 1.2, "grad_norm": 1.1881465911865234, "learning_rate": 8.841069208454206e-05, "loss": 1.2182, "step": 5800 }, { "epoch": 1.2, "grad_norm": 1.0851510763168335, "learning_rate": 8.83899709904683e-05, "loss": 1.2115, "step": 5810 }, { "epoch": 1.2, "grad_norm": 1.0743972063064575, "learning_rate": 8.836924989639453e-05, "loss": 1.2211, "step": 5820 }, { "epoch": 1.2, "grad_norm": 1.049249529838562, "learning_rate": 8.834852880232077e-05, "loss": 1.2452, "step": 5830 }, { "epoch": 1.21, "grad_norm": 1.0910190343856812, "learning_rate": 8.8327807708247e-05, "loss": 1.249, "step": 5840 }, { "epoch": 1.21, "grad_norm": 1.0976841449737549, "learning_rate": 8.830708661417322e-05, "loss": 1.2123, "step": 5850 }, { "epoch": 1.21, "grad_norm": 1.3569914102554321, "learning_rate": 8.828636552009947e-05, "loss": 1.2338, "step": 5860 }, { "epoch": 1.21, "grad_norm": 1.068427324295044, "learning_rate": 8.82656444260257e-05, "loss": 1.2351, "step": 5870 }, { "epoch": 1.21, "grad_norm": 1.1309690475463867, "learning_rate": 8.824492333195193e-05, "loss": 1.2367, "step": 5880 }, { "epoch": 1.22, "grad_norm": 1.1933242082595825, "learning_rate": 8.822420223787816e-05, "loss": 1.2261, "step": 5890 }, { "epoch": 1.22, "grad_norm": 1.029557466506958, "learning_rate": 8.82034811438044e-05, "loss": 1.2499, "step": 5900 }, { "epoch": 1.22, "grad_norm": 1.106086015701294, "learning_rate": 8.818276004973062e-05, "loss": 1.2236, "step": 5910 }, { "epoch": 1.22, "grad_norm": 1.0114145278930664, "learning_rate": 8.816203895565687e-05, "loss": 1.2445, "step": 5920 }, { "epoch": 1.22, "grad_norm": 1.1829569339752197, "learning_rate": 8.814131786158309e-05, "loss": 1.2334, "step": 5930 }, { "epoch": 1.23, "grad_norm": 1.1952738761901855, "learning_rate": 8.812059676750933e-05, "loss": 1.2402, "step": 5940 }, { "epoch": 1.23, "grad_norm": 1.0816442966461182, "learning_rate": 8.809987567343556e-05, "loss": 1.252, "step": 5950 }, { "epoch": 1.23, "grad_norm": 1.1453193426132202, "learning_rate": 8.80791545793618e-05, "loss": 1.233, "step": 5960 }, { "epoch": 1.23, "grad_norm": 1.045602798461914, "learning_rate": 8.805843348528803e-05, "loss": 1.2195, "step": 5970 }, { "epoch": 1.23, "grad_norm": 1.0934393405914307, "learning_rate": 8.803771239121425e-05, "loss": 1.2177, "step": 5980 }, { "epoch": 1.24, "grad_norm": 1.0120600461959839, "learning_rate": 8.801699129714049e-05, "loss": 1.2329, "step": 5990 }, { "epoch": 1.24, "grad_norm": 1.103251576423645, "learning_rate": 8.799627020306672e-05, "loss": 1.2379, "step": 6000 }, { "epoch": 1.24, "grad_norm": 1.0402165651321411, "learning_rate": 8.797554910899296e-05, "loss": 1.2256, "step": 6010 }, { "epoch": 1.24, "grad_norm": 1.0983202457427979, "learning_rate": 8.79548280149192e-05, "loss": 1.2296, "step": 6020 }, { "epoch": 1.24, "grad_norm": 1.0412105321884155, "learning_rate": 8.793410692084543e-05, "loss": 1.2228, "step": 6030 }, { "epoch": 1.25, "grad_norm": 1.0473746061325073, "learning_rate": 8.791338582677165e-05, "loss": 1.2228, "step": 6040 }, { "epoch": 1.25, "grad_norm": 1.004840612411499, "learning_rate": 8.78926647326979e-05, "loss": 1.2108, "step": 6050 }, { "epoch": 1.25, "grad_norm": 1.1168406009674072, "learning_rate": 8.787194363862412e-05, "loss": 1.2104, "step": 6060 }, { "epoch": 1.25, "grad_norm": 1.1124876737594604, "learning_rate": 8.785122254455036e-05, "loss": 1.2336, "step": 6070 }, { "epoch": 1.25, "grad_norm": 1.059415340423584, "learning_rate": 8.783050145047659e-05, "loss": 1.2438, "step": 6080 }, { "epoch": 1.26, "grad_norm": 1.063344955444336, "learning_rate": 8.780978035640283e-05, "loss": 1.2352, "step": 6090 }, { "epoch": 1.26, "grad_norm": 1.0552620887756348, "learning_rate": 8.778905926232905e-05, "loss": 1.2131, "step": 6100 }, { "epoch": 1.26, "grad_norm": 1.1673498153686523, "learning_rate": 8.77683381682553e-05, "loss": 1.213, "step": 6110 }, { "epoch": 1.26, "grad_norm": 1.1740206480026245, "learning_rate": 8.774761707418152e-05, "loss": 1.2162, "step": 6120 }, { "epoch": 1.26, "grad_norm": 1.0944995880126953, "learning_rate": 8.772689598010775e-05, "loss": 1.2005, "step": 6130 }, { "epoch": 1.27, "grad_norm": 1.0152305364608765, "learning_rate": 8.770617488603399e-05, "loss": 1.2198, "step": 6140 }, { "epoch": 1.27, "grad_norm": 1.1265654563903809, "learning_rate": 8.768545379196021e-05, "loss": 1.2362, "step": 6150 }, { "epoch": 1.27, "grad_norm": 1.151207447052002, "learning_rate": 8.766473269788646e-05, "loss": 1.2561, "step": 6160 }, { "epoch": 1.27, "grad_norm": 1.035855770111084, "learning_rate": 8.764401160381268e-05, "loss": 1.2391, "step": 6170 }, { "epoch": 1.28, "grad_norm": 1.091009497642517, "learning_rate": 8.762329050973892e-05, "loss": 1.2342, "step": 6180 }, { "epoch": 1.28, "grad_norm": 1.057400107383728, "learning_rate": 8.760256941566515e-05, "loss": 1.2216, "step": 6190 }, { "epoch": 1.28, "grad_norm": 0.9303261041641235, "learning_rate": 8.758184832159139e-05, "loss": 1.2352, "step": 6200 }, { "epoch": 1.28, "grad_norm": 1.2061206102371216, "learning_rate": 8.756112722751761e-05, "loss": 1.215, "step": 6210 }, { "epoch": 1.28, "grad_norm": 1.0886818170547485, "learning_rate": 8.754040613344386e-05, "loss": 1.2266, "step": 6220 }, { "epoch": 1.29, "grad_norm": 1.279563546180725, "learning_rate": 8.751968503937008e-05, "loss": 1.237, "step": 6230 }, { "epoch": 1.29, "grad_norm": 1.0063716173171997, "learning_rate": 8.749896394529631e-05, "loss": 1.2119, "step": 6240 }, { "epoch": 1.29, "grad_norm": 1.0414812564849854, "learning_rate": 8.747824285122255e-05, "loss": 1.2177, "step": 6250 }, { "epoch": 1.29, "grad_norm": 1.0915932655334473, "learning_rate": 8.745752175714878e-05, "loss": 1.2422, "step": 6260 }, { "epoch": 1.29, "grad_norm": 1.0544465780258179, "learning_rate": 8.743680066307502e-05, "loss": 1.234, "step": 6270 }, { "epoch": 1.3, "grad_norm": 1.1502125263214111, "learning_rate": 8.741607956900125e-05, "loss": 1.2183, "step": 6280 }, { "epoch": 1.3, "grad_norm": 1.3113386631011963, "learning_rate": 8.739535847492748e-05, "loss": 1.202, "step": 6290 }, { "epoch": 1.3, "grad_norm": 1.087583303451538, "learning_rate": 8.737463738085371e-05, "loss": 1.241, "step": 6300 }, { "epoch": 1.3, "grad_norm": 1.135565161705017, "learning_rate": 8.735391628677995e-05, "loss": 1.2411, "step": 6310 }, { "epoch": 1.3, "grad_norm": 1.0986896753311157, "learning_rate": 8.733319519270617e-05, "loss": 1.2178, "step": 6320 }, { "epoch": 1.31, "grad_norm": 1.6357513666152954, "learning_rate": 8.731247409863242e-05, "loss": 1.2102, "step": 6330 }, { "epoch": 1.31, "grad_norm": 1.0731899738311768, "learning_rate": 8.729175300455864e-05, "loss": 1.213, "step": 6340 }, { "epoch": 1.31, "grad_norm": 1.1432324647903442, "learning_rate": 8.727103191048487e-05, "loss": 1.2395, "step": 6350 }, { "epoch": 1.31, "grad_norm": 1.0547071695327759, "learning_rate": 8.725031081641111e-05, "loss": 1.2121, "step": 6360 }, { "epoch": 1.31, "grad_norm": 1.1022225618362427, "learning_rate": 8.722958972233734e-05, "loss": 1.2274, "step": 6370 }, { "epoch": 1.32, "grad_norm": 1.0772980451583862, "learning_rate": 8.720886862826358e-05, "loss": 1.2244, "step": 6380 }, { "epoch": 1.32, "grad_norm": 1.073470115661621, "learning_rate": 8.718814753418981e-05, "loss": 1.2243, "step": 6390 }, { "epoch": 1.32, "grad_norm": 1.1750410795211792, "learning_rate": 8.716742644011604e-05, "loss": 1.2191, "step": 6400 }, { "epoch": 1.32, "grad_norm": 1.206298828125, "learning_rate": 8.714670534604228e-05, "loss": 1.2284, "step": 6410 }, { "epoch": 1.32, "grad_norm": 1.1253398656845093, "learning_rate": 8.71259842519685e-05, "loss": 1.2222, "step": 6420 }, { "epoch": 1.33, "grad_norm": 1.1828629970550537, "learning_rate": 8.710526315789474e-05, "loss": 1.225, "step": 6430 }, { "epoch": 1.33, "grad_norm": 1.0462387800216675, "learning_rate": 8.708454206382098e-05, "loss": 1.2156, "step": 6440 }, { "epoch": 1.33, "grad_norm": 1.3468637466430664, "learning_rate": 8.706382096974721e-05, "loss": 1.2111, "step": 6450 }, { "epoch": 1.33, "grad_norm": 1.1359398365020752, "learning_rate": 8.704309987567345e-05, "loss": 1.2022, "step": 6460 }, { "epoch": 1.34, "grad_norm": 1.0748237371444702, "learning_rate": 8.702237878159967e-05, "loss": 1.2227, "step": 6470 }, { "epoch": 1.34, "grad_norm": 1.0240176916122437, "learning_rate": 8.70016576875259e-05, "loss": 1.2042, "step": 6480 }, { "epoch": 1.34, "grad_norm": 1.0575100183486938, "learning_rate": 8.698093659345214e-05, "loss": 1.2166, "step": 6490 }, { "epoch": 1.34, "grad_norm": 1.0095828771591187, "learning_rate": 8.696021549937837e-05, "loss": 1.2352, "step": 6500 }, { "epoch": 1.34, "grad_norm": 1.140643835067749, "learning_rate": 8.69394944053046e-05, "loss": 1.2143, "step": 6510 }, { "epoch": 1.35, "grad_norm": 1.0403310060501099, "learning_rate": 8.691877331123084e-05, "loss": 1.2268, "step": 6520 }, { "epoch": 1.35, "grad_norm": 1.1176542043685913, "learning_rate": 8.689805221715706e-05, "loss": 1.2123, "step": 6530 }, { "epoch": 1.35, "grad_norm": 1.041100025177002, "learning_rate": 8.68773311230833e-05, "loss": 1.2356, "step": 6540 }, { "epoch": 1.35, "grad_norm": 0.9852685928344727, "learning_rate": 8.685661002900954e-05, "loss": 1.2043, "step": 6550 }, { "epoch": 1.35, "grad_norm": 1.206588864326477, "learning_rate": 8.683588893493577e-05, "loss": 1.2132, "step": 6560 }, { "epoch": 1.36, "grad_norm": 1.1841477155685425, "learning_rate": 8.6815167840862e-05, "loss": 1.241, "step": 6570 }, { "epoch": 1.36, "grad_norm": 1.1257410049438477, "learning_rate": 8.679444674678824e-05, "loss": 1.2241, "step": 6580 }, { "epoch": 1.36, "grad_norm": 1.053404688835144, "learning_rate": 8.677372565271446e-05, "loss": 1.2409, "step": 6590 }, { "epoch": 1.36, "grad_norm": 1.109271764755249, "learning_rate": 8.675300455864071e-05, "loss": 1.2036, "step": 6600 }, { "epoch": 1.36, "grad_norm": 1.0103741884231567, "learning_rate": 8.673228346456693e-05, "loss": 1.2128, "step": 6610 }, { "epoch": 1.37, "grad_norm": 1.149715542793274, "learning_rate": 8.671156237049317e-05, "loss": 1.208, "step": 6620 }, { "epoch": 1.37, "grad_norm": 1.0523351430892944, "learning_rate": 8.66908412764194e-05, "loss": 1.2146, "step": 6630 }, { "epoch": 1.37, "grad_norm": 1.2250432968139648, "learning_rate": 8.667012018234562e-05, "loss": 1.2454, "step": 6640 }, { "epoch": 1.37, "grad_norm": 0.9975298643112183, "learning_rate": 8.664939908827186e-05, "loss": 1.228, "step": 6650 }, { "epoch": 1.37, "grad_norm": 1.0323069095611572, "learning_rate": 8.66286779941981e-05, "loss": 1.2223, "step": 6660 }, { "epoch": 1.38, "grad_norm": 1.0941072702407837, "learning_rate": 8.660795690012433e-05, "loss": 1.2214, "step": 6670 }, { "epoch": 1.38, "grad_norm": 1.2442706823349, "learning_rate": 8.658723580605056e-05, "loss": 1.2316, "step": 6680 }, { "epoch": 1.38, "grad_norm": 1.1723049879074097, "learning_rate": 8.65665147119768e-05, "loss": 1.2184, "step": 6690 }, { "epoch": 1.38, "grad_norm": 1.1901644468307495, "learning_rate": 8.654579361790302e-05, "loss": 1.2382, "step": 6700 }, { "epoch": 1.38, "grad_norm": 1.1003718376159668, "learning_rate": 8.652507252382927e-05, "loss": 1.2272, "step": 6710 }, { "epoch": 1.39, "grad_norm": 1.1174200773239136, "learning_rate": 8.650435142975549e-05, "loss": 1.2248, "step": 6720 }, { "epoch": 1.39, "grad_norm": 1.2489265203475952, "learning_rate": 8.648363033568173e-05, "loss": 1.2252, "step": 6730 }, { "epoch": 1.39, "grad_norm": 1.2472732067108154, "learning_rate": 8.646290924160796e-05, "loss": 1.227, "step": 6740 }, { "epoch": 1.39, "grad_norm": 1.0011487007141113, "learning_rate": 8.64421881475342e-05, "loss": 1.2267, "step": 6750 }, { "epoch": 1.39, "grad_norm": 1.060117244720459, "learning_rate": 8.642146705346042e-05, "loss": 1.2263, "step": 6760 }, { "epoch": 1.4, "grad_norm": 1.102454423904419, "learning_rate": 8.640074595938667e-05, "loss": 1.2205, "step": 6770 }, { "epoch": 1.4, "grad_norm": 1.0496042966842651, "learning_rate": 8.638002486531289e-05, "loss": 1.2207, "step": 6780 }, { "epoch": 1.4, "grad_norm": 1.005552053451538, "learning_rate": 8.635930377123912e-05, "loss": 1.1992, "step": 6790 }, { "epoch": 1.4, "grad_norm": 1.0828580856323242, "learning_rate": 8.633858267716536e-05, "loss": 1.2164, "step": 6800 }, { "epoch": 1.41, "grad_norm": 1.0597617626190186, "learning_rate": 8.631786158309158e-05, "loss": 1.2081, "step": 6810 }, { "epoch": 1.41, "grad_norm": 1.1591441631317139, "learning_rate": 8.629714048901783e-05, "loss": 1.2315, "step": 6820 }, { "epoch": 1.41, "grad_norm": 1.1969908475875854, "learning_rate": 8.627641939494405e-05, "loss": 1.2307, "step": 6830 }, { "epoch": 1.41, "grad_norm": 1.1163939237594604, "learning_rate": 8.625569830087029e-05, "loss": 1.2272, "step": 6840 }, { "epoch": 1.41, "grad_norm": 1.047241449356079, "learning_rate": 8.623497720679652e-05, "loss": 1.1991, "step": 6850 }, { "epoch": 1.42, "grad_norm": 1.0410964488983154, "learning_rate": 8.621425611272276e-05, "loss": 1.211, "step": 6860 }, { "epoch": 1.42, "grad_norm": 1.0674628019332886, "learning_rate": 8.619353501864899e-05, "loss": 1.2195, "step": 6870 }, { "epoch": 1.42, "grad_norm": 0.9892363548278809, "learning_rate": 8.617281392457523e-05, "loss": 1.2344, "step": 6880 }, { "epoch": 1.42, "grad_norm": 1.08130943775177, "learning_rate": 8.615209283050145e-05, "loss": 1.2299, "step": 6890 }, { "epoch": 1.42, "grad_norm": 1.0285786390304565, "learning_rate": 8.613137173642768e-05, "loss": 1.2166, "step": 6900 }, { "epoch": 1.43, "grad_norm": 1.0203038454055786, "learning_rate": 8.611065064235392e-05, "loss": 1.2043, "step": 6910 }, { "epoch": 1.43, "grad_norm": 1.17252516746521, "learning_rate": 8.608992954828015e-05, "loss": 1.2278, "step": 6920 }, { "epoch": 1.43, "grad_norm": 1.233001708984375, "learning_rate": 8.606920845420639e-05, "loss": 1.233, "step": 6930 }, { "epoch": 1.43, "grad_norm": 1.0858713388442993, "learning_rate": 8.604848736013262e-05, "loss": 1.2174, "step": 6940 }, { "epoch": 1.43, "grad_norm": 1.069405198097229, "learning_rate": 8.602776626605885e-05, "loss": 1.2176, "step": 6950 }, { "epoch": 1.44, "grad_norm": 1.0860111713409424, "learning_rate": 8.60070451719851e-05, "loss": 1.2179, "step": 6960 }, { "epoch": 1.44, "grad_norm": 1.111624836921692, "learning_rate": 8.598632407791132e-05, "loss": 1.2167, "step": 6970 }, { "epoch": 1.44, "grad_norm": 1.092925786972046, "learning_rate": 8.596560298383755e-05, "loss": 1.2154, "step": 6980 }, { "epoch": 1.44, "grad_norm": 1.115460753440857, "learning_rate": 8.594488188976379e-05, "loss": 1.2052, "step": 6990 }, { "epoch": 1.44, "grad_norm": 1.229777455329895, "learning_rate": 8.592416079569001e-05, "loss": 1.1769, "step": 7000 }, { "epoch": 1.45, "grad_norm": 1.1182063817977905, "learning_rate": 8.590343970161626e-05, "loss": 1.2035, "step": 7010 }, { "epoch": 1.45, "grad_norm": 1.0315207242965698, "learning_rate": 8.588271860754248e-05, "loss": 1.2158, "step": 7020 }, { "epoch": 1.45, "grad_norm": 1.1051239967346191, "learning_rate": 8.586199751346871e-05, "loss": 1.1874, "step": 7030 }, { "epoch": 1.45, "grad_norm": 1.0437036752700806, "learning_rate": 8.584127641939495e-05, "loss": 1.2421, "step": 7040 }, { "epoch": 1.45, "grad_norm": 1.1060231924057007, "learning_rate": 8.582055532532118e-05, "loss": 1.2392, "step": 7050 }, { "epoch": 1.46, "grad_norm": 1.0951759815216064, "learning_rate": 8.57998342312474e-05, "loss": 1.2348, "step": 7060 }, { "epoch": 1.46, "grad_norm": 1.7351630926132202, "learning_rate": 8.577911313717365e-05, "loss": 1.2198, "step": 7070 }, { "epoch": 1.46, "grad_norm": 1.3410096168518066, "learning_rate": 8.575839204309988e-05, "loss": 1.2069, "step": 7080 }, { "epoch": 1.46, "grad_norm": 1.1723905801773071, "learning_rate": 8.573767094902611e-05, "loss": 1.2264, "step": 7090 }, { "epoch": 1.47, "grad_norm": 1.0400230884552002, "learning_rate": 8.571694985495235e-05, "loss": 1.2285, "step": 7100 }, { "epoch": 1.47, "grad_norm": 1.1555556058883667, "learning_rate": 8.569622876087858e-05, "loss": 1.1982, "step": 7110 }, { "epoch": 1.47, "grad_norm": 1.0779943466186523, "learning_rate": 8.567550766680482e-05, "loss": 1.2241, "step": 7120 }, { "epoch": 1.47, "grad_norm": 1.011435627937317, "learning_rate": 8.565478657273105e-05, "loss": 1.1919, "step": 7130 }, { "epoch": 1.47, "grad_norm": 1.1413905620574951, "learning_rate": 8.563406547865727e-05, "loss": 1.2026, "step": 7140 }, { "epoch": 1.48, "grad_norm": 1.305766224861145, "learning_rate": 8.561334438458351e-05, "loss": 1.2243, "step": 7150 }, { "epoch": 1.48, "grad_norm": 1.1241309642791748, "learning_rate": 8.559262329050974e-05, "loss": 1.2305, "step": 7160 }, { "epoch": 1.48, "grad_norm": 1.1354045867919922, "learning_rate": 8.557190219643596e-05, "loss": 1.2203, "step": 7170 }, { "epoch": 1.48, "grad_norm": 1.0294325351715088, "learning_rate": 8.555118110236221e-05, "loss": 1.2356, "step": 7180 }, { "epoch": 1.48, "grad_norm": 1.1068978309631348, "learning_rate": 8.553046000828844e-05, "loss": 1.2191, "step": 7190 }, { "epoch": 1.49, "grad_norm": 1.0707268714904785, "learning_rate": 8.550973891421467e-05, "loss": 1.2168, "step": 7200 }, { "epoch": 1.49, "grad_norm": 1.158420443534851, "learning_rate": 8.54890178201409e-05, "loss": 1.1924, "step": 7210 }, { "epoch": 1.49, "grad_norm": 1.018847107887268, "learning_rate": 8.546829672606714e-05, "loss": 1.2326, "step": 7220 }, { "epoch": 1.49, "grad_norm": 1.1226823329925537, "learning_rate": 8.544757563199338e-05, "loss": 1.2029, "step": 7230 }, { "epoch": 1.49, "grad_norm": 1.1094051599502563, "learning_rate": 8.542685453791961e-05, "loss": 1.2005, "step": 7240 }, { "epoch": 1.5, "grad_norm": 1.1425297260284424, "learning_rate": 8.540613344384583e-05, "loss": 1.2109, "step": 7250 }, { "epoch": 1.5, "grad_norm": 1.0846836566925049, "learning_rate": 8.538541234977208e-05, "loss": 1.2098, "step": 7260 }, { "epoch": 1.5, "grad_norm": 1.0861196517944336, "learning_rate": 8.53646912556983e-05, "loss": 1.2201, "step": 7270 }, { "epoch": 1.5, "grad_norm": 1.0006380081176758, "learning_rate": 8.534397016162454e-05, "loss": 1.2457, "step": 7280 }, { "epoch": 1.5, "grad_norm": 1.0799134969711304, "learning_rate": 8.532324906755077e-05, "loss": 1.2117, "step": 7290 }, { "epoch": 1.51, "grad_norm": 1.1802451610565186, "learning_rate": 8.530252797347701e-05, "loss": 1.2277, "step": 7300 }, { "epoch": 1.51, "grad_norm": 1.0235154628753662, "learning_rate": 8.528180687940323e-05, "loss": 1.204, "step": 7310 }, { "epoch": 1.51, "grad_norm": 1.1273279190063477, "learning_rate": 8.526108578532946e-05, "loss": 1.2179, "step": 7320 }, { "epoch": 1.51, "grad_norm": 0.9829218983650208, "learning_rate": 8.52403646912557e-05, "loss": 1.1913, "step": 7330 }, { "epoch": 1.51, "grad_norm": 1.1805267333984375, "learning_rate": 8.521964359718194e-05, "loss": 1.1822, "step": 7340 }, { "epoch": 1.52, "grad_norm": 1.026557207107544, "learning_rate": 8.519892250310817e-05, "loss": 1.1912, "step": 7350 }, { "epoch": 1.52, "grad_norm": 1.04379141330719, "learning_rate": 8.517820140903439e-05, "loss": 1.2093, "step": 7360 }, { "epoch": 1.52, "grad_norm": 1.0553088188171387, "learning_rate": 8.515748031496064e-05, "loss": 1.2213, "step": 7370 }, { "epoch": 1.52, "grad_norm": 0.8822417855262756, "learning_rate": 8.513675922088686e-05, "loss": 1.2223, "step": 7380 }, { "epoch": 1.52, "grad_norm": 0.9738419055938721, "learning_rate": 8.51160381268131e-05, "loss": 1.2011, "step": 7390 }, { "epoch": 1.53, "grad_norm": 1.0899471044540405, "learning_rate": 8.509738914214672e-05, "loss": 1.2406, "step": 7400 }, { "epoch": 1.53, "grad_norm": 1.1138182878494263, "learning_rate": 8.507666804807294e-05, "loss": 1.1963, "step": 7410 }, { "epoch": 1.53, "grad_norm": 1.1311627626419067, "learning_rate": 8.505594695399917e-05, "loss": 1.1977, "step": 7420 }, { "epoch": 1.53, "grad_norm": 2.138723134994507, "learning_rate": 8.503522585992541e-05, "loss": 1.2104, "step": 7430 }, { "epoch": 1.54, "grad_norm": 1.0873496532440186, "learning_rate": 8.501450476585164e-05, "loss": 1.2113, "step": 7440 }, { "epoch": 1.54, "grad_norm": 0.9628106355667114, "learning_rate": 8.499378367177787e-05, "loss": 1.2258, "step": 7450 }, { "epoch": 1.54, "grad_norm": 1.1409735679626465, "learning_rate": 8.497306257770411e-05, "loss": 1.1964, "step": 7460 }, { "epoch": 1.54, "grad_norm": 1.111512541770935, "learning_rate": 8.495234148363034e-05, "loss": 1.2061, "step": 7470 }, { "epoch": 1.54, "grad_norm": 1.0905687808990479, "learning_rate": 8.493162038955657e-05, "loss": 1.2025, "step": 7480 }, { "epoch": 1.55, "grad_norm": 1.1417665481567383, "learning_rate": 8.49108992954828e-05, "loss": 1.1861, "step": 7490 }, { "epoch": 1.55, "grad_norm": 1.024594783782959, "learning_rate": 8.489017820140904e-05, "loss": 1.1893, "step": 7500 }, { "epoch": 1.55, "grad_norm": 1.112746238708496, "learning_rate": 8.486945710733528e-05, "loss": 1.2196, "step": 7510 }, { "epoch": 1.55, "grad_norm": 1.1404857635498047, "learning_rate": 8.484873601326151e-05, "loss": 1.2131, "step": 7520 }, { "epoch": 1.55, "grad_norm": 1.1401077508926392, "learning_rate": 8.482801491918773e-05, "loss": 1.2107, "step": 7530 }, { "epoch": 1.56, "grad_norm": 1.2235480546951294, "learning_rate": 8.480729382511397e-05, "loss": 1.2363, "step": 7540 }, { "epoch": 1.56, "grad_norm": 1.0837748050689697, "learning_rate": 8.47865727310402e-05, "loss": 1.2139, "step": 7550 }, { "epoch": 1.56, "grad_norm": 1.22895348072052, "learning_rate": 8.476585163696643e-05, "loss": 1.1998, "step": 7560 }, { "epoch": 1.56, "grad_norm": 1.105093240737915, "learning_rate": 8.474513054289267e-05, "loss": 1.2212, "step": 7570 }, { "epoch": 1.56, "grad_norm": 1.222209095954895, "learning_rate": 8.47244094488189e-05, "loss": 1.2453, "step": 7580 }, { "epoch": 1.57, "grad_norm": 1.1390637159347534, "learning_rate": 8.470368835474513e-05, "loss": 1.2071, "step": 7590 }, { "epoch": 1.57, "grad_norm": 1.0356426239013672, "learning_rate": 8.468296726067137e-05, "loss": 1.2157, "step": 7600 }, { "epoch": 1.57, "grad_norm": 1.0771561861038208, "learning_rate": 8.46622461665976e-05, "loss": 1.212, "step": 7610 }, { "epoch": 1.57, "grad_norm": 1.3607767820358276, "learning_rate": 8.464152507252384e-05, "loss": 1.2253, "step": 7620 }, { "epoch": 1.57, "grad_norm": 1.0732100009918213, "learning_rate": 8.462080397845007e-05, "loss": 1.1938, "step": 7630 }, { "epoch": 1.58, "grad_norm": 1.0551025867462158, "learning_rate": 8.460008288437629e-05, "loss": 1.2163, "step": 7640 }, { "epoch": 1.58, "grad_norm": 1.127244472503662, "learning_rate": 8.457936179030254e-05, "loss": 1.1956, "step": 7650 }, { "epoch": 1.58, "grad_norm": 1.0608160495758057, "learning_rate": 8.455864069622876e-05, "loss": 1.191, "step": 7660 }, { "epoch": 1.58, "grad_norm": 1.0632834434509277, "learning_rate": 8.4537919602155e-05, "loss": 1.1971, "step": 7670 }, { "epoch": 1.58, "grad_norm": 1.1187163591384888, "learning_rate": 8.451719850808123e-05, "loss": 1.2156, "step": 7680 }, { "epoch": 1.59, "grad_norm": 1.135420799255371, "learning_rate": 8.449647741400747e-05, "loss": 1.1953, "step": 7690 }, { "epoch": 1.59, "grad_norm": 1.1564319133758545, "learning_rate": 8.447575631993369e-05, "loss": 1.2169, "step": 7700 }, { "epoch": 1.59, "grad_norm": 1.1321407556533813, "learning_rate": 8.445503522585993e-05, "loss": 1.1869, "step": 7710 }, { "epoch": 1.59, "grad_norm": 1.097676157951355, "learning_rate": 8.443431413178616e-05, "loss": 1.2039, "step": 7720 }, { "epoch": 1.6, "grad_norm": 0.9873398542404175, "learning_rate": 8.44135930377124e-05, "loss": 1.2115, "step": 7730 }, { "epoch": 1.6, "grad_norm": 1.0774983167648315, "learning_rate": 8.439287194363863e-05, "loss": 1.2089, "step": 7740 }, { "epoch": 1.6, "grad_norm": 1.199475884437561, "learning_rate": 8.437215084956485e-05, "loss": 1.1913, "step": 7750 }, { "epoch": 1.6, "grad_norm": 1.2517530918121338, "learning_rate": 8.43514297554911e-05, "loss": 1.24, "step": 7760 }, { "epoch": 1.6, "grad_norm": 1.1117113828659058, "learning_rate": 8.433070866141732e-05, "loss": 1.1907, "step": 7770 }, { "epoch": 1.61, "grad_norm": 1.1518152952194214, "learning_rate": 8.430998756734356e-05, "loss": 1.2194, "step": 7780 }, { "epoch": 1.61, "grad_norm": 1.0633752346038818, "learning_rate": 8.428926647326979e-05, "loss": 1.2173, "step": 7790 }, { "epoch": 1.61, "grad_norm": 1.1065930128097534, "learning_rate": 8.426854537919603e-05, "loss": 1.1803, "step": 7800 }, { "epoch": 1.61, "grad_norm": 1.03267240524292, "learning_rate": 8.424782428512226e-05, "loss": 1.2178, "step": 7810 }, { "epoch": 1.61, "grad_norm": 0.9949610233306885, "learning_rate": 8.42271031910485e-05, "loss": 1.1999, "step": 7820 }, { "epoch": 1.62, "grad_norm": 1.0859277248382568, "learning_rate": 8.420638209697472e-05, "loss": 1.2056, "step": 7830 }, { "epoch": 1.62, "grad_norm": 1.0535070896148682, "learning_rate": 8.418566100290096e-05, "loss": 1.2004, "step": 7840 }, { "epoch": 1.62, "grad_norm": 1.1210662126541138, "learning_rate": 8.416493990882719e-05, "loss": 1.2223, "step": 7850 }, { "epoch": 1.62, "grad_norm": 1.0601907968521118, "learning_rate": 8.414421881475343e-05, "loss": 1.1985, "step": 7860 }, { "epoch": 1.62, "grad_norm": 1.0329095125198364, "learning_rate": 8.412349772067966e-05, "loss": 1.202, "step": 7870 }, { "epoch": 1.63, "grad_norm": 1.1331712007522583, "learning_rate": 8.410277662660588e-05, "loss": 1.1923, "step": 7880 }, { "epoch": 1.63, "grad_norm": 1.1752616167068481, "learning_rate": 8.408205553253212e-05, "loss": 1.195, "step": 7890 }, { "epoch": 1.63, "grad_norm": 1.1065553426742554, "learning_rate": 8.406133443845835e-05, "loss": 1.2173, "step": 7900 }, { "epoch": 1.63, "grad_norm": 1.1917232275009155, "learning_rate": 8.404061334438459e-05, "loss": 1.1926, "step": 7910 }, { "epoch": 1.63, "grad_norm": 1.1937179565429688, "learning_rate": 8.401989225031082e-05, "loss": 1.18, "step": 7920 }, { "epoch": 1.64, "grad_norm": 1.1231236457824707, "learning_rate": 8.399917115623706e-05, "loss": 1.2006, "step": 7930 }, { "epoch": 1.64, "grad_norm": 1.2102551460266113, "learning_rate": 8.397845006216328e-05, "loss": 1.209, "step": 7940 }, { "epoch": 1.64, "grad_norm": 1.0514934062957764, "learning_rate": 8.395772896808953e-05, "loss": 1.1959, "step": 7950 }, { "epoch": 1.64, "grad_norm": 1.0771079063415527, "learning_rate": 8.393700787401575e-05, "loss": 1.2303, "step": 7960 }, { "epoch": 1.64, "grad_norm": 1.1473889350891113, "learning_rate": 8.391628677994198e-05, "loss": 1.189, "step": 7970 }, { "epoch": 1.65, "grad_norm": 1.101943850517273, "learning_rate": 8.389556568586822e-05, "loss": 1.1792, "step": 7980 }, { "epoch": 1.65, "grad_norm": 1.217653751373291, "learning_rate": 8.387484459179446e-05, "loss": 1.2032, "step": 7990 }, { "epoch": 1.65, "grad_norm": 1.1558287143707275, "learning_rate": 8.385412349772068e-05, "loss": 1.2048, "step": 8000 }, { "epoch": 1.65, "grad_norm": 1.1772674322128296, "learning_rate": 8.383340240364693e-05, "loss": 1.208, "step": 8010 }, { "epoch": 1.65, "grad_norm": 1.1206742525100708, "learning_rate": 8.381268130957315e-05, "loss": 1.2158, "step": 8020 }, { "epoch": 1.66, "grad_norm": 1.0629864931106567, "learning_rate": 8.379196021549938e-05, "loss": 1.1989, "step": 8030 }, { "epoch": 1.66, "grad_norm": 1.0601340532302856, "learning_rate": 8.377123912142562e-05, "loss": 1.2149, "step": 8040 }, { "epoch": 1.66, "grad_norm": 1.2142128944396973, "learning_rate": 8.375051802735184e-05, "loss": 1.2045, "step": 8050 }, { "epoch": 1.66, "grad_norm": 1.015764832496643, "learning_rate": 8.372979693327809e-05, "loss": 1.2177, "step": 8060 }, { "epoch": 1.67, "grad_norm": 1.1525570154190063, "learning_rate": 8.370907583920431e-05, "loss": 1.2203, "step": 8070 }, { "epoch": 1.67, "grad_norm": 1.0959409475326538, "learning_rate": 8.368835474513054e-05, "loss": 1.2022, "step": 8080 }, { "epoch": 1.67, "grad_norm": 1.073423981666565, "learning_rate": 8.366763365105678e-05, "loss": 1.2225, "step": 8090 }, { "epoch": 1.67, "grad_norm": 1.1058803796768188, "learning_rate": 8.364691255698301e-05, "loss": 1.202, "step": 8100 }, { "epoch": 1.67, "grad_norm": 1.067132592201233, "learning_rate": 8.362619146290924e-05, "loss": 1.2215, "step": 8110 }, { "epoch": 1.68, "grad_norm": 1.1303818225860596, "learning_rate": 8.360547036883548e-05, "loss": 1.2049, "step": 8120 }, { "epoch": 1.68, "grad_norm": 1.1500287055969238, "learning_rate": 8.35847492747617e-05, "loss": 1.2241, "step": 8130 }, { "epoch": 1.68, "grad_norm": 0.9954307675361633, "learning_rate": 8.356402818068794e-05, "loss": 1.1806, "step": 8140 }, { "epoch": 1.68, "grad_norm": 1.0905861854553223, "learning_rate": 8.354330708661418e-05, "loss": 1.2108, "step": 8150 }, { "epoch": 1.68, "grad_norm": 1.1111913919448853, "learning_rate": 8.352258599254041e-05, "loss": 1.2026, "step": 8160 }, { "epoch": 1.69, "grad_norm": 0.9808979034423828, "learning_rate": 8.350186489846665e-05, "loss": 1.2101, "step": 8170 }, { "epoch": 1.69, "grad_norm": 1.127181053161621, "learning_rate": 8.348114380439288e-05, "loss": 1.1955, "step": 8180 }, { "epoch": 1.69, "grad_norm": 1.0933669805526733, "learning_rate": 8.34604227103191e-05, "loss": 1.1904, "step": 8190 }, { "epoch": 1.69, "grad_norm": 1.2010207176208496, "learning_rate": 8.343970161624535e-05, "loss": 1.2018, "step": 8200 }, { "epoch": 1.69, "grad_norm": 1.019642949104309, "learning_rate": 8.341898052217157e-05, "loss": 1.221, "step": 8210 }, { "epoch": 1.7, "grad_norm": 1.115064024925232, "learning_rate": 8.339825942809781e-05, "loss": 1.193, "step": 8220 }, { "epoch": 1.7, "grad_norm": 1.008520245552063, "learning_rate": 8.337753833402404e-05, "loss": 1.222, "step": 8230 }, { "epoch": 1.7, "grad_norm": 1.0627065896987915, "learning_rate": 8.335681723995027e-05, "loss": 1.1968, "step": 8240 }, { "epoch": 1.7, "grad_norm": 1.2253535985946655, "learning_rate": 8.33360961458765e-05, "loss": 1.2135, "step": 8250 }, { "epoch": 1.7, "grad_norm": 1.0848592519760132, "learning_rate": 8.331537505180274e-05, "loss": 1.2087, "step": 8260 }, { "epoch": 1.71, "grad_norm": 1.1441541910171509, "learning_rate": 8.329465395772897e-05, "loss": 1.1789, "step": 8270 }, { "epoch": 1.71, "grad_norm": 1.1108758449554443, "learning_rate": 8.32739328636552e-05, "loss": 1.1778, "step": 8280 }, { "epoch": 1.71, "grad_norm": 1.1133983135223389, "learning_rate": 8.325321176958144e-05, "loss": 1.2062, "step": 8290 }, { "epoch": 1.71, "grad_norm": 1.1412779092788696, "learning_rate": 8.323249067550766e-05, "loss": 1.2129, "step": 8300 }, { "epoch": 1.71, "grad_norm": 1.0338503122329712, "learning_rate": 8.321176958143391e-05, "loss": 1.1727, "step": 8310 }, { "epoch": 1.72, "grad_norm": 1.1191462278366089, "learning_rate": 8.319104848736013e-05, "loss": 1.1997, "step": 8320 }, { "epoch": 1.72, "grad_norm": 1.0909239053726196, "learning_rate": 8.317032739328637e-05, "loss": 1.2144, "step": 8330 }, { "epoch": 1.72, "grad_norm": 1.1151865720748901, "learning_rate": 8.31496062992126e-05, "loss": 1.2033, "step": 8340 }, { "epoch": 1.72, "grad_norm": 1.143604040145874, "learning_rate": 8.312888520513884e-05, "loss": 1.2088, "step": 8350 }, { "epoch": 1.73, "grad_norm": 1.2317973375320435, "learning_rate": 8.310816411106507e-05, "loss": 1.2179, "step": 8360 }, { "epoch": 1.73, "grad_norm": 1.1043517589569092, "learning_rate": 8.308744301699131e-05, "loss": 1.1913, "step": 8370 }, { "epoch": 1.73, "grad_norm": 1.149396300315857, "learning_rate": 8.306672192291753e-05, "loss": 1.2066, "step": 8380 }, { "epoch": 1.73, "grad_norm": 1.0468456745147705, "learning_rate": 8.304600082884377e-05, "loss": 1.1823, "step": 8390 }, { "epoch": 1.73, "grad_norm": 1.0730814933776855, "learning_rate": 8.302527973477e-05, "loss": 1.2053, "step": 8400 }, { "epoch": 1.74, "grad_norm": 1.2069722414016724, "learning_rate": 8.300455864069622e-05, "loss": 1.1999, "step": 8410 }, { "epoch": 1.74, "grad_norm": 1.0964728593826294, "learning_rate": 8.298383754662247e-05, "loss": 1.1927, "step": 8420 }, { "epoch": 1.74, "grad_norm": 1.0547986030578613, "learning_rate": 8.296311645254869e-05, "loss": 1.2198, "step": 8430 }, { "epoch": 1.74, "grad_norm": 0.9480800628662109, "learning_rate": 8.294239535847493e-05, "loss": 1.1673, "step": 8440 }, { "epoch": 1.74, "grad_norm": 1.042935848236084, "learning_rate": 8.292167426440116e-05, "loss": 1.211, "step": 8450 }, { "epoch": 1.75, "grad_norm": 1.1812864542007446, "learning_rate": 8.29009531703274e-05, "loss": 1.1775, "step": 8460 }, { "epoch": 1.75, "grad_norm": 1.0850603580474854, "learning_rate": 8.288023207625363e-05, "loss": 1.2023, "step": 8470 }, { "epoch": 1.75, "grad_norm": 1.0494657754898071, "learning_rate": 8.285951098217987e-05, "loss": 1.1897, "step": 8480 }, { "epoch": 1.75, "grad_norm": 1.2138115167617798, "learning_rate": 8.283878988810609e-05, "loss": 1.1691, "step": 8490 }, { "epoch": 1.75, "grad_norm": 1.0943918228149414, "learning_rate": 8.281806879403234e-05, "loss": 1.1972, "step": 8500 }, { "epoch": 1.76, "grad_norm": 1.0463635921478271, "learning_rate": 8.279734769995856e-05, "loss": 1.2312, "step": 8510 }, { "epoch": 1.76, "grad_norm": 1.1013418436050415, "learning_rate": 8.27766266058848e-05, "loss": 1.2311, "step": 8520 }, { "epoch": 1.76, "grad_norm": 0.9996619820594788, "learning_rate": 8.275590551181103e-05, "loss": 1.2129, "step": 8530 }, { "epoch": 1.76, "grad_norm": 1.0416114330291748, "learning_rate": 8.273518441773727e-05, "loss": 1.1822, "step": 8540 }, { "epoch": 1.76, "grad_norm": 1.0862529277801514, "learning_rate": 8.271446332366349e-05, "loss": 1.2145, "step": 8550 }, { "epoch": 1.77, "grad_norm": 1.1965521574020386, "learning_rate": 8.269374222958972e-05, "loss": 1.1969, "step": 8560 }, { "epoch": 1.77, "grad_norm": 1.1601965427398682, "learning_rate": 8.267302113551596e-05, "loss": 1.2112, "step": 8570 }, { "epoch": 1.77, "grad_norm": 1.0845617055892944, "learning_rate": 8.265230004144219e-05, "loss": 1.1885, "step": 8580 }, { "epoch": 1.77, "grad_norm": 1.0907461643218994, "learning_rate": 8.263157894736843e-05, "loss": 1.2091, "step": 8590 }, { "epoch": 1.77, "grad_norm": 1.027777075767517, "learning_rate": 8.261085785329465e-05, "loss": 1.1919, "step": 8600 }, { "epoch": 1.78, "grad_norm": 1.2468430995941162, "learning_rate": 8.25901367592209e-05, "loss": 1.1948, "step": 8610 }, { "epoch": 1.78, "grad_norm": 1.1889891624450684, "learning_rate": 8.256941566514712e-05, "loss": 1.2143, "step": 8620 }, { "epoch": 1.78, "grad_norm": 1.2794381380081177, "learning_rate": 8.254869457107336e-05, "loss": 1.2132, "step": 8630 }, { "epoch": 1.78, "grad_norm": 1.0493546724319458, "learning_rate": 8.252797347699959e-05, "loss": 1.1766, "step": 8640 }, { "epoch": 1.78, "grad_norm": 1.001658320426941, "learning_rate": 8.250725238292583e-05, "loss": 1.1715, "step": 8650 }, { "epoch": 1.79, "grad_norm": 1.06058669090271, "learning_rate": 8.248653128885205e-05, "loss": 1.1793, "step": 8660 }, { "epoch": 1.79, "grad_norm": 1.09765625, "learning_rate": 8.24658101947783e-05, "loss": 1.1954, "step": 8670 }, { "epoch": 1.79, "grad_norm": 1.0230962038040161, "learning_rate": 8.244508910070452e-05, "loss": 1.2158, "step": 8680 }, { "epoch": 1.79, "grad_norm": 1.028173804283142, "learning_rate": 8.242436800663075e-05, "loss": 1.1874, "step": 8690 }, { "epoch": 1.8, "grad_norm": 1.0620988607406616, "learning_rate": 8.240364691255699e-05, "loss": 1.2101, "step": 8700 }, { "epoch": 1.8, "grad_norm": 1.0968023538589478, "learning_rate": 8.238292581848322e-05, "loss": 1.2131, "step": 8710 }, { "epoch": 1.8, "grad_norm": 1.1402392387390137, "learning_rate": 8.236220472440946e-05, "loss": 1.2173, "step": 8720 }, { "epoch": 1.8, "grad_norm": 1.1263937950134277, "learning_rate": 8.234148363033568e-05, "loss": 1.1796, "step": 8730 }, { "epoch": 1.8, "grad_norm": 1.1383157968521118, "learning_rate": 8.232076253626191e-05, "loss": 1.1724, "step": 8740 }, { "epoch": 1.81, "grad_norm": 1.255072832107544, "learning_rate": 8.230004144218815e-05, "loss": 1.2019, "step": 8750 }, { "epoch": 1.81, "grad_norm": 1.4536107778549194, "learning_rate": 8.227932034811438e-05, "loss": 1.189, "step": 8760 }, { "epoch": 1.81, "grad_norm": 1.0804840326309204, "learning_rate": 8.225859925404062e-05, "loss": 1.2009, "step": 8770 }, { "epoch": 1.81, "grad_norm": 1.1545298099517822, "learning_rate": 8.223787815996686e-05, "loss": 1.2047, "step": 8780 }, { "epoch": 1.81, "grad_norm": 1.2308061122894287, "learning_rate": 8.221715706589308e-05, "loss": 1.1958, "step": 8790 }, { "epoch": 1.82, "grad_norm": 1.2053345441818237, "learning_rate": 8.219643597181933e-05, "loss": 1.183, "step": 8800 }, { "epoch": 1.82, "grad_norm": 1.2016477584838867, "learning_rate": 8.217571487774555e-05, "loss": 1.1831, "step": 8810 }, { "epoch": 1.82, "grad_norm": 1.2991312742233276, "learning_rate": 8.215499378367178e-05, "loss": 1.1893, "step": 8820 }, { "epoch": 1.82, "grad_norm": 1.0688191652297974, "learning_rate": 8.213427268959802e-05, "loss": 1.1962, "step": 8830 }, { "epoch": 1.82, "grad_norm": 1.0042345523834229, "learning_rate": 8.211355159552425e-05, "loss": 1.1935, "step": 8840 }, { "epoch": 1.83, "grad_norm": 1.1283092498779297, "learning_rate": 8.209283050145047e-05, "loss": 1.1983, "step": 8850 }, { "epoch": 1.83, "grad_norm": 1.1149276494979858, "learning_rate": 8.207210940737672e-05, "loss": 1.1967, "step": 8860 }, { "epoch": 1.83, "grad_norm": 1.0818403959274292, "learning_rate": 8.205138831330294e-05, "loss": 1.187, "step": 8870 }, { "epoch": 1.83, "grad_norm": 1.1263095140457153, "learning_rate": 8.203066721922918e-05, "loss": 1.1866, "step": 8880 }, { "epoch": 1.83, "grad_norm": 1.0879017114639282, "learning_rate": 8.200994612515541e-05, "loss": 1.2018, "step": 8890 }, { "epoch": 1.84, "grad_norm": 1.1448615789413452, "learning_rate": 8.198922503108164e-05, "loss": 1.1879, "step": 8900 }, { "epoch": 1.84, "grad_norm": 0.9900506138801575, "learning_rate": 8.196850393700788e-05, "loss": 1.1985, "step": 8910 }, { "epoch": 1.84, "grad_norm": 1.0438898801803589, "learning_rate": 8.19477828429341e-05, "loss": 1.1881, "step": 8920 }, { "epoch": 1.84, "grad_norm": 1.1896075010299683, "learning_rate": 8.192706174886034e-05, "loss": 1.2022, "step": 8930 }, { "epoch": 1.84, "grad_norm": 1.152300238609314, "learning_rate": 8.190634065478658e-05, "loss": 1.1828, "step": 8940 }, { "epoch": 1.85, "grad_norm": 1.0616978406906128, "learning_rate": 8.188561956071281e-05, "loss": 1.1788, "step": 8950 }, { "epoch": 1.85, "grad_norm": 1.0310215950012207, "learning_rate": 8.186489846663903e-05, "loss": 1.1898, "step": 8960 }, { "epoch": 1.85, "grad_norm": 1.1227622032165527, "learning_rate": 8.184417737256528e-05, "loss": 1.1936, "step": 8970 }, { "epoch": 1.85, "grad_norm": 1.0663117170333862, "learning_rate": 8.18234562784915e-05, "loss": 1.1857, "step": 8980 }, { "epoch": 1.86, "grad_norm": 1.202330470085144, "learning_rate": 8.180273518441774e-05, "loss": 1.1811, "step": 8990 }, { "epoch": 1.86, "grad_norm": 1.036596417427063, "learning_rate": 8.178201409034397e-05, "loss": 1.2013, "step": 9000 }, { "epoch": 1.86, "grad_norm": 1.0503324270248413, "learning_rate": 8.176129299627021e-05, "loss": 1.1755, "step": 9010 }, { "epoch": 1.86, "grad_norm": 1.0510581731796265, "learning_rate": 8.174057190219644e-05, "loss": 1.1936, "step": 9020 }, { "epoch": 1.86, "grad_norm": 1.0580915212631226, "learning_rate": 8.171985080812268e-05, "loss": 1.2006, "step": 9030 }, { "epoch": 1.87, "grad_norm": 1.0723899602890015, "learning_rate": 8.16991297140489e-05, "loss": 1.1841, "step": 9040 }, { "epoch": 1.87, "grad_norm": 1.0289912223815918, "learning_rate": 8.167840861997515e-05, "loss": 1.1762, "step": 9050 }, { "epoch": 1.87, "grad_norm": 1.2664040327072144, "learning_rate": 8.165768752590137e-05, "loss": 1.1695, "step": 9060 }, { "epoch": 1.87, "grad_norm": 1.0689359903335571, "learning_rate": 8.163696643182759e-05, "loss": 1.1888, "step": 9070 }, { "epoch": 1.87, "grad_norm": 1.131007194519043, "learning_rate": 8.161624533775384e-05, "loss": 1.1856, "step": 9080 }, { "epoch": 1.88, "grad_norm": 1.06068754196167, "learning_rate": 8.159552424368006e-05, "loss": 1.2055, "step": 9090 }, { "epoch": 1.88, "grad_norm": 1.0843102931976318, "learning_rate": 8.15748031496063e-05, "loss": 1.1898, "step": 9100 }, { "epoch": 1.88, "grad_norm": 1.141878366470337, "learning_rate": 8.155408205553253e-05, "loss": 1.1885, "step": 9110 }, { "epoch": 1.88, "grad_norm": 1.1082584857940674, "learning_rate": 8.153336096145877e-05, "loss": 1.1843, "step": 9120 }, { "epoch": 1.88, "grad_norm": 1.1285297870635986, "learning_rate": 8.1512639867385e-05, "loss": 1.2126, "step": 9130 }, { "epoch": 1.89, "grad_norm": 1.2701120376586914, "learning_rate": 8.149191877331124e-05, "loss": 1.1816, "step": 9140 }, { "epoch": 1.89, "grad_norm": 1.1763367652893066, "learning_rate": 8.147119767923746e-05, "loss": 1.2031, "step": 9150 }, { "epoch": 1.89, "grad_norm": 1.0942009687423706, "learning_rate": 8.145047658516371e-05, "loss": 1.1922, "step": 9160 }, { "epoch": 1.89, "grad_norm": 1.1373592615127563, "learning_rate": 8.142975549108993e-05, "loss": 1.1891, "step": 9170 }, { "epoch": 1.89, "grad_norm": 1.0547840595245361, "learning_rate": 8.140903439701617e-05, "loss": 1.2119, "step": 9180 }, { "epoch": 1.9, "grad_norm": 0.973156213760376, "learning_rate": 8.13883133029424e-05, "loss": 1.1929, "step": 9190 }, { "epoch": 1.9, "grad_norm": 1.07832670211792, "learning_rate": 8.136759220886864e-05, "loss": 1.1693, "step": 9200 }, { "epoch": 1.9, "grad_norm": 1.173492670059204, "learning_rate": 8.134687111479487e-05, "loss": 1.1998, "step": 9210 }, { "epoch": 1.9, "grad_norm": 1.161414623260498, "learning_rate": 8.132615002072109e-05, "loss": 1.1769, "step": 9220 }, { "epoch": 1.9, "grad_norm": 1.1352944374084473, "learning_rate": 8.130542892664733e-05, "loss": 1.1837, "step": 9230 }, { "epoch": 1.91, "grad_norm": 1.102401852607727, "learning_rate": 8.128470783257356e-05, "loss": 1.175, "step": 9240 }, { "epoch": 1.91, "grad_norm": 1.2963926792144775, "learning_rate": 8.12639867384998e-05, "loss": 1.1782, "step": 9250 }, { "epoch": 1.91, "grad_norm": 1.0536015033721924, "learning_rate": 8.124326564442602e-05, "loss": 1.1799, "step": 9260 }, { "epoch": 1.91, "grad_norm": 1.1193984746932983, "learning_rate": 8.122254455035227e-05, "loss": 1.1729, "step": 9270 }, { "epoch": 1.91, "grad_norm": 1.1112126111984253, "learning_rate": 8.120182345627849e-05, "loss": 1.168, "step": 9280 }, { "epoch": 1.92, "grad_norm": 1.0647683143615723, "learning_rate": 8.118110236220473e-05, "loss": 1.2057, "step": 9290 }, { "epoch": 1.92, "grad_norm": 1.1709637641906738, "learning_rate": 8.116038126813096e-05, "loss": 1.1757, "step": 9300 }, { "epoch": 1.92, "grad_norm": 1.0774716138839722, "learning_rate": 8.11396601740572e-05, "loss": 1.1784, "step": 9310 }, { "epoch": 1.92, "grad_norm": 1.088477373123169, "learning_rate": 8.111893907998343e-05, "loss": 1.1726, "step": 9320 }, { "epoch": 1.93, "grad_norm": 1.0596317052841187, "learning_rate": 8.109821798590967e-05, "loss": 1.2029, "step": 9330 }, { "epoch": 1.93, "grad_norm": 1.1406421661376953, "learning_rate": 8.107749689183589e-05, "loss": 1.2154, "step": 9340 }, { "epoch": 1.93, "grad_norm": 1.1345916986465454, "learning_rate": 8.105677579776214e-05, "loss": 1.1684, "step": 9350 }, { "epoch": 1.93, "grad_norm": 1.2776840925216675, "learning_rate": 8.103605470368836e-05, "loss": 1.1915, "step": 9360 }, { "epoch": 1.93, "grad_norm": 1.1104062795639038, "learning_rate": 8.101533360961459e-05, "loss": 1.1935, "step": 9370 }, { "epoch": 1.94, "grad_norm": 1.0262198448181152, "learning_rate": 8.099461251554083e-05, "loss": 1.2047, "step": 9380 }, { "epoch": 1.94, "grad_norm": 1.1350001096725464, "learning_rate": 8.097389142146705e-05, "loss": 1.1798, "step": 9390 }, { "epoch": 1.94, "grad_norm": 1.0631601810455322, "learning_rate": 8.095317032739328e-05, "loss": 1.1744, "step": 9400 }, { "epoch": 1.94, "grad_norm": 1.1678036451339722, "learning_rate": 8.093244923331952e-05, "loss": 1.208, "step": 9410 }, { "epoch": 1.94, "grad_norm": 1.0665799379348755, "learning_rate": 8.091172813924576e-05, "loss": 1.1792, "step": 9420 }, { "epoch": 1.95, "grad_norm": 1.1364054679870605, "learning_rate": 8.089100704517199e-05, "loss": 1.2036, "step": 9430 }, { "epoch": 1.95, "grad_norm": 1.0586035251617432, "learning_rate": 8.087028595109823e-05, "loss": 1.201, "step": 9440 }, { "epoch": 1.95, "grad_norm": 0.9820613861083984, "learning_rate": 8.084956485702445e-05, "loss": 1.1912, "step": 9450 }, { "epoch": 1.95, "grad_norm": 1.0832290649414062, "learning_rate": 8.08288437629507e-05, "loss": 1.2044, "step": 9460 }, { "epoch": 1.95, "grad_norm": 1.2247551679611206, "learning_rate": 8.080812266887692e-05, "loss": 1.1613, "step": 9470 }, { "epoch": 1.96, "grad_norm": 1.0764803886413574, "learning_rate": 8.078740157480315e-05, "loss": 1.1885, "step": 9480 }, { "epoch": 1.96, "grad_norm": 1.1871037483215332, "learning_rate": 8.076668048072939e-05, "loss": 1.2089, "step": 9490 }, { "epoch": 1.96, "grad_norm": 1.0709806680679321, "learning_rate": 8.074595938665562e-05, "loss": 1.1743, "step": 9500 }, { "epoch": 1.96, "grad_norm": 1.2077934741973877, "learning_rate": 8.072523829258184e-05, "loss": 1.192, "step": 9510 }, { "epoch": 1.96, "grad_norm": 1.111323356628418, "learning_rate": 8.070451719850809e-05, "loss": 1.178, "step": 9520 }, { "epoch": 1.97, "grad_norm": 1.1351996660232544, "learning_rate": 8.06858682138417e-05, "loss": 1.1921, "step": 9530 }, { "epoch": 1.97, "grad_norm": 1.1319098472595215, "learning_rate": 8.066514711976792e-05, "loss": 1.2069, "step": 9540 }, { "epoch": 1.97, "grad_norm": 1.080480694770813, "learning_rate": 8.064442602569417e-05, "loss": 1.1949, "step": 9550 }, { "epoch": 1.97, "grad_norm": 1.126483678817749, "learning_rate": 8.062370493162039e-05, "loss": 1.1523, "step": 9560 }, { "epoch": 1.97, "grad_norm": 1.0840978622436523, "learning_rate": 8.060298383754663e-05, "loss": 1.2074, "step": 9570 }, { "epoch": 1.98, "grad_norm": 1.1075447797775269, "learning_rate": 8.058226274347286e-05, "loss": 1.1937, "step": 9580 }, { "epoch": 1.98, "grad_norm": 1.0781664848327637, "learning_rate": 8.05615416493991e-05, "loss": 1.1837, "step": 9590 }, { "epoch": 1.98, "grad_norm": 1.0375909805297852, "learning_rate": 8.054082055532532e-05, "loss": 1.1928, "step": 9600 }, { "epoch": 1.98, "grad_norm": 1.101149320602417, "learning_rate": 8.052009946125155e-05, "loss": 1.2006, "step": 9610 }, { "epoch": 1.99, "grad_norm": 0.9537743926048279, "learning_rate": 8.049937836717779e-05, "loss": 1.1939, "step": 9620 }, { "epoch": 1.99, "grad_norm": 1.1106035709381104, "learning_rate": 8.047865727310402e-05, "loss": 1.1874, "step": 9630 }, { "epoch": 1.99, "grad_norm": 1.1406303644180298, "learning_rate": 8.045793617903026e-05, "loss": 1.1885, "step": 9640 }, { "epoch": 1.99, "grad_norm": 1.0510179996490479, "learning_rate": 8.043721508495648e-05, "loss": 1.1831, "step": 9650 }, { "epoch": 1.99, "grad_norm": 1.1432597637176514, "learning_rate": 8.041649399088273e-05, "loss": 1.1746, "step": 9660 }, { "epoch": 2.0, "grad_norm": 1.0710557699203491, "learning_rate": 8.039577289680895e-05, "loss": 1.1668, "step": 9670 }, { "epoch": 2.0, "grad_norm": 1.067130446434021, "learning_rate": 8.037505180273519e-05, "loss": 1.1895, "step": 9680 }, { "epoch": 2.0, "grad_norm": 1.1639565229415894, "learning_rate": 8.035433070866142e-05, "loss": 1.1786, "step": 9690 }, { "epoch": 2.0, "eval_loss": 1.2795084714889526, "eval_runtime": 1604.6697, "eval_samples_per_second": 262.864, "eval_steps_per_second": 4.107, "step": 9692 } ], "logging_steps": 10, "max_steps": 48460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.1335523730815713e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }