{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999939429547479, "eval_steps": 2000, "global_step": 20637, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00048456362017480634, "grad_norm": 1.9635685682296753, "learning_rate": 8.064516129032258e-07, "loss": 3.9647, "num_input_tokens_seen": 396856, "step": 10 }, { "epoch": 0.0009691272403496127, "grad_norm": 1.8676133155822754, "learning_rate": 1.6129032258064516e-06, "loss": 3.8773, "num_input_tokens_seen": 797088, "step": 20 }, { "epoch": 0.001453690860524419, "grad_norm": 1.3513129949569702, "learning_rate": 2.4193548387096776e-06, "loss": 3.7214, "num_input_tokens_seen": 1195348, "step": 30 }, { "epoch": 0.0019382544806992254, "grad_norm": 1.085821270942688, "learning_rate": 3.225806451612903e-06, "loss": 3.608, "num_input_tokens_seen": 1563616, "step": 40 }, { "epoch": 0.0024228181008740315, "grad_norm": 0.7733827233314514, "learning_rate": 4.032258064516129e-06, "loss": 3.4798, "num_input_tokens_seen": 1953880, "step": 50 }, { "epoch": 0.002907381721048838, "grad_norm": 0.7254723310470581, "learning_rate": 4.838709677419355e-06, "loss": 3.3992, "num_input_tokens_seen": 2358888, "step": 60 }, { "epoch": 0.0033919453412236443, "grad_norm": 0.692788302898407, "learning_rate": 5.64516129032258e-06, "loss": 3.287, "num_input_tokens_seen": 2784136, "step": 70 }, { "epoch": 0.0038765089613984507, "grad_norm": 0.7256464958190918, "learning_rate": 6.451612903225806e-06, "loss": 3.1694, "num_input_tokens_seen": 3174820, "step": 80 }, { "epoch": 0.004361072581573257, "grad_norm": 0.6375479102134705, "learning_rate": 7.258064516129033e-06, "loss": 3.093, "num_input_tokens_seen": 3579040, "step": 90 }, { "epoch": 0.004845636201748063, "grad_norm": 0.5901396870613098, "learning_rate": 8.064516129032258e-06, "loss": 2.9749, "num_input_tokens_seen": 3959804, "step": 100 }, { "epoch": 0.00533019982192287, "grad_norm": 0.5945073366165161, "learning_rate": 8.870967741935484e-06, "loss": 2.9903, "num_input_tokens_seen": 4327636, "step": 110 }, { "epoch": 0.005814763442097676, "grad_norm": 0.5525237917900085, "learning_rate": 9.67741935483871e-06, "loss": 2.9102, "num_input_tokens_seen": 4722664, "step": 120 }, { "epoch": 0.006299327062272482, "grad_norm": 0.5421098470687866, "learning_rate": 1.0483870967741936e-05, "loss": 2.8672, "num_input_tokens_seen": 5108488, "step": 130 }, { "epoch": 0.006783890682447289, "grad_norm": 0.6057586669921875, "learning_rate": 1.129032258064516e-05, "loss": 2.7504, "num_input_tokens_seen": 5499032, "step": 140 }, { "epoch": 0.007268454302622095, "grad_norm": 0.5147073864936829, "learning_rate": 1.2096774193548388e-05, "loss": 2.728, "num_input_tokens_seen": 5899468, "step": 150 }, { "epoch": 0.0077530179227969015, "grad_norm": 0.4975648820400238, "learning_rate": 1.2903225806451613e-05, "loss": 2.635, "num_input_tokens_seen": 6294784, "step": 160 }, { "epoch": 0.008237581542971708, "grad_norm": 0.6665644645690918, "learning_rate": 1.3709677419354839e-05, "loss": 2.6568, "num_input_tokens_seen": 6724124, "step": 170 }, { "epoch": 0.008722145163146514, "grad_norm": 0.5339853763580322, "learning_rate": 1.4516129032258066e-05, "loss": 2.6569, "num_input_tokens_seen": 7126528, "step": 180 }, { "epoch": 0.00920670878332132, "grad_norm": 0.6043190956115723, "learning_rate": 1.5322580645161292e-05, "loss": 2.5659, "num_input_tokens_seen": 7539124, "step": 190 }, { "epoch": 0.009691272403496126, "grad_norm": 0.576831579208374, "learning_rate": 1.6129032258064517e-05, "loss": 2.6224, "num_input_tokens_seen": 7943312, "step": 200 }, { "epoch": 0.010175836023670932, "grad_norm": 0.5602587461471558, "learning_rate": 1.693548387096774e-05, "loss": 2.5489, "num_input_tokens_seen": 8345264, "step": 210 }, { "epoch": 0.01066039964384574, "grad_norm": 0.5199883580207825, "learning_rate": 1.774193548387097e-05, "loss": 2.4895, "num_input_tokens_seen": 8724596, "step": 220 }, { "epoch": 0.011144963264020546, "grad_norm": 0.5386412143707275, "learning_rate": 1.8548387096774193e-05, "loss": 2.4376, "num_input_tokens_seen": 9128556, "step": 230 }, { "epoch": 0.011629526884195352, "grad_norm": 0.4958534836769104, "learning_rate": 1.935483870967742e-05, "loss": 2.4393, "num_input_tokens_seen": 9531512, "step": 240 }, { "epoch": 0.012114090504370158, "grad_norm": 0.5383809208869934, "learning_rate": 2.0161290322580645e-05, "loss": 2.408, "num_input_tokens_seen": 9926976, "step": 250 }, { "epoch": 0.012598654124544964, "grad_norm": 0.5953584313392639, "learning_rate": 2.0967741935483873e-05, "loss": 2.4448, "num_input_tokens_seen": 10307532, "step": 260 }, { "epoch": 0.013083217744719771, "grad_norm": 0.6360442638397217, "learning_rate": 2.1774193548387097e-05, "loss": 2.4015, "num_input_tokens_seen": 10680264, "step": 270 }, { "epoch": 0.013567781364894577, "grad_norm": 0.48746633529663086, "learning_rate": 2.258064516129032e-05, "loss": 2.3641, "num_input_tokens_seen": 11070464, "step": 280 }, { "epoch": 0.014052344985069383, "grad_norm": 0.6602309346199036, "learning_rate": 2.338709677419355e-05, "loss": 2.3722, "num_input_tokens_seen": 11470192, "step": 290 }, { "epoch": 0.01453690860524419, "grad_norm": 0.5370129346847534, "learning_rate": 2.4193548387096777e-05, "loss": 2.3605, "num_input_tokens_seen": 11872288, "step": 300 }, { "epoch": 0.015021472225418997, "grad_norm": 0.575202226638794, "learning_rate": 2.5e-05, "loss": 2.3164, "num_input_tokens_seen": 12282104, "step": 310 }, { "epoch": 0.015506035845593803, "grad_norm": 0.5220383405685425, "learning_rate": 2.5806451612903226e-05, "loss": 2.3395, "num_input_tokens_seen": 12670432, "step": 320 }, { "epoch": 0.01599059946576861, "grad_norm": 0.6971769332885742, "learning_rate": 2.661290322580645e-05, "loss": 2.3035, "num_input_tokens_seen": 13048952, "step": 330 }, { "epoch": 0.016475163085943417, "grad_norm": 0.5094983577728271, "learning_rate": 2.7419354838709678e-05, "loss": 2.2668, "num_input_tokens_seen": 13454104, "step": 340 }, { "epoch": 0.01695972670611822, "grad_norm": 0.5272497534751892, "learning_rate": 2.822580645161291e-05, "loss": 2.2398, "num_input_tokens_seen": 13846768, "step": 350 }, { "epoch": 0.01744429032629303, "grad_norm": 0.5047647356987, "learning_rate": 2.9032258064516133e-05, "loss": 2.2096, "num_input_tokens_seen": 14247376, "step": 360 }, { "epoch": 0.017928853946467833, "grad_norm": 0.5520374178886414, "learning_rate": 2.9838709677419357e-05, "loss": 2.2371, "num_input_tokens_seen": 14644760, "step": 370 }, { "epoch": 0.01841341756664264, "grad_norm": 0.46873587369918823, "learning_rate": 3.0645161290322585e-05, "loss": 2.2636, "num_input_tokens_seen": 15030968, "step": 380 }, { "epoch": 0.018897981186817448, "grad_norm": 0.5534300208091736, "learning_rate": 3.1451612903225806e-05, "loss": 2.1924, "num_input_tokens_seen": 15442684, "step": 390 }, { "epoch": 0.019382544806992252, "grad_norm": 0.5355961918830872, "learning_rate": 3.2258064516129034e-05, "loss": 2.1482, "num_input_tokens_seen": 15823228, "step": 400 }, { "epoch": 0.01986710842716706, "grad_norm": 0.5530371069908142, "learning_rate": 3.306451612903226e-05, "loss": 2.1865, "num_input_tokens_seen": 16206660, "step": 410 }, { "epoch": 0.020351672047341864, "grad_norm": 0.5505437254905701, "learning_rate": 3.387096774193548e-05, "loss": 2.1665, "num_input_tokens_seen": 16610308, "step": 420 }, { "epoch": 0.020836235667516672, "grad_norm": 0.5833243727684021, "learning_rate": 3.467741935483872e-05, "loss": 2.1413, "num_input_tokens_seen": 17013200, "step": 430 }, { "epoch": 0.02132079928769148, "grad_norm": 0.5723456144332886, "learning_rate": 3.548387096774194e-05, "loss": 2.149, "num_input_tokens_seen": 17412528, "step": 440 }, { "epoch": 0.021805362907866284, "grad_norm": 0.49083006381988525, "learning_rate": 3.6290322580645165e-05, "loss": 2.208, "num_input_tokens_seen": 17818648, "step": 450 }, { "epoch": 0.02228992652804109, "grad_norm": 0.5125194787979126, "learning_rate": 3.7096774193548386e-05, "loss": 2.1628, "num_input_tokens_seen": 18215112, "step": 460 }, { "epoch": 0.022774490148215896, "grad_norm": 0.5249300003051758, "learning_rate": 3.7903225806451614e-05, "loss": 2.1179, "num_input_tokens_seen": 18613100, "step": 470 }, { "epoch": 0.023259053768390704, "grad_norm": 0.5913705825805664, "learning_rate": 3.870967741935484e-05, "loss": 2.0918, "num_input_tokens_seen": 19006580, "step": 480 }, { "epoch": 0.02374361738856551, "grad_norm": 0.5256056785583496, "learning_rate": 3.951612903225806e-05, "loss": 2.1146, "num_input_tokens_seen": 19410508, "step": 490 }, { "epoch": 0.024228181008740315, "grad_norm": 0.5684739947319031, "learning_rate": 4.032258064516129e-05, "loss": 2.0621, "num_input_tokens_seen": 19801804, "step": 500 }, { "epoch": 0.024712744628915123, "grad_norm": 0.5903922319412231, "learning_rate": 4.112903225806452e-05, "loss": 2.1035, "num_input_tokens_seen": 20183328, "step": 510 }, { "epoch": 0.025197308249089927, "grad_norm": 0.5174550414085388, "learning_rate": 4.1935483870967746e-05, "loss": 2.088, "num_input_tokens_seen": 20585364, "step": 520 }, { "epoch": 0.025681871869264735, "grad_norm": 0.5224788188934326, "learning_rate": 4.2741935483870973e-05, "loss": 2.0708, "num_input_tokens_seen": 20971292, "step": 530 }, { "epoch": 0.026166435489439543, "grad_norm": 0.5536626577377319, "learning_rate": 4.3548387096774194e-05, "loss": 2.0135, "num_input_tokens_seen": 21390084, "step": 540 }, { "epoch": 0.026650999109614347, "grad_norm": 0.49406698346138, "learning_rate": 4.435483870967742e-05, "loss": 2.0626, "num_input_tokens_seen": 21790964, "step": 550 }, { "epoch": 0.027135562729789155, "grad_norm": 0.5543055534362793, "learning_rate": 4.516129032258064e-05, "loss": 2.0395, "num_input_tokens_seen": 22160120, "step": 560 }, { "epoch": 0.02762012634996396, "grad_norm": 0.5112178325653076, "learning_rate": 4.596774193548387e-05, "loss": 2.0956, "num_input_tokens_seen": 22555328, "step": 570 }, { "epoch": 0.028104689970138767, "grad_norm": 0.5143571496009827, "learning_rate": 4.67741935483871e-05, "loss": 2.0209, "num_input_tokens_seen": 22941364, "step": 580 }, { "epoch": 0.028589253590313574, "grad_norm": 0.501890242099762, "learning_rate": 4.7580645161290326e-05, "loss": 2.0326, "num_input_tokens_seen": 23298096, "step": 590 }, { "epoch": 0.02907381721048838, "grad_norm": 0.47222423553466797, "learning_rate": 4.8387096774193554e-05, "loss": 2.0099, "num_input_tokens_seen": 23698488, "step": 600 }, { "epoch": 0.029558380830663186, "grad_norm": 0.5555810928344727, "learning_rate": 4.9193548387096775e-05, "loss": 1.9978, "num_input_tokens_seen": 24088604, "step": 610 }, { "epoch": 0.030042944450837994, "grad_norm": 0.6044710278511047, "learning_rate": 5e-05, "loss": 2.0005, "num_input_tokens_seen": 24477980, "step": 620 }, { "epoch": 0.030527508071012798, "grad_norm": 0.5720553398132324, "learning_rate": 4.999996920985807e-05, "loss": 1.9575, "num_input_tokens_seen": 24859124, "step": 630 }, { "epoch": 0.031012071691187606, "grad_norm": 0.5833857655525208, "learning_rate": 4.9999876839508106e-05, "loss": 1.992, "num_input_tokens_seen": 25283928, "step": 640 }, { "epoch": 0.03149663531136241, "grad_norm": 0.49288150668144226, "learning_rate": 4.999972288917764e-05, "loss": 1.952, "num_input_tokens_seen": 25662068, "step": 650 }, { "epoch": 0.03198119893153722, "grad_norm": 0.5453637838363647, "learning_rate": 4.999950735924589e-05, "loss": 2.0213, "num_input_tokens_seen": 26066280, "step": 660 }, { "epoch": 0.032465762551712025, "grad_norm": 0.4919157922267914, "learning_rate": 4.9999230250243744e-05, "loss": 1.9531, "num_input_tokens_seen": 26446308, "step": 670 }, { "epoch": 0.03295032617188683, "grad_norm": 0.5228652954101562, "learning_rate": 4.999889156285379e-05, "loss": 1.9746, "num_input_tokens_seen": 26862780, "step": 680 }, { "epoch": 0.033434889792061634, "grad_norm": 0.5261794328689575, "learning_rate": 4.999849129791028e-05, "loss": 1.9717, "num_input_tokens_seen": 27239428, "step": 690 }, { "epoch": 0.03391945341223644, "grad_norm": 0.5125803351402283, "learning_rate": 4.9998029456399144e-05, "loss": 1.975, "num_input_tokens_seen": 27633548, "step": 700 }, { "epoch": 0.03440401703241125, "grad_norm": 0.48046156764030457, "learning_rate": 4.999750603945801e-05, "loss": 1.925, "num_input_tokens_seen": 28046196, "step": 710 }, { "epoch": 0.03488858065258606, "grad_norm": 0.5283321738243103, "learning_rate": 4.999692104837615e-05, "loss": 1.9442, "num_input_tokens_seen": 28423160, "step": 720 }, { "epoch": 0.035373144272760865, "grad_norm": 0.4812624156475067, "learning_rate": 4.999627448459453e-05, "loss": 1.8945, "num_input_tokens_seen": 28842052, "step": 730 }, { "epoch": 0.035857707892935665, "grad_norm": 0.5417799353599548, "learning_rate": 4.999556634970578e-05, "loss": 1.9628, "num_input_tokens_seen": 29251476, "step": 740 }, { "epoch": 0.03634227151311047, "grad_norm": 0.48694953322410583, "learning_rate": 4.999479664545417e-05, "loss": 1.9061, "num_input_tokens_seen": 29660076, "step": 750 }, { "epoch": 0.03682683513328528, "grad_norm": 0.5140628814697266, "learning_rate": 4.999396537373565e-05, "loss": 1.9304, "num_input_tokens_seen": 30049624, "step": 760 }, { "epoch": 0.03731139875346009, "grad_norm": 0.48269200325012207, "learning_rate": 4.9993072536597816e-05, "loss": 1.9188, "num_input_tokens_seen": 30466320, "step": 770 }, { "epoch": 0.037795962373634896, "grad_norm": 0.5299521684646606, "learning_rate": 4.999211813623993e-05, "loss": 1.8832, "num_input_tokens_seen": 30853272, "step": 780 }, { "epoch": 0.0382805259938097, "grad_norm": 0.5212485790252686, "learning_rate": 4.999110217501286e-05, "loss": 1.8968, "num_input_tokens_seen": 31245900, "step": 790 }, { "epoch": 0.038765089613984505, "grad_norm": 0.5231190323829651, "learning_rate": 4.9990024655419146e-05, "loss": 1.8859, "num_input_tokens_seen": 31662092, "step": 800 }, { "epoch": 0.03924965323415931, "grad_norm": 0.49723413586616516, "learning_rate": 4.998888558011295e-05, "loss": 1.872, "num_input_tokens_seen": 32059116, "step": 810 }, { "epoch": 0.03973421685433412, "grad_norm": 0.6407498121261597, "learning_rate": 4.9987684951900036e-05, "loss": 1.8606, "num_input_tokens_seen": 32484640, "step": 820 }, { "epoch": 0.04021878047450893, "grad_norm": 0.5182517766952515, "learning_rate": 4.998642277373783e-05, "loss": 1.8906, "num_input_tokens_seen": 32857036, "step": 830 }, { "epoch": 0.04070334409468373, "grad_norm": 0.46163928508758545, "learning_rate": 4.998509904873533e-05, "loss": 1.8566, "num_input_tokens_seen": 33227492, "step": 840 }, { "epoch": 0.041187907714858536, "grad_norm": 0.47485044598579407, "learning_rate": 4.998371378015314e-05, "loss": 1.8759, "num_input_tokens_seen": 33614644, "step": 850 }, { "epoch": 0.041672471335033344, "grad_norm": 0.4748258888721466, "learning_rate": 4.998226697140349e-05, "loss": 1.877, "num_input_tokens_seen": 33984924, "step": 860 }, { "epoch": 0.04215703495520815, "grad_norm": 0.5473403930664062, "learning_rate": 4.998075862605017e-05, "loss": 1.8403, "num_input_tokens_seen": 34379932, "step": 870 }, { "epoch": 0.04264159857538296, "grad_norm": 0.4143201410770416, "learning_rate": 4.9979188747808545e-05, "loss": 1.87, "num_input_tokens_seen": 34769220, "step": 880 }, { "epoch": 0.04312616219555776, "grad_norm": 0.43778666853904724, "learning_rate": 4.997755734054557e-05, "loss": 1.9247, "num_input_tokens_seen": 35124728, "step": 890 }, { "epoch": 0.04361072581573257, "grad_norm": 0.532357394695282, "learning_rate": 4.9975864408279725e-05, "loss": 1.8527, "num_input_tokens_seen": 35526992, "step": 900 }, { "epoch": 0.044095289435907375, "grad_norm": 0.4756541848182678, "learning_rate": 4.997410995518108e-05, "loss": 1.8328, "num_input_tokens_seen": 35932440, "step": 910 }, { "epoch": 0.04457985305608218, "grad_norm": 0.4680721163749695, "learning_rate": 4.997229398557122e-05, "loss": 1.804, "num_input_tokens_seen": 36352128, "step": 920 }, { "epoch": 0.04506441667625699, "grad_norm": 0.48043662309646606, "learning_rate": 4.9970416503923254e-05, "loss": 1.8609, "num_input_tokens_seen": 36746228, "step": 930 }, { "epoch": 0.04554898029643179, "grad_norm": 0.4995608627796173, "learning_rate": 4.996847751486182e-05, "loss": 1.8493, "num_input_tokens_seen": 37138932, "step": 940 }, { "epoch": 0.0460335439166066, "grad_norm": 0.5137046575546265, "learning_rate": 4.996647702316306e-05, "loss": 1.8672, "num_input_tokens_seen": 37519424, "step": 950 }, { "epoch": 0.04651810753678141, "grad_norm": 0.48803040385246277, "learning_rate": 4.996441503375461e-05, "loss": 1.8371, "num_input_tokens_seen": 37895416, "step": 960 }, { "epoch": 0.047002671156956215, "grad_norm": 0.4492471516132355, "learning_rate": 4.996229155171558e-05, "loss": 1.8697, "num_input_tokens_seen": 38307728, "step": 970 }, { "epoch": 0.04748723477713102, "grad_norm": 0.4708845913410187, "learning_rate": 4.9960106582276556e-05, "loss": 1.8481, "num_input_tokens_seen": 38697636, "step": 980 }, { "epoch": 0.04797179839730582, "grad_norm": 0.49647071957588196, "learning_rate": 4.995786013081958e-05, "loss": 1.8812, "num_input_tokens_seen": 39088844, "step": 990 }, { "epoch": 0.04845636201748063, "grad_norm": 0.5095282793045044, "learning_rate": 4.995555220287814e-05, "loss": 1.8549, "num_input_tokens_seen": 39495924, "step": 1000 }, { "epoch": 0.04894092563765544, "grad_norm": 0.47867298126220703, "learning_rate": 4.9953182804137144e-05, "loss": 1.783, "num_input_tokens_seen": 39909712, "step": 1010 }, { "epoch": 0.049425489257830246, "grad_norm": 0.4481337070465088, "learning_rate": 4.9950751940432935e-05, "loss": 1.7918, "num_input_tokens_seen": 40301008, "step": 1020 }, { "epoch": 0.049910052878005054, "grad_norm": 0.4755343496799469, "learning_rate": 4.994825961775323e-05, "loss": 1.7714, "num_input_tokens_seen": 40676476, "step": 1030 }, { "epoch": 0.050394616498179855, "grad_norm": 0.5189250707626343, "learning_rate": 4.994570584223715e-05, "loss": 1.8269, "num_input_tokens_seen": 41064096, "step": 1040 }, { "epoch": 0.05087918011835466, "grad_norm": 0.47439032793045044, "learning_rate": 4.994309062017519e-05, "loss": 1.7933, "num_input_tokens_seen": 41445416, "step": 1050 }, { "epoch": 0.05136374373852947, "grad_norm": 0.4340980052947998, "learning_rate": 4.994041395800918e-05, "loss": 1.7959, "num_input_tokens_seen": 41850336, "step": 1060 }, { "epoch": 0.05184830735870428, "grad_norm": 0.4619131088256836, "learning_rate": 4.993767586233232e-05, "loss": 1.836, "num_input_tokens_seen": 42265248, "step": 1070 }, { "epoch": 0.052332870978879086, "grad_norm": 0.5889285206794739, "learning_rate": 4.993487633988912e-05, "loss": 1.8257, "num_input_tokens_seen": 42676764, "step": 1080 }, { "epoch": 0.052817434599053886, "grad_norm": 0.4897553026676178, "learning_rate": 4.993201539757538e-05, "loss": 1.8588, "num_input_tokens_seen": 43053880, "step": 1090 }, { "epoch": 0.053301998219228694, "grad_norm": 0.47212594747543335, "learning_rate": 4.992909304243822e-05, "loss": 1.8094, "num_input_tokens_seen": 43430872, "step": 1100 }, { "epoch": 0.0537865618394035, "grad_norm": 0.44554567337036133, "learning_rate": 4.992610928167601e-05, "loss": 1.7834, "num_input_tokens_seen": 43851752, "step": 1110 }, { "epoch": 0.05427112545957831, "grad_norm": 0.45216086506843567, "learning_rate": 4.992306412263839e-05, "loss": 1.8522, "num_input_tokens_seen": 44254964, "step": 1120 }, { "epoch": 0.05475568907975312, "grad_norm": 0.5445901155471802, "learning_rate": 4.9919957572826216e-05, "loss": 1.839, "num_input_tokens_seen": 44656772, "step": 1130 }, { "epoch": 0.05524025269992792, "grad_norm": 0.5541059970855713, "learning_rate": 4.9916789639891595e-05, "loss": 1.7898, "num_input_tokens_seen": 45058836, "step": 1140 }, { "epoch": 0.055724816320102726, "grad_norm": 0.5577613711357117, "learning_rate": 4.99135603316378e-05, "loss": 1.8336, "num_input_tokens_seen": 45471592, "step": 1150 }, { "epoch": 0.05620937994027753, "grad_norm": 0.47430625557899475, "learning_rate": 4.991026965601932e-05, "loss": 1.7995, "num_input_tokens_seen": 45846124, "step": 1160 }, { "epoch": 0.05669394356045234, "grad_norm": 0.5153654217720032, "learning_rate": 4.990691762114176e-05, "loss": 1.8068, "num_input_tokens_seen": 46213352, "step": 1170 }, { "epoch": 0.05717850718062715, "grad_norm": 0.5673589110374451, "learning_rate": 4.99035042352619e-05, "loss": 1.773, "num_input_tokens_seen": 46626636, "step": 1180 }, { "epoch": 0.05766307080080195, "grad_norm": 0.505126416683197, "learning_rate": 4.9900029506787645e-05, "loss": 1.7516, "num_input_tokens_seen": 47031172, "step": 1190 }, { "epoch": 0.05814763442097676, "grad_norm": 0.4553806483745575, "learning_rate": 4.989649344427796e-05, "loss": 1.7862, "num_input_tokens_seen": 47425060, "step": 1200 }, { "epoch": 0.058632198041151565, "grad_norm": 0.4654844105243683, "learning_rate": 4.989289605644294e-05, "loss": 1.7957, "num_input_tokens_seen": 47824468, "step": 1210 }, { "epoch": 0.05911676166132637, "grad_norm": 0.4679258167743683, "learning_rate": 4.988923735214369e-05, "loss": 1.7706, "num_input_tokens_seen": 48220624, "step": 1220 }, { "epoch": 0.05960132528150118, "grad_norm": 0.4447259306907654, "learning_rate": 4.988551734039239e-05, "loss": 1.7788, "num_input_tokens_seen": 48597080, "step": 1230 }, { "epoch": 0.06008588890167599, "grad_norm": 0.5004059672355652, "learning_rate": 4.98817360303522e-05, "loss": 1.7994, "num_input_tokens_seen": 49002120, "step": 1240 }, { "epoch": 0.06057045252185079, "grad_norm": 0.4713003635406494, "learning_rate": 4.98778934313373e-05, "loss": 1.729, "num_input_tokens_seen": 49399124, "step": 1250 }, { "epoch": 0.061055016142025596, "grad_norm": 0.4393056035041809, "learning_rate": 4.987398955281281e-05, "loss": 1.7764, "num_input_tokens_seen": 49820492, "step": 1260 }, { "epoch": 0.061539579762200404, "grad_norm": 0.49267756938934326, "learning_rate": 4.987002440439481e-05, "loss": 1.7541, "num_input_tokens_seen": 50240940, "step": 1270 }, { "epoch": 0.06202414338237521, "grad_norm": 0.4592071771621704, "learning_rate": 4.986599799585031e-05, "loss": 1.8162, "num_input_tokens_seen": 50644136, "step": 1280 }, { "epoch": 0.06250870700255001, "grad_norm": 0.4428696930408478, "learning_rate": 4.98619103370972e-05, "loss": 1.7786, "num_input_tokens_seen": 51030808, "step": 1290 }, { "epoch": 0.06299327062272482, "grad_norm": 0.4989221394062042, "learning_rate": 4.985776143820423e-05, "loss": 1.7498, "num_input_tokens_seen": 51408072, "step": 1300 }, { "epoch": 0.06347783424289963, "grad_norm": 0.46731674671173096, "learning_rate": 4.985355130939104e-05, "loss": 1.7556, "num_input_tokens_seen": 51788056, "step": 1310 }, { "epoch": 0.06396239786307444, "grad_norm": 0.4602036476135254, "learning_rate": 4.984927996102806e-05, "loss": 1.7395, "num_input_tokens_seen": 52195616, "step": 1320 }, { "epoch": 0.06444696148324924, "grad_norm": 0.49800294637680054, "learning_rate": 4.984494740363651e-05, "loss": 1.7324, "num_input_tokens_seen": 52605440, "step": 1330 }, { "epoch": 0.06493152510342405, "grad_norm": 0.4798908233642578, "learning_rate": 4.984055364788842e-05, "loss": 1.7495, "num_input_tokens_seen": 53005000, "step": 1340 }, { "epoch": 0.06541608872359886, "grad_norm": 0.510181188583374, "learning_rate": 4.9836098704606515e-05, "loss": 1.7238, "num_input_tokens_seen": 53428392, "step": 1350 }, { "epoch": 0.06590065234377367, "grad_norm": 0.4547858238220215, "learning_rate": 4.983158258476427e-05, "loss": 1.7711, "num_input_tokens_seen": 53787676, "step": 1360 }, { "epoch": 0.06638521596394846, "grad_norm": 0.486717164516449, "learning_rate": 4.982700529948585e-05, "loss": 1.7099, "num_input_tokens_seen": 54228120, "step": 1370 }, { "epoch": 0.06686977958412327, "grad_norm": 0.4875461459159851, "learning_rate": 4.982236686004606e-05, "loss": 1.7542, "num_input_tokens_seen": 54637396, "step": 1380 }, { "epoch": 0.06735434320429808, "grad_norm": 0.4732173681259155, "learning_rate": 4.9817667277870384e-05, "loss": 1.725, "num_input_tokens_seen": 55042360, "step": 1390 }, { "epoch": 0.06783890682447288, "grad_norm": 0.4369388818740845, "learning_rate": 4.981290656453486e-05, "loss": 1.7288, "num_input_tokens_seen": 55430008, "step": 1400 }, { "epoch": 0.06832347044464769, "grad_norm": 0.42465418577194214, "learning_rate": 4.9808084731766134e-05, "loss": 1.7026, "num_input_tokens_seen": 55815764, "step": 1410 }, { "epoch": 0.0688080340648225, "grad_norm": 0.4628385007381439, "learning_rate": 4.980320179144141e-05, "loss": 1.7732, "num_input_tokens_seen": 56164316, "step": 1420 }, { "epoch": 0.0692925976849973, "grad_norm": 0.5061572194099426, "learning_rate": 4.97982577555884e-05, "loss": 1.7107, "num_input_tokens_seen": 56523812, "step": 1430 }, { "epoch": 0.06977716130517211, "grad_norm": 0.49572449922561646, "learning_rate": 4.9793252636385305e-05, "loss": 1.6988, "num_input_tokens_seen": 56892024, "step": 1440 }, { "epoch": 0.07026172492534692, "grad_norm": 0.5096593499183655, "learning_rate": 4.9788186446160795e-05, "loss": 1.7888, "num_input_tokens_seen": 57270864, "step": 1450 }, { "epoch": 0.07074628854552173, "grad_norm": 0.4694225490093231, "learning_rate": 4.978305919739396e-05, "loss": 1.7126, "num_input_tokens_seen": 57643264, "step": 1460 }, { "epoch": 0.07123085216569652, "grad_norm": 0.43490421772003174, "learning_rate": 4.9777870902714306e-05, "loss": 1.6902, "num_input_tokens_seen": 58035896, "step": 1470 }, { "epoch": 0.07171541578587133, "grad_norm": 0.42695242166519165, "learning_rate": 4.97726215749017e-05, "loss": 1.7368, "num_input_tokens_seen": 58412320, "step": 1480 }, { "epoch": 0.07219997940604614, "grad_norm": 0.48507386445999146, "learning_rate": 4.976731122688634e-05, "loss": 1.6676, "num_input_tokens_seen": 58792164, "step": 1490 }, { "epoch": 0.07268454302622095, "grad_norm": 0.5080244541168213, "learning_rate": 4.9761939871748734e-05, "loss": 1.6863, "num_input_tokens_seen": 59187108, "step": 1500 }, { "epoch": 0.07316910664639575, "grad_norm": 0.4605392813682556, "learning_rate": 4.975650752271967e-05, "loss": 1.7126, "num_input_tokens_seen": 59576276, "step": 1510 }, { "epoch": 0.07365367026657056, "grad_norm": 0.5134192705154419, "learning_rate": 4.9751014193180165e-05, "loss": 1.7108, "num_input_tokens_seen": 59993792, "step": 1520 }, { "epoch": 0.07413823388674537, "grad_norm": 0.4663729667663574, "learning_rate": 4.974545989666147e-05, "loss": 1.7352, "num_input_tokens_seen": 60408024, "step": 1530 }, { "epoch": 0.07462279750692018, "grad_norm": 0.4669811427593231, "learning_rate": 4.973984464684497e-05, "loss": 1.7421, "num_input_tokens_seen": 60767824, "step": 1540 }, { "epoch": 0.07510736112709498, "grad_norm": 0.46583324670791626, "learning_rate": 4.973416845756221e-05, "loss": 1.7187, "num_input_tokens_seen": 61143088, "step": 1550 }, { "epoch": 0.07559192474726979, "grad_norm": 0.4906235635280609, "learning_rate": 4.9728431342794865e-05, "loss": 1.7195, "num_input_tokens_seen": 61531644, "step": 1560 }, { "epoch": 0.07607648836744459, "grad_norm": 0.4702226221561432, "learning_rate": 4.972263331667465e-05, "loss": 1.6975, "num_input_tokens_seen": 61937832, "step": 1570 }, { "epoch": 0.0765610519876194, "grad_norm": 0.4586687386035919, "learning_rate": 4.971677439348332e-05, "loss": 1.7196, "num_input_tokens_seen": 62332144, "step": 1580 }, { "epoch": 0.0770456156077942, "grad_norm": 0.4383968710899353, "learning_rate": 4.9710854587652654e-05, "loss": 1.6704, "num_input_tokens_seen": 62703152, "step": 1590 }, { "epoch": 0.07753017922796901, "grad_norm": 0.4787232577800751, "learning_rate": 4.970487391376438e-05, "loss": 1.7729, "num_input_tokens_seen": 63098040, "step": 1600 }, { "epoch": 0.07801474284814382, "grad_norm": 0.47267040610313416, "learning_rate": 4.969883238655017e-05, "loss": 1.6948, "num_input_tokens_seen": 63500456, "step": 1610 }, { "epoch": 0.07849930646831862, "grad_norm": 0.4243592917919159, "learning_rate": 4.969273002089157e-05, "loss": 1.6758, "num_input_tokens_seen": 63899700, "step": 1620 }, { "epoch": 0.07898387008849343, "grad_norm": 0.5044109225273132, "learning_rate": 4.968656683181999e-05, "loss": 1.7079, "num_input_tokens_seen": 64275644, "step": 1630 }, { "epoch": 0.07946843370866824, "grad_norm": 0.4761982858181, "learning_rate": 4.968034283451669e-05, "loss": 1.6974, "num_input_tokens_seen": 64658944, "step": 1640 }, { "epoch": 0.07995299732884305, "grad_norm": 0.46890366077423096, "learning_rate": 4.967405804431267e-05, "loss": 1.6835, "num_input_tokens_seen": 65053324, "step": 1650 }, { "epoch": 0.08043756094901786, "grad_norm": 0.4691985547542572, "learning_rate": 4.966771247668871e-05, "loss": 1.6842, "num_input_tokens_seen": 65471828, "step": 1660 }, { "epoch": 0.08092212456919266, "grad_norm": 0.5264722108840942, "learning_rate": 4.966130614727529e-05, "loss": 1.6913, "num_input_tokens_seen": 65888112, "step": 1670 }, { "epoch": 0.08140668818936746, "grad_norm": 0.46564558148384094, "learning_rate": 4.9654839071852535e-05, "loss": 1.6894, "num_input_tokens_seen": 66301484, "step": 1680 }, { "epoch": 0.08189125180954226, "grad_norm": 0.4879053831100464, "learning_rate": 4.964831126635022e-05, "loss": 1.7088, "num_input_tokens_seen": 66683244, "step": 1690 }, { "epoch": 0.08237581542971707, "grad_norm": 0.39720138907432556, "learning_rate": 4.964172274684772e-05, "loss": 1.7227, "num_input_tokens_seen": 67102092, "step": 1700 }, { "epoch": 0.08286037904989188, "grad_norm": 0.4953685998916626, "learning_rate": 4.963507352957395e-05, "loss": 1.7142, "num_input_tokens_seen": 67495072, "step": 1710 }, { "epoch": 0.08334494267006669, "grad_norm": 0.4297001361846924, "learning_rate": 4.962836363090734e-05, "loss": 1.6934, "num_input_tokens_seen": 67872712, "step": 1720 }, { "epoch": 0.0838295062902415, "grad_norm": 0.4302520155906677, "learning_rate": 4.962159306737578e-05, "loss": 1.6898, "num_input_tokens_seen": 68264844, "step": 1730 }, { "epoch": 0.0843140699104163, "grad_norm": 0.47304287552833557, "learning_rate": 4.96147618556566e-05, "loss": 1.6631, "num_input_tokens_seen": 68650516, "step": 1740 }, { "epoch": 0.08479863353059111, "grad_norm": 0.47196635603904724, "learning_rate": 4.960787001257652e-05, "loss": 1.6963, "num_input_tokens_seen": 69003456, "step": 1750 }, { "epoch": 0.08528319715076592, "grad_norm": 0.46627262234687805, "learning_rate": 4.9600917555111615e-05, "loss": 1.6733, "num_input_tokens_seen": 69386088, "step": 1760 }, { "epoch": 0.08576776077094073, "grad_norm": 0.4163726568222046, "learning_rate": 4.9593904500387245e-05, "loss": 1.6907, "num_input_tokens_seen": 69762188, "step": 1770 }, { "epoch": 0.08625232439111552, "grad_norm": 0.46610623598098755, "learning_rate": 4.9586830865678046e-05, "loss": 1.6722, "num_input_tokens_seen": 70170884, "step": 1780 }, { "epoch": 0.08673688801129033, "grad_norm": 0.5110335946083069, "learning_rate": 4.957969666840788e-05, "loss": 1.6884, "num_input_tokens_seen": 70544880, "step": 1790 }, { "epoch": 0.08722145163146514, "grad_norm": 0.4962434768676758, "learning_rate": 4.957250192614979e-05, "loss": 1.659, "num_input_tokens_seen": 70937756, "step": 1800 }, { "epoch": 0.08770601525163994, "grad_norm": 0.4480324387550354, "learning_rate": 4.956524665662593e-05, "loss": 1.6428, "num_input_tokens_seen": 71295700, "step": 1810 }, { "epoch": 0.08819057887181475, "grad_norm": 0.46165645122528076, "learning_rate": 4.955793087770758e-05, "loss": 1.7055, "num_input_tokens_seen": 71677740, "step": 1820 }, { "epoch": 0.08867514249198956, "grad_norm": 0.4461020827293396, "learning_rate": 4.955055460741503e-05, "loss": 1.7052, "num_input_tokens_seen": 72088472, "step": 1830 }, { "epoch": 0.08915970611216437, "grad_norm": 0.46780285239219666, "learning_rate": 4.9543117863917624e-05, "loss": 1.6684, "num_input_tokens_seen": 72483916, "step": 1840 }, { "epoch": 0.08964426973233917, "grad_norm": 0.4738537073135376, "learning_rate": 4.953562066553359e-05, "loss": 1.6663, "num_input_tokens_seen": 72895204, "step": 1850 }, { "epoch": 0.09012883335251398, "grad_norm": 0.41888901591300964, "learning_rate": 4.952806303073015e-05, "loss": 1.7064, "num_input_tokens_seen": 73289656, "step": 1860 }, { "epoch": 0.09061339697268879, "grad_norm": 0.4899497330188751, "learning_rate": 4.952044497812334e-05, "loss": 1.7179, "num_input_tokens_seen": 73724044, "step": 1870 }, { "epoch": 0.09109796059286358, "grad_norm": 0.4161849319934845, "learning_rate": 4.951276652647803e-05, "loss": 1.6841, "num_input_tokens_seen": 74111024, "step": 1880 }, { "epoch": 0.09158252421303839, "grad_norm": 0.44479620456695557, "learning_rate": 4.950502769470787e-05, "loss": 1.6294, "num_input_tokens_seen": 74502300, "step": 1890 }, { "epoch": 0.0920670878332132, "grad_norm": 0.48210737109184265, "learning_rate": 4.949722850187525e-05, "loss": 1.6615, "num_input_tokens_seen": 74920152, "step": 1900 }, { "epoch": 0.092551651453388, "grad_norm": 0.5125983357429504, "learning_rate": 4.948936896719121e-05, "loss": 1.6056, "num_input_tokens_seen": 75318632, "step": 1910 }, { "epoch": 0.09303621507356281, "grad_norm": 0.45514819025993347, "learning_rate": 4.948144911001546e-05, "loss": 1.6808, "num_input_tokens_seen": 75682924, "step": 1920 }, { "epoch": 0.09352077869373762, "grad_norm": 0.45765024423599243, "learning_rate": 4.9473468949856295e-05, "loss": 1.6256, "num_input_tokens_seen": 76084200, "step": 1930 }, { "epoch": 0.09400534231391243, "grad_norm": 0.46034979820251465, "learning_rate": 4.946542850637051e-05, "loss": 1.6342, "num_input_tokens_seen": 76509976, "step": 1940 }, { "epoch": 0.09448990593408724, "grad_norm": 0.47426116466522217, "learning_rate": 4.945732779936343e-05, "loss": 1.6783, "num_input_tokens_seen": 76925836, "step": 1950 }, { "epoch": 0.09497446955426204, "grad_norm": 0.44276344776153564, "learning_rate": 4.944916684878881e-05, "loss": 1.6505, "num_input_tokens_seen": 77309476, "step": 1960 }, { "epoch": 0.09545903317443685, "grad_norm": 0.4252452552318573, "learning_rate": 4.944094567474878e-05, "loss": 1.6174, "num_input_tokens_seen": 77719060, "step": 1970 }, { "epoch": 0.09594359679461165, "grad_norm": 0.4776414930820465, "learning_rate": 4.9432664297493855e-05, "loss": 1.6953, "num_input_tokens_seen": 78109656, "step": 1980 }, { "epoch": 0.09642816041478645, "grad_norm": 0.5169130563735962, "learning_rate": 4.94243227374228e-05, "loss": 1.6782, "num_input_tokens_seen": 78511580, "step": 1990 }, { "epoch": 0.09691272403496126, "grad_norm": 0.41066092252731323, "learning_rate": 4.941592101508264e-05, "loss": 1.6487, "num_input_tokens_seen": 78885660, "step": 2000 }, { "epoch": 0.09691272403496126, "eval_loss": 1.7664850950241089, "eval_runtime": 3.9496, "eval_samples_per_second": 37.979, "eval_steps_per_second": 4.811, "num_input_tokens_seen": 78885660, "step": 2000 }, { "epoch": 0.09739728765513607, "grad_norm": 0.4655153751373291, "learning_rate": 4.94074591511686e-05, "loss": 1.6776, "num_input_tokens_seen": 79304256, "step": 2010 }, { "epoch": 0.09788185127531088, "grad_norm": 0.47283700108528137, "learning_rate": 4.939893716652404e-05, "loss": 1.6605, "num_input_tokens_seen": 79720364, "step": 2020 }, { "epoch": 0.09836641489548568, "grad_norm": 0.433597207069397, "learning_rate": 4.93903550821404e-05, "loss": 1.6256, "num_input_tokens_seen": 80133808, "step": 2030 }, { "epoch": 0.09885097851566049, "grad_norm": 0.5006552934646606, "learning_rate": 4.9381712919157174e-05, "loss": 1.672, "num_input_tokens_seen": 80519820, "step": 2040 }, { "epoch": 0.0993355421358353, "grad_norm": 0.4227645993232727, "learning_rate": 4.937301069886184e-05, "loss": 1.5913, "num_input_tokens_seen": 80921068, "step": 2050 }, { "epoch": 0.09982010575601011, "grad_norm": 0.4382697641849518, "learning_rate": 4.93642484426898e-05, "loss": 1.6587, "num_input_tokens_seen": 81330628, "step": 2060 }, { "epoch": 0.10030466937618492, "grad_norm": 0.45559099316596985, "learning_rate": 4.935542617222434e-05, "loss": 1.6698, "num_input_tokens_seen": 81744300, "step": 2070 }, { "epoch": 0.10078923299635971, "grad_norm": 0.44068899750709534, "learning_rate": 4.9346543909196584e-05, "loss": 1.6527, "num_input_tokens_seen": 82151876, "step": 2080 }, { "epoch": 0.10127379661653452, "grad_norm": 0.4670219123363495, "learning_rate": 4.933760167548542e-05, "loss": 1.66, "num_input_tokens_seen": 82539776, "step": 2090 }, { "epoch": 0.10175836023670932, "grad_norm": 0.4150504171848297, "learning_rate": 4.932859949311745e-05, "loss": 1.6959, "num_input_tokens_seen": 82918640, "step": 2100 }, { "epoch": 0.10224292385688413, "grad_norm": 0.46419015526771545, "learning_rate": 4.931953738426698e-05, "loss": 1.6897, "num_input_tokens_seen": 83299820, "step": 2110 }, { "epoch": 0.10272748747705894, "grad_norm": 0.4582190215587616, "learning_rate": 4.931041537125587e-05, "loss": 1.6196, "num_input_tokens_seen": 83659764, "step": 2120 }, { "epoch": 0.10321205109723375, "grad_norm": 0.4939762055873871, "learning_rate": 4.930123347655358e-05, "loss": 1.6231, "num_input_tokens_seen": 84052080, "step": 2130 }, { "epoch": 0.10369661471740856, "grad_norm": 0.43949180841445923, "learning_rate": 4.929199172277705e-05, "loss": 1.5858, "num_input_tokens_seen": 84431660, "step": 2140 }, { "epoch": 0.10418117833758336, "grad_norm": 0.44852331280708313, "learning_rate": 4.928269013269069e-05, "loss": 1.678, "num_input_tokens_seen": 84828416, "step": 2150 }, { "epoch": 0.10466574195775817, "grad_norm": 0.4769739508628845, "learning_rate": 4.927332872920626e-05, "loss": 1.6463, "num_input_tokens_seen": 85231908, "step": 2160 }, { "epoch": 0.10515030557793298, "grad_norm": 0.41337233781814575, "learning_rate": 4.926390753538288e-05, "loss": 1.6137, "num_input_tokens_seen": 85644796, "step": 2170 }, { "epoch": 0.10563486919810777, "grad_norm": 0.4415737986564636, "learning_rate": 4.925442657442696e-05, "loss": 1.638, "num_input_tokens_seen": 86024508, "step": 2180 }, { "epoch": 0.10611943281828258, "grad_norm": 0.4723414480686188, "learning_rate": 4.924488586969208e-05, "loss": 1.6074, "num_input_tokens_seen": 86425788, "step": 2190 }, { "epoch": 0.10660399643845739, "grad_norm": 0.4304730296134949, "learning_rate": 4.923528544467905e-05, "loss": 1.6436, "num_input_tokens_seen": 86797336, "step": 2200 }, { "epoch": 0.1070885600586322, "grad_norm": 0.4499008059501648, "learning_rate": 4.9225625323035706e-05, "loss": 1.6196, "num_input_tokens_seen": 87182044, "step": 2210 }, { "epoch": 0.107573123678807, "grad_norm": 0.415367066860199, "learning_rate": 4.9215905528557e-05, "loss": 1.6363, "num_input_tokens_seen": 87604236, "step": 2220 }, { "epoch": 0.10805768729898181, "grad_norm": 0.4271062910556793, "learning_rate": 4.9206126085184824e-05, "loss": 1.6101, "num_input_tokens_seen": 87995600, "step": 2230 }, { "epoch": 0.10854225091915662, "grad_norm": 0.4146898686885834, "learning_rate": 4.919628701700802e-05, "loss": 1.5793, "num_input_tokens_seen": 88393892, "step": 2240 }, { "epoch": 0.10902681453933143, "grad_norm": 0.43309634923934937, "learning_rate": 4.918638834826229e-05, "loss": 1.5593, "num_input_tokens_seen": 88770548, "step": 2250 }, { "epoch": 0.10951137815950623, "grad_norm": 0.42651528120040894, "learning_rate": 4.917643010333015e-05, "loss": 1.6263, "num_input_tokens_seen": 89160792, "step": 2260 }, { "epoch": 0.10999594177968104, "grad_norm": 0.4150625467300415, "learning_rate": 4.916641230674086e-05, "loss": 1.6196, "num_input_tokens_seen": 89548084, "step": 2270 }, { "epoch": 0.11048050539985584, "grad_norm": 0.4922536313533783, "learning_rate": 4.915633498317037e-05, "loss": 1.6234, "num_input_tokens_seen": 89957316, "step": 2280 }, { "epoch": 0.11096506902003064, "grad_norm": 0.436235636472702, "learning_rate": 4.914619815744126e-05, "loss": 1.602, "num_input_tokens_seen": 90352888, "step": 2290 }, { "epoch": 0.11144963264020545, "grad_norm": 0.44822990894317627, "learning_rate": 4.913600185452267e-05, "loss": 1.6393, "num_input_tokens_seen": 90729132, "step": 2300 }, { "epoch": 0.11193419626038026, "grad_norm": 0.4682275652885437, "learning_rate": 4.912574609953026e-05, "loss": 1.5937, "num_input_tokens_seen": 91133560, "step": 2310 }, { "epoch": 0.11241875988055507, "grad_norm": 0.44868946075439453, "learning_rate": 4.911543091772611e-05, "loss": 1.6376, "num_input_tokens_seen": 91514000, "step": 2320 }, { "epoch": 0.11290332350072987, "grad_norm": 0.4826831817626953, "learning_rate": 4.910505633451869e-05, "loss": 1.6667, "num_input_tokens_seen": 91919504, "step": 2330 }, { "epoch": 0.11338788712090468, "grad_norm": 0.5310072898864746, "learning_rate": 4.9094622375462806e-05, "loss": 1.6148, "num_input_tokens_seen": 92332568, "step": 2340 }, { "epoch": 0.11387245074107949, "grad_norm": 0.43625608086586, "learning_rate": 4.90841290662595e-05, "loss": 1.6217, "num_input_tokens_seen": 92716832, "step": 2350 }, { "epoch": 0.1143570143612543, "grad_norm": 0.46015968918800354, "learning_rate": 4.9073576432755995e-05, "loss": 1.6222, "num_input_tokens_seen": 93138708, "step": 2360 }, { "epoch": 0.1148415779814291, "grad_norm": 0.4389876425266266, "learning_rate": 4.906296450094568e-05, "loss": 1.617, "num_input_tokens_seen": 93552848, "step": 2370 }, { "epoch": 0.1153261416016039, "grad_norm": 0.46346965432167053, "learning_rate": 4.9052293296967975e-05, "loss": 1.5736, "num_input_tokens_seen": 93947928, "step": 2380 }, { "epoch": 0.1158107052217787, "grad_norm": 0.4337058961391449, "learning_rate": 4.9041562847108304e-05, "loss": 1.5778, "num_input_tokens_seen": 94323152, "step": 2390 }, { "epoch": 0.11629526884195351, "grad_norm": 0.4524190425872803, "learning_rate": 4.903077317779805e-05, "loss": 1.5926, "num_input_tokens_seen": 94707744, "step": 2400 }, { "epoch": 0.11677983246212832, "grad_norm": 0.4808904826641083, "learning_rate": 4.901992431561443e-05, "loss": 1.6427, "num_input_tokens_seen": 95084712, "step": 2410 }, { "epoch": 0.11726439608230313, "grad_norm": 0.4621996581554413, "learning_rate": 4.9009016287280496e-05, "loss": 1.5838, "num_input_tokens_seen": 95498392, "step": 2420 }, { "epoch": 0.11774895970247794, "grad_norm": 0.41997554898262024, "learning_rate": 4.899804911966502e-05, "loss": 1.6226, "num_input_tokens_seen": 95880184, "step": 2430 }, { "epoch": 0.11823352332265274, "grad_norm": 0.475421279668808, "learning_rate": 4.898702283978247e-05, "loss": 1.6259, "num_input_tokens_seen": 96265332, "step": 2440 }, { "epoch": 0.11871808694282755, "grad_norm": 0.45337820053100586, "learning_rate": 4.8975937474792895e-05, "loss": 1.6884, "num_input_tokens_seen": 96681320, "step": 2450 }, { "epoch": 0.11920265056300236, "grad_norm": 0.42129579186439514, "learning_rate": 4.896479305200188e-05, "loss": 1.5833, "num_input_tokens_seen": 97088276, "step": 2460 }, { "epoch": 0.11968721418317717, "grad_norm": 0.42145466804504395, "learning_rate": 4.895358959886051e-05, "loss": 1.6114, "num_input_tokens_seen": 97501600, "step": 2470 }, { "epoch": 0.12017177780335198, "grad_norm": 0.4644761085510254, "learning_rate": 4.8942327142965244e-05, "loss": 1.6331, "num_input_tokens_seen": 97893056, "step": 2480 }, { "epoch": 0.12065634142352677, "grad_norm": 0.4692575931549072, "learning_rate": 4.8931005712057905e-05, "loss": 1.591, "num_input_tokens_seen": 98303764, "step": 2490 }, { "epoch": 0.12114090504370158, "grad_norm": 0.4318110942840576, "learning_rate": 4.891962533402556e-05, "loss": 1.5659, "num_input_tokens_seen": 98672376, "step": 2500 }, { "epoch": 0.12162546866387638, "grad_norm": 0.44102174043655396, "learning_rate": 4.890818603690049e-05, "loss": 1.5804, "num_input_tokens_seen": 99057612, "step": 2510 }, { "epoch": 0.12211003228405119, "grad_norm": 0.45612481236457825, "learning_rate": 4.88966878488601e-05, "loss": 1.6189, "num_input_tokens_seen": 99443028, "step": 2520 }, { "epoch": 0.122594595904226, "grad_norm": 0.502396821975708, "learning_rate": 4.888513079822686e-05, "loss": 1.542, "num_input_tokens_seen": 99861476, "step": 2530 }, { "epoch": 0.12307915952440081, "grad_norm": 0.438915491104126, "learning_rate": 4.887351491346822e-05, "loss": 1.554, "num_input_tokens_seen": 100294464, "step": 2540 }, { "epoch": 0.12356372314457562, "grad_norm": 0.474985808134079, "learning_rate": 4.886184022319657e-05, "loss": 1.6263, "num_input_tokens_seen": 100653652, "step": 2550 }, { "epoch": 0.12404828676475042, "grad_norm": 0.43759259581565857, "learning_rate": 4.8850106756169146e-05, "loss": 1.5717, "num_input_tokens_seen": 101039388, "step": 2560 }, { "epoch": 0.12453285038492523, "grad_norm": 0.4843815267086029, "learning_rate": 4.8838314541287936e-05, "loss": 1.6216, "num_input_tokens_seen": 101443224, "step": 2570 }, { "epoch": 0.12501741400510002, "grad_norm": 0.5523076057434082, "learning_rate": 4.882646360759967e-05, "loss": 1.5746, "num_input_tokens_seen": 101857288, "step": 2580 }, { "epoch": 0.12550197762527485, "grad_norm": 0.43904462456703186, "learning_rate": 4.88145539842957e-05, "loss": 1.5948, "num_input_tokens_seen": 102260496, "step": 2590 }, { "epoch": 0.12598654124544964, "grad_norm": 0.4333835244178772, "learning_rate": 4.880258570071194e-05, "loss": 1.6429, "num_input_tokens_seen": 102644332, "step": 2600 }, { "epoch": 0.12647110486562446, "grad_norm": 0.42234155535697937, "learning_rate": 4.879055878632881e-05, "loss": 1.5685, "num_input_tokens_seen": 103032436, "step": 2610 }, { "epoch": 0.12695566848579926, "grad_norm": 0.40778931975364685, "learning_rate": 4.8778473270771144e-05, "loss": 1.5875, "num_input_tokens_seen": 103422688, "step": 2620 }, { "epoch": 0.12744023210597408, "grad_norm": 0.45513054728507996, "learning_rate": 4.8766329183808115e-05, "loss": 1.5896, "num_input_tokens_seen": 103798060, "step": 2630 }, { "epoch": 0.12792479572614887, "grad_norm": 0.4449268579483032, "learning_rate": 4.875412655535319e-05, "loss": 1.6182, "num_input_tokens_seen": 104187120, "step": 2640 }, { "epoch": 0.12840935934632366, "grad_norm": 0.4176337718963623, "learning_rate": 4.8741865415463995e-05, "loss": 1.5704, "num_input_tokens_seen": 104566676, "step": 2650 }, { "epoch": 0.1288939229664985, "grad_norm": 0.4052715003490448, "learning_rate": 4.8729545794342326e-05, "loss": 1.5684, "num_input_tokens_seen": 104970872, "step": 2660 }, { "epoch": 0.12937848658667328, "grad_norm": 0.43383073806762695, "learning_rate": 4.871716772233401e-05, "loss": 1.5695, "num_input_tokens_seen": 105341948, "step": 2670 }, { "epoch": 0.1298630502068481, "grad_norm": 0.4440796673297882, "learning_rate": 4.870473122992886e-05, "loss": 1.5893, "num_input_tokens_seen": 105747904, "step": 2680 }, { "epoch": 0.1303476138270229, "grad_norm": 0.47211089730262756, "learning_rate": 4.869223634776059e-05, "loss": 1.5961, "num_input_tokens_seen": 106128328, "step": 2690 }, { "epoch": 0.13083217744719772, "grad_norm": 0.4548238515853882, "learning_rate": 4.867968310660671e-05, "loss": 1.6209, "num_input_tokens_seen": 106507300, "step": 2700 }, { "epoch": 0.1313167410673725, "grad_norm": 0.4506012797355652, "learning_rate": 4.8667071537388535e-05, "loss": 1.6299, "num_input_tokens_seen": 106889452, "step": 2710 }, { "epoch": 0.13180130468754733, "grad_norm": 0.42806220054626465, "learning_rate": 4.8654401671171014e-05, "loss": 1.5991, "num_input_tokens_seen": 107252004, "step": 2720 }, { "epoch": 0.13228586830772213, "grad_norm": 0.4180905222892761, "learning_rate": 4.86416735391627e-05, "loss": 1.5844, "num_input_tokens_seen": 107629528, "step": 2730 }, { "epoch": 0.13277043192789692, "grad_norm": 0.4253162145614624, "learning_rate": 4.862888717271568e-05, "loss": 1.5904, "num_input_tokens_seen": 107996228, "step": 2740 }, { "epoch": 0.13325499554807174, "grad_norm": 0.44208529591560364, "learning_rate": 4.861604260332547e-05, "loss": 1.5569, "num_input_tokens_seen": 108380192, "step": 2750 }, { "epoch": 0.13373955916824654, "grad_norm": 0.40524107217788696, "learning_rate": 4.8603139862630966e-05, "loss": 1.6175, "num_input_tokens_seen": 108779284, "step": 2760 }, { "epoch": 0.13422412278842136, "grad_norm": 0.5582908391952515, "learning_rate": 4.8590178982414346e-05, "loss": 1.5905, "num_input_tokens_seen": 109193100, "step": 2770 }, { "epoch": 0.13470868640859615, "grad_norm": 0.4604218900203705, "learning_rate": 4.8577159994600995e-05, "loss": 1.5554, "num_input_tokens_seen": 109569728, "step": 2780 }, { "epoch": 0.13519325002877097, "grad_norm": 0.41278553009033203, "learning_rate": 4.8564082931259426e-05, "loss": 1.5784, "num_input_tokens_seen": 109984452, "step": 2790 }, { "epoch": 0.13567781364894577, "grad_norm": 0.4047918915748596, "learning_rate": 4.8550947824601216e-05, "loss": 1.5317, "num_input_tokens_seen": 110389128, "step": 2800 }, { "epoch": 0.1361623772691206, "grad_norm": 0.4299299716949463, "learning_rate": 4.853775470698091e-05, "loss": 1.5954, "num_input_tokens_seen": 110788072, "step": 2810 }, { "epoch": 0.13664694088929538, "grad_norm": 0.4523586332798004, "learning_rate": 4.8524503610895944e-05, "loss": 1.5679, "num_input_tokens_seen": 111191500, "step": 2820 }, { "epoch": 0.1371315045094702, "grad_norm": 0.4595218002796173, "learning_rate": 4.8511194568986563e-05, "loss": 1.5974, "num_input_tokens_seen": 111580564, "step": 2830 }, { "epoch": 0.137616068129645, "grad_norm": 0.4451237916946411, "learning_rate": 4.8497827614035755e-05, "loss": 1.5807, "num_input_tokens_seen": 111974908, "step": 2840 }, { "epoch": 0.1381006317498198, "grad_norm": 0.4401971995830536, "learning_rate": 4.848440277896915e-05, "loss": 1.5301, "num_input_tokens_seen": 112379284, "step": 2850 }, { "epoch": 0.1385851953699946, "grad_norm": 0.4478752017021179, "learning_rate": 4.847092009685496e-05, "loss": 1.5541, "num_input_tokens_seen": 112797012, "step": 2860 }, { "epoch": 0.1390697589901694, "grad_norm": 0.47638776898384094, "learning_rate": 4.8457379600903886e-05, "loss": 1.5861, "num_input_tokens_seen": 113177816, "step": 2870 }, { "epoch": 0.13955432261034423, "grad_norm": 0.4315278232097626, "learning_rate": 4.844378132446903e-05, "loss": 1.527, "num_input_tokens_seen": 113585668, "step": 2880 }, { "epoch": 0.14003888623051902, "grad_norm": 0.45071589946746826, "learning_rate": 4.843012530104581e-05, "loss": 1.5542, "num_input_tokens_seen": 114012516, "step": 2890 }, { "epoch": 0.14052344985069384, "grad_norm": 0.4602771997451782, "learning_rate": 4.841641156427189e-05, "loss": 1.543, "num_input_tokens_seen": 114413536, "step": 2900 }, { "epoch": 0.14100801347086864, "grad_norm": 0.4230845272541046, "learning_rate": 4.8402640147927134e-05, "loss": 1.5379, "num_input_tokens_seen": 114815632, "step": 2910 }, { "epoch": 0.14149257709104346, "grad_norm": 0.4051437973976135, "learning_rate": 4.838881108593342e-05, "loss": 1.5231, "num_input_tokens_seen": 115202344, "step": 2920 }, { "epoch": 0.14197714071121825, "grad_norm": 0.5333889126777649, "learning_rate": 4.837492441235467e-05, "loss": 1.5378, "num_input_tokens_seen": 115622044, "step": 2930 }, { "epoch": 0.14246170433139305, "grad_norm": 0.4066469967365265, "learning_rate": 4.8360980161396685e-05, "loss": 1.584, "num_input_tokens_seen": 115983972, "step": 2940 }, { "epoch": 0.14294626795156787, "grad_norm": 0.4050688147544861, "learning_rate": 4.834697836740712e-05, "loss": 1.5642, "num_input_tokens_seen": 116370664, "step": 2950 }, { "epoch": 0.14343083157174266, "grad_norm": 0.4575314521789551, "learning_rate": 4.833291906487533e-05, "loss": 1.512, "num_input_tokens_seen": 116802032, "step": 2960 }, { "epoch": 0.14391539519191748, "grad_norm": 0.43021175265312195, "learning_rate": 4.831880228843235e-05, "loss": 1.5795, "num_input_tokens_seen": 117193496, "step": 2970 }, { "epoch": 0.14439995881209228, "grad_norm": 0.4586018919944763, "learning_rate": 4.83046280728508e-05, "loss": 1.556, "num_input_tokens_seen": 117616412, "step": 2980 }, { "epoch": 0.1448845224322671, "grad_norm": 0.47244206070899963, "learning_rate": 4.8290396453044764e-05, "loss": 1.5146, "num_input_tokens_seen": 117972728, "step": 2990 }, { "epoch": 0.1453690860524419, "grad_norm": 0.3958747088909149, "learning_rate": 4.827610746406972e-05, "loss": 1.5349, "num_input_tokens_seen": 118356720, "step": 3000 }, { "epoch": 0.14585364967261671, "grad_norm": 0.45024189352989197, "learning_rate": 4.826176114112247e-05, "loss": 1.6121, "num_input_tokens_seen": 118769664, "step": 3010 }, { "epoch": 0.1463382132927915, "grad_norm": 0.44841715693473816, "learning_rate": 4.824735751954106e-05, "loss": 1.5356, "num_input_tokens_seen": 119180436, "step": 3020 }, { "epoch": 0.14682277691296633, "grad_norm": 0.4641439616680145, "learning_rate": 4.8232896634804634e-05, "loss": 1.6029, "num_input_tokens_seen": 119553244, "step": 3030 }, { "epoch": 0.14730734053314112, "grad_norm": 0.4200107455253601, "learning_rate": 4.8218378522533404e-05, "loss": 1.5575, "num_input_tokens_seen": 119933780, "step": 3040 }, { "epoch": 0.14779190415331592, "grad_norm": 0.47531604766845703, "learning_rate": 4.8203803218488567e-05, "loss": 1.5486, "num_input_tokens_seen": 120345892, "step": 3050 }, { "epoch": 0.14827646777349074, "grad_norm": 0.42880284786224365, "learning_rate": 4.8189170758572154e-05, "loss": 1.5223, "num_input_tokens_seen": 120711848, "step": 3060 }, { "epoch": 0.14876103139366553, "grad_norm": 0.4171142280101776, "learning_rate": 4.817448117882703e-05, "loss": 1.5771, "num_input_tokens_seen": 121113568, "step": 3070 }, { "epoch": 0.14924559501384035, "grad_norm": 0.43676143884658813, "learning_rate": 4.815973451543672e-05, "loss": 1.5337, "num_input_tokens_seen": 121540304, "step": 3080 }, { "epoch": 0.14973015863401515, "grad_norm": 0.4129098057746887, "learning_rate": 4.814493080472538e-05, "loss": 1.5162, "num_input_tokens_seen": 121912956, "step": 3090 }, { "epoch": 0.15021472225418997, "grad_norm": 0.4339870810508728, "learning_rate": 4.8130070083157676e-05, "loss": 1.5347, "num_input_tokens_seen": 122302884, "step": 3100 }, { "epoch": 0.15069928587436476, "grad_norm": 0.4579829275608063, "learning_rate": 4.8115152387338705e-05, "loss": 1.5682, "num_input_tokens_seen": 122729188, "step": 3110 }, { "epoch": 0.15118384949453958, "grad_norm": 0.44452232122421265, "learning_rate": 4.81001777540139e-05, "loss": 1.5084, "num_input_tokens_seen": 123099424, "step": 3120 }, { "epoch": 0.15166841311471438, "grad_norm": 0.4334086775779724, "learning_rate": 4.8085146220068955e-05, "loss": 1.5089, "num_input_tokens_seen": 123501724, "step": 3130 }, { "epoch": 0.15215297673488917, "grad_norm": 0.4431312680244446, "learning_rate": 4.8070057822529715e-05, "loss": 1.4841, "num_input_tokens_seen": 123897884, "step": 3140 }, { "epoch": 0.152637540355064, "grad_norm": 0.42815452814102173, "learning_rate": 4.8054912598562086e-05, "loss": 1.5234, "num_input_tokens_seen": 124295448, "step": 3150 }, { "epoch": 0.1531221039752388, "grad_norm": 0.44743165373802185, "learning_rate": 4.8039710585471966e-05, "loss": 1.5925, "num_input_tokens_seen": 124680420, "step": 3160 }, { "epoch": 0.1536066675954136, "grad_norm": 0.41514381766319275, "learning_rate": 4.802445182070511e-05, "loss": 1.5505, "num_input_tokens_seen": 125076112, "step": 3170 }, { "epoch": 0.1540912312155884, "grad_norm": 0.4845329225063324, "learning_rate": 4.8009136341847094e-05, "loss": 1.5366, "num_input_tokens_seen": 125473256, "step": 3180 }, { "epoch": 0.15457579483576322, "grad_norm": 0.4526010751724243, "learning_rate": 4.799376418662318e-05, "loss": 1.5357, "num_input_tokens_seen": 125882064, "step": 3190 }, { "epoch": 0.15506035845593802, "grad_norm": 0.42789971828460693, "learning_rate": 4.7978335392898235e-05, "loss": 1.4749, "num_input_tokens_seen": 126280000, "step": 3200 }, { "epoch": 0.15554492207611284, "grad_norm": 0.4213913083076477, "learning_rate": 4.796284999867663e-05, "loss": 1.5433, "num_input_tokens_seen": 126699892, "step": 3210 }, { "epoch": 0.15602948569628763, "grad_norm": 0.39674410223960876, "learning_rate": 4.794730804210217e-05, "loss": 1.5379, "num_input_tokens_seen": 127115872, "step": 3220 }, { "epoch": 0.15651404931646246, "grad_norm": 0.45900386571884155, "learning_rate": 4.793170956145798e-05, "loss": 1.5696, "num_input_tokens_seen": 127499596, "step": 3230 }, { "epoch": 0.15699861293663725, "grad_norm": 0.43314629793167114, "learning_rate": 4.791605459516641e-05, "loss": 1.5543, "num_input_tokens_seen": 127906492, "step": 3240 }, { "epoch": 0.15748317655681204, "grad_norm": 0.408204585313797, "learning_rate": 4.7900343181788955e-05, "loss": 1.5537, "num_input_tokens_seen": 128323396, "step": 3250 }, { "epoch": 0.15796774017698686, "grad_norm": 0.45687878131866455, "learning_rate": 4.788457536002614e-05, "loss": 1.5473, "num_input_tokens_seen": 128719820, "step": 3260 }, { "epoch": 0.15845230379716166, "grad_norm": 0.423566997051239, "learning_rate": 4.7868751168717454e-05, "loss": 1.6038, "num_input_tokens_seen": 129126672, "step": 3270 }, { "epoch": 0.15893686741733648, "grad_norm": 0.4187241196632385, "learning_rate": 4.785287064684122e-05, "loss": 1.5793, "num_input_tokens_seen": 129537864, "step": 3280 }, { "epoch": 0.15942143103751127, "grad_norm": 0.45594966411590576, "learning_rate": 4.783693383351452e-05, "loss": 1.5753, "num_input_tokens_seen": 129948744, "step": 3290 }, { "epoch": 0.1599059946576861, "grad_norm": 0.4248170554637909, "learning_rate": 4.782094076799308e-05, "loss": 1.5443, "num_input_tokens_seen": 130350392, "step": 3300 }, { "epoch": 0.1603905582778609, "grad_norm": 0.491043359041214, "learning_rate": 4.780489148967122e-05, "loss": 1.5265, "num_input_tokens_seen": 130744872, "step": 3310 }, { "epoch": 0.1608751218980357, "grad_norm": 0.44143855571746826, "learning_rate": 4.77887860380817e-05, "loss": 1.4956, "num_input_tokens_seen": 131164480, "step": 3320 }, { "epoch": 0.1613596855182105, "grad_norm": 0.4861333668231964, "learning_rate": 4.777262445289565e-05, "loss": 1.5403, "num_input_tokens_seen": 131523208, "step": 3330 }, { "epoch": 0.16184424913838533, "grad_norm": 0.44125744700431824, "learning_rate": 4.775640677392246e-05, "loss": 1.5067, "num_input_tokens_seen": 131941532, "step": 3340 }, { "epoch": 0.16232881275856012, "grad_norm": 0.43593043088912964, "learning_rate": 4.774013304110972e-05, "loss": 1.5443, "num_input_tokens_seen": 132309260, "step": 3350 }, { "epoch": 0.16281337637873491, "grad_norm": 0.37986883521080017, "learning_rate": 4.7723803294543056e-05, "loss": 1.5141, "num_input_tokens_seen": 132674688, "step": 3360 }, { "epoch": 0.16329793999890974, "grad_norm": 0.4436347484588623, "learning_rate": 4.7707417574446086e-05, "loss": 1.5817, "num_input_tokens_seen": 133074724, "step": 3370 }, { "epoch": 0.16378250361908453, "grad_norm": 0.48060745000839233, "learning_rate": 4.769097592118033e-05, "loss": 1.5664, "num_input_tokens_seen": 133457192, "step": 3380 }, { "epoch": 0.16426706723925935, "grad_norm": 0.4321390688419342, "learning_rate": 4.7674478375245013e-05, "loss": 1.5077, "num_input_tokens_seen": 133844504, "step": 3390 }, { "epoch": 0.16475163085943414, "grad_norm": 0.4549488425254822, "learning_rate": 4.76579249772771e-05, "loss": 1.5348, "num_input_tokens_seen": 134258676, "step": 3400 }, { "epoch": 0.16523619447960897, "grad_norm": 0.47234755754470825, "learning_rate": 4.764131576805111e-05, "loss": 1.55, "num_input_tokens_seen": 134615472, "step": 3410 }, { "epoch": 0.16572075809978376, "grad_norm": 0.4680884778499603, "learning_rate": 4.762465078847903e-05, "loss": 1.4957, "num_input_tokens_seen": 135018208, "step": 3420 }, { "epoch": 0.16620532171995858, "grad_norm": 0.4551864564418793, "learning_rate": 4.760793007961023e-05, "loss": 1.4933, "num_input_tokens_seen": 135419696, "step": 3430 }, { "epoch": 0.16668988534013338, "grad_norm": 0.4141683578491211, "learning_rate": 4.759115368263135e-05, "loss": 1.5556, "num_input_tokens_seen": 135800204, "step": 3440 }, { "epoch": 0.16717444896030817, "grad_norm": 0.3936518430709839, "learning_rate": 4.75743216388662e-05, "loss": 1.5483, "num_input_tokens_seen": 136190572, "step": 3450 }, { "epoch": 0.167659012580483, "grad_norm": 0.41701188683509827, "learning_rate": 4.7557433989775654e-05, "loss": 1.5062, "num_input_tokens_seen": 136592572, "step": 3460 }, { "epoch": 0.16814357620065778, "grad_norm": 0.41708120703697205, "learning_rate": 4.754049077695758e-05, "loss": 1.557, "num_input_tokens_seen": 136951096, "step": 3470 }, { "epoch": 0.1686281398208326, "grad_norm": 0.5057613253593445, "learning_rate": 4.752349204214668e-05, "loss": 1.5247, "num_input_tokens_seen": 137320496, "step": 3480 }, { "epoch": 0.1691127034410074, "grad_norm": 0.4021996855735779, "learning_rate": 4.750643782721442e-05, "loss": 1.548, "num_input_tokens_seen": 137671948, "step": 3490 }, { "epoch": 0.16959726706118222, "grad_norm": 0.4202756881713867, "learning_rate": 4.7489328174168964e-05, "loss": 1.529, "num_input_tokens_seen": 138105600, "step": 3500 }, { "epoch": 0.17008183068135702, "grad_norm": 0.4259394407272339, "learning_rate": 4.747216312515498e-05, "loss": 1.5241, "num_input_tokens_seen": 138487120, "step": 3510 }, { "epoch": 0.17056639430153184, "grad_norm": 0.3875693678855896, "learning_rate": 4.745494272245361e-05, "loss": 1.524, "num_input_tokens_seen": 138885020, "step": 3520 }, { "epoch": 0.17105095792170663, "grad_norm": 0.3988937735557556, "learning_rate": 4.743766700848237e-05, "loss": 1.5126, "num_input_tokens_seen": 139288928, "step": 3530 }, { "epoch": 0.17153552154188145, "grad_norm": 0.41776183247566223, "learning_rate": 4.742033602579497e-05, "loss": 1.5061, "num_input_tokens_seen": 139667980, "step": 3540 }, { "epoch": 0.17202008516205625, "grad_norm": 0.4490506649017334, "learning_rate": 4.740294981708129e-05, "loss": 1.5089, "num_input_tokens_seen": 140092556, "step": 3550 }, { "epoch": 0.17250464878223104, "grad_norm": 0.44959497451782227, "learning_rate": 4.738550842516724e-05, "loss": 1.5266, "num_input_tokens_seen": 140485860, "step": 3560 }, { "epoch": 0.17298921240240586, "grad_norm": 0.42410293221473694, "learning_rate": 4.736801189301466e-05, "loss": 1.5103, "num_input_tokens_seen": 140883128, "step": 3570 }, { "epoch": 0.17347377602258066, "grad_norm": 0.43540823459625244, "learning_rate": 4.73504602637212e-05, "loss": 1.5119, "num_input_tokens_seen": 141262836, "step": 3580 }, { "epoch": 0.17395833964275548, "grad_norm": 0.3838653862476349, "learning_rate": 4.733285358052022e-05, "loss": 1.4833, "num_input_tokens_seen": 141659460, "step": 3590 }, { "epoch": 0.17444290326293027, "grad_norm": 0.40834611654281616, "learning_rate": 4.7315191886780727e-05, "loss": 1.5645, "num_input_tokens_seen": 142062380, "step": 3600 }, { "epoch": 0.1749274668831051, "grad_norm": 0.39526131749153137, "learning_rate": 4.729747522600719e-05, "loss": 1.5083, "num_input_tokens_seen": 142433008, "step": 3610 }, { "epoch": 0.1754120305032799, "grad_norm": 0.48167720437049866, "learning_rate": 4.727970364183949e-05, "loss": 1.5694, "num_input_tokens_seen": 142814724, "step": 3620 }, { "epoch": 0.1758965941234547, "grad_norm": 0.40276476740837097, "learning_rate": 4.72618771780528e-05, "loss": 1.4578, "num_input_tokens_seen": 143208888, "step": 3630 }, { "epoch": 0.1763811577436295, "grad_norm": 0.46488335728645325, "learning_rate": 4.724399587855747e-05, "loss": 1.5344, "num_input_tokens_seen": 143612984, "step": 3640 }, { "epoch": 0.1768657213638043, "grad_norm": 0.4321140944957733, "learning_rate": 4.7226059787398914e-05, "loss": 1.5565, "num_input_tokens_seen": 144004420, "step": 3650 }, { "epoch": 0.17735028498397912, "grad_norm": 0.42687898874282837, "learning_rate": 4.720806894875751e-05, "loss": 1.4911, "num_input_tokens_seen": 144385500, "step": 3660 }, { "epoch": 0.1778348486041539, "grad_norm": 0.39949852228164673, "learning_rate": 4.7190023406948506e-05, "loss": 1.4925, "num_input_tokens_seen": 144786456, "step": 3670 }, { "epoch": 0.17831941222432873, "grad_norm": 0.45870667695999146, "learning_rate": 4.7171923206421886e-05, "loss": 1.5279, "num_input_tokens_seen": 145150804, "step": 3680 }, { "epoch": 0.17880397584450353, "grad_norm": 0.3944126069545746, "learning_rate": 4.715376839176226e-05, "loss": 1.5185, "num_input_tokens_seen": 145542392, "step": 3690 }, { "epoch": 0.17928853946467835, "grad_norm": 0.41185203194618225, "learning_rate": 4.713555900768879e-05, "loss": 1.4337, "num_input_tokens_seen": 145911624, "step": 3700 }, { "epoch": 0.17977310308485314, "grad_norm": 0.39053046703338623, "learning_rate": 4.711729509905501e-05, "loss": 1.5279, "num_input_tokens_seen": 146287476, "step": 3710 }, { "epoch": 0.18025766670502796, "grad_norm": 0.41288021206855774, "learning_rate": 4.709897671084881e-05, "loss": 1.5093, "num_input_tokens_seen": 146688592, "step": 3720 }, { "epoch": 0.18074223032520276, "grad_norm": 0.400301992893219, "learning_rate": 4.7080603888192256e-05, "loss": 1.4321, "num_input_tokens_seen": 147100860, "step": 3730 }, { "epoch": 0.18122679394537758, "grad_norm": 0.39671310782432556, "learning_rate": 4.7062176676341476e-05, "loss": 1.4853, "num_input_tokens_seen": 147506564, "step": 3740 }, { "epoch": 0.18171135756555237, "grad_norm": 0.4283420741558075, "learning_rate": 4.7043695120686594e-05, "loss": 1.4895, "num_input_tokens_seen": 147906624, "step": 3750 }, { "epoch": 0.18219592118572717, "grad_norm": 0.44357410073280334, "learning_rate": 4.7025159266751586e-05, "loss": 1.5736, "num_input_tokens_seen": 148321696, "step": 3760 }, { "epoch": 0.182680484805902, "grad_norm": 0.4820682406425476, "learning_rate": 4.7006569160194185e-05, "loss": 1.4884, "num_input_tokens_seen": 148710300, "step": 3770 }, { "epoch": 0.18316504842607678, "grad_norm": 0.46169939637184143, "learning_rate": 4.698792484680574e-05, "loss": 1.4714, "num_input_tokens_seen": 149111440, "step": 3780 }, { "epoch": 0.1836496120462516, "grad_norm": 0.42826372385025024, "learning_rate": 4.6969226372511153e-05, "loss": 1.5092, "num_input_tokens_seen": 149498596, "step": 3790 }, { "epoch": 0.1841341756664264, "grad_norm": 0.4284881055355072, "learning_rate": 4.695047378336871e-05, "loss": 1.5269, "num_input_tokens_seen": 149928100, "step": 3800 }, { "epoch": 0.18461873928660122, "grad_norm": 0.41511744260787964, "learning_rate": 4.693166712556999e-05, "loss": 1.5135, "num_input_tokens_seen": 150349972, "step": 3810 }, { "epoch": 0.185103302906776, "grad_norm": 0.43388912081718445, "learning_rate": 4.6912806445439786e-05, "loss": 1.5093, "num_input_tokens_seen": 150745692, "step": 3820 }, { "epoch": 0.18558786652695083, "grad_norm": 0.4395964741706848, "learning_rate": 4.689389178943593e-05, "loss": 1.5737, "num_input_tokens_seen": 151156868, "step": 3830 }, { "epoch": 0.18607243014712563, "grad_norm": 0.4370187819004059, "learning_rate": 4.6874923204149215e-05, "loss": 1.5126, "num_input_tokens_seen": 151565152, "step": 3840 }, { "epoch": 0.18655699376730042, "grad_norm": 0.4095444083213806, "learning_rate": 4.685590073630327e-05, "loss": 1.4713, "num_input_tokens_seen": 151965476, "step": 3850 }, { "epoch": 0.18704155738747524, "grad_norm": 0.3944699764251709, "learning_rate": 4.683682443275447e-05, "loss": 1.5337, "num_input_tokens_seen": 152338220, "step": 3860 }, { "epoch": 0.18752612100765004, "grad_norm": 0.4351947009563446, "learning_rate": 4.681769434049177e-05, "loss": 1.491, "num_input_tokens_seen": 152724792, "step": 3870 }, { "epoch": 0.18801068462782486, "grad_norm": 0.5131449699401855, "learning_rate": 4.6798510506636626e-05, "loss": 1.5185, "num_input_tokens_seen": 153099052, "step": 3880 }, { "epoch": 0.18849524824799965, "grad_norm": 0.43832552433013916, "learning_rate": 4.6779272978442904e-05, "loss": 1.5165, "num_input_tokens_seen": 153488160, "step": 3890 }, { "epoch": 0.18897981186817447, "grad_norm": 0.4259849488735199, "learning_rate": 4.6759981803296666e-05, "loss": 1.4512, "num_input_tokens_seen": 153864880, "step": 3900 }, { "epoch": 0.18946437548834927, "grad_norm": 0.46592122316360474, "learning_rate": 4.674063702871617e-05, "loss": 1.4473, "num_input_tokens_seen": 154258796, "step": 3910 }, { "epoch": 0.1899489391085241, "grad_norm": 0.4338749349117279, "learning_rate": 4.672123870235169e-05, "loss": 1.5149, "num_input_tokens_seen": 154659628, "step": 3920 }, { "epoch": 0.19043350272869888, "grad_norm": 0.4241959750652313, "learning_rate": 4.6701786871985395e-05, "loss": 1.546, "num_input_tokens_seen": 155065544, "step": 3930 }, { "epoch": 0.1909180663488737, "grad_norm": 0.39792391657829285, "learning_rate": 4.6682281585531264e-05, "loss": 1.5081, "num_input_tokens_seen": 155439252, "step": 3940 }, { "epoch": 0.1914026299690485, "grad_norm": 0.4416780173778534, "learning_rate": 4.666272289103492e-05, "loss": 1.5018, "num_input_tokens_seen": 155848612, "step": 3950 }, { "epoch": 0.1918871935892233, "grad_norm": 0.42491331696510315, "learning_rate": 4.664311083667359e-05, "loss": 1.5042, "num_input_tokens_seen": 156231544, "step": 3960 }, { "epoch": 0.19237175720939811, "grad_norm": 0.4376972019672394, "learning_rate": 4.6623445470755875e-05, "loss": 1.5095, "num_input_tokens_seen": 156622212, "step": 3970 }, { "epoch": 0.1928563208295729, "grad_norm": 0.4348180890083313, "learning_rate": 4.660372684172176e-05, "loss": 1.4884, "num_input_tokens_seen": 157011628, "step": 3980 }, { "epoch": 0.19334088444974773, "grad_norm": 0.4411153793334961, "learning_rate": 4.658395499814238e-05, "loss": 1.5275, "num_input_tokens_seen": 157391712, "step": 3990 }, { "epoch": 0.19382544806992252, "grad_norm": 0.4086403548717499, "learning_rate": 4.656412998871996e-05, "loss": 1.4957, "num_input_tokens_seen": 157778628, "step": 4000 }, { "epoch": 0.19382544806992252, "eval_loss": 1.6085283756256104, "eval_runtime": 3.8392, "eval_samples_per_second": 39.07, "eval_steps_per_second": 4.949, "num_input_tokens_seen": 157778628, "step": 4000 }, { "epoch": 0.19431001169009735, "grad_norm": 0.4232029914855957, "learning_rate": 4.65442518622877e-05, "loss": 1.4919, "num_input_tokens_seen": 158177392, "step": 4010 }, { "epoch": 0.19479457531027214, "grad_norm": 0.4746312201023102, "learning_rate": 4.652432066780962e-05, "loss": 1.4801, "num_input_tokens_seen": 158587736, "step": 4020 }, { "epoch": 0.19527913893044696, "grad_norm": 0.4055451452732086, "learning_rate": 4.650433645438047e-05, "loss": 1.5079, "num_input_tokens_seen": 158997624, "step": 4030 }, { "epoch": 0.19576370255062175, "grad_norm": 0.47638824582099915, "learning_rate": 4.648429927122558e-05, "loss": 1.5269, "num_input_tokens_seen": 159406580, "step": 4040 }, { "epoch": 0.19624826617079658, "grad_norm": 0.426217257976532, "learning_rate": 4.646420916770078e-05, "loss": 1.4881, "num_input_tokens_seen": 159776464, "step": 4050 }, { "epoch": 0.19673282979097137, "grad_norm": 0.3979707360267639, "learning_rate": 4.644406619329223e-05, "loss": 1.5584, "num_input_tokens_seen": 160136328, "step": 4060 }, { "epoch": 0.19721739341114616, "grad_norm": 0.37950029969215393, "learning_rate": 4.642387039761635e-05, "loss": 1.5176, "num_input_tokens_seen": 160532608, "step": 4070 }, { "epoch": 0.19770195703132099, "grad_norm": 0.4001169502735138, "learning_rate": 4.6403621830419644e-05, "loss": 1.4584, "num_input_tokens_seen": 160930792, "step": 4080 }, { "epoch": 0.19818652065149578, "grad_norm": 0.4448148012161255, "learning_rate": 4.6383320541578604e-05, "loss": 1.5309, "num_input_tokens_seen": 161311248, "step": 4090 }, { "epoch": 0.1986710842716706, "grad_norm": 0.4266856014728546, "learning_rate": 4.63629665810996e-05, "loss": 1.4883, "num_input_tokens_seen": 161718084, "step": 4100 }, { "epoch": 0.1991556478918454, "grad_norm": 0.4387410581111908, "learning_rate": 4.6342559999118736e-05, "loss": 1.4552, "num_input_tokens_seen": 162101052, "step": 4110 }, { "epoch": 0.19964021151202022, "grad_norm": 0.3797076344490051, "learning_rate": 4.632210084590175e-05, "loss": 1.5668, "num_input_tokens_seen": 162513632, "step": 4120 }, { "epoch": 0.200124775132195, "grad_norm": 0.4161923825740814, "learning_rate": 4.630158917184385e-05, "loss": 1.5182, "num_input_tokens_seen": 162905012, "step": 4130 }, { "epoch": 0.20060933875236983, "grad_norm": 0.41969460248947144, "learning_rate": 4.6281025027469625e-05, "loss": 1.4945, "num_input_tokens_seen": 163281652, "step": 4140 }, { "epoch": 0.20109390237254463, "grad_norm": 0.4282332956790924, "learning_rate": 4.626040846343291e-05, "loss": 1.5109, "num_input_tokens_seen": 163652028, "step": 4150 }, { "epoch": 0.20157846599271942, "grad_norm": 0.42534980177879333, "learning_rate": 4.623973953051667e-05, "loss": 1.4954, "num_input_tokens_seen": 164028336, "step": 4160 }, { "epoch": 0.20206302961289424, "grad_norm": 0.4472944736480713, "learning_rate": 4.621901827963283e-05, "loss": 1.4857, "num_input_tokens_seen": 164392624, "step": 4170 }, { "epoch": 0.20254759323306903, "grad_norm": 0.42009228467941284, "learning_rate": 4.619824476182223e-05, "loss": 1.5229, "num_input_tokens_seen": 164753004, "step": 4180 }, { "epoch": 0.20303215685324386, "grad_norm": 0.5307909250259399, "learning_rate": 4.617741902825443e-05, "loss": 1.4997, "num_input_tokens_seen": 165122832, "step": 4190 }, { "epoch": 0.20351672047341865, "grad_norm": 0.42677533626556396, "learning_rate": 4.615654113022761e-05, "loss": 1.4825, "num_input_tokens_seen": 165469868, "step": 4200 }, { "epoch": 0.20400128409359347, "grad_norm": 0.38291576504707336, "learning_rate": 4.6135611119168465e-05, "loss": 1.5681, "num_input_tokens_seen": 165869628, "step": 4210 }, { "epoch": 0.20448584771376827, "grad_norm": 0.40809643268585205, "learning_rate": 4.611462904663201e-05, "loss": 1.4854, "num_input_tokens_seen": 166262208, "step": 4220 }, { "epoch": 0.2049704113339431, "grad_norm": 0.43991562724113464, "learning_rate": 4.6093594964301534e-05, "loss": 1.4577, "num_input_tokens_seen": 166640892, "step": 4230 }, { "epoch": 0.20545497495411788, "grad_norm": 0.432851642370224, "learning_rate": 4.607250892398843e-05, "loss": 1.4723, "num_input_tokens_seen": 167080484, "step": 4240 }, { "epoch": 0.2059395385742927, "grad_norm": 0.41228345036506653, "learning_rate": 4.605137097763207e-05, "loss": 1.4305, "num_input_tokens_seen": 167486976, "step": 4250 }, { "epoch": 0.2064241021944675, "grad_norm": 0.4661470949649811, "learning_rate": 4.603018117729968e-05, "loss": 1.4348, "num_input_tokens_seen": 167871328, "step": 4260 }, { "epoch": 0.2069086658146423, "grad_norm": 0.39085134863853455, "learning_rate": 4.600893957518622e-05, "loss": 1.4937, "num_input_tokens_seen": 168255252, "step": 4270 }, { "epoch": 0.2073932294348171, "grad_norm": 0.3891197443008423, "learning_rate": 4.5987646223614244e-05, "loss": 1.4864, "num_input_tokens_seen": 168634480, "step": 4280 }, { "epoch": 0.2078777930549919, "grad_norm": 0.40408509969711304, "learning_rate": 4.5966301175033785e-05, "loss": 1.4056, "num_input_tokens_seen": 169049616, "step": 4290 }, { "epoch": 0.20836235667516673, "grad_norm": 0.4421607553958893, "learning_rate": 4.59449044820222e-05, "loss": 1.491, "num_input_tokens_seen": 169439824, "step": 4300 }, { "epoch": 0.20884692029534152, "grad_norm": 0.3921188712120056, "learning_rate": 4.5923456197284065e-05, "loss": 1.4783, "num_input_tokens_seen": 169796656, "step": 4310 }, { "epoch": 0.20933148391551634, "grad_norm": 0.4339124262332916, "learning_rate": 4.590195637365105e-05, "loss": 1.4668, "num_input_tokens_seen": 170178796, "step": 4320 }, { "epoch": 0.20981604753569114, "grad_norm": 0.41062822937965393, "learning_rate": 4.588040506408176e-05, "loss": 1.4628, "num_input_tokens_seen": 170578188, "step": 4330 }, { "epoch": 0.21030061115586596, "grad_norm": 0.40424343943595886, "learning_rate": 4.5858802321661616e-05, "loss": 1.4596, "num_input_tokens_seen": 170964776, "step": 4340 }, { "epoch": 0.21078517477604075, "grad_norm": 0.4466400742530823, "learning_rate": 4.5837148199602745e-05, "loss": 1.5198, "num_input_tokens_seen": 171343616, "step": 4350 }, { "epoch": 0.21126973839621555, "grad_norm": 0.45615527033805847, "learning_rate": 4.581544275124383e-05, "loss": 1.4517, "num_input_tokens_seen": 171718620, "step": 4360 }, { "epoch": 0.21175430201639037, "grad_norm": 0.45989635586738586, "learning_rate": 4.5793686030049974e-05, "loss": 1.4469, "num_input_tokens_seen": 172101080, "step": 4370 }, { "epoch": 0.21223886563656516, "grad_norm": 0.3968257009983063, "learning_rate": 4.577187808961258e-05, "loss": 1.4241, "num_input_tokens_seen": 172502216, "step": 4380 }, { "epoch": 0.21272342925673998, "grad_norm": 0.4084954559803009, "learning_rate": 4.5750018983649214e-05, "loss": 1.4621, "num_input_tokens_seen": 172905268, "step": 4390 }, { "epoch": 0.21320799287691478, "grad_norm": 0.42288362979888916, "learning_rate": 4.5728108766003474e-05, "loss": 1.4642, "num_input_tokens_seen": 173303252, "step": 4400 }, { "epoch": 0.2136925564970896, "grad_norm": 0.40454548597335815, "learning_rate": 4.570614749064486e-05, "loss": 1.4608, "num_input_tokens_seen": 173696264, "step": 4410 }, { "epoch": 0.2141771201172644, "grad_norm": 0.416332870721817, "learning_rate": 4.568413521166863e-05, "loss": 1.4757, "num_input_tokens_seen": 174067360, "step": 4420 }, { "epoch": 0.2146616837374392, "grad_norm": 0.4173552095890045, "learning_rate": 4.566207198329568e-05, "loss": 1.4377, "num_input_tokens_seen": 174474464, "step": 4430 }, { "epoch": 0.215146247357614, "grad_norm": 0.4277855455875397, "learning_rate": 4.563995785987241e-05, "loss": 1.4673, "num_input_tokens_seen": 174876064, "step": 4440 }, { "epoch": 0.21563081097778883, "grad_norm": 0.45221036672592163, "learning_rate": 4.561779289587058e-05, "loss": 1.4808, "num_input_tokens_seen": 175281952, "step": 4450 }, { "epoch": 0.21611537459796362, "grad_norm": 0.3993636667728424, "learning_rate": 4.559557714588717e-05, "loss": 1.5006, "num_input_tokens_seen": 175657752, "step": 4460 }, { "epoch": 0.21659993821813842, "grad_norm": 0.42281702160835266, "learning_rate": 4.557331066464428e-05, "loss": 1.4489, "num_input_tokens_seen": 176038544, "step": 4470 }, { "epoch": 0.21708450183831324, "grad_norm": 0.4503469467163086, "learning_rate": 4.555099350698895e-05, "loss": 1.5062, "num_input_tokens_seen": 176415768, "step": 4480 }, { "epoch": 0.21756906545848803, "grad_norm": 0.4171935021877289, "learning_rate": 4.5528625727893065e-05, "loss": 1.4701, "num_input_tokens_seen": 176823204, "step": 4490 }, { "epoch": 0.21805362907866285, "grad_norm": 0.41661641001701355, "learning_rate": 4.5506207382453184e-05, "loss": 1.485, "num_input_tokens_seen": 177218572, "step": 4500 }, { "epoch": 0.21853819269883765, "grad_norm": 0.4115769863128662, "learning_rate": 4.548373852589044e-05, "loss": 1.4757, "num_input_tokens_seen": 177591148, "step": 4510 }, { "epoch": 0.21902275631901247, "grad_norm": 0.42897987365722656, "learning_rate": 4.5461219213550365e-05, "loss": 1.4452, "num_input_tokens_seen": 177978272, "step": 4520 }, { "epoch": 0.21950731993918726, "grad_norm": 0.4455084204673767, "learning_rate": 4.5438649500902796e-05, "loss": 1.5184, "num_input_tokens_seen": 178398380, "step": 4530 }, { "epoch": 0.21999188355936208, "grad_norm": 0.3757345974445343, "learning_rate": 4.5416029443541694e-05, "loss": 1.4825, "num_input_tokens_seen": 178807360, "step": 4540 }, { "epoch": 0.22047644717953688, "grad_norm": 0.4476206302642822, "learning_rate": 4.539335909718504e-05, "loss": 1.5054, "num_input_tokens_seen": 179185000, "step": 4550 }, { "epoch": 0.22096101079971167, "grad_norm": 0.3936362862586975, "learning_rate": 4.5370638517674716e-05, "loss": 1.477, "num_input_tokens_seen": 179593392, "step": 4560 }, { "epoch": 0.2214455744198865, "grad_norm": 0.41432103514671326, "learning_rate": 4.534786776097627e-05, "loss": 1.4547, "num_input_tokens_seen": 179985908, "step": 4570 }, { "epoch": 0.2219301380400613, "grad_norm": 0.3996790051460266, "learning_rate": 4.532504688317891e-05, "loss": 1.4691, "num_input_tokens_seen": 180414548, "step": 4580 }, { "epoch": 0.2224147016602361, "grad_norm": 0.4232152998447418, "learning_rate": 4.530217594049529e-05, "loss": 1.4798, "num_input_tokens_seen": 180809424, "step": 4590 }, { "epoch": 0.2228992652804109, "grad_norm": 0.4331175684928894, "learning_rate": 4.527925498926136e-05, "loss": 1.4674, "num_input_tokens_seen": 181195556, "step": 4600 }, { "epoch": 0.22338382890058572, "grad_norm": 0.4475395083427429, "learning_rate": 4.5256284085936275e-05, "loss": 1.4507, "num_input_tokens_seen": 181583096, "step": 4610 }, { "epoch": 0.22386839252076052, "grad_norm": 0.42866751551628113, "learning_rate": 4.523326328710222e-05, "loss": 1.4855, "num_input_tokens_seen": 181999512, "step": 4620 }, { "epoch": 0.22435295614093534, "grad_norm": 0.41574034094810486, "learning_rate": 4.5210192649464296e-05, "loss": 1.464, "num_input_tokens_seen": 182381672, "step": 4630 }, { "epoch": 0.22483751976111013, "grad_norm": 0.39740654826164246, "learning_rate": 4.518707222985035e-05, "loss": 1.4681, "num_input_tokens_seen": 182796928, "step": 4640 }, { "epoch": 0.22532208338128495, "grad_norm": 0.41937920451164246, "learning_rate": 4.5163902085210866e-05, "loss": 1.4463, "num_input_tokens_seen": 183201244, "step": 4650 }, { "epoch": 0.22580664700145975, "grad_norm": 0.39213669300079346, "learning_rate": 4.514068227261882e-05, "loss": 1.423, "num_input_tokens_seen": 183580392, "step": 4660 }, { "epoch": 0.22629121062163454, "grad_norm": 0.3789583742618561, "learning_rate": 4.511741284926949e-05, "loss": 1.4835, "num_input_tokens_seen": 183979192, "step": 4670 }, { "epoch": 0.22677577424180936, "grad_norm": 0.4139784872531891, "learning_rate": 4.5094093872480405e-05, "loss": 1.51, "num_input_tokens_seen": 184365440, "step": 4680 }, { "epoch": 0.22726033786198416, "grad_norm": 0.414145290851593, "learning_rate": 4.507072539969114e-05, "loss": 1.4815, "num_input_tokens_seen": 184735664, "step": 4690 }, { "epoch": 0.22774490148215898, "grad_norm": 0.4188004434108734, "learning_rate": 4.5047307488463156e-05, "loss": 1.4549, "num_input_tokens_seen": 185132600, "step": 4700 }, { "epoch": 0.22822946510233377, "grad_norm": 0.4184124767780304, "learning_rate": 4.502384019647974e-05, "loss": 1.5019, "num_input_tokens_seen": 185517396, "step": 4710 }, { "epoch": 0.2287140287225086, "grad_norm": 0.3586943745613098, "learning_rate": 4.5000323581545784e-05, "loss": 1.4141, "num_input_tokens_seen": 185908324, "step": 4720 }, { "epoch": 0.2291985923426834, "grad_norm": 0.40754491090774536, "learning_rate": 4.497675770158768e-05, "loss": 1.4299, "num_input_tokens_seen": 186287004, "step": 4730 }, { "epoch": 0.2296831559628582, "grad_norm": 0.38670873641967773, "learning_rate": 4.4953142614653175e-05, "loss": 1.4442, "num_input_tokens_seen": 186713356, "step": 4740 }, { "epoch": 0.230167719583033, "grad_norm": 0.40814658999443054, "learning_rate": 4.4929478378911214e-05, "loss": 1.5235, "num_input_tokens_seen": 187111252, "step": 4750 }, { "epoch": 0.2306522832032078, "grad_norm": 0.4206823408603668, "learning_rate": 4.490576505265182e-05, "loss": 1.4814, "num_input_tokens_seen": 187499448, "step": 4760 }, { "epoch": 0.23113684682338262, "grad_norm": 0.5019489526748657, "learning_rate": 4.488200269428592e-05, "loss": 1.424, "num_input_tokens_seen": 187884064, "step": 4770 }, { "epoch": 0.2316214104435574, "grad_norm": 0.39812424778938293, "learning_rate": 4.4858191362345224e-05, "loss": 1.4475, "num_input_tokens_seen": 188262996, "step": 4780 }, { "epoch": 0.23210597406373223, "grad_norm": 0.43520602583885193, "learning_rate": 4.483433111548208e-05, "loss": 1.4525, "num_input_tokens_seen": 188660764, "step": 4790 }, { "epoch": 0.23259053768390703, "grad_norm": 0.40959790349006653, "learning_rate": 4.4810422012469315e-05, "loss": 1.4299, "num_input_tokens_seen": 189046960, "step": 4800 }, { "epoch": 0.23307510130408185, "grad_norm": 0.42006418108940125, "learning_rate": 4.478646411220011e-05, "loss": 1.5022, "num_input_tokens_seen": 189438452, "step": 4810 }, { "epoch": 0.23355966492425664, "grad_norm": 0.410160094499588, "learning_rate": 4.476245747368783e-05, "loss": 1.4756, "num_input_tokens_seen": 189823928, "step": 4820 }, { "epoch": 0.23404422854443147, "grad_norm": 0.4331664741039276, "learning_rate": 4.473840215606589e-05, "loss": 1.4821, "num_input_tokens_seen": 190244028, "step": 4830 }, { "epoch": 0.23452879216460626, "grad_norm": 0.4130837619304657, "learning_rate": 4.4714298218587654e-05, "loss": 1.4746, "num_input_tokens_seen": 190647640, "step": 4840 }, { "epoch": 0.23501335578478108, "grad_norm": 0.392727792263031, "learning_rate": 4.469014572062618e-05, "loss": 1.4636, "num_input_tokens_seen": 191073988, "step": 4850 }, { "epoch": 0.23549791940495587, "grad_norm": 0.4510425627231598, "learning_rate": 4.466594472167419e-05, "loss": 1.4326, "num_input_tokens_seen": 191476524, "step": 4860 }, { "epoch": 0.23598248302513067, "grad_norm": 0.40726661682128906, "learning_rate": 4.4641695281343867e-05, "loss": 1.4472, "num_input_tokens_seen": 191852232, "step": 4870 }, { "epoch": 0.2364670466453055, "grad_norm": 0.46587809920310974, "learning_rate": 4.461739745936669e-05, "loss": 1.4811, "num_input_tokens_seen": 192231868, "step": 4880 }, { "epoch": 0.23695161026548028, "grad_norm": 0.37719038128852844, "learning_rate": 4.459305131559334e-05, "loss": 1.4508, "num_input_tokens_seen": 192629400, "step": 4890 }, { "epoch": 0.2374361738856551, "grad_norm": 0.4082610309123993, "learning_rate": 4.4568656909993515e-05, "loss": 1.4626, "num_input_tokens_seen": 193036008, "step": 4900 }, { "epoch": 0.2379207375058299, "grad_norm": 0.4302626848220825, "learning_rate": 4.454421430265579e-05, "loss": 1.4371, "num_input_tokens_seen": 193442156, "step": 4910 }, { "epoch": 0.23840530112600472, "grad_norm": 0.42628800868988037, "learning_rate": 4.451972355378748e-05, "loss": 1.4682, "num_input_tokens_seen": 193831248, "step": 4920 }, { "epoch": 0.23888986474617951, "grad_norm": 0.3756248950958252, "learning_rate": 4.449518472371447e-05, "loss": 1.4266, "num_input_tokens_seen": 194205320, "step": 4930 }, { "epoch": 0.23937442836635434, "grad_norm": 0.41749587655067444, "learning_rate": 4.447059787288107e-05, "loss": 1.4397, "num_input_tokens_seen": 194598988, "step": 4940 }, { "epoch": 0.23985899198652913, "grad_norm": 0.39159849286079407, "learning_rate": 4.444596306184992e-05, "loss": 1.3895, "num_input_tokens_seen": 195006708, "step": 4950 }, { "epoch": 0.24034355560670395, "grad_norm": 0.44320693612098694, "learning_rate": 4.4421280351301744e-05, "loss": 1.4094, "num_input_tokens_seen": 195416052, "step": 4960 }, { "epoch": 0.24082811922687875, "grad_norm": 0.4559500813484192, "learning_rate": 4.439654980203527e-05, "loss": 1.4498, "num_input_tokens_seen": 195806840, "step": 4970 }, { "epoch": 0.24131268284705354, "grad_norm": 0.4020100235939026, "learning_rate": 4.437177147496709e-05, "loss": 1.4628, "num_input_tokens_seen": 196212692, "step": 4980 }, { "epoch": 0.24179724646722836, "grad_norm": 0.39954742789268494, "learning_rate": 4.434694543113145e-05, "loss": 1.4211, "num_input_tokens_seen": 196623736, "step": 4990 }, { "epoch": 0.24228181008740315, "grad_norm": 0.4263516366481781, "learning_rate": 4.4322071731680146e-05, "loss": 1.453, "num_input_tokens_seen": 197006512, "step": 5000 }, { "epoch": 0.24276637370757798, "grad_norm": 0.4143199026584625, "learning_rate": 4.429715043788235e-05, "loss": 1.4952, "num_input_tokens_seen": 197391012, "step": 5010 }, { "epoch": 0.24325093732775277, "grad_norm": 0.4607667028903961, "learning_rate": 4.427218161112449e-05, "loss": 1.4704, "num_input_tokens_seen": 197751048, "step": 5020 }, { "epoch": 0.2437355009479276, "grad_norm": 0.4273362159729004, "learning_rate": 4.4247165312910034e-05, "loss": 1.4698, "num_input_tokens_seen": 198159508, "step": 5030 }, { "epoch": 0.24422006456810239, "grad_norm": 0.39402100443840027, "learning_rate": 4.4222101604859445e-05, "loss": 1.4527, "num_input_tokens_seen": 198550724, "step": 5040 }, { "epoch": 0.2447046281882772, "grad_norm": 0.42816653847694397, "learning_rate": 4.419699054870992e-05, "loss": 1.4948, "num_input_tokens_seen": 198905612, "step": 5050 }, { "epoch": 0.245189191808452, "grad_norm": 0.4140118360519409, "learning_rate": 4.417183220631528e-05, "loss": 1.4628, "num_input_tokens_seen": 199308252, "step": 5060 }, { "epoch": 0.2456737554286268, "grad_norm": 0.3967956304550171, "learning_rate": 4.4146626639645874e-05, "loss": 1.4542, "num_input_tokens_seen": 199703748, "step": 5070 }, { "epoch": 0.24615831904880162, "grad_norm": 0.4145415425300598, "learning_rate": 4.412137391078832e-05, "loss": 1.4535, "num_input_tokens_seen": 200118988, "step": 5080 }, { "epoch": 0.2466428826689764, "grad_norm": 0.39303335547447205, "learning_rate": 4.4096074081945425e-05, "loss": 1.4249, "num_input_tokens_seen": 200527544, "step": 5090 }, { "epoch": 0.24712744628915123, "grad_norm": 0.4353494942188263, "learning_rate": 4.4070727215436025e-05, "loss": 1.4521, "num_input_tokens_seen": 200930660, "step": 5100 }, { "epoch": 0.24761200990932603, "grad_norm": 0.4420648515224457, "learning_rate": 4.4045333373694795e-05, "loss": 1.4591, "num_input_tokens_seen": 201323040, "step": 5110 }, { "epoch": 0.24809657352950085, "grad_norm": 0.40382125973701477, "learning_rate": 4.4019892619272144e-05, "loss": 1.4383, "num_input_tokens_seen": 201734800, "step": 5120 }, { "epoch": 0.24858113714967564, "grad_norm": 0.4114893078804016, "learning_rate": 4.399440501483403e-05, "loss": 1.415, "num_input_tokens_seen": 202108596, "step": 5130 }, { "epoch": 0.24906570076985046, "grad_norm": 0.4096544086933136, "learning_rate": 4.3968870623161804e-05, "loss": 1.4244, "num_input_tokens_seen": 202487992, "step": 5140 }, { "epoch": 0.24955026439002526, "grad_norm": 0.40928128361701965, "learning_rate": 4.3943289507152066e-05, "loss": 1.3944, "num_input_tokens_seen": 202886772, "step": 5150 }, { "epoch": 0.25003482801020005, "grad_norm": 0.4127601087093353, "learning_rate": 4.391766172981653e-05, "loss": 1.4302, "num_input_tokens_seen": 203310128, "step": 5160 }, { "epoch": 0.25051939163037484, "grad_norm": 0.4010636508464813, "learning_rate": 4.38919873542818e-05, "loss": 1.4494, "num_input_tokens_seen": 203716500, "step": 5170 }, { "epoch": 0.2510039552505497, "grad_norm": 0.479033499956131, "learning_rate": 4.38662664437893e-05, "loss": 1.466, "num_input_tokens_seen": 204104804, "step": 5180 }, { "epoch": 0.2514885188707245, "grad_norm": 0.4351840317249298, "learning_rate": 4.384049906169509e-05, "loss": 1.4538, "num_input_tokens_seen": 204478748, "step": 5190 }, { "epoch": 0.2519730824908993, "grad_norm": 0.45871713757514954, "learning_rate": 4.381468527146965e-05, "loss": 1.4958, "num_input_tokens_seen": 204854344, "step": 5200 }, { "epoch": 0.2524576461110741, "grad_norm": 0.42551910877227783, "learning_rate": 4.378882513669782e-05, "loss": 1.4588, "num_input_tokens_seen": 205245060, "step": 5210 }, { "epoch": 0.2529422097312489, "grad_norm": 0.39652207493782043, "learning_rate": 4.376291872107856e-05, "loss": 1.4118, "num_input_tokens_seen": 205643280, "step": 5220 }, { "epoch": 0.2534267733514237, "grad_norm": 0.4601559042930603, "learning_rate": 4.373696608842486e-05, "loss": 1.4046, "num_input_tokens_seen": 206032812, "step": 5230 }, { "epoch": 0.2539113369715985, "grad_norm": 0.4585094153881073, "learning_rate": 4.371096730266354e-05, "loss": 1.4753, "num_input_tokens_seen": 206405924, "step": 5240 }, { "epoch": 0.2543959005917733, "grad_norm": 0.39976418018341064, "learning_rate": 4.3684922427835094e-05, "loss": 1.5263, "num_input_tokens_seen": 206774840, "step": 5250 }, { "epoch": 0.25488046421194815, "grad_norm": 0.41982370615005493, "learning_rate": 4.365883152809356e-05, "loss": 1.4208, "num_input_tokens_seen": 207172068, "step": 5260 }, { "epoch": 0.25536502783212295, "grad_norm": 0.4455938935279846, "learning_rate": 4.3632694667706345e-05, "loss": 1.4489, "num_input_tokens_seen": 207533668, "step": 5270 }, { "epoch": 0.25584959145229774, "grad_norm": 0.46062058210372925, "learning_rate": 4.360651191105405e-05, "loss": 1.4252, "num_input_tokens_seen": 207962000, "step": 5280 }, { "epoch": 0.25633415507247254, "grad_norm": 0.46039023995399475, "learning_rate": 4.358028332263034e-05, "loss": 1.4279, "num_input_tokens_seen": 208396128, "step": 5290 }, { "epoch": 0.25681871869264733, "grad_norm": 0.4030589759349823, "learning_rate": 4.355400896704177e-05, "loss": 1.4416, "num_input_tokens_seen": 208771020, "step": 5300 }, { "epoch": 0.2573032823128222, "grad_norm": 0.46707648038864136, "learning_rate": 4.3527688909007645e-05, "loss": 1.4595, "num_input_tokens_seen": 209175816, "step": 5310 }, { "epoch": 0.257787845932997, "grad_norm": 0.39080390334129333, "learning_rate": 4.350132321335982e-05, "loss": 1.4442, "num_input_tokens_seen": 209573164, "step": 5320 }, { "epoch": 0.25827240955317177, "grad_norm": 0.3796825706958771, "learning_rate": 4.347491194504257e-05, "loss": 1.3894, "num_input_tokens_seen": 209973236, "step": 5330 }, { "epoch": 0.25875697317334656, "grad_norm": 0.413033127784729, "learning_rate": 4.344845516911244e-05, "loss": 1.4657, "num_input_tokens_seen": 210345012, "step": 5340 }, { "epoch": 0.2592415367935214, "grad_norm": 0.4229408800601959, "learning_rate": 4.342195295073806e-05, "loss": 1.4725, "num_input_tokens_seen": 210769268, "step": 5350 }, { "epoch": 0.2597261004136962, "grad_norm": 0.39224693179130554, "learning_rate": 4.339540535519999e-05, "loss": 1.4123, "num_input_tokens_seen": 211153696, "step": 5360 }, { "epoch": 0.260210664033871, "grad_norm": 0.4225060045719147, "learning_rate": 4.3368812447890575e-05, "loss": 1.4308, "num_input_tokens_seen": 211552748, "step": 5370 }, { "epoch": 0.2606952276540458, "grad_norm": 0.43410396575927734, "learning_rate": 4.334217429431376e-05, "loss": 1.4755, "num_input_tokens_seen": 211943156, "step": 5380 }, { "epoch": 0.2611797912742206, "grad_norm": 0.3979720175266266, "learning_rate": 4.331549096008496e-05, "loss": 1.4245, "num_input_tokens_seen": 212315716, "step": 5390 }, { "epoch": 0.26166435489439543, "grad_norm": 0.3867861032485962, "learning_rate": 4.328876251093086e-05, "loss": 1.4195, "num_input_tokens_seen": 212702260, "step": 5400 }, { "epoch": 0.26214891851457023, "grad_norm": 0.4254813492298126, "learning_rate": 4.326198901268926e-05, "loss": 1.4395, "num_input_tokens_seen": 213112036, "step": 5410 }, { "epoch": 0.262633482134745, "grad_norm": 0.37997907400131226, "learning_rate": 4.323517053130898e-05, "loss": 1.4331, "num_input_tokens_seen": 213509980, "step": 5420 }, { "epoch": 0.2631180457549198, "grad_norm": 0.4054989218711853, "learning_rate": 4.320830713284958e-05, "loss": 1.4607, "num_input_tokens_seen": 213898376, "step": 5430 }, { "epoch": 0.26360260937509467, "grad_norm": 0.5132128596305847, "learning_rate": 4.3181398883481304e-05, "loss": 1.4274, "num_input_tokens_seen": 214290036, "step": 5440 }, { "epoch": 0.26408717299526946, "grad_norm": 0.40065979957580566, "learning_rate": 4.315444584948485e-05, "loss": 1.4295, "num_input_tokens_seen": 214675368, "step": 5450 }, { "epoch": 0.26457173661544425, "grad_norm": 0.39053869247436523, "learning_rate": 4.3127448097251235e-05, "loss": 1.4356, "num_input_tokens_seen": 215066532, "step": 5460 }, { "epoch": 0.26505630023561905, "grad_norm": 0.413308709859848, "learning_rate": 4.310040569328164e-05, "loss": 1.4318, "num_input_tokens_seen": 215432972, "step": 5470 }, { "epoch": 0.26554086385579384, "grad_norm": 0.38464999198913574, "learning_rate": 4.3073318704187206e-05, "loss": 1.4088, "num_input_tokens_seen": 215839824, "step": 5480 }, { "epoch": 0.2660254274759687, "grad_norm": 0.39387422800064087, "learning_rate": 4.3046187196688923e-05, "loss": 1.446, "num_input_tokens_seen": 216225156, "step": 5490 }, { "epoch": 0.2665099910961435, "grad_norm": 0.3859797716140747, "learning_rate": 4.3019011237617434e-05, "loss": 1.4121, "num_input_tokens_seen": 216625012, "step": 5500 }, { "epoch": 0.2669945547163183, "grad_norm": 0.41637757420539856, "learning_rate": 4.2991790893912856e-05, "loss": 1.4053, "num_input_tokens_seen": 217002304, "step": 5510 }, { "epoch": 0.26747911833649307, "grad_norm": 0.4308488368988037, "learning_rate": 4.296452623262465e-05, "loss": 1.4176, "num_input_tokens_seen": 217381264, "step": 5520 }, { "epoch": 0.2679636819566679, "grad_norm": 0.42404836416244507, "learning_rate": 4.293721732091145e-05, "loss": 1.4504, "num_input_tokens_seen": 217798076, "step": 5530 }, { "epoch": 0.2684482455768427, "grad_norm": 0.42752760648727417, "learning_rate": 4.290986422604087e-05, "loss": 1.4442, "num_input_tokens_seen": 218198560, "step": 5540 }, { "epoch": 0.2689328091970175, "grad_norm": 0.4021061956882477, "learning_rate": 4.288246701538936e-05, "loss": 1.4238, "num_input_tokens_seen": 218581244, "step": 5550 }, { "epoch": 0.2694173728171923, "grad_norm": 0.3673078119754791, "learning_rate": 4.285502575644206e-05, "loss": 1.4188, "num_input_tokens_seen": 218957676, "step": 5560 }, { "epoch": 0.2699019364373671, "grad_norm": 0.40478530526161194, "learning_rate": 4.282754051679256e-05, "loss": 1.4707, "num_input_tokens_seen": 219324792, "step": 5570 }, { "epoch": 0.27038650005754195, "grad_norm": 0.41956064105033875, "learning_rate": 4.280001136414283e-05, "loss": 1.4583, "num_input_tokens_seen": 219696608, "step": 5580 }, { "epoch": 0.27087106367771674, "grad_norm": 0.4060947597026825, "learning_rate": 4.2772438366303004e-05, "loss": 1.4013, "num_input_tokens_seen": 220080788, "step": 5590 }, { "epoch": 0.27135562729789153, "grad_norm": 0.3806227445602417, "learning_rate": 4.274482159119119e-05, "loss": 1.4192, "num_input_tokens_seen": 220493132, "step": 5600 }, { "epoch": 0.2718401909180663, "grad_norm": 0.3854431211948395, "learning_rate": 4.2717161106833336e-05, "loss": 1.4181, "num_input_tokens_seen": 220880964, "step": 5610 }, { "epoch": 0.2723247545382412, "grad_norm": 0.40384823083877563, "learning_rate": 4.2689456981363074e-05, "loss": 1.4363, "num_input_tokens_seen": 221269648, "step": 5620 }, { "epoch": 0.27280931815841597, "grad_norm": 0.3799000680446625, "learning_rate": 4.2661709283021514e-05, "loss": 1.4415, "num_input_tokens_seen": 221654464, "step": 5630 }, { "epoch": 0.27329388177859076, "grad_norm": 0.4527069628238678, "learning_rate": 4.26339180801571e-05, "loss": 1.4193, "num_input_tokens_seen": 222030672, "step": 5640 }, { "epoch": 0.27377844539876556, "grad_norm": 0.41296154260635376, "learning_rate": 4.260608344122544e-05, "loss": 1.3981, "num_input_tokens_seen": 222409816, "step": 5650 }, { "epoch": 0.2742630090189404, "grad_norm": 0.37670767307281494, "learning_rate": 4.257820543478913e-05, "loss": 1.4171, "num_input_tokens_seen": 222803080, "step": 5660 }, { "epoch": 0.2747475726391152, "grad_norm": 0.3829585611820221, "learning_rate": 4.255028412951761e-05, "loss": 1.4174, "num_input_tokens_seen": 223208420, "step": 5670 }, { "epoch": 0.27523213625929, "grad_norm": 0.41849249601364136, "learning_rate": 4.2522319594186934e-05, "loss": 1.4399, "num_input_tokens_seen": 223609392, "step": 5680 }, { "epoch": 0.2757166998794648, "grad_norm": 0.39865440130233765, "learning_rate": 4.2494311897679664e-05, "loss": 1.4429, "num_input_tokens_seen": 223997220, "step": 5690 }, { "epoch": 0.2762012634996396, "grad_norm": 0.43166518211364746, "learning_rate": 4.246626110898469e-05, "loss": 1.4143, "num_input_tokens_seen": 224395400, "step": 5700 }, { "epoch": 0.27668582711981443, "grad_norm": 0.41193270683288574, "learning_rate": 4.2438167297197027e-05, "loss": 1.4019, "num_input_tokens_seen": 224779756, "step": 5710 }, { "epoch": 0.2771703907399892, "grad_norm": 0.38215872645378113, "learning_rate": 4.2410030531517665e-05, "loss": 1.4117, "num_input_tokens_seen": 225155936, "step": 5720 }, { "epoch": 0.277654954360164, "grad_norm": 0.3913954794406891, "learning_rate": 4.2381850881253415e-05, "loss": 1.4038, "num_input_tokens_seen": 225541516, "step": 5730 }, { "epoch": 0.2781395179803388, "grad_norm": 0.4006546139717102, "learning_rate": 4.23536284158167e-05, "loss": 1.4241, "num_input_tokens_seen": 225902060, "step": 5740 }, { "epoch": 0.27862408160051366, "grad_norm": 0.39373400807380676, "learning_rate": 4.232536320472543e-05, "loss": 1.422, "num_input_tokens_seen": 226304908, "step": 5750 }, { "epoch": 0.27910864522068846, "grad_norm": 0.40954354405403137, "learning_rate": 4.2297055317602785e-05, "loss": 1.4074, "num_input_tokens_seen": 226677752, "step": 5760 }, { "epoch": 0.27959320884086325, "grad_norm": 0.41480016708374023, "learning_rate": 4.226870482417707e-05, "loss": 1.4142, "num_input_tokens_seen": 227050196, "step": 5770 }, { "epoch": 0.28007777246103804, "grad_norm": 0.3755822479724884, "learning_rate": 4.2240311794281564e-05, "loss": 1.4397, "num_input_tokens_seen": 227446460, "step": 5780 }, { "epoch": 0.28056233608121284, "grad_norm": 0.44134095311164856, "learning_rate": 4.221187629785428e-05, "loss": 1.4487, "num_input_tokens_seen": 227831552, "step": 5790 }, { "epoch": 0.2810468997013877, "grad_norm": 0.4083091616630554, "learning_rate": 4.218339840493786e-05, "loss": 1.414, "num_input_tokens_seen": 228221528, "step": 5800 }, { "epoch": 0.2815314633215625, "grad_norm": 0.4252268671989441, "learning_rate": 4.215487818567937e-05, "loss": 1.3964, "num_input_tokens_seen": 228590960, "step": 5810 }, { "epoch": 0.2820160269417373, "grad_norm": 0.3926514983177185, "learning_rate": 4.212631571033015e-05, "loss": 1.3921, "num_input_tokens_seen": 228975244, "step": 5820 }, { "epoch": 0.28250059056191207, "grad_norm": 0.41333383321762085, "learning_rate": 4.20977110492456e-05, "loss": 1.4569, "num_input_tokens_seen": 229370872, "step": 5830 }, { "epoch": 0.2829851541820869, "grad_norm": 0.41047653555870056, "learning_rate": 4.206906427288506e-05, "loss": 1.465, "num_input_tokens_seen": 229788924, "step": 5840 }, { "epoch": 0.2834697178022617, "grad_norm": 0.4507644474506378, "learning_rate": 4.204037545181158e-05, "loss": 1.4222, "num_input_tokens_seen": 230192956, "step": 5850 }, { "epoch": 0.2839542814224365, "grad_norm": 0.4235022962093353, "learning_rate": 4.201164465669179e-05, "loss": 1.4538, "num_input_tokens_seen": 230540380, "step": 5860 }, { "epoch": 0.2844388450426113, "grad_norm": 0.3815031051635742, "learning_rate": 4.1982871958295734e-05, "loss": 1.44, "num_input_tokens_seen": 230931500, "step": 5870 }, { "epoch": 0.2849234086627861, "grad_norm": 0.4607156813144684, "learning_rate": 4.1954057427496615e-05, "loss": 1.3994, "num_input_tokens_seen": 231318500, "step": 5880 }, { "epoch": 0.28540797228296094, "grad_norm": 0.4481658637523651, "learning_rate": 4.192520113527075e-05, "loss": 1.4693, "num_input_tokens_seen": 231716316, "step": 5890 }, { "epoch": 0.28589253590313574, "grad_norm": 0.41151443123817444, "learning_rate": 4.1896303152697254e-05, "loss": 1.4007, "num_input_tokens_seen": 232105992, "step": 5900 }, { "epoch": 0.28637709952331053, "grad_norm": 0.3963596522808075, "learning_rate": 4.186736355095798e-05, "loss": 1.3917, "num_input_tokens_seen": 232505400, "step": 5910 }, { "epoch": 0.2868616631434853, "grad_norm": 0.42216241359710693, "learning_rate": 4.183838240133728e-05, "loss": 1.43, "num_input_tokens_seen": 232922304, "step": 5920 }, { "epoch": 0.2873462267636602, "grad_norm": 0.43040603399276733, "learning_rate": 4.1809359775221854e-05, "loss": 1.4388, "num_input_tokens_seen": 233313888, "step": 5930 }, { "epoch": 0.28783079038383497, "grad_norm": 0.4640505611896515, "learning_rate": 4.178029574410056e-05, "loss": 1.4517, "num_input_tokens_seen": 233693256, "step": 5940 }, { "epoch": 0.28831535400400976, "grad_norm": 0.40719953179359436, "learning_rate": 4.175119037956425e-05, "loss": 1.4023, "num_input_tokens_seen": 234072408, "step": 5950 }, { "epoch": 0.28879991762418455, "grad_norm": 0.3908650279045105, "learning_rate": 4.17220437533056e-05, "loss": 1.4172, "num_input_tokens_seen": 234464800, "step": 5960 }, { "epoch": 0.2892844812443594, "grad_norm": 0.4344137907028198, "learning_rate": 4.169285593711889e-05, "loss": 1.394, "num_input_tokens_seen": 234870688, "step": 5970 }, { "epoch": 0.2897690448645342, "grad_norm": 0.4607277512550354, "learning_rate": 4.16636270028999e-05, "loss": 1.428, "num_input_tokens_seen": 235291024, "step": 5980 }, { "epoch": 0.290253608484709, "grad_norm": 0.4087435007095337, "learning_rate": 4.163435702264567e-05, "loss": 1.4582, "num_input_tokens_seen": 235698736, "step": 5990 }, { "epoch": 0.2907381721048838, "grad_norm": 0.4173283576965332, "learning_rate": 4.160504606845432e-05, "loss": 1.4224, "num_input_tokens_seen": 236103764, "step": 6000 }, { "epoch": 0.2907381721048838, "eval_loss": 1.5239336490631104, "eval_runtime": 4.4295, "eval_samples_per_second": 33.864, "eval_steps_per_second": 4.289, "num_input_tokens_seen": 236103764, "step": 6000 }, { "epoch": 0.2912227357250586, "grad_norm": 0.42691048979759216, "learning_rate": 4.157569421252496e-05, "loss": 1.4339, "num_input_tokens_seen": 236508456, "step": 6010 }, { "epoch": 0.29170729934523343, "grad_norm": 0.4078880548477173, "learning_rate": 4.15463015271574e-05, "loss": 1.4053, "num_input_tokens_seen": 236915292, "step": 6020 }, { "epoch": 0.2921918629654082, "grad_norm": 0.5139662623405457, "learning_rate": 4.151686808475204e-05, "loss": 1.4309, "num_input_tokens_seen": 237303592, "step": 6030 }, { "epoch": 0.292676426585583, "grad_norm": 0.40330371260643005, "learning_rate": 4.1487393957809664e-05, "loss": 1.4339, "num_input_tokens_seen": 237688980, "step": 6040 }, { "epoch": 0.2931609902057578, "grad_norm": 0.45068076252937317, "learning_rate": 4.145787921893128e-05, "loss": 1.4589, "num_input_tokens_seen": 238057980, "step": 6050 }, { "epoch": 0.29364555382593266, "grad_norm": 0.38412150740623474, "learning_rate": 4.1428323940817933e-05, "loss": 1.4576, "num_input_tokens_seen": 238451220, "step": 6060 }, { "epoch": 0.29413011744610745, "grad_norm": 0.385979026556015, "learning_rate": 4.139872819627051e-05, "loss": 1.4162, "num_input_tokens_seen": 238829748, "step": 6070 }, { "epoch": 0.29461468106628225, "grad_norm": 0.4156942069530487, "learning_rate": 4.1369092058189586e-05, "loss": 1.3992, "num_input_tokens_seen": 239203752, "step": 6080 }, { "epoch": 0.29509924468645704, "grad_norm": 0.4118397533893585, "learning_rate": 4.133941559957524e-05, "loss": 1.4029, "num_input_tokens_seen": 239590056, "step": 6090 }, { "epoch": 0.29558380830663183, "grad_norm": 0.47945645451545715, "learning_rate": 4.130969889352686e-05, "loss": 1.3959, "num_input_tokens_seen": 239985824, "step": 6100 }, { "epoch": 0.2960683719268067, "grad_norm": 0.36734700202941895, "learning_rate": 4.1279942013242966e-05, "loss": 1.3805, "num_input_tokens_seen": 240361856, "step": 6110 }, { "epoch": 0.2965529355469815, "grad_norm": 0.41420719027519226, "learning_rate": 4.125014503202106e-05, "loss": 1.3727, "num_input_tokens_seen": 240780880, "step": 6120 }, { "epoch": 0.29703749916715627, "grad_norm": 0.43215250968933105, "learning_rate": 4.122030802325738e-05, "loss": 1.4033, "num_input_tokens_seen": 241189188, "step": 6130 }, { "epoch": 0.29752206278733107, "grad_norm": 0.42517751455307007, "learning_rate": 4.119043106044681e-05, "loss": 1.4514, "num_input_tokens_seen": 241610008, "step": 6140 }, { "epoch": 0.2980066264075059, "grad_norm": 0.38953685760498047, "learning_rate": 4.116051421718261e-05, "loss": 1.4399, "num_input_tokens_seen": 242006608, "step": 6150 }, { "epoch": 0.2984911900276807, "grad_norm": 0.4071796238422394, "learning_rate": 4.113055756715628e-05, "loss": 1.3734, "num_input_tokens_seen": 242416040, "step": 6160 }, { "epoch": 0.2989757536478555, "grad_norm": 0.42148080468177795, "learning_rate": 4.11005611841574e-05, "loss": 1.4508, "num_input_tokens_seen": 242794628, "step": 6170 }, { "epoch": 0.2994603172680303, "grad_norm": 0.4180959463119507, "learning_rate": 4.107052514207339e-05, "loss": 1.4159, "num_input_tokens_seen": 243164260, "step": 6180 }, { "epoch": 0.2999448808882051, "grad_norm": 0.36293745040893555, "learning_rate": 4.1040449514889375e-05, "loss": 1.3633, "num_input_tokens_seen": 243550040, "step": 6190 }, { "epoch": 0.30042944450837994, "grad_norm": 0.4095574617385864, "learning_rate": 4.1010334376687975e-05, "loss": 1.4784, "num_input_tokens_seen": 243908832, "step": 6200 }, { "epoch": 0.30091400812855473, "grad_norm": 0.4131157100200653, "learning_rate": 4.0980179801649146e-05, "loss": 1.4184, "num_input_tokens_seen": 244298944, "step": 6210 }, { "epoch": 0.3013985717487295, "grad_norm": 0.39260920882225037, "learning_rate": 4.094998586404998e-05, "loss": 1.4639, "num_input_tokens_seen": 244692220, "step": 6220 }, { "epoch": 0.3018831353689043, "grad_norm": 0.43091267347335815, "learning_rate": 4.0919752638264516e-05, "loss": 1.4488, "num_input_tokens_seen": 245050952, "step": 6230 }, { "epoch": 0.30236769898907917, "grad_norm": 0.42741450667381287, "learning_rate": 4.088948019876359e-05, "loss": 1.4434, "num_input_tokens_seen": 245465356, "step": 6240 }, { "epoch": 0.30285226260925396, "grad_norm": 0.4109672009944916, "learning_rate": 4.085916862011463e-05, "loss": 1.4374, "num_input_tokens_seen": 245880668, "step": 6250 }, { "epoch": 0.30333682622942876, "grad_norm": 0.4069707691669464, "learning_rate": 4.082881797698143e-05, "loss": 1.3555, "num_input_tokens_seen": 246284348, "step": 6260 }, { "epoch": 0.30382138984960355, "grad_norm": 0.4159133732318878, "learning_rate": 4.0798428344124064e-05, "loss": 1.3921, "num_input_tokens_seen": 246662396, "step": 6270 }, { "epoch": 0.30430595346977835, "grad_norm": 0.42207905650138855, "learning_rate": 4.07679997963986e-05, "loss": 1.3996, "num_input_tokens_seen": 247049844, "step": 6280 }, { "epoch": 0.3047905170899532, "grad_norm": 0.3645390272140503, "learning_rate": 4.0737532408757014e-05, "loss": 1.4054, "num_input_tokens_seen": 247441300, "step": 6290 }, { "epoch": 0.305275080710128, "grad_norm": 0.41653063893318176, "learning_rate": 4.0707026256246894e-05, "loss": 1.3984, "num_input_tokens_seen": 247851160, "step": 6300 }, { "epoch": 0.3057596443303028, "grad_norm": 0.41180601716041565, "learning_rate": 4.0676481414011345e-05, "loss": 1.3664, "num_input_tokens_seen": 248245884, "step": 6310 }, { "epoch": 0.3062442079504776, "grad_norm": 0.4273717403411865, "learning_rate": 4.064589795728878e-05, "loss": 1.4068, "num_input_tokens_seen": 248649064, "step": 6320 }, { "epoch": 0.3067287715706524, "grad_norm": 0.3938922882080078, "learning_rate": 4.06152759614127e-05, "loss": 1.4184, "num_input_tokens_seen": 249030280, "step": 6330 }, { "epoch": 0.3072133351908272, "grad_norm": 0.4099946916103363, "learning_rate": 4.0584615501811577e-05, "loss": 1.3773, "num_input_tokens_seen": 249430164, "step": 6340 }, { "epoch": 0.307697898811002, "grad_norm": 0.4208473265171051, "learning_rate": 4.055391665400858e-05, "loss": 1.3534, "num_input_tokens_seen": 249806996, "step": 6350 }, { "epoch": 0.3081824624311768, "grad_norm": 0.4177727997303009, "learning_rate": 4.052317949362147e-05, "loss": 1.3888, "num_input_tokens_seen": 250196904, "step": 6360 }, { "epoch": 0.30866702605135166, "grad_norm": 0.4083334505558014, "learning_rate": 4.049240409636237e-05, "loss": 1.3849, "num_input_tokens_seen": 250581560, "step": 6370 }, { "epoch": 0.30915158967152645, "grad_norm": 0.4475148022174835, "learning_rate": 4.046159053803758e-05, "loss": 1.3855, "num_input_tokens_seen": 250966948, "step": 6380 }, { "epoch": 0.30963615329170124, "grad_norm": 0.39256733655929565, "learning_rate": 4.0430738894547426e-05, "loss": 1.4499, "num_input_tokens_seen": 251353488, "step": 6390 }, { "epoch": 0.31012071691187604, "grad_norm": 0.3913814127445221, "learning_rate": 4.0399849241886e-05, "loss": 1.3994, "num_input_tokens_seen": 251742612, "step": 6400 }, { "epoch": 0.31060528053205083, "grad_norm": 0.41286328434944153, "learning_rate": 4.0368921656141065e-05, "loss": 1.3918, "num_input_tokens_seen": 252141148, "step": 6410 }, { "epoch": 0.3110898441522257, "grad_norm": 0.39640408754348755, "learning_rate": 4.03379562134938e-05, "loss": 1.3757, "num_input_tokens_seen": 252532656, "step": 6420 }, { "epoch": 0.3115744077724005, "grad_norm": 0.40248820185661316, "learning_rate": 4.030695299021863e-05, "loss": 1.3941, "num_input_tokens_seen": 252899616, "step": 6430 }, { "epoch": 0.31205897139257527, "grad_norm": 0.3976843059062958, "learning_rate": 4.027591206268304e-05, "loss": 1.3953, "num_input_tokens_seen": 253271420, "step": 6440 }, { "epoch": 0.31254353501275006, "grad_norm": 0.42651301622390747, "learning_rate": 4.02448335073474e-05, "loss": 1.4118, "num_input_tokens_seen": 253688464, "step": 6450 }, { "epoch": 0.3130280986329249, "grad_norm": 0.4326588213443756, "learning_rate": 4.0213717400764766e-05, "loss": 1.3611, "num_input_tokens_seen": 254091036, "step": 6460 }, { "epoch": 0.3135126622530997, "grad_norm": 0.43498626351356506, "learning_rate": 4.018256381958068e-05, "loss": 1.3872, "num_input_tokens_seen": 254494504, "step": 6470 }, { "epoch": 0.3139972258732745, "grad_norm": 0.3556174039840698, "learning_rate": 4.0151372840533e-05, "loss": 1.4149, "num_input_tokens_seen": 254877836, "step": 6480 }, { "epoch": 0.3144817894934493, "grad_norm": 0.40494486689567566, "learning_rate": 4.0120144540451706e-05, "loss": 1.4471, "num_input_tokens_seen": 255276796, "step": 6490 }, { "epoch": 0.3149663531136241, "grad_norm": 0.42257654666900635, "learning_rate": 4.008887899625868e-05, "loss": 1.3782, "num_input_tokens_seen": 255671500, "step": 6500 }, { "epoch": 0.31545091673379894, "grad_norm": 0.4325736165046692, "learning_rate": 4.005757628496759e-05, "loss": 1.4032, "num_input_tokens_seen": 256064776, "step": 6510 }, { "epoch": 0.31593548035397373, "grad_norm": 0.4220854341983795, "learning_rate": 4.002623648368361e-05, "loss": 1.3634, "num_input_tokens_seen": 256461304, "step": 6520 }, { "epoch": 0.3164200439741485, "grad_norm": 0.38566797971725464, "learning_rate": 3.9994859669603316e-05, "loss": 1.3689, "num_input_tokens_seen": 256882232, "step": 6530 }, { "epoch": 0.3169046075943233, "grad_norm": 0.38369229435920715, "learning_rate": 3.99634459200144e-05, "loss": 1.4368, "num_input_tokens_seen": 257264380, "step": 6540 }, { "epoch": 0.31738917121449817, "grad_norm": 0.432253897190094, "learning_rate": 3.9931995312295596e-05, "loss": 1.4141, "num_input_tokens_seen": 257641472, "step": 6550 }, { "epoch": 0.31787373483467296, "grad_norm": 0.3968162536621094, "learning_rate": 3.9900507923916394e-05, "loss": 1.428, "num_input_tokens_seen": 258040272, "step": 6560 }, { "epoch": 0.31835829845484775, "grad_norm": 0.44724634289741516, "learning_rate": 3.9868983832436876e-05, "loss": 1.402, "num_input_tokens_seen": 258430376, "step": 6570 }, { "epoch": 0.31884286207502255, "grad_norm": 0.39584001898765564, "learning_rate": 3.983742311550755e-05, "loss": 1.3927, "num_input_tokens_seen": 258847780, "step": 6580 }, { "epoch": 0.31932742569519734, "grad_norm": 0.446869432926178, "learning_rate": 3.9805825850869125e-05, "loss": 1.4192, "num_input_tokens_seen": 259228264, "step": 6590 }, { "epoch": 0.3198119893153722, "grad_norm": 0.3873324394226074, "learning_rate": 3.977419211635235e-05, "loss": 1.37, "num_input_tokens_seen": 259636616, "step": 6600 }, { "epoch": 0.320296552935547, "grad_norm": 0.4095400869846344, "learning_rate": 3.9742521989877795e-05, "loss": 1.3664, "num_input_tokens_seen": 260029232, "step": 6610 }, { "epoch": 0.3207811165557218, "grad_norm": 0.3852662146091461, "learning_rate": 3.971081554945568e-05, "loss": 1.3546, "num_input_tokens_seen": 260456224, "step": 6620 }, { "epoch": 0.3212656801758966, "grad_norm": 0.3864857256412506, "learning_rate": 3.967907287318566e-05, "loss": 1.3756, "num_input_tokens_seen": 260837824, "step": 6630 }, { "epoch": 0.3217502437960714, "grad_norm": 0.3785705268383026, "learning_rate": 3.964729403925666e-05, "loss": 1.4206, "num_input_tokens_seen": 261235928, "step": 6640 }, { "epoch": 0.3222348074162462, "grad_norm": 0.37969115376472473, "learning_rate": 3.961547912594667e-05, "loss": 1.406, "num_input_tokens_seen": 261622640, "step": 6650 }, { "epoch": 0.322719371036421, "grad_norm": 0.40466952323913574, "learning_rate": 3.958362821162254e-05, "loss": 1.393, "num_input_tokens_seen": 262061520, "step": 6660 }, { "epoch": 0.3232039346565958, "grad_norm": 0.40950527787208557, "learning_rate": 3.955174137473979e-05, "loss": 1.4314, "num_input_tokens_seen": 262449712, "step": 6670 }, { "epoch": 0.32368849827677065, "grad_norm": 0.4362260699272156, "learning_rate": 3.951981869384247e-05, "loss": 1.4466, "num_input_tokens_seen": 262842776, "step": 6680 }, { "epoch": 0.32417306189694545, "grad_norm": 0.4015698730945587, "learning_rate": 3.948786024756287e-05, "loss": 1.4056, "num_input_tokens_seen": 263186520, "step": 6690 }, { "epoch": 0.32465762551712024, "grad_norm": 0.3874407112598419, "learning_rate": 3.9455866114621396e-05, "loss": 1.379, "num_input_tokens_seen": 263570516, "step": 6700 }, { "epoch": 0.32514218913729503, "grad_norm": 0.3963698744773865, "learning_rate": 3.9423836373826375e-05, "loss": 1.4146, "num_input_tokens_seen": 263984720, "step": 6710 }, { "epoch": 0.32562675275746983, "grad_norm": 0.38951539993286133, "learning_rate": 3.9391771104073805e-05, "loss": 1.3588, "num_input_tokens_seen": 264376356, "step": 6720 }, { "epoch": 0.3261113163776447, "grad_norm": 0.4249687194824219, "learning_rate": 3.9359670384347244e-05, "loss": 1.4055, "num_input_tokens_seen": 264737708, "step": 6730 }, { "epoch": 0.32659587999781947, "grad_norm": 0.38605302572250366, "learning_rate": 3.9327534293717537e-05, "loss": 1.3578, "num_input_tokens_seen": 265138996, "step": 6740 }, { "epoch": 0.32708044361799427, "grad_norm": 0.41331490874290466, "learning_rate": 3.929536291134267e-05, "loss": 1.3902, "num_input_tokens_seen": 265509020, "step": 6750 }, { "epoch": 0.32756500723816906, "grad_norm": 0.372742623090744, "learning_rate": 3.926315631646756e-05, "loss": 1.3639, "num_input_tokens_seen": 265879672, "step": 6760 }, { "epoch": 0.3280495708583439, "grad_norm": 0.5427610874176025, "learning_rate": 3.9230914588423864e-05, "loss": 1.3371, "num_input_tokens_seen": 266275292, "step": 6770 }, { "epoch": 0.3285341344785187, "grad_norm": 0.4112084209918976, "learning_rate": 3.9198637806629756e-05, "loss": 1.4216, "num_input_tokens_seen": 266671844, "step": 6780 }, { "epoch": 0.3290186980986935, "grad_norm": 0.3875449299812317, "learning_rate": 3.916632605058978e-05, "loss": 1.3905, "num_input_tokens_seen": 267057556, "step": 6790 }, { "epoch": 0.3295032617188683, "grad_norm": 0.3956202566623688, "learning_rate": 3.913397939989461e-05, "loss": 1.3913, "num_input_tokens_seen": 267444300, "step": 6800 }, { "epoch": 0.3299878253390431, "grad_norm": 0.41076987981796265, "learning_rate": 3.910159793422091e-05, "loss": 1.4032, "num_input_tokens_seen": 267811660, "step": 6810 }, { "epoch": 0.33047238895921793, "grad_norm": 0.41515880823135376, "learning_rate": 3.9069181733331056e-05, "loss": 1.4098, "num_input_tokens_seen": 268195916, "step": 6820 }, { "epoch": 0.3309569525793927, "grad_norm": 0.5097044110298157, "learning_rate": 3.9036730877073e-05, "loss": 1.3921, "num_input_tokens_seen": 268578008, "step": 6830 }, { "epoch": 0.3314415161995675, "grad_norm": 0.43721285462379456, "learning_rate": 3.900424544538006e-05, "loss": 1.3893, "num_input_tokens_seen": 269003192, "step": 6840 }, { "epoch": 0.3319260798197423, "grad_norm": 0.42796212434768677, "learning_rate": 3.897172551827073e-05, "loss": 1.4325, "num_input_tokens_seen": 269393648, "step": 6850 }, { "epoch": 0.33241064343991716, "grad_norm": 0.36507412791252136, "learning_rate": 3.8939171175848447e-05, "loss": 1.4346, "num_input_tokens_seen": 269809016, "step": 6860 }, { "epoch": 0.33289520706009196, "grad_norm": 0.39873602986335754, "learning_rate": 3.8906582498301455e-05, "loss": 1.4412, "num_input_tokens_seen": 270180224, "step": 6870 }, { "epoch": 0.33337977068026675, "grad_norm": 0.3825407326221466, "learning_rate": 3.887395956590254e-05, "loss": 1.3762, "num_input_tokens_seen": 270581856, "step": 6880 }, { "epoch": 0.33386433430044155, "grad_norm": 0.40756458044052124, "learning_rate": 3.884130245900889e-05, "loss": 1.4066, "num_input_tokens_seen": 270962900, "step": 6890 }, { "epoch": 0.33434889792061634, "grad_norm": 0.41066256165504456, "learning_rate": 3.880861125806186e-05, "loss": 1.3903, "num_input_tokens_seen": 271374220, "step": 6900 }, { "epoch": 0.3348334615407912, "grad_norm": 0.4207150340080261, "learning_rate": 3.877588604358678e-05, "loss": 1.3709, "num_input_tokens_seen": 271765116, "step": 6910 }, { "epoch": 0.335318025160966, "grad_norm": 0.4337269067764282, "learning_rate": 3.8743126896192784e-05, "loss": 1.3251, "num_input_tokens_seen": 272147924, "step": 6920 }, { "epoch": 0.3358025887811408, "grad_norm": 0.4367486536502838, "learning_rate": 3.871033389657255e-05, "loss": 1.3525, "num_input_tokens_seen": 272511972, "step": 6930 }, { "epoch": 0.33628715240131557, "grad_norm": 0.43631139397621155, "learning_rate": 3.867750712550219e-05, "loss": 1.3855, "num_input_tokens_seen": 272876876, "step": 6940 }, { "epoch": 0.3367717160214904, "grad_norm": 0.40306615829467773, "learning_rate": 3.8644646663840976e-05, "loss": 1.3868, "num_input_tokens_seen": 273287016, "step": 6950 }, { "epoch": 0.3372562796416652, "grad_norm": 0.3835316598415375, "learning_rate": 3.861175259253117e-05, "loss": 1.3667, "num_input_tokens_seen": 273674052, "step": 6960 }, { "epoch": 0.33774084326184, "grad_norm": 0.38782796263694763, "learning_rate": 3.857882499259782e-05, "loss": 1.353, "num_input_tokens_seen": 274083796, "step": 6970 }, { "epoch": 0.3382254068820148, "grad_norm": 0.395175039768219, "learning_rate": 3.854586394514855e-05, "loss": 1.3281, "num_input_tokens_seen": 274493468, "step": 6980 }, { "epoch": 0.3387099705021896, "grad_norm": 0.45938950777053833, "learning_rate": 3.851286953137341e-05, "loss": 1.4144, "num_input_tokens_seen": 274858868, "step": 6990 }, { "epoch": 0.33919453412236444, "grad_norm": 0.40472185611724854, "learning_rate": 3.847984183254461e-05, "loss": 1.3877, "num_input_tokens_seen": 275233524, "step": 7000 }, { "epoch": 0.33967909774253924, "grad_norm": 0.4177361726760864, "learning_rate": 3.8446780930016336e-05, "loss": 1.3834, "num_input_tokens_seen": 275600504, "step": 7010 }, { "epoch": 0.34016366136271403, "grad_norm": 0.4230976700782776, "learning_rate": 3.8413686905224595e-05, "loss": 1.3601, "num_input_tokens_seen": 275991592, "step": 7020 }, { "epoch": 0.3406482249828888, "grad_norm": 0.3965121805667877, "learning_rate": 3.838055983968695e-05, "loss": 1.3801, "num_input_tokens_seen": 276355724, "step": 7030 }, { "epoch": 0.3411327886030637, "grad_norm": 0.39479291439056396, "learning_rate": 3.8347399815002385e-05, "loss": 1.3976, "num_input_tokens_seen": 276749052, "step": 7040 }, { "epoch": 0.34161735222323847, "grad_norm": 0.37649762630462646, "learning_rate": 3.8314206912851036e-05, "loss": 1.3981, "num_input_tokens_seen": 277163328, "step": 7050 }, { "epoch": 0.34210191584341326, "grad_norm": 0.389952152967453, "learning_rate": 3.828098121499404e-05, "loss": 1.3929, "num_input_tokens_seen": 277532872, "step": 7060 }, { "epoch": 0.34258647946358806, "grad_norm": 0.4221923053264618, "learning_rate": 3.82477228032733e-05, "loss": 1.3894, "num_input_tokens_seen": 277930776, "step": 7070 }, { "epoch": 0.3430710430837629, "grad_norm": 0.4203970730304718, "learning_rate": 3.821443175961134e-05, "loss": 1.3611, "num_input_tokens_seen": 278313072, "step": 7080 }, { "epoch": 0.3435556067039377, "grad_norm": 0.44928234815597534, "learning_rate": 3.818110816601101e-05, "loss": 1.4055, "num_input_tokens_seen": 278692232, "step": 7090 }, { "epoch": 0.3440401703241125, "grad_norm": 0.40566375851631165, "learning_rate": 3.814775210455538e-05, "loss": 1.3874, "num_input_tokens_seen": 279070076, "step": 7100 }, { "epoch": 0.3445247339442873, "grad_norm": 0.4111347794532776, "learning_rate": 3.811436365740748e-05, "loss": 1.4077, "num_input_tokens_seen": 279468284, "step": 7110 }, { "epoch": 0.3450092975644621, "grad_norm": 0.3757016956806183, "learning_rate": 3.808094290681011e-05, "loss": 1.3623, "num_input_tokens_seen": 279858588, "step": 7120 }, { "epoch": 0.34549386118463693, "grad_norm": 0.41814279556274414, "learning_rate": 3.8047489935085635e-05, "loss": 1.3726, "num_input_tokens_seen": 280269536, "step": 7130 }, { "epoch": 0.3459784248048117, "grad_norm": 0.3829779624938965, "learning_rate": 3.801400482463581e-05, "loss": 1.3783, "num_input_tokens_seen": 280677220, "step": 7140 }, { "epoch": 0.3464629884249865, "grad_norm": 0.40414533019065857, "learning_rate": 3.798048765794151e-05, "loss": 1.3914, "num_input_tokens_seen": 281062416, "step": 7150 }, { "epoch": 0.3469475520451613, "grad_norm": 0.4087967574596405, "learning_rate": 3.7946938517562635e-05, "loss": 1.4013, "num_input_tokens_seen": 281462400, "step": 7160 }, { "epoch": 0.34743211566533616, "grad_norm": 0.3836404085159302, "learning_rate": 3.791335748613779e-05, "loss": 1.4126, "num_input_tokens_seen": 281857064, "step": 7170 }, { "epoch": 0.34791667928551095, "grad_norm": 0.4010114073753357, "learning_rate": 3.7879744646384154e-05, "loss": 1.4138, "num_input_tokens_seen": 282243900, "step": 7180 }, { "epoch": 0.34840124290568575, "grad_norm": 0.42484843730926514, "learning_rate": 3.7846100081097255e-05, "loss": 1.3773, "num_input_tokens_seen": 282661384, "step": 7190 }, { "epoch": 0.34888580652586054, "grad_norm": 0.42864614725112915, "learning_rate": 3.7812423873150775e-05, "loss": 1.4116, "num_input_tokens_seen": 283074744, "step": 7200 }, { "epoch": 0.34937037014603534, "grad_norm": 0.4581463634967804, "learning_rate": 3.777871610549632e-05, "loss": 1.355, "num_input_tokens_seen": 283452024, "step": 7210 }, { "epoch": 0.3498549337662102, "grad_norm": 0.4017016887664795, "learning_rate": 3.774497686116327e-05, "loss": 1.393, "num_input_tokens_seen": 283843012, "step": 7220 }, { "epoch": 0.350339497386385, "grad_norm": 0.387157142162323, "learning_rate": 3.7711206223258493e-05, "loss": 1.3497, "num_input_tokens_seen": 284225640, "step": 7230 }, { "epoch": 0.3508240610065598, "grad_norm": 0.42208153009414673, "learning_rate": 3.767740427496621e-05, "loss": 1.4008, "num_input_tokens_seen": 284608708, "step": 7240 }, { "epoch": 0.35130862462673457, "grad_norm": 0.4280160367488861, "learning_rate": 3.764357109954777e-05, "loss": 1.326, "num_input_tokens_seen": 285029224, "step": 7250 }, { "epoch": 0.3517931882469094, "grad_norm": 0.39940527081489563, "learning_rate": 3.7609706780341425e-05, "loss": 1.3688, "num_input_tokens_seen": 285413332, "step": 7260 }, { "epoch": 0.3522777518670842, "grad_norm": 0.40198463201522827, "learning_rate": 3.757581140076217e-05, "loss": 1.3835, "num_input_tokens_seen": 285805028, "step": 7270 }, { "epoch": 0.352762315487259, "grad_norm": 0.4302620589733124, "learning_rate": 3.754188504430147e-05, "loss": 1.3317, "num_input_tokens_seen": 286217520, "step": 7280 }, { "epoch": 0.3532468791074338, "grad_norm": 0.41558027267456055, "learning_rate": 3.750792779452712e-05, "loss": 1.3584, "num_input_tokens_seen": 286618988, "step": 7290 }, { "epoch": 0.3537314427276086, "grad_norm": 0.4010438621044159, "learning_rate": 3.7473939735082995e-05, "loss": 1.3638, "num_input_tokens_seen": 286993540, "step": 7300 }, { "epoch": 0.35421600634778344, "grad_norm": 0.4007185995578766, "learning_rate": 3.743992094968888e-05, "loss": 1.3662, "num_input_tokens_seen": 287379072, "step": 7310 }, { "epoch": 0.35470056996795823, "grad_norm": 0.4279842674732208, "learning_rate": 3.740587152214022e-05, "loss": 1.3124, "num_input_tokens_seen": 287779160, "step": 7320 }, { "epoch": 0.35518513358813303, "grad_norm": 0.4510299563407898, "learning_rate": 3.737179153630797e-05, "loss": 1.4197, "num_input_tokens_seen": 288150168, "step": 7330 }, { "epoch": 0.3556696972083078, "grad_norm": 0.4274813234806061, "learning_rate": 3.733768107613832e-05, "loss": 1.3953, "num_input_tokens_seen": 288526312, "step": 7340 }, { "epoch": 0.35615426082848267, "grad_norm": 0.4202810525894165, "learning_rate": 3.730354022565257e-05, "loss": 1.4117, "num_input_tokens_seen": 288933160, "step": 7350 }, { "epoch": 0.35663882444865747, "grad_norm": 0.38731688261032104, "learning_rate": 3.7269369068946816e-05, "loss": 1.3829, "num_input_tokens_seen": 289312352, "step": 7360 }, { "epoch": 0.35712338806883226, "grad_norm": 0.39851441979408264, "learning_rate": 3.7235167690191856e-05, "loss": 1.354, "num_input_tokens_seen": 289728744, "step": 7370 }, { "epoch": 0.35760795168900705, "grad_norm": 0.35968631505966187, "learning_rate": 3.7200936173632915e-05, "loss": 1.4089, "num_input_tokens_seen": 290118632, "step": 7380 }, { "epoch": 0.3580925153091819, "grad_norm": 0.41429969668388367, "learning_rate": 3.716667460358945e-05, "loss": 1.3695, "num_input_tokens_seen": 290507048, "step": 7390 }, { "epoch": 0.3585770789293567, "grad_norm": 0.37059974670410156, "learning_rate": 3.7132383064454956e-05, "loss": 1.3864, "num_input_tokens_seen": 290871436, "step": 7400 }, { "epoch": 0.3590616425495315, "grad_norm": 0.3918827474117279, "learning_rate": 3.7098061640696734e-05, "loss": 1.3879, "num_input_tokens_seen": 291230652, "step": 7410 }, { "epoch": 0.3595462061697063, "grad_norm": 0.36059802770614624, "learning_rate": 3.706371041685571e-05, "loss": 1.3579, "num_input_tokens_seen": 291641780, "step": 7420 }, { "epoch": 0.3600307697898811, "grad_norm": 0.4405132234096527, "learning_rate": 3.70293294775462e-05, "loss": 1.3851, "num_input_tokens_seen": 292025908, "step": 7430 }, { "epoch": 0.3605153334100559, "grad_norm": 0.4112297594547272, "learning_rate": 3.6994918907455734e-05, "loss": 1.3806, "num_input_tokens_seen": 292393396, "step": 7440 }, { "epoch": 0.3609998970302307, "grad_norm": 0.39005085825920105, "learning_rate": 3.696047879134481e-05, "loss": 1.3836, "num_input_tokens_seen": 292779588, "step": 7450 }, { "epoch": 0.3614844606504055, "grad_norm": 0.4453318417072296, "learning_rate": 3.692600921404672e-05, "loss": 1.37, "num_input_tokens_seen": 293187468, "step": 7460 }, { "epoch": 0.3619690242705803, "grad_norm": 0.4039851725101471, "learning_rate": 3.689151026046732e-05, "loss": 1.3835, "num_input_tokens_seen": 293571348, "step": 7470 }, { "epoch": 0.36245358789075516, "grad_norm": 0.4166390001773834, "learning_rate": 3.685698201558482e-05, "loss": 1.3745, "num_input_tokens_seen": 293971468, "step": 7480 }, { "epoch": 0.36293815151092995, "grad_norm": 0.41377830505371094, "learning_rate": 3.6822424564449584e-05, "loss": 1.3585, "num_input_tokens_seen": 294316832, "step": 7490 }, { "epoch": 0.36342271513110475, "grad_norm": 0.3810221254825592, "learning_rate": 3.6787837992183916e-05, "loss": 1.3045, "num_input_tokens_seen": 294725576, "step": 7500 }, { "epoch": 0.36390727875127954, "grad_norm": 0.3837166428565979, "learning_rate": 3.675322238398186e-05, "loss": 1.3864, "num_input_tokens_seen": 295122992, "step": 7510 }, { "epoch": 0.36439184237145433, "grad_norm": 0.4303184151649475, "learning_rate": 3.671857782510897e-05, "loss": 1.4116, "num_input_tokens_seen": 295537492, "step": 7520 }, { "epoch": 0.3648764059916292, "grad_norm": 0.4150845408439636, "learning_rate": 3.668390440090212e-05, "loss": 1.3549, "num_input_tokens_seen": 295910304, "step": 7530 }, { "epoch": 0.365360969611804, "grad_norm": 0.39363113045692444, "learning_rate": 3.6649202196769284e-05, "loss": 1.3322, "num_input_tokens_seen": 296280276, "step": 7540 }, { "epoch": 0.36584553323197877, "grad_norm": 0.395766943693161, "learning_rate": 3.6614471298189323e-05, "loss": 1.3772, "num_input_tokens_seen": 296664476, "step": 7550 }, { "epoch": 0.36633009685215356, "grad_norm": 0.3787136971950531, "learning_rate": 3.6579711790711777e-05, "loss": 1.3246, "num_input_tokens_seen": 297062148, "step": 7560 }, { "epoch": 0.3668146604723284, "grad_norm": 0.44149187207221985, "learning_rate": 3.654492375995666e-05, "loss": 1.3614, "num_input_tokens_seen": 297468140, "step": 7570 }, { "epoch": 0.3672992240925032, "grad_norm": 0.4064597189426422, "learning_rate": 3.6510107291614254e-05, "loss": 1.3648, "num_input_tokens_seen": 297882592, "step": 7580 }, { "epoch": 0.367783787712678, "grad_norm": 0.425382137298584, "learning_rate": 3.647526247144486e-05, "loss": 1.3475, "num_input_tokens_seen": 298248144, "step": 7590 }, { "epoch": 0.3682683513328528, "grad_norm": 0.4232633411884308, "learning_rate": 3.644038938527866e-05, "loss": 1.3726, "num_input_tokens_seen": 298645072, "step": 7600 }, { "epoch": 0.3687529149530276, "grad_norm": 0.4248599708080292, "learning_rate": 3.640548811901541e-05, "loss": 1.3719, "num_input_tokens_seen": 299051544, "step": 7610 }, { "epoch": 0.36923747857320244, "grad_norm": 0.4088067412376404, "learning_rate": 3.637055875862433e-05, "loss": 1.3693, "num_input_tokens_seen": 299444432, "step": 7620 }, { "epoch": 0.36972204219337723, "grad_norm": 0.4372110366821289, "learning_rate": 3.6335601390143797e-05, "loss": 1.3766, "num_input_tokens_seen": 299838044, "step": 7630 }, { "epoch": 0.370206605813552, "grad_norm": 0.391181081533432, "learning_rate": 3.630061609968121e-05, "loss": 1.3456, "num_input_tokens_seen": 300211940, "step": 7640 }, { "epoch": 0.3706911694337268, "grad_norm": 0.4158838391304016, "learning_rate": 3.6265602973412736e-05, "loss": 1.3756, "num_input_tokens_seen": 300598508, "step": 7650 }, { "epoch": 0.37117573305390167, "grad_norm": 0.39379456639289856, "learning_rate": 3.623056209758309e-05, "loss": 1.413, "num_input_tokens_seen": 301012928, "step": 7660 }, { "epoch": 0.37166029667407646, "grad_norm": 0.40179499983787537, "learning_rate": 3.619549355850536e-05, "loss": 1.367, "num_input_tokens_seen": 301381712, "step": 7670 }, { "epoch": 0.37214486029425126, "grad_norm": 0.3952077627182007, "learning_rate": 3.616039744256078e-05, "loss": 1.3613, "num_input_tokens_seen": 301797060, "step": 7680 }, { "epoch": 0.37262942391442605, "grad_norm": 0.41166162490844727, "learning_rate": 3.61252738361985e-05, "loss": 1.3913, "num_input_tokens_seen": 302205268, "step": 7690 }, { "epoch": 0.37311398753460084, "grad_norm": 0.38384559750556946, "learning_rate": 3.609012282593538e-05, "loss": 1.3422, "num_input_tokens_seen": 302592424, "step": 7700 }, { "epoch": 0.3735985511547757, "grad_norm": 0.4590049982070923, "learning_rate": 3.605494449835578e-05, "loss": 1.3665, "num_input_tokens_seen": 302963964, "step": 7710 }, { "epoch": 0.3740831147749505, "grad_norm": 0.43561649322509766, "learning_rate": 3.601973894011137e-05, "loss": 1.3385, "num_input_tokens_seen": 303350216, "step": 7720 }, { "epoch": 0.3745676783951253, "grad_norm": 0.42796000838279724, "learning_rate": 3.598450623792088e-05, "loss": 1.362, "num_input_tokens_seen": 303774160, "step": 7730 }, { "epoch": 0.3750522420153001, "grad_norm": 0.37895604968070984, "learning_rate": 3.5949246478569885e-05, "loss": 1.3766, "num_input_tokens_seen": 304184292, "step": 7740 }, { "epoch": 0.3755368056354749, "grad_norm": 0.3759874105453491, "learning_rate": 3.591395974891065e-05, "loss": 1.3544, "num_input_tokens_seen": 304615976, "step": 7750 }, { "epoch": 0.3760213692556497, "grad_norm": 0.4173840880393982, "learning_rate": 3.5878646135861826e-05, "loss": 1.3919, "num_input_tokens_seen": 305014124, "step": 7760 }, { "epoch": 0.3765059328758245, "grad_norm": 0.45354533195495605, "learning_rate": 3.5843305726408323e-05, "loss": 1.3588, "num_input_tokens_seen": 305416284, "step": 7770 }, { "epoch": 0.3769904964959993, "grad_norm": 0.3822251260280609, "learning_rate": 3.580793860760103e-05, "loss": 1.3708, "num_input_tokens_seen": 305811776, "step": 7780 }, { "epoch": 0.37747506011617415, "grad_norm": 0.3847945034503937, "learning_rate": 3.5772544866556634e-05, "loss": 1.3743, "num_input_tokens_seen": 306210288, "step": 7790 }, { "epoch": 0.37795962373634895, "grad_norm": 0.3933156132698059, "learning_rate": 3.5737124590457404e-05, "loss": 1.3817, "num_input_tokens_seen": 306612992, "step": 7800 }, { "epoch": 0.37844418735652374, "grad_norm": 0.42493316531181335, "learning_rate": 3.570167786655096e-05, "loss": 1.3504, "num_input_tokens_seen": 306981668, "step": 7810 }, { "epoch": 0.37892875097669854, "grad_norm": 0.4432273507118225, "learning_rate": 3.566620478215008e-05, "loss": 1.3901, "num_input_tokens_seen": 307387500, "step": 7820 }, { "epoch": 0.37941331459687333, "grad_norm": 0.4004375636577606, "learning_rate": 3.5630705424632475e-05, "loss": 1.4102, "num_input_tokens_seen": 307810852, "step": 7830 }, { "epoch": 0.3798978782170482, "grad_norm": 0.38001713156700134, "learning_rate": 3.5595179881440554e-05, "loss": 1.4202, "num_input_tokens_seen": 308189112, "step": 7840 }, { "epoch": 0.380382441837223, "grad_norm": 0.40970560908317566, "learning_rate": 3.5559628240081244e-05, "loss": 1.3796, "num_input_tokens_seen": 308593308, "step": 7850 }, { "epoch": 0.38086700545739777, "grad_norm": 0.4416448473930359, "learning_rate": 3.5524050588125744e-05, "loss": 1.3305, "num_input_tokens_seen": 308977396, "step": 7860 }, { "epoch": 0.38135156907757256, "grad_norm": 0.3899739682674408, "learning_rate": 3.548844701320934e-05, "loss": 1.3648, "num_input_tokens_seen": 309358540, "step": 7870 }, { "epoch": 0.3818361326977474, "grad_norm": 0.4026392698287964, "learning_rate": 3.545281760303116e-05, "loss": 1.3735, "num_input_tokens_seen": 309781936, "step": 7880 }, { "epoch": 0.3823206963179222, "grad_norm": 0.41612690687179565, "learning_rate": 3.5417162445353965e-05, "loss": 1.346, "num_input_tokens_seen": 310174720, "step": 7890 }, { "epoch": 0.382805259938097, "grad_norm": 0.3857744336128235, "learning_rate": 3.5381481628003964e-05, "loss": 1.386, "num_input_tokens_seen": 310557832, "step": 7900 }, { "epoch": 0.3832898235582718, "grad_norm": 0.4121117889881134, "learning_rate": 3.534577523887053e-05, "loss": 1.3303, "num_input_tokens_seen": 310940032, "step": 7910 }, { "epoch": 0.3837743871784466, "grad_norm": 0.45025011897087097, "learning_rate": 3.5310043365906046e-05, "loss": 1.3234, "num_input_tokens_seen": 311335136, "step": 7920 }, { "epoch": 0.38425895079862143, "grad_norm": 0.3884856700897217, "learning_rate": 3.527428609712569e-05, "loss": 1.3849, "num_input_tokens_seen": 311699972, "step": 7930 }, { "epoch": 0.38474351441879623, "grad_norm": 0.37907713651657104, "learning_rate": 3.5238503520607144e-05, "loss": 1.3602, "num_input_tokens_seen": 312078664, "step": 7940 }, { "epoch": 0.385228078038971, "grad_norm": 0.37585073709487915, "learning_rate": 3.520269572449047e-05, "loss": 1.3895, "num_input_tokens_seen": 312453588, "step": 7950 }, { "epoch": 0.3857126416591458, "grad_norm": 0.395000696182251, "learning_rate": 3.516686279697784e-05, "loss": 1.3542, "num_input_tokens_seen": 312865692, "step": 7960 }, { "epoch": 0.38619720527932067, "grad_norm": 0.40303224325180054, "learning_rate": 3.513100482633332e-05, "loss": 1.3497, "num_input_tokens_seen": 313258996, "step": 7970 }, { "epoch": 0.38668176889949546, "grad_norm": 0.4608016312122345, "learning_rate": 3.509512190088269e-05, "loss": 1.3596, "num_input_tokens_seen": 313656952, "step": 7980 }, { "epoch": 0.38716633251967025, "grad_norm": 0.393044114112854, "learning_rate": 3.505921410901316e-05, "loss": 1.3828, "num_input_tokens_seen": 314048964, "step": 7990 }, { "epoch": 0.38765089613984505, "grad_norm": 0.41909295320510864, "learning_rate": 3.50232815391732e-05, "loss": 1.3764, "num_input_tokens_seen": 314442716, "step": 8000 }, { "epoch": 0.38765089613984505, "eval_loss": 1.471493124961853, "eval_runtime": 4.7345, "eval_samples_per_second": 31.683, "eval_steps_per_second": 4.013, "num_input_tokens_seen": 314442716, "step": 8000 }, { "epoch": 0.38813545976001984, "grad_norm": 0.4128382205963135, "learning_rate": 3.498732427987236e-05, "loss": 1.3531, "num_input_tokens_seen": 314818980, "step": 8010 }, { "epoch": 0.3886200233801947, "grad_norm": 0.4025222659111023, "learning_rate": 3.4951342419680946e-05, "loss": 1.3588, "num_input_tokens_seen": 315239696, "step": 8020 }, { "epoch": 0.3891045870003695, "grad_norm": 0.4064643383026123, "learning_rate": 3.491533604722987e-05, "loss": 1.3689, "num_input_tokens_seen": 315645796, "step": 8030 }, { "epoch": 0.3895891506205443, "grad_norm": 0.36004194617271423, "learning_rate": 3.4879305251210474e-05, "loss": 1.3705, "num_input_tokens_seen": 316049168, "step": 8040 }, { "epoch": 0.39007371424071907, "grad_norm": 0.36490750312805176, "learning_rate": 3.4843250120374206e-05, "loss": 1.3596, "num_input_tokens_seen": 316455320, "step": 8050 }, { "epoch": 0.3905582778608939, "grad_norm": 0.3944610357284546, "learning_rate": 3.4807170743532466e-05, "loss": 1.3421, "num_input_tokens_seen": 316841708, "step": 8060 }, { "epoch": 0.3910428414810687, "grad_norm": 0.3791704475879669, "learning_rate": 3.4771067209556405e-05, "loss": 1.3642, "num_input_tokens_seen": 317231160, "step": 8070 }, { "epoch": 0.3915274051012435, "grad_norm": 0.39615359902381897, "learning_rate": 3.4734939607376635e-05, "loss": 1.3042, "num_input_tokens_seen": 317616996, "step": 8080 }, { "epoch": 0.3920119687214183, "grad_norm": 0.3907454013824463, "learning_rate": 3.469878802598308e-05, "loss": 1.3768, "num_input_tokens_seen": 318013328, "step": 8090 }, { "epoch": 0.39249653234159315, "grad_norm": 0.46760374307632446, "learning_rate": 3.466261255442473e-05, "loss": 1.3638, "num_input_tokens_seen": 318396864, "step": 8100 }, { "epoch": 0.39298109596176795, "grad_norm": 0.38027724623680115, "learning_rate": 3.4626413281809434e-05, "loss": 1.3292, "num_input_tokens_seen": 318748480, "step": 8110 }, { "epoch": 0.39346565958194274, "grad_norm": 0.38730815052986145, "learning_rate": 3.4590190297303623e-05, "loss": 1.3718, "num_input_tokens_seen": 319146300, "step": 8120 }, { "epoch": 0.39395022320211753, "grad_norm": 0.39373862743377686, "learning_rate": 3.455394369013218e-05, "loss": 1.384, "num_input_tokens_seen": 319537280, "step": 8130 }, { "epoch": 0.3944347868222923, "grad_norm": 0.40906721353530884, "learning_rate": 3.4517673549578154e-05, "loss": 1.3672, "num_input_tokens_seen": 319904616, "step": 8140 }, { "epoch": 0.3949193504424672, "grad_norm": 0.4988723397254944, "learning_rate": 3.448137996498258e-05, "loss": 1.3505, "num_input_tokens_seen": 320278336, "step": 8150 }, { "epoch": 0.39540391406264197, "grad_norm": 0.4247932434082031, "learning_rate": 3.44450630257442e-05, "loss": 1.3632, "num_input_tokens_seen": 320673012, "step": 8160 }, { "epoch": 0.39588847768281676, "grad_norm": 0.37421149015426636, "learning_rate": 3.440872282131934e-05, "loss": 1.3142, "num_input_tokens_seen": 321069804, "step": 8170 }, { "epoch": 0.39637304130299156, "grad_norm": 0.41044512391090393, "learning_rate": 3.4372359441221594e-05, "loss": 1.3794, "num_input_tokens_seen": 321501056, "step": 8180 }, { "epoch": 0.3968576049231664, "grad_norm": 0.3953944742679596, "learning_rate": 3.4335972975021646e-05, "loss": 1.3451, "num_input_tokens_seen": 321913760, "step": 8190 }, { "epoch": 0.3973421685433412, "grad_norm": 0.44824719429016113, "learning_rate": 3.429956351234705e-05, "loss": 1.4203, "num_input_tokens_seen": 322313132, "step": 8200 }, { "epoch": 0.397826732163516, "grad_norm": 0.4102291166782379, "learning_rate": 3.426313114288203e-05, "loss": 1.3424, "num_input_tokens_seen": 322675076, "step": 8210 }, { "epoch": 0.3983112957836908, "grad_norm": 0.375482976436615, "learning_rate": 3.4226675956367195e-05, "loss": 1.3142, "num_input_tokens_seen": 323090684, "step": 8220 }, { "epoch": 0.3987958594038656, "grad_norm": 0.42846444249153137, "learning_rate": 3.419019804259937e-05, "loss": 1.3529, "num_input_tokens_seen": 323463428, "step": 8230 }, { "epoch": 0.39928042302404043, "grad_norm": 0.42404618859291077, "learning_rate": 3.4153697491431375e-05, "loss": 1.3657, "num_input_tokens_seen": 323846056, "step": 8240 }, { "epoch": 0.3997649866442152, "grad_norm": 0.40639814734458923, "learning_rate": 3.411717439277178e-05, "loss": 1.3446, "num_input_tokens_seen": 324257868, "step": 8250 }, { "epoch": 0.40024955026439, "grad_norm": 0.41451844573020935, "learning_rate": 3.40806288365847e-05, "loss": 1.379, "num_input_tokens_seen": 324644548, "step": 8260 }, { "epoch": 0.4007341138845648, "grad_norm": 0.42741525173187256, "learning_rate": 3.404406091288956e-05, "loss": 1.383, "num_input_tokens_seen": 325051824, "step": 8270 }, { "epoch": 0.40121867750473966, "grad_norm": 0.35797035694122314, "learning_rate": 3.4007470711760885e-05, "loss": 1.3184, "num_input_tokens_seen": 325435196, "step": 8280 }, { "epoch": 0.40170324112491446, "grad_norm": 0.44313570857048035, "learning_rate": 3.397085832332808e-05, "loss": 1.3761, "num_input_tokens_seen": 325815308, "step": 8290 }, { "epoch": 0.40218780474508925, "grad_norm": 0.4352870285511017, "learning_rate": 3.393422383777518e-05, "loss": 1.3331, "num_input_tokens_seen": 326215248, "step": 8300 }, { "epoch": 0.40267236836526404, "grad_norm": 0.427741140127182, "learning_rate": 3.389756734534069e-05, "loss": 1.3546, "num_input_tokens_seen": 326587588, "step": 8310 }, { "epoch": 0.40315693198543884, "grad_norm": 0.39280158281326294, "learning_rate": 3.386088893631727e-05, "loss": 1.3251, "num_input_tokens_seen": 326969008, "step": 8320 }, { "epoch": 0.4036414956056137, "grad_norm": 0.37476789951324463, "learning_rate": 3.382418870105161e-05, "loss": 1.3944, "num_input_tokens_seen": 327337052, "step": 8330 }, { "epoch": 0.4041260592257885, "grad_norm": 0.3965238332748413, "learning_rate": 3.3787466729944156e-05, "loss": 1.3314, "num_input_tokens_seen": 327726100, "step": 8340 }, { "epoch": 0.4046106228459633, "grad_norm": 0.3928758203983307, "learning_rate": 3.375072311344887e-05, "loss": 1.3753, "num_input_tokens_seen": 328097960, "step": 8350 }, { "epoch": 0.40509518646613807, "grad_norm": 0.4109261631965637, "learning_rate": 3.371395794207304e-05, "loss": 1.3061, "num_input_tokens_seen": 328495520, "step": 8360 }, { "epoch": 0.4055797500863129, "grad_norm": 0.3748038113117218, "learning_rate": 3.3677171306377066e-05, "loss": 1.3259, "num_input_tokens_seen": 328901644, "step": 8370 }, { "epoch": 0.4060643137064877, "grad_norm": 0.40184059739112854, "learning_rate": 3.36403632969742e-05, "loss": 1.372, "num_input_tokens_seen": 329298316, "step": 8380 }, { "epoch": 0.4065488773266625, "grad_norm": 0.42977675795555115, "learning_rate": 3.360353400453035e-05, "loss": 1.3228, "num_input_tokens_seen": 329697248, "step": 8390 }, { "epoch": 0.4070334409468373, "grad_norm": 0.42152881622314453, "learning_rate": 3.356668351976385e-05, "loss": 1.3153, "num_input_tokens_seen": 330092604, "step": 8400 }, { "epoch": 0.4075180045670121, "grad_norm": 0.42566007375717163, "learning_rate": 3.352981193344523e-05, "loss": 1.3555, "num_input_tokens_seen": 330476288, "step": 8410 }, { "epoch": 0.40800256818718694, "grad_norm": 0.4720443785190582, "learning_rate": 3.349291933639701e-05, "loss": 1.4095, "num_input_tokens_seen": 330852500, "step": 8420 }, { "epoch": 0.40848713180736174, "grad_norm": 0.4077743887901306, "learning_rate": 3.345600581949344e-05, "loss": 1.3875, "num_input_tokens_seen": 331243588, "step": 8430 }, { "epoch": 0.40897169542753653, "grad_norm": 0.37246859073638916, "learning_rate": 3.3419071473660316e-05, "loss": 1.34, "num_input_tokens_seen": 331625036, "step": 8440 }, { "epoch": 0.4094562590477113, "grad_norm": 0.40044400095939636, "learning_rate": 3.338211638987475e-05, "loss": 1.3883, "num_input_tokens_seen": 332007224, "step": 8450 }, { "epoch": 0.4099408226678862, "grad_norm": 0.40075942873954773, "learning_rate": 3.33451406591649e-05, "loss": 1.3145, "num_input_tokens_seen": 332377820, "step": 8460 }, { "epoch": 0.41042538628806097, "grad_norm": 0.41642826795578003, "learning_rate": 3.330814437260983e-05, "loss": 1.3723, "num_input_tokens_seen": 332768892, "step": 8470 }, { "epoch": 0.41090994990823576, "grad_norm": 0.45073971152305603, "learning_rate": 3.32711276213392e-05, "loss": 1.3439, "num_input_tokens_seen": 333169136, "step": 8480 }, { "epoch": 0.41139451352841055, "grad_norm": 0.4418103098869324, "learning_rate": 3.32340904965331e-05, "loss": 1.3355, "num_input_tokens_seen": 333560236, "step": 8490 }, { "epoch": 0.4118790771485854, "grad_norm": 0.38057634234428406, "learning_rate": 3.3197033089421794e-05, "loss": 1.3748, "num_input_tokens_seen": 333908344, "step": 8500 }, { "epoch": 0.4123636407687602, "grad_norm": 0.38415491580963135, "learning_rate": 3.31599554912855e-05, "loss": 1.3595, "num_input_tokens_seen": 334331680, "step": 8510 }, { "epoch": 0.412848204388935, "grad_norm": 0.4075462818145752, "learning_rate": 3.3122857793454186e-05, "loss": 1.349, "num_input_tokens_seen": 334721924, "step": 8520 }, { "epoch": 0.4133327680091098, "grad_norm": 0.4361826479434967, "learning_rate": 3.308574008730732e-05, "loss": 1.3445, "num_input_tokens_seen": 335116152, "step": 8530 }, { "epoch": 0.4138173316292846, "grad_norm": 0.4111301898956299, "learning_rate": 3.304860246427366e-05, "loss": 1.388, "num_input_tokens_seen": 335498760, "step": 8540 }, { "epoch": 0.41430189524945943, "grad_norm": 0.4673983156681061, "learning_rate": 3.301144501583102e-05, "loss": 1.3624, "num_input_tokens_seen": 335894792, "step": 8550 }, { "epoch": 0.4147864588696342, "grad_norm": 0.4051591157913208, "learning_rate": 3.297426783350606e-05, "loss": 1.2684, "num_input_tokens_seen": 336281536, "step": 8560 }, { "epoch": 0.415271022489809, "grad_norm": 0.4352741241455078, "learning_rate": 3.293707100887401e-05, "loss": 1.336, "num_input_tokens_seen": 336691052, "step": 8570 }, { "epoch": 0.4157555861099838, "grad_norm": 0.43101462721824646, "learning_rate": 3.2899854633558534e-05, "loss": 1.3345, "num_input_tokens_seen": 337098468, "step": 8580 }, { "epoch": 0.41624014973015866, "grad_norm": 0.43431204557418823, "learning_rate": 3.2862618799231424e-05, "loss": 1.3372, "num_input_tokens_seen": 337498280, "step": 8590 }, { "epoch": 0.41672471335033345, "grad_norm": 0.3821309804916382, "learning_rate": 3.2825363597612405e-05, "loss": 1.3787, "num_input_tokens_seen": 337889884, "step": 8600 }, { "epoch": 0.41720927697050825, "grad_norm": 0.39203059673309326, "learning_rate": 3.2788089120468924e-05, "loss": 1.3589, "num_input_tokens_seen": 338259116, "step": 8610 }, { "epoch": 0.41769384059068304, "grad_norm": 0.39438045024871826, "learning_rate": 3.275079545961588e-05, "loss": 1.2828, "num_input_tokens_seen": 338648980, "step": 8620 }, { "epoch": 0.41817840421085783, "grad_norm": 0.4104815423488617, "learning_rate": 3.271348270691546e-05, "loss": 1.3274, "num_input_tokens_seen": 339078392, "step": 8630 }, { "epoch": 0.4186629678310327, "grad_norm": 0.39212706685066223, "learning_rate": 3.2676150954276846e-05, "loss": 1.3259, "num_input_tokens_seen": 339453980, "step": 8640 }, { "epoch": 0.4191475314512075, "grad_norm": 0.3868629038333893, "learning_rate": 3.263880029365604e-05, "loss": 1.3381, "num_input_tokens_seen": 339852160, "step": 8650 }, { "epoch": 0.41963209507138227, "grad_norm": 0.3927762806415558, "learning_rate": 3.260143081705561e-05, "loss": 1.3366, "num_input_tokens_seen": 340245664, "step": 8660 }, { "epoch": 0.42011665869155707, "grad_norm": 0.4036239683628082, "learning_rate": 3.256404261652449e-05, "loss": 1.3157, "num_input_tokens_seen": 340637260, "step": 8670 }, { "epoch": 0.4206012223117319, "grad_norm": 0.41092178225517273, "learning_rate": 3.2526635784157695e-05, "loss": 1.3727, "num_input_tokens_seen": 341032376, "step": 8680 }, { "epoch": 0.4210857859319067, "grad_norm": 0.40871837735176086, "learning_rate": 3.248921041209618e-05, "loss": 1.3247, "num_input_tokens_seen": 341418744, "step": 8690 }, { "epoch": 0.4215703495520815, "grad_norm": 0.41571468114852905, "learning_rate": 3.245176659252654e-05, "loss": 1.3083, "num_input_tokens_seen": 341804060, "step": 8700 }, { "epoch": 0.4220549131722563, "grad_norm": 0.38524648547172546, "learning_rate": 3.241430441768081e-05, "loss": 1.2899, "num_input_tokens_seen": 342183932, "step": 8710 }, { "epoch": 0.4225394767924311, "grad_norm": 0.3792508542537689, "learning_rate": 3.2376823979836256e-05, "loss": 1.3544, "num_input_tokens_seen": 342580412, "step": 8720 }, { "epoch": 0.42302404041260594, "grad_norm": 0.3552215099334717, "learning_rate": 3.233932537131511e-05, "loss": 1.3571, "num_input_tokens_seen": 342982212, "step": 8730 }, { "epoch": 0.42350860403278073, "grad_norm": 0.40093865990638733, "learning_rate": 3.230180868448437e-05, "loss": 1.3159, "num_input_tokens_seen": 343375004, "step": 8740 }, { "epoch": 0.4239931676529555, "grad_norm": 0.42487427592277527, "learning_rate": 3.2264274011755575e-05, "loss": 1.3574, "num_input_tokens_seen": 343760048, "step": 8750 }, { "epoch": 0.4244777312731303, "grad_norm": 0.370417058467865, "learning_rate": 3.222672144558455e-05, "loss": 1.3853, "num_input_tokens_seen": 344142832, "step": 8760 }, { "epoch": 0.42496229489330517, "grad_norm": 0.42524397373199463, "learning_rate": 3.21891510784712e-05, "loss": 1.3497, "num_input_tokens_seen": 344534952, "step": 8770 }, { "epoch": 0.42544685851347996, "grad_norm": 0.36123722791671753, "learning_rate": 3.215156300295928e-05, "loss": 1.3211, "num_input_tokens_seen": 344914096, "step": 8780 }, { "epoch": 0.42593142213365476, "grad_norm": 0.38714513182640076, "learning_rate": 3.2113957311636154e-05, "loss": 1.3753, "num_input_tokens_seen": 345315972, "step": 8790 }, { "epoch": 0.42641598575382955, "grad_norm": 0.38927286863327026, "learning_rate": 3.207633409713262e-05, "loss": 1.3256, "num_input_tokens_seen": 345704748, "step": 8800 }, { "epoch": 0.42690054937400435, "grad_norm": 0.3460540771484375, "learning_rate": 3.203869345212258e-05, "loss": 1.358, "num_input_tokens_seen": 346089592, "step": 8810 }, { "epoch": 0.4273851129941792, "grad_norm": 0.39570000767707825, "learning_rate": 3.20010354693229e-05, "loss": 1.3397, "num_input_tokens_seen": 346495852, "step": 8820 }, { "epoch": 0.427869676614354, "grad_norm": 0.5098708271980286, "learning_rate": 3.196336024149316e-05, "loss": 1.3545, "num_input_tokens_seen": 346912816, "step": 8830 }, { "epoch": 0.4283542402345288, "grad_norm": 0.40923282504081726, "learning_rate": 3.192566786143541e-05, "loss": 1.3697, "num_input_tokens_seen": 347337700, "step": 8840 }, { "epoch": 0.4288388038547036, "grad_norm": 0.40802377462387085, "learning_rate": 3.1887958421993944e-05, "loss": 1.3471, "num_input_tokens_seen": 347738832, "step": 8850 }, { "epoch": 0.4293233674748784, "grad_norm": 0.4220910668373108, "learning_rate": 3.185023201605508e-05, "loss": 1.3619, "num_input_tokens_seen": 348131200, "step": 8860 }, { "epoch": 0.4298079310950532, "grad_norm": 0.4093485474586487, "learning_rate": 3.181248873654693e-05, "loss": 1.3157, "num_input_tokens_seen": 348533868, "step": 8870 }, { "epoch": 0.430292494715228, "grad_norm": 0.4146769940853119, "learning_rate": 3.177472867643917e-05, "loss": 1.3552, "num_input_tokens_seen": 348915400, "step": 8880 }, { "epoch": 0.4307770583354028, "grad_norm": 0.4098985493183136, "learning_rate": 3.1736951928742804e-05, "loss": 1.2922, "num_input_tokens_seen": 349298476, "step": 8890 }, { "epoch": 0.43126162195557766, "grad_norm": 0.3871714174747467, "learning_rate": 3.169915858650996e-05, "loss": 1.3978, "num_input_tokens_seen": 349662284, "step": 8900 }, { "epoch": 0.43174618557575245, "grad_norm": 0.4378563463687897, "learning_rate": 3.166134874283361e-05, "loss": 1.3481, "num_input_tokens_seen": 350084716, "step": 8910 }, { "epoch": 0.43223074919592724, "grad_norm": 0.3912814259529114, "learning_rate": 3.16235224908474e-05, "loss": 1.3555, "num_input_tokens_seen": 350479224, "step": 8920 }, { "epoch": 0.43271531281610204, "grad_norm": 0.41960766911506653, "learning_rate": 3.158567992372538e-05, "loss": 1.3159, "num_input_tokens_seen": 350878796, "step": 8930 }, { "epoch": 0.43319987643627683, "grad_norm": 0.3939763009548187, "learning_rate": 3.154782113468179e-05, "loss": 1.3948, "num_input_tokens_seen": 351259368, "step": 8940 }, { "epoch": 0.4336844400564517, "grad_norm": 0.37786418199539185, "learning_rate": 3.1509946216970844e-05, "loss": 1.3577, "num_input_tokens_seen": 351642444, "step": 8950 }, { "epoch": 0.4341690036766265, "grad_norm": 0.41558992862701416, "learning_rate": 3.1472055263886443e-05, "loss": 1.336, "num_input_tokens_seen": 352081616, "step": 8960 }, { "epoch": 0.43465356729680127, "grad_norm": 0.407986044883728, "learning_rate": 3.143414836876204e-05, "loss": 1.3638, "num_input_tokens_seen": 352492396, "step": 8970 }, { "epoch": 0.43513813091697606, "grad_norm": 0.3700858950614929, "learning_rate": 3.13962256249703e-05, "loss": 1.3362, "num_input_tokens_seen": 352908204, "step": 8980 }, { "epoch": 0.4356226945371509, "grad_norm": 0.43616652488708496, "learning_rate": 3.1358287125922986e-05, "loss": 1.3545, "num_input_tokens_seen": 353322600, "step": 8990 }, { "epoch": 0.4361072581573257, "grad_norm": 0.40746721625328064, "learning_rate": 3.132033296507063e-05, "loss": 1.3484, "num_input_tokens_seen": 353715168, "step": 9000 }, { "epoch": 0.4365918217775005, "grad_norm": 0.3873213529586792, "learning_rate": 3.128236323590234e-05, "loss": 1.338, "num_input_tokens_seen": 354097720, "step": 9010 }, { "epoch": 0.4370763853976753, "grad_norm": 0.37148579955101013, "learning_rate": 3.1244378031945585e-05, "loss": 1.3348, "num_input_tokens_seen": 354523140, "step": 9020 }, { "epoch": 0.4375609490178501, "grad_norm": 0.4213094711303711, "learning_rate": 3.1206377446765966e-05, "loss": 1.304, "num_input_tokens_seen": 354914168, "step": 9030 }, { "epoch": 0.43804551263802494, "grad_norm": 0.4143025875091553, "learning_rate": 3.1168361573966945e-05, "loss": 1.3531, "num_input_tokens_seen": 355288384, "step": 9040 }, { "epoch": 0.43853007625819973, "grad_norm": 0.4011501669883728, "learning_rate": 3.113033050718966e-05, "loss": 1.3663, "num_input_tokens_seen": 355652940, "step": 9050 }, { "epoch": 0.4390146398783745, "grad_norm": 0.37911659479141235, "learning_rate": 3.109228434011265e-05, "loss": 1.3144, "num_input_tokens_seen": 356034868, "step": 9060 }, { "epoch": 0.4394992034985493, "grad_norm": 0.4045577049255371, "learning_rate": 3.105422316645169e-05, "loss": 1.3398, "num_input_tokens_seen": 356432868, "step": 9070 }, { "epoch": 0.43998376711872417, "grad_norm": 0.4345623254776001, "learning_rate": 3.101614707995948e-05, "loss": 1.3056, "num_input_tokens_seen": 356832276, "step": 9080 }, { "epoch": 0.44046833073889896, "grad_norm": 0.42983555793762207, "learning_rate": 3.097805617442546e-05, "loss": 1.3681, "num_input_tokens_seen": 357220024, "step": 9090 }, { "epoch": 0.44095289435907375, "grad_norm": 0.3999558389186859, "learning_rate": 3.09399505436756e-05, "loss": 1.3268, "num_input_tokens_seen": 357614528, "step": 9100 }, { "epoch": 0.44143745797924855, "grad_norm": 0.4042194187641144, "learning_rate": 3.090183028157211e-05, "loss": 1.3262, "num_input_tokens_seen": 357978768, "step": 9110 }, { "epoch": 0.44192202159942334, "grad_norm": 0.4250580370426178, "learning_rate": 3.086369548201326e-05, "loss": 1.3547, "num_input_tokens_seen": 358366236, "step": 9120 }, { "epoch": 0.4424065852195982, "grad_norm": 0.394369900226593, "learning_rate": 3.082554623893312e-05, "loss": 1.3799, "num_input_tokens_seen": 358751508, "step": 9130 }, { "epoch": 0.442891148839773, "grad_norm": 0.38379648327827454, "learning_rate": 3.0787382646301324e-05, "loss": 1.3512, "num_input_tokens_seen": 359143272, "step": 9140 }, { "epoch": 0.4433757124599478, "grad_norm": 0.3755399286746979, "learning_rate": 3.074920479812289e-05, "loss": 1.3386, "num_input_tokens_seen": 359526224, "step": 9150 }, { "epoch": 0.4438602760801226, "grad_norm": 0.4327252209186554, "learning_rate": 3.0711012788437916e-05, "loss": 1.3552, "num_input_tokens_seen": 359890840, "step": 9160 }, { "epoch": 0.4443448397002974, "grad_norm": 0.4325873851776123, "learning_rate": 3.067280671132139e-05, "loss": 1.2917, "num_input_tokens_seen": 360291064, "step": 9170 }, { "epoch": 0.4448294033204722, "grad_norm": 0.45133915543556213, "learning_rate": 3.063458666088296e-05, "loss": 1.3375, "num_input_tokens_seen": 360685988, "step": 9180 }, { "epoch": 0.445313966940647, "grad_norm": 0.42375510931015015, "learning_rate": 3.0596352731266684e-05, "loss": 1.3736, "num_input_tokens_seen": 361111956, "step": 9190 }, { "epoch": 0.4457985305608218, "grad_norm": 0.3929903209209442, "learning_rate": 3.055810501665082e-05, "loss": 1.3429, "num_input_tokens_seen": 361512340, "step": 9200 }, { "epoch": 0.44628309418099665, "grad_norm": 0.4164799749851227, "learning_rate": 3.051984361124756e-05, "loss": 1.3907, "num_input_tokens_seen": 361904800, "step": 9210 }, { "epoch": 0.44676765780117145, "grad_norm": 0.39341410994529724, "learning_rate": 3.0481568609302846e-05, "loss": 1.3494, "num_input_tokens_seen": 362296512, "step": 9220 }, { "epoch": 0.44725222142134624, "grad_norm": 0.37080663442611694, "learning_rate": 3.0443280105096096e-05, "loss": 1.3476, "num_input_tokens_seen": 362693456, "step": 9230 }, { "epoch": 0.44773678504152103, "grad_norm": 0.436410129070282, "learning_rate": 3.0404978192939974e-05, "loss": 1.3445, "num_input_tokens_seen": 363121352, "step": 9240 }, { "epoch": 0.44822134866169583, "grad_norm": 0.39854753017425537, "learning_rate": 3.0366662967180198e-05, "loss": 1.3764, "num_input_tokens_seen": 363511128, "step": 9250 }, { "epoch": 0.4487059122818707, "grad_norm": 0.4296603202819824, "learning_rate": 3.0328334522195262e-05, "loss": 1.3258, "num_input_tokens_seen": 363885672, "step": 9260 }, { "epoch": 0.44919047590204547, "grad_norm": 0.42518150806427, "learning_rate": 3.0289992952396234e-05, "loss": 1.2693, "num_input_tokens_seen": 364269780, "step": 9270 }, { "epoch": 0.44967503952222027, "grad_norm": 0.4305862784385681, "learning_rate": 3.0251638352226495e-05, "loss": 1.3471, "num_input_tokens_seen": 364672480, "step": 9280 }, { "epoch": 0.45015960314239506, "grad_norm": 0.3815818727016449, "learning_rate": 3.0213270816161536e-05, "loss": 1.3163, "num_input_tokens_seen": 365041504, "step": 9290 }, { "epoch": 0.4506441667625699, "grad_norm": 0.40457382798194885, "learning_rate": 3.0174890438708715e-05, "loss": 1.3232, "num_input_tokens_seen": 365438776, "step": 9300 }, { "epoch": 0.4511287303827447, "grad_norm": 0.43254563212394714, "learning_rate": 3.0136497314406992e-05, "loss": 1.3224, "num_input_tokens_seen": 365820180, "step": 9310 }, { "epoch": 0.4516132940029195, "grad_norm": 0.39054685831069946, "learning_rate": 3.0098091537826766e-05, "loss": 1.3043, "num_input_tokens_seen": 366203692, "step": 9320 }, { "epoch": 0.4520978576230943, "grad_norm": 0.3737923502922058, "learning_rate": 3.0059673203569572e-05, "loss": 1.3191, "num_input_tokens_seen": 366605944, "step": 9330 }, { "epoch": 0.4525824212432691, "grad_norm": 0.47919556498527527, "learning_rate": 3.0021242406267892e-05, "loss": 1.3303, "num_input_tokens_seen": 366992400, "step": 9340 }, { "epoch": 0.45306698486344393, "grad_norm": 0.44770705699920654, "learning_rate": 2.9982799240584907e-05, "loss": 1.3101, "num_input_tokens_seen": 367423000, "step": 9350 }, { "epoch": 0.4535515484836187, "grad_norm": 0.3613060712814331, "learning_rate": 2.9944343801214253e-05, "loss": 1.3585, "num_input_tokens_seen": 367812296, "step": 9360 }, { "epoch": 0.4540361121037935, "grad_norm": 0.40847277641296387, "learning_rate": 2.9905876182879806e-05, "loss": 1.2867, "num_input_tokens_seen": 368210368, "step": 9370 }, { "epoch": 0.4545206757239683, "grad_norm": 0.4815332889556885, "learning_rate": 2.986739648033544e-05, "loss": 1.4084, "num_input_tokens_seen": 368581832, "step": 9380 }, { "epoch": 0.45500523934414316, "grad_norm": 0.4206528663635254, "learning_rate": 2.9828904788364785e-05, "loss": 1.3391, "num_input_tokens_seen": 368993896, "step": 9390 }, { "epoch": 0.45548980296431796, "grad_norm": 0.43159064650535583, "learning_rate": 2.9790401201781037e-05, "loss": 1.3808, "num_input_tokens_seen": 369375788, "step": 9400 }, { "epoch": 0.45597436658449275, "grad_norm": 0.43968769907951355, "learning_rate": 2.975188581542665e-05, "loss": 1.3115, "num_input_tokens_seen": 369796992, "step": 9410 }, { "epoch": 0.45645893020466755, "grad_norm": 0.40895915031433105, "learning_rate": 2.9713358724173167e-05, "loss": 1.3758, "num_input_tokens_seen": 370171640, "step": 9420 }, { "epoch": 0.45694349382484234, "grad_norm": 0.3732263147830963, "learning_rate": 2.9674820022920953e-05, "loss": 1.3431, "num_input_tokens_seen": 370532156, "step": 9430 }, { "epoch": 0.4574280574450172, "grad_norm": 0.38699179887771606, "learning_rate": 2.963626980659898e-05, "loss": 1.295, "num_input_tokens_seen": 370950576, "step": 9440 }, { "epoch": 0.457912621065192, "grad_norm": 0.4024085998535156, "learning_rate": 2.9597708170164567e-05, "loss": 1.3202, "num_input_tokens_seen": 371331076, "step": 9450 }, { "epoch": 0.4583971846853668, "grad_norm": 0.3956678509712219, "learning_rate": 2.955913520860319e-05, "loss": 1.3289, "num_input_tokens_seen": 371714060, "step": 9460 }, { "epoch": 0.45888174830554157, "grad_norm": 0.41495439410209656, "learning_rate": 2.9520551016928193e-05, "loss": 1.3285, "num_input_tokens_seen": 372137132, "step": 9470 }, { "epoch": 0.4593663119257164, "grad_norm": 0.3949216604232788, "learning_rate": 2.9481955690180606e-05, "loss": 1.2749, "num_input_tokens_seen": 372529012, "step": 9480 }, { "epoch": 0.4598508755458912, "grad_norm": 0.3913583755493164, "learning_rate": 2.9443349323428876e-05, "loss": 1.3173, "num_input_tokens_seen": 372945044, "step": 9490 }, { "epoch": 0.460335439166066, "grad_norm": 0.4193466901779175, "learning_rate": 2.9404732011768632e-05, "loss": 1.2946, "num_input_tokens_seen": 373331840, "step": 9500 }, { "epoch": 0.4608200027862408, "grad_norm": 0.4153105914592743, "learning_rate": 2.936610385032249e-05, "loss": 1.3241, "num_input_tokens_seen": 373726396, "step": 9510 }, { "epoch": 0.4613045664064156, "grad_norm": 0.41479748487472534, "learning_rate": 2.932746493423976e-05, "loss": 1.3622, "num_input_tokens_seen": 374113504, "step": 9520 }, { "epoch": 0.46178913002659044, "grad_norm": 0.4534710645675659, "learning_rate": 2.9288815358696265e-05, "loss": 1.3537, "num_input_tokens_seen": 374495524, "step": 9530 }, { "epoch": 0.46227369364676524, "grad_norm": 0.36162427067756653, "learning_rate": 2.9250155218894083e-05, "loss": 1.2787, "num_input_tokens_seen": 374880396, "step": 9540 }, { "epoch": 0.46275825726694003, "grad_norm": 0.36983734369277954, "learning_rate": 2.9211484610061307e-05, "loss": 1.3314, "num_input_tokens_seen": 375287296, "step": 9550 }, { "epoch": 0.4632428208871148, "grad_norm": 0.4066306948661804, "learning_rate": 2.9172803627451817e-05, "loss": 1.3169, "num_input_tokens_seen": 375663144, "step": 9560 }, { "epoch": 0.4637273845072897, "grad_norm": 0.3863866329193115, "learning_rate": 2.9134112366345055e-05, "loss": 1.3676, "num_input_tokens_seen": 376030684, "step": 9570 }, { "epoch": 0.46421194812746447, "grad_norm": 0.4485596716403961, "learning_rate": 2.909541092204576e-05, "loss": 1.3345, "num_input_tokens_seen": 376376536, "step": 9580 }, { "epoch": 0.46469651174763926, "grad_norm": 0.4069863259792328, "learning_rate": 2.9056699389883783e-05, "loss": 1.2957, "num_input_tokens_seen": 376751920, "step": 9590 }, { "epoch": 0.46518107536781406, "grad_norm": 0.41476011276245117, "learning_rate": 2.9017977865213814e-05, "loss": 1.323, "num_input_tokens_seen": 377165456, "step": 9600 }, { "epoch": 0.4656656389879889, "grad_norm": 0.38815972208976746, "learning_rate": 2.8979246443415132e-05, "loss": 1.2949, "num_input_tokens_seen": 377554888, "step": 9610 }, { "epoch": 0.4661502026081637, "grad_norm": 0.39285364747047424, "learning_rate": 2.8940505219891432e-05, "loss": 1.2969, "num_input_tokens_seen": 377948944, "step": 9620 }, { "epoch": 0.4666347662283385, "grad_norm": 0.43776214122772217, "learning_rate": 2.890175429007054e-05, "loss": 1.3136, "num_input_tokens_seen": 378360736, "step": 9630 }, { "epoch": 0.4671193298485133, "grad_norm": 0.3744518458843231, "learning_rate": 2.8862993749404166e-05, "loss": 1.3181, "num_input_tokens_seen": 378760840, "step": 9640 }, { "epoch": 0.4676038934686881, "grad_norm": 0.3794598877429962, "learning_rate": 2.8824223693367724e-05, "loss": 1.3395, "num_input_tokens_seen": 379173080, "step": 9650 }, { "epoch": 0.46808845708886293, "grad_norm": 0.37090277671813965, "learning_rate": 2.8785444217460067e-05, "loss": 1.315, "num_input_tokens_seen": 379569804, "step": 9660 }, { "epoch": 0.4685730207090377, "grad_norm": 0.4118629992008209, "learning_rate": 2.8746655417203216e-05, "loss": 1.3049, "num_input_tokens_seen": 379963480, "step": 9670 }, { "epoch": 0.4690575843292125, "grad_norm": 0.4254318177700043, "learning_rate": 2.8707857388142212e-05, "loss": 1.331, "num_input_tokens_seen": 380393192, "step": 9680 }, { "epoch": 0.4695421479493873, "grad_norm": 0.4114190638065338, "learning_rate": 2.866905022584478e-05, "loss": 1.2986, "num_input_tokens_seen": 380779764, "step": 9690 }, { "epoch": 0.47002671156956216, "grad_norm": 0.4168798625469208, "learning_rate": 2.8630234025901175e-05, "loss": 1.2768, "num_input_tokens_seen": 381172796, "step": 9700 }, { "epoch": 0.47051127518973695, "grad_norm": 0.39690959453582764, "learning_rate": 2.8591408883923892e-05, "loss": 1.2874, "num_input_tokens_seen": 381548028, "step": 9710 }, { "epoch": 0.47099583880991175, "grad_norm": 0.4413483440876007, "learning_rate": 2.8552574895547468e-05, "loss": 1.3259, "num_input_tokens_seen": 381920092, "step": 9720 }, { "epoch": 0.47148040243008654, "grad_norm": 0.3835400640964508, "learning_rate": 2.8513732156428224e-05, "loss": 1.3392, "num_input_tokens_seen": 382302052, "step": 9730 }, { "epoch": 0.47196496605026134, "grad_norm": 0.3888629972934723, "learning_rate": 2.8474880762244034e-05, "loss": 1.3213, "num_input_tokens_seen": 382707108, "step": 9740 }, { "epoch": 0.4724495296704362, "grad_norm": 0.42624571919441223, "learning_rate": 2.8436020808694086e-05, "loss": 1.3378, "num_input_tokens_seen": 383075220, "step": 9750 }, { "epoch": 0.472934093290611, "grad_norm": 0.40198907256126404, "learning_rate": 2.8397152391498677e-05, "loss": 1.3228, "num_input_tokens_seen": 383491136, "step": 9760 }, { "epoch": 0.4734186569107858, "grad_norm": 0.4753158390522003, "learning_rate": 2.835827560639892e-05, "loss": 1.3172, "num_input_tokens_seen": 383869604, "step": 9770 }, { "epoch": 0.47390322053096057, "grad_norm": 0.4025874733924866, "learning_rate": 2.831939054915656e-05, "loss": 1.3099, "num_input_tokens_seen": 384263944, "step": 9780 }, { "epoch": 0.4743877841511354, "grad_norm": 0.4010298550128937, "learning_rate": 2.8280497315553705e-05, "loss": 1.2988, "num_input_tokens_seen": 384659820, "step": 9790 }, { "epoch": 0.4748723477713102, "grad_norm": 0.3888191878795624, "learning_rate": 2.8241596001392617e-05, "loss": 1.3535, "num_input_tokens_seen": 385047304, "step": 9800 }, { "epoch": 0.475356911391485, "grad_norm": 0.4061359763145447, "learning_rate": 2.8202686702495447e-05, "loss": 1.3124, "num_input_tokens_seen": 385440160, "step": 9810 }, { "epoch": 0.4758414750116598, "grad_norm": 0.3900851607322693, "learning_rate": 2.816376951470402e-05, "loss": 1.3093, "num_input_tokens_seen": 385848164, "step": 9820 }, { "epoch": 0.4763260386318346, "grad_norm": 0.39700040221214294, "learning_rate": 2.8124844533879607e-05, "loss": 1.31, "num_input_tokens_seen": 386286824, "step": 9830 }, { "epoch": 0.47681060225200944, "grad_norm": 0.4107182025909424, "learning_rate": 2.808591185590265e-05, "loss": 1.2721, "num_input_tokens_seen": 386666264, "step": 9840 }, { "epoch": 0.47729516587218423, "grad_norm": 0.392866313457489, "learning_rate": 2.8046971576672582e-05, "loss": 1.3607, "num_input_tokens_seen": 387054376, "step": 9850 }, { "epoch": 0.47777972949235903, "grad_norm": 0.380993127822876, "learning_rate": 2.8008023792107512e-05, "loss": 1.3442, "num_input_tokens_seen": 387450084, "step": 9860 }, { "epoch": 0.4782642931125338, "grad_norm": 0.4111970365047455, "learning_rate": 2.7969068598144095e-05, "loss": 1.3341, "num_input_tokens_seen": 387853932, "step": 9870 }, { "epoch": 0.47874885673270867, "grad_norm": 0.385023832321167, "learning_rate": 2.793010609073719e-05, "loss": 1.3521, "num_input_tokens_seen": 388260636, "step": 9880 }, { "epoch": 0.47923342035288347, "grad_norm": 0.3947872519493103, "learning_rate": 2.7891136365859683e-05, "loss": 1.3337, "num_input_tokens_seen": 388613256, "step": 9890 }, { "epoch": 0.47971798397305826, "grad_norm": 0.36505380272865295, "learning_rate": 2.7852159519502263e-05, "loss": 1.3526, "num_input_tokens_seen": 388996424, "step": 9900 }, { "epoch": 0.48020254759323305, "grad_norm": 0.40470102429389954, "learning_rate": 2.7813175647673123e-05, "loss": 1.2916, "num_input_tokens_seen": 389343832, "step": 9910 }, { "epoch": 0.4806871112134079, "grad_norm": 0.44250667095184326, "learning_rate": 2.777418484639779e-05, "loss": 1.3489, "num_input_tokens_seen": 389719140, "step": 9920 }, { "epoch": 0.4811716748335827, "grad_norm": 0.42761561274528503, "learning_rate": 2.773518721171884e-05, "loss": 1.2865, "num_input_tokens_seen": 390108812, "step": 9930 }, { "epoch": 0.4816562384537575, "grad_norm": 0.3793517053127289, "learning_rate": 2.769618283969569e-05, "loss": 1.3011, "num_input_tokens_seen": 390505056, "step": 9940 }, { "epoch": 0.4821408020739323, "grad_norm": 0.39107024669647217, "learning_rate": 2.765717182640436e-05, "loss": 1.3088, "num_input_tokens_seen": 390913668, "step": 9950 }, { "epoch": 0.4826253656941071, "grad_norm": 0.3844979703426361, "learning_rate": 2.7618154267937206e-05, "loss": 1.2989, "num_input_tokens_seen": 391319096, "step": 9960 }, { "epoch": 0.4831099293142819, "grad_norm": 0.4240836203098297, "learning_rate": 2.7579130260402736e-05, "loss": 1.3159, "num_input_tokens_seen": 391707568, "step": 9970 }, { "epoch": 0.4835944929344567, "grad_norm": 0.3745054006576538, "learning_rate": 2.7540099899925325e-05, "loss": 1.2823, "num_input_tokens_seen": 392088720, "step": 9980 }, { "epoch": 0.4840790565546315, "grad_norm": 0.41763851046562195, "learning_rate": 2.750106328264499e-05, "loss": 1.3461, "num_input_tokens_seen": 392488252, "step": 9990 }, { "epoch": 0.4845636201748063, "grad_norm": 0.371481716632843, "learning_rate": 2.746202050471719e-05, "loss": 1.3553, "num_input_tokens_seen": 392909044, "step": 10000 }, { "epoch": 0.4845636201748063, "eval_loss": 1.4267897605895996, "eval_runtime": 3.7798, "eval_samples_per_second": 39.684, "eval_steps_per_second": 5.027, "num_input_tokens_seen": 392909044, "step": 10000 }, { "epoch": 0.48504818379498116, "grad_norm": 0.4022182822227478, "learning_rate": 2.742297166231252e-05, "loss": 1.3079, "num_input_tokens_seen": 393315936, "step": 10010 }, { "epoch": 0.48553274741515595, "grad_norm": 0.3892512917518616, "learning_rate": 2.738391685161654e-05, "loss": 1.324, "num_input_tokens_seen": 393713452, "step": 10020 }, { "epoch": 0.48601731103533075, "grad_norm": 0.43548429012298584, "learning_rate": 2.7344856168829502e-05, "loss": 1.2849, "num_input_tokens_seen": 394098204, "step": 10030 }, { "epoch": 0.48650187465550554, "grad_norm": 0.3741033375263214, "learning_rate": 2.7305789710166123e-05, "loss": 1.256, "num_input_tokens_seen": 394469340, "step": 10040 }, { "epoch": 0.48698643827568033, "grad_norm": 0.3614325225353241, "learning_rate": 2.726671757185535e-05, "loss": 1.3384, "num_input_tokens_seen": 394882124, "step": 10050 }, { "epoch": 0.4874710018958552, "grad_norm": 0.4203681945800781, "learning_rate": 2.7227639850140118e-05, "loss": 1.3698, "num_input_tokens_seen": 395262528, "step": 10060 }, { "epoch": 0.48795556551603, "grad_norm": 0.42467623949050903, "learning_rate": 2.7188556641277107e-05, "loss": 1.3216, "num_input_tokens_seen": 395660708, "step": 10070 }, { "epoch": 0.48844012913620477, "grad_norm": 0.40550142526626587, "learning_rate": 2.7149468041536535e-05, "loss": 1.3211, "num_input_tokens_seen": 396068784, "step": 10080 }, { "epoch": 0.48892469275637956, "grad_norm": 0.4358334541320801, "learning_rate": 2.711037414720187e-05, "loss": 1.2643, "num_input_tokens_seen": 396441268, "step": 10090 }, { "epoch": 0.4894092563765544, "grad_norm": 0.38662582635879517, "learning_rate": 2.7071275054569638e-05, "loss": 1.2977, "num_input_tokens_seen": 396849108, "step": 10100 }, { "epoch": 0.4898938199967292, "grad_norm": 0.4204176664352417, "learning_rate": 2.703217085994918e-05, "loss": 1.2823, "num_input_tokens_seen": 397241560, "step": 10110 }, { "epoch": 0.490378383616904, "grad_norm": 0.41810983419418335, "learning_rate": 2.699306165966238e-05, "loss": 1.2884, "num_input_tokens_seen": 397641548, "step": 10120 }, { "epoch": 0.4908629472370788, "grad_norm": 0.43055158853530884, "learning_rate": 2.695394755004347e-05, "loss": 1.3254, "num_input_tokens_seen": 398025916, "step": 10130 }, { "epoch": 0.4913475108572536, "grad_norm": 0.43699413537979126, "learning_rate": 2.691482862743877e-05, "loss": 1.2906, "num_input_tokens_seen": 398442156, "step": 10140 }, { "epoch": 0.49183207447742844, "grad_norm": 0.461458295583725, "learning_rate": 2.6875704988206457e-05, "loss": 1.335, "num_input_tokens_seen": 398846656, "step": 10150 }, { "epoch": 0.49231663809760323, "grad_norm": 0.3919227123260498, "learning_rate": 2.6836576728716313e-05, "loss": 1.3533, "num_input_tokens_seen": 399230044, "step": 10160 }, { "epoch": 0.492801201717778, "grad_norm": 0.3971085548400879, "learning_rate": 2.679744394534952e-05, "loss": 1.2967, "num_input_tokens_seen": 399626988, "step": 10170 }, { "epoch": 0.4932857653379528, "grad_norm": 0.40311896800994873, "learning_rate": 2.6758306734498383e-05, "loss": 1.3335, "num_input_tokens_seen": 399992148, "step": 10180 }, { "epoch": 0.49377032895812767, "grad_norm": 0.4395916759967804, "learning_rate": 2.6719165192566138e-05, "loss": 1.2957, "num_input_tokens_seen": 400375556, "step": 10190 }, { "epoch": 0.49425489257830246, "grad_norm": 0.41459786891937256, "learning_rate": 2.6680019415966673e-05, "loss": 1.313, "num_input_tokens_seen": 400752968, "step": 10200 }, { "epoch": 0.49473945619847726, "grad_norm": 0.43075108528137207, "learning_rate": 2.6640869501124305e-05, "loss": 1.3392, "num_input_tokens_seen": 401136408, "step": 10210 }, { "epoch": 0.49522401981865205, "grad_norm": 0.43251320719718933, "learning_rate": 2.660171554447355e-05, "loss": 1.267, "num_input_tokens_seen": 401538004, "step": 10220 }, { "epoch": 0.49570858343882684, "grad_norm": 0.3789636492729187, "learning_rate": 2.6562557642458872e-05, "loss": 1.3561, "num_input_tokens_seen": 401913896, "step": 10230 }, { "epoch": 0.4961931470590017, "grad_norm": 0.410152405500412, "learning_rate": 2.652339589153447e-05, "loss": 1.3323, "num_input_tokens_seen": 402303296, "step": 10240 }, { "epoch": 0.4966777106791765, "grad_norm": 0.4302387237548828, "learning_rate": 2.648423038816401e-05, "loss": 1.3612, "num_input_tokens_seen": 402673940, "step": 10250 }, { "epoch": 0.4971622742993513, "grad_norm": 0.4012112021446228, "learning_rate": 2.6445061228820406e-05, "loss": 1.3009, "num_input_tokens_seen": 403069012, "step": 10260 }, { "epoch": 0.4976468379195261, "grad_norm": 0.42036718130111694, "learning_rate": 2.6405888509985576e-05, "loss": 1.3463, "num_input_tokens_seen": 403464332, "step": 10270 }, { "epoch": 0.4981314015397009, "grad_norm": 0.42190247774124146, "learning_rate": 2.6366712328150205e-05, "loss": 1.3038, "num_input_tokens_seen": 403863168, "step": 10280 }, { "epoch": 0.4986159651598757, "grad_norm": 0.4091307520866394, "learning_rate": 2.6327532779813506e-05, "loss": 1.2853, "num_input_tokens_seen": 404242308, "step": 10290 }, { "epoch": 0.4991005287800505, "grad_norm": 0.43492767214775085, "learning_rate": 2.6288349961482993e-05, "loss": 1.3446, "num_input_tokens_seen": 404634860, "step": 10300 }, { "epoch": 0.4995850924002253, "grad_norm": 0.4058671295642853, "learning_rate": 2.624916396967423e-05, "loss": 1.2642, "num_input_tokens_seen": 405016164, "step": 10310 }, { "epoch": 0.5000696560204001, "grad_norm": 0.3747016191482544, "learning_rate": 2.620997490091058e-05, "loss": 1.3063, "num_input_tokens_seen": 405389624, "step": 10320 }, { "epoch": 0.5005542196405749, "grad_norm": 0.4174031913280487, "learning_rate": 2.617078285172302e-05, "loss": 1.3075, "num_input_tokens_seen": 405768816, "step": 10330 }, { "epoch": 0.5010387832607497, "grad_norm": 0.42868369817733765, "learning_rate": 2.6131587918649854e-05, "loss": 1.3153, "num_input_tokens_seen": 406177564, "step": 10340 }, { "epoch": 0.5015233468809246, "grad_norm": 0.4055063724517822, "learning_rate": 2.6092390198236468e-05, "loss": 1.3084, "num_input_tokens_seen": 406576960, "step": 10350 }, { "epoch": 0.5020079105010994, "grad_norm": 0.36954593658447266, "learning_rate": 2.6053189787035147e-05, "loss": 1.2724, "num_input_tokens_seen": 407003236, "step": 10360 }, { "epoch": 0.5024924741212742, "grad_norm": 0.46568143367767334, "learning_rate": 2.6013986781604782e-05, "loss": 1.2893, "num_input_tokens_seen": 407393488, "step": 10370 }, { "epoch": 0.502977037741449, "grad_norm": 0.40515366196632385, "learning_rate": 2.5974781278510656e-05, "loss": 1.3126, "num_input_tokens_seen": 407794284, "step": 10380 }, { "epoch": 0.5034616013616238, "grad_norm": 0.4094693064689636, "learning_rate": 2.5935573374324228e-05, "loss": 1.288, "num_input_tokens_seen": 408172776, "step": 10390 }, { "epoch": 0.5039461649817986, "grad_norm": 0.4908318519592285, "learning_rate": 2.5896363165622833e-05, "loss": 1.3331, "num_input_tokens_seen": 408581868, "step": 10400 }, { "epoch": 0.5044307286019734, "grad_norm": 0.42689284682273865, "learning_rate": 2.585715074898951e-05, "loss": 1.2885, "num_input_tokens_seen": 408941632, "step": 10410 }, { "epoch": 0.5049152922221481, "grad_norm": 0.40442904829978943, "learning_rate": 2.5817936221012733e-05, "loss": 1.3529, "num_input_tokens_seen": 409363076, "step": 10420 }, { "epoch": 0.5053998558423229, "grad_norm": 0.38854020833969116, "learning_rate": 2.5778719678286172e-05, "loss": 1.2953, "num_input_tokens_seen": 409759340, "step": 10430 }, { "epoch": 0.5058844194624978, "grad_norm": 0.41985204815864563, "learning_rate": 2.5739501217408457e-05, "loss": 1.3149, "num_input_tokens_seen": 410179976, "step": 10440 }, { "epoch": 0.5063689830826726, "grad_norm": 0.42030394077301025, "learning_rate": 2.5700280934982947e-05, "loss": 1.3282, "num_input_tokens_seen": 410588712, "step": 10450 }, { "epoch": 0.5068535467028474, "grad_norm": 0.4288428723812103, "learning_rate": 2.5661058927617476e-05, "loss": 1.2864, "num_input_tokens_seen": 410998856, "step": 10460 }, { "epoch": 0.5073381103230222, "grad_norm": 0.4092228412628174, "learning_rate": 2.5621835291924157e-05, "loss": 1.2977, "num_input_tokens_seen": 411403584, "step": 10470 }, { "epoch": 0.507822673943197, "grad_norm": 0.40826088190078735, "learning_rate": 2.5582610124519087e-05, "loss": 1.2867, "num_input_tokens_seen": 411806684, "step": 10480 }, { "epoch": 0.5083072375633718, "grad_norm": 0.44615307450294495, "learning_rate": 2.5543383522022137e-05, "loss": 1.3498, "num_input_tokens_seen": 412178432, "step": 10490 }, { "epoch": 0.5087918011835466, "grad_norm": 0.382753849029541, "learning_rate": 2.5504155581056734e-05, "loss": 1.2994, "num_input_tokens_seen": 412554676, "step": 10500 }, { "epoch": 0.5092763648037214, "grad_norm": 0.4127599000930786, "learning_rate": 2.546492639824957e-05, "loss": 1.3228, "num_input_tokens_seen": 412919116, "step": 10510 }, { "epoch": 0.5097609284238963, "grad_norm": 0.3876650333404541, "learning_rate": 2.542569607023042e-05, "loss": 1.3474, "num_input_tokens_seen": 413304516, "step": 10520 }, { "epoch": 0.5102454920440711, "grad_norm": 0.3939809799194336, "learning_rate": 2.5386464693631885e-05, "loss": 1.3047, "num_input_tokens_seen": 413691892, "step": 10530 }, { "epoch": 0.5107300556642459, "grad_norm": 0.4118193984031677, "learning_rate": 2.5347232365089125e-05, "loss": 1.2911, "num_input_tokens_seen": 414112144, "step": 10540 }, { "epoch": 0.5112146192844207, "grad_norm": 0.40481314063072205, "learning_rate": 2.530799918123966e-05, "loss": 1.341, "num_input_tokens_seen": 414501584, "step": 10550 }, { "epoch": 0.5116991829045955, "grad_norm": 0.4449983537197113, "learning_rate": 2.526876523872312e-05, "loss": 1.3046, "num_input_tokens_seen": 414863372, "step": 10560 }, { "epoch": 0.5121837465247703, "grad_norm": 0.37632080912590027, "learning_rate": 2.5229530634180986e-05, "loss": 1.3226, "num_input_tokens_seen": 415256912, "step": 10570 }, { "epoch": 0.5126683101449451, "grad_norm": 0.4017525613307953, "learning_rate": 2.519029546425639e-05, "loss": 1.2999, "num_input_tokens_seen": 415645068, "step": 10580 }, { "epoch": 0.5131528737651199, "grad_norm": 0.405073881149292, "learning_rate": 2.5151059825593847e-05, "loss": 1.3299, "num_input_tokens_seen": 416025852, "step": 10590 }, { "epoch": 0.5136374373852947, "grad_norm": 0.3685546815395355, "learning_rate": 2.511182381483902e-05, "loss": 1.3465, "num_input_tokens_seen": 416410872, "step": 10600 }, { "epoch": 0.5141220010054696, "grad_norm": 0.40560224652290344, "learning_rate": 2.507258752863851e-05, "loss": 1.3055, "num_input_tokens_seen": 416840448, "step": 10610 }, { "epoch": 0.5146065646256444, "grad_norm": 0.402498334646225, "learning_rate": 2.503335106363957e-05, "loss": 1.3216, "num_input_tokens_seen": 417239552, "step": 10620 }, { "epoch": 0.5150911282458192, "grad_norm": 0.40907391905784607, "learning_rate": 2.4994114516489917e-05, "loss": 1.3089, "num_input_tokens_seen": 417648304, "step": 10630 }, { "epoch": 0.515575691865994, "grad_norm": 0.4574085474014282, "learning_rate": 2.4954877983837446e-05, "loss": 1.2659, "num_input_tokens_seen": 418049068, "step": 10640 }, { "epoch": 0.5160602554861687, "grad_norm": 0.4318368434906006, "learning_rate": 2.491564156233005e-05, "loss": 1.3243, "num_input_tokens_seen": 418453168, "step": 10650 }, { "epoch": 0.5165448191063435, "grad_norm": 0.3779464364051819, "learning_rate": 2.4876405348615303e-05, "loss": 1.3325, "num_input_tokens_seen": 418845328, "step": 10660 }, { "epoch": 0.5170293827265183, "grad_norm": 0.4640369713306427, "learning_rate": 2.483716943934031e-05, "loss": 1.3597, "num_input_tokens_seen": 419240400, "step": 10670 }, { "epoch": 0.5175139463466931, "grad_norm": 0.39406731724739075, "learning_rate": 2.47979339311514e-05, "loss": 1.348, "num_input_tokens_seen": 419604196, "step": 10680 }, { "epoch": 0.5179985099668679, "grad_norm": 0.45061835646629333, "learning_rate": 2.4758698920693933e-05, "loss": 1.3107, "num_input_tokens_seen": 420001636, "step": 10690 }, { "epoch": 0.5184830735870428, "grad_norm": 0.3857516050338745, "learning_rate": 2.4719464504612015e-05, "loss": 1.2952, "num_input_tokens_seen": 420412924, "step": 10700 }, { "epoch": 0.5189676372072176, "grad_norm": 0.40518441796302795, "learning_rate": 2.4680230779548325e-05, "loss": 1.3131, "num_input_tokens_seen": 420817072, "step": 10710 }, { "epoch": 0.5194522008273924, "grad_norm": 0.3917674422264099, "learning_rate": 2.4640997842143797e-05, "loss": 1.3628, "num_input_tokens_seen": 421217132, "step": 10720 }, { "epoch": 0.5199367644475672, "grad_norm": 0.41887742280960083, "learning_rate": 2.4601765789037465e-05, "loss": 1.2873, "num_input_tokens_seen": 421590708, "step": 10730 }, { "epoch": 0.520421328067742, "grad_norm": 0.4056337773799896, "learning_rate": 2.456253471686617e-05, "loss": 1.3099, "num_input_tokens_seen": 421991544, "step": 10740 }, { "epoch": 0.5209058916879168, "grad_norm": 0.4133414924144745, "learning_rate": 2.452330472226432e-05, "loss": 1.3146, "num_input_tokens_seen": 422369440, "step": 10750 }, { "epoch": 0.5213904553080916, "grad_norm": 0.39193689823150635, "learning_rate": 2.4484075901863697e-05, "loss": 1.3141, "num_input_tokens_seen": 422783140, "step": 10760 }, { "epoch": 0.5218750189282664, "grad_norm": 0.447322279214859, "learning_rate": 2.444484835229316e-05, "loss": 1.2537, "num_input_tokens_seen": 423177384, "step": 10770 }, { "epoch": 0.5223595825484412, "grad_norm": 0.3933646082878113, "learning_rate": 2.4405622170178483e-05, "loss": 1.3232, "num_input_tokens_seen": 423569888, "step": 10780 }, { "epoch": 0.5228441461686161, "grad_norm": 0.4285268187522888, "learning_rate": 2.436639745214201e-05, "loss": 1.2985, "num_input_tokens_seen": 423969632, "step": 10790 }, { "epoch": 0.5233287097887909, "grad_norm": 0.40538379549980164, "learning_rate": 2.432717429480254e-05, "loss": 1.3513, "num_input_tokens_seen": 424379412, "step": 10800 }, { "epoch": 0.5238132734089657, "grad_norm": 0.4205954670906067, "learning_rate": 2.4287952794774972e-05, "loss": 1.2476, "num_input_tokens_seen": 424732692, "step": 10810 }, { "epoch": 0.5242978370291405, "grad_norm": 0.4187707304954529, "learning_rate": 2.424873304867018e-05, "loss": 1.2784, "num_input_tokens_seen": 425099744, "step": 10820 }, { "epoch": 0.5247824006493153, "grad_norm": 0.4088019132614136, "learning_rate": 2.420951515309466e-05, "loss": 1.3619, "num_input_tokens_seen": 425533784, "step": 10830 }, { "epoch": 0.52526696426949, "grad_norm": 0.4790429472923279, "learning_rate": 2.4170299204650402e-05, "loss": 1.3588, "num_input_tokens_seen": 425907244, "step": 10840 }, { "epoch": 0.5257515278896648, "grad_norm": 0.39465224742889404, "learning_rate": 2.4131085299934552e-05, "loss": 1.2817, "num_input_tokens_seen": 426303280, "step": 10850 }, { "epoch": 0.5262360915098396, "grad_norm": 0.3853622376918793, "learning_rate": 2.4091873535539263e-05, "loss": 1.3173, "num_input_tokens_seen": 426708880, "step": 10860 }, { "epoch": 0.5267206551300144, "grad_norm": 0.47276604175567627, "learning_rate": 2.40526640080514e-05, "loss": 1.267, "num_input_tokens_seen": 427072384, "step": 10870 }, { "epoch": 0.5272052187501893, "grad_norm": 0.4207751452922821, "learning_rate": 2.40134568140523e-05, "loss": 1.3232, "num_input_tokens_seen": 427483780, "step": 10880 }, { "epoch": 0.5276897823703641, "grad_norm": 0.4074230194091797, "learning_rate": 2.3974252050117578e-05, "loss": 1.3059, "num_input_tokens_seen": 427878960, "step": 10890 }, { "epoch": 0.5281743459905389, "grad_norm": 0.4160650968551636, "learning_rate": 2.3935049812816853e-05, "loss": 1.284, "num_input_tokens_seen": 428262904, "step": 10900 }, { "epoch": 0.5286589096107137, "grad_norm": 0.4167976975440979, "learning_rate": 2.3895850198713532e-05, "loss": 1.3627, "num_input_tokens_seen": 428669280, "step": 10910 }, { "epoch": 0.5291434732308885, "grad_norm": 0.3857592046260834, "learning_rate": 2.3856653304364528e-05, "loss": 1.2677, "num_input_tokens_seen": 429050584, "step": 10920 }, { "epoch": 0.5296280368510633, "grad_norm": 0.39841026067733765, "learning_rate": 2.38174592263201e-05, "loss": 1.2863, "num_input_tokens_seen": 429432948, "step": 10930 }, { "epoch": 0.5301126004712381, "grad_norm": 0.37593045830726624, "learning_rate": 2.377826806112352e-05, "loss": 1.319, "num_input_tokens_seen": 429843780, "step": 10940 }, { "epoch": 0.5305971640914129, "grad_norm": 0.39701202511787415, "learning_rate": 2.3739079905310925e-05, "loss": 1.3137, "num_input_tokens_seen": 430264816, "step": 10950 }, { "epoch": 0.5310817277115877, "grad_norm": 0.4029529094696045, "learning_rate": 2.3699894855411025e-05, "loss": 1.3228, "num_input_tokens_seen": 430690612, "step": 10960 }, { "epoch": 0.5315662913317626, "grad_norm": 0.4053628742694855, "learning_rate": 2.366071300794489e-05, "loss": 1.3084, "num_input_tokens_seen": 431088820, "step": 10970 }, { "epoch": 0.5320508549519374, "grad_norm": 0.4208217263221741, "learning_rate": 2.362153445942567e-05, "loss": 1.2865, "num_input_tokens_seen": 431510088, "step": 10980 }, { "epoch": 0.5325354185721122, "grad_norm": 0.38513317704200745, "learning_rate": 2.3582359306358425e-05, "loss": 1.2924, "num_input_tokens_seen": 431912904, "step": 10990 }, { "epoch": 0.533019982192287, "grad_norm": 0.3804323077201843, "learning_rate": 2.354318764523984e-05, "loss": 1.3148, "num_input_tokens_seen": 432316308, "step": 11000 }, { "epoch": 0.5335045458124618, "grad_norm": 0.4341334104537964, "learning_rate": 2.3504019572557978e-05, "loss": 1.3025, "num_input_tokens_seen": 432706392, "step": 11010 }, { "epoch": 0.5339891094326366, "grad_norm": 0.3772577941417694, "learning_rate": 2.3464855184792103e-05, "loss": 1.3058, "num_input_tokens_seen": 433114000, "step": 11020 }, { "epoch": 0.5344736730528113, "grad_norm": 0.37960031628608704, "learning_rate": 2.3425694578412357e-05, "loss": 1.2901, "num_input_tokens_seen": 433515624, "step": 11030 }, { "epoch": 0.5349582366729861, "grad_norm": 0.3838542401790619, "learning_rate": 2.338653784987961e-05, "loss": 1.3182, "num_input_tokens_seen": 433899616, "step": 11040 }, { "epoch": 0.5354428002931609, "grad_norm": 0.40158811211586, "learning_rate": 2.3347385095645143e-05, "loss": 1.3337, "num_input_tokens_seen": 434295476, "step": 11050 }, { "epoch": 0.5359273639133358, "grad_norm": 0.41163861751556396, "learning_rate": 2.3308236412150488e-05, "loss": 1.2498, "num_input_tokens_seen": 434692404, "step": 11060 }, { "epoch": 0.5364119275335106, "grad_norm": 0.4120139181613922, "learning_rate": 2.3269091895827096e-05, "loss": 1.325, "num_input_tokens_seen": 435084432, "step": 11070 }, { "epoch": 0.5368964911536854, "grad_norm": 0.4136423170566559, "learning_rate": 2.3229951643096215e-05, "loss": 1.3232, "num_input_tokens_seen": 435481608, "step": 11080 }, { "epoch": 0.5373810547738602, "grad_norm": 0.3973861038684845, "learning_rate": 2.3190815750368534e-05, "loss": 1.2942, "num_input_tokens_seen": 435879740, "step": 11090 }, { "epoch": 0.537865618394035, "grad_norm": 0.3832756280899048, "learning_rate": 2.3151684314044042e-05, "loss": 1.291, "num_input_tokens_seen": 436231608, "step": 11100 }, { "epoch": 0.5383501820142098, "grad_norm": 0.446608304977417, "learning_rate": 2.3112557430511734e-05, "loss": 1.2808, "num_input_tokens_seen": 436633972, "step": 11110 }, { "epoch": 0.5388347456343846, "grad_norm": 0.41678836941719055, "learning_rate": 2.3073435196149392e-05, "loss": 1.3131, "num_input_tokens_seen": 437031292, "step": 11120 }, { "epoch": 0.5393193092545594, "grad_norm": 0.3977677822113037, "learning_rate": 2.3034317707323364e-05, "loss": 1.2948, "num_input_tokens_seen": 437405436, "step": 11130 }, { "epoch": 0.5398038728747342, "grad_norm": 0.4163575768470764, "learning_rate": 2.2995205060388265e-05, "loss": 1.3057, "num_input_tokens_seen": 437781780, "step": 11140 }, { "epoch": 0.5402884364949091, "grad_norm": 0.38908636569976807, "learning_rate": 2.295609735168684e-05, "loss": 1.323, "num_input_tokens_seen": 438167672, "step": 11150 }, { "epoch": 0.5407730001150839, "grad_norm": 0.3993242681026459, "learning_rate": 2.2916994677549614e-05, "loss": 1.2775, "num_input_tokens_seen": 438551092, "step": 11160 }, { "epoch": 0.5412575637352587, "grad_norm": 0.43453851342201233, "learning_rate": 2.2877897134294755e-05, "loss": 1.3369, "num_input_tokens_seen": 438925056, "step": 11170 }, { "epoch": 0.5417421273554335, "grad_norm": 0.38773950934410095, "learning_rate": 2.2838804818227766e-05, "loss": 1.3376, "num_input_tokens_seen": 439315640, "step": 11180 }, { "epoch": 0.5422266909756083, "grad_norm": 0.4047930836677551, "learning_rate": 2.2799717825641297e-05, "loss": 1.2998, "num_input_tokens_seen": 439714968, "step": 11190 }, { "epoch": 0.5427112545957831, "grad_norm": 0.385567307472229, "learning_rate": 2.2760636252814858e-05, "loss": 1.3205, "num_input_tokens_seen": 440096324, "step": 11200 }, { "epoch": 0.5431958182159579, "grad_norm": 0.3837738335132599, "learning_rate": 2.2721560196014635e-05, "loss": 1.2782, "num_input_tokens_seen": 440489640, "step": 11210 }, { "epoch": 0.5436803818361327, "grad_norm": 0.4107000231742859, "learning_rate": 2.26824897514932e-05, "loss": 1.356, "num_input_tokens_seen": 440874496, "step": 11220 }, { "epoch": 0.5441649454563076, "grad_norm": 0.4507005214691162, "learning_rate": 2.2643425015489335e-05, "loss": 1.2991, "num_input_tokens_seen": 441274860, "step": 11230 }, { "epoch": 0.5446495090764824, "grad_norm": 0.40049469470977783, "learning_rate": 2.260436608422772e-05, "loss": 1.3207, "num_input_tokens_seen": 441672140, "step": 11240 }, { "epoch": 0.5451340726966571, "grad_norm": 0.4210105836391449, "learning_rate": 2.2565313053918764e-05, "loss": 1.3396, "num_input_tokens_seen": 442056044, "step": 11250 }, { "epoch": 0.5456186363168319, "grad_norm": 0.40691545605659485, "learning_rate": 2.252626602075835e-05, "loss": 1.3008, "num_input_tokens_seen": 442444840, "step": 11260 }, { "epoch": 0.5461031999370067, "grad_norm": 0.377591073513031, "learning_rate": 2.2487225080927553e-05, "loss": 1.2803, "num_input_tokens_seen": 442832056, "step": 11270 }, { "epoch": 0.5465877635571815, "grad_norm": 0.36699971556663513, "learning_rate": 2.244819033059248e-05, "loss": 1.3336, "num_input_tokens_seen": 443227492, "step": 11280 }, { "epoch": 0.5470723271773563, "grad_norm": 0.3749854862689972, "learning_rate": 2.2409161865903952e-05, "loss": 1.31, "num_input_tokens_seen": 443640864, "step": 11290 }, { "epoch": 0.5475568907975311, "grad_norm": 0.5490464568138123, "learning_rate": 2.2370139782997342e-05, "loss": 1.3327, "num_input_tokens_seen": 444016308, "step": 11300 }, { "epoch": 0.5480414544177059, "grad_norm": 0.4179551601409912, "learning_rate": 2.2331124177992274e-05, "loss": 1.3102, "num_input_tokens_seen": 444404972, "step": 11310 }, { "epoch": 0.5485260180378808, "grad_norm": 0.3857383728027344, "learning_rate": 2.2292115146992438e-05, "loss": 1.2987, "num_input_tokens_seen": 444771056, "step": 11320 }, { "epoch": 0.5490105816580556, "grad_norm": 0.40112438797950745, "learning_rate": 2.2253112786085313e-05, "loss": 1.2942, "num_input_tokens_seen": 445173356, "step": 11330 }, { "epoch": 0.5494951452782304, "grad_norm": 0.4082011580467224, "learning_rate": 2.2214117191341972e-05, "loss": 1.2856, "num_input_tokens_seen": 445519032, "step": 11340 }, { "epoch": 0.5499797088984052, "grad_norm": 0.423109769821167, "learning_rate": 2.2175128458816792e-05, "loss": 1.3118, "num_input_tokens_seen": 445906772, "step": 11350 }, { "epoch": 0.55046427251858, "grad_norm": 0.42687487602233887, "learning_rate": 2.213614668454728e-05, "loss": 1.2962, "num_input_tokens_seen": 446254804, "step": 11360 }, { "epoch": 0.5509488361387548, "grad_norm": 0.3661687672138214, "learning_rate": 2.2097171964553757e-05, "loss": 1.3353, "num_input_tokens_seen": 446614404, "step": 11370 }, { "epoch": 0.5514333997589296, "grad_norm": 0.39668744802474976, "learning_rate": 2.2058204394839217e-05, "loss": 1.3309, "num_input_tokens_seen": 447010280, "step": 11380 }, { "epoch": 0.5519179633791044, "grad_norm": 0.3846248984336853, "learning_rate": 2.201924407138902e-05, "loss": 1.2841, "num_input_tokens_seen": 447381468, "step": 11390 }, { "epoch": 0.5524025269992792, "grad_norm": 0.39855462312698364, "learning_rate": 2.1980291090170664e-05, "loss": 1.3553, "num_input_tokens_seen": 447788436, "step": 11400 }, { "epoch": 0.5528870906194541, "grad_norm": 0.40765848755836487, "learning_rate": 2.19413455471336e-05, "loss": 1.2543, "num_input_tokens_seen": 448189856, "step": 11410 }, { "epoch": 0.5533716542396289, "grad_norm": 0.3901714086532593, "learning_rate": 2.1902407538208897e-05, "loss": 1.3356, "num_input_tokens_seen": 448564344, "step": 11420 }, { "epoch": 0.5538562178598037, "grad_norm": 0.3917088806629181, "learning_rate": 2.1863477159309132e-05, "loss": 1.3467, "num_input_tokens_seen": 448945696, "step": 11430 }, { "epoch": 0.5543407814799785, "grad_norm": 0.4396210312843323, "learning_rate": 2.182455450632803e-05, "loss": 1.3022, "num_input_tokens_seen": 449313644, "step": 11440 }, { "epoch": 0.5548253451001532, "grad_norm": 0.41703107953071594, "learning_rate": 2.178563967514034e-05, "loss": 1.2391, "num_input_tokens_seen": 449716576, "step": 11450 }, { "epoch": 0.555309908720328, "grad_norm": 0.405318945646286, "learning_rate": 2.1746732761601486e-05, "loss": 1.2382, "num_input_tokens_seen": 450092104, "step": 11460 }, { "epoch": 0.5557944723405028, "grad_norm": 0.37074992060661316, "learning_rate": 2.1707833861547442e-05, "loss": 1.2947, "num_input_tokens_seen": 450487336, "step": 11470 }, { "epoch": 0.5562790359606776, "grad_norm": 0.4145805835723877, "learning_rate": 2.1668943070794407e-05, "loss": 1.3364, "num_input_tokens_seen": 450861056, "step": 11480 }, { "epoch": 0.5567635995808524, "grad_norm": 0.42270269989967346, "learning_rate": 2.163006048513863e-05, "loss": 1.354, "num_input_tokens_seen": 451258688, "step": 11490 }, { "epoch": 0.5572481632010273, "grad_norm": 0.3963187336921692, "learning_rate": 2.1591186200356122e-05, "loss": 1.281, "num_input_tokens_seen": 451641956, "step": 11500 }, { "epoch": 0.5577327268212021, "grad_norm": 0.4037591814994812, "learning_rate": 2.1552320312202485e-05, "loss": 1.2815, "num_input_tokens_seen": 452047412, "step": 11510 }, { "epoch": 0.5582172904413769, "grad_norm": 0.3662770390510559, "learning_rate": 2.1513462916412592e-05, "loss": 1.3455, "num_input_tokens_seen": 452448840, "step": 11520 }, { "epoch": 0.5587018540615517, "grad_norm": 0.3583228290081024, "learning_rate": 2.147461410870043e-05, "loss": 1.2794, "num_input_tokens_seen": 452843044, "step": 11530 }, { "epoch": 0.5591864176817265, "grad_norm": 0.37632209062576294, "learning_rate": 2.143577398475883e-05, "loss": 1.2674, "num_input_tokens_seen": 453264900, "step": 11540 }, { "epoch": 0.5596709813019013, "grad_norm": 0.3996264636516571, "learning_rate": 2.139694264025922e-05, "loss": 1.2744, "num_input_tokens_seen": 453667984, "step": 11550 }, { "epoch": 0.5601555449220761, "grad_norm": 0.3832734525203705, "learning_rate": 2.135812017085142e-05, "loss": 1.2997, "num_input_tokens_seen": 454063124, "step": 11560 }, { "epoch": 0.5606401085422509, "grad_norm": 0.4112671911716461, "learning_rate": 2.1319306672163355e-05, "loss": 1.326, "num_input_tokens_seen": 454468736, "step": 11570 }, { "epoch": 0.5611246721624257, "grad_norm": 0.40387192368507385, "learning_rate": 2.1280502239800905e-05, "loss": 1.2733, "num_input_tokens_seen": 454868724, "step": 11580 }, { "epoch": 0.5616092357826006, "grad_norm": 0.4105452001094818, "learning_rate": 2.1241706969347554e-05, "loss": 1.2681, "num_input_tokens_seen": 455262064, "step": 11590 }, { "epoch": 0.5620937994027754, "grad_norm": 0.41394761204719543, "learning_rate": 2.1202920956364282e-05, "loss": 1.2333, "num_input_tokens_seen": 455641940, "step": 11600 }, { "epoch": 0.5625783630229502, "grad_norm": 0.47459670901298523, "learning_rate": 2.116414429638922e-05, "loss": 1.3239, "num_input_tokens_seen": 456036468, "step": 11610 }, { "epoch": 0.563062926643125, "grad_norm": 0.38530948758125305, "learning_rate": 2.112537708493749e-05, "loss": 1.3167, "num_input_tokens_seen": 456420516, "step": 11620 }, { "epoch": 0.5635474902632998, "grad_norm": 0.3913155794143677, "learning_rate": 2.108661941750091e-05, "loss": 1.3043, "num_input_tokens_seen": 456824708, "step": 11630 }, { "epoch": 0.5640320538834745, "grad_norm": 0.44334185123443604, "learning_rate": 2.1047871389547826e-05, "loss": 1.3133, "num_input_tokens_seen": 457192180, "step": 11640 }, { "epoch": 0.5645166175036493, "grad_norm": 0.41720449924468994, "learning_rate": 2.1009133096522805e-05, "loss": 1.2314, "num_input_tokens_seen": 457554472, "step": 11650 }, { "epoch": 0.5650011811238241, "grad_norm": 0.3976454734802246, "learning_rate": 2.0970404633846453e-05, "loss": 1.3181, "num_input_tokens_seen": 457940752, "step": 11660 }, { "epoch": 0.5654857447439989, "grad_norm": 0.3785874843597412, "learning_rate": 2.0931686096915172e-05, "loss": 1.2951, "num_input_tokens_seen": 458329516, "step": 11670 }, { "epoch": 0.5659703083641738, "grad_norm": 0.42193588614463806, "learning_rate": 2.0892977581100884e-05, "loss": 1.2895, "num_input_tokens_seen": 458728512, "step": 11680 }, { "epoch": 0.5664548719843486, "grad_norm": 0.4394080936908722, "learning_rate": 2.085427918175086e-05, "loss": 1.287, "num_input_tokens_seen": 459138172, "step": 11690 }, { "epoch": 0.5669394356045234, "grad_norm": 0.39886605739593506, "learning_rate": 2.0815590994187416e-05, "loss": 1.3094, "num_input_tokens_seen": 459549264, "step": 11700 }, { "epoch": 0.5674239992246982, "grad_norm": 0.37033554911613464, "learning_rate": 2.0776913113707766e-05, "loss": 1.3196, "num_input_tokens_seen": 459931536, "step": 11710 }, { "epoch": 0.567908562844873, "grad_norm": 0.38384774327278137, "learning_rate": 2.0738245635583675e-05, "loss": 1.2916, "num_input_tokens_seen": 460349548, "step": 11720 }, { "epoch": 0.5683931264650478, "grad_norm": 0.41543838381767273, "learning_rate": 2.0699588655061337e-05, "loss": 1.34, "num_input_tokens_seen": 460751796, "step": 11730 }, { "epoch": 0.5688776900852226, "grad_norm": 0.38962802290916443, "learning_rate": 2.066094226736104e-05, "loss": 1.2622, "num_input_tokens_seen": 461166532, "step": 11740 }, { "epoch": 0.5693622537053974, "grad_norm": 0.3820241391658783, "learning_rate": 2.0622306567677026e-05, "loss": 1.3371, "num_input_tokens_seen": 461544700, "step": 11750 }, { "epoch": 0.5698468173255722, "grad_norm": 0.37688279151916504, "learning_rate": 2.0583681651177177e-05, "loss": 1.3028, "num_input_tokens_seen": 461923852, "step": 11760 }, { "epoch": 0.5703313809457471, "grad_norm": 0.41758424043655396, "learning_rate": 2.0545067613002844e-05, "loss": 1.303, "num_input_tokens_seen": 462284468, "step": 11770 }, { "epoch": 0.5708159445659219, "grad_norm": 0.41535744071006775, "learning_rate": 2.050646454826854e-05, "loss": 1.2806, "num_input_tokens_seen": 462662724, "step": 11780 }, { "epoch": 0.5713005081860967, "grad_norm": 0.41372743248939514, "learning_rate": 2.0467872552061785e-05, "loss": 1.2763, "num_input_tokens_seen": 463065836, "step": 11790 }, { "epoch": 0.5717850718062715, "grad_norm": 0.4078621566295624, "learning_rate": 2.042929171944283e-05, "loss": 1.2739, "num_input_tokens_seen": 463465784, "step": 11800 }, { "epoch": 0.5722696354264463, "grad_norm": 0.4270489811897278, "learning_rate": 2.03907221454444e-05, "loss": 1.2816, "num_input_tokens_seen": 463873784, "step": 11810 }, { "epoch": 0.5727541990466211, "grad_norm": 0.40874189138412476, "learning_rate": 2.0352163925071526e-05, "loss": 1.3118, "num_input_tokens_seen": 464264876, "step": 11820 }, { "epoch": 0.5732387626667959, "grad_norm": 0.4195123016834259, "learning_rate": 2.031361715330124e-05, "loss": 1.28, "num_input_tokens_seen": 464655612, "step": 11830 }, { "epoch": 0.5737233262869706, "grad_norm": 0.42843663692474365, "learning_rate": 2.0275081925082408e-05, "loss": 1.2797, "num_input_tokens_seen": 465021840, "step": 11840 }, { "epoch": 0.5742078899071454, "grad_norm": 0.4026115834712982, "learning_rate": 2.0236558335335418e-05, "loss": 1.2662, "num_input_tokens_seen": 465412176, "step": 11850 }, { "epoch": 0.5746924535273203, "grad_norm": 0.4160013794898987, "learning_rate": 2.0198046478952034e-05, "loss": 1.3044, "num_input_tokens_seen": 465798436, "step": 11860 }, { "epoch": 0.5751770171474951, "grad_norm": 0.3887726068496704, "learning_rate": 2.0159546450795076e-05, "loss": 1.271, "num_input_tokens_seen": 466194580, "step": 11870 }, { "epoch": 0.5756615807676699, "grad_norm": 0.4644850492477417, "learning_rate": 2.012105834569827e-05, "loss": 1.3343, "num_input_tokens_seen": 466583580, "step": 11880 }, { "epoch": 0.5761461443878447, "grad_norm": 0.37905943393707275, "learning_rate": 2.008258225846594e-05, "loss": 1.2825, "num_input_tokens_seen": 466995580, "step": 11890 }, { "epoch": 0.5766307080080195, "grad_norm": 0.4146198034286499, "learning_rate": 2.0044118283872842e-05, "loss": 1.2915, "num_input_tokens_seen": 467394060, "step": 11900 }, { "epoch": 0.5771152716281943, "grad_norm": 0.40112000703811646, "learning_rate": 2.0005666516663844e-05, "loss": 1.2813, "num_input_tokens_seen": 467762492, "step": 11910 }, { "epoch": 0.5775998352483691, "grad_norm": 0.3847469091415405, "learning_rate": 1.9967227051553798e-05, "loss": 1.3034, "num_input_tokens_seen": 468158852, "step": 11920 }, { "epoch": 0.5780843988685439, "grad_norm": 0.39969122409820557, "learning_rate": 1.992879998322723e-05, "loss": 1.2962, "num_input_tokens_seen": 468564816, "step": 11930 }, { "epoch": 0.5785689624887188, "grad_norm": 0.42271485924720764, "learning_rate": 1.9890385406338118e-05, "loss": 1.2821, "num_input_tokens_seen": 468965016, "step": 11940 }, { "epoch": 0.5790535261088936, "grad_norm": 0.4109669029712677, "learning_rate": 1.9851983415509704e-05, "loss": 1.2964, "num_input_tokens_seen": 469374000, "step": 11950 }, { "epoch": 0.5795380897290684, "grad_norm": 0.38713210821151733, "learning_rate": 1.981359410533418e-05, "loss": 1.2827, "num_input_tokens_seen": 469791140, "step": 11960 }, { "epoch": 0.5800226533492432, "grad_norm": 0.39154934883117676, "learning_rate": 1.9775217570372556e-05, "loss": 1.2617, "num_input_tokens_seen": 470165196, "step": 11970 }, { "epoch": 0.580507216969418, "grad_norm": 0.4089168310165405, "learning_rate": 1.9736853905154334e-05, "loss": 1.2968, "num_input_tokens_seen": 470550712, "step": 11980 }, { "epoch": 0.5809917805895928, "grad_norm": 0.45818933844566345, "learning_rate": 1.9698503204177342e-05, "loss": 1.2931, "num_input_tokens_seen": 470936976, "step": 11990 }, { "epoch": 0.5814763442097676, "grad_norm": 0.42065727710723877, "learning_rate": 1.9660165561907447e-05, "loss": 1.3308, "num_input_tokens_seen": 471314876, "step": 12000 }, { "epoch": 0.5814763442097676, "eval_loss": 1.4008795022964478, "eval_runtime": 3.8344, "eval_samples_per_second": 39.12, "eval_steps_per_second": 4.955, "num_input_tokens_seen": 471314876, "step": 12000 }, { "epoch": 0.5819609078299424, "grad_norm": 0.38812288641929626, "learning_rate": 1.9621841072778387e-05, "loss": 1.2928, "num_input_tokens_seen": 471727100, "step": 12010 }, { "epoch": 0.5824454714501172, "grad_norm": 0.402675062417984, "learning_rate": 1.9583529831191448e-05, "loss": 1.3312, "num_input_tokens_seen": 472107148, "step": 12020 }, { "epoch": 0.5829300350702921, "grad_norm": 0.4156324863433838, "learning_rate": 1.954523193151534e-05, "loss": 1.3157, "num_input_tokens_seen": 472518180, "step": 12030 }, { "epoch": 0.5834145986904669, "grad_norm": 0.40975773334503174, "learning_rate": 1.9506947468085866e-05, "loss": 1.2565, "num_input_tokens_seen": 472894628, "step": 12040 }, { "epoch": 0.5838991623106417, "grad_norm": 0.4518705904483795, "learning_rate": 1.9468676535205767e-05, "loss": 1.2686, "num_input_tokens_seen": 473270452, "step": 12050 }, { "epoch": 0.5843837259308164, "grad_norm": 0.4540398120880127, "learning_rate": 1.9430419227144443e-05, "loss": 1.2771, "num_input_tokens_seen": 473644072, "step": 12060 }, { "epoch": 0.5848682895509912, "grad_norm": 0.40955179929733276, "learning_rate": 1.939217563813771e-05, "loss": 1.3421, "num_input_tokens_seen": 474010688, "step": 12070 }, { "epoch": 0.585352853171166, "grad_norm": 0.4388551115989685, "learning_rate": 1.935394586238763e-05, "loss": 1.3129, "num_input_tokens_seen": 474390028, "step": 12080 }, { "epoch": 0.5858374167913408, "grad_norm": 0.41205498576164246, "learning_rate": 1.93157299940622e-05, "loss": 1.2662, "num_input_tokens_seen": 474801152, "step": 12090 }, { "epoch": 0.5863219804115156, "grad_norm": 0.41944679617881775, "learning_rate": 1.92775281272952e-05, "loss": 1.2786, "num_input_tokens_seen": 475187368, "step": 12100 }, { "epoch": 0.5868065440316904, "grad_norm": 0.4028654098510742, "learning_rate": 1.9239340356185892e-05, "loss": 1.2792, "num_input_tokens_seen": 475599252, "step": 12110 }, { "epoch": 0.5872911076518653, "grad_norm": 0.44524598121643066, "learning_rate": 1.9201166774798833e-05, "loss": 1.3175, "num_input_tokens_seen": 475971476, "step": 12120 }, { "epoch": 0.5877756712720401, "grad_norm": 0.3849859833717346, "learning_rate": 1.9163007477163616e-05, "loss": 1.2327, "num_input_tokens_seen": 476379776, "step": 12130 }, { "epoch": 0.5882602348922149, "grad_norm": 0.4216983914375305, "learning_rate": 1.9124862557274668e-05, "loss": 1.2934, "num_input_tokens_seen": 476772296, "step": 12140 }, { "epoch": 0.5887447985123897, "grad_norm": 0.4107081890106201, "learning_rate": 1.9086732109090965e-05, "loss": 1.3092, "num_input_tokens_seen": 477178020, "step": 12150 }, { "epoch": 0.5892293621325645, "grad_norm": 0.43871021270751953, "learning_rate": 1.904861622653589e-05, "loss": 1.3208, "num_input_tokens_seen": 477570016, "step": 12160 }, { "epoch": 0.5897139257527393, "grad_norm": 0.4565945565700531, "learning_rate": 1.9010515003496892e-05, "loss": 1.2592, "num_input_tokens_seen": 477957740, "step": 12170 }, { "epoch": 0.5901984893729141, "grad_norm": 0.4425964653491974, "learning_rate": 1.8972428533825345e-05, "loss": 1.2855, "num_input_tokens_seen": 478348188, "step": 12180 }, { "epoch": 0.5906830529930889, "grad_norm": 0.41607752442359924, "learning_rate": 1.8934356911336283e-05, "loss": 1.3222, "num_input_tokens_seen": 478741832, "step": 12190 }, { "epoch": 0.5911676166132637, "grad_norm": 0.43811753392219543, "learning_rate": 1.8896300229808144e-05, "loss": 1.2914, "num_input_tokens_seen": 479136864, "step": 12200 }, { "epoch": 0.5916521802334386, "grad_norm": 0.4176853895187378, "learning_rate": 1.8858258582982597e-05, "loss": 1.3503, "num_input_tokens_seen": 479527104, "step": 12210 }, { "epoch": 0.5921367438536134, "grad_norm": 0.3926706612110138, "learning_rate": 1.8820232064564233e-05, "loss": 1.286, "num_input_tokens_seen": 479884732, "step": 12220 }, { "epoch": 0.5926213074737882, "grad_norm": 0.38026413321495056, "learning_rate": 1.878222076822043e-05, "loss": 1.2862, "num_input_tokens_seen": 480292660, "step": 12230 }, { "epoch": 0.593105871093963, "grad_norm": 0.4443627595901489, "learning_rate": 1.8744224787581024e-05, "loss": 1.2863, "num_input_tokens_seen": 480669036, "step": 12240 }, { "epoch": 0.5935904347141377, "grad_norm": 0.3894060552120209, "learning_rate": 1.870624421623816e-05, "loss": 1.315, "num_input_tokens_seen": 481033564, "step": 12250 }, { "epoch": 0.5940749983343125, "grad_norm": 0.4227178692817688, "learning_rate": 1.8668279147746e-05, "loss": 1.3146, "num_input_tokens_seen": 481449056, "step": 12260 }, { "epoch": 0.5945595619544873, "grad_norm": 0.38117271661758423, "learning_rate": 1.863032967562055e-05, "loss": 1.2849, "num_input_tokens_seen": 481832856, "step": 12270 }, { "epoch": 0.5950441255746621, "grad_norm": 0.446122944355011, "learning_rate": 1.859239589333936e-05, "loss": 1.3231, "num_input_tokens_seen": 482219828, "step": 12280 }, { "epoch": 0.5955286891948369, "grad_norm": 0.4982196092605591, "learning_rate": 1.855447789434137e-05, "loss": 1.2839, "num_input_tokens_seen": 482624804, "step": 12290 }, { "epoch": 0.5960132528150118, "grad_norm": 0.44610318541526794, "learning_rate": 1.851657577202661e-05, "loss": 1.2712, "num_input_tokens_seen": 483017968, "step": 12300 }, { "epoch": 0.5964978164351866, "grad_norm": 0.4101884067058563, "learning_rate": 1.8478689619756026e-05, "loss": 1.3168, "num_input_tokens_seen": 483408892, "step": 12310 }, { "epoch": 0.5969823800553614, "grad_norm": 0.38893336057662964, "learning_rate": 1.8440819530851225e-05, "loss": 1.2848, "num_input_tokens_seen": 483774232, "step": 12320 }, { "epoch": 0.5974669436755362, "grad_norm": 0.4360596835613251, "learning_rate": 1.8402965598594227e-05, "loss": 1.3122, "num_input_tokens_seen": 484172492, "step": 12330 }, { "epoch": 0.597951507295711, "grad_norm": 0.3694493770599365, "learning_rate": 1.8365127916227288e-05, "loss": 1.2786, "num_input_tokens_seen": 484548300, "step": 12340 }, { "epoch": 0.5984360709158858, "grad_norm": 0.4072704315185547, "learning_rate": 1.8327306576952592e-05, "loss": 1.3197, "num_input_tokens_seen": 484966920, "step": 12350 }, { "epoch": 0.5989206345360606, "grad_norm": 0.43515607714653015, "learning_rate": 1.828950167393211e-05, "loss": 1.2959, "num_input_tokens_seen": 485363496, "step": 12360 }, { "epoch": 0.5994051981562354, "grad_norm": 0.3934329152107239, "learning_rate": 1.8251713300287294e-05, "loss": 1.2764, "num_input_tokens_seen": 485781532, "step": 12370 }, { "epoch": 0.5998897617764102, "grad_norm": 0.43624651432037354, "learning_rate": 1.821394154909891e-05, "loss": 1.2791, "num_input_tokens_seen": 486165704, "step": 12380 }, { "epoch": 0.6003743253965851, "grad_norm": 0.4212031960487366, "learning_rate": 1.817618651340675e-05, "loss": 1.2857, "num_input_tokens_seen": 486559644, "step": 12390 }, { "epoch": 0.6008588890167599, "grad_norm": 0.4342302680015564, "learning_rate": 1.813844828620946e-05, "loss": 1.265, "num_input_tokens_seen": 486973628, "step": 12400 }, { "epoch": 0.6013434526369347, "grad_norm": 0.35909196734428406, "learning_rate": 1.8100726960464254e-05, "loss": 1.2953, "num_input_tokens_seen": 487389200, "step": 12410 }, { "epoch": 0.6018280162571095, "grad_norm": 0.4191993772983551, "learning_rate": 1.8063022629086752e-05, "loss": 1.2548, "num_input_tokens_seen": 487788196, "step": 12420 }, { "epoch": 0.6023125798772843, "grad_norm": 0.4022323191165924, "learning_rate": 1.8025335384950665e-05, "loss": 1.2569, "num_input_tokens_seen": 488152244, "step": 12430 }, { "epoch": 0.602797143497459, "grad_norm": 0.3971206545829773, "learning_rate": 1.7987665320887666e-05, "loss": 1.3111, "num_input_tokens_seen": 488555012, "step": 12440 }, { "epoch": 0.6032817071176338, "grad_norm": 0.40955471992492676, "learning_rate": 1.795001252968706e-05, "loss": 1.2905, "num_input_tokens_seen": 488963324, "step": 12450 }, { "epoch": 0.6037662707378086, "grad_norm": 0.4047541320323944, "learning_rate": 1.7912377104095645e-05, "loss": 1.3335, "num_input_tokens_seen": 489371688, "step": 12460 }, { "epoch": 0.6042508343579834, "grad_norm": 0.4085901379585266, "learning_rate": 1.787475913681743e-05, "loss": 1.2776, "num_input_tokens_seen": 489791296, "step": 12470 }, { "epoch": 0.6047353979781583, "grad_norm": 0.40488049387931824, "learning_rate": 1.783715872051341e-05, "loss": 1.3007, "num_input_tokens_seen": 490158224, "step": 12480 }, { "epoch": 0.6052199615983331, "grad_norm": 0.37421244382858276, "learning_rate": 1.7799575947801374e-05, "loss": 1.3284, "num_input_tokens_seen": 490589156, "step": 12490 }, { "epoch": 0.6057045252185079, "grad_norm": 0.44858112931251526, "learning_rate": 1.776201091125561e-05, "loss": 1.2499, "num_input_tokens_seen": 491003784, "step": 12500 }, { "epoch": 0.6061890888386827, "grad_norm": 0.3810955286026001, "learning_rate": 1.7724463703406766e-05, "loss": 1.2935, "num_input_tokens_seen": 491417140, "step": 12510 }, { "epoch": 0.6066736524588575, "grad_norm": 0.4152781665325165, "learning_rate": 1.768693441674153e-05, "loss": 1.3209, "num_input_tokens_seen": 491809688, "step": 12520 }, { "epoch": 0.6071582160790323, "grad_norm": 0.4103911221027374, "learning_rate": 1.764942314370248e-05, "loss": 1.2924, "num_input_tokens_seen": 492210364, "step": 12530 }, { "epoch": 0.6076427796992071, "grad_norm": 0.4096033275127411, "learning_rate": 1.761192997668781e-05, "loss": 1.25, "num_input_tokens_seen": 492627760, "step": 12540 }, { "epoch": 0.6081273433193819, "grad_norm": 0.42093944549560547, "learning_rate": 1.7574455008051115e-05, "loss": 1.2955, "num_input_tokens_seen": 493015580, "step": 12550 }, { "epoch": 0.6086119069395567, "grad_norm": 0.43841052055358887, "learning_rate": 1.753699833010115e-05, "loss": 1.2778, "num_input_tokens_seen": 493413392, "step": 12560 }, { "epoch": 0.6090964705597316, "grad_norm": 0.3942921459674835, "learning_rate": 1.7499560035101653e-05, "loss": 1.2609, "num_input_tokens_seen": 493804560, "step": 12570 }, { "epoch": 0.6095810341799064, "grad_norm": 0.42653924226760864, "learning_rate": 1.746214021527103e-05, "loss": 1.2611, "num_input_tokens_seen": 494185728, "step": 12580 }, { "epoch": 0.6100655978000812, "grad_norm": 0.3946724236011505, "learning_rate": 1.7424738962782222e-05, "loss": 1.2609, "num_input_tokens_seen": 494597784, "step": 12590 }, { "epoch": 0.610550161420256, "grad_norm": 0.37868139147758484, "learning_rate": 1.7387356369762426e-05, "loss": 1.2714, "num_input_tokens_seen": 494995584, "step": 12600 }, { "epoch": 0.6110347250404308, "grad_norm": 0.42459574341773987, "learning_rate": 1.734999252829285e-05, "loss": 1.2983, "num_input_tokens_seen": 495396436, "step": 12610 }, { "epoch": 0.6115192886606056, "grad_norm": 0.3998323976993561, "learning_rate": 1.7312647530408548e-05, "loss": 1.2819, "num_input_tokens_seen": 495790568, "step": 12620 }, { "epoch": 0.6120038522807804, "grad_norm": 0.4067113995552063, "learning_rate": 1.7275321468098133e-05, "loss": 1.2594, "num_input_tokens_seen": 496182596, "step": 12630 }, { "epoch": 0.6124884159009552, "grad_norm": 0.41096609830856323, "learning_rate": 1.72380144333036e-05, "loss": 1.2807, "num_input_tokens_seen": 496584560, "step": 12640 }, { "epoch": 0.6129729795211301, "grad_norm": 0.448887437582016, "learning_rate": 1.720072651792004e-05, "loss": 1.2719, "num_input_tokens_seen": 496979328, "step": 12650 }, { "epoch": 0.6134575431413049, "grad_norm": 0.4053841829299927, "learning_rate": 1.716345781379549e-05, "loss": 1.3734, "num_input_tokens_seen": 497342904, "step": 12660 }, { "epoch": 0.6139421067614796, "grad_norm": 0.4214945137500763, "learning_rate": 1.7126208412730628e-05, "loss": 1.3149, "num_input_tokens_seen": 497758472, "step": 12670 }, { "epoch": 0.6144266703816544, "grad_norm": 0.42576828598976135, "learning_rate": 1.708897840647861e-05, "loss": 1.2727, "num_input_tokens_seen": 498155548, "step": 12680 }, { "epoch": 0.6149112340018292, "grad_norm": 0.3854653537273407, "learning_rate": 1.7051767886744808e-05, "loss": 1.2532, "num_input_tokens_seen": 498540656, "step": 12690 }, { "epoch": 0.615395797622004, "grad_norm": 0.42444708943367004, "learning_rate": 1.701457694518661e-05, "loss": 1.2709, "num_input_tokens_seen": 498973360, "step": 12700 }, { "epoch": 0.6158803612421788, "grad_norm": 0.3879097104072571, "learning_rate": 1.697740567341314e-05, "loss": 1.3207, "num_input_tokens_seen": 499404028, "step": 12710 }, { "epoch": 0.6163649248623536, "grad_norm": 0.3972528874874115, "learning_rate": 1.694025416298511e-05, "loss": 1.3065, "num_input_tokens_seen": 499780220, "step": 12720 }, { "epoch": 0.6168494884825284, "grad_norm": 0.4193166196346283, "learning_rate": 1.6903122505414552e-05, "loss": 1.3359, "num_input_tokens_seen": 500185720, "step": 12730 }, { "epoch": 0.6173340521027033, "grad_norm": 0.40607649087905884, "learning_rate": 1.686601079216457e-05, "loss": 1.2789, "num_input_tokens_seen": 500562168, "step": 12740 }, { "epoch": 0.6178186157228781, "grad_norm": 0.4222082197666168, "learning_rate": 1.682891911464917e-05, "loss": 1.2436, "num_input_tokens_seen": 500990044, "step": 12750 }, { "epoch": 0.6183031793430529, "grad_norm": 0.40841108560562134, "learning_rate": 1.6791847564232982e-05, "loss": 1.2466, "num_input_tokens_seen": 501376276, "step": 12760 }, { "epoch": 0.6187877429632277, "grad_norm": 0.4501042664051056, "learning_rate": 1.6754796232231084e-05, "loss": 1.3059, "num_input_tokens_seen": 501734004, "step": 12770 }, { "epoch": 0.6192723065834025, "grad_norm": 0.3940059244632721, "learning_rate": 1.6717765209908722e-05, "loss": 1.3457, "num_input_tokens_seen": 502130812, "step": 12780 }, { "epoch": 0.6197568702035773, "grad_norm": 0.372949481010437, "learning_rate": 1.668075458848115e-05, "loss": 1.2881, "num_input_tokens_seen": 502544872, "step": 12790 }, { "epoch": 0.6202414338237521, "grad_norm": 0.4232177138328552, "learning_rate": 1.6643764459113324e-05, "loss": 1.2777, "num_input_tokens_seen": 502909916, "step": 12800 }, { "epoch": 0.6207259974439269, "grad_norm": 0.40586578845977783, "learning_rate": 1.6606794912919776e-05, "loss": 1.2462, "num_input_tokens_seen": 503313708, "step": 12810 }, { "epoch": 0.6212105610641017, "grad_norm": 0.40091219544410706, "learning_rate": 1.6569846040964293e-05, "loss": 1.305, "num_input_tokens_seen": 503697412, "step": 12820 }, { "epoch": 0.6216951246842766, "grad_norm": 0.42310020327568054, "learning_rate": 1.653291793425978e-05, "loss": 1.2631, "num_input_tokens_seen": 504107344, "step": 12830 }, { "epoch": 0.6221796883044514, "grad_norm": 0.42501911520957947, "learning_rate": 1.6496010683767936e-05, "loss": 1.275, "num_input_tokens_seen": 504491792, "step": 12840 }, { "epoch": 0.6226642519246262, "grad_norm": 0.42510515451431274, "learning_rate": 1.6459124380399144e-05, "loss": 1.2764, "num_input_tokens_seen": 504881504, "step": 12850 }, { "epoch": 0.623148815544801, "grad_norm": 0.4044088125228882, "learning_rate": 1.6422259115012165e-05, "loss": 1.2925, "num_input_tokens_seen": 505293712, "step": 12860 }, { "epoch": 0.6236333791649757, "grad_norm": 0.4045158326625824, "learning_rate": 1.638541497841392e-05, "loss": 1.2921, "num_input_tokens_seen": 505655600, "step": 12870 }, { "epoch": 0.6241179427851505, "grad_norm": 0.42825374007225037, "learning_rate": 1.6348592061359334e-05, "loss": 1.2857, "num_input_tokens_seen": 506040624, "step": 12880 }, { "epoch": 0.6246025064053253, "grad_norm": 0.39818504452705383, "learning_rate": 1.6311790454551e-05, "loss": 1.2945, "num_input_tokens_seen": 506464380, "step": 12890 }, { "epoch": 0.6250870700255001, "grad_norm": 0.46967610716819763, "learning_rate": 1.6275010248639085e-05, "loss": 1.2733, "num_input_tokens_seen": 506879284, "step": 12900 }, { "epoch": 0.6255716336456749, "grad_norm": 0.46247559785842896, "learning_rate": 1.6238251534220982e-05, "loss": 1.2628, "num_input_tokens_seen": 507255672, "step": 12910 }, { "epoch": 0.6260561972658498, "grad_norm": 0.39192506670951843, "learning_rate": 1.6201514401841204e-05, "loss": 1.2749, "num_input_tokens_seen": 507642376, "step": 12920 }, { "epoch": 0.6265407608860246, "grad_norm": 0.41501137614250183, "learning_rate": 1.6164798941991046e-05, "loss": 1.2847, "num_input_tokens_seen": 508025372, "step": 12930 }, { "epoch": 0.6270253245061994, "grad_norm": 0.38921937346458435, "learning_rate": 1.6128105245108464e-05, "loss": 1.2737, "num_input_tokens_seen": 508418732, "step": 12940 }, { "epoch": 0.6275098881263742, "grad_norm": 0.3864487111568451, "learning_rate": 1.609143340157777e-05, "loss": 1.2822, "num_input_tokens_seen": 508802832, "step": 12950 }, { "epoch": 0.627994451746549, "grad_norm": 0.39535877108573914, "learning_rate": 1.6054783501729488e-05, "loss": 1.3428, "num_input_tokens_seen": 509204136, "step": 12960 }, { "epoch": 0.6284790153667238, "grad_norm": 0.47437265515327454, "learning_rate": 1.6018155635840046e-05, "loss": 1.261, "num_input_tokens_seen": 509580876, "step": 12970 }, { "epoch": 0.6289635789868986, "grad_norm": 0.40359655022621155, "learning_rate": 1.5981549894131628e-05, "loss": 1.2695, "num_input_tokens_seen": 509961156, "step": 12980 }, { "epoch": 0.6294481426070734, "grad_norm": 0.4515499770641327, "learning_rate": 1.5944966366771928e-05, "loss": 1.3089, "num_input_tokens_seen": 510362116, "step": 12990 }, { "epoch": 0.6299327062272482, "grad_norm": 0.384777307510376, "learning_rate": 1.5908405143873878e-05, "loss": 1.296, "num_input_tokens_seen": 510738872, "step": 13000 }, { "epoch": 0.6304172698474231, "grad_norm": 0.36402514576911926, "learning_rate": 1.587186631549552e-05, "loss": 1.2881, "num_input_tokens_seen": 511139464, "step": 13010 }, { "epoch": 0.6309018334675979, "grad_norm": 0.36122068762779236, "learning_rate": 1.5835349971639694e-05, "loss": 1.2815, "num_input_tokens_seen": 511548508, "step": 13020 }, { "epoch": 0.6313863970877727, "grad_norm": 0.4500305652618408, "learning_rate": 1.5798856202253885e-05, "loss": 1.2844, "num_input_tokens_seen": 511919064, "step": 13030 }, { "epoch": 0.6318709607079475, "grad_norm": 0.41978541016578674, "learning_rate": 1.5762385097229952e-05, "loss": 1.3016, "num_input_tokens_seen": 512296380, "step": 13040 }, { "epoch": 0.6323555243281223, "grad_norm": 0.40535783767700195, "learning_rate": 1.5725936746403952e-05, "loss": 1.3093, "num_input_tokens_seen": 512698204, "step": 13050 }, { "epoch": 0.632840087948297, "grad_norm": 0.41118356585502625, "learning_rate": 1.568951123955585e-05, "loss": 1.3026, "num_input_tokens_seen": 513094244, "step": 13060 }, { "epoch": 0.6333246515684718, "grad_norm": 0.4092055559158325, "learning_rate": 1.565310866640939e-05, "loss": 1.2814, "num_input_tokens_seen": 513478972, "step": 13070 }, { "epoch": 0.6338092151886466, "grad_norm": 0.42620420455932617, "learning_rate": 1.561672911663179e-05, "loss": 1.2906, "num_input_tokens_seen": 513892816, "step": 13080 }, { "epoch": 0.6342937788088214, "grad_norm": 0.40342986583709717, "learning_rate": 1.558037267983358e-05, "loss": 1.2528, "num_input_tokens_seen": 514268756, "step": 13090 }, { "epoch": 0.6347783424289963, "grad_norm": 0.44240349531173706, "learning_rate": 1.554403944556834e-05, "loss": 1.2711, "num_input_tokens_seen": 514676816, "step": 13100 }, { "epoch": 0.6352629060491711, "grad_norm": 0.38063910603523254, "learning_rate": 1.5507729503332503e-05, "loss": 1.2958, "num_input_tokens_seen": 515102476, "step": 13110 }, { "epoch": 0.6357474696693459, "grad_norm": 0.4031839072704315, "learning_rate": 1.547144294256514e-05, "loss": 1.3018, "num_input_tokens_seen": 515484244, "step": 13120 }, { "epoch": 0.6362320332895207, "grad_norm": 0.4612565338611603, "learning_rate": 1.5435179852647712e-05, "loss": 1.2909, "num_input_tokens_seen": 515868796, "step": 13130 }, { "epoch": 0.6367165969096955, "grad_norm": 0.4232649803161621, "learning_rate": 1.539894032290389e-05, "loss": 1.2522, "num_input_tokens_seen": 516277108, "step": 13140 }, { "epoch": 0.6372011605298703, "grad_norm": 0.4205443859100342, "learning_rate": 1.5362724442599276e-05, "loss": 1.3396, "num_input_tokens_seen": 516659132, "step": 13150 }, { "epoch": 0.6376857241500451, "grad_norm": 0.3871660828590393, "learning_rate": 1.532653230094125e-05, "loss": 1.312, "num_input_tokens_seen": 517083020, "step": 13160 }, { "epoch": 0.6381702877702199, "grad_norm": 0.42630356550216675, "learning_rate": 1.529036398707869e-05, "loss": 1.2414, "num_input_tokens_seen": 517464668, "step": 13170 }, { "epoch": 0.6386548513903947, "grad_norm": 0.40976426005363464, "learning_rate": 1.5254219590101816e-05, "loss": 1.2985, "num_input_tokens_seen": 517860704, "step": 13180 }, { "epoch": 0.6391394150105696, "grad_norm": 0.4328073561191559, "learning_rate": 1.5218099199041902e-05, "loss": 1.2917, "num_input_tokens_seen": 518276916, "step": 13190 }, { "epoch": 0.6396239786307444, "grad_norm": 0.3879329264163971, "learning_rate": 1.5182002902871123e-05, "loss": 1.3094, "num_input_tokens_seen": 518654656, "step": 13200 }, { "epoch": 0.6401085422509192, "grad_norm": 0.41468775272369385, "learning_rate": 1.5145930790502267e-05, "loss": 1.307, "num_input_tokens_seen": 519033292, "step": 13210 }, { "epoch": 0.640593105871094, "grad_norm": 0.40821224451065063, "learning_rate": 1.5109882950788586e-05, "loss": 1.2277, "num_input_tokens_seen": 519406144, "step": 13220 }, { "epoch": 0.6410776694912688, "grad_norm": 0.38090160489082336, "learning_rate": 1.5073859472523514e-05, "loss": 1.2876, "num_input_tokens_seen": 519814676, "step": 13230 }, { "epoch": 0.6415622331114436, "grad_norm": 0.39433416724205017, "learning_rate": 1.50378604444405e-05, "loss": 1.2361, "num_input_tokens_seen": 520205984, "step": 13240 }, { "epoch": 0.6420467967316184, "grad_norm": 0.3543625771999359, "learning_rate": 1.5001885955212758e-05, "loss": 1.276, "num_input_tokens_seen": 520602212, "step": 13250 }, { "epoch": 0.6425313603517931, "grad_norm": 0.43784913420677185, "learning_rate": 1.4965936093453054e-05, "loss": 1.307, "num_input_tokens_seen": 520984044, "step": 13260 }, { "epoch": 0.6430159239719679, "grad_norm": 0.38104525208473206, "learning_rate": 1.4930010947713513e-05, "loss": 1.287, "num_input_tokens_seen": 521381332, "step": 13270 }, { "epoch": 0.6435004875921428, "grad_norm": 0.42758843302726746, "learning_rate": 1.4894110606485334e-05, "loss": 1.3144, "num_input_tokens_seen": 521758556, "step": 13280 }, { "epoch": 0.6439850512123176, "grad_norm": 0.3898857831954956, "learning_rate": 1.4858235158198675e-05, "loss": 1.2667, "num_input_tokens_seen": 522155024, "step": 13290 }, { "epoch": 0.6444696148324924, "grad_norm": 0.406747967004776, "learning_rate": 1.482238469122232e-05, "loss": 1.277, "num_input_tokens_seen": 522511408, "step": 13300 }, { "epoch": 0.6449541784526672, "grad_norm": 0.3949616849422455, "learning_rate": 1.4786559293863566e-05, "loss": 1.2483, "num_input_tokens_seen": 522886168, "step": 13310 }, { "epoch": 0.645438742072842, "grad_norm": 0.4430643916130066, "learning_rate": 1.4750759054367923e-05, "loss": 1.2268, "num_input_tokens_seen": 523282444, "step": 13320 }, { "epoch": 0.6459233056930168, "grad_norm": 0.4223615825176239, "learning_rate": 1.4714984060918962e-05, "loss": 1.272, "num_input_tokens_seen": 523654248, "step": 13330 }, { "epoch": 0.6464078693131916, "grad_norm": 0.3976212739944458, "learning_rate": 1.4679234401638043e-05, "loss": 1.2618, "num_input_tokens_seen": 524040568, "step": 13340 }, { "epoch": 0.6468924329333664, "grad_norm": 0.40596914291381836, "learning_rate": 1.464351016458414e-05, "loss": 1.2804, "num_input_tokens_seen": 524461668, "step": 13350 }, { "epoch": 0.6473769965535413, "grad_norm": 0.3791384696960449, "learning_rate": 1.460781143775359e-05, "loss": 1.304, "num_input_tokens_seen": 524855056, "step": 13360 }, { "epoch": 0.6478615601737161, "grad_norm": 0.3824685215950012, "learning_rate": 1.457213830907992e-05, "loss": 1.288, "num_input_tokens_seen": 525251636, "step": 13370 }, { "epoch": 0.6483461237938909, "grad_norm": 0.39863574504852295, "learning_rate": 1.453649086643356e-05, "loss": 1.2836, "num_input_tokens_seen": 525642284, "step": 13380 }, { "epoch": 0.6488306874140657, "grad_norm": 0.4155414402484894, "learning_rate": 1.4500869197621708e-05, "loss": 1.2526, "num_input_tokens_seen": 526045452, "step": 13390 }, { "epoch": 0.6493152510342405, "grad_norm": 0.41306737065315247, "learning_rate": 1.446527339038808e-05, "loss": 1.2497, "num_input_tokens_seen": 526454148, "step": 13400 }, { "epoch": 0.6497998146544153, "grad_norm": 0.4045097529888153, "learning_rate": 1.4429703532412642e-05, "loss": 1.2635, "num_input_tokens_seen": 526858372, "step": 13410 }, { "epoch": 0.6502843782745901, "grad_norm": 0.39056453108787537, "learning_rate": 1.4394159711311494e-05, "loss": 1.3007, "num_input_tokens_seen": 527262932, "step": 13420 }, { "epoch": 0.6507689418947649, "grad_norm": 0.4115632474422455, "learning_rate": 1.435864201463657e-05, "loss": 1.2925, "num_input_tokens_seen": 527689048, "step": 13430 }, { "epoch": 0.6512535055149397, "grad_norm": 0.399703711271286, "learning_rate": 1.4323150529875462e-05, "loss": 1.2915, "num_input_tokens_seen": 528076752, "step": 13440 }, { "epoch": 0.6517380691351146, "grad_norm": 0.39391404390335083, "learning_rate": 1.4287685344451202e-05, "loss": 1.2642, "num_input_tokens_seen": 528478876, "step": 13450 }, { "epoch": 0.6522226327552894, "grad_norm": 0.41902291774749756, "learning_rate": 1.4252246545722048e-05, "loss": 1.2977, "num_input_tokens_seen": 528884840, "step": 13460 }, { "epoch": 0.6527071963754641, "grad_norm": 0.42969298362731934, "learning_rate": 1.4216834220981235e-05, "loss": 1.2721, "num_input_tokens_seen": 529259760, "step": 13470 }, { "epoch": 0.6531917599956389, "grad_norm": 0.41451066732406616, "learning_rate": 1.4181448457456814e-05, "loss": 1.2888, "num_input_tokens_seen": 529676680, "step": 13480 }, { "epoch": 0.6536763236158137, "grad_norm": 0.4217407703399658, "learning_rate": 1.4146089342311391e-05, "loss": 1.2666, "num_input_tokens_seen": 530073596, "step": 13490 }, { "epoch": 0.6541608872359885, "grad_norm": 0.3723660111427307, "learning_rate": 1.4110756962641952e-05, "loss": 1.2463, "num_input_tokens_seen": 530447064, "step": 13500 }, { "epoch": 0.6546454508561633, "grad_norm": 0.42004016041755676, "learning_rate": 1.4075451405479598e-05, "loss": 1.2802, "num_input_tokens_seen": 530851496, "step": 13510 }, { "epoch": 0.6551300144763381, "grad_norm": 0.4187595546245575, "learning_rate": 1.4040172757789388e-05, "loss": 1.2933, "num_input_tokens_seen": 531264604, "step": 13520 }, { "epoch": 0.6556145780965129, "grad_norm": 0.4250360429286957, "learning_rate": 1.4004921106470098e-05, "loss": 1.2751, "num_input_tokens_seen": 531679120, "step": 13530 }, { "epoch": 0.6560991417166878, "grad_norm": 0.37126293778419495, "learning_rate": 1.3969696538353977e-05, "loss": 1.284, "num_input_tokens_seen": 532051596, "step": 13540 }, { "epoch": 0.6565837053368626, "grad_norm": 0.3943636119365692, "learning_rate": 1.3934499140206596e-05, "loss": 1.2617, "num_input_tokens_seen": 532444860, "step": 13550 }, { "epoch": 0.6570682689570374, "grad_norm": 0.4207961857318878, "learning_rate": 1.3899328998726574e-05, "loss": 1.2434, "num_input_tokens_seen": 532852472, "step": 13560 }, { "epoch": 0.6575528325772122, "grad_norm": 0.41514161229133606, "learning_rate": 1.3864186200545403e-05, "loss": 1.2763, "num_input_tokens_seen": 533252152, "step": 13570 }, { "epoch": 0.658037396197387, "grad_norm": 0.422195702791214, "learning_rate": 1.3829070832227234e-05, "loss": 1.2649, "num_input_tokens_seen": 533646968, "step": 13580 }, { "epoch": 0.6585219598175618, "grad_norm": 0.3905238211154938, "learning_rate": 1.3793982980268644e-05, "loss": 1.2159, "num_input_tokens_seen": 534000320, "step": 13590 }, { "epoch": 0.6590065234377366, "grad_norm": 0.43043792247772217, "learning_rate": 1.3758922731098406e-05, "loss": 1.3498, "num_input_tokens_seen": 534396632, "step": 13600 }, { "epoch": 0.6594910870579114, "grad_norm": 0.4008113741874695, "learning_rate": 1.372389017107735e-05, "loss": 1.2894, "num_input_tokens_seen": 534782340, "step": 13610 }, { "epoch": 0.6599756506780862, "grad_norm": 0.3774920403957367, "learning_rate": 1.3688885386498052e-05, "loss": 1.2679, "num_input_tokens_seen": 535198784, "step": 13620 }, { "epoch": 0.6604602142982611, "grad_norm": 0.42692962288856506, "learning_rate": 1.3653908463584717e-05, "loss": 1.239, "num_input_tokens_seen": 535613348, "step": 13630 }, { "epoch": 0.6609447779184359, "grad_norm": 0.4580807387828827, "learning_rate": 1.3618959488492875e-05, "loss": 1.2745, "num_input_tokens_seen": 536005904, "step": 13640 }, { "epoch": 0.6614293415386107, "grad_norm": 0.4274024963378906, "learning_rate": 1.3584038547309253e-05, "loss": 1.2859, "num_input_tokens_seen": 536409736, "step": 13650 }, { "epoch": 0.6619139051587855, "grad_norm": 0.43948325514793396, "learning_rate": 1.3549145726051514e-05, "loss": 1.2886, "num_input_tokens_seen": 536786188, "step": 13660 }, { "epoch": 0.6623984687789602, "grad_norm": 0.39728376269340515, "learning_rate": 1.3514281110668036e-05, "loss": 1.2886, "num_input_tokens_seen": 537166236, "step": 13670 }, { "epoch": 0.662883032399135, "grad_norm": 0.39317986369132996, "learning_rate": 1.3479444787037756e-05, "loss": 1.3038, "num_input_tokens_seen": 537555492, "step": 13680 }, { "epoch": 0.6633675960193098, "grad_norm": 0.40623709559440613, "learning_rate": 1.3444636840969882e-05, "loss": 1.292, "num_input_tokens_seen": 537970244, "step": 13690 }, { "epoch": 0.6638521596394846, "grad_norm": 0.4472850263118744, "learning_rate": 1.340985735820376e-05, "loss": 1.2384, "num_input_tokens_seen": 538370372, "step": 13700 }, { "epoch": 0.6643367232596594, "grad_norm": 0.3942728638648987, "learning_rate": 1.3375106424408584e-05, "loss": 1.2675, "num_input_tokens_seen": 538759172, "step": 13710 }, { "epoch": 0.6648212868798343, "grad_norm": 0.4153252840042114, "learning_rate": 1.3340384125183263e-05, "loss": 1.3135, "num_input_tokens_seen": 539150680, "step": 13720 }, { "epoch": 0.6653058505000091, "grad_norm": 0.41260483860969543, "learning_rate": 1.330569054605616e-05, "loss": 1.2691, "num_input_tokens_seen": 539546148, "step": 13730 }, { "epoch": 0.6657904141201839, "grad_norm": 0.4087775647640228, "learning_rate": 1.3271025772484897e-05, "loss": 1.2561, "num_input_tokens_seen": 539930372, "step": 13740 }, { "epoch": 0.6662749777403587, "grad_norm": 0.40349823236465454, "learning_rate": 1.3236389889856123e-05, "loss": 1.2255, "num_input_tokens_seen": 540338764, "step": 13750 }, { "epoch": 0.6667595413605335, "grad_norm": 0.3926162123680115, "learning_rate": 1.3201782983485356e-05, "loss": 1.2884, "num_input_tokens_seen": 540728228, "step": 13760 }, { "epoch": 0.6672441049807083, "grad_norm": 0.4218115508556366, "learning_rate": 1.3167205138616703e-05, "loss": 1.2595, "num_input_tokens_seen": 541134348, "step": 13770 }, { "epoch": 0.6677286686008831, "grad_norm": 0.4189454913139343, "learning_rate": 1.3132656440422711e-05, "loss": 1.3104, "num_input_tokens_seen": 541508532, "step": 13780 }, { "epoch": 0.6682132322210579, "grad_norm": 0.4163365364074707, "learning_rate": 1.3098136974004136e-05, "loss": 1.2894, "num_input_tokens_seen": 541904872, "step": 13790 }, { "epoch": 0.6686977958412327, "grad_norm": 0.39865246415138245, "learning_rate": 1.30636468243897e-05, "loss": 1.2838, "num_input_tokens_seen": 542327136, "step": 13800 }, { "epoch": 0.6691823594614076, "grad_norm": 0.4374254047870636, "learning_rate": 1.3029186076535948e-05, "loss": 1.2899, "num_input_tokens_seen": 542708876, "step": 13810 }, { "epoch": 0.6696669230815824, "grad_norm": 0.39832600951194763, "learning_rate": 1.2994754815326976e-05, "loss": 1.3043, "num_input_tokens_seen": 543083288, "step": 13820 }, { "epoch": 0.6701514867017572, "grad_norm": 0.38928577303886414, "learning_rate": 1.2960353125574264e-05, "loss": 1.2409, "num_input_tokens_seen": 543498476, "step": 13830 }, { "epoch": 0.670636050321932, "grad_norm": 0.3987908363342285, "learning_rate": 1.2925981092016434e-05, "loss": 1.3112, "num_input_tokens_seen": 543904852, "step": 13840 }, { "epoch": 0.6711206139421068, "grad_norm": 0.43027713894844055, "learning_rate": 1.2891638799319078e-05, "loss": 1.2587, "num_input_tokens_seen": 544297824, "step": 13850 }, { "epoch": 0.6716051775622816, "grad_norm": 0.4249833822250366, "learning_rate": 1.2857326332074516e-05, "loss": 1.3004, "num_input_tokens_seen": 544682872, "step": 13860 }, { "epoch": 0.6720897411824563, "grad_norm": 0.42288899421691895, "learning_rate": 1.2823043774801625e-05, "loss": 1.3173, "num_input_tokens_seen": 545074136, "step": 13870 }, { "epoch": 0.6725743048026311, "grad_norm": 0.4400785267353058, "learning_rate": 1.278879121194556e-05, "loss": 1.225, "num_input_tokens_seen": 545481540, "step": 13880 }, { "epoch": 0.6730588684228059, "grad_norm": 0.42327094078063965, "learning_rate": 1.275456872787765e-05, "loss": 1.2532, "num_input_tokens_seen": 545873512, "step": 13890 }, { "epoch": 0.6735434320429808, "grad_norm": 0.39755240082740784, "learning_rate": 1.2720376406895086e-05, "loss": 1.2726, "num_input_tokens_seen": 546300044, "step": 13900 }, { "epoch": 0.6740279956631556, "grad_norm": 0.39591267704963684, "learning_rate": 1.2686214333220787e-05, "loss": 1.2871, "num_input_tokens_seen": 546682376, "step": 13910 }, { "epoch": 0.6745125592833304, "grad_norm": 0.3895485997200012, "learning_rate": 1.2652082591003173e-05, "loss": 1.2779, "num_input_tokens_seen": 547085092, "step": 13920 }, { "epoch": 0.6749971229035052, "grad_norm": 0.43440917134284973, "learning_rate": 1.261798126431592e-05, "loss": 1.2538, "num_input_tokens_seen": 547501820, "step": 13930 }, { "epoch": 0.67548168652368, "grad_norm": 0.39425063133239746, "learning_rate": 1.2583910437157825e-05, "loss": 1.3221, "num_input_tokens_seen": 547901628, "step": 13940 }, { "epoch": 0.6759662501438548, "grad_norm": 0.3926689922809601, "learning_rate": 1.2549870193452513e-05, "loss": 1.3095, "num_input_tokens_seen": 548291540, "step": 13950 }, { "epoch": 0.6764508137640296, "grad_norm": 0.4056353271007538, "learning_rate": 1.2515860617048314e-05, "loss": 1.2636, "num_input_tokens_seen": 548701312, "step": 13960 }, { "epoch": 0.6769353773842044, "grad_norm": 0.43241339921951294, "learning_rate": 1.2481881791717996e-05, "loss": 1.2654, "num_input_tokens_seen": 549068240, "step": 13970 }, { "epoch": 0.6774199410043792, "grad_norm": 0.3984728753566742, "learning_rate": 1.2447933801158593e-05, "loss": 1.213, "num_input_tokens_seen": 549458460, "step": 13980 }, { "epoch": 0.6779045046245541, "grad_norm": 0.39436209201812744, "learning_rate": 1.2414016728991171e-05, "loss": 1.2805, "num_input_tokens_seen": 549845624, "step": 13990 }, { "epoch": 0.6783890682447289, "grad_norm": 0.4029475450515747, "learning_rate": 1.2380130658760653e-05, "loss": 1.2622, "num_input_tokens_seen": 550234352, "step": 14000 }, { "epoch": 0.6783890682447289, "eval_loss": 1.3831204175949097, "eval_runtime": 3.6695, "eval_samples_per_second": 40.878, "eval_steps_per_second": 5.178, "num_input_tokens_seen": 550234352, "step": 14000 }, { "epoch": 0.6788736318649037, "grad_norm": 0.3884779214859009, "learning_rate": 1.2346275673935592e-05, "loss": 1.2657, "num_input_tokens_seen": 550605424, "step": 14010 }, { "epoch": 0.6793581954850785, "grad_norm": 0.3887692391872406, "learning_rate": 1.2312451857907983e-05, "loss": 1.2375, "num_input_tokens_seen": 551020804, "step": 14020 }, { "epoch": 0.6798427591052533, "grad_norm": 0.44228610396385193, "learning_rate": 1.2278659293993011e-05, "loss": 1.2443, "num_input_tokens_seen": 551415568, "step": 14030 }, { "epoch": 0.6803273227254281, "grad_norm": 0.4105127155780792, "learning_rate": 1.2244898065428918e-05, "loss": 1.256, "num_input_tokens_seen": 551790224, "step": 14040 }, { "epoch": 0.6808118863456029, "grad_norm": 0.367942750453949, "learning_rate": 1.2211168255376747e-05, "loss": 1.313, "num_input_tokens_seen": 552201712, "step": 14050 }, { "epoch": 0.6812964499657777, "grad_norm": 0.43449220061302185, "learning_rate": 1.217746994692014e-05, "loss": 1.2926, "num_input_tokens_seen": 552593300, "step": 14060 }, { "epoch": 0.6817810135859526, "grad_norm": 0.3836462199687958, "learning_rate": 1.2143803223065161e-05, "loss": 1.3065, "num_input_tokens_seen": 552995148, "step": 14070 }, { "epoch": 0.6822655772061273, "grad_norm": 0.4054076373577118, "learning_rate": 1.2110168166740057e-05, "loss": 1.3149, "num_input_tokens_seen": 553394712, "step": 14080 }, { "epoch": 0.6827501408263021, "grad_norm": 0.39375001192092896, "learning_rate": 1.2076564860795095e-05, "loss": 1.2744, "num_input_tokens_seen": 553807736, "step": 14090 }, { "epoch": 0.6832347044464769, "grad_norm": 0.4356180727481842, "learning_rate": 1.2042993388002302e-05, "loss": 1.2847, "num_input_tokens_seen": 554190868, "step": 14100 }, { "epoch": 0.6837192680666517, "grad_norm": 0.4105941951274872, "learning_rate": 1.2009453831055331e-05, "loss": 1.2395, "num_input_tokens_seen": 554599144, "step": 14110 }, { "epoch": 0.6842038316868265, "grad_norm": 0.43507689237594604, "learning_rate": 1.1975946272569177e-05, "loss": 1.2725, "num_input_tokens_seen": 554992164, "step": 14120 }, { "epoch": 0.6846883953070013, "grad_norm": 0.4143732786178589, "learning_rate": 1.194247079508006e-05, "loss": 1.2748, "num_input_tokens_seen": 555351352, "step": 14130 }, { "epoch": 0.6851729589271761, "grad_norm": 0.4113134741783142, "learning_rate": 1.1909027481045138e-05, "loss": 1.2729, "num_input_tokens_seen": 555746752, "step": 14140 }, { "epoch": 0.6856575225473509, "grad_norm": 0.400814414024353, "learning_rate": 1.1875616412842368e-05, "loss": 1.2404, "num_input_tokens_seen": 556135216, "step": 14150 }, { "epoch": 0.6861420861675258, "grad_norm": 0.386210560798645, "learning_rate": 1.1842237672770277e-05, "loss": 1.2877, "num_input_tokens_seen": 556532556, "step": 14160 }, { "epoch": 0.6866266497877006, "grad_norm": 0.41966861486434937, "learning_rate": 1.1808891343047754e-05, "loss": 1.2242, "num_input_tokens_seen": 556962628, "step": 14170 }, { "epoch": 0.6871112134078754, "grad_norm": 0.4356541037559509, "learning_rate": 1.1775577505813868e-05, "loss": 1.2896, "num_input_tokens_seen": 557340936, "step": 14180 }, { "epoch": 0.6875957770280502, "grad_norm": 0.38502469658851624, "learning_rate": 1.1742296243127621e-05, "loss": 1.3201, "num_input_tokens_seen": 557716816, "step": 14190 }, { "epoch": 0.688080340648225, "grad_norm": 0.4457418620586395, "learning_rate": 1.1709047636967812e-05, "loss": 1.2624, "num_input_tokens_seen": 558088256, "step": 14200 }, { "epoch": 0.6885649042683998, "grad_norm": 0.43167582154273987, "learning_rate": 1.1675831769232775e-05, "loss": 1.274, "num_input_tokens_seen": 558496660, "step": 14210 }, { "epoch": 0.6890494678885746, "grad_norm": 0.42342841625213623, "learning_rate": 1.1642648721740226e-05, "loss": 1.2851, "num_input_tokens_seen": 558846140, "step": 14220 }, { "epoch": 0.6895340315087494, "grad_norm": 0.4272722005844116, "learning_rate": 1.1609498576227008e-05, "loss": 1.2644, "num_input_tokens_seen": 559205772, "step": 14230 }, { "epoch": 0.6900185951289242, "grad_norm": 0.4222122132778168, "learning_rate": 1.1576381414348953e-05, "loss": 1.287, "num_input_tokens_seen": 559613824, "step": 14240 }, { "epoch": 0.6905031587490991, "grad_norm": 0.5002584457397461, "learning_rate": 1.1543297317680607e-05, "loss": 1.3271, "num_input_tokens_seen": 560013268, "step": 14250 }, { "epoch": 0.6909877223692739, "grad_norm": 0.4087948501110077, "learning_rate": 1.1510246367715122e-05, "loss": 1.2673, "num_input_tokens_seen": 560392376, "step": 14260 }, { "epoch": 0.6914722859894487, "grad_norm": 0.41717860102653503, "learning_rate": 1.1477228645863944e-05, "loss": 1.2923, "num_input_tokens_seen": 560761820, "step": 14270 }, { "epoch": 0.6919568496096234, "grad_norm": 0.3851679563522339, "learning_rate": 1.1444244233456717e-05, "loss": 1.2749, "num_input_tokens_seen": 561148148, "step": 14280 }, { "epoch": 0.6924414132297982, "grad_norm": 0.41185683012008667, "learning_rate": 1.1411293211741014e-05, "loss": 1.234, "num_input_tokens_seen": 561552512, "step": 14290 }, { "epoch": 0.692925976849973, "grad_norm": 0.3968884348869324, "learning_rate": 1.1378375661882181e-05, "loss": 1.2464, "num_input_tokens_seen": 561934928, "step": 14300 }, { "epoch": 0.6934105404701478, "grad_norm": 0.38210412859916687, "learning_rate": 1.1345491664963078e-05, "loss": 1.247, "num_input_tokens_seen": 562352708, "step": 14310 }, { "epoch": 0.6938951040903226, "grad_norm": 0.3991890847682953, "learning_rate": 1.1312641301983954e-05, "loss": 1.2252, "num_input_tokens_seen": 562733284, "step": 14320 }, { "epoch": 0.6943796677104974, "grad_norm": 0.3969036340713501, "learning_rate": 1.1279824653862197e-05, "loss": 1.2681, "num_input_tokens_seen": 563140896, "step": 14330 }, { "epoch": 0.6948642313306723, "grad_norm": 0.42541587352752686, "learning_rate": 1.1247041801432137e-05, "loss": 1.2644, "num_input_tokens_seen": 563495812, "step": 14340 }, { "epoch": 0.6953487949508471, "grad_norm": 0.3810971677303314, "learning_rate": 1.1214292825444883e-05, "loss": 1.2588, "num_input_tokens_seen": 563921972, "step": 14350 }, { "epoch": 0.6958333585710219, "grad_norm": 0.4099074602127075, "learning_rate": 1.1181577806568064e-05, "loss": 1.2685, "num_input_tokens_seen": 564303812, "step": 14360 }, { "epoch": 0.6963179221911967, "grad_norm": 0.42169204354286194, "learning_rate": 1.1148896825385707e-05, "loss": 1.2461, "num_input_tokens_seen": 564712192, "step": 14370 }, { "epoch": 0.6968024858113715, "grad_norm": 0.42555099725723267, "learning_rate": 1.111624996239796e-05, "loss": 1.274, "num_input_tokens_seen": 565108896, "step": 14380 }, { "epoch": 0.6972870494315463, "grad_norm": 0.4312445819377899, "learning_rate": 1.108363729802096e-05, "loss": 1.272, "num_input_tokens_seen": 565492580, "step": 14390 }, { "epoch": 0.6977716130517211, "grad_norm": 0.40941333770751953, "learning_rate": 1.1051058912586579e-05, "loss": 1.2707, "num_input_tokens_seen": 565921508, "step": 14400 }, { "epoch": 0.6982561766718959, "grad_norm": 0.4134999215602875, "learning_rate": 1.1018514886342279e-05, "loss": 1.2187, "num_input_tokens_seen": 566308100, "step": 14410 }, { "epoch": 0.6987407402920707, "grad_norm": 0.3979948163032532, "learning_rate": 1.0986005299450858e-05, "loss": 1.2768, "num_input_tokens_seen": 566705680, "step": 14420 }, { "epoch": 0.6992253039122456, "grad_norm": 0.41298606991767883, "learning_rate": 1.0953530231990311e-05, "loss": 1.2341, "num_input_tokens_seen": 567123900, "step": 14430 }, { "epoch": 0.6997098675324204, "grad_norm": 0.42366859316825867, "learning_rate": 1.0921089763953594e-05, "loss": 1.2959, "num_input_tokens_seen": 567534336, "step": 14440 }, { "epoch": 0.7001944311525952, "grad_norm": 0.39607560634613037, "learning_rate": 1.0888683975248431e-05, "loss": 1.2823, "num_input_tokens_seen": 567938980, "step": 14450 }, { "epoch": 0.70067899477277, "grad_norm": 0.38363564014434814, "learning_rate": 1.0856312945697142e-05, "loss": 1.2062, "num_input_tokens_seen": 568304904, "step": 14460 }, { "epoch": 0.7011635583929448, "grad_norm": 0.4327394366264343, "learning_rate": 1.0823976755036393e-05, "loss": 1.2881, "num_input_tokens_seen": 568703588, "step": 14470 }, { "epoch": 0.7016481220131195, "grad_norm": 0.40510445833206177, "learning_rate": 1.079167548291708e-05, "loss": 1.2547, "num_input_tokens_seen": 569104248, "step": 14480 }, { "epoch": 0.7021326856332943, "grad_norm": 0.41055554151535034, "learning_rate": 1.075940920890404e-05, "loss": 1.2536, "num_input_tokens_seen": 569485804, "step": 14490 }, { "epoch": 0.7026172492534691, "grad_norm": 0.4175165295600891, "learning_rate": 1.0727178012475944e-05, "loss": 1.2334, "num_input_tokens_seen": 569887012, "step": 14500 }, { "epoch": 0.7031018128736439, "grad_norm": 0.43152111768722534, "learning_rate": 1.0694981973025022e-05, "loss": 1.2328, "num_input_tokens_seen": 570274412, "step": 14510 }, { "epoch": 0.7035863764938188, "grad_norm": 0.4261281490325928, "learning_rate": 1.0662821169856948e-05, "loss": 1.2766, "num_input_tokens_seen": 570667476, "step": 14520 }, { "epoch": 0.7040709401139936, "grad_norm": 0.4019312858581543, "learning_rate": 1.0630695682190554e-05, "loss": 1.2272, "num_input_tokens_seen": 571069868, "step": 14530 }, { "epoch": 0.7045555037341684, "grad_norm": 0.4646441638469696, "learning_rate": 1.0598605589157726e-05, "loss": 1.2188, "num_input_tokens_seen": 571471600, "step": 14540 }, { "epoch": 0.7050400673543432, "grad_norm": 0.40755367279052734, "learning_rate": 1.0566550969803127e-05, "loss": 1.286, "num_input_tokens_seen": 571874376, "step": 14550 }, { "epoch": 0.705524630974518, "grad_norm": 0.4092569947242737, "learning_rate": 1.0534531903084065e-05, "loss": 1.2766, "num_input_tokens_seen": 572267796, "step": 14560 }, { "epoch": 0.7060091945946928, "grad_norm": 0.443362295627594, "learning_rate": 1.0502548467870284e-05, "loss": 1.2842, "num_input_tokens_seen": 572646180, "step": 14570 }, { "epoch": 0.7064937582148676, "grad_norm": 0.39347755908966064, "learning_rate": 1.0470600742943726e-05, "loss": 1.2541, "num_input_tokens_seen": 573058952, "step": 14580 }, { "epoch": 0.7069783218350424, "grad_norm": 0.4157586395740509, "learning_rate": 1.0438688806998395e-05, "loss": 1.2842, "num_input_tokens_seen": 573460344, "step": 14590 }, { "epoch": 0.7074628854552172, "grad_norm": 0.45576024055480957, "learning_rate": 1.0406812738640134e-05, "loss": 1.2606, "num_input_tokens_seen": 573848724, "step": 14600 }, { "epoch": 0.7079474490753921, "grad_norm": 0.41932404041290283, "learning_rate": 1.037497261638645e-05, "loss": 1.2402, "num_input_tokens_seen": 574233032, "step": 14610 }, { "epoch": 0.7084320126955669, "grad_norm": 0.4327371120452881, "learning_rate": 1.0343168518666272e-05, "loss": 1.2565, "num_input_tokens_seen": 574627952, "step": 14620 }, { "epoch": 0.7089165763157417, "grad_norm": 0.4439123272895813, "learning_rate": 1.0311400523819831e-05, "loss": 1.2774, "num_input_tokens_seen": 575040240, "step": 14630 }, { "epoch": 0.7094011399359165, "grad_norm": 0.4237552583217621, "learning_rate": 1.0279668710098401e-05, "loss": 1.2613, "num_input_tokens_seen": 575434676, "step": 14640 }, { "epoch": 0.7098857035560913, "grad_norm": 0.4437905251979828, "learning_rate": 1.0247973155664156e-05, "loss": 1.2636, "num_input_tokens_seen": 575812584, "step": 14650 }, { "epoch": 0.7103702671762661, "grad_norm": 0.3982216715812683, "learning_rate": 1.0216313938589936e-05, "loss": 1.2758, "num_input_tokens_seen": 576228776, "step": 14660 }, { "epoch": 0.7108548307964409, "grad_norm": 0.36928027868270874, "learning_rate": 1.0184691136859096e-05, "loss": 1.2293, "num_input_tokens_seen": 576617456, "step": 14670 }, { "epoch": 0.7113393944166156, "grad_norm": 0.3991413712501526, "learning_rate": 1.0153104828365261e-05, "loss": 1.2731, "num_input_tokens_seen": 576986340, "step": 14680 }, { "epoch": 0.7118239580367904, "grad_norm": 0.4070897698402405, "learning_rate": 1.0121555090912207e-05, "loss": 1.2772, "num_input_tokens_seen": 577384544, "step": 14690 }, { "epoch": 0.7123085216569653, "grad_norm": 0.3921777904033661, "learning_rate": 1.0090042002213587e-05, "loss": 1.2332, "num_input_tokens_seen": 577772956, "step": 14700 }, { "epoch": 0.7127930852771401, "grad_norm": 0.37934672832489014, "learning_rate": 1.0058565639892808e-05, "loss": 1.2527, "num_input_tokens_seen": 578188080, "step": 14710 }, { "epoch": 0.7132776488973149, "grad_norm": 0.3901313841342926, "learning_rate": 1.0027126081482801e-05, "loss": 1.2523, "num_input_tokens_seen": 578567732, "step": 14720 }, { "epoch": 0.7137622125174897, "grad_norm": 0.4196103513240814, "learning_rate": 9.995723404425845e-06, "loss": 1.2586, "num_input_tokens_seen": 578986928, "step": 14730 }, { "epoch": 0.7142467761376645, "grad_norm": 0.40894681215286255, "learning_rate": 9.964357686073378e-06, "loss": 1.2532, "num_input_tokens_seen": 579387500, "step": 14740 }, { "epoch": 0.7147313397578393, "grad_norm": 0.38982832431793213, "learning_rate": 9.933029003685778e-06, "loss": 1.2751, "num_input_tokens_seen": 579749804, "step": 14750 }, { "epoch": 0.7152159033780141, "grad_norm": 0.3959210515022278, "learning_rate": 9.901737434432226e-06, "loss": 1.2404, "num_input_tokens_seen": 580156476, "step": 14760 }, { "epoch": 0.7157004669981889, "grad_norm": 0.4251919984817505, "learning_rate": 9.870483055390456e-06, "loss": 1.2346, "num_input_tokens_seen": 580522596, "step": 14770 }, { "epoch": 0.7161850306183638, "grad_norm": 0.48525333404541016, "learning_rate": 9.839265943546627e-06, "loss": 1.2809, "num_input_tokens_seen": 580900272, "step": 14780 }, { "epoch": 0.7166695942385386, "grad_norm": 0.3704177439212799, "learning_rate": 9.808086175795061e-06, "loss": 1.2318, "num_input_tokens_seen": 581280140, "step": 14790 }, { "epoch": 0.7171541578587134, "grad_norm": 0.4149393141269684, "learning_rate": 9.77694382893814e-06, "loss": 1.3142, "num_input_tokens_seen": 581666396, "step": 14800 }, { "epoch": 0.7176387214788882, "grad_norm": 0.4752131402492523, "learning_rate": 9.745838979686026e-06, "loss": 1.2692, "num_input_tokens_seen": 582040104, "step": 14810 }, { "epoch": 0.718123285099063, "grad_norm": 0.41852232813835144, "learning_rate": 9.714771704656553e-06, "loss": 1.2908, "num_input_tokens_seen": 582431284, "step": 14820 }, { "epoch": 0.7186078487192378, "grad_norm": 0.3744739592075348, "learning_rate": 9.683742080374968e-06, "loss": 1.2438, "num_input_tokens_seen": 582835184, "step": 14830 }, { "epoch": 0.7190924123394126, "grad_norm": 0.3988076448440552, "learning_rate": 9.652750183273806e-06, "loss": 1.2978, "num_input_tokens_seen": 583230036, "step": 14840 }, { "epoch": 0.7195769759595874, "grad_norm": 0.41156208515167236, "learning_rate": 9.621796089692667e-06, "loss": 1.2709, "num_input_tokens_seen": 583626572, "step": 14850 }, { "epoch": 0.7200615395797622, "grad_norm": 0.437142014503479, "learning_rate": 9.59087987587801e-06, "loss": 1.2503, "num_input_tokens_seen": 584037012, "step": 14860 }, { "epoch": 0.7205461031999371, "grad_norm": 0.42297014594078064, "learning_rate": 9.560001617983005e-06, "loss": 1.2808, "num_input_tokens_seen": 584438176, "step": 14870 }, { "epoch": 0.7210306668201119, "grad_norm": 0.4088895618915558, "learning_rate": 9.529161392067336e-06, "loss": 1.2203, "num_input_tokens_seen": 584823036, "step": 14880 }, { "epoch": 0.7215152304402866, "grad_norm": 0.4062732458114624, "learning_rate": 9.498359274097002e-06, "loss": 1.2212, "num_input_tokens_seen": 585208164, "step": 14890 }, { "epoch": 0.7219997940604614, "grad_norm": 0.4162246882915497, "learning_rate": 9.467595339944116e-06, "loss": 1.3, "num_input_tokens_seen": 585566620, "step": 14900 }, { "epoch": 0.7224843576806362, "grad_norm": 0.38531064987182617, "learning_rate": 9.436869665386763e-06, "loss": 1.2206, "num_input_tokens_seen": 585980372, "step": 14910 }, { "epoch": 0.722968921300811, "grad_norm": 0.42929065227508545, "learning_rate": 9.40618232610876e-06, "loss": 1.2744, "num_input_tokens_seen": 586389084, "step": 14920 }, { "epoch": 0.7234534849209858, "grad_norm": 0.4108678102493286, "learning_rate": 9.375533397699523e-06, "loss": 1.252, "num_input_tokens_seen": 586790192, "step": 14930 }, { "epoch": 0.7239380485411606, "grad_norm": 0.43513932824134827, "learning_rate": 9.344922955653826e-06, "loss": 1.2669, "num_input_tokens_seen": 587176208, "step": 14940 }, { "epoch": 0.7244226121613354, "grad_norm": 0.4044954776763916, "learning_rate": 9.314351075371674e-06, "loss": 1.2949, "num_input_tokens_seen": 587570128, "step": 14950 }, { "epoch": 0.7249071757815103, "grad_norm": 0.4217306077480316, "learning_rate": 9.283817832158053e-06, "loss": 1.3133, "num_input_tokens_seen": 587951740, "step": 14960 }, { "epoch": 0.7253917394016851, "grad_norm": 0.4031387269496918, "learning_rate": 9.253323301222802e-06, "loss": 1.2583, "num_input_tokens_seen": 588343360, "step": 14970 }, { "epoch": 0.7258763030218599, "grad_norm": 0.4108414649963379, "learning_rate": 9.222867557680403e-06, "loss": 1.2722, "num_input_tokens_seen": 588756676, "step": 14980 }, { "epoch": 0.7263608666420347, "grad_norm": 0.4133433699607849, "learning_rate": 9.192450676549774e-06, "loss": 1.2386, "num_input_tokens_seen": 589152220, "step": 14990 }, { "epoch": 0.7268454302622095, "grad_norm": 0.3888295590877533, "learning_rate": 9.162072732754132e-06, "loss": 1.2964, "num_input_tokens_seen": 589548180, "step": 15000 }, { "epoch": 0.7273299938823843, "grad_norm": 0.40823793411254883, "learning_rate": 9.131733801120771e-06, "loss": 1.249, "num_input_tokens_seen": 589932416, "step": 15010 }, { "epoch": 0.7278145575025591, "grad_norm": 0.41177472472190857, "learning_rate": 9.1014339563809e-06, "loss": 1.2636, "num_input_tokens_seen": 590315064, "step": 15020 }, { "epoch": 0.7282991211227339, "grad_norm": 0.4307875633239746, "learning_rate": 9.071173273169428e-06, "loss": 1.2832, "num_input_tokens_seen": 590718608, "step": 15030 }, { "epoch": 0.7287836847429087, "grad_norm": 0.3676307499408722, "learning_rate": 9.040951826024824e-06, "loss": 1.2616, "num_input_tokens_seen": 591105428, "step": 15040 }, { "epoch": 0.7292682483630836, "grad_norm": 0.4145369529724121, "learning_rate": 9.010769689388885e-06, "loss": 1.2932, "num_input_tokens_seen": 591484840, "step": 15050 }, { "epoch": 0.7297528119832584, "grad_norm": 0.38058140873908997, "learning_rate": 8.980626937606612e-06, "loss": 1.2451, "num_input_tokens_seen": 591878184, "step": 15060 }, { "epoch": 0.7302373756034332, "grad_norm": 0.41027316451072693, "learning_rate": 8.950523644925954e-06, "loss": 1.2548, "num_input_tokens_seen": 592242680, "step": 15070 }, { "epoch": 0.730721939223608, "grad_norm": 0.4113955497741699, "learning_rate": 8.920459885497703e-06, "loss": 1.2306, "num_input_tokens_seen": 592616904, "step": 15080 }, { "epoch": 0.7312065028437827, "grad_norm": 0.4587390422821045, "learning_rate": 8.890435733375232e-06, "loss": 1.2836, "num_input_tokens_seen": 592980040, "step": 15090 }, { "epoch": 0.7316910664639575, "grad_norm": 0.4060867726802826, "learning_rate": 8.860451262514386e-06, "loss": 1.2048, "num_input_tokens_seen": 593376740, "step": 15100 }, { "epoch": 0.7321756300841323, "grad_norm": 0.44127029180526733, "learning_rate": 8.830506546773257e-06, "loss": 1.2557, "num_input_tokens_seen": 593757812, "step": 15110 }, { "epoch": 0.7326601937043071, "grad_norm": 0.423213392496109, "learning_rate": 8.800601659911998e-06, "loss": 1.2426, "num_input_tokens_seen": 594122040, "step": 15120 }, { "epoch": 0.7331447573244819, "grad_norm": 0.41493481397628784, "learning_rate": 8.770736675592678e-06, "loss": 1.2721, "num_input_tokens_seen": 594520000, "step": 15130 }, { "epoch": 0.7336293209446568, "grad_norm": 0.46575504541397095, "learning_rate": 8.740911667379053e-06, "loss": 1.2268, "num_input_tokens_seen": 594873824, "step": 15140 }, { "epoch": 0.7341138845648316, "grad_norm": 0.4391193091869354, "learning_rate": 8.711126708736426e-06, "loss": 1.313, "num_input_tokens_seen": 595233156, "step": 15150 }, { "epoch": 0.7345984481850064, "grad_norm": 0.41552045941352844, "learning_rate": 8.681381873031447e-06, "loss": 1.2626, "num_input_tokens_seen": 595644240, "step": 15160 }, { "epoch": 0.7350830118051812, "grad_norm": 0.3735989034175873, "learning_rate": 8.651677233531943e-06, "loss": 1.2961, "num_input_tokens_seen": 596014316, "step": 15170 }, { "epoch": 0.735567575425356, "grad_norm": 0.45374736189842224, "learning_rate": 8.6220128634067e-06, "loss": 1.2512, "num_input_tokens_seen": 596420640, "step": 15180 }, { "epoch": 0.7360521390455308, "grad_norm": 0.4039424657821655, "learning_rate": 8.592388835725352e-06, "loss": 1.2505, "num_input_tokens_seen": 596817584, "step": 15190 }, { "epoch": 0.7365367026657056, "grad_norm": 0.4076104164123535, "learning_rate": 8.56280522345812e-06, "loss": 1.2252, "num_input_tokens_seen": 597205152, "step": 15200 }, { "epoch": 0.7370212662858804, "grad_norm": 0.4360331594944, "learning_rate": 8.533262099475708e-06, "loss": 1.2516, "num_input_tokens_seen": 597588540, "step": 15210 }, { "epoch": 0.7375058299060552, "grad_norm": 0.39837440848350525, "learning_rate": 8.503759536549066e-06, "loss": 1.2656, "num_input_tokens_seen": 597989068, "step": 15220 }, { "epoch": 0.7379903935262301, "grad_norm": 0.45183074474334717, "learning_rate": 8.474297607349252e-06, "loss": 1.3073, "num_input_tokens_seen": 598396680, "step": 15230 }, { "epoch": 0.7384749571464049, "grad_norm": 0.46322163939476013, "learning_rate": 8.44487638444721e-06, "loss": 1.281, "num_input_tokens_seen": 598789612, "step": 15240 }, { "epoch": 0.7389595207665797, "grad_norm": 0.38120391964912415, "learning_rate": 8.415495940313637e-06, "loss": 1.2602, "num_input_tokens_seen": 599183232, "step": 15250 }, { "epoch": 0.7394440843867545, "grad_norm": 0.3967757225036621, "learning_rate": 8.386156347318785e-06, "loss": 1.264, "num_input_tokens_seen": 599587496, "step": 15260 }, { "epoch": 0.7399286480069293, "grad_norm": 0.44284841418266296, "learning_rate": 8.356857677732258e-06, "loss": 1.2691, "num_input_tokens_seen": 599970304, "step": 15270 }, { "epoch": 0.740413211627104, "grad_norm": 0.42405474185943604, "learning_rate": 8.327600003722887e-06, "loss": 1.3021, "num_input_tokens_seen": 600356376, "step": 15280 }, { "epoch": 0.7408977752472788, "grad_norm": 0.3977855145931244, "learning_rate": 8.298383397358494e-06, "loss": 1.2675, "num_input_tokens_seen": 600728604, "step": 15290 }, { "epoch": 0.7413823388674536, "grad_norm": 0.41853830218315125, "learning_rate": 8.269207930605757e-06, "loss": 1.2954, "num_input_tokens_seen": 601127264, "step": 15300 }, { "epoch": 0.7418669024876284, "grad_norm": 0.4441795349121094, "learning_rate": 8.240073675330023e-06, "loss": 1.2563, "num_input_tokens_seen": 601500628, "step": 15310 }, { "epoch": 0.7423514661078033, "grad_norm": 0.37602853775024414, "learning_rate": 8.210980703295126e-06, "loss": 1.2418, "num_input_tokens_seen": 601875864, "step": 15320 }, { "epoch": 0.7428360297279781, "grad_norm": 0.4239734411239624, "learning_rate": 8.181929086163186e-06, "loss": 1.2742, "num_input_tokens_seen": 602271004, "step": 15330 }, { "epoch": 0.7433205933481529, "grad_norm": 0.39539673924446106, "learning_rate": 8.1529188954945e-06, "loss": 1.2483, "num_input_tokens_seen": 602647156, "step": 15340 }, { "epoch": 0.7438051569683277, "grad_norm": 0.3857654631137848, "learning_rate": 8.123950202747274e-06, "loss": 1.2448, "num_input_tokens_seen": 603043708, "step": 15350 }, { "epoch": 0.7442897205885025, "grad_norm": 0.41120001673698425, "learning_rate": 8.095023079277541e-06, "loss": 1.2468, "num_input_tokens_seen": 603427072, "step": 15360 }, { "epoch": 0.7447742842086773, "grad_norm": 0.38883382081985474, "learning_rate": 8.066137596338908e-06, "loss": 1.2563, "num_input_tokens_seen": 603820012, "step": 15370 }, { "epoch": 0.7452588478288521, "grad_norm": 0.3978869915008545, "learning_rate": 8.037293825082423e-06, "loss": 1.2982, "num_input_tokens_seen": 604199356, "step": 15380 }, { "epoch": 0.7457434114490269, "grad_norm": 0.39663106203079224, "learning_rate": 8.008491836556408e-06, "loss": 1.2548, "num_input_tokens_seen": 604594724, "step": 15390 }, { "epoch": 0.7462279750692017, "grad_norm": 0.3799634873867035, "learning_rate": 7.979731701706231e-06, "loss": 1.2978, "num_input_tokens_seen": 604995016, "step": 15400 }, { "epoch": 0.7467125386893766, "grad_norm": 0.424040824174881, "learning_rate": 7.951013491374193e-06, "loss": 1.2551, "num_input_tokens_seen": 605385864, "step": 15410 }, { "epoch": 0.7471971023095514, "grad_norm": 0.47143998742103577, "learning_rate": 7.922337276299305e-06, "loss": 1.3032, "num_input_tokens_seen": 605778132, "step": 15420 }, { "epoch": 0.7476816659297262, "grad_norm": 0.439042866230011, "learning_rate": 7.89370312711715e-06, "loss": 1.2905, "num_input_tokens_seen": 606175808, "step": 15430 }, { "epoch": 0.748166229549901, "grad_norm": 0.42106932401657104, "learning_rate": 7.86511111435969e-06, "loss": 1.251, "num_input_tokens_seen": 606587468, "step": 15440 }, { "epoch": 0.7486507931700758, "grad_norm": 0.434646874666214, "learning_rate": 7.836561308455109e-06, "loss": 1.2324, "num_input_tokens_seen": 606989572, "step": 15450 }, { "epoch": 0.7491353567902506, "grad_norm": 0.3873103857040405, "learning_rate": 7.80805377972759e-06, "loss": 1.249, "num_input_tokens_seen": 607386504, "step": 15460 }, { "epoch": 0.7496199204104254, "grad_norm": 0.42348745465278625, "learning_rate": 7.779588598397222e-06, "loss": 1.1877, "num_input_tokens_seen": 607763740, "step": 15470 }, { "epoch": 0.7501044840306001, "grad_norm": 0.46129289269447327, "learning_rate": 7.751165834579744e-06, "loss": 1.2666, "num_input_tokens_seen": 608138336, "step": 15480 }, { "epoch": 0.750589047650775, "grad_norm": 0.4692579507827759, "learning_rate": 7.722785558286447e-06, "loss": 1.2385, "num_input_tokens_seen": 608572348, "step": 15490 }, { "epoch": 0.7510736112709498, "grad_norm": 0.4252379536628723, "learning_rate": 7.694447839423936e-06, "loss": 1.238, "num_input_tokens_seen": 608980572, "step": 15500 }, { "epoch": 0.7515581748911246, "grad_norm": 0.37597569823265076, "learning_rate": 7.666152747794006e-06, "loss": 1.2893, "num_input_tokens_seen": 609382708, "step": 15510 }, { "epoch": 0.7520427385112994, "grad_norm": 0.39012569189071655, "learning_rate": 7.63790035309346e-06, "loss": 1.3176, "num_input_tokens_seen": 609759336, "step": 15520 }, { "epoch": 0.7525273021314742, "grad_norm": 0.398588091135025, "learning_rate": 7.609690724913901e-06, "loss": 1.2815, "num_input_tokens_seen": 610153132, "step": 15530 }, { "epoch": 0.753011865751649, "grad_norm": 0.4179481565952301, "learning_rate": 7.581523932741619e-06, "loss": 1.2689, "num_input_tokens_seen": 610529384, "step": 15540 }, { "epoch": 0.7534964293718238, "grad_norm": 0.3886168599128723, "learning_rate": 7.553400045957362e-06, "loss": 1.2504, "num_input_tokens_seen": 610922988, "step": 15550 }, { "epoch": 0.7539809929919986, "grad_norm": 0.3878910541534424, "learning_rate": 7.525319133836223e-06, "loss": 1.2506, "num_input_tokens_seen": 611272416, "step": 15560 }, { "epoch": 0.7544655566121734, "grad_norm": 0.4470667541027069, "learning_rate": 7.497281265547406e-06, "loss": 1.241, "num_input_tokens_seen": 611675884, "step": 15570 }, { "epoch": 0.7549501202323483, "grad_norm": 0.440876841545105, "learning_rate": 7.469286510154116e-06, "loss": 1.2208, "num_input_tokens_seen": 612084480, "step": 15580 }, { "epoch": 0.7554346838525231, "grad_norm": 0.4724419414997101, "learning_rate": 7.441334936613353e-06, "loss": 1.2509, "num_input_tokens_seen": 612451128, "step": 15590 }, { "epoch": 0.7559192474726979, "grad_norm": 0.4163588881492615, "learning_rate": 7.413426613775759e-06, "loss": 1.2477, "num_input_tokens_seen": 612831956, "step": 15600 }, { "epoch": 0.7564038110928727, "grad_norm": 0.42442935705184937, "learning_rate": 7.385561610385414e-06, "loss": 1.2763, "num_input_tokens_seen": 613196848, "step": 15610 }, { "epoch": 0.7568883747130475, "grad_norm": 0.4455620050430298, "learning_rate": 7.357739995079724e-06, "loss": 1.2434, "num_input_tokens_seen": 613631540, "step": 15620 }, { "epoch": 0.7573729383332223, "grad_norm": 0.3932070732116699, "learning_rate": 7.329961836389198e-06, "loss": 1.2842, "num_input_tokens_seen": 614045632, "step": 15630 }, { "epoch": 0.7578575019533971, "grad_norm": 0.40935105085372925, "learning_rate": 7.302227202737316e-06, "loss": 1.2179, "num_input_tokens_seen": 614417140, "step": 15640 }, { "epoch": 0.7583420655735719, "grad_norm": 0.3958241939544678, "learning_rate": 7.274536162440351e-06, "loss": 1.2848, "num_input_tokens_seen": 614833672, "step": 15650 }, { "epoch": 0.7588266291937467, "grad_norm": 0.43931275606155396, "learning_rate": 7.246888783707173e-06, "loss": 1.2335, "num_input_tokens_seen": 615222276, "step": 15660 }, { "epoch": 0.7593111928139216, "grad_norm": 0.4984539747238159, "learning_rate": 7.219285134639134e-06, "loss": 1.2827, "num_input_tokens_seen": 615624440, "step": 15670 }, { "epoch": 0.7597957564340964, "grad_norm": 0.40793275833129883, "learning_rate": 7.191725283229839e-06, "loss": 1.2562, "num_input_tokens_seen": 616014768, "step": 15680 }, { "epoch": 0.7602803200542712, "grad_norm": 0.41619017720222473, "learning_rate": 7.164209297365043e-06, "loss": 1.2754, "num_input_tokens_seen": 616391320, "step": 15690 }, { "epoch": 0.760764883674446, "grad_norm": 0.43882516026496887, "learning_rate": 7.136737244822422e-06, "loss": 1.2868, "num_input_tokens_seen": 616766344, "step": 15700 }, { "epoch": 0.7612494472946207, "grad_norm": 0.3904348611831665, "learning_rate": 7.109309193271454e-06, "loss": 1.2305, "num_input_tokens_seen": 617152204, "step": 15710 }, { "epoch": 0.7617340109147955, "grad_norm": 0.41161251068115234, "learning_rate": 7.081925210273227e-06, "loss": 1.2739, "num_input_tokens_seen": 617555064, "step": 15720 }, { "epoch": 0.7622185745349703, "grad_norm": 0.44097068905830383, "learning_rate": 7.054585363280286e-06, "loss": 1.2619, "num_input_tokens_seen": 617932060, "step": 15730 }, { "epoch": 0.7627031381551451, "grad_norm": 0.46235036849975586, "learning_rate": 7.027289719636437e-06, "loss": 1.2797, "num_input_tokens_seen": 618306600, "step": 15740 }, { "epoch": 0.7631877017753199, "grad_norm": 0.421164870262146, "learning_rate": 7.0000383465766345e-06, "loss": 1.2303, "num_input_tokens_seen": 618670828, "step": 15750 }, { "epoch": 0.7636722653954948, "grad_norm": 0.3979032039642334, "learning_rate": 6.972831311226758e-06, "loss": 1.2491, "num_input_tokens_seen": 619060076, "step": 15760 }, { "epoch": 0.7641568290156696, "grad_norm": 0.41532114148139954, "learning_rate": 6.945668680603487e-06, "loss": 1.2922, "num_input_tokens_seen": 619436564, "step": 15770 }, { "epoch": 0.7646413926358444, "grad_norm": 0.41047605872154236, "learning_rate": 6.918550521614137e-06, "loss": 1.2638, "num_input_tokens_seen": 619817996, "step": 15780 }, { "epoch": 0.7651259562560192, "grad_norm": 0.39262655377388, "learning_rate": 6.891476901056445e-06, "loss": 1.2943, "num_input_tokens_seen": 620243736, "step": 15790 }, { "epoch": 0.765610519876194, "grad_norm": 0.4346751272678375, "learning_rate": 6.864447885618477e-06, "loss": 1.2818, "num_input_tokens_seen": 620638616, "step": 15800 }, { "epoch": 0.7660950834963688, "grad_norm": 0.39605090022087097, "learning_rate": 6.837463541878394e-06, "loss": 1.2827, "num_input_tokens_seen": 621007792, "step": 15810 }, { "epoch": 0.7665796471165436, "grad_norm": 0.45163053274154663, "learning_rate": 6.810523936304356e-06, "loss": 1.2719, "num_input_tokens_seen": 621383392, "step": 15820 }, { "epoch": 0.7670642107367184, "grad_norm": 0.38712289929389954, "learning_rate": 6.783629135254288e-06, "loss": 1.2749, "num_input_tokens_seen": 621788080, "step": 15830 }, { "epoch": 0.7675487743568932, "grad_norm": 0.417074590921402, "learning_rate": 6.756779204975785e-06, "loss": 1.2551, "num_input_tokens_seen": 622190428, "step": 15840 }, { "epoch": 0.7680333379770681, "grad_norm": 0.4212455749511719, "learning_rate": 6.729974211605888e-06, "loss": 1.2964, "num_input_tokens_seen": 622563676, "step": 15850 }, { "epoch": 0.7685179015972429, "grad_norm": 0.43890535831451416, "learning_rate": 6.703214221170961e-06, "loss": 1.2982, "num_input_tokens_seen": 622940944, "step": 15860 }, { "epoch": 0.7690024652174177, "grad_norm": 0.4360447824001312, "learning_rate": 6.676499299586525e-06, "loss": 1.2862, "num_input_tokens_seen": 623355512, "step": 15870 }, { "epoch": 0.7694870288375925, "grad_norm": 0.41077935695648193, "learning_rate": 6.649829512657082e-06, "loss": 1.269, "num_input_tokens_seen": 623743220, "step": 15880 }, { "epoch": 0.7699715924577673, "grad_norm": 0.40932127833366394, "learning_rate": 6.623204926075938e-06, "loss": 1.2807, "num_input_tokens_seen": 624121016, "step": 15890 }, { "epoch": 0.770456156077942, "grad_norm": 0.42621883749961853, "learning_rate": 6.596625605425083e-06, "loss": 1.2542, "num_input_tokens_seen": 624541652, "step": 15900 }, { "epoch": 0.7709407196981168, "grad_norm": 0.4213699400424957, "learning_rate": 6.570091616175014e-06, "loss": 1.2228, "num_input_tokens_seen": 624945824, "step": 15910 }, { "epoch": 0.7714252833182916, "grad_norm": 0.4320572018623352, "learning_rate": 6.543603023684536e-06, "loss": 1.2724, "num_input_tokens_seen": 625344196, "step": 15920 }, { "epoch": 0.7719098469384664, "grad_norm": 0.403472363948822, "learning_rate": 6.5171598932006665e-06, "loss": 1.3135, "num_input_tokens_seen": 625739548, "step": 15930 }, { "epoch": 0.7723944105586413, "grad_norm": 0.4023309051990509, "learning_rate": 6.49076228985841e-06, "loss": 1.261, "num_input_tokens_seen": 626125928, "step": 15940 }, { "epoch": 0.7728789741788161, "grad_norm": 0.39641690254211426, "learning_rate": 6.464410278680658e-06, "loss": 1.2736, "num_input_tokens_seen": 626532060, "step": 15950 }, { "epoch": 0.7733635377989909, "grad_norm": 0.39902058243751526, "learning_rate": 6.4381039245779675e-06, "loss": 1.2552, "num_input_tokens_seen": 626924212, "step": 15960 }, { "epoch": 0.7738481014191657, "grad_norm": 0.4693625271320343, "learning_rate": 6.411843292348465e-06, "loss": 1.31, "num_input_tokens_seen": 627344236, "step": 15970 }, { "epoch": 0.7743326650393405, "grad_norm": 0.4080311357975006, "learning_rate": 6.385628446677624e-06, "loss": 1.2568, "num_input_tokens_seen": 627754136, "step": 15980 }, { "epoch": 0.7748172286595153, "grad_norm": 0.4102632403373718, "learning_rate": 6.359459452138161e-06, "loss": 1.3095, "num_input_tokens_seen": 628131720, "step": 15990 }, { "epoch": 0.7753017922796901, "grad_norm": 0.4116896986961365, "learning_rate": 6.33333637318983e-06, "loss": 1.2585, "num_input_tokens_seen": 628560668, "step": 16000 }, { "epoch": 0.7753017922796901, "eval_loss": 1.3683557510375977, "eval_runtime": 3.6064, "eval_samples_per_second": 41.592, "eval_steps_per_second": 5.268, "num_input_tokens_seen": 628560668, "step": 16000 }, { "epoch": 0.7757863558998649, "grad_norm": 0.3886633813381195, "learning_rate": 6.3072592741793e-06, "loss": 1.2845, "num_input_tokens_seen": 628949040, "step": 16010 }, { "epoch": 0.7762709195200397, "grad_norm": 0.4156300127506256, "learning_rate": 6.28122821933998e-06, "loss": 1.3016, "num_input_tokens_seen": 629334480, "step": 16020 }, { "epoch": 0.7767554831402146, "grad_norm": 0.40398070216178894, "learning_rate": 6.255243272791858e-06, "loss": 1.2423, "num_input_tokens_seen": 629704276, "step": 16030 }, { "epoch": 0.7772400467603894, "grad_norm": 0.3824484944343567, "learning_rate": 6.2293044985413555e-06, "loss": 1.2772, "num_input_tokens_seen": 630086976, "step": 16040 }, { "epoch": 0.7777246103805642, "grad_norm": 0.39248552918434143, "learning_rate": 6.203411960481145e-06, "loss": 1.254, "num_input_tokens_seen": 630477336, "step": 16050 }, { "epoch": 0.778209174000739, "grad_norm": 0.37206169962882996, "learning_rate": 6.17756572239003e-06, "loss": 1.2717, "num_input_tokens_seen": 630871612, "step": 16060 }, { "epoch": 0.7786937376209138, "grad_norm": 0.40856611728668213, "learning_rate": 6.151765847932747e-06, "loss": 1.2494, "num_input_tokens_seen": 631248400, "step": 16070 }, { "epoch": 0.7791783012410886, "grad_norm": 0.40056514739990234, "learning_rate": 6.126012400659856e-06, "loss": 1.2542, "num_input_tokens_seen": 631641360, "step": 16080 }, { "epoch": 0.7796628648612633, "grad_norm": 0.4258357882499695, "learning_rate": 6.1003054440075205e-06, "loss": 1.2451, "num_input_tokens_seen": 632006744, "step": 16090 }, { "epoch": 0.7801474284814381, "grad_norm": 0.39988163113594055, "learning_rate": 6.074645041297425e-06, "loss": 1.2772, "num_input_tokens_seen": 632371660, "step": 16100 }, { "epoch": 0.7806319921016129, "grad_norm": 0.44167232513427734, "learning_rate": 6.049031255736548e-06, "loss": 1.2338, "num_input_tokens_seen": 632740436, "step": 16110 }, { "epoch": 0.7811165557217878, "grad_norm": 0.3898947536945343, "learning_rate": 6.023464150417077e-06, "loss": 1.2255, "num_input_tokens_seen": 633086408, "step": 16120 }, { "epoch": 0.7816011193419626, "grad_norm": 0.43151912093162537, "learning_rate": 5.997943788316179e-06, "loss": 1.2518, "num_input_tokens_seen": 633479808, "step": 16130 }, { "epoch": 0.7820856829621374, "grad_norm": 0.39515140652656555, "learning_rate": 5.972470232295907e-06, "loss": 1.2406, "num_input_tokens_seen": 633863396, "step": 16140 }, { "epoch": 0.7825702465823122, "grad_norm": 0.4262303411960602, "learning_rate": 5.947043545103012e-06, "loss": 1.2579, "num_input_tokens_seen": 634250168, "step": 16150 }, { "epoch": 0.783054810202487, "grad_norm": 0.40577808022499084, "learning_rate": 5.921663789368806e-06, "loss": 1.3019, "num_input_tokens_seen": 634607024, "step": 16160 }, { "epoch": 0.7835393738226618, "grad_norm": 0.42164546251296997, "learning_rate": 5.896331027608978e-06, "loss": 1.2479, "num_input_tokens_seen": 635005404, "step": 16170 }, { "epoch": 0.7840239374428366, "grad_norm": 0.4001612663269043, "learning_rate": 5.871045322223481e-06, "loss": 1.2902, "num_input_tokens_seen": 635356840, "step": 16180 }, { "epoch": 0.7845085010630114, "grad_norm": 0.40439048409461975, "learning_rate": 5.845806735496362e-06, "loss": 1.2674, "num_input_tokens_seen": 635733620, "step": 16190 }, { "epoch": 0.7849930646831863, "grad_norm": 0.37555184960365295, "learning_rate": 5.820615329595575e-06, "loss": 1.2414, "num_input_tokens_seen": 636132896, "step": 16200 }, { "epoch": 0.7854776283033611, "grad_norm": 0.4048125147819519, "learning_rate": 5.795471166572894e-06, "loss": 1.2734, "num_input_tokens_seen": 636522108, "step": 16210 }, { "epoch": 0.7859621919235359, "grad_norm": 0.3880002200603485, "learning_rate": 5.770374308363693e-06, "loss": 1.2619, "num_input_tokens_seen": 636887204, "step": 16220 }, { "epoch": 0.7864467555437107, "grad_norm": 0.4102485477924347, "learning_rate": 5.745324816786854e-06, "loss": 1.235, "num_input_tokens_seen": 637306444, "step": 16230 }, { "epoch": 0.7869313191638855, "grad_norm": 0.42181381583213806, "learning_rate": 5.720322753544549e-06, "loss": 1.2281, "num_input_tokens_seen": 637683960, "step": 16240 }, { "epoch": 0.7874158827840603, "grad_norm": 0.40080034732818604, "learning_rate": 5.695368180222163e-06, "loss": 1.1679, "num_input_tokens_seen": 638089180, "step": 16250 }, { "epoch": 0.7879004464042351, "grad_norm": 0.39268258213996887, "learning_rate": 5.670461158288071e-06, "loss": 1.2564, "num_input_tokens_seen": 638480792, "step": 16260 }, { "epoch": 0.7883850100244099, "grad_norm": 0.3921567499637604, "learning_rate": 5.6456017490935405e-06, "loss": 1.2426, "num_input_tokens_seen": 638897224, "step": 16270 }, { "epoch": 0.7888695736445847, "grad_norm": 0.3720284402370453, "learning_rate": 5.620790013872543e-06, "loss": 1.2322, "num_input_tokens_seen": 639288872, "step": 16280 }, { "epoch": 0.7893541372647596, "grad_norm": 0.41209957003593445, "learning_rate": 5.596026013741631e-06, "loss": 1.3017, "num_input_tokens_seen": 639672792, "step": 16290 }, { "epoch": 0.7898387008849344, "grad_norm": 0.43729230761528015, "learning_rate": 5.571309809699771e-06, "loss": 1.3003, "num_input_tokens_seen": 640077948, "step": 16300 }, { "epoch": 0.7903232645051091, "grad_norm": 0.3936404883861542, "learning_rate": 5.546641462628194e-06, "loss": 1.3, "num_input_tokens_seen": 640466812, "step": 16310 }, { "epoch": 0.7908078281252839, "grad_norm": 0.396609365940094, "learning_rate": 5.522021033290265e-06, "loss": 1.2776, "num_input_tokens_seen": 640869644, "step": 16320 }, { "epoch": 0.7912923917454587, "grad_norm": 0.39924660325050354, "learning_rate": 5.4974485823312885e-06, "loss": 1.2602, "num_input_tokens_seen": 641267224, "step": 16330 }, { "epoch": 0.7917769553656335, "grad_norm": 0.4168340563774109, "learning_rate": 5.472924170278418e-06, "loss": 1.2327, "num_input_tokens_seen": 641650820, "step": 16340 }, { "epoch": 0.7922615189858083, "grad_norm": 0.3994021415710449, "learning_rate": 5.448447857540453e-06, "loss": 1.257, "num_input_tokens_seen": 642044792, "step": 16350 }, { "epoch": 0.7927460826059831, "grad_norm": 0.40958625078201294, "learning_rate": 5.424019704407735e-06, "loss": 1.277, "num_input_tokens_seen": 642438212, "step": 16360 }, { "epoch": 0.7932306462261579, "grad_norm": 0.4128318428993225, "learning_rate": 5.3996397710519565e-06, "loss": 1.2549, "num_input_tokens_seen": 642855308, "step": 16370 }, { "epoch": 0.7937152098463328, "grad_norm": 0.38638776540756226, "learning_rate": 5.37530811752606e-06, "loss": 1.2651, "num_input_tokens_seen": 643244872, "step": 16380 }, { "epoch": 0.7941997734665076, "grad_norm": 0.41983383893966675, "learning_rate": 5.351024803764035e-06, "loss": 1.2892, "num_input_tokens_seen": 643614276, "step": 16390 }, { "epoch": 0.7946843370866824, "grad_norm": 0.41671621799468994, "learning_rate": 5.32678988958083e-06, "loss": 1.2707, "num_input_tokens_seen": 644000564, "step": 16400 }, { "epoch": 0.7951689007068572, "grad_norm": 0.4639870822429657, "learning_rate": 5.302603434672149e-06, "loss": 1.2279, "num_input_tokens_seen": 644383116, "step": 16410 }, { "epoch": 0.795653464327032, "grad_norm": 0.4280005693435669, "learning_rate": 5.278465498614349e-06, "loss": 1.2616, "num_input_tokens_seen": 644805824, "step": 16420 }, { "epoch": 0.7961380279472068, "grad_norm": 0.45124003291130066, "learning_rate": 5.254376140864273e-06, "loss": 1.2564, "num_input_tokens_seen": 645207656, "step": 16430 }, { "epoch": 0.7966225915673816, "grad_norm": 0.42907872796058655, "learning_rate": 5.230335420759089e-06, "loss": 1.2791, "num_input_tokens_seen": 645619040, "step": 16440 }, { "epoch": 0.7971071551875564, "grad_norm": 0.4425942599773407, "learning_rate": 5.206343397516178e-06, "loss": 1.2267, "num_input_tokens_seen": 646010824, "step": 16450 }, { "epoch": 0.7975917188077312, "grad_norm": 0.41618916392326355, "learning_rate": 5.182400130232962e-06, "loss": 1.2694, "num_input_tokens_seen": 646440088, "step": 16460 }, { "epoch": 0.7980762824279061, "grad_norm": 0.3770136833190918, "learning_rate": 5.1585056778867766e-06, "loss": 1.2545, "num_input_tokens_seen": 646840628, "step": 16470 }, { "epoch": 0.7985608460480809, "grad_norm": 0.39917248487472534, "learning_rate": 5.134660099334699e-06, "loss": 1.2714, "num_input_tokens_seen": 647216352, "step": 16480 }, { "epoch": 0.7990454096682557, "grad_norm": 0.41502079367637634, "learning_rate": 5.110863453313436e-06, "loss": 1.2083, "num_input_tokens_seen": 647613200, "step": 16490 }, { "epoch": 0.7995299732884305, "grad_norm": 0.43602079153060913, "learning_rate": 5.087115798439146e-06, "loss": 1.2545, "num_input_tokens_seen": 647990596, "step": 16500 }, { "epoch": 0.8000145369086052, "grad_norm": 0.3962024748325348, "learning_rate": 5.063417193207337e-06, "loss": 1.2527, "num_input_tokens_seen": 648402756, "step": 16510 }, { "epoch": 0.80049910052878, "grad_norm": 0.4318599998950958, "learning_rate": 5.039767695992664e-06, "loss": 1.2438, "num_input_tokens_seen": 648791928, "step": 16520 }, { "epoch": 0.8009836641489548, "grad_norm": 0.41063135862350464, "learning_rate": 5.016167365048857e-06, "loss": 1.2647, "num_input_tokens_seen": 649194520, "step": 16530 }, { "epoch": 0.8014682277691296, "grad_norm": 0.419166624546051, "learning_rate": 4.992616258508501e-06, "loss": 1.2353, "num_input_tokens_seen": 649580868, "step": 16540 }, { "epoch": 0.8019527913893044, "grad_norm": 0.4164583683013916, "learning_rate": 4.969114434382966e-06, "loss": 1.2328, "num_input_tokens_seen": 649985136, "step": 16550 }, { "epoch": 0.8024373550094793, "grad_norm": 0.41067010164260864, "learning_rate": 4.945661950562195e-06, "loss": 1.2201, "num_input_tokens_seen": 650381492, "step": 16560 }, { "epoch": 0.8029219186296541, "grad_norm": 0.38203445076942444, "learning_rate": 4.922258864814619e-06, "loss": 1.242, "num_input_tokens_seen": 650781812, "step": 16570 }, { "epoch": 0.8034064822498289, "grad_norm": 0.3730851709842682, "learning_rate": 4.8989052347869876e-06, "loss": 1.2403, "num_input_tokens_seen": 651179148, "step": 16580 }, { "epoch": 0.8038910458700037, "grad_norm": 0.38860711455345154, "learning_rate": 4.875601118004228e-06, "loss": 1.2151, "num_input_tokens_seen": 651589328, "step": 16590 }, { "epoch": 0.8043756094901785, "grad_norm": 0.437386691570282, "learning_rate": 4.852346571869307e-06, "loss": 1.2417, "num_input_tokens_seen": 651973144, "step": 16600 }, { "epoch": 0.8048601731103533, "grad_norm": 0.3950743079185486, "learning_rate": 4.8291416536630805e-06, "loss": 1.2422, "num_input_tokens_seen": 652374852, "step": 16610 }, { "epoch": 0.8053447367305281, "grad_norm": 0.4481166899204254, "learning_rate": 4.805986420544173e-06, "loss": 1.2728, "num_input_tokens_seen": 652767160, "step": 16620 }, { "epoch": 0.8058293003507029, "grad_norm": 0.4067986011505127, "learning_rate": 4.782880929548808e-06, "loss": 1.2141, "num_input_tokens_seen": 653149252, "step": 16630 }, { "epoch": 0.8063138639708777, "grad_norm": 0.44263729453086853, "learning_rate": 4.7598252375907035e-06, "loss": 1.2469, "num_input_tokens_seen": 653534468, "step": 16640 }, { "epoch": 0.8067984275910526, "grad_norm": 0.4220990836620331, "learning_rate": 4.736819401460893e-06, "loss": 1.2297, "num_input_tokens_seen": 653904180, "step": 16650 }, { "epoch": 0.8072829912112274, "grad_norm": 0.4025983512401581, "learning_rate": 4.713863477827626e-06, "loss": 1.2693, "num_input_tokens_seen": 654309572, "step": 16660 }, { "epoch": 0.8077675548314022, "grad_norm": 0.44055184721946716, "learning_rate": 4.690957523236178e-06, "loss": 1.2899, "num_input_tokens_seen": 654738392, "step": 16670 }, { "epoch": 0.808252118451577, "grad_norm": 0.43497493863105774, "learning_rate": 4.66810159410877e-06, "loss": 1.2421, "num_input_tokens_seen": 655115200, "step": 16680 }, { "epoch": 0.8087366820717518, "grad_norm": 0.38774144649505615, "learning_rate": 4.645295746744374e-06, "loss": 1.2939, "num_input_tokens_seen": 655499648, "step": 16690 }, { "epoch": 0.8092212456919265, "grad_norm": 0.41730353236198425, "learning_rate": 4.622540037318618e-06, "loss": 1.2312, "num_input_tokens_seen": 655885096, "step": 16700 }, { "epoch": 0.8097058093121013, "grad_norm": 0.4214309751987457, "learning_rate": 4.5998345218836304e-06, "loss": 1.2738, "num_input_tokens_seen": 656267208, "step": 16710 }, { "epoch": 0.8101903729322761, "grad_norm": 0.40911537408828735, "learning_rate": 4.577179256367886e-06, "loss": 1.2121, "num_input_tokens_seen": 656652956, "step": 16720 }, { "epoch": 0.8106749365524509, "grad_norm": 0.4146646559238434, "learning_rate": 4.554574296576092e-06, "loss": 1.2153, "num_input_tokens_seen": 657038508, "step": 16730 }, { "epoch": 0.8111595001726258, "grad_norm": 0.430595338344574, "learning_rate": 4.532019698189044e-06, "loss": 1.2613, "num_input_tokens_seen": 657434028, "step": 16740 }, { "epoch": 0.8116440637928006, "grad_norm": 0.3903404772281647, "learning_rate": 4.509515516763493e-06, "loss": 1.2283, "num_input_tokens_seen": 657845788, "step": 16750 }, { "epoch": 0.8121286274129754, "grad_norm": 0.4230201840400696, "learning_rate": 4.487061807731982e-06, "loss": 1.2466, "num_input_tokens_seen": 658242124, "step": 16760 }, { "epoch": 0.8126131910331502, "grad_norm": 0.39636996388435364, "learning_rate": 4.464658626402751e-06, "loss": 1.2527, "num_input_tokens_seen": 658637400, "step": 16770 }, { "epoch": 0.813097754653325, "grad_norm": 0.4210509657859802, "learning_rate": 4.442306027959564e-06, "loss": 1.2837, "num_input_tokens_seen": 659032724, "step": 16780 }, { "epoch": 0.8135823182734998, "grad_norm": 0.4173252582550049, "learning_rate": 4.4200040674616e-06, "loss": 1.2855, "num_input_tokens_seen": 659452924, "step": 16790 }, { "epoch": 0.8140668818936746, "grad_norm": 0.43643826246261597, "learning_rate": 4.397752799843294e-06, "loss": 1.2185, "num_input_tokens_seen": 659860564, "step": 16800 }, { "epoch": 0.8145514455138494, "grad_norm": 0.4406135380268097, "learning_rate": 4.375552279914233e-06, "loss": 1.3007, "num_input_tokens_seen": 660247376, "step": 16810 }, { "epoch": 0.8150360091340242, "grad_norm": 0.41418373584747314, "learning_rate": 4.353402562358977e-06, "loss": 1.2831, "num_input_tokens_seen": 660644512, "step": 16820 }, { "epoch": 0.8155205727541991, "grad_norm": 0.3968205749988556, "learning_rate": 4.331303701736969e-06, "loss": 1.2532, "num_input_tokens_seen": 661014408, "step": 16830 }, { "epoch": 0.8160051363743739, "grad_norm": 0.4220852255821228, "learning_rate": 4.309255752482378e-06, "loss": 1.2457, "num_input_tokens_seen": 661407992, "step": 16840 }, { "epoch": 0.8164896999945487, "grad_norm": 0.418720543384552, "learning_rate": 4.2872587689039484e-06, "loss": 1.2173, "num_input_tokens_seen": 661784464, "step": 16850 }, { "epoch": 0.8169742636147235, "grad_norm": 0.38268375396728516, "learning_rate": 4.265312805184909e-06, "loss": 1.237, "num_input_tokens_seen": 662147388, "step": 16860 }, { "epoch": 0.8174588272348983, "grad_norm": 0.40058982372283936, "learning_rate": 4.24341791538281e-06, "loss": 1.2639, "num_input_tokens_seen": 662530408, "step": 16870 }, { "epoch": 0.8179433908550731, "grad_norm": 0.4105454981327057, "learning_rate": 4.221574153429392e-06, "loss": 1.2431, "num_input_tokens_seen": 662921336, "step": 16880 }, { "epoch": 0.8184279544752479, "grad_norm": 0.41436707973480225, "learning_rate": 4.1997815731304515e-06, "loss": 1.2628, "num_input_tokens_seen": 663310688, "step": 16890 }, { "epoch": 0.8189125180954226, "grad_norm": 0.39193400740623474, "learning_rate": 4.178040228165725e-06, "loss": 1.2385, "num_input_tokens_seen": 663687484, "step": 16900 }, { "epoch": 0.8193970817155974, "grad_norm": 0.4802253246307373, "learning_rate": 4.156350172088736e-06, "loss": 1.2784, "num_input_tokens_seen": 664078036, "step": 16910 }, { "epoch": 0.8198816453357723, "grad_norm": 0.4015842080116272, "learning_rate": 4.134711458326681e-06, "loss": 1.2329, "num_input_tokens_seen": 664483676, "step": 16920 }, { "epoch": 0.8203662089559471, "grad_norm": 0.40182778239250183, "learning_rate": 4.11312414018028e-06, "loss": 1.2525, "num_input_tokens_seen": 664851144, "step": 16930 }, { "epoch": 0.8208507725761219, "grad_norm": 0.4445132315158844, "learning_rate": 4.091588270823671e-06, "loss": 1.2506, "num_input_tokens_seen": 665274412, "step": 16940 }, { "epoch": 0.8213353361962967, "grad_norm": 0.43962743878364563, "learning_rate": 4.070103903304237e-06, "loss": 1.299, "num_input_tokens_seen": 665663080, "step": 16950 }, { "epoch": 0.8218198998164715, "grad_norm": 0.43457552790641785, "learning_rate": 4.048671090542522e-06, "loss": 1.2759, "num_input_tokens_seen": 666049536, "step": 16960 }, { "epoch": 0.8223044634366463, "grad_norm": 0.44402769207954407, "learning_rate": 4.0272898853320835e-06, "loss": 1.2648, "num_input_tokens_seen": 666424352, "step": 16970 }, { "epoch": 0.8227890270568211, "grad_norm": 0.4204462468624115, "learning_rate": 4.005960340339335e-06, "loss": 1.2761, "num_input_tokens_seen": 666812808, "step": 16980 }, { "epoch": 0.8232735906769959, "grad_norm": 0.4030398428440094, "learning_rate": 3.984682508103466e-06, "loss": 1.2865, "num_input_tokens_seen": 667198772, "step": 16990 }, { "epoch": 0.8237581542971708, "grad_norm": 0.3986146152019501, "learning_rate": 3.963456441036259e-06, "loss": 1.247, "num_input_tokens_seen": 667609040, "step": 17000 }, { "epoch": 0.8242427179173456, "grad_norm": 0.4464845061302185, "learning_rate": 3.942282191422017e-06, "loss": 1.2972, "num_input_tokens_seen": 668009280, "step": 17010 }, { "epoch": 0.8247272815375204, "grad_norm": 0.37678518891334534, "learning_rate": 3.9211598114173855e-06, "loss": 1.2387, "num_input_tokens_seen": 668416796, "step": 17020 }, { "epoch": 0.8252118451576952, "grad_norm": 0.38661307096481323, "learning_rate": 3.900089353051259e-06, "loss": 1.2197, "num_input_tokens_seen": 668795620, "step": 17030 }, { "epoch": 0.82569640877787, "grad_norm": 0.3967253267765045, "learning_rate": 3.879070868224616e-06, "loss": 1.2319, "num_input_tokens_seen": 669174880, "step": 17040 }, { "epoch": 0.8261809723980448, "grad_norm": 0.45880743861198425, "learning_rate": 3.858104408710445e-06, "loss": 1.2629, "num_input_tokens_seen": 669581256, "step": 17050 }, { "epoch": 0.8266655360182196, "grad_norm": 0.40062230825424194, "learning_rate": 3.837190026153548e-06, "loss": 1.3047, "num_input_tokens_seen": 669998664, "step": 17060 }, { "epoch": 0.8271500996383944, "grad_norm": 0.42311957478523254, "learning_rate": 3.816327772070483e-06, "loss": 1.2558, "num_input_tokens_seen": 670389116, "step": 17070 }, { "epoch": 0.8276346632585692, "grad_norm": 0.41024044156074524, "learning_rate": 3.7955176978493822e-06, "loss": 1.2791, "num_input_tokens_seen": 670805700, "step": 17080 }, { "epoch": 0.8281192268787441, "grad_norm": 0.4341394305229187, "learning_rate": 3.7747598547498682e-06, "loss": 1.229, "num_input_tokens_seen": 671163656, "step": 17090 }, { "epoch": 0.8286037904989189, "grad_norm": 0.42642325162887573, "learning_rate": 3.754054293902884e-06, "loss": 1.2721, "num_input_tokens_seen": 671556364, "step": 17100 }, { "epoch": 0.8290883541190937, "grad_norm": 0.41226011514663696, "learning_rate": 3.7334010663106044e-06, "loss": 1.2968, "num_input_tokens_seen": 671957808, "step": 17110 }, { "epoch": 0.8295729177392684, "grad_norm": 0.4139423966407776, "learning_rate": 3.712800222846302e-06, "loss": 1.2627, "num_input_tokens_seen": 672364408, "step": 17120 }, { "epoch": 0.8300574813594432, "grad_norm": 0.4178198575973511, "learning_rate": 3.6922518142541994e-06, "loss": 1.2778, "num_input_tokens_seen": 672770620, "step": 17130 }, { "epoch": 0.830542044979618, "grad_norm": 0.43971824645996094, "learning_rate": 3.6717558911493784e-06, "loss": 1.1902, "num_input_tokens_seen": 673162284, "step": 17140 }, { "epoch": 0.8310266085997928, "grad_norm": 0.40751203894615173, "learning_rate": 3.6513125040176205e-06, "loss": 1.2592, "num_input_tokens_seen": 673552816, "step": 17150 }, { "epoch": 0.8315111722199676, "grad_norm": 0.3929533064365387, "learning_rate": 3.6309217032153093e-06, "loss": 1.2793, "num_input_tokens_seen": 673963224, "step": 17160 }, { "epoch": 0.8319957358401424, "grad_norm": 0.42938390374183655, "learning_rate": 3.610583538969306e-06, "loss": 1.2622, "num_input_tokens_seen": 674355136, "step": 17170 }, { "epoch": 0.8324802994603173, "grad_norm": 0.42667803168296814, "learning_rate": 3.590298061376804e-06, "loss": 1.3012, "num_input_tokens_seen": 674750972, "step": 17180 }, { "epoch": 0.8329648630804921, "grad_norm": 0.4376852214336395, "learning_rate": 3.5700653204052167e-06, "loss": 1.3022, "num_input_tokens_seen": 675185324, "step": 17190 }, { "epoch": 0.8334494267006669, "grad_norm": 0.40134289860725403, "learning_rate": 3.5498853658920695e-06, "loss": 1.2425, "num_input_tokens_seen": 675605452, "step": 17200 }, { "epoch": 0.8339339903208417, "grad_norm": 0.4355108439922333, "learning_rate": 3.5297582475448483e-06, "loss": 1.2516, "num_input_tokens_seen": 676001164, "step": 17210 }, { "epoch": 0.8344185539410165, "grad_norm": 0.3797595798969269, "learning_rate": 3.509684014940906e-06, "loss": 1.2325, "num_input_tokens_seen": 676377212, "step": 17220 }, { "epoch": 0.8349031175611913, "grad_norm": 0.42660579085350037, "learning_rate": 3.489662717527312e-06, "loss": 1.2337, "num_input_tokens_seen": 676770320, "step": 17230 }, { "epoch": 0.8353876811813661, "grad_norm": 0.39556610584259033, "learning_rate": 3.4696944046207574e-06, "loss": 1.3193, "num_input_tokens_seen": 677180796, "step": 17240 }, { "epoch": 0.8358722448015409, "grad_norm": 0.3952700197696686, "learning_rate": 3.449779125407426e-06, "loss": 1.2574, "num_input_tokens_seen": 677593456, "step": 17250 }, { "epoch": 0.8363568084217157, "grad_norm": 0.40727055072784424, "learning_rate": 3.4299169289428446e-06, "loss": 1.2599, "num_input_tokens_seen": 677958328, "step": 17260 }, { "epoch": 0.8368413720418906, "grad_norm": 0.4267754852771759, "learning_rate": 3.4101078641518124e-06, "loss": 1.2312, "num_input_tokens_seen": 678375748, "step": 17270 }, { "epoch": 0.8373259356620654, "grad_norm": 0.3855496644973755, "learning_rate": 3.3903519798282353e-06, "loss": 1.2167, "num_input_tokens_seen": 678754312, "step": 17280 }, { "epoch": 0.8378104992822402, "grad_norm": 0.39305436611175537, "learning_rate": 3.3706493246350335e-06, "loss": 1.2569, "num_input_tokens_seen": 679131624, "step": 17290 }, { "epoch": 0.838295062902415, "grad_norm": 0.395431786775589, "learning_rate": 3.3509999471040136e-06, "loss": 1.2482, "num_input_tokens_seen": 679524344, "step": 17300 }, { "epoch": 0.8387796265225897, "grad_norm": 0.4162997901439667, "learning_rate": 3.3314038956357514e-06, "loss": 1.2221, "num_input_tokens_seen": 679924936, "step": 17310 }, { "epoch": 0.8392641901427645, "grad_norm": 0.42025381326675415, "learning_rate": 3.3118612184994485e-06, "loss": 1.2243, "num_input_tokens_seen": 680326700, "step": 17320 }, { "epoch": 0.8397487537629393, "grad_norm": 0.46221596002578735, "learning_rate": 3.292371963832863e-06, "loss": 1.2707, "num_input_tokens_seen": 680740540, "step": 17330 }, { "epoch": 0.8402333173831141, "grad_norm": 0.40451374650001526, "learning_rate": 3.2729361796421387e-06, "loss": 1.2341, "num_input_tokens_seen": 681129484, "step": 17340 }, { "epoch": 0.8407178810032889, "grad_norm": 0.43298593163490295, "learning_rate": 3.253553913801727e-06, "loss": 1.305, "num_input_tokens_seen": 681512368, "step": 17350 }, { "epoch": 0.8412024446234638, "grad_norm": 0.4344673156738281, "learning_rate": 3.2342252140542373e-06, "loss": 1.3109, "num_input_tokens_seen": 681938304, "step": 17360 }, { "epoch": 0.8416870082436386, "grad_norm": 0.39718765020370483, "learning_rate": 3.2149501280103466e-06, "loss": 1.2778, "num_input_tokens_seen": 682342636, "step": 17370 }, { "epoch": 0.8421715718638134, "grad_norm": 0.4262661933898926, "learning_rate": 3.195728703148673e-06, "loss": 1.2626, "num_input_tokens_seen": 682748112, "step": 17380 }, { "epoch": 0.8426561354839882, "grad_norm": 0.4599827826023102, "learning_rate": 3.1765609868156325e-06, "loss": 1.285, "num_input_tokens_seen": 683133464, "step": 17390 }, { "epoch": 0.843140699104163, "grad_norm": 0.3915078639984131, "learning_rate": 3.1574470262253795e-06, "loss": 1.2881, "num_input_tokens_seen": 683506720, "step": 17400 }, { "epoch": 0.8436252627243378, "grad_norm": 0.4697626233100891, "learning_rate": 3.138386868459622e-06, "loss": 1.2346, "num_input_tokens_seen": 683894420, "step": 17410 }, { "epoch": 0.8441098263445126, "grad_norm": 0.4377289116382599, "learning_rate": 3.119380560467572e-06, "loss": 1.2772, "num_input_tokens_seen": 684275120, "step": 17420 }, { "epoch": 0.8445943899646874, "grad_norm": 0.4258815050125122, "learning_rate": 3.1004281490657703e-06, "loss": 1.2482, "num_input_tokens_seen": 684658984, "step": 17430 }, { "epoch": 0.8450789535848622, "grad_norm": 0.4437004625797272, "learning_rate": 3.0815296809380167e-06, "loss": 1.2674, "num_input_tokens_seen": 685068552, "step": 17440 }, { "epoch": 0.8455635172050371, "grad_norm": 0.3915651738643646, "learning_rate": 3.0626852026352347e-06, "loss": 1.2626, "num_input_tokens_seen": 685445348, "step": 17450 }, { "epoch": 0.8460480808252119, "grad_norm": 0.4123501777648926, "learning_rate": 3.043894760575358e-06, "loss": 1.2912, "num_input_tokens_seen": 685845820, "step": 17460 }, { "epoch": 0.8465326444453867, "grad_norm": 0.4246242344379425, "learning_rate": 3.0251584010432127e-06, "loss": 1.2455, "num_input_tokens_seen": 686224024, "step": 17470 }, { "epoch": 0.8470172080655615, "grad_norm": 0.43629172444343567, "learning_rate": 3.00647617019042e-06, "loss": 1.2333, "num_input_tokens_seen": 686627324, "step": 17480 }, { "epoch": 0.8475017716857363, "grad_norm": 0.3956005871295929, "learning_rate": 2.9878481140352495e-06, "loss": 1.2989, "num_input_tokens_seen": 687019208, "step": 17490 }, { "epoch": 0.847986335305911, "grad_norm": 0.434689462184906, "learning_rate": 2.96927427846255e-06, "loss": 1.2314, "num_input_tokens_seen": 687411440, "step": 17500 }, { "epoch": 0.8484708989260858, "grad_norm": 0.3737858831882477, "learning_rate": 2.9507547092236075e-06, "loss": 1.2527, "num_input_tokens_seen": 687806800, "step": 17510 }, { "epoch": 0.8489554625462606, "grad_norm": 0.4112042784690857, "learning_rate": 2.9322894519360237e-06, "loss": 1.249, "num_input_tokens_seen": 688225164, "step": 17520 }, { "epoch": 0.8494400261664354, "grad_norm": 0.41528812050819397, "learning_rate": 2.913878552083646e-06, "loss": 1.3127, "num_input_tokens_seen": 688617932, "step": 17530 }, { "epoch": 0.8499245897866103, "grad_norm": 0.43437185883522034, "learning_rate": 2.895522055016395e-06, "loss": 1.2433, "num_input_tokens_seen": 689007680, "step": 17540 }, { "epoch": 0.8504091534067851, "grad_norm": 0.39245957136154175, "learning_rate": 2.8772200059502153e-06, "loss": 1.2776, "num_input_tokens_seen": 689376876, "step": 17550 }, { "epoch": 0.8508937170269599, "grad_norm": 0.44739076495170593, "learning_rate": 2.8589724499669122e-06, "loss": 1.2425, "num_input_tokens_seen": 689774952, "step": 17560 }, { "epoch": 0.8513782806471347, "grad_norm": 0.400081604719162, "learning_rate": 2.840779432014079e-06, "loss": 1.253, "num_input_tokens_seen": 690191364, "step": 17570 }, { "epoch": 0.8518628442673095, "grad_norm": 0.43295279145240784, "learning_rate": 2.8226409969049627e-06, "loss": 1.24, "num_input_tokens_seen": 690593376, "step": 17580 }, { "epoch": 0.8523474078874843, "grad_norm": 0.4127790927886963, "learning_rate": 2.804557189318366e-06, "loss": 1.2606, "num_input_tokens_seen": 691002516, "step": 17590 }, { "epoch": 0.8528319715076591, "grad_norm": 0.40065881609916687, "learning_rate": 2.7865280537985233e-06, "loss": 1.2413, "num_input_tokens_seen": 691404960, "step": 17600 }, { "epoch": 0.8533165351278339, "grad_norm": 0.45817670226097107, "learning_rate": 2.768553634755011e-06, "loss": 1.2612, "num_input_tokens_seen": 691797660, "step": 17610 }, { "epoch": 0.8538010987480087, "grad_norm": 0.36435696482658386, "learning_rate": 2.750633976462616e-06, "loss": 1.2247, "num_input_tokens_seen": 692207740, "step": 17620 }, { "epoch": 0.8542856623681836, "grad_norm": 0.41439151763916016, "learning_rate": 2.732769123061249e-06, "loss": 1.271, "num_input_tokens_seen": 692585708, "step": 17630 }, { "epoch": 0.8547702259883584, "grad_norm": 0.4093399941921234, "learning_rate": 2.714959118555821e-06, "loss": 1.2442, "num_input_tokens_seen": 692977756, "step": 17640 }, { "epoch": 0.8552547896085332, "grad_norm": 0.44113224744796753, "learning_rate": 2.697204006816131e-06, "loss": 1.2601, "num_input_tokens_seen": 693350796, "step": 17650 }, { "epoch": 0.855739353228708, "grad_norm": 0.40980538725852966, "learning_rate": 2.6795038315767824e-06, "loss": 1.2668, "num_input_tokens_seen": 693759588, "step": 17660 }, { "epoch": 0.8562239168488828, "grad_norm": 0.42793384194374084, "learning_rate": 2.661858636437034e-06, "loss": 1.2285, "num_input_tokens_seen": 694174680, "step": 17670 }, { "epoch": 0.8567084804690576, "grad_norm": 0.4959481954574585, "learning_rate": 2.644268464860741e-06, "loss": 1.2491, "num_input_tokens_seen": 694561320, "step": 17680 }, { "epoch": 0.8571930440892324, "grad_norm": 0.40092337131500244, "learning_rate": 2.6267333601762088e-06, "loss": 1.2301, "num_input_tokens_seen": 694959912, "step": 17690 }, { "epoch": 0.8576776077094072, "grad_norm": 0.40096816420555115, "learning_rate": 2.6092533655761144e-06, "loss": 1.2886, "num_input_tokens_seen": 695328628, "step": 17700 }, { "epoch": 0.8581621713295821, "grad_norm": 0.4054926335811615, "learning_rate": 2.591828524117365e-06, "loss": 1.2309, "num_input_tokens_seen": 695748200, "step": 17710 }, { "epoch": 0.8586467349497569, "grad_norm": 0.38719359040260315, "learning_rate": 2.5744588787210366e-06, "loss": 1.2367, "num_input_tokens_seen": 696133424, "step": 17720 }, { "epoch": 0.8591312985699316, "grad_norm": 0.386705219745636, "learning_rate": 2.557144472172235e-06, "loss": 1.2824, "num_input_tokens_seen": 696536372, "step": 17730 }, { "epoch": 0.8596158621901064, "grad_norm": 0.4401761293411255, "learning_rate": 2.5398853471200105e-06, "loss": 1.2132, "num_input_tokens_seen": 696913248, "step": 17740 }, { "epoch": 0.8601004258102812, "grad_norm": 0.43798384070396423, "learning_rate": 2.522681546077224e-06, "loss": 1.2504, "num_input_tokens_seen": 697311808, "step": 17750 }, { "epoch": 0.860584989430456, "grad_norm": 0.4158650040626526, "learning_rate": 2.5055331114204798e-06, "loss": 1.2485, "num_input_tokens_seen": 697710452, "step": 17760 }, { "epoch": 0.8610695530506308, "grad_norm": 0.4461188018321991, "learning_rate": 2.4884400853900034e-06, "loss": 1.2431, "num_input_tokens_seen": 698116700, "step": 17770 }, { "epoch": 0.8615541166708056, "grad_norm": 0.36897483468055725, "learning_rate": 2.4714025100895155e-06, "loss": 1.2314, "num_input_tokens_seen": 698502984, "step": 17780 }, { "epoch": 0.8620386802909804, "grad_norm": 0.38578104972839355, "learning_rate": 2.4544204274861785e-06, "loss": 1.2852, "num_input_tokens_seen": 698869344, "step": 17790 }, { "epoch": 0.8625232439111553, "grad_norm": 0.3806360363960266, "learning_rate": 2.4374938794104407e-06, "loss": 1.2656, "num_input_tokens_seen": 699268952, "step": 17800 }, { "epoch": 0.8630078075313301, "grad_norm": 0.43705320358276367, "learning_rate": 2.420622907555975e-06, "loss": 1.2623, "num_input_tokens_seen": 699655576, "step": 17810 }, { "epoch": 0.8634923711515049, "grad_norm": 0.41822245717048645, "learning_rate": 2.403807553479548e-06, "loss": 1.2524, "num_input_tokens_seen": 700027896, "step": 17820 }, { "epoch": 0.8639769347716797, "grad_norm": 0.4171805679798126, "learning_rate": 2.3870478586009325e-06, "loss": 1.2786, "num_input_tokens_seen": 700417852, "step": 17830 }, { "epoch": 0.8644614983918545, "grad_norm": 0.4310646057128906, "learning_rate": 2.3703438642027927e-06, "loss": 1.2672, "num_input_tokens_seen": 700843676, "step": 17840 }, { "epoch": 0.8649460620120293, "grad_norm": 0.41659244894981384, "learning_rate": 2.353695611430609e-06, "loss": 1.2511, "num_input_tokens_seen": 701217408, "step": 17850 }, { "epoch": 0.8654306256322041, "grad_norm": 0.3900809586048126, "learning_rate": 2.337103141292535e-06, "loss": 1.2216, "num_input_tokens_seen": 701584684, "step": 17860 }, { "epoch": 0.8659151892523789, "grad_norm": 0.4181850552558899, "learning_rate": 2.3205664946593348e-06, "loss": 1.3174, "num_input_tokens_seen": 701960304, "step": 17870 }, { "epoch": 0.8663997528725537, "grad_norm": 0.4225742220878601, "learning_rate": 2.3040857122642674e-06, "loss": 1.229, "num_input_tokens_seen": 702347808, "step": 17880 }, { "epoch": 0.8668843164927286, "grad_norm": 0.3922450542449951, "learning_rate": 2.2876608347029816e-06, "loss": 1.2615, "num_input_tokens_seen": 702743944, "step": 17890 }, { "epoch": 0.8673688801129034, "grad_norm": 0.3811093866825104, "learning_rate": 2.2712919024334257e-06, "loss": 1.2735, "num_input_tokens_seen": 703123092, "step": 17900 }, { "epoch": 0.8678534437330782, "grad_norm": 0.45579883456230164, "learning_rate": 2.2549789557757327e-06, "loss": 1.2724, "num_input_tokens_seen": 703523388, "step": 17910 }, { "epoch": 0.868338007353253, "grad_norm": 0.44050538539886475, "learning_rate": 2.238722034912144e-06, "loss": 1.2293, "num_input_tokens_seen": 703903936, "step": 17920 }, { "epoch": 0.8688225709734277, "grad_norm": 0.3877395987510681, "learning_rate": 2.222521179886888e-06, "loss": 1.2697, "num_input_tokens_seen": 704304692, "step": 17930 }, { "epoch": 0.8693071345936025, "grad_norm": 0.427232027053833, "learning_rate": 2.206376430606097e-06, "loss": 1.2374, "num_input_tokens_seen": 704700272, "step": 17940 }, { "epoch": 0.8697916982137773, "grad_norm": 0.39587920904159546, "learning_rate": 2.1902878268376975e-06, "loss": 1.2735, "num_input_tokens_seen": 705078452, "step": 17950 }, { "epoch": 0.8702762618339521, "grad_norm": 0.37119200825691223, "learning_rate": 2.174255408211326e-06, "loss": 1.3054, "num_input_tokens_seen": 705480336, "step": 17960 }, { "epoch": 0.8707608254541269, "grad_norm": 0.4567846357822418, "learning_rate": 2.1582792142182117e-06, "loss": 1.2335, "num_input_tokens_seen": 705869500, "step": 17970 }, { "epoch": 0.8712453890743018, "grad_norm": 0.39730075001716614, "learning_rate": 2.1423592842111066e-06, "loss": 1.2125, "num_input_tokens_seen": 706254284, "step": 17980 }, { "epoch": 0.8717299526944766, "grad_norm": 0.41465237736701965, "learning_rate": 2.1264956574041513e-06, "loss": 1.2287, "num_input_tokens_seen": 706671516, "step": 17990 }, { "epoch": 0.8722145163146514, "grad_norm": 0.3785446584224701, "learning_rate": 2.1106883728728155e-06, "loss": 1.2477, "num_input_tokens_seen": 707047904, "step": 18000 }, { "epoch": 0.8722145163146514, "eval_loss": 1.3607689142227173, "eval_runtime": 3.6634, "eval_samples_per_second": 40.946, "eval_steps_per_second": 5.186, "num_input_tokens_seen": 707047904, "step": 18000 }, { "epoch": 0.8726990799348262, "grad_norm": 0.3779754042625427, "learning_rate": 2.094937469553787e-06, "loss": 1.2501, "num_input_tokens_seen": 707448824, "step": 18010 }, { "epoch": 0.873183643555001, "grad_norm": 0.40831905603408813, "learning_rate": 2.079242986244867e-06, "loss": 1.3242, "num_input_tokens_seen": 707830972, "step": 18020 }, { "epoch": 0.8736682071751758, "grad_norm": 0.4000668525695801, "learning_rate": 2.063604961604884e-06, "loss": 1.2698, "num_input_tokens_seen": 708228628, "step": 18030 }, { "epoch": 0.8741527707953506, "grad_norm": 0.4057333469390869, "learning_rate": 2.0480234341535952e-06, "loss": 1.3053, "num_input_tokens_seen": 708602260, "step": 18040 }, { "epoch": 0.8746373344155254, "grad_norm": 0.44601795077323914, "learning_rate": 2.0324984422716046e-06, "loss": 1.2743, "num_input_tokens_seen": 709010156, "step": 18050 }, { "epoch": 0.8751218980357002, "grad_norm": 0.4290080964565277, "learning_rate": 2.0170300242002365e-06, "loss": 1.2339, "num_input_tokens_seen": 709382372, "step": 18060 }, { "epoch": 0.8756064616558751, "grad_norm": 0.40737321972846985, "learning_rate": 2.001618218041487e-06, "loss": 1.2312, "num_input_tokens_seen": 709780940, "step": 18070 }, { "epoch": 0.8760910252760499, "grad_norm": 0.42168939113616943, "learning_rate": 1.9862630617578816e-06, "loss": 1.2517, "num_input_tokens_seen": 710158796, "step": 18080 }, { "epoch": 0.8765755888962247, "grad_norm": 0.4381299912929535, "learning_rate": 1.9709645931724225e-06, "loss": 1.2813, "num_input_tokens_seen": 710561412, "step": 18090 }, { "epoch": 0.8770601525163995, "grad_norm": 0.44368723034858704, "learning_rate": 1.955722849968464e-06, "loss": 1.3046, "num_input_tokens_seen": 710960164, "step": 18100 }, { "epoch": 0.8775447161365743, "grad_norm": 0.41541314125061035, "learning_rate": 1.9405378696896487e-06, "loss": 1.2637, "num_input_tokens_seen": 711347332, "step": 18110 }, { "epoch": 0.878029279756749, "grad_norm": 0.3992341160774231, "learning_rate": 1.925409689739785e-06, "loss": 1.2553, "num_input_tokens_seen": 711716796, "step": 18120 }, { "epoch": 0.8785138433769238, "grad_norm": 0.47702735662460327, "learning_rate": 1.910338347382787e-06, "loss": 1.2407, "num_input_tokens_seen": 712071888, "step": 18130 }, { "epoch": 0.8789984069970986, "grad_norm": 0.38431301712989807, "learning_rate": 1.8953238797425442e-06, "loss": 1.235, "num_input_tokens_seen": 712445676, "step": 18140 }, { "epoch": 0.8794829706172734, "grad_norm": 0.4030941128730774, "learning_rate": 1.8803663238028707e-06, "loss": 1.2388, "num_input_tokens_seen": 712841240, "step": 18150 }, { "epoch": 0.8799675342374483, "grad_norm": 0.3907192349433899, "learning_rate": 1.8654657164073884e-06, "loss": 1.2495, "num_input_tokens_seen": 713199460, "step": 18160 }, { "epoch": 0.8804520978576231, "grad_norm": 0.4238632917404175, "learning_rate": 1.850622094259441e-06, "loss": 1.329, "num_input_tokens_seen": 713586584, "step": 18170 }, { "epoch": 0.8809366614777979, "grad_norm": 0.43450093269348145, "learning_rate": 1.835835493922014e-06, "loss": 1.2552, "num_input_tokens_seen": 713936208, "step": 18180 }, { "epoch": 0.8814212250979727, "grad_norm": 0.36312198638916016, "learning_rate": 1.821105951817617e-06, "loss": 1.2135, "num_input_tokens_seen": 714345664, "step": 18190 }, { "epoch": 0.8819057887181475, "grad_norm": 0.3843260407447815, "learning_rate": 1.8064335042282387e-06, "loss": 1.2077, "num_input_tokens_seen": 714749968, "step": 18200 }, { "epoch": 0.8823903523383223, "grad_norm": 0.43907880783081055, "learning_rate": 1.791818187295205e-06, "loss": 1.2502, "num_input_tokens_seen": 715139224, "step": 18210 }, { "epoch": 0.8828749159584971, "grad_norm": 0.42311954498291016, "learning_rate": 1.7772600370191433e-06, "loss": 1.2607, "num_input_tokens_seen": 715522824, "step": 18220 }, { "epoch": 0.8833594795786719, "grad_norm": 0.41037461161613464, "learning_rate": 1.762759089259844e-06, "loss": 1.2311, "num_input_tokens_seen": 715920216, "step": 18230 }, { "epoch": 0.8838440431988467, "grad_norm": 0.4087775945663452, "learning_rate": 1.7483153797362123e-06, "loss": 1.2339, "num_input_tokens_seen": 716298856, "step": 18240 }, { "epoch": 0.8843286068190216, "grad_norm": 0.43544670939445496, "learning_rate": 1.733928944026153e-06, "loss": 1.258, "num_input_tokens_seen": 716698420, "step": 18250 }, { "epoch": 0.8848131704391964, "grad_norm": 0.4397813081741333, "learning_rate": 1.7195998175665057e-06, "loss": 1.2538, "num_input_tokens_seen": 717109980, "step": 18260 }, { "epoch": 0.8852977340593712, "grad_norm": 0.4167066812515259, "learning_rate": 1.7053280356529283e-06, "loss": 1.2635, "num_input_tokens_seen": 717495908, "step": 18270 }, { "epoch": 0.885782297679546, "grad_norm": 0.3873845040798187, "learning_rate": 1.691113633439842e-06, "loss": 1.2667, "num_input_tokens_seen": 717871192, "step": 18280 }, { "epoch": 0.8862668612997208, "grad_norm": 0.41001996397972107, "learning_rate": 1.6769566459403224e-06, "loss": 1.2516, "num_input_tokens_seen": 718259268, "step": 18290 }, { "epoch": 0.8867514249198956, "grad_norm": 0.454142689704895, "learning_rate": 1.6628571080260196e-06, "loss": 1.2868, "num_input_tokens_seen": 718662496, "step": 18300 }, { "epoch": 0.8872359885400704, "grad_norm": 0.433021605014801, "learning_rate": 1.6488150544270776e-06, "loss": 1.2646, "num_input_tokens_seen": 719043384, "step": 18310 }, { "epoch": 0.8877205521602451, "grad_norm": 0.3922811448574066, "learning_rate": 1.6348305197320417e-06, "loss": 1.3003, "num_input_tokens_seen": 719419832, "step": 18320 }, { "epoch": 0.8882051157804199, "grad_norm": 0.41497138142585754, "learning_rate": 1.6209035383877803e-06, "loss": 1.2812, "num_input_tokens_seen": 719822572, "step": 18330 }, { "epoch": 0.8886896794005948, "grad_norm": 0.3779890239238739, "learning_rate": 1.6070341446993875e-06, "loss": 1.1811, "num_input_tokens_seen": 720208548, "step": 18340 }, { "epoch": 0.8891742430207696, "grad_norm": 0.4199633300304413, "learning_rate": 1.5932223728301138e-06, "loss": 1.2312, "num_input_tokens_seen": 720585180, "step": 18350 }, { "epoch": 0.8896588066409444, "grad_norm": 0.40874430537223816, "learning_rate": 1.579468256801267e-06, "loss": 1.1845, "num_input_tokens_seen": 720970764, "step": 18360 }, { "epoch": 0.8901433702611192, "grad_norm": 0.37856462597846985, "learning_rate": 1.5657718304921492e-06, "loss": 1.2665, "num_input_tokens_seen": 721361356, "step": 18370 }, { "epoch": 0.890627933881294, "grad_norm": 0.37885743379592896, "learning_rate": 1.5521331276399488e-06, "loss": 1.2479, "num_input_tokens_seen": 721724800, "step": 18380 }, { "epoch": 0.8911124975014688, "grad_norm": 0.40030255913734436, "learning_rate": 1.538552181839678e-06, "loss": 1.2585, "num_input_tokens_seen": 722116248, "step": 18390 }, { "epoch": 0.8915970611216436, "grad_norm": 0.43325039744377136, "learning_rate": 1.525029026544067e-06, "loss": 1.2446, "num_input_tokens_seen": 722489540, "step": 18400 }, { "epoch": 0.8920816247418184, "grad_norm": 0.3845292627811432, "learning_rate": 1.511563695063517e-06, "loss": 1.258, "num_input_tokens_seen": 722864464, "step": 18410 }, { "epoch": 0.8925661883619933, "grad_norm": 0.4011618196964264, "learning_rate": 1.4981562205659772e-06, "loss": 1.1975, "num_input_tokens_seen": 723261460, "step": 18420 }, { "epoch": 0.8930507519821681, "grad_norm": 0.4230853021144867, "learning_rate": 1.4848066360768935e-06, "loss": 1.2705, "num_input_tokens_seen": 723627044, "step": 18430 }, { "epoch": 0.8935353156023429, "grad_norm": 0.41304829716682434, "learning_rate": 1.4715149744791156e-06, "loss": 1.2206, "num_input_tokens_seen": 724019484, "step": 18440 }, { "epoch": 0.8940198792225177, "grad_norm": 0.40190738439559937, "learning_rate": 1.458281268512815e-06, "loss": 1.2595, "num_input_tokens_seen": 724395804, "step": 18450 }, { "epoch": 0.8945044428426925, "grad_norm": 0.40311187505722046, "learning_rate": 1.445105550775408e-06, "loss": 1.2737, "num_input_tokens_seen": 724797948, "step": 18460 }, { "epoch": 0.8949890064628673, "grad_norm": 0.42915475368499756, "learning_rate": 1.431987853721467e-06, "loss": 1.2391, "num_input_tokens_seen": 725183560, "step": 18470 }, { "epoch": 0.8954735700830421, "grad_norm": 0.406745046377182, "learning_rate": 1.4189282096626593e-06, "loss": 1.2801, "num_input_tokens_seen": 725560960, "step": 18480 }, { "epoch": 0.8959581337032169, "grad_norm": 0.43271973729133606, "learning_rate": 1.405926650767639e-06, "loss": 1.2748, "num_input_tokens_seen": 725936388, "step": 18490 }, { "epoch": 0.8964426973233917, "grad_norm": 0.41202259063720703, "learning_rate": 1.3929832090620043e-06, "loss": 1.2548, "num_input_tokens_seen": 726331564, "step": 18500 }, { "epoch": 0.8969272609435666, "grad_norm": 0.4043997526168823, "learning_rate": 1.3800979164281775e-06, "loss": 1.2473, "num_input_tokens_seen": 726700276, "step": 18510 }, { "epoch": 0.8974118245637414, "grad_norm": 0.38937658071517944, "learning_rate": 1.3672708046053668e-06, "loss": 1.2406, "num_input_tokens_seen": 727097284, "step": 18520 }, { "epoch": 0.8978963881839161, "grad_norm": 0.4417135715484619, "learning_rate": 1.3545019051894537e-06, "loss": 1.2141, "num_input_tokens_seen": 727502228, "step": 18530 }, { "epoch": 0.8983809518040909, "grad_norm": 0.393096923828125, "learning_rate": 1.34179124963294e-06, "loss": 1.2459, "num_input_tokens_seen": 727852368, "step": 18540 }, { "epoch": 0.8988655154242657, "grad_norm": 0.42874211072921753, "learning_rate": 1.3291388692448503e-06, "loss": 1.2397, "num_input_tokens_seen": 728254728, "step": 18550 }, { "epoch": 0.8993500790444405, "grad_norm": 0.40823593735694885, "learning_rate": 1.3165447951906773e-06, "loss": 1.2552, "num_input_tokens_seen": 728648556, "step": 18560 }, { "epoch": 0.8998346426646153, "grad_norm": 0.37802520394325256, "learning_rate": 1.3040090584922921e-06, "loss": 1.2287, "num_input_tokens_seen": 729028520, "step": 18570 }, { "epoch": 0.9003192062847901, "grad_norm": 0.39629054069519043, "learning_rate": 1.29153169002785e-06, "loss": 1.2521, "num_input_tokens_seen": 729403340, "step": 18580 }, { "epoch": 0.9008037699049649, "grad_norm": 0.38545021414756775, "learning_rate": 1.2791127205317583e-06, "loss": 1.2657, "num_input_tokens_seen": 729803816, "step": 18590 }, { "epoch": 0.9012883335251398, "grad_norm": 0.457553893327713, "learning_rate": 1.2667521805945577e-06, "loss": 1.2545, "num_input_tokens_seen": 730207652, "step": 18600 }, { "epoch": 0.9017728971453146, "grad_norm": 0.42532676458358765, "learning_rate": 1.2544501006628768e-06, "loss": 1.2469, "num_input_tokens_seen": 730598644, "step": 18610 }, { "epoch": 0.9022574607654894, "grad_norm": 0.4223385155200958, "learning_rate": 1.2422065110393317e-06, "loss": 1.2694, "num_input_tokens_seen": 730997112, "step": 18620 }, { "epoch": 0.9027420243856642, "grad_norm": 0.404644638299942, "learning_rate": 1.2300214418824756e-06, "loss": 1.2535, "num_input_tokens_seen": 731392988, "step": 18630 }, { "epoch": 0.903226588005839, "grad_norm": 0.39279234409332275, "learning_rate": 1.2178949232067056e-06, "loss": 1.2253, "num_input_tokens_seen": 731765160, "step": 18640 }, { "epoch": 0.9037111516260138, "grad_norm": 0.41039055585861206, "learning_rate": 1.2058269848822052e-06, "loss": 1.2272, "num_input_tokens_seen": 732135724, "step": 18650 }, { "epoch": 0.9041957152461886, "grad_norm": 0.38321247696876526, "learning_rate": 1.1938176566348518e-06, "loss": 1.2478, "num_input_tokens_seen": 732558172, "step": 18660 }, { "epoch": 0.9046802788663634, "grad_norm": 0.47206875681877136, "learning_rate": 1.1818669680461636e-06, "loss": 1.2375, "num_input_tokens_seen": 732961984, "step": 18670 }, { "epoch": 0.9051648424865382, "grad_norm": 0.3947557210922241, "learning_rate": 1.169974948553207e-06, "loss": 1.2745, "num_input_tokens_seen": 733342248, "step": 18680 }, { "epoch": 0.9056494061067131, "grad_norm": 0.3839742839336395, "learning_rate": 1.1581416274485447e-06, "loss": 1.2215, "num_input_tokens_seen": 733722272, "step": 18690 }, { "epoch": 0.9061339697268879, "grad_norm": 0.4032100737094879, "learning_rate": 1.146367033880147e-06, "loss": 1.2589, "num_input_tokens_seen": 734138572, "step": 18700 }, { "epoch": 0.9066185333470627, "grad_norm": 0.42585238814353943, "learning_rate": 1.1346511968513218e-06, "loss": 1.2102, "num_input_tokens_seen": 734513304, "step": 18710 }, { "epoch": 0.9071030969672375, "grad_norm": 0.4173504412174225, "learning_rate": 1.122994145220657e-06, "loss": 1.2864, "num_input_tokens_seen": 734919300, "step": 18720 }, { "epoch": 0.9075876605874122, "grad_norm": 0.4611232876777649, "learning_rate": 1.1113959077019315e-06, "loss": 1.2136, "num_input_tokens_seen": 735332452, "step": 18730 }, { "epoch": 0.908072224207587, "grad_norm": 0.410653680562973, "learning_rate": 1.0998565128640615e-06, "loss": 1.2745, "num_input_tokens_seen": 735714372, "step": 18740 }, { "epoch": 0.9085567878277618, "grad_norm": 0.39335983991622925, "learning_rate": 1.0883759891310047e-06, "loss": 1.2148, "num_input_tokens_seen": 736147668, "step": 18750 }, { "epoch": 0.9090413514479366, "grad_norm": 0.4009684920310974, "learning_rate": 1.0769543647817293e-06, "loss": 1.255, "num_input_tokens_seen": 736533020, "step": 18760 }, { "epoch": 0.9095259150681114, "grad_norm": 0.39060232043266296, "learning_rate": 1.0655916679501026e-06, "loss": 1.28, "num_input_tokens_seen": 736930600, "step": 18770 }, { "epoch": 0.9100104786882863, "grad_norm": 0.3796159327030182, "learning_rate": 1.0542879266248501e-06, "loss": 1.295, "num_input_tokens_seen": 737329028, "step": 18780 }, { "epoch": 0.9104950423084611, "grad_norm": 0.41085562109947205, "learning_rate": 1.0430431686494768e-06, "loss": 1.2471, "num_input_tokens_seen": 737729856, "step": 18790 }, { "epoch": 0.9109796059286359, "grad_norm": 0.38016316294670105, "learning_rate": 1.031857421722196e-06, "loss": 1.2601, "num_input_tokens_seen": 738134952, "step": 18800 }, { "epoch": 0.9114641695488107, "grad_norm": 0.42966845631599426, "learning_rate": 1.0207307133958676e-06, "loss": 1.2398, "num_input_tokens_seen": 738559280, "step": 18810 }, { "epoch": 0.9119487331689855, "grad_norm": 0.39582687616348267, "learning_rate": 1.0096630710779264e-06, "loss": 1.2218, "num_input_tokens_seen": 738942248, "step": 18820 }, { "epoch": 0.9124332967891603, "grad_norm": 0.3957637846469879, "learning_rate": 9.98654522030318e-07, "loss": 1.2582, "num_input_tokens_seen": 739314768, "step": 18830 }, { "epoch": 0.9129178604093351, "grad_norm": 0.4404972493648529, "learning_rate": 9.877050933694176e-07, "loss": 1.2526, "num_input_tokens_seen": 739693344, "step": 18840 }, { "epoch": 0.9134024240295099, "grad_norm": 0.4136838912963867, "learning_rate": 9.768148120659903e-07, "loss": 1.2503, "num_input_tokens_seen": 740095948, "step": 18850 }, { "epoch": 0.9138869876496847, "grad_norm": 0.4641477167606354, "learning_rate": 9.65983704945092e-07, "loss": 1.2321, "num_input_tokens_seen": 740509384, "step": 18860 }, { "epoch": 0.9143715512698596, "grad_norm": 0.37187913060188293, "learning_rate": 9.55211798686037e-07, "loss": 1.2635, "num_input_tokens_seen": 740899344, "step": 18870 }, { "epoch": 0.9148561148900344, "grad_norm": 0.4150676131248474, "learning_rate": 9.444991198223008e-07, "loss": 1.2356, "num_input_tokens_seen": 741314140, "step": 18880 }, { "epoch": 0.9153406785102092, "grad_norm": 0.413786917924881, "learning_rate": 9.338456947414837e-07, "loss": 1.2555, "num_input_tokens_seen": 741697844, "step": 18890 }, { "epoch": 0.915825242130384, "grad_norm": 0.49498802423477173, "learning_rate": 9.23251549685214e-07, "loss": 1.2668, "num_input_tokens_seen": 742079296, "step": 18900 }, { "epoch": 0.9163098057505588, "grad_norm": 0.39548417925834656, "learning_rate": 9.127167107491174e-07, "loss": 1.2235, "num_input_tokens_seen": 742477492, "step": 18910 }, { "epoch": 0.9167943693707336, "grad_norm": 0.3953016698360443, "learning_rate": 9.022412038827227e-07, "loss": 1.2328, "num_input_tokens_seen": 742879824, "step": 18920 }, { "epoch": 0.9172789329909083, "grad_norm": 0.422567754983902, "learning_rate": 8.918250548894225e-07, "loss": 1.2207, "num_input_tokens_seen": 743283828, "step": 18930 }, { "epoch": 0.9177634966110831, "grad_norm": 0.4394066333770752, "learning_rate": 8.814682894263904e-07, "loss": 1.2525, "num_input_tokens_seen": 743671012, "step": 18940 }, { "epoch": 0.9182480602312579, "grad_norm": 0.3766193389892578, "learning_rate": 8.711709330045309e-07, "loss": 1.2441, "num_input_tokens_seen": 744042668, "step": 18950 }, { "epoch": 0.9187326238514328, "grad_norm": 0.4037720561027527, "learning_rate": 8.609330109884045e-07, "loss": 1.3088, "num_input_tokens_seen": 744414936, "step": 18960 }, { "epoch": 0.9192171874716076, "grad_norm": 0.38858434557914734, "learning_rate": 8.507545485961804e-07, "loss": 1.2561, "num_input_tokens_seen": 744849212, "step": 18970 }, { "epoch": 0.9197017510917824, "grad_norm": 0.44780433177948, "learning_rate": 8.406355708995672e-07, "loss": 1.2863, "num_input_tokens_seen": 745242736, "step": 18980 }, { "epoch": 0.9201863147119572, "grad_norm": 0.4300279915332794, "learning_rate": 8.305761028237353e-07, "loss": 1.2229, "num_input_tokens_seen": 745670052, "step": 18990 }, { "epoch": 0.920670878332132, "grad_norm": 0.4177449345588684, "learning_rate": 8.205761691472913e-07, "loss": 1.2178, "num_input_tokens_seen": 746084620, "step": 19000 }, { "epoch": 0.9211554419523068, "grad_norm": 0.38727110624313354, "learning_rate": 8.106357945021765e-07, "loss": 1.2613, "num_input_tokens_seen": 746466352, "step": 19010 }, { "epoch": 0.9216400055724816, "grad_norm": 0.4461565315723419, "learning_rate": 8.007550033736405e-07, "loss": 1.2423, "num_input_tokens_seen": 746860664, "step": 19020 }, { "epoch": 0.9221245691926564, "grad_norm": 0.4176614582538605, "learning_rate": 7.909338201001564e-07, "loss": 1.2664, "num_input_tokens_seen": 747238188, "step": 19030 }, { "epoch": 0.9226091328128312, "grad_norm": 0.4129350781440735, "learning_rate": 7.811722688733786e-07, "loss": 1.2481, "num_input_tokens_seen": 747640208, "step": 19040 }, { "epoch": 0.9230936964330061, "grad_norm": 0.40543922781944275, "learning_rate": 7.714703737380674e-07, "loss": 1.2575, "num_input_tokens_seen": 748033864, "step": 19050 }, { "epoch": 0.9235782600531809, "grad_norm": 0.4024973511695862, "learning_rate": 7.618281585920456e-07, "loss": 1.2477, "num_input_tokens_seen": 748427680, "step": 19060 }, { "epoch": 0.9240628236733557, "grad_norm": 0.3884994387626648, "learning_rate": 7.522456471861172e-07, "loss": 1.2325, "num_input_tokens_seen": 748812864, "step": 19070 }, { "epoch": 0.9245473872935305, "grad_norm": 0.38652002811431885, "learning_rate": 7.427228631240457e-07, "loss": 1.2592, "num_input_tokens_seen": 749169748, "step": 19080 }, { "epoch": 0.9250319509137053, "grad_norm": 0.4229782521724701, "learning_rate": 7.33259829862451e-07, "loss": 1.2767, "num_input_tokens_seen": 749552652, "step": 19090 }, { "epoch": 0.9255165145338801, "grad_norm": 0.3931877315044403, "learning_rate": 7.238565707107875e-07, "loss": 1.2596, "num_input_tokens_seen": 749938204, "step": 19100 }, { "epoch": 0.9260010781540549, "grad_norm": 0.42200735211372375, "learning_rate": 7.145131088312745e-07, "loss": 1.2536, "num_input_tokens_seen": 750322924, "step": 19110 }, { "epoch": 0.9264856417742297, "grad_norm": 0.40737777948379517, "learning_rate": 7.052294672388271e-07, "loss": 1.2143, "num_input_tokens_seen": 750735180, "step": 19120 }, { "epoch": 0.9269702053944046, "grad_norm": 0.41996297240257263, "learning_rate": 6.960056688010197e-07, "loss": 1.2552, "num_input_tokens_seen": 751134712, "step": 19130 }, { "epoch": 0.9274547690145793, "grad_norm": 0.40312203764915466, "learning_rate": 6.868417362380114e-07, "loss": 1.2409, "num_input_tokens_seen": 751509996, "step": 19140 }, { "epoch": 0.9279393326347541, "grad_norm": 0.3921646773815155, "learning_rate": 6.777376921225125e-07, "loss": 1.2459, "num_input_tokens_seen": 751937076, "step": 19150 }, { "epoch": 0.9284238962549289, "grad_norm": 0.4335014522075653, "learning_rate": 6.686935588797072e-07, "loss": 1.2383, "num_input_tokens_seen": 752333820, "step": 19160 }, { "epoch": 0.9289084598751037, "grad_norm": 0.42503783106803894, "learning_rate": 6.597093587872055e-07, "loss": 1.2457, "num_input_tokens_seen": 752732964, "step": 19170 }, { "epoch": 0.9293930234952785, "grad_norm": 0.4474683701992035, "learning_rate": 6.507851139749888e-07, "loss": 1.2576, "num_input_tokens_seen": 753114736, "step": 19180 }, { "epoch": 0.9298775871154533, "grad_norm": 0.40870001912117004, "learning_rate": 6.419208464253618e-07, "loss": 1.2785, "num_input_tokens_seen": 753496580, "step": 19190 }, { "epoch": 0.9303621507356281, "grad_norm": 0.4168875217437744, "learning_rate": 6.331165779728865e-07, "loss": 1.2859, "num_input_tokens_seen": 753875144, "step": 19200 }, { "epoch": 0.9308467143558029, "grad_norm": 0.3911028504371643, "learning_rate": 6.243723303043403e-07, "loss": 1.2791, "num_input_tokens_seen": 754260416, "step": 19210 }, { "epoch": 0.9313312779759778, "grad_norm": 0.4180554151535034, "learning_rate": 6.156881249586493e-07, "loss": 1.2427, "num_input_tokens_seen": 754653980, "step": 19220 }, { "epoch": 0.9318158415961526, "grad_norm": 0.4038565158843994, "learning_rate": 6.070639833268471e-07, "loss": 1.2551, "num_input_tokens_seen": 755023452, "step": 19230 }, { "epoch": 0.9323004052163274, "grad_norm": 0.433095246553421, "learning_rate": 5.984999266520214e-07, "loss": 1.2713, "num_input_tokens_seen": 755440128, "step": 19240 }, { "epoch": 0.9327849688365022, "grad_norm": 0.4306170642375946, "learning_rate": 5.899959760292478e-07, "loss": 1.2995, "num_input_tokens_seen": 755840776, "step": 19250 }, { "epoch": 0.933269532456677, "grad_norm": 0.4218454658985138, "learning_rate": 5.815521524055623e-07, "loss": 1.2378, "num_input_tokens_seen": 756263932, "step": 19260 }, { "epoch": 0.9337540960768518, "grad_norm": 0.3932402729988098, "learning_rate": 5.731684765798772e-07, "loss": 1.2506, "num_input_tokens_seen": 756648804, "step": 19270 }, { "epoch": 0.9342386596970266, "grad_norm": 0.4214298129081726, "learning_rate": 5.648449692029656e-07, "loss": 1.2264, "num_input_tokens_seen": 757063844, "step": 19280 }, { "epoch": 0.9347232233172014, "grad_norm": 0.4255639612674713, "learning_rate": 5.565816507773797e-07, "loss": 1.2217, "num_input_tokens_seen": 757437036, "step": 19290 }, { "epoch": 0.9352077869373762, "grad_norm": 0.3967702090740204, "learning_rate": 5.483785416574239e-07, "loss": 1.3114, "num_input_tokens_seen": 757815468, "step": 19300 }, { "epoch": 0.9356923505575511, "grad_norm": 0.4092276096343994, "learning_rate": 5.402356620490878e-07, "loss": 1.2753, "num_input_tokens_seen": 758208344, "step": 19310 }, { "epoch": 0.9361769141777259, "grad_norm": 0.40975773334503174, "learning_rate": 5.321530320100076e-07, "loss": 1.2785, "num_input_tokens_seen": 758618456, "step": 19320 }, { "epoch": 0.9366614777979007, "grad_norm": 0.44307002425193787, "learning_rate": 5.241306714494021e-07, "loss": 1.2863, "num_input_tokens_seen": 759009464, "step": 19330 }, { "epoch": 0.9371460414180754, "grad_norm": 0.39563581347465515, "learning_rate": 5.161686001280503e-07, "loss": 1.2356, "num_input_tokens_seen": 759418492, "step": 19340 }, { "epoch": 0.9376306050382502, "grad_norm": 0.4311593472957611, "learning_rate": 5.082668376582111e-07, "loss": 1.2691, "num_input_tokens_seen": 759767128, "step": 19350 }, { "epoch": 0.938115168658425, "grad_norm": 0.3757501542568207, "learning_rate": 5.00425403503596e-07, "loss": 1.2662, "num_input_tokens_seen": 760168828, "step": 19360 }, { "epoch": 0.9385997322785998, "grad_norm": 0.4447803795337677, "learning_rate": 4.926443169793154e-07, "loss": 1.3027, "num_input_tokens_seen": 760588676, "step": 19370 }, { "epoch": 0.9390842958987746, "grad_norm": 0.4607948958873749, "learning_rate": 4.849235972518295e-07, "loss": 1.2566, "num_input_tokens_seen": 760956864, "step": 19380 }, { "epoch": 0.9395688595189494, "grad_norm": 0.3940238058567047, "learning_rate": 4.772632633389063e-07, "loss": 1.2854, "num_input_tokens_seen": 761318912, "step": 19390 }, { "epoch": 0.9400534231391243, "grad_norm": 0.4033295810222626, "learning_rate": 4.6966333410956023e-07, "loss": 1.2041, "num_input_tokens_seen": 761704264, "step": 19400 }, { "epoch": 0.9405379867592991, "grad_norm": 0.4064282178878784, "learning_rate": 4.621238282840279e-07, "loss": 1.24, "num_input_tokens_seen": 762085876, "step": 19410 }, { "epoch": 0.9410225503794739, "grad_norm": 0.39992567896842957, "learning_rate": 4.546447644337065e-07, "loss": 1.2397, "num_input_tokens_seen": 762474368, "step": 19420 }, { "epoch": 0.9415071139996487, "grad_norm": 0.4709777235984802, "learning_rate": 4.4722616098110684e-07, "loss": 1.2858, "num_input_tokens_seen": 762874704, "step": 19430 }, { "epoch": 0.9419916776198235, "grad_norm": 0.4245252311229706, "learning_rate": 4.3986803619981973e-07, "loss": 1.2707, "num_input_tokens_seen": 763268244, "step": 19440 }, { "epoch": 0.9424762412399983, "grad_norm": 0.44352298974990845, "learning_rate": 4.325704082144666e-07, "loss": 1.2645, "num_input_tokens_seen": 763643580, "step": 19450 }, { "epoch": 0.9429608048601731, "grad_norm": 0.39565107226371765, "learning_rate": 4.2533329500063776e-07, "loss": 1.2278, "num_input_tokens_seen": 764054380, "step": 19460 }, { "epoch": 0.9434453684803479, "grad_norm": 0.3986281454563141, "learning_rate": 4.181567143848819e-07, "loss": 1.2794, "num_input_tokens_seen": 764458228, "step": 19470 }, { "epoch": 0.9439299321005227, "grad_norm": 0.4145820140838623, "learning_rate": 4.1104068404462514e-07, "loss": 1.242, "num_input_tokens_seen": 764816404, "step": 19480 }, { "epoch": 0.9444144957206976, "grad_norm": 0.42509499192237854, "learning_rate": 4.039852215081602e-07, "loss": 1.2862, "num_input_tokens_seen": 765190032, "step": 19490 }, { "epoch": 0.9448990593408724, "grad_norm": 0.38184764981269836, "learning_rate": 3.969903441545825e-07, "loss": 1.2225, "num_input_tokens_seen": 765565508, "step": 19500 }, { "epoch": 0.9453836229610472, "grad_norm": 0.460821270942688, "learning_rate": 3.9005606921375117e-07, "loss": 1.2652, "num_input_tokens_seen": 765974548, "step": 19510 }, { "epoch": 0.945868186581222, "grad_norm": 0.4286702573299408, "learning_rate": 3.831824137662504e-07, "loss": 1.2459, "num_input_tokens_seen": 766399956, "step": 19520 }, { "epoch": 0.9463527502013968, "grad_norm": 0.38720691204071045, "learning_rate": 3.7636939474334775e-07, "loss": 1.2141, "num_input_tokens_seen": 766793544, "step": 19530 }, { "epoch": 0.9468373138215715, "grad_norm": 0.3998976945877075, "learning_rate": 3.696170289269524e-07, "loss": 1.2611, "num_input_tokens_seen": 767186876, "step": 19540 }, { "epoch": 0.9473218774417463, "grad_norm": 0.4127962291240692, "learning_rate": 3.6292533294955966e-07, "loss": 1.2499, "num_input_tokens_seen": 767600456, "step": 19550 }, { "epoch": 0.9478064410619211, "grad_norm": 0.4086759388446808, "learning_rate": 3.5629432329424006e-07, "loss": 1.2433, "num_input_tokens_seen": 768032396, "step": 19560 }, { "epoch": 0.9482910046820959, "grad_norm": 0.3765683174133301, "learning_rate": 3.49724016294567e-07, "loss": 1.2663, "num_input_tokens_seen": 768403312, "step": 19570 }, { "epoch": 0.9487755683022708, "grad_norm": 0.47519636154174805, "learning_rate": 3.432144281345973e-07, "loss": 1.2692, "num_input_tokens_seen": 768763468, "step": 19580 }, { "epoch": 0.9492601319224456, "grad_norm": 0.41748225688934326, "learning_rate": 3.3676557484881855e-07, "loss": 1.2243, "num_input_tokens_seen": 769110720, "step": 19590 }, { "epoch": 0.9497446955426204, "grad_norm": 0.4386734962463379, "learning_rate": 3.30377472322127e-07, "loss": 1.2672, "num_input_tokens_seen": 769514668, "step": 19600 }, { "epoch": 0.9502292591627952, "grad_norm": 0.42562198638916016, "learning_rate": 3.2405013628976077e-07, "loss": 1.2504, "num_input_tokens_seen": 769901384, "step": 19610 }, { "epoch": 0.95071382278297, "grad_norm": 0.4268267750740051, "learning_rate": 3.1778358233729154e-07, "loss": 1.2657, "num_input_tokens_seen": 770308688, "step": 19620 }, { "epoch": 0.9511983864031448, "grad_norm": 0.4298213720321655, "learning_rate": 3.1157782590056637e-07, "loss": 1.1705, "num_input_tokens_seen": 770709140, "step": 19630 }, { "epoch": 0.9516829500233196, "grad_norm": 0.4501436650753021, "learning_rate": 3.054328822656688e-07, "loss": 1.2964, "num_input_tokens_seen": 771100080, "step": 19640 }, { "epoch": 0.9521675136434944, "grad_norm": 0.43033567070961, "learning_rate": 2.9934876656890207e-07, "loss": 1.2118, "num_input_tokens_seen": 771491372, "step": 19650 }, { "epoch": 0.9526520772636692, "grad_norm": 0.4044438302516937, "learning_rate": 2.933254937967228e-07, "loss": 1.2532, "num_input_tokens_seen": 771854668, "step": 19660 }, { "epoch": 0.9531366408838441, "grad_norm": 0.38256967067718506, "learning_rate": 2.8736307878572957e-07, "loss": 1.2422, "num_input_tokens_seen": 772233856, "step": 19670 }, { "epoch": 0.9536212045040189, "grad_norm": 0.4290720224380493, "learning_rate": 2.8146153622260475e-07, "loss": 1.2342, "num_input_tokens_seen": 772629696, "step": 19680 }, { "epoch": 0.9541057681241937, "grad_norm": 0.3771139681339264, "learning_rate": 2.7562088064410354e-07, "loss": 1.2594, "num_input_tokens_seen": 773032252, "step": 19690 }, { "epoch": 0.9545903317443685, "grad_norm": 0.4023270606994629, "learning_rate": 2.6984112643698434e-07, "loss": 1.2354, "num_input_tokens_seen": 773437064, "step": 19700 }, { "epoch": 0.9550748953645433, "grad_norm": 0.42096689343452454, "learning_rate": 2.641222878380117e-07, "loss": 1.2892, "num_input_tokens_seen": 773816632, "step": 19710 }, { "epoch": 0.9555594589847181, "grad_norm": 0.3777530789375305, "learning_rate": 2.5846437893388977e-07, "loss": 1.2892, "num_input_tokens_seen": 774228364, "step": 19720 }, { "epoch": 0.9560440226048929, "grad_norm": 0.43396878242492676, "learning_rate": 2.52867413661248e-07, "loss": 1.2451, "num_input_tokens_seen": 774601636, "step": 19730 }, { "epoch": 0.9565285862250676, "grad_norm": 0.39147117733955383, "learning_rate": 2.4733140580658897e-07, "loss": 1.2891, "num_input_tokens_seen": 774994136, "step": 19740 }, { "epoch": 0.9570131498452424, "grad_norm": 0.43016576766967773, "learning_rate": 2.4185636900627417e-07, "loss": 1.2161, "num_input_tokens_seen": 775389680, "step": 19750 }, { "epoch": 0.9574977134654173, "grad_norm": 0.4033438265323639, "learning_rate": 2.3644231674647688e-07, "loss": 1.2664, "num_input_tokens_seen": 775778568, "step": 19760 }, { "epoch": 0.9579822770855921, "grad_norm": 0.4349776804447174, "learning_rate": 2.3108926236314887e-07, "loss": 1.2115, "num_input_tokens_seen": 776157176, "step": 19770 }, { "epoch": 0.9584668407057669, "grad_norm": 0.4078141748905182, "learning_rate": 2.2579721904199824e-07, "loss": 1.2701, "num_input_tokens_seen": 776547680, "step": 19780 }, { "epoch": 0.9589514043259417, "grad_norm": 0.4067167639732361, "learning_rate": 2.2056619981844495e-07, "loss": 1.2334, "num_input_tokens_seen": 776966012, "step": 19790 }, { "epoch": 0.9594359679461165, "grad_norm": 0.41261714696884155, "learning_rate": 2.153962175775931e-07, "loss": 1.2612, "num_input_tokens_seen": 777377344, "step": 19800 }, { "epoch": 0.9599205315662913, "grad_norm": 0.3794651925563812, "learning_rate": 2.1028728505420313e-07, "loss": 1.2746, "num_input_tokens_seen": 777740524, "step": 19810 }, { "epoch": 0.9604050951864661, "grad_norm": 0.3992972671985626, "learning_rate": 2.0523941483265586e-07, "loss": 1.2302, "num_input_tokens_seen": 778140684, "step": 19820 }, { "epoch": 0.9608896588066409, "grad_norm": 0.4393966495990753, "learning_rate": 2.0025261934691897e-07, "loss": 1.2686, "num_input_tokens_seen": 778528860, "step": 19830 }, { "epoch": 0.9613742224268158, "grad_norm": 0.4146067798137665, "learning_rate": 1.9532691088053056e-07, "loss": 1.2656, "num_input_tokens_seen": 778900016, "step": 19840 }, { "epoch": 0.9618587860469906, "grad_norm": 0.43496647477149963, "learning_rate": 1.904623015665463e-07, "loss": 1.2421, "num_input_tokens_seen": 779282552, "step": 19850 }, { "epoch": 0.9623433496671654, "grad_norm": 0.43942153453826904, "learning_rate": 1.8565880338752838e-07, "loss": 1.2166, "num_input_tokens_seen": 779634656, "step": 19860 }, { "epoch": 0.9628279132873402, "grad_norm": 0.41181692481040955, "learning_rate": 1.8091642817550935e-07, "loss": 1.2809, "num_input_tokens_seen": 780011584, "step": 19870 }, { "epoch": 0.963312476907515, "grad_norm": 0.44306066632270813, "learning_rate": 1.762351876119589e-07, "loss": 1.2406, "num_input_tokens_seen": 780398180, "step": 19880 }, { "epoch": 0.9637970405276898, "grad_norm": 0.38546454906463623, "learning_rate": 1.7161509322776437e-07, "loss": 1.2775, "num_input_tokens_seen": 780787352, "step": 19890 }, { "epoch": 0.9642816041478646, "grad_norm": 0.425279825925827, "learning_rate": 1.6705615640319472e-07, "loss": 1.2662, "num_input_tokens_seen": 781181080, "step": 19900 }, { "epoch": 0.9647661677680394, "grad_norm": 0.4292610287666321, "learning_rate": 1.625583883678755e-07, "loss": 1.2033, "num_input_tokens_seen": 781590820, "step": 19910 }, { "epoch": 0.9652507313882142, "grad_norm": 0.44324782490730286, "learning_rate": 1.5812180020075563e-07, "loss": 1.2729, "num_input_tokens_seen": 782000476, "step": 19920 }, { "epoch": 0.9657352950083891, "grad_norm": 0.4017219543457031, "learning_rate": 1.537464028300961e-07, "loss": 1.2573, "num_input_tokens_seen": 782384008, "step": 19930 }, { "epoch": 0.9662198586285639, "grad_norm": 0.4076748490333557, "learning_rate": 1.4943220703342031e-07, "loss": 1.2051, "num_input_tokens_seen": 782770540, "step": 19940 }, { "epoch": 0.9667044222487386, "grad_norm": 0.4016420841217041, "learning_rate": 1.4517922343750546e-07, "loss": 1.2357, "num_input_tokens_seen": 783161236, "step": 19950 }, { "epoch": 0.9671889858689134, "grad_norm": 0.4644124507904053, "learning_rate": 1.4098746251834938e-07, "loss": 1.3148, "num_input_tokens_seen": 783549976, "step": 19960 }, { "epoch": 0.9676735494890882, "grad_norm": 0.4020436108112335, "learning_rate": 1.3685693460114835e-07, "loss": 1.2423, "num_input_tokens_seen": 783956176, "step": 19970 }, { "epoch": 0.968158113109263, "grad_norm": 0.42544135451316833, "learning_rate": 1.3278764986025816e-07, "loss": 1.2435, "num_input_tokens_seen": 784370068, "step": 19980 }, { "epoch": 0.9686426767294378, "grad_norm": 0.40704938769340515, "learning_rate": 1.2877961831919416e-07, "loss": 1.1991, "num_input_tokens_seen": 784774032, "step": 19990 }, { "epoch": 0.9691272403496126, "grad_norm": 0.45113271474838257, "learning_rate": 1.248328498505813e-07, "loss": 1.216, "num_input_tokens_seen": 785148304, "step": 20000 }, { "epoch": 0.9691272403496126, "eval_loss": 1.3588515520095825, "eval_runtime": 4.2882, "eval_samples_per_second": 34.98, "eval_steps_per_second": 4.431, "num_input_tokens_seen": 785148304, "step": 20000 }, { "epoch": 0.9696118039697874, "grad_norm": 0.4246367812156677, "learning_rate": 1.2094735417614579e-07, "loss": 1.3037, "num_input_tokens_seen": 785532360, "step": 20010 }, { "epoch": 0.9700963675899623, "grad_norm": 0.40048691630363464, "learning_rate": 1.1712314086668452e-07, "loss": 1.2873, "num_input_tokens_seen": 785948652, "step": 20020 }, { "epoch": 0.9705809312101371, "grad_norm": 0.39899271726608276, "learning_rate": 1.13360219342043e-07, "loss": 1.2443, "num_input_tokens_seen": 786340648, "step": 20030 }, { "epoch": 0.9710654948303119, "grad_norm": 0.41089197993278503, "learning_rate": 1.0965859887109575e-07, "loss": 1.2696, "num_input_tokens_seen": 786729636, "step": 20040 }, { "epoch": 0.9715500584504867, "grad_norm": 0.3835326135158539, "learning_rate": 1.0601828857171037e-07, "loss": 1.2485, "num_input_tokens_seen": 787122652, "step": 20050 }, { "epoch": 0.9720346220706615, "grad_norm": 0.43811744451522827, "learning_rate": 1.0243929741074465e-07, "loss": 1.2809, "num_input_tokens_seen": 787529572, "step": 20060 }, { "epoch": 0.9725191856908363, "grad_norm": 0.4033362865447998, "learning_rate": 9.892163420400779e-08, "loss": 1.2607, "num_input_tokens_seen": 787926640, "step": 20070 }, { "epoch": 0.9730037493110111, "grad_norm": 0.4612255096435547, "learning_rate": 9.546530761624928e-08, "loss": 1.2536, "num_input_tokens_seen": 788343440, "step": 20080 }, { "epoch": 0.9734883129311859, "grad_norm": 0.4130823314189911, "learning_rate": 9.20703261611311e-08, "loss": 1.2822, "num_input_tokens_seen": 788759176, "step": 20090 }, { "epoch": 0.9739728765513607, "grad_norm": 0.46311095356941223, "learning_rate": 8.873669820121111e-08, "loss": 1.2476, "num_input_tokens_seen": 789162040, "step": 20100 }, { "epoch": 0.9744574401715356, "grad_norm": 0.3985438942909241, "learning_rate": 8.546443194791808e-08, "loss": 1.2453, "num_input_tokens_seen": 789567068, "step": 20110 }, { "epoch": 0.9749420037917104, "grad_norm": 0.40944841504096985, "learning_rate": 8.22535354615378e-08, "loss": 1.2376, "num_input_tokens_seen": 789989960, "step": 20120 }, { "epoch": 0.9754265674118852, "grad_norm": 0.41747674345970154, "learning_rate": 7.910401665118528e-08, "loss": 1.2289, "num_input_tokens_seen": 790375840, "step": 20130 }, { "epoch": 0.97591113103206, "grad_norm": 0.41349682211875916, "learning_rate": 7.601588327479092e-08, "loss": 1.2703, "num_input_tokens_seen": 790758920, "step": 20140 }, { "epoch": 0.9763956946522347, "grad_norm": 0.4096575677394867, "learning_rate": 7.298914293907833e-08, "loss": 1.2, "num_input_tokens_seen": 791138556, "step": 20150 }, { "epoch": 0.9768802582724095, "grad_norm": 0.39834490418434143, "learning_rate": 7.002380309955314e-08, "loss": 1.2784, "num_input_tokens_seen": 791537516, "step": 20160 }, { "epoch": 0.9773648218925843, "grad_norm": 0.41982364654541016, "learning_rate": 6.711987106046979e-08, "loss": 1.2283, "num_input_tokens_seen": 791914936, "step": 20170 }, { "epoch": 0.9778493855127591, "grad_norm": 0.4055969715118408, "learning_rate": 6.427735397483148e-08, "loss": 1.243, "num_input_tokens_seen": 792312344, "step": 20180 }, { "epoch": 0.9783339491329339, "grad_norm": 0.490959495306015, "learning_rate": 6.149625884435407e-08, "loss": 1.2525, "num_input_tokens_seen": 792710660, "step": 20190 }, { "epoch": 0.9788185127531088, "grad_norm": 0.4319050908088684, "learning_rate": 5.877659251946332e-08, "loss": 1.2896, "num_input_tokens_seen": 793093572, "step": 20200 }, { "epoch": 0.9793030763732836, "grad_norm": 0.42025476694107056, "learning_rate": 5.611836169927276e-08, "loss": 1.2736, "num_input_tokens_seen": 793485640, "step": 20210 }, { "epoch": 0.9797876399934584, "grad_norm": 0.47515881061553955, "learning_rate": 5.35215729315669e-08, "loss": 1.3133, "num_input_tokens_seen": 793896728, "step": 20220 }, { "epoch": 0.9802722036136332, "grad_norm": 0.41248735785484314, "learning_rate": 5.0986232612787453e-08, "loss": 1.2762, "num_input_tokens_seen": 794295484, "step": 20230 }, { "epoch": 0.980756767233808, "grad_norm": 0.4190135896205902, "learning_rate": 4.851234698800833e-08, "loss": 1.2217, "num_input_tokens_seen": 794691580, "step": 20240 }, { "epoch": 0.9812413308539828, "grad_norm": 0.41553497314453125, "learning_rate": 4.609992215093839e-08, "loss": 1.2819, "num_input_tokens_seen": 795077668, "step": 20250 }, { "epoch": 0.9817258944741576, "grad_norm": 0.42695561051368713, "learning_rate": 4.374896404388818e-08, "loss": 1.252, "num_input_tokens_seen": 795454924, "step": 20260 }, { "epoch": 0.9822104580943324, "grad_norm": 0.3926050066947937, "learning_rate": 4.145947845776155e-08, "loss": 1.2739, "num_input_tokens_seen": 795863124, "step": 20270 }, { "epoch": 0.9826950217145072, "grad_norm": 0.4227527678012848, "learning_rate": 3.923147103204738e-08, "loss": 1.2595, "num_input_tokens_seen": 796247640, "step": 20280 }, { "epoch": 0.9831795853346821, "grad_norm": 0.40944164991378784, "learning_rate": 3.7064947254797365e-08, "loss": 1.3064, "num_input_tokens_seen": 796646028, "step": 20290 }, { "epoch": 0.9836641489548569, "grad_norm": 0.39912155270576477, "learning_rate": 3.4959912462620426e-08, "loss": 1.1981, "num_input_tokens_seen": 797037764, "step": 20300 }, { "epoch": 0.9841487125750317, "grad_norm": 0.4131697118282318, "learning_rate": 3.2916371840660544e-08, "loss": 1.2223, "num_input_tokens_seen": 797429064, "step": 20310 }, { "epoch": 0.9846332761952065, "grad_norm": 0.4304356873035431, "learning_rate": 3.093433042259119e-08, "loss": 1.2225, "num_input_tokens_seen": 797810616, "step": 20320 }, { "epoch": 0.9851178398153813, "grad_norm": 0.4309372007846832, "learning_rate": 2.9013793090598708e-08, "loss": 1.2713, "num_input_tokens_seen": 798207544, "step": 20330 }, { "epoch": 0.985602403435556, "grad_norm": 0.4649990200996399, "learning_rate": 2.715476457537114e-08, "loss": 1.2487, "num_input_tokens_seen": 798598064, "step": 20340 }, { "epoch": 0.9860869670557308, "grad_norm": 0.4426388740539551, "learning_rate": 2.535724945608997e-08, "loss": 1.2796, "num_input_tokens_seen": 798970944, "step": 20350 }, { "epoch": 0.9865715306759056, "grad_norm": 0.4075920879840851, "learning_rate": 2.3621252160413443e-08, "loss": 1.2605, "num_input_tokens_seen": 799356380, "step": 20360 }, { "epoch": 0.9870560942960804, "grad_norm": 0.40336674451828003, "learning_rate": 2.194677696447378e-08, "loss": 1.2305, "num_input_tokens_seen": 799753192, "step": 20370 }, { "epoch": 0.9875406579162553, "grad_norm": 0.4300762712955475, "learning_rate": 2.0333827992852217e-08, "loss": 1.22, "num_input_tokens_seen": 800120392, "step": 20380 }, { "epoch": 0.9880252215364301, "grad_norm": 0.4360959827899933, "learning_rate": 1.878240921858454e-08, "loss": 1.2784, "num_input_tokens_seen": 800495740, "step": 20390 }, { "epoch": 0.9885097851566049, "grad_norm": 0.4353114664554596, "learning_rate": 1.7292524463144445e-08, "loss": 1.2539, "num_input_tokens_seen": 800902016, "step": 20400 }, { "epoch": 0.9889943487767797, "grad_norm": 0.43556949496269226, "learning_rate": 1.5864177396432422e-08, "loss": 1.227, "num_input_tokens_seen": 801289824, "step": 20410 }, { "epoch": 0.9894789123969545, "grad_norm": 0.4041503071784973, "learning_rate": 1.4497371536770221e-08, "loss": 1.2591, "num_input_tokens_seen": 801688324, "step": 20420 }, { "epoch": 0.9899634760171293, "grad_norm": 0.42014312744140625, "learning_rate": 1.3192110250886957e-08, "loss": 1.1945, "num_input_tokens_seen": 802094388, "step": 20430 }, { "epoch": 0.9904480396373041, "grad_norm": 0.44136643409729004, "learning_rate": 1.1948396753919123e-08, "loss": 1.274, "num_input_tokens_seen": 802477196, "step": 20440 }, { "epoch": 0.9909326032574789, "grad_norm": 0.4420247972011566, "learning_rate": 1.0766234109393925e-08, "loss": 1.2007, "num_input_tokens_seen": 802877584, "step": 20450 }, { "epoch": 0.9914171668776537, "grad_norm": 0.4464600086212158, "learning_rate": 9.645625229232069e-09, "loss": 1.219, "num_input_tokens_seen": 803263948, "step": 20460 }, { "epoch": 0.9919017304978286, "grad_norm": 0.4290953278541565, "learning_rate": 8.586572873725551e-09, "loss": 1.2345, "num_input_tokens_seen": 803671476, "step": 20470 }, { "epoch": 0.9923862941180034, "grad_norm": 0.40474194288253784, "learning_rate": 7.589079651543207e-09, "loss": 1.2742, "num_input_tokens_seen": 804084040, "step": 20480 }, { "epoch": 0.9928708577381782, "grad_norm": 0.41104060411453247, "learning_rate": 6.653148019727939e-09, "loss": 1.2496, "num_input_tokens_seen": 804500276, "step": 20490 }, { "epoch": 0.993355421358353, "grad_norm": 0.3800486922264099, "learning_rate": 5.778780283671736e-09, "loss": 1.2768, "num_input_tokens_seen": 804875212, "step": 20500 }, { "epoch": 0.9938399849785278, "grad_norm": 0.421836256980896, "learning_rate": 4.96597859712955e-09, "loss": 1.2106, "num_input_tokens_seen": 805248472, "step": 20510 }, { "epoch": 0.9943245485987026, "grad_norm": 0.4047603905200958, "learning_rate": 4.2147449622026435e-09, "loss": 1.2057, "num_input_tokens_seen": 805647944, "step": 20520 }, { "epoch": 0.9948091122188774, "grad_norm": 0.3970736265182495, "learning_rate": 3.525081229338589e-09, "loss": 1.2238, "num_input_tokens_seen": 806023600, "step": 20530 }, { "epoch": 0.9952936758390521, "grad_norm": 0.4089398682117462, "learning_rate": 2.8969890973257198e-09, "loss": 1.2181, "num_input_tokens_seen": 806426272, "step": 20540 }, { "epoch": 0.995778239459227, "grad_norm": 0.39923080801963806, "learning_rate": 2.3304701132847996e-09, "loss": 1.2999, "num_input_tokens_seen": 806826016, "step": 20550 }, { "epoch": 0.9962628030794018, "grad_norm": 0.3768922686576843, "learning_rate": 1.8255256726745772e-09, "loss": 1.2206, "num_input_tokens_seen": 807236380, "step": 20560 }, { "epoch": 0.9967473666995766, "grad_norm": 0.39100906252861023, "learning_rate": 1.3821570192806832e-09, "loss": 1.266, "num_input_tokens_seen": 807644712, "step": 20570 }, { "epoch": 0.9972319303197514, "grad_norm": 0.40794339776039124, "learning_rate": 1.0003652452128532e-09, "loss": 1.2559, "num_input_tokens_seen": 808025808, "step": 20580 }, { "epoch": 0.9977164939399262, "grad_norm": 0.4562262296676636, "learning_rate": 6.801512909021535e-10, "loss": 1.2459, "num_input_tokens_seen": 808411128, "step": 20590 }, { "epoch": 0.998201057560101, "grad_norm": 0.41534632444381714, "learning_rate": 4.21515945106532e-10, "loss": 1.2734, "num_input_tokens_seen": 808793980, "step": 20600 }, { "epoch": 0.9986856211802758, "grad_norm": 0.4286879897117615, "learning_rate": 2.2445984489971593e-10, "loss": 1.2695, "num_input_tokens_seen": 809188808, "step": 20610 }, { "epoch": 0.9991701848004506, "grad_norm": 0.4052944779396057, "learning_rate": 8.898347567121157e-11, "loss": 1.2466, "num_input_tokens_seen": 809544636, "step": 20620 }, { "epoch": 0.9996547484206254, "grad_norm": 0.4112201929092407, "learning_rate": 1.5087171129080092e-11, "loss": 1.231, "num_input_tokens_seen": 809914964, "step": 20630 }, { "epoch": 0.9999939429547479, "num_input_tokens_seen": 810173896, "step": 20637, "total_flos": 4.073925813718745e+18, "train_loss": 1.2524831550218467, "train_runtime": 185903.2881, "train_samples_per_second": 14.209, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 20637, "num_input_tokens_seen": 810173896, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.073925813718745e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }