diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16637 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999939429547479, + "eval_steps": 2000, + "global_step": 20637, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048456362017480634, + "grad_norm": 1.9635685682296753, + "learning_rate": 8.064516129032258e-07, + "loss": 3.9647, + "num_input_tokens_seen": 396856, + "step": 10 + }, + { + "epoch": 0.0009691272403496127, + "grad_norm": 1.8676133155822754, + "learning_rate": 1.6129032258064516e-06, + "loss": 3.8773, + "num_input_tokens_seen": 797088, + "step": 20 + }, + { + "epoch": 0.001453690860524419, + "grad_norm": 1.3513129949569702, + "learning_rate": 2.4193548387096776e-06, + "loss": 3.7214, + "num_input_tokens_seen": 1195348, + "step": 30 + }, + { + "epoch": 0.0019382544806992254, + "grad_norm": 1.085821270942688, + "learning_rate": 3.225806451612903e-06, + "loss": 3.608, + "num_input_tokens_seen": 1563616, + "step": 40 + }, + { + "epoch": 0.0024228181008740315, + "grad_norm": 0.7733827233314514, + "learning_rate": 4.032258064516129e-06, + "loss": 3.4798, + "num_input_tokens_seen": 1953880, + "step": 50 + }, + { + "epoch": 0.002907381721048838, + "grad_norm": 0.7254723310470581, + "learning_rate": 4.838709677419355e-06, + "loss": 3.3992, + "num_input_tokens_seen": 2358888, + "step": 60 + }, + { + "epoch": 0.0033919453412236443, + "grad_norm": 0.692788302898407, + "learning_rate": 5.64516129032258e-06, + "loss": 3.287, + "num_input_tokens_seen": 2784136, + "step": 70 + }, + { + "epoch": 0.0038765089613984507, + "grad_norm": 0.7256464958190918, + "learning_rate": 6.451612903225806e-06, + "loss": 3.1694, + "num_input_tokens_seen": 3174820, + "step": 80 + }, + { + "epoch": 0.004361072581573257, + "grad_norm": 0.6375479102134705, + "learning_rate": 7.258064516129033e-06, + "loss": 3.093, + "num_input_tokens_seen": 3579040, + "step": 90 + }, + { + "epoch": 0.004845636201748063, + "grad_norm": 0.5901396870613098, + "learning_rate": 8.064516129032258e-06, + "loss": 2.9749, + "num_input_tokens_seen": 3959804, + "step": 100 + }, + { + "epoch": 0.00533019982192287, + "grad_norm": 0.5945073366165161, + "learning_rate": 8.870967741935484e-06, + "loss": 2.9903, + "num_input_tokens_seen": 4327636, + "step": 110 + }, + { + "epoch": 0.005814763442097676, + "grad_norm": 0.5525237917900085, + "learning_rate": 9.67741935483871e-06, + "loss": 2.9102, + "num_input_tokens_seen": 4722664, + "step": 120 + }, + { + "epoch": 0.006299327062272482, + "grad_norm": 0.5421098470687866, + "learning_rate": 1.0483870967741936e-05, + "loss": 2.8672, + "num_input_tokens_seen": 5108488, + "step": 130 + }, + { + "epoch": 0.006783890682447289, + "grad_norm": 0.6057586669921875, + "learning_rate": 1.129032258064516e-05, + "loss": 2.7504, + "num_input_tokens_seen": 5499032, + "step": 140 + }, + { + "epoch": 0.007268454302622095, + "grad_norm": 0.5147073864936829, + "learning_rate": 1.2096774193548388e-05, + "loss": 2.728, + "num_input_tokens_seen": 5899468, + "step": 150 + }, + { + "epoch": 0.0077530179227969015, + "grad_norm": 0.4975648820400238, + "learning_rate": 1.2903225806451613e-05, + "loss": 2.635, + "num_input_tokens_seen": 6294784, + "step": 160 + }, + { + "epoch": 0.008237581542971708, + "grad_norm": 0.6665644645690918, + "learning_rate": 1.3709677419354839e-05, + "loss": 2.6568, + "num_input_tokens_seen": 6724124, + "step": 170 + }, + { + "epoch": 0.008722145163146514, + "grad_norm": 0.5339853763580322, + "learning_rate": 1.4516129032258066e-05, + "loss": 2.6569, + "num_input_tokens_seen": 7126528, + "step": 180 + }, + { + "epoch": 0.00920670878332132, + "grad_norm": 0.6043190956115723, + "learning_rate": 1.5322580645161292e-05, + "loss": 2.5659, + "num_input_tokens_seen": 7539124, + "step": 190 + }, + { + "epoch": 0.009691272403496126, + "grad_norm": 0.576831579208374, + "learning_rate": 1.6129032258064517e-05, + "loss": 2.6224, + "num_input_tokens_seen": 7943312, + "step": 200 + }, + { + "epoch": 0.010175836023670932, + "grad_norm": 0.5602587461471558, + "learning_rate": 1.693548387096774e-05, + "loss": 2.5489, + "num_input_tokens_seen": 8345264, + "step": 210 + }, + { + "epoch": 0.01066039964384574, + "grad_norm": 0.5199883580207825, + "learning_rate": 1.774193548387097e-05, + "loss": 2.4895, + "num_input_tokens_seen": 8724596, + "step": 220 + }, + { + "epoch": 0.011144963264020546, + "grad_norm": 0.5386412143707275, + "learning_rate": 1.8548387096774193e-05, + "loss": 2.4376, + "num_input_tokens_seen": 9128556, + "step": 230 + }, + { + "epoch": 0.011629526884195352, + "grad_norm": 0.4958534836769104, + "learning_rate": 1.935483870967742e-05, + "loss": 2.4393, + "num_input_tokens_seen": 9531512, + "step": 240 + }, + { + "epoch": 0.012114090504370158, + "grad_norm": 0.5383809208869934, + "learning_rate": 2.0161290322580645e-05, + "loss": 2.408, + "num_input_tokens_seen": 9926976, + "step": 250 + }, + { + "epoch": 0.012598654124544964, + "grad_norm": 0.5953584313392639, + "learning_rate": 2.0967741935483873e-05, + "loss": 2.4448, + "num_input_tokens_seen": 10307532, + "step": 260 + }, + { + "epoch": 0.013083217744719771, + "grad_norm": 0.6360442638397217, + "learning_rate": 2.1774193548387097e-05, + "loss": 2.4015, + "num_input_tokens_seen": 10680264, + "step": 270 + }, + { + "epoch": 0.013567781364894577, + "grad_norm": 0.48746633529663086, + "learning_rate": 2.258064516129032e-05, + "loss": 2.3641, + "num_input_tokens_seen": 11070464, + "step": 280 + }, + { + "epoch": 0.014052344985069383, + "grad_norm": 0.6602309346199036, + "learning_rate": 2.338709677419355e-05, + "loss": 2.3722, + "num_input_tokens_seen": 11470192, + "step": 290 + }, + { + "epoch": 0.01453690860524419, + "grad_norm": 0.5370129346847534, + "learning_rate": 2.4193548387096777e-05, + "loss": 2.3605, + "num_input_tokens_seen": 11872288, + "step": 300 + }, + { + "epoch": 0.015021472225418997, + "grad_norm": 0.575202226638794, + "learning_rate": 2.5e-05, + "loss": 2.3164, + "num_input_tokens_seen": 12282104, + "step": 310 + }, + { + "epoch": 0.015506035845593803, + "grad_norm": 0.5220383405685425, + "learning_rate": 2.5806451612903226e-05, + "loss": 2.3395, + "num_input_tokens_seen": 12670432, + "step": 320 + }, + { + "epoch": 0.01599059946576861, + "grad_norm": 0.6971769332885742, + "learning_rate": 2.661290322580645e-05, + "loss": 2.3035, + "num_input_tokens_seen": 13048952, + "step": 330 + }, + { + "epoch": 0.016475163085943417, + "grad_norm": 0.5094983577728271, + "learning_rate": 2.7419354838709678e-05, + "loss": 2.2668, + "num_input_tokens_seen": 13454104, + "step": 340 + }, + { + "epoch": 0.01695972670611822, + "grad_norm": 0.5272497534751892, + "learning_rate": 2.822580645161291e-05, + "loss": 2.2398, + "num_input_tokens_seen": 13846768, + "step": 350 + }, + { + "epoch": 0.01744429032629303, + "grad_norm": 0.5047647356987, + "learning_rate": 2.9032258064516133e-05, + "loss": 2.2096, + "num_input_tokens_seen": 14247376, + "step": 360 + }, + { + "epoch": 0.017928853946467833, + "grad_norm": 0.5520374178886414, + "learning_rate": 2.9838709677419357e-05, + "loss": 2.2371, + "num_input_tokens_seen": 14644760, + "step": 370 + }, + { + "epoch": 0.01841341756664264, + "grad_norm": 0.46873587369918823, + "learning_rate": 3.0645161290322585e-05, + "loss": 2.2636, + "num_input_tokens_seen": 15030968, + "step": 380 + }, + { + "epoch": 0.018897981186817448, + "grad_norm": 0.5534300208091736, + "learning_rate": 3.1451612903225806e-05, + "loss": 2.1924, + "num_input_tokens_seen": 15442684, + "step": 390 + }, + { + "epoch": 0.019382544806992252, + "grad_norm": 0.5355961918830872, + "learning_rate": 3.2258064516129034e-05, + "loss": 2.1482, + "num_input_tokens_seen": 15823228, + "step": 400 + }, + { + "epoch": 0.01986710842716706, + "grad_norm": 0.5530371069908142, + "learning_rate": 3.306451612903226e-05, + "loss": 2.1865, + "num_input_tokens_seen": 16206660, + "step": 410 + }, + { + "epoch": 0.020351672047341864, + "grad_norm": 0.5505437254905701, + "learning_rate": 3.387096774193548e-05, + "loss": 2.1665, + "num_input_tokens_seen": 16610308, + "step": 420 + }, + { + "epoch": 0.020836235667516672, + "grad_norm": 0.5833243727684021, + "learning_rate": 3.467741935483872e-05, + "loss": 2.1413, + "num_input_tokens_seen": 17013200, + "step": 430 + }, + { + "epoch": 0.02132079928769148, + "grad_norm": 0.5723456144332886, + "learning_rate": 3.548387096774194e-05, + "loss": 2.149, + "num_input_tokens_seen": 17412528, + "step": 440 + }, + { + "epoch": 0.021805362907866284, + "grad_norm": 0.49083006381988525, + "learning_rate": 3.6290322580645165e-05, + "loss": 2.208, + "num_input_tokens_seen": 17818648, + "step": 450 + }, + { + "epoch": 0.02228992652804109, + "grad_norm": 0.5125194787979126, + "learning_rate": 3.7096774193548386e-05, + "loss": 2.1628, + "num_input_tokens_seen": 18215112, + "step": 460 + }, + { + "epoch": 0.022774490148215896, + "grad_norm": 0.5249300003051758, + "learning_rate": 3.7903225806451614e-05, + "loss": 2.1179, + "num_input_tokens_seen": 18613100, + "step": 470 + }, + { + "epoch": 0.023259053768390704, + "grad_norm": 0.5913705825805664, + "learning_rate": 3.870967741935484e-05, + "loss": 2.0918, + "num_input_tokens_seen": 19006580, + "step": 480 + }, + { + "epoch": 0.02374361738856551, + "grad_norm": 0.5256056785583496, + "learning_rate": 3.951612903225806e-05, + "loss": 2.1146, + "num_input_tokens_seen": 19410508, + "step": 490 + }, + { + "epoch": 0.024228181008740315, + "grad_norm": 0.5684739947319031, + "learning_rate": 4.032258064516129e-05, + "loss": 2.0621, + "num_input_tokens_seen": 19801804, + "step": 500 + }, + { + "epoch": 0.024712744628915123, + "grad_norm": 0.5903922319412231, + "learning_rate": 4.112903225806452e-05, + "loss": 2.1035, + "num_input_tokens_seen": 20183328, + "step": 510 + }, + { + "epoch": 0.025197308249089927, + "grad_norm": 0.5174550414085388, + "learning_rate": 4.1935483870967746e-05, + "loss": 2.088, + "num_input_tokens_seen": 20585364, + "step": 520 + }, + { + "epoch": 0.025681871869264735, + "grad_norm": 0.5224788188934326, + "learning_rate": 4.2741935483870973e-05, + "loss": 2.0708, + "num_input_tokens_seen": 20971292, + "step": 530 + }, + { + "epoch": 0.026166435489439543, + "grad_norm": 0.5536626577377319, + "learning_rate": 4.3548387096774194e-05, + "loss": 2.0135, + "num_input_tokens_seen": 21390084, + "step": 540 + }, + { + "epoch": 0.026650999109614347, + "grad_norm": 0.49406698346138, + "learning_rate": 4.435483870967742e-05, + "loss": 2.0626, + "num_input_tokens_seen": 21790964, + "step": 550 + }, + { + "epoch": 0.027135562729789155, + "grad_norm": 0.5543055534362793, + "learning_rate": 4.516129032258064e-05, + "loss": 2.0395, + "num_input_tokens_seen": 22160120, + "step": 560 + }, + { + "epoch": 0.02762012634996396, + "grad_norm": 0.5112178325653076, + "learning_rate": 4.596774193548387e-05, + "loss": 2.0956, + "num_input_tokens_seen": 22555328, + "step": 570 + }, + { + "epoch": 0.028104689970138767, + "grad_norm": 0.5143571496009827, + "learning_rate": 4.67741935483871e-05, + "loss": 2.0209, + "num_input_tokens_seen": 22941364, + "step": 580 + }, + { + "epoch": 0.028589253590313574, + "grad_norm": 0.501890242099762, + "learning_rate": 4.7580645161290326e-05, + "loss": 2.0326, + "num_input_tokens_seen": 23298096, + "step": 590 + }, + { + "epoch": 0.02907381721048838, + "grad_norm": 0.47222423553466797, + "learning_rate": 4.8387096774193554e-05, + "loss": 2.0099, + "num_input_tokens_seen": 23698488, + "step": 600 + }, + { + "epoch": 0.029558380830663186, + "grad_norm": 0.5555810928344727, + "learning_rate": 4.9193548387096775e-05, + "loss": 1.9978, + "num_input_tokens_seen": 24088604, + "step": 610 + }, + { + "epoch": 0.030042944450837994, + "grad_norm": 0.6044710278511047, + "learning_rate": 5e-05, + "loss": 2.0005, + "num_input_tokens_seen": 24477980, + "step": 620 + }, + { + "epoch": 0.030527508071012798, + "grad_norm": 0.5720553398132324, + "learning_rate": 4.999996920985807e-05, + "loss": 1.9575, + "num_input_tokens_seen": 24859124, + "step": 630 + }, + { + "epoch": 0.031012071691187606, + "grad_norm": 0.5833857655525208, + "learning_rate": 4.9999876839508106e-05, + "loss": 1.992, + "num_input_tokens_seen": 25283928, + "step": 640 + }, + { + "epoch": 0.03149663531136241, + "grad_norm": 0.49288150668144226, + "learning_rate": 4.999972288917764e-05, + "loss": 1.952, + "num_input_tokens_seen": 25662068, + "step": 650 + }, + { + "epoch": 0.03198119893153722, + "grad_norm": 0.5453637838363647, + "learning_rate": 4.999950735924589e-05, + "loss": 2.0213, + "num_input_tokens_seen": 26066280, + "step": 660 + }, + { + "epoch": 0.032465762551712025, + "grad_norm": 0.4919157922267914, + "learning_rate": 4.9999230250243744e-05, + "loss": 1.9531, + "num_input_tokens_seen": 26446308, + "step": 670 + }, + { + "epoch": 0.03295032617188683, + "grad_norm": 0.5228652954101562, + "learning_rate": 4.999889156285379e-05, + "loss": 1.9746, + "num_input_tokens_seen": 26862780, + "step": 680 + }, + { + "epoch": 0.033434889792061634, + "grad_norm": 0.5261794328689575, + "learning_rate": 4.999849129791028e-05, + "loss": 1.9717, + "num_input_tokens_seen": 27239428, + "step": 690 + }, + { + "epoch": 0.03391945341223644, + "grad_norm": 0.5125803351402283, + "learning_rate": 4.9998029456399144e-05, + "loss": 1.975, + "num_input_tokens_seen": 27633548, + "step": 700 + }, + { + "epoch": 0.03440401703241125, + "grad_norm": 0.48046156764030457, + "learning_rate": 4.999750603945801e-05, + "loss": 1.925, + "num_input_tokens_seen": 28046196, + "step": 710 + }, + { + "epoch": 0.03488858065258606, + "grad_norm": 0.5283321738243103, + "learning_rate": 4.999692104837615e-05, + "loss": 1.9442, + "num_input_tokens_seen": 28423160, + "step": 720 + }, + { + "epoch": 0.035373144272760865, + "grad_norm": 0.4812624156475067, + "learning_rate": 4.999627448459453e-05, + "loss": 1.8945, + "num_input_tokens_seen": 28842052, + "step": 730 + }, + { + "epoch": 0.035857707892935665, + "grad_norm": 0.5417799353599548, + "learning_rate": 4.999556634970578e-05, + "loss": 1.9628, + "num_input_tokens_seen": 29251476, + "step": 740 + }, + { + "epoch": 0.03634227151311047, + "grad_norm": 0.48694953322410583, + "learning_rate": 4.999479664545417e-05, + "loss": 1.9061, + "num_input_tokens_seen": 29660076, + "step": 750 + }, + { + "epoch": 0.03682683513328528, + "grad_norm": 0.5140628814697266, + "learning_rate": 4.999396537373565e-05, + "loss": 1.9304, + "num_input_tokens_seen": 30049624, + "step": 760 + }, + { + "epoch": 0.03731139875346009, + "grad_norm": 0.48269200325012207, + "learning_rate": 4.9993072536597816e-05, + "loss": 1.9188, + "num_input_tokens_seen": 30466320, + "step": 770 + }, + { + "epoch": 0.037795962373634896, + "grad_norm": 0.5299521684646606, + "learning_rate": 4.999211813623993e-05, + "loss": 1.8832, + "num_input_tokens_seen": 30853272, + "step": 780 + }, + { + "epoch": 0.0382805259938097, + "grad_norm": 0.5212485790252686, + "learning_rate": 4.999110217501286e-05, + "loss": 1.8968, + "num_input_tokens_seen": 31245900, + "step": 790 + }, + { + "epoch": 0.038765089613984505, + "grad_norm": 0.5231190323829651, + "learning_rate": 4.9990024655419146e-05, + "loss": 1.8859, + "num_input_tokens_seen": 31662092, + "step": 800 + }, + { + "epoch": 0.03924965323415931, + "grad_norm": 0.49723413586616516, + "learning_rate": 4.998888558011295e-05, + "loss": 1.872, + "num_input_tokens_seen": 32059116, + "step": 810 + }, + { + "epoch": 0.03973421685433412, + "grad_norm": 0.6407498121261597, + "learning_rate": 4.9987684951900036e-05, + "loss": 1.8606, + "num_input_tokens_seen": 32484640, + "step": 820 + }, + { + "epoch": 0.04021878047450893, + "grad_norm": 0.5182517766952515, + "learning_rate": 4.998642277373783e-05, + "loss": 1.8906, + "num_input_tokens_seen": 32857036, + "step": 830 + }, + { + "epoch": 0.04070334409468373, + "grad_norm": 0.46163928508758545, + "learning_rate": 4.998509904873533e-05, + "loss": 1.8566, + "num_input_tokens_seen": 33227492, + "step": 840 + }, + { + "epoch": 0.041187907714858536, + "grad_norm": 0.47485044598579407, + "learning_rate": 4.998371378015314e-05, + "loss": 1.8759, + "num_input_tokens_seen": 33614644, + "step": 850 + }, + { + "epoch": 0.041672471335033344, + "grad_norm": 0.4748258888721466, + "learning_rate": 4.998226697140349e-05, + "loss": 1.877, + "num_input_tokens_seen": 33984924, + "step": 860 + }, + { + "epoch": 0.04215703495520815, + "grad_norm": 0.5473403930664062, + "learning_rate": 4.998075862605017e-05, + "loss": 1.8403, + "num_input_tokens_seen": 34379932, + "step": 870 + }, + { + "epoch": 0.04264159857538296, + "grad_norm": 0.4143201410770416, + "learning_rate": 4.9979188747808545e-05, + "loss": 1.87, + "num_input_tokens_seen": 34769220, + "step": 880 + }, + { + "epoch": 0.04312616219555776, + "grad_norm": 0.43778666853904724, + "learning_rate": 4.997755734054557e-05, + "loss": 1.9247, + "num_input_tokens_seen": 35124728, + "step": 890 + }, + { + "epoch": 0.04361072581573257, + "grad_norm": 0.532357394695282, + "learning_rate": 4.9975864408279725e-05, + "loss": 1.8527, + "num_input_tokens_seen": 35526992, + "step": 900 + }, + { + "epoch": 0.044095289435907375, + "grad_norm": 0.4756541848182678, + "learning_rate": 4.997410995518108e-05, + "loss": 1.8328, + "num_input_tokens_seen": 35932440, + "step": 910 + }, + { + "epoch": 0.04457985305608218, + "grad_norm": 0.4680721163749695, + "learning_rate": 4.997229398557122e-05, + "loss": 1.804, + "num_input_tokens_seen": 36352128, + "step": 920 + }, + { + "epoch": 0.04506441667625699, + "grad_norm": 0.48043662309646606, + "learning_rate": 4.9970416503923254e-05, + "loss": 1.8609, + "num_input_tokens_seen": 36746228, + "step": 930 + }, + { + "epoch": 0.04554898029643179, + "grad_norm": 0.4995608627796173, + "learning_rate": 4.996847751486182e-05, + "loss": 1.8493, + "num_input_tokens_seen": 37138932, + "step": 940 + }, + { + "epoch": 0.0460335439166066, + "grad_norm": 0.5137046575546265, + "learning_rate": 4.996647702316306e-05, + "loss": 1.8672, + "num_input_tokens_seen": 37519424, + "step": 950 + }, + { + "epoch": 0.04651810753678141, + "grad_norm": 0.48803040385246277, + "learning_rate": 4.996441503375461e-05, + "loss": 1.8371, + "num_input_tokens_seen": 37895416, + "step": 960 + }, + { + "epoch": 0.047002671156956215, + "grad_norm": 0.4492471516132355, + "learning_rate": 4.996229155171558e-05, + "loss": 1.8697, + "num_input_tokens_seen": 38307728, + "step": 970 + }, + { + "epoch": 0.04748723477713102, + "grad_norm": 0.4708845913410187, + "learning_rate": 4.9960106582276556e-05, + "loss": 1.8481, + "num_input_tokens_seen": 38697636, + "step": 980 + }, + { + "epoch": 0.04797179839730582, + "grad_norm": 0.49647071957588196, + "learning_rate": 4.995786013081958e-05, + "loss": 1.8812, + "num_input_tokens_seen": 39088844, + "step": 990 + }, + { + "epoch": 0.04845636201748063, + "grad_norm": 0.5095282793045044, + "learning_rate": 4.995555220287814e-05, + "loss": 1.8549, + "num_input_tokens_seen": 39495924, + "step": 1000 + }, + { + "epoch": 0.04894092563765544, + "grad_norm": 0.47867298126220703, + "learning_rate": 4.9953182804137144e-05, + "loss": 1.783, + "num_input_tokens_seen": 39909712, + "step": 1010 + }, + { + "epoch": 0.049425489257830246, + "grad_norm": 0.4481337070465088, + "learning_rate": 4.9950751940432935e-05, + "loss": 1.7918, + "num_input_tokens_seen": 40301008, + "step": 1020 + }, + { + "epoch": 0.049910052878005054, + "grad_norm": 0.4755343496799469, + "learning_rate": 4.994825961775323e-05, + "loss": 1.7714, + "num_input_tokens_seen": 40676476, + "step": 1030 + }, + { + "epoch": 0.050394616498179855, + "grad_norm": 0.5189250707626343, + "learning_rate": 4.994570584223715e-05, + "loss": 1.8269, + "num_input_tokens_seen": 41064096, + "step": 1040 + }, + { + "epoch": 0.05087918011835466, + "grad_norm": 0.47439032793045044, + "learning_rate": 4.994309062017519e-05, + "loss": 1.7933, + "num_input_tokens_seen": 41445416, + "step": 1050 + }, + { + "epoch": 0.05136374373852947, + "grad_norm": 0.4340980052947998, + "learning_rate": 4.994041395800918e-05, + "loss": 1.7959, + "num_input_tokens_seen": 41850336, + "step": 1060 + }, + { + "epoch": 0.05184830735870428, + "grad_norm": 0.4619131088256836, + "learning_rate": 4.993767586233232e-05, + "loss": 1.836, + "num_input_tokens_seen": 42265248, + "step": 1070 + }, + { + "epoch": 0.052332870978879086, + "grad_norm": 0.5889285206794739, + "learning_rate": 4.993487633988912e-05, + "loss": 1.8257, + "num_input_tokens_seen": 42676764, + "step": 1080 + }, + { + "epoch": 0.052817434599053886, + "grad_norm": 0.4897553026676178, + "learning_rate": 4.993201539757538e-05, + "loss": 1.8588, + "num_input_tokens_seen": 43053880, + "step": 1090 + }, + { + "epoch": 0.053301998219228694, + "grad_norm": 0.47212594747543335, + "learning_rate": 4.992909304243822e-05, + "loss": 1.8094, + "num_input_tokens_seen": 43430872, + "step": 1100 + }, + { + "epoch": 0.0537865618394035, + "grad_norm": 0.44554567337036133, + "learning_rate": 4.992610928167601e-05, + "loss": 1.7834, + "num_input_tokens_seen": 43851752, + "step": 1110 + }, + { + "epoch": 0.05427112545957831, + "grad_norm": 0.45216086506843567, + "learning_rate": 4.992306412263839e-05, + "loss": 1.8522, + "num_input_tokens_seen": 44254964, + "step": 1120 + }, + { + "epoch": 0.05475568907975312, + "grad_norm": 0.5445901155471802, + "learning_rate": 4.9919957572826216e-05, + "loss": 1.839, + "num_input_tokens_seen": 44656772, + "step": 1130 + }, + { + "epoch": 0.05524025269992792, + "grad_norm": 0.5541059970855713, + "learning_rate": 4.9916789639891595e-05, + "loss": 1.7898, + "num_input_tokens_seen": 45058836, + "step": 1140 + }, + { + "epoch": 0.055724816320102726, + "grad_norm": 0.5577613711357117, + "learning_rate": 4.99135603316378e-05, + "loss": 1.8336, + "num_input_tokens_seen": 45471592, + "step": 1150 + }, + { + "epoch": 0.05620937994027753, + "grad_norm": 0.47430625557899475, + "learning_rate": 4.991026965601932e-05, + "loss": 1.7995, + "num_input_tokens_seen": 45846124, + "step": 1160 + }, + { + "epoch": 0.05669394356045234, + "grad_norm": 0.5153654217720032, + "learning_rate": 4.990691762114176e-05, + "loss": 1.8068, + "num_input_tokens_seen": 46213352, + "step": 1170 + }, + { + "epoch": 0.05717850718062715, + "grad_norm": 0.5673589110374451, + "learning_rate": 4.99035042352619e-05, + "loss": 1.773, + "num_input_tokens_seen": 46626636, + "step": 1180 + }, + { + "epoch": 0.05766307080080195, + "grad_norm": 0.505126416683197, + "learning_rate": 4.9900029506787645e-05, + "loss": 1.7516, + "num_input_tokens_seen": 47031172, + "step": 1190 + }, + { + "epoch": 0.05814763442097676, + "grad_norm": 0.4553806483745575, + "learning_rate": 4.989649344427796e-05, + "loss": 1.7862, + "num_input_tokens_seen": 47425060, + "step": 1200 + }, + { + "epoch": 0.058632198041151565, + "grad_norm": 0.4654844105243683, + "learning_rate": 4.989289605644294e-05, + "loss": 1.7957, + "num_input_tokens_seen": 47824468, + "step": 1210 + }, + { + "epoch": 0.05911676166132637, + "grad_norm": 0.4679258167743683, + "learning_rate": 4.988923735214369e-05, + "loss": 1.7706, + "num_input_tokens_seen": 48220624, + "step": 1220 + }, + { + "epoch": 0.05960132528150118, + "grad_norm": 0.4447259306907654, + "learning_rate": 4.988551734039239e-05, + "loss": 1.7788, + "num_input_tokens_seen": 48597080, + "step": 1230 + }, + { + "epoch": 0.06008588890167599, + "grad_norm": 0.5004059672355652, + "learning_rate": 4.98817360303522e-05, + "loss": 1.7994, + "num_input_tokens_seen": 49002120, + "step": 1240 + }, + { + "epoch": 0.06057045252185079, + "grad_norm": 0.4713003635406494, + "learning_rate": 4.98778934313373e-05, + "loss": 1.729, + "num_input_tokens_seen": 49399124, + "step": 1250 + }, + { + "epoch": 0.061055016142025596, + "grad_norm": 0.4393056035041809, + "learning_rate": 4.987398955281281e-05, + "loss": 1.7764, + "num_input_tokens_seen": 49820492, + "step": 1260 + }, + { + "epoch": 0.061539579762200404, + "grad_norm": 0.49267756938934326, + "learning_rate": 4.987002440439481e-05, + "loss": 1.7541, + "num_input_tokens_seen": 50240940, + "step": 1270 + }, + { + "epoch": 0.06202414338237521, + "grad_norm": 0.4592071771621704, + "learning_rate": 4.986599799585031e-05, + "loss": 1.8162, + "num_input_tokens_seen": 50644136, + "step": 1280 + }, + { + "epoch": 0.06250870700255001, + "grad_norm": 0.4428696930408478, + "learning_rate": 4.98619103370972e-05, + "loss": 1.7786, + "num_input_tokens_seen": 51030808, + "step": 1290 + }, + { + "epoch": 0.06299327062272482, + "grad_norm": 0.4989221394062042, + "learning_rate": 4.985776143820423e-05, + "loss": 1.7498, + "num_input_tokens_seen": 51408072, + "step": 1300 + }, + { + "epoch": 0.06347783424289963, + "grad_norm": 0.46731674671173096, + "learning_rate": 4.985355130939104e-05, + "loss": 1.7556, + "num_input_tokens_seen": 51788056, + "step": 1310 + }, + { + "epoch": 0.06396239786307444, + "grad_norm": 0.4602036476135254, + "learning_rate": 4.984927996102806e-05, + "loss": 1.7395, + "num_input_tokens_seen": 52195616, + "step": 1320 + }, + { + "epoch": 0.06444696148324924, + "grad_norm": 0.49800294637680054, + "learning_rate": 4.984494740363651e-05, + "loss": 1.7324, + "num_input_tokens_seen": 52605440, + "step": 1330 + }, + { + "epoch": 0.06493152510342405, + "grad_norm": 0.4798908233642578, + "learning_rate": 4.984055364788842e-05, + "loss": 1.7495, + "num_input_tokens_seen": 53005000, + "step": 1340 + }, + { + "epoch": 0.06541608872359886, + "grad_norm": 0.510181188583374, + "learning_rate": 4.9836098704606515e-05, + "loss": 1.7238, + "num_input_tokens_seen": 53428392, + "step": 1350 + }, + { + "epoch": 0.06590065234377367, + "grad_norm": 0.4547858238220215, + "learning_rate": 4.983158258476427e-05, + "loss": 1.7711, + "num_input_tokens_seen": 53787676, + "step": 1360 + }, + { + "epoch": 0.06638521596394846, + "grad_norm": 0.486717164516449, + "learning_rate": 4.982700529948585e-05, + "loss": 1.7099, + "num_input_tokens_seen": 54228120, + "step": 1370 + }, + { + "epoch": 0.06686977958412327, + "grad_norm": 0.4875461459159851, + "learning_rate": 4.982236686004606e-05, + "loss": 1.7542, + "num_input_tokens_seen": 54637396, + "step": 1380 + }, + { + "epoch": 0.06735434320429808, + "grad_norm": 0.4732173681259155, + "learning_rate": 4.9817667277870384e-05, + "loss": 1.725, + "num_input_tokens_seen": 55042360, + "step": 1390 + }, + { + "epoch": 0.06783890682447288, + "grad_norm": 0.4369388818740845, + "learning_rate": 4.981290656453486e-05, + "loss": 1.7288, + "num_input_tokens_seen": 55430008, + "step": 1400 + }, + { + "epoch": 0.06832347044464769, + "grad_norm": 0.42465418577194214, + "learning_rate": 4.9808084731766134e-05, + "loss": 1.7026, + "num_input_tokens_seen": 55815764, + "step": 1410 + }, + { + "epoch": 0.0688080340648225, + "grad_norm": 0.4628385007381439, + "learning_rate": 4.980320179144141e-05, + "loss": 1.7732, + "num_input_tokens_seen": 56164316, + "step": 1420 + }, + { + "epoch": 0.0692925976849973, + "grad_norm": 0.5061572194099426, + "learning_rate": 4.97982577555884e-05, + "loss": 1.7107, + "num_input_tokens_seen": 56523812, + "step": 1430 + }, + { + "epoch": 0.06977716130517211, + "grad_norm": 0.49572449922561646, + "learning_rate": 4.9793252636385305e-05, + "loss": 1.6988, + "num_input_tokens_seen": 56892024, + "step": 1440 + }, + { + "epoch": 0.07026172492534692, + "grad_norm": 0.5096593499183655, + "learning_rate": 4.9788186446160795e-05, + "loss": 1.7888, + "num_input_tokens_seen": 57270864, + "step": 1450 + }, + { + "epoch": 0.07074628854552173, + "grad_norm": 0.4694225490093231, + "learning_rate": 4.978305919739396e-05, + "loss": 1.7126, + "num_input_tokens_seen": 57643264, + "step": 1460 + }, + { + "epoch": 0.07123085216569652, + "grad_norm": 0.43490421772003174, + "learning_rate": 4.9777870902714306e-05, + "loss": 1.6902, + "num_input_tokens_seen": 58035896, + "step": 1470 + }, + { + "epoch": 0.07171541578587133, + "grad_norm": 0.42695242166519165, + "learning_rate": 4.97726215749017e-05, + "loss": 1.7368, + "num_input_tokens_seen": 58412320, + "step": 1480 + }, + { + "epoch": 0.07219997940604614, + "grad_norm": 0.48507386445999146, + "learning_rate": 4.976731122688634e-05, + "loss": 1.6676, + "num_input_tokens_seen": 58792164, + "step": 1490 + }, + { + "epoch": 0.07268454302622095, + "grad_norm": 0.5080244541168213, + "learning_rate": 4.9761939871748734e-05, + "loss": 1.6863, + "num_input_tokens_seen": 59187108, + "step": 1500 + }, + { + "epoch": 0.07316910664639575, + "grad_norm": 0.4605392813682556, + "learning_rate": 4.975650752271967e-05, + "loss": 1.7126, + "num_input_tokens_seen": 59576276, + "step": 1510 + }, + { + "epoch": 0.07365367026657056, + "grad_norm": 0.5134192705154419, + "learning_rate": 4.9751014193180165e-05, + "loss": 1.7108, + "num_input_tokens_seen": 59993792, + "step": 1520 + }, + { + "epoch": 0.07413823388674537, + "grad_norm": 0.4663729667663574, + "learning_rate": 4.974545989666147e-05, + "loss": 1.7352, + "num_input_tokens_seen": 60408024, + "step": 1530 + }, + { + "epoch": 0.07462279750692018, + "grad_norm": 0.4669811427593231, + "learning_rate": 4.973984464684497e-05, + "loss": 1.7421, + "num_input_tokens_seen": 60767824, + "step": 1540 + }, + { + "epoch": 0.07510736112709498, + "grad_norm": 0.46583324670791626, + "learning_rate": 4.973416845756221e-05, + "loss": 1.7187, + "num_input_tokens_seen": 61143088, + "step": 1550 + }, + { + "epoch": 0.07559192474726979, + "grad_norm": 0.4906235635280609, + "learning_rate": 4.9728431342794865e-05, + "loss": 1.7195, + "num_input_tokens_seen": 61531644, + "step": 1560 + }, + { + "epoch": 0.07607648836744459, + "grad_norm": 0.4702226221561432, + "learning_rate": 4.972263331667465e-05, + "loss": 1.6975, + "num_input_tokens_seen": 61937832, + "step": 1570 + }, + { + "epoch": 0.0765610519876194, + "grad_norm": 0.4586687386035919, + "learning_rate": 4.971677439348332e-05, + "loss": 1.7196, + "num_input_tokens_seen": 62332144, + "step": 1580 + }, + { + "epoch": 0.0770456156077942, + "grad_norm": 0.4383968710899353, + "learning_rate": 4.9710854587652654e-05, + "loss": 1.6704, + "num_input_tokens_seen": 62703152, + "step": 1590 + }, + { + "epoch": 0.07753017922796901, + "grad_norm": 0.4787232577800751, + "learning_rate": 4.970487391376438e-05, + "loss": 1.7729, + "num_input_tokens_seen": 63098040, + "step": 1600 + }, + { + "epoch": 0.07801474284814382, + "grad_norm": 0.47267040610313416, + "learning_rate": 4.969883238655017e-05, + "loss": 1.6948, + "num_input_tokens_seen": 63500456, + "step": 1610 + }, + { + "epoch": 0.07849930646831862, + "grad_norm": 0.4243592917919159, + "learning_rate": 4.969273002089157e-05, + "loss": 1.6758, + "num_input_tokens_seen": 63899700, + "step": 1620 + }, + { + "epoch": 0.07898387008849343, + "grad_norm": 0.5044109225273132, + "learning_rate": 4.968656683181999e-05, + "loss": 1.7079, + "num_input_tokens_seen": 64275644, + "step": 1630 + }, + { + "epoch": 0.07946843370866824, + "grad_norm": 0.4761982858181, + "learning_rate": 4.968034283451669e-05, + "loss": 1.6974, + "num_input_tokens_seen": 64658944, + "step": 1640 + }, + { + "epoch": 0.07995299732884305, + "grad_norm": 0.46890366077423096, + "learning_rate": 4.967405804431267e-05, + "loss": 1.6835, + "num_input_tokens_seen": 65053324, + "step": 1650 + }, + { + "epoch": 0.08043756094901786, + "grad_norm": 0.4691985547542572, + "learning_rate": 4.966771247668871e-05, + "loss": 1.6842, + "num_input_tokens_seen": 65471828, + "step": 1660 + }, + { + "epoch": 0.08092212456919266, + "grad_norm": 0.5264722108840942, + "learning_rate": 4.966130614727529e-05, + "loss": 1.6913, + "num_input_tokens_seen": 65888112, + "step": 1670 + }, + { + "epoch": 0.08140668818936746, + "grad_norm": 0.46564558148384094, + "learning_rate": 4.9654839071852535e-05, + "loss": 1.6894, + "num_input_tokens_seen": 66301484, + "step": 1680 + }, + { + "epoch": 0.08189125180954226, + "grad_norm": 0.4879053831100464, + "learning_rate": 4.964831126635022e-05, + "loss": 1.7088, + "num_input_tokens_seen": 66683244, + "step": 1690 + }, + { + "epoch": 0.08237581542971707, + "grad_norm": 0.39720138907432556, + "learning_rate": 4.964172274684772e-05, + "loss": 1.7227, + "num_input_tokens_seen": 67102092, + "step": 1700 + }, + { + "epoch": 0.08286037904989188, + "grad_norm": 0.4953685998916626, + "learning_rate": 4.963507352957395e-05, + "loss": 1.7142, + "num_input_tokens_seen": 67495072, + "step": 1710 + }, + { + "epoch": 0.08334494267006669, + "grad_norm": 0.4297001361846924, + "learning_rate": 4.962836363090734e-05, + "loss": 1.6934, + "num_input_tokens_seen": 67872712, + "step": 1720 + }, + { + "epoch": 0.0838295062902415, + "grad_norm": 0.4302520155906677, + "learning_rate": 4.962159306737578e-05, + "loss": 1.6898, + "num_input_tokens_seen": 68264844, + "step": 1730 + }, + { + "epoch": 0.0843140699104163, + "grad_norm": 0.47304287552833557, + "learning_rate": 4.96147618556566e-05, + "loss": 1.6631, + "num_input_tokens_seen": 68650516, + "step": 1740 + }, + { + "epoch": 0.08479863353059111, + "grad_norm": 0.47196635603904724, + "learning_rate": 4.960787001257652e-05, + "loss": 1.6963, + "num_input_tokens_seen": 69003456, + "step": 1750 + }, + { + "epoch": 0.08528319715076592, + "grad_norm": 0.46627262234687805, + "learning_rate": 4.9600917555111615e-05, + "loss": 1.6733, + "num_input_tokens_seen": 69386088, + "step": 1760 + }, + { + "epoch": 0.08576776077094073, + "grad_norm": 0.4163726568222046, + "learning_rate": 4.9593904500387245e-05, + "loss": 1.6907, + "num_input_tokens_seen": 69762188, + "step": 1770 + }, + { + "epoch": 0.08625232439111552, + "grad_norm": 0.46610623598098755, + "learning_rate": 4.9586830865678046e-05, + "loss": 1.6722, + "num_input_tokens_seen": 70170884, + "step": 1780 + }, + { + "epoch": 0.08673688801129033, + "grad_norm": 0.5110335946083069, + "learning_rate": 4.957969666840788e-05, + "loss": 1.6884, + "num_input_tokens_seen": 70544880, + "step": 1790 + }, + { + "epoch": 0.08722145163146514, + "grad_norm": 0.4962434768676758, + "learning_rate": 4.957250192614979e-05, + "loss": 1.659, + "num_input_tokens_seen": 70937756, + "step": 1800 + }, + { + "epoch": 0.08770601525163994, + "grad_norm": 0.4480324387550354, + "learning_rate": 4.956524665662593e-05, + "loss": 1.6428, + "num_input_tokens_seen": 71295700, + "step": 1810 + }, + { + "epoch": 0.08819057887181475, + "grad_norm": 0.46165645122528076, + "learning_rate": 4.955793087770758e-05, + "loss": 1.7055, + "num_input_tokens_seen": 71677740, + "step": 1820 + }, + { + "epoch": 0.08867514249198956, + "grad_norm": 0.4461020827293396, + "learning_rate": 4.955055460741503e-05, + "loss": 1.7052, + "num_input_tokens_seen": 72088472, + "step": 1830 + }, + { + "epoch": 0.08915970611216437, + "grad_norm": 0.46780285239219666, + "learning_rate": 4.9543117863917624e-05, + "loss": 1.6684, + "num_input_tokens_seen": 72483916, + "step": 1840 + }, + { + "epoch": 0.08964426973233917, + "grad_norm": 0.4738537073135376, + "learning_rate": 4.953562066553359e-05, + "loss": 1.6663, + "num_input_tokens_seen": 72895204, + "step": 1850 + }, + { + "epoch": 0.09012883335251398, + "grad_norm": 0.41888901591300964, + "learning_rate": 4.952806303073015e-05, + "loss": 1.7064, + "num_input_tokens_seen": 73289656, + "step": 1860 + }, + { + "epoch": 0.09061339697268879, + "grad_norm": 0.4899497330188751, + "learning_rate": 4.952044497812334e-05, + "loss": 1.7179, + "num_input_tokens_seen": 73724044, + "step": 1870 + }, + { + "epoch": 0.09109796059286358, + "grad_norm": 0.4161849319934845, + "learning_rate": 4.951276652647803e-05, + "loss": 1.6841, + "num_input_tokens_seen": 74111024, + "step": 1880 + }, + { + "epoch": 0.09158252421303839, + "grad_norm": 0.44479620456695557, + "learning_rate": 4.950502769470787e-05, + "loss": 1.6294, + "num_input_tokens_seen": 74502300, + "step": 1890 + }, + { + "epoch": 0.0920670878332132, + "grad_norm": 0.48210737109184265, + "learning_rate": 4.949722850187525e-05, + "loss": 1.6615, + "num_input_tokens_seen": 74920152, + "step": 1900 + }, + { + "epoch": 0.092551651453388, + "grad_norm": 0.5125983357429504, + "learning_rate": 4.948936896719121e-05, + "loss": 1.6056, + "num_input_tokens_seen": 75318632, + "step": 1910 + }, + { + "epoch": 0.09303621507356281, + "grad_norm": 0.45514819025993347, + "learning_rate": 4.948144911001546e-05, + "loss": 1.6808, + "num_input_tokens_seen": 75682924, + "step": 1920 + }, + { + "epoch": 0.09352077869373762, + "grad_norm": 0.45765024423599243, + "learning_rate": 4.9473468949856295e-05, + "loss": 1.6256, + "num_input_tokens_seen": 76084200, + "step": 1930 + }, + { + "epoch": 0.09400534231391243, + "grad_norm": 0.46034979820251465, + "learning_rate": 4.946542850637051e-05, + "loss": 1.6342, + "num_input_tokens_seen": 76509976, + "step": 1940 + }, + { + "epoch": 0.09448990593408724, + "grad_norm": 0.47426116466522217, + "learning_rate": 4.945732779936343e-05, + "loss": 1.6783, + "num_input_tokens_seen": 76925836, + "step": 1950 + }, + { + "epoch": 0.09497446955426204, + "grad_norm": 0.44276344776153564, + "learning_rate": 4.944916684878881e-05, + "loss": 1.6505, + "num_input_tokens_seen": 77309476, + "step": 1960 + }, + { + "epoch": 0.09545903317443685, + "grad_norm": 0.4252452552318573, + "learning_rate": 4.944094567474878e-05, + "loss": 1.6174, + "num_input_tokens_seen": 77719060, + "step": 1970 + }, + { + "epoch": 0.09594359679461165, + "grad_norm": 0.4776414930820465, + "learning_rate": 4.9432664297493855e-05, + "loss": 1.6953, + "num_input_tokens_seen": 78109656, + "step": 1980 + }, + { + "epoch": 0.09642816041478645, + "grad_norm": 0.5169130563735962, + "learning_rate": 4.94243227374228e-05, + "loss": 1.6782, + "num_input_tokens_seen": 78511580, + "step": 1990 + }, + { + "epoch": 0.09691272403496126, + "grad_norm": 0.41066092252731323, + "learning_rate": 4.941592101508264e-05, + "loss": 1.6487, + "num_input_tokens_seen": 78885660, + "step": 2000 + }, + { + "epoch": 0.09691272403496126, + "eval_loss": 1.7664850950241089, + "eval_runtime": 3.9496, + "eval_samples_per_second": 37.979, + "eval_steps_per_second": 4.811, + "num_input_tokens_seen": 78885660, + "step": 2000 + }, + { + "epoch": 0.09739728765513607, + "grad_norm": 0.4655153751373291, + "learning_rate": 4.94074591511686e-05, + "loss": 1.6776, + "num_input_tokens_seen": 79304256, + "step": 2010 + }, + { + "epoch": 0.09788185127531088, + "grad_norm": 0.47283700108528137, + "learning_rate": 4.939893716652404e-05, + "loss": 1.6605, + "num_input_tokens_seen": 79720364, + "step": 2020 + }, + { + "epoch": 0.09836641489548568, + "grad_norm": 0.433597207069397, + "learning_rate": 4.93903550821404e-05, + "loss": 1.6256, + "num_input_tokens_seen": 80133808, + "step": 2030 + }, + { + "epoch": 0.09885097851566049, + "grad_norm": 0.5006552934646606, + "learning_rate": 4.9381712919157174e-05, + "loss": 1.672, + "num_input_tokens_seen": 80519820, + "step": 2040 + }, + { + "epoch": 0.0993355421358353, + "grad_norm": 0.4227645993232727, + "learning_rate": 4.937301069886184e-05, + "loss": 1.5913, + "num_input_tokens_seen": 80921068, + "step": 2050 + }, + { + "epoch": 0.09982010575601011, + "grad_norm": 0.4382697641849518, + "learning_rate": 4.93642484426898e-05, + "loss": 1.6587, + "num_input_tokens_seen": 81330628, + "step": 2060 + }, + { + "epoch": 0.10030466937618492, + "grad_norm": 0.45559099316596985, + "learning_rate": 4.935542617222434e-05, + "loss": 1.6698, + "num_input_tokens_seen": 81744300, + "step": 2070 + }, + { + "epoch": 0.10078923299635971, + "grad_norm": 0.44068899750709534, + "learning_rate": 4.9346543909196584e-05, + "loss": 1.6527, + "num_input_tokens_seen": 82151876, + "step": 2080 + }, + { + "epoch": 0.10127379661653452, + "grad_norm": 0.4670219123363495, + "learning_rate": 4.933760167548542e-05, + "loss": 1.66, + "num_input_tokens_seen": 82539776, + "step": 2090 + }, + { + "epoch": 0.10175836023670932, + "grad_norm": 0.4150504171848297, + "learning_rate": 4.932859949311745e-05, + "loss": 1.6959, + "num_input_tokens_seen": 82918640, + "step": 2100 + }, + { + "epoch": 0.10224292385688413, + "grad_norm": 0.46419015526771545, + "learning_rate": 4.931953738426698e-05, + "loss": 1.6897, + "num_input_tokens_seen": 83299820, + "step": 2110 + }, + { + "epoch": 0.10272748747705894, + "grad_norm": 0.4582190215587616, + "learning_rate": 4.931041537125587e-05, + "loss": 1.6196, + "num_input_tokens_seen": 83659764, + "step": 2120 + }, + { + "epoch": 0.10321205109723375, + "grad_norm": 0.4939762055873871, + "learning_rate": 4.930123347655358e-05, + "loss": 1.6231, + "num_input_tokens_seen": 84052080, + "step": 2130 + }, + { + "epoch": 0.10369661471740856, + "grad_norm": 0.43949180841445923, + "learning_rate": 4.929199172277705e-05, + "loss": 1.5858, + "num_input_tokens_seen": 84431660, + "step": 2140 + }, + { + "epoch": 0.10418117833758336, + "grad_norm": 0.44852331280708313, + "learning_rate": 4.928269013269069e-05, + "loss": 1.678, + "num_input_tokens_seen": 84828416, + "step": 2150 + }, + { + "epoch": 0.10466574195775817, + "grad_norm": 0.4769739508628845, + "learning_rate": 4.927332872920626e-05, + "loss": 1.6463, + "num_input_tokens_seen": 85231908, + "step": 2160 + }, + { + "epoch": 0.10515030557793298, + "grad_norm": 0.41337233781814575, + "learning_rate": 4.926390753538288e-05, + "loss": 1.6137, + "num_input_tokens_seen": 85644796, + "step": 2170 + }, + { + "epoch": 0.10563486919810777, + "grad_norm": 0.4415737986564636, + "learning_rate": 4.925442657442696e-05, + "loss": 1.638, + "num_input_tokens_seen": 86024508, + "step": 2180 + }, + { + "epoch": 0.10611943281828258, + "grad_norm": 0.4723414480686188, + "learning_rate": 4.924488586969208e-05, + "loss": 1.6074, + "num_input_tokens_seen": 86425788, + "step": 2190 + }, + { + "epoch": 0.10660399643845739, + "grad_norm": 0.4304730296134949, + "learning_rate": 4.923528544467905e-05, + "loss": 1.6436, + "num_input_tokens_seen": 86797336, + "step": 2200 + }, + { + "epoch": 0.1070885600586322, + "grad_norm": 0.4499008059501648, + "learning_rate": 4.9225625323035706e-05, + "loss": 1.6196, + "num_input_tokens_seen": 87182044, + "step": 2210 + }, + { + "epoch": 0.107573123678807, + "grad_norm": 0.415367066860199, + "learning_rate": 4.9215905528557e-05, + "loss": 1.6363, + "num_input_tokens_seen": 87604236, + "step": 2220 + }, + { + "epoch": 0.10805768729898181, + "grad_norm": 0.4271062910556793, + "learning_rate": 4.9206126085184824e-05, + "loss": 1.6101, + "num_input_tokens_seen": 87995600, + "step": 2230 + }, + { + "epoch": 0.10854225091915662, + "grad_norm": 0.4146898686885834, + "learning_rate": 4.919628701700802e-05, + "loss": 1.5793, + "num_input_tokens_seen": 88393892, + "step": 2240 + }, + { + "epoch": 0.10902681453933143, + "grad_norm": 0.43309634923934937, + "learning_rate": 4.918638834826229e-05, + "loss": 1.5593, + "num_input_tokens_seen": 88770548, + "step": 2250 + }, + { + "epoch": 0.10951137815950623, + "grad_norm": 0.42651528120040894, + "learning_rate": 4.917643010333015e-05, + "loss": 1.6263, + "num_input_tokens_seen": 89160792, + "step": 2260 + }, + { + "epoch": 0.10999594177968104, + "grad_norm": 0.4150625467300415, + "learning_rate": 4.916641230674086e-05, + "loss": 1.6196, + "num_input_tokens_seen": 89548084, + "step": 2270 + }, + { + "epoch": 0.11048050539985584, + "grad_norm": 0.4922536313533783, + "learning_rate": 4.915633498317037e-05, + "loss": 1.6234, + "num_input_tokens_seen": 89957316, + "step": 2280 + }, + { + "epoch": 0.11096506902003064, + "grad_norm": 0.436235636472702, + "learning_rate": 4.914619815744126e-05, + "loss": 1.602, + "num_input_tokens_seen": 90352888, + "step": 2290 + }, + { + "epoch": 0.11144963264020545, + "grad_norm": 0.44822990894317627, + "learning_rate": 4.913600185452267e-05, + "loss": 1.6393, + "num_input_tokens_seen": 90729132, + "step": 2300 + }, + { + "epoch": 0.11193419626038026, + "grad_norm": 0.4682275652885437, + "learning_rate": 4.912574609953026e-05, + "loss": 1.5937, + "num_input_tokens_seen": 91133560, + "step": 2310 + }, + { + "epoch": 0.11241875988055507, + "grad_norm": 0.44868946075439453, + "learning_rate": 4.911543091772611e-05, + "loss": 1.6376, + "num_input_tokens_seen": 91514000, + "step": 2320 + }, + { + "epoch": 0.11290332350072987, + "grad_norm": 0.4826831817626953, + "learning_rate": 4.910505633451869e-05, + "loss": 1.6667, + "num_input_tokens_seen": 91919504, + "step": 2330 + }, + { + "epoch": 0.11338788712090468, + "grad_norm": 0.5310072898864746, + "learning_rate": 4.9094622375462806e-05, + "loss": 1.6148, + "num_input_tokens_seen": 92332568, + "step": 2340 + }, + { + "epoch": 0.11387245074107949, + "grad_norm": 0.43625608086586, + "learning_rate": 4.90841290662595e-05, + "loss": 1.6217, + "num_input_tokens_seen": 92716832, + "step": 2350 + }, + { + "epoch": 0.1143570143612543, + "grad_norm": 0.46015968918800354, + "learning_rate": 4.9073576432755995e-05, + "loss": 1.6222, + "num_input_tokens_seen": 93138708, + "step": 2360 + }, + { + "epoch": 0.1148415779814291, + "grad_norm": 0.4389876425266266, + "learning_rate": 4.906296450094568e-05, + "loss": 1.617, + "num_input_tokens_seen": 93552848, + "step": 2370 + }, + { + "epoch": 0.1153261416016039, + "grad_norm": 0.46346965432167053, + "learning_rate": 4.9052293296967975e-05, + "loss": 1.5736, + "num_input_tokens_seen": 93947928, + "step": 2380 + }, + { + "epoch": 0.1158107052217787, + "grad_norm": 0.4337058961391449, + "learning_rate": 4.9041562847108304e-05, + "loss": 1.5778, + "num_input_tokens_seen": 94323152, + "step": 2390 + }, + { + "epoch": 0.11629526884195351, + "grad_norm": 0.4524190425872803, + "learning_rate": 4.903077317779805e-05, + "loss": 1.5926, + "num_input_tokens_seen": 94707744, + "step": 2400 + }, + { + "epoch": 0.11677983246212832, + "grad_norm": 0.4808904826641083, + "learning_rate": 4.901992431561443e-05, + "loss": 1.6427, + "num_input_tokens_seen": 95084712, + "step": 2410 + }, + { + "epoch": 0.11726439608230313, + "grad_norm": 0.4621996581554413, + "learning_rate": 4.9009016287280496e-05, + "loss": 1.5838, + "num_input_tokens_seen": 95498392, + "step": 2420 + }, + { + "epoch": 0.11774895970247794, + "grad_norm": 0.41997554898262024, + "learning_rate": 4.899804911966502e-05, + "loss": 1.6226, + "num_input_tokens_seen": 95880184, + "step": 2430 + }, + { + "epoch": 0.11823352332265274, + "grad_norm": 0.475421279668808, + "learning_rate": 4.898702283978247e-05, + "loss": 1.6259, + "num_input_tokens_seen": 96265332, + "step": 2440 + }, + { + "epoch": 0.11871808694282755, + "grad_norm": 0.45337820053100586, + "learning_rate": 4.8975937474792895e-05, + "loss": 1.6884, + "num_input_tokens_seen": 96681320, + "step": 2450 + }, + { + "epoch": 0.11920265056300236, + "grad_norm": 0.42129579186439514, + "learning_rate": 4.896479305200188e-05, + "loss": 1.5833, + "num_input_tokens_seen": 97088276, + "step": 2460 + }, + { + "epoch": 0.11968721418317717, + "grad_norm": 0.42145466804504395, + "learning_rate": 4.895358959886051e-05, + "loss": 1.6114, + "num_input_tokens_seen": 97501600, + "step": 2470 + }, + { + "epoch": 0.12017177780335198, + "grad_norm": 0.4644761085510254, + "learning_rate": 4.8942327142965244e-05, + "loss": 1.6331, + "num_input_tokens_seen": 97893056, + "step": 2480 + }, + { + "epoch": 0.12065634142352677, + "grad_norm": 0.4692575931549072, + "learning_rate": 4.8931005712057905e-05, + "loss": 1.591, + "num_input_tokens_seen": 98303764, + "step": 2490 + }, + { + "epoch": 0.12114090504370158, + "grad_norm": 0.4318110942840576, + "learning_rate": 4.891962533402556e-05, + "loss": 1.5659, + "num_input_tokens_seen": 98672376, + "step": 2500 + }, + { + "epoch": 0.12162546866387638, + "grad_norm": 0.44102174043655396, + "learning_rate": 4.890818603690049e-05, + "loss": 1.5804, + "num_input_tokens_seen": 99057612, + "step": 2510 + }, + { + "epoch": 0.12211003228405119, + "grad_norm": 0.45612481236457825, + "learning_rate": 4.88966878488601e-05, + "loss": 1.6189, + "num_input_tokens_seen": 99443028, + "step": 2520 + }, + { + "epoch": 0.122594595904226, + "grad_norm": 0.502396821975708, + "learning_rate": 4.888513079822686e-05, + "loss": 1.542, + "num_input_tokens_seen": 99861476, + "step": 2530 + }, + { + "epoch": 0.12307915952440081, + "grad_norm": 0.438915491104126, + "learning_rate": 4.887351491346822e-05, + "loss": 1.554, + "num_input_tokens_seen": 100294464, + "step": 2540 + }, + { + "epoch": 0.12356372314457562, + "grad_norm": 0.474985808134079, + "learning_rate": 4.886184022319657e-05, + "loss": 1.6263, + "num_input_tokens_seen": 100653652, + "step": 2550 + }, + { + "epoch": 0.12404828676475042, + "grad_norm": 0.43759259581565857, + "learning_rate": 4.8850106756169146e-05, + "loss": 1.5717, + "num_input_tokens_seen": 101039388, + "step": 2560 + }, + { + "epoch": 0.12453285038492523, + "grad_norm": 0.4843815267086029, + "learning_rate": 4.8838314541287936e-05, + "loss": 1.6216, + "num_input_tokens_seen": 101443224, + "step": 2570 + }, + { + "epoch": 0.12501741400510002, + "grad_norm": 0.5523076057434082, + "learning_rate": 4.882646360759967e-05, + "loss": 1.5746, + "num_input_tokens_seen": 101857288, + "step": 2580 + }, + { + "epoch": 0.12550197762527485, + "grad_norm": 0.43904462456703186, + "learning_rate": 4.88145539842957e-05, + "loss": 1.5948, + "num_input_tokens_seen": 102260496, + "step": 2590 + }, + { + "epoch": 0.12598654124544964, + "grad_norm": 0.4333835244178772, + "learning_rate": 4.880258570071194e-05, + "loss": 1.6429, + "num_input_tokens_seen": 102644332, + "step": 2600 + }, + { + "epoch": 0.12647110486562446, + "grad_norm": 0.42234155535697937, + "learning_rate": 4.879055878632881e-05, + "loss": 1.5685, + "num_input_tokens_seen": 103032436, + "step": 2610 + }, + { + "epoch": 0.12695566848579926, + "grad_norm": 0.40778931975364685, + "learning_rate": 4.8778473270771144e-05, + "loss": 1.5875, + "num_input_tokens_seen": 103422688, + "step": 2620 + }, + { + "epoch": 0.12744023210597408, + "grad_norm": 0.45513054728507996, + "learning_rate": 4.8766329183808115e-05, + "loss": 1.5896, + "num_input_tokens_seen": 103798060, + "step": 2630 + }, + { + "epoch": 0.12792479572614887, + "grad_norm": 0.4449268579483032, + "learning_rate": 4.875412655535319e-05, + "loss": 1.6182, + "num_input_tokens_seen": 104187120, + "step": 2640 + }, + { + "epoch": 0.12840935934632366, + "grad_norm": 0.4176337718963623, + "learning_rate": 4.8741865415463995e-05, + "loss": 1.5704, + "num_input_tokens_seen": 104566676, + "step": 2650 + }, + { + "epoch": 0.1288939229664985, + "grad_norm": 0.4052715003490448, + "learning_rate": 4.8729545794342326e-05, + "loss": 1.5684, + "num_input_tokens_seen": 104970872, + "step": 2660 + }, + { + "epoch": 0.12937848658667328, + "grad_norm": 0.43383073806762695, + "learning_rate": 4.871716772233401e-05, + "loss": 1.5695, + "num_input_tokens_seen": 105341948, + "step": 2670 + }, + { + "epoch": 0.1298630502068481, + "grad_norm": 0.4440796673297882, + "learning_rate": 4.870473122992886e-05, + "loss": 1.5893, + "num_input_tokens_seen": 105747904, + "step": 2680 + }, + { + "epoch": 0.1303476138270229, + "grad_norm": 0.47211089730262756, + "learning_rate": 4.869223634776059e-05, + "loss": 1.5961, + "num_input_tokens_seen": 106128328, + "step": 2690 + }, + { + "epoch": 0.13083217744719772, + "grad_norm": 0.4548238515853882, + "learning_rate": 4.867968310660671e-05, + "loss": 1.6209, + "num_input_tokens_seen": 106507300, + "step": 2700 + }, + { + "epoch": 0.1313167410673725, + "grad_norm": 0.4506012797355652, + "learning_rate": 4.8667071537388535e-05, + "loss": 1.6299, + "num_input_tokens_seen": 106889452, + "step": 2710 + }, + { + "epoch": 0.13180130468754733, + "grad_norm": 0.42806220054626465, + "learning_rate": 4.8654401671171014e-05, + "loss": 1.5991, + "num_input_tokens_seen": 107252004, + "step": 2720 + }, + { + "epoch": 0.13228586830772213, + "grad_norm": 0.4180905222892761, + "learning_rate": 4.86416735391627e-05, + "loss": 1.5844, + "num_input_tokens_seen": 107629528, + "step": 2730 + }, + { + "epoch": 0.13277043192789692, + "grad_norm": 0.4253162145614624, + "learning_rate": 4.862888717271568e-05, + "loss": 1.5904, + "num_input_tokens_seen": 107996228, + "step": 2740 + }, + { + "epoch": 0.13325499554807174, + "grad_norm": 0.44208529591560364, + "learning_rate": 4.861604260332547e-05, + "loss": 1.5569, + "num_input_tokens_seen": 108380192, + "step": 2750 + }, + { + "epoch": 0.13373955916824654, + "grad_norm": 0.40524107217788696, + "learning_rate": 4.8603139862630966e-05, + "loss": 1.6175, + "num_input_tokens_seen": 108779284, + "step": 2760 + }, + { + "epoch": 0.13422412278842136, + "grad_norm": 0.5582908391952515, + "learning_rate": 4.8590178982414346e-05, + "loss": 1.5905, + "num_input_tokens_seen": 109193100, + "step": 2770 + }, + { + "epoch": 0.13470868640859615, + "grad_norm": 0.4604218900203705, + "learning_rate": 4.8577159994600995e-05, + "loss": 1.5554, + "num_input_tokens_seen": 109569728, + "step": 2780 + }, + { + "epoch": 0.13519325002877097, + "grad_norm": 0.41278553009033203, + "learning_rate": 4.8564082931259426e-05, + "loss": 1.5784, + "num_input_tokens_seen": 109984452, + "step": 2790 + }, + { + "epoch": 0.13567781364894577, + "grad_norm": 0.4047918915748596, + "learning_rate": 4.8550947824601216e-05, + "loss": 1.5317, + "num_input_tokens_seen": 110389128, + "step": 2800 + }, + { + "epoch": 0.1361623772691206, + "grad_norm": 0.4299299716949463, + "learning_rate": 4.853775470698091e-05, + "loss": 1.5954, + "num_input_tokens_seen": 110788072, + "step": 2810 + }, + { + "epoch": 0.13664694088929538, + "grad_norm": 0.4523586332798004, + "learning_rate": 4.8524503610895944e-05, + "loss": 1.5679, + "num_input_tokens_seen": 111191500, + "step": 2820 + }, + { + "epoch": 0.1371315045094702, + "grad_norm": 0.4595218002796173, + "learning_rate": 4.8511194568986563e-05, + "loss": 1.5974, + "num_input_tokens_seen": 111580564, + "step": 2830 + }, + { + "epoch": 0.137616068129645, + "grad_norm": 0.4451237916946411, + "learning_rate": 4.8497827614035755e-05, + "loss": 1.5807, + "num_input_tokens_seen": 111974908, + "step": 2840 + }, + { + "epoch": 0.1381006317498198, + "grad_norm": 0.4401971995830536, + "learning_rate": 4.848440277896915e-05, + "loss": 1.5301, + "num_input_tokens_seen": 112379284, + "step": 2850 + }, + { + "epoch": 0.1385851953699946, + "grad_norm": 0.4478752017021179, + "learning_rate": 4.847092009685496e-05, + "loss": 1.5541, + "num_input_tokens_seen": 112797012, + "step": 2860 + }, + { + "epoch": 0.1390697589901694, + "grad_norm": 0.47638776898384094, + "learning_rate": 4.8457379600903886e-05, + "loss": 1.5861, + "num_input_tokens_seen": 113177816, + "step": 2870 + }, + { + "epoch": 0.13955432261034423, + "grad_norm": 0.4315278232097626, + "learning_rate": 4.844378132446903e-05, + "loss": 1.527, + "num_input_tokens_seen": 113585668, + "step": 2880 + }, + { + "epoch": 0.14003888623051902, + "grad_norm": 0.45071589946746826, + "learning_rate": 4.843012530104581e-05, + "loss": 1.5542, + "num_input_tokens_seen": 114012516, + "step": 2890 + }, + { + "epoch": 0.14052344985069384, + "grad_norm": 0.4602771997451782, + "learning_rate": 4.841641156427189e-05, + "loss": 1.543, + "num_input_tokens_seen": 114413536, + "step": 2900 + }, + { + "epoch": 0.14100801347086864, + "grad_norm": 0.4230845272541046, + "learning_rate": 4.8402640147927134e-05, + "loss": 1.5379, + "num_input_tokens_seen": 114815632, + "step": 2910 + }, + { + "epoch": 0.14149257709104346, + "grad_norm": 0.4051437973976135, + "learning_rate": 4.838881108593342e-05, + "loss": 1.5231, + "num_input_tokens_seen": 115202344, + "step": 2920 + }, + { + "epoch": 0.14197714071121825, + "grad_norm": 0.5333889126777649, + "learning_rate": 4.837492441235467e-05, + "loss": 1.5378, + "num_input_tokens_seen": 115622044, + "step": 2930 + }, + { + "epoch": 0.14246170433139305, + "grad_norm": 0.4066469967365265, + "learning_rate": 4.8360980161396685e-05, + "loss": 1.584, + "num_input_tokens_seen": 115983972, + "step": 2940 + }, + { + "epoch": 0.14294626795156787, + "grad_norm": 0.4050688147544861, + "learning_rate": 4.834697836740712e-05, + "loss": 1.5642, + "num_input_tokens_seen": 116370664, + "step": 2950 + }, + { + "epoch": 0.14343083157174266, + "grad_norm": 0.4575314521789551, + "learning_rate": 4.833291906487533e-05, + "loss": 1.512, + "num_input_tokens_seen": 116802032, + "step": 2960 + }, + { + "epoch": 0.14391539519191748, + "grad_norm": 0.43021175265312195, + "learning_rate": 4.831880228843235e-05, + "loss": 1.5795, + "num_input_tokens_seen": 117193496, + "step": 2970 + }, + { + "epoch": 0.14439995881209228, + "grad_norm": 0.4586018919944763, + "learning_rate": 4.83046280728508e-05, + "loss": 1.556, + "num_input_tokens_seen": 117616412, + "step": 2980 + }, + { + "epoch": 0.1448845224322671, + "grad_norm": 0.47244206070899963, + "learning_rate": 4.8290396453044764e-05, + "loss": 1.5146, + "num_input_tokens_seen": 117972728, + "step": 2990 + }, + { + "epoch": 0.1453690860524419, + "grad_norm": 0.3958747088909149, + "learning_rate": 4.827610746406972e-05, + "loss": 1.5349, + "num_input_tokens_seen": 118356720, + "step": 3000 + }, + { + "epoch": 0.14585364967261671, + "grad_norm": 0.45024189352989197, + "learning_rate": 4.826176114112247e-05, + "loss": 1.6121, + "num_input_tokens_seen": 118769664, + "step": 3010 + }, + { + "epoch": 0.1463382132927915, + "grad_norm": 0.44841715693473816, + "learning_rate": 4.824735751954106e-05, + "loss": 1.5356, + "num_input_tokens_seen": 119180436, + "step": 3020 + }, + { + "epoch": 0.14682277691296633, + "grad_norm": 0.4641439616680145, + "learning_rate": 4.8232896634804634e-05, + "loss": 1.6029, + "num_input_tokens_seen": 119553244, + "step": 3030 + }, + { + "epoch": 0.14730734053314112, + "grad_norm": 0.4200107455253601, + "learning_rate": 4.8218378522533404e-05, + "loss": 1.5575, + "num_input_tokens_seen": 119933780, + "step": 3040 + }, + { + "epoch": 0.14779190415331592, + "grad_norm": 0.47531604766845703, + "learning_rate": 4.8203803218488567e-05, + "loss": 1.5486, + "num_input_tokens_seen": 120345892, + "step": 3050 + }, + { + "epoch": 0.14827646777349074, + "grad_norm": 0.42880284786224365, + "learning_rate": 4.8189170758572154e-05, + "loss": 1.5223, + "num_input_tokens_seen": 120711848, + "step": 3060 + }, + { + "epoch": 0.14876103139366553, + "grad_norm": 0.4171142280101776, + "learning_rate": 4.817448117882703e-05, + "loss": 1.5771, + "num_input_tokens_seen": 121113568, + "step": 3070 + }, + { + "epoch": 0.14924559501384035, + "grad_norm": 0.43676143884658813, + "learning_rate": 4.815973451543672e-05, + "loss": 1.5337, + "num_input_tokens_seen": 121540304, + "step": 3080 + }, + { + "epoch": 0.14973015863401515, + "grad_norm": 0.4129098057746887, + "learning_rate": 4.814493080472538e-05, + "loss": 1.5162, + "num_input_tokens_seen": 121912956, + "step": 3090 + }, + { + "epoch": 0.15021472225418997, + "grad_norm": 0.4339870810508728, + "learning_rate": 4.8130070083157676e-05, + "loss": 1.5347, + "num_input_tokens_seen": 122302884, + "step": 3100 + }, + { + "epoch": 0.15069928587436476, + "grad_norm": 0.4579829275608063, + "learning_rate": 4.8115152387338705e-05, + "loss": 1.5682, + "num_input_tokens_seen": 122729188, + "step": 3110 + }, + { + "epoch": 0.15118384949453958, + "grad_norm": 0.44452232122421265, + "learning_rate": 4.81001777540139e-05, + "loss": 1.5084, + "num_input_tokens_seen": 123099424, + "step": 3120 + }, + { + "epoch": 0.15166841311471438, + "grad_norm": 0.4334086775779724, + "learning_rate": 4.8085146220068955e-05, + "loss": 1.5089, + "num_input_tokens_seen": 123501724, + "step": 3130 + }, + { + "epoch": 0.15215297673488917, + "grad_norm": 0.4431312680244446, + "learning_rate": 4.8070057822529715e-05, + "loss": 1.4841, + "num_input_tokens_seen": 123897884, + "step": 3140 + }, + { + "epoch": 0.152637540355064, + "grad_norm": 0.42815452814102173, + "learning_rate": 4.8054912598562086e-05, + "loss": 1.5234, + "num_input_tokens_seen": 124295448, + "step": 3150 + }, + { + "epoch": 0.1531221039752388, + "grad_norm": 0.44743165373802185, + "learning_rate": 4.8039710585471966e-05, + "loss": 1.5925, + "num_input_tokens_seen": 124680420, + "step": 3160 + }, + { + "epoch": 0.1536066675954136, + "grad_norm": 0.41514381766319275, + "learning_rate": 4.802445182070511e-05, + "loss": 1.5505, + "num_input_tokens_seen": 125076112, + "step": 3170 + }, + { + "epoch": 0.1540912312155884, + "grad_norm": 0.4845329225063324, + "learning_rate": 4.8009136341847094e-05, + "loss": 1.5366, + "num_input_tokens_seen": 125473256, + "step": 3180 + }, + { + "epoch": 0.15457579483576322, + "grad_norm": 0.4526010751724243, + "learning_rate": 4.799376418662318e-05, + "loss": 1.5357, + "num_input_tokens_seen": 125882064, + "step": 3190 + }, + { + "epoch": 0.15506035845593802, + "grad_norm": 0.42789971828460693, + "learning_rate": 4.7978335392898235e-05, + "loss": 1.4749, + "num_input_tokens_seen": 126280000, + "step": 3200 + }, + { + "epoch": 0.15554492207611284, + "grad_norm": 0.4213913083076477, + "learning_rate": 4.796284999867663e-05, + "loss": 1.5433, + "num_input_tokens_seen": 126699892, + "step": 3210 + }, + { + "epoch": 0.15602948569628763, + "grad_norm": 0.39674410223960876, + "learning_rate": 4.794730804210217e-05, + "loss": 1.5379, + "num_input_tokens_seen": 127115872, + "step": 3220 + }, + { + "epoch": 0.15651404931646246, + "grad_norm": 0.45900386571884155, + "learning_rate": 4.793170956145798e-05, + "loss": 1.5696, + "num_input_tokens_seen": 127499596, + "step": 3230 + }, + { + "epoch": 0.15699861293663725, + "grad_norm": 0.43314629793167114, + "learning_rate": 4.791605459516641e-05, + "loss": 1.5543, + "num_input_tokens_seen": 127906492, + "step": 3240 + }, + { + "epoch": 0.15748317655681204, + "grad_norm": 0.408204585313797, + "learning_rate": 4.7900343181788955e-05, + "loss": 1.5537, + "num_input_tokens_seen": 128323396, + "step": 3250 + }, + { + "epoch": 0.15796774017698686, + "grad_norm": 0.45687878131866455, + "learning_rate": 4.788457536002614e-05, + "loss": 1.5473, + "num_input_tokens_seen": 128719820, + "step": 3260 + }, + { + "epoch": 0.15845230379716166, + "grad_norm": 0.423566997051239, + "learning_rate": 4.7868751168717454e-05, + "loss": 1.6038, + "num_input_tokens_seen": 129126672, + "step": 3270 + }, + { + "epoch": 0.15893686741733648, + "grad_norm": 0.4187241196632385, + "learning_rate": 4.785287064684122e-05, + "loss": 1.5793, + "num_input_tokens_seen": 129537864, + "step": 3280 + }, + { + "epoch": 0.15942143103751127, + "grad_norm": 0.45594966411590576, + "learning_rate": 4.783693383351452e-05, + "loss": 1.5753, + "num_input_tokens_seen": 129948744, + "step": 3290 + }, + { + "epoch": 0.1599059946576861, + "grad_norm": 0.4248170554637909, + "learning_rate": 4.782094076799308e-05, + "loss": 1.5443, + "num_input_tokens_seen": 130350392, + "step": 3300 + }, + { + "epoch": 0.1603905582778609, + "grad_norm": 0.491043359041214, + "learning_rate": 4.780489148967122e-05, + "loss": 1.5265, + "num_input_tokens_seen": 130744872, + "step": 3310 + }, + { + "epoch": 0.1608751218980357, + "grad_norm": 0.44143855571746826, + "learning_rate": 4.77887860380817e-05, + "loss": 1.4956, + "num_input_tokens_seen": 131164480, + "step": 3320 + }, + { + "epoch": 0.1613596855182105, + "grad_norm": 0.4861333668231964, + "learning_rate": 4.777262445289565e-05, + "loss": 1.5403, + "num_input_tokens_seen": 131523208, + "step": 3330 + }, + { + "epoch": 0.16184424913838533, + "grad_norm": 0.44125744700431824, + "learning_rate": 4.775640677392246e-05, + "loss": 1.5067, + "num_input_tokens_seen": 131941532, + "step": 3340 + }, + { + "epoch": 0.16232881275856012, + "grad_norm": 0.43593043088912964, + "learning_rate": 4.774013304110972e-05, + "loss": 1.5443, + "num_input_tokens_seen": 132309260, + "step": 3350 + }, + { + "epoch": 0.16281337637873491, + "grad_norm": 0.37986883521080017, + "learning_rate": 4.7723803294543056e-05, + "loss": 1.5141, + "num_input_tokens_seen": 132674688, + "step": 3360 + }, + { + "epoch": 0.16329793999890974, + "grad_norm": 0.4436347484588623, + "learning_rate": 4.7707417574446086e-05, + "loss": 1.5817, + "num_input_tokens_seen": 133074724, + "step": 3370 + }, + { + "epoch": 0.16378250361908453, + "grad_norm": 0.48060745000839233, + "learning_rate": 4.769097592118033e-05, + "loss": 1.5664, + "num_input_tokens_seen": 133457192, + "step": 3380 + }, + { + "epoch": 0.16426706723925935, + "grad_norm": 0.4321390688419342, + "learning_rate": 4.7674478375245013e-05, + "loss": 1.5077, + "num_input_tokens_seen": 133844504, + "step": 3390 + }, + { + "epoch": 0.16475163085943414, + "grad_norm": 0.4549488425254822, + "learning_rate": 4.76579249772771e-05, + "loss": 1.5348, + "num_input_tokens_seen": 134258676, + "step": 3400 + }, + { + "epoch": 0.16523619447960897, + "grad_norm": 0.47234755754470825, + "learning_rate": 4.764131576805111e-05, + "loss": 1.55, + "num_input_tokens_seen": 134615472, + "step": 3410 + }, + { + "epoch": 0.16572075809978376, + "grad_norm": 0.4680884778499603, + "learning_rate": 4.762465078847903e-05, + "loss": 1.4957, + "num_input_tokens_seen": 135018208, + "step": 3420 + }, + { + "epoch": 0.16620532171995858, + "grad_norm": 0.4551864564418793, + "learning_rate": 4.760793007961023e-05, + "loss": 1.4933, + "num_input_tokens_seen": 135419696, + "step": 3430 + }, + { + "epoch": 0.16668988534013338, + "grad_norm": 0.4141683578491211, + "learning_rate": 4.759115368263135e-05, + "loss": 1.5556, + "num_input_tokens_seen": 135800204, + "step": 3440 + }, + { + "epoch": 0.16717444896030817, + "grad_norm": 0.3936518430709839, + "learning_rate": 4.75743216388662e-05, + "loss": 1.5483, + "num_input_tokens_seen": 136190572, + "step": 3450 + }, + { + "epoch": 0.167659012580483, + "grad_norm": 0.41701188683509827, + "learning_rate": 4.7557433989775654e-05, + "loss": 1.5062, + "num_input_tokens_seen": 136592572, + "step": 3460 + }, + { + "epoch": 0.16814357620065778, + "grad_norm": 0.41708120703697205, + "learning_rate": 4.754049077695758e-05, + "loss": 1.557, + "num_input_tokens_seen": 136951096, + "step": 3470 + }, + { + "epoch": 0.1686281398208326, + "grad_norm": 0.5057613253593445, + "learning_rate": 4.752349204214668e-05, + "loss": 1.5247, + "num_input_tokens_seen": 137320496, + "step": 3480 + }, + { + "epoch": 0.1691127034410074, + "grad_norm": 0.4021996855735779, + "learning_rate": 4.750643782721442e-05, + "loss": 1.548, + "num_input_tokens_seen": 137671948, + "step": 3490 + }, + { + "epoch": 0.16959726706118222, + "grad_norm": 0.4202756881713867, + "learning_rate": 4.7489328174168964e-05, + "loss": 1.529, + "num_input_tokens_seen": 138105600, + "step": 3500 + }, + { + "epoch": 0.17008183068135702, + "grad_norm": 0.4259394407272339, + "learning_rate": 4.747216312515498e-05, + "loss": 1.5241, + "num_input_tokens_seen": 138487120, + "step": 3510 + }, + { + "epoch": 0.17056639430153184, + "grad_norm": 0.3875693678855896, + "learning_rate": 4.745494272245361e-05, + "loss": 1.524, + "num_input_tokens_seen": 138885020, + "step": 3520 + }, + { + "epoch": 0.17105095792170663, + "grad_norm": 0.3988937735557556, + "learning_rate": 4.743766700848237e-05, + "loss": 1.5126, + "num_input_tokens_seen": 139288928, + "step": 3530 + }, + { + "epoch": 0.17153552154188145, + "grad_norm": 0.41776183247566223, + "learning_rate": 4.742033602579497e-05, + "loss": 1.5061, + "num_input_tokens_seen": 139667980, + "step": 3540 + }, + { + "epoch": 0.17202008516205625, + "grad_norm": 0.4490506649017334, + "learning_rate": 4.740294981708129e-05, + "loss": 1.5089, + "num_input_tokens_seen": 140092556, + "step": 3550 + }, + { + "epoch": 0.17250464878223104, + "grad_norm": 0.44959497451782227, + "learning_rate": 4.738550842516724e-05, + "loss": 1.5266, + "num_input_tokens_seen": 140485860, + "step": 3560 + }, + { + "epoch": 0.17298921240240586, + "grad_norm": 0.42410293221473694, + "learning_rate": 4.736801189301466e-05, + "loss": 1.5103, + "num_input_tokens_seen": 140883128, + "step": 3570 + }, + { + "epoch": 0.17347377602258066, + "grad_norm": 0.43540823459625244, + "learning_rate": 4.73504602637212e-05, + "loss": 1.5119, + "num_input_tokens_seen": 141262836, + "step": 3580 + }, + { + "epoch": 0.17395833964275548, + "grad_norm": 0.3838653862476349, + "learning_rate": 4.733285358052022e-05, + "loss": 1.4833, + "num_input_tokens_seen": 141659460, + "step": 3590 + }, + { + "epoch": 0.17444290326293027, + "grad_norm": 0.40834611654281616, + "learning_rate": 4.7315191886780727e-05, + "loss": 1.5645, + "num_input_tokens_seen": 142062380, + "step": 3600 + }, + { + "epoch": 0.1749274668831051, + "grad_norm": 0.39526131749153137, + "learning_rate": 4.729747522600719e-05, + "loss": 1.5083, + "num_input_tokens_seen": 142433008, + "step": 3610 + }, + { + "epoch": 0.1754120305032799, + "grad_norm": 0.48167720437049866, + "learning_rate": 4.727970364183949e-05, + "loss": 1.5694, + "num_input_tokens_seen": 142814724, + "step": 3620 + }, + { + "epoch": 0.1758965941234547, + "grad_norm": 0.40276476740837097, + "learning_rate": 4.72618771780528e-05, + "loss": 1.4578, + "num_input_tokens_seen": 143208888, + "step": 3630 + }, + { + "epoch": 0.1763811577436295, + "grad_norm": 0.46488335728645325, + "learning_rate": 4.724399587855747e-05, + "loss": 1.5344, + "num_input_tokens_seen": 143612984, + "step": 3640 + }, + { + "epoch": 0.1768657213638043, + "grad_norm": 0.4321140944957733, + "learning_rate": 4.7226059787398914e-05, + "loss": 1.5565, + "num_input_tokens_seen": 144004420, + "step": 3650 + }, + { + "epoch": 0.17735028498397912, + "grad_norm": 0.42687898874282837, + "learning_rate": 4.720806894875751e-05, + "loss": 1.4911, + "num_input_tokens_seen": 144385500, + "step": 3660 + }, + { + "epoch": 0.1778348486041539, + "grad_norm": 0.39949852228164673, + "learning_rate": 4.7190023406948506e-05, + "loss": 1.4925, + "num_input_tokens_seen": 144786456, + "step": 3670 + }, + { + "epoch": 0.17831941222432873, + "grad_norm": 0.45870667695999146, + "learning_rate": 4.7171923206421886e-05, + "loss": 1.5279, + "num_input_tokens_seen": 145150804, + "step": 3680 + }, + { + "epoch": 0.17880397584450353, + "grad_norm": 0.3944126069545746, + "learning_rate": 4.715376839176226e-05, + "loss": 1.5185, + "num_input_tokens_seen": 145542392, + "step": 3690 + }, + { + "epoch": 0.17928853946467835, + "grad_norm": 0.41185203194618225, + "learning_rate": 4.713555900768879e-05, + "loss": 1.4337, + "num_input_tokens_seen": 145911624, + "step": 3700 + }, + { + "epoch": 0.17977310308485314, + "grad_norm": 0.39053046703338623, + "learning_rate": 4.711729509905501e-05, + "loss": 1.5279, + "num_input_tokens_seen": 146287476, + "step": 3710 + }, + { + "epoch": 0.18025766670502796, + "grad_norm": 0.41288021206855774, + "learning_rate": 4.709897671084881e-05, + "loss": 1.5093, + "num_input_tokens_seen": 146688592, + "step": 3720 + }, + { + "epoch": 0.18074223032520276, + "grad_norm": 0.400301992893219, + "learning_rate": 4.7080603888192256e-05, + "loss": 1.4321, + "num_input_tokens_seen": 147100860, + "step": 3730 + }, + { + "epoch": 0.18122679394537758, + "grad_norm": 0.39671310782432556, + "learning_rate": 4.7062176676341476e-05, + "loss": 1.4853, + "num_input_tokens_seen": 147506564, + "step": 3740 + }, + { + "epoch": 0.18171135756555237, + "grad_norm": 0.4283420741558075, + "learning_rate": 4.7043695120686594e-05, + "loss": 1.4895, + "num_input_tokens_seen": 147906624, + "step": 3750 + }, + { + "epoch": 0.18219592118572717, + "grad_norm": 0.44357410073280334, + "learning_rate": 4.7025159266751586e-05, + "loss": 1.5736, + "num_input_tokens_seen": 148321696, + "step": 3760 + }, + { + "epoch": 0.182680484805902, + "grad_norm": 0.4820682406425476, + "learning_rate": 4.7006569160194185e-05, + "loss": 1.4884, + "num_input_tokens_seen": 148710300, + "step": 3770 + }, + { + "epoch": 0.18316504842607678, + "grad_norm": 0.46169939637184143, + "learning_rate": 4.698792484680574e-05, + "loss": 1.4714, + "num_input_tokens_seen": 149111440, + "step": 3780 + }, + { + "epoch": 0.1836496120462516, + "grad_norm": 0.42826372385025024, + "learning_rate": 4.6969226372511153e-05, + "loss": 1.5092, + "num_input_tokens_seen": 149498596, + "step": 3790 + }, + { + "epoch": 0.1841341756664264, + "grad_norm": 0.4284881055355072, + "learning_rate": 4.695047378336871e-05, + "loss": 1.5269, + "num_input_tokens_seen": 149928100, + "step": 3800 + }, + { + "epoch": 0.18461873928660122, + "grad_norm": 0.41511744260787964, + "learning_rate": 4.693166712556999e-05, + "loss": 1.5135, + "num_input_tokens_seen": 150349972, + "step": 3810 + }, + { + "epoch": 0.185103302906776, + "grad_norm": 0.43388912081718445, + "learning_rate": 4.6912806445439786e-05, + "loss": 1.5093, + "num_input_tokens_seen": 150745692, + "step": 3820 + }, + { + "epoch": 0.18558786652695083, + "grad_norm": 0.4395964741706848, + "learning_rate": 4.689389178943593e-05, + "loss": 1.5737, + "num_input_tokens_seen": 151156868, + "step": 3830 + }, + { + "epoch": 0.18607243014712563, + "grad_norm": 0.4370187819004059, + "learning_rate": 4.6874923204149215e-05, + "loss": 1.5126, + "num_input_tokens_seen": 151565152, + "step": 3840 + }, + { + "epoch": 0.18655699376730042, + "grad_norm": 0.4095444083213806, + "learning_rate": 4.685590073630327e-05, + "loss": 1.4713, + "num_input_tokens_seen": 151965476, + "step": 3850 + }, + { + "epoch": 0.18704155738747524, + "grad_norm": 0.3944699764251709, + "learning_rate": 4.683682443275447e-05, + "loss": 1.5337, + "num_input_tokens_seen": 152338220, + "step": 3860 + }, + { + "epoch": 0.18752612100765004, + "grad_norm": 0.4351947009563446, + "learning_rate": 4.681769434049177e-05, + "loss": 1.491, + "num_input_tokens_seen": 152724792, + "step": 3870 + }, + { + "epoch": 0.18801068462782486, + "grad_norm": 0.5131449699401855, + "learning_rate": 4.6798510506636626e-05, + "loss": 1.5185, + "num_input_tokens_seen": 153099052, + "step": 3880 + }, + { + "epoch": 0.18849524824799965, + "grad_norm": 0.43832552433013916, + "learning_rate": 4.6779272978442904e-05, + "loss": 1.5165, + "num_input_tokens_seen": 153488160, + "step": 3890 + }, + { + "epoch": 0.18897981186817447, + "grad_norm": 0.4259849488735199, + "learning_rate": 4.6759981803296666e-05, + "loss": 1.4512, + "num_input_tokens_seen": 153864880, + "step": 3900 + }, + { + "epoch": 0.18946437548834927, + "grad_norm": 0.46592122316360474, + "learning_rate": 4.674063702871617e-05, + "loss": 1.4473, + "num_input_tokens_seen": 154258796, + "step": 3910 + }, + { + "epoch": 0.1899489391085241, + "grad_norm": 0.4338749349117279, + "learning_rate": 4.672123870235169e-05, + "loss": 1.5149, + "num_input_tokens_seen": 154659628, + "step": 3920 + }, + { + "epoch": 0.19043350272869888, + "grad_norm": 0.4241959750652313, + "learning_rate": 4.6701786871985395e-05, + "loss": 1.546, + "num_input_tokens_seen": 155065544, + "step": 3930 + }, + { + "epoch": 0.1909180663488737, + "grad_norm": 0.39792391657829285, + "learning_rate": 4.6682281585531264e-05, + "loss": 1.5081, + "num_input_tokens_seen": 155439252, + "step": 3940 + }, + { + "epoch": 0.1914026299690485, + "grad_norm": 0.4416780173778534, + "learning_rate": 4.666272289103492e-05, + "loss": 1.5018, + "num_input_tokens_seen": 155848612, + "step": 3950 + }, + { + "epoch": 0.1918871935892233, + "grad_norm": 0.42491331696510315, + "learning_rate": 4.664311083667359e-05, + "loss": 1.5042, + "num_input_tokens_seen": 156231544, + "step": 3960 + }, + { + "epoch": 0.19237175720939811, + "grad_norm": 0.4376972019672394, + "learning_rate": 4.6623445470755875e-05, + "loss": 1.5095, + "num_input_tokens_seen": 156622212, + "step": 3970 + }, + { + "epoch": 0.1928563208295729, + "grad_norm": 0.4348180890083313, + "learning_rate": 4.660372684172176e-05, + "loss": 1.4884, + "num_input_tokens_seen": 157011628, + "step": 3980 + }, + { + "epoch": 0.19334088444974773, + "grad_norm": 0.4411153793334961, + "learning_rate": 4.658395499814238e-05, + "loss": 1.5275, + "num_input_tokens_seen": 157391712, + "step": 3990 + }, + { + "epoch": 0.19382544806992252, + "grad_norm": 0.4086403548717499, + "learning_rate": 4.656412998871996e-05, + "loss": 1.4957, + "num_input_tokens_seen": 157778628, + "step": 4000 + }, + { + "epoch": 0.19382544806992252, + "eval_loss": 1.6085283756256104, + "eval_runtime": 3.8392, + "eval_samples_per_second": 39.07, + "eval_steps_per_second": 4.949, + "num_input_tokens_seen": 157778628, + "step": 4000 + }, + { + "epoch": 0.19431001169009735, + "grad_norm": 0.4232029914855957, + "learning_rate": 4.65442518622877e-05, + "loss": 1.4919, + "num_input_tokens_seen": 158177392, + "step": 4010 + }, + { + "epoch": 0.19479457531027214, + "grad_norm": 0.4746312201023102, + "learning_rate": 4.652432066780962e-05, + "loss": 1.4801, + "num_input_tokens_seen": 158587736, + "step": 4020 + }, + { + "epoch": 0.19527913893044696, + "grad_norm": 0.4055451452732086, + "learning_rate": 4.650433645438047e-05, + "loss": 1.5079, + "num_input_tokens_seen": 158997624, + "step": 4030 + }, + { + "epoch": 0.19576370255062175, + "grad_norm": 0.47638824582099915, + "learning_rate": 4.648429927122558e-05, + "loss": 1.5269, + "num_input_tokens_seen": 159406580, + "step": 4040 + }, + { + "epoch": 0.19624826617079658, + "grad_norm": 0.426217257976532, + "learning_rate": 4.646420916770078e-05, + "loss": 1.4881, + "num_input_tokens_seen": 159776464, + "step": 4050 + }, + { + "epoch": 0.19673282979097137, + "grad_norm": 0.3979707360267639, + "learning_rate": 4.644406619329223e-05, + "loss": 1.5584, + "num_input_tokens_seen": 160136328, + "step": 4060 + }, + { + "epoch": 0.19721739341114616, + "grad_norm": 0.37950029969215393, + "learning_rate": 4.642387039761635e-05, + "loss": 1.5176, + "num_input_tokens_seen": 160532608, + "step": 4070 + }, + { + "epoch": 0.19770195703132099, + "grad_norm": 0.4001169502735138, + "learning_rate": 4.6403621830419644e-05, + "loss": 1.4584, + "num_input_tokens_seen": 160930792, + "step": 4080 + }, + { + "epoch": 0.19818652065149578, + "grad_norm": 0.4448148012161255, + "learning_rate": 4.6383320541578604e-05, + "loss": 1.5309, + "num_input_tokens_seen": 161311248, + "step": 4090 + }, + { + "epoch": 0.1986710842716706, + "grad_norm": 0.4266856014728546, + "learning_rate": 4.63629665810996e-05, + "loss": 1.4883, + "num_input_tokens_seen": 161718084, + "step": 4100 + }, + { + "epoch": 0.1991556478918454, + "grad_norm": 0.4387410581111908, + "learning_rate": 4.6342559999118736e-05, + "loss": 1.4552, + "num_input_tokens_seen": 162101052, + "step": 4110 + }, + { + "epoch": 0.19964021151202022, + "grad_norm": 0.3797076344490051, + "learning_rate": 4.632210084590175e-05, + "loss": 1.5668, + "num_input_tokens_seen": 162513632, + "step": 4120 + }, + { + "epoch": 0.200124775132195, + "grad_norm": 0.4161923825740814, + "learning_rate": 4.630158917184385e-05, + "loss": 1.5182, + "num_input_tokens_seen": 162905012, + "step": 4130 + }, + { + "epoch": 0.20060933875236983, + "grad_norm": 0.41969460248947144, + "learning_rate": 4.6281025027469625e-05, + "loss": 1.4945, + "num_input_tokens_seen": 163281652, + "step": 4140 + }, + { + "epoch": 0.20109390237254463, + "grad_norm": 0.4282332956790924, + "learning_rate": 4.626040846343291e-05, + "loss": 1.5109, + "num_input_tokens_seen": 163652028, + "step": 4150 + }, + { + "epoch": 0.20157846599271942, + "grad_norm": 0.42534980177879333, + "learning_rate": 4.623973953051667e-05, + "loss": 1.4954, + "num_input_tokens_seen": 164028336, + "step": 4160 + }, + { + "epoch": 0.20206302961289424, + "grad_norm": 0.4472944736480713, + "learning_rate": 4.621901827963283e-05, + "loss": 1.4857, + "num_input_tokens_seen": 164392624, + "step": 4170 + }, + { + "epoch": 0.20254759323306903, + "grad_norm": 0.42009228467941284, + "learning_rate": 4.619824476182223e-05, + "loss": 1.5229, + "num_input_tokens_seen": 164753004, + "step": 4180 + }, + { + "epoch": 0.20303215685324386, + "grad_norm": 0.5307909250259399, + "learning_rate": 4.617741902825443e-05, + "loss": 1.4997, + "num_input_tokens_seen": 165122832, + "step": 4190 + }, + { + "epoch": 0.20351672047341865, + "grad_norm": 0.42677533626556396, + "learning_rate": 4.615654113022761e-05, + "loss": 1.4825, + "num_input_tokens_seen": 165469868, + "step": 4200 + }, + { + "epoch": 0.20400128409359347, + "grad_norm": 0.38291576504707336, + "learning_rate": 4.6135611119168465e-05, + "loss": 1.5681, + "num_input_tokens_seen": 165869628, + "step": 4210 + }, + { + "epoch": 0.20448584771376827, + "grad_norm": 0.40809643268585205, + "learning_rate": 4.611462904663201e-05, + "loss": 1.4854, + "num_input_tokens_seen": 166262208, + "step": 4220 + }, + { + "epoch": 0.2049704113339431, + "grad_norm": 0.43991562724113464, + "learning_rate": 4.6093594964301534e-05, + "loss": 1.4577, + "num_input_tokens_seen": 166640892, + "step": 4230 + }, + { + "epoch": 0.20545497495411788, + "grad_norm": 0.432851642370224, + "learning_rate": 4.607250892398843e-05, + "loss": 1.4723, + "num_input_tokens_seen": 167080484, + "step": 4240 + }, + { + "epoch": 0.2059395385742927, + "grad_norm": 0.41228345036506653, + "learning_rate": 4.605137097763207e-05, + "loss": 1.4305, + "num_input_tokens_seen": 167486976, + "step": 4250 + }, + { + "epoch": 0.2064241021944675, + "grad_norm": 0.4661470949649811, + "learning_rate": 4.603018117729968e-05, + "loss": 1.4348, + "num_input_tokens_seen": 167871328, + "step": 4260 + }, + { + "epoch": 0.2069086658146423, + "grad_norm": 0.39085134863853455, + "learning_rate": 4.600893957518622e-05, + "loss": 1.4937, + "num_input_tokens_seen": 168255252, + "step": 4270 + }, + { + "epoch": 0.2073932294348171, + "grad_norm": 0.3891197443008423, + "learning_rate": 4.5987646223614244e-05, + "loss": 1.4864, + "num_input_tokens_seen": 168634480, + "step": 4280 + }, + { + "epoch": 0.2078777930549919, + "grad_norm": 0.40408509969711304, + "learning_rate": 4.5966301175033785e-05, + "loss": 1.4056, + "num_input_tokens_seen": 169049616, + "step": 4290 + }, + { + "epoch": 0.20836235667516673, + "grad_norm": 0.4421607553958893, + "learning_rate": 4.59449044820222e-05, + "loss": 1.491, + "num_input_tokens_seen": 169439824, + "step": 4300 + }, + { + "epoch": 0.20884692029534152, + "grad_norm": 0.3921188712120056, + "learning_rate": 4.5923456197284065e-05, + "loss": 1.4783, + "num_input_tokens_seen": 169796656, + "step": 4310 + }, + { + "epoch": 0.20933148391551634, + "grad_norm": 0.4339124262332916, + "learning_rate": 4.590195637365105e-05, + "loss": 1.4668, + "num_input_tokens_seen": 170178796, + "step": 4320 + }, + { + "epoch": 0.20981604753569114, + "grad_norm": 0.41062822937965393, + "learning_rate": 4.588040506408176e-05, + "loss": 1.4628, + "num_input_tokens_seen": 170578188, + "step": 4330 + }, + { + "epoch": 0.21030061115586596, + "grad_norm": 0.40424343943595886, + "learning_rate": 4.5858802321661616e-05, + "loss": 1.4596, + "num_input_tokens_seen": 170964776, + "step": 4340 + }, + { + "epoch": 0.21078517477604075, + "grad_norm": 0.4466400742530823, + "learning_rate": 4.5837148199602745e-05, + "loss": 1.5198, + "num_input_tokens_seen": 171343616, + "step": 4350 + }, + { + "epoch": 0.21126973839621555, + "grad_norm": 0.45615527033805847, + "learning_rate": 4.581544275124383e-05, + "loss": 1.4517, + "num_input_tokens_seen": 171718620, + "step": 4360 + }, + { + "epoch": 0.21175430201639037, + "grad_norm": 0.45989635586738586, + "learning_rate": 4.5793686030049974e-05, + "loss": 1.4469, + "num_input_tokens_seen": 172101080, + "step": 4370 + }, + { + "epoch": 0.21223886563656516, + "grad_norm": 0.3968257009983063, + "learning_rate": 4.577187808961258e-05, + "loss": 1.4241, + "num_input_tokens_seen": 172502216, + "step": 4380 + }, + { + "epoch": 0.21272342925673998, + "grad_norm": 0.4084954559803009, + "learning_rate": 4.5750018983649214e-05, + "loss": 1.4621, + "num_input_tokens_seen": 172905268, + "step": 4390 + }, + { + "epoch": 0.21320799287691478, + "grad_norm": 0.42288362979888916, + "learning_rate": 4.5728108766003474e-05, + "loss": 1.4642, + "num_input_tokens_seen": 173303252, + "step": 4400 + }, + { + "epoch": 0.2136925564970896, + "grad_norm": 0.40454548597335815, + "learning_rate": 4.570614749064486e-05, + "loss": 1.4608, + "num_input_tokens_seen": 173696264, + "step": 4410 + }, + { + "epoch": 0.2141771201172644, + "grad_norm": 0.416332870721817, + "learning_rate": 4.568413521166863e-05, + "loss": 1.4757, + "num_input_tokens_seen": 174067360, + "step": 4420 + }, + { + "epoch": 0.2146616837374392, + "grad_norm": 0.4173552095890045, + "learning_rate": 4.566207198329568e-05, + "loss": 1.4377, + "num_input_tokens_seen": 174474464, + "step": 4430 + }, + { + "epoch": 0.215146247357614, + "grad_norm": 0.4277855455875397, + "learning_rate": 4.563995785987241e-05, + "loss": 1.4673, + "num_input_tokens_seen": 174876064, + "step": 4440 + }, + { + "epoch": 0.21563081097778883, + "grad_norm": 0.45221036672592163, + "learning_rate": 4.561779289587058e-05, + "loss": 1.4808, + "num_input_tokens_seen": 175281952, + "step": 4450 + }, + { + "epoch": 0.21611537459796362, + "grad_norm": 0.3993636667728424, + "learning_rate": 4.559557714588717e-05, + "loss": 1.5006, + "num_input_tokens_seen": 175657752, + "step": 4460 + }, + { + "epoch": 0.21659993821813842, + "grad_norm": 0.42281702160835266, + "learning_rate": 4.557331066464428e-05, + "loss": 1.4489, + "num_input_tokens_seen": 176038544, + "step": 4470 + }, + { + "epoch": 0.21708450183831324, + "grad_norm": 0.4503469467163086, + "learning_rate": 4.555099350698895e-05, + "loss": 1.5062, + "num_input_tokens_seen": 176415768, + "step": 4480 + }, + { + "epoch": 0.21756906545848803, + "grad_norm": 0.4171935021877289, + "learning_rate": 4.5528625727893065e-05, + "loss": 1.4701, + "num_input_tokens_seen": 176823204, + "step": 4490 + }, + { + "epoch": 0.21805362907866285, + "grad_norm": 0.41661641001701355, + "learning_rate": 4.5506207382453184e-05, + "loss": 1.485, + "num_input_tokens_seen": 177218572, + "step": 4500 + }, + { + "epoch": 0.21853819269883765, + "grad_norm": 0.4115769863128662, + "learning_rate": 4.548373852589044e-05, + "loss": 1.4757, + "num_input_tokens_seen": 177591148, + "step": 4510 + }, + { + "epoch": 0.21902275631901247, + "grad_norm": 0.42897987365722656, + "learning_rate": 4.5461219213550365e-05, + "loss": 1.4452, + "num_input_tokens_seen": 177978272, + "step": 4520 + }, + { + "epoch": 0.21950731993918726, + "grad_norm": 0.4455084204673767, + "learning_rate": 4.5438649500902796e-05, + "loss": 1.5184, + "num_input_tokens_seen": 178398380, + "step": 4530 + }, + { + "epoch": 0.21999188355936208, + "grad_norm": 0.3757345974445343, + "learning_rate": 4.5416029443541694e-05, + "loss": 1.4825, + "num_input_tokens_seen": 178807360, + "step": 4540 + }, + { + "epoch": 0.22047644717953688, + "grad_norm": 0.4476206302642822, + "learning_rate": 4.539335909718504e-05, + "loss": 1.5054, + "num_input_tokens_seen": 179185000, + "step": 4550 + }, + { + "epoch": 0.22096101079971167, + "grad_norm": 0.3936362862586975, + "learning_rate": 4.5370638517674716e-05, + "loss": 1.477, + "num_input_tokens_seen": 179593392, + "step": 4560 + }, + { + "epoch": 0.2214455744198865, + "grad_norm": 0.41432103514671326, + "learning_rate": 4.534786776097627e-05, + "loss": 1.4547, + "num_input_tokens_seen": 179985908, + "step": 4570 + }, + { + "epoch": 0.2219301380400613, + "grad_norm": 0.3996790051460266, + "learning_rate": 4.532504688317891e-05, + "loss": 1.4691, + "num_input_tokens_seen": 180414548, + "step": 4580 + }, + { + "epoch": 0.2224147016602361, + "grad_norm": 0.4232152998447418, + "learning_rate": 4.530217594049529e-05, + "loss": 1.4798, + "num_input_tokens_seen": 180809424, + "step": 4590 + }, + { + "epoch": 0.2228992652804109, + "grad_norm": 0.4331175684928894, + "learning_rate": 4.527925498926136e-05, + "loss": 1.4674, + "num_input_tokens_seen": 181195556, + "step": 4600 + }, + { + "epoch": 0.22338382890058572, + "grad_norm": 0.4475395083427429, + "learning_rate": 4.5256284085936275e-05, + "loss": 1.4507, + "num_input_tokens_seen": 181583096, + "step": 4610 + }, + { + "epoch": 0.22386839252076052, + "grad_norm": 0.42866751551628113, + "learning_rate": 4.523326328710222e-05, + "loss": 1.4855, + "num_input_tokens_seen": 181999512, + "step": 4620 + }, + { + "epoch": 0.22435295614093534, + "grad_norm": 0.41574034094810486, + "learning_rate": 4.5210192649464296e-05, + "loss": 1.464, + "num_input_tokens_seen": 182381672, + "step": 4630 + }, + { + "epoch": 0.22483751976111013, + "grad_norm": 0.39740654826164246, + "learning_rate": 4.518707222985035e-05, + "loss": 1.4681, + "num_input_tokens_seen": 182796928, + "step": 4640 + }, + { + "epoch": 0.22532208338128495, + "grad_norm": 0.41937920451164246, + "learning_rate": 4.5163902085210866e-05, + "loss": 1.4463, + "num_input_tokens_seen": 183201244, + "step": 4650 + }, + { + "epoch": 0.22580664700145975, + "grad_norm": 0.39213669300079346, + "learning_rate": 4.514068227261882e-05, + "loss": 1.423, + "num_input_tokens_seen": 183580392, + "step": 4660 + }, + { + "epoch": 0.22629121062163454, + "grad_norm": 0.3789583742618561, + "learning_rate": 4.511741284926949e-05, + "loss": 1.4835, + "num_input_tokens_seen": 183979192, + "step": 4670 + }, + { + "epoch": 0.22677577424180936, + "grad_norm": 0.4139784872531891, + "learning_rate": 4.5094093872480405e-05, + "loss": 1.51, + "num_input_tokens_seen": 184365440, + "step": 4680 + }, + { + "epoch": 0.22726033786198416, + "grad_norm": 0.414145290851593, + "learning_rate": 4.507072539969114e-05, + "loss": 1.4815, + "num_input_tokens_seen": 184735664, + "step": 4690 + }, + { + "epoch": 0.22774490148215898, + "grad_norm": 0.4188004434108734, + "learning_rate": 4.5047307488463156e-05, + "loss": 1.4549, + "num_input_tokens_seen": 185132600, + "step": 4700 + }, + { + "epoch": 0.22822946510233377, + "grad_norm": 0.4184124767780304, + "learning_rate": 4.502384019647974e-05, + "loss": 1.5019, + "num_input_tokens_seen": 185517396, + "step": 4710 + }, + { + "epoch": 0.2287140287225086, + "grad_norm": 0.3586943745613098, + "learning_rate": 4.5000323581545784e-05, + "loss": 1.4141, + "num_input_tokens_seen": 185908324, + "step": 4720 + }, + { + "epoch": 0.2291985923426834, + "grad_norm": 0.40754491090774536, + "learning_rate": 4.497675770158768e-05, + "loss": 1.4299, + "num_input_tokens_seen": 186287004, + "step": 4730 + }, + { + "epoch": 0.2296831559628582, + "grad_norm": 0.38670873641967773, + "learning_rate": 4.4953142614653175e-05, + "loss": 1.4442, + "num_input_tokens_seen": 186713356, + "step": 4740 + }, + { + "epoch": 0.230167719583033, + "grad_norm": 0.40814658999443054, + "learning_rate": 4.4929478378911214e-05, + "loss": 1.5235, + "num_input_tokens_seen": 187111252, + "step": 4750 + }, + { + "epoch": 0.2306522832032078, + "grad_norm": 0.4206823408603668, + "learning_rate": 4.490576505265182e-05, + "loss": 1.4814, + "num_input_tokens_seen": 187499448, + "step": 4760 + }, + { + "epoch": 0.23113684682338262, + "grad_norm": 0.5019489526748657, + "learning_rate": 4.488200269428592e-05, + "loss": 1.424, + "num_input_tokens_seen": 187884064, + "step": 4770 + }, + { + "epoch": 0.2316214104435574, + "grad_norm": 0.39812424778938293, + "learning_rate": 4.4858191362345224e-05, + "loss": 1.4475, + "num_input_tokens_seen": 188262996, + "step": 4780 + }, + { + "epoch": 0.23210597406373223, + "grad_norm": 0.43520602583885193, + "learning_rate": 4.483433111548208e-05, + "loss": 1.4525, + "num_input_tokens_seen": 188660764, + "step": 4790 + }, + { + "epoch": 0.23259053768390703, + "grad_norm": 0.40959790349006653, + "learning_rate": 4.4810422012469315e-05, + "loss": 1.4299, + "num_input_tokens_seen": 189046960, + "step": 4800 + }, + { + "epoch": 0.23307510130408185, + "grad_norm": 0.42006418108940125, + "learning_rate": 4.478646411220011e-05, + "loss": 1.5022, + "num_input_tokens_seen": 189438452, + "step": 4810 + }, + { + "epoch": 0.23355966492425664, + "grad_norm": 0.410160094499588, + "learning_rate": 4.476245747368783e-05, + "loss": 1.4756, + "num_input_tokens_seen": 189823928, + "step": 4820 + }, + { + "epoch": 0.23404422854443147, + "grad_norm": 0.4331664741039276, + "learning_rate": 4.473840215606589e-05, + "loss": 1.4821, + "num_input_tokens_seen": 190244028, + "step": 4830 + }, + { + "epoch": 0.23452879216460626, + "grad_norm": 0.4130837619304657, + "learning_rate": 4.4714298218587654e-05, + "loss": 1.4746, + "num_input_tokens_seen": 190647640, + "step": 4840 + }, + { + "epoch": 0.23501335578478108, + "grad_norm": 0.392727792263031, + "learning_rate": 4.469014572062618e-05, + "loss": 1.4636, + "num_input_tokens_seen": 191073988, + "step": 4850 + }, + { + "epoch": 0.23549791940495587, + "grad_norm": 0.4510425627231598, + "learning_rate": 4.466594472167419e-05, + "loss": 1.4326, + "num_input_tokens_seen": 191476524, + "step": 4860 + }, + { + "epoch": 0.23598248302513067, + "grad_norm": 0.40726661682128906, + "learning_rate": 4.4641695281343867e-05, + "loss": 1.4472, + "num_input_tokens_seen": 191852232, + "step": 4870 + }, + { + "epoch": 0.2364670466453055, + "grad_norm": 0.46587809920310974, + "learning_rate": 4.461739745936669e-05, + "loss": 1.4811, + "num_input_tokens_seen": 192231868, + "step": 4880 + }, + { + "epoch": 0.23695161026548028, + "grad_norm": 0.37719038128852844, + "learning_rate": 4.459305131559334e-05, + "loss": 1.4508, + "num_input_tokens_seen": 192629400, + "step": 4890 + }, + { + "epoch": 0.2374361738856551, + "grad_norm": 0.4082610309123993, + "learning_rate": 4.4568656909993515e-05, + "loss": 1.4626, + "num_input_tokens_seen": 193036008, + "step": 4900 + }, + { + "epoch": 0.2379207375058299, + "grad_norm": 0.4302626848220825, + "learning_rate": 4.454421430265579e-05, + "loss": 1.4371, + "num_input_tokens_seen": 193442156, + "step": 4910 + }, + { + "epoch": 0.23840530112600472, + "grad_norm": 0.42628800868988037, + "learning_rate": 4.451972355378748e-05, + "loss": 1.4682, + "num_input_tokens_seen": 193831248, + "step": 4920 + }, + { + "epoch": 0.23888986474617951, + "grad_norm": 0.3756248950958252, + "learning_rate": 4.449518472371447e-05, + "loss": 1.4266, + "num_input_tokens_seen": 194205320, + "step": 4930 + }, + { + "epoch": 0.23937442836635434, + "grad_norm": 0.41749587655067444, + "learning_rate": 4.447059787288107e-05, + "loss": 1.4397, + "num_input_tokens_seen": 194598988, + "step": 4940 + }, + { + "epoch": 0.23985899198652913, + "grad_norm": 0.39159849286079407, + "learning_rate": 4.444596306184992e-05, + "loss": 1.3895, + "num_input_tokens_seen": 195006708, + "step": 4950 + }, + { + "epoch": 0.24034355560670395, + "grad_norm": 0.44320693612098694, + "learning_rate": 4.4421280351301744e-05, + "loss": 1.4094, + "num_input_tokens_seen": 195416052, + "step": 4960 + }, + { + "epoch": 0.24082811922687875, + "grad_norm": 0.4559500813484192, + "learning_rate": 4.439654980203527e-05, + "loss": 1.4498, + "num_input_tokens_seen": 195806840, + "step": 4970 + }, + { + "epoch": 0.24131268284705354, + "grad_norm": 0.4020100235939026, + "learning_rate": 4.437177147496709e-05, + "loss": 1.4628, + "num_input_tokens_seen": 196212692, + "step": 4980 + }, + { + "epoch": 0.24179724646722836, + "grad_norm": 0.39954742789268494, + "learning_rate": 4.434694543113145e-05, + "loss": 1.4211, + "num_input_tokens_seen": 196623736, + "step": 4990 + }, + { + "epoch": 0.24228181008740315, + "grad_norm": 0.4263516366481781, + "learning_rate": 4.4322071731680146e-05, + "loss": 1.453, + "num_input_tokens_seen": 197006512, + "step": 5000 + }, + { + "epoch": 0.24276637370757798, + "grad_norm": 0.4143199026584625, + "learning_rate": 4.429715043788235e-05, + "loss": 1.4952, + "num_input_tokens_seen": 197391012, + "step": 5010 + }, + { + "epoch": 0.24325093732775277, + "grad_norm": 0.4607667028903961, + "learning_rate": 4.427218161112449e-05, + "loss": 1.4704, + "num_input_tokens_seen": 197751048, + "step": 5020 + }, + { + "epoch": 0.2437355009479276, + "grad_norm": 0.4273362159729004, + "learning_rate": 4.4247165312910034e-05, + "loss": 1.4698, + "num_input_tokens_seen": 198159508, + "step": 5030 + }, + { + "epoch": 0.24422006456810239, + "grad_norm": 0.39402100443840027, + "learning_rate": 4.4222101604859445e-05, + "loss": 1.4527, + "num_input_tokens_seen": 198550724, + "step": 5040 + }, + { + "epoch": 0.2447046281882772, + "grad_norm": 0.42816653847694397, + "learning_rate": 4.419699054870992e-05, + "loss": 1.4948, + "num_input_tokens_seen": 198905612, + "step": 5050 + }, + { + "epoch": 0.245189191808452, + "grad_norm": 0.4140118360519409, + "learning_rate": 4.417183220631528e-05, + "loss": 1.4628, + "num_input_tokens_seen": 199308252, + "step": 5060 + }, + { + "epoch": 0.2456737554286268, + "grad_norm": 0.3967956304550171, + "learning_rate": 4.4146626639645874e-05, + "loss": 1.4542, + "num_input_tokens_seen": 199703748, + "step": 5070 + }, + { + "epoch": 0.24615831904880162, + "grad_norm": 0.4145415425300598, + "learning_rate": 4.412137391078832e-05, + "loss": 1.4535, + "num_input_tokens_seen": 200118988, + "step": 5080 + }, + { + "epoch": 0.2466428826689764, + "grad_norm": 0.39303335547447205, + "learning_rate": 4.4096074081945425e-05, + "loss": 1.4249, + "num_input_tokens_seen": 200527544, + "step": 5090 + }, + { + "epoch": 0.24712744628915123, + "grad_norm": 0.4353494942188263, + "learning_rate": 4.4070727215436025e-05, + "loss": 1.4521, + "num_input_tokens_seen": 200930660, + "step": 5100 + }, + { + "epoch": 0.24761200990932603, + "grad_norm": 0.4420648515224457, + "learning_rate": 4.4045333373694795e-05, + "loss": 1.4591, + "num_input_tokens_seen": 201323040, + "step": 5110 + }, + { + "epoch": 0.24809657352950085, + "grad_norm": 0.40382125973701477, + "learning_rate": 4.4019892619272144e-05, + "loss": 1.4383, + "num_input_tokens_seen": 201734800, + "step": 5120 + }, + { + "epoch": 0.24858113714967564, + "grad_norm": 0.4114893078804016, + "learning_rate": 4.399440501483403e-05, + "loss": 1.415, + "num_input_tokens_seen": 202108596, + "step": 5130 + }, + { + "epoch": 0.24906570076985046, + "grad_norm": 0.4096544086933136, + "learning_rate": 4.3968870623161804e-05, + "loss": 1.4244, + "num_input_tokens_seen": 202487992, + "step": 5140 + }, + { + "epoch": 0.24955026439002526, + "grad_norm": 0.40928128361701965, + "learning_rate": 4.3943289507152066e-05, + "loss": 1.3944, + "num_input_tokens_seen": 202886772, + "step": 5150 + }, + { + "epoch": 0.25003482801020005, + "grad_norm": 0.4127601087093353, + "learning_rate": 4.391766172981653e-05, + "loss": 1.4302, + "num_input_tokens_seen": 203310128, + "step": 5160 + }, + { + "epoch": 0.25051939163037484, + "grad_norm": 0.4010636508464813, + "learning_rate": 4.38919873542818e-05, + "loss": 1.4494, + "num_input_tokens_seen": 203716500, + "step": 5170 + }, + { + "epoch": 0.2510039552505497, + "grad_norm": 0.479033499956131, + "learning_rate": 4.38662664437893e-05, + "loss": 1.466, + "num_input_tokens_seen": 204104804, + "step": 5180 + }, + { + "epoch": 0.2514885188707245, + "grad_norm": 0.4351840317249298, + "learning_rate": 4.384049906169509e-05, + "loss": 1.4538, + "num_input_tokens_seen": 204478748, + "step": 5190 + }, + { + "epoch": 0.2519730824908993, + "grad_norm": 0.45871713757514954, + "learning_rate": 4.381468527146965e-05, + "loss": 1.4958, + "num_input_tokens_seen": 204854344, + "step": 5200 + }, + { + "epoch": 0.2524576461110741, + "grad_norm": 0.42551910877227783, + "learning_rate": 4.378882513669782e-05, + "loss": 1.4588, + "num_input_tokens_seen": 205245060, + "step": 5210 + }, + { + "epoch": 0.2529422097312489, + "grad_norm": 0.39652207493782043, + "learning_rate": 4.376291872107856e-05, + "loss": 1.4118, + "num_input_tokens_seen": 205643280, + "step": 5220 + }, + { + "epoch": 0.2534267733514237, + "grad_norm": 0.4601559042930603, + "learning_rate": 4.373696608842486e-05, + "loss": 1.4046, + "num_input_tokens_seen": 206032812, + "step": 5230 + }, + { + "epoch": 0.2539113369715985, + "grad_norm": 0.4585094153881073, + "learning_rate": 4.371096730266354e-05, + "loss": 1.4753, + "num_input_tokens_seen": 206405924, + "step": 5240 + }, + { + "epoch": 0.2543959005917733, + "grad_norm": 0.39976418018341064, + "learning_rate": 4.3684922427835094e-05, + "loss": 1.5263, + "num_input_tokens_seen": 206774840, + "step": 5250 + }, + { + "epoch": 0.25488046421194815, + "grad_norm": 0.41982370615005493, + "learning_rate": 4.365883152809356e-05, + "loss": 1.4208, + "num_input_tokens_seen": 207172068, + "step": 5260 + }, + { + "epoch": 0.25536502783212295, + "grad_norm": 0.4455938935279846, + "learning_rate": 4.3632694667706345e-05, + "loss": 1.4489, + "num_input_tokens_seen": 207533668, + "step": 5270 + }, + { + "epoch": 0.25584959145229774, + "grad_norm": 0.46062058210372925, + "learning_rate": 4.360651191105405e-05, + "loss": 1.4252, + "num_input_tokens_seen": 207962000, + "step": 5280 + }, + { + "epoch": 0.25633415507247254, + "grad_norm": 0.46039023995399475, + "learning_rate": 4.358028332263034e-05, + "loss": 1.4279, + "num_input_tokens_seen": 208396128, + "step": 5290 + }, + { + "epoch": 0.25681871869264733, + "grad_norm": 0.4030589759349823, + "learning_rate": 4.355400896704177e-05, + "loss": 1.4416, + "num_input_tokens_seen": 208771020, + "step": 5300 + }, + { + "epoch": 0.2573032823128222, + "grad_norm": 0.46707648038864136, + "learning_rate": 4.3527688909007645e-05, + "loss": 1.4595, + "num_input_tokens_seen": 209175816, + "step": 5310 + }, + { + "epoch": 0.257787845932997, + "grad_norm": 0.39080390334129333, + "learning_rate": 4.350132321335982e-05, + "loss": 1.4442, + "num_input_tokens_seen": 209573164, + "step": 5320 + }, + { + "epoch": 0.25827240955317177, + "grad_norm": 0.3796825706958771, + "learning_rate": 4.347491194504257e-05, + "loss": 1.3894, + "num_input_tokens_seen": 209973236, + "step": 5330 + }, + { + "epoch": 0.25875697317334656, + "grad_norm": 0.413033127784729, + "learning_rate": 4.344845516911244e-05, + "loss": 1.4657, + "num_input_tokens_seen": 210345012, + "step": 5340 + }, + { + "epoch": 0.2592415367935214, + "grad_norm": 0.4229408800601959, + "learning_rate": 4.342195295073806e-05, + "loss": 1.4725, + "num_input_tokens_seen": 210769268, + "step": 5350 + }, + { + "epoch": 0.2597261004136962, + "grad_norm": 0.39224693179130554, + "learning_rate": 4.339540535519999e-05, + "loss": 1.4123, + "num_input_tokens_seen": 211153696, + "step": 5360 + }, + { + "epoch": 0.260210664033871, + "grad_norm": 0.4225060045719147, + "learning_rate": 4.3368812447890575e-05, + "loss": 1.4308, + "num_input_tokens_seen": 211552748, + "step": 5370 + }, + { + "epoch": 0.2606952276540458, + "grad_norm": 0.43410396575927734, + "learning_rate": 4.334217429431376e-05, + "loss": 1.4755, + "num_input_tokens_seen": 211943156, + "step": 5380 + }, + { + "epoch": 0.2611797912742206, + "grad_norm": 0.3979720175266266, + "learning_rate": 4.331549096008496e-05, + "loss": 1.4245, + "num_input_tokens_seen": 212315716, + "step": 5390 + }, + { + "epoch": 0.26166435489439543, + "grad_norm": 0.3867861032485962, + "learning_rate": 4.328876251093086e-05, + "loss": 1.4195, + "num_input_tokens_seen": 212702260, + "step": 5400 + }, + { + "epoch": 0.26214891851457023, + "grad_norm": 0.4254813492298126, + "learning_rate": 4.326198901268926e-05, + "loss": 1.4395, + "num_input_tokens_seen": 213112036, + "step": 5410 + }, + { + "epoch": 0.262633482134745, + "grad_norm": 0.37997907400131226, + "learning_rate": 4.323517053130898e-05, + "loss": 1.4331, + "num_input_tokens_seen": 213509980, + "step": 5420 + }, + { + "epoch": 0.2631180457549198, + "grad_norm": 0.4054989218711853, + "learning_rate": 4.320830713284958e-05, + "loss": 1.4607, + "num_input_tokens_seen": 213898376, + "step": 5430 + }, + { + "epoch": 0.26360260937509467, + "grad_norm": 0.5132128596305847, + "learning_rate": 4.3181398883481304e-05, + "loss": 1.4274, + "num_input_tokens_seen": 214290036, + "step": 5440 + }, + { + "epoch": 0.26408717299526946, + "grad_norm": 0.40065979957580566, + "learning_rate": 4.315444584948485e-05, + "loss": 1.4295, + "num_input_tokens_seen": 214675368, + "step": 5450 + }, + { + "epoch": 0.26457173661544425, + "grad_norm": 0.39053869247436523, + "learning_rate": 4.3127448097251235e-05, + "loss": 1.4356, + "num_input_tokens_seen": 215066532, + "step": 5460 + }, + { + "epoch": 0.26505630023561905, + "grad_norm": 0.413308709859848, + "learning_rate": 4.310040569328164e-05, + "loss": 1.4318, + "num_input_tokens_seen": 215432972, + "step": 5470 + }, + { + "epoch": 0.26554086385579384, + "grad_norm": 0.38464999198913574, + "learning_rate": 4.3073318704187206e-05, + "loss": 1.4088, + "num_input_tokens_seen": 215839824, + "step": 5480 + }, + { + "epoch": 0.2660254274759687, + "grad_norm": 0.39387422800064087, + "learning_rate": 4.3046187196688923e-05, + "loss": 1.446, + "num_input_tokens_seen": 216225156, + "step": 5490 + }, + { + "epoch": 0.2665099910961435, + "grad_norm": 0.3859797716140747, + "learning_rate": 4.3019011237617434e-05, + "loss": 1.4121, + "num_input_tokens_seen": 216625012, + "step": 5500 + }, + { + "epoch": 0.2669945547163183, + "grad_norm": 0.41637757420539856, + "learning_rate": 4.2991790893912856e-05, + "loss": 1.4053, + "num_input_tokens_seen": 217002304, + "step": 5510 + }, + { + "epoch": 0.26747911833649307, + "grad_norm": 0.4308488368988037, + "learning_rate": 4.296452623262465e-05, + "loss": 1.4176, + "num_input_tokens_seen": 217381264, + "step": 5520 + }, + { + "epoch": 0.2679636819566679, + "grad_norm": 0.42404836416244507, + "learning_rate": 4.293721732091145e-05, + "loss": 1.4504, + "num_input_tokens_seen": 217798076, + "step": 5530 + }, + { + "epoch": 0.2684482455768427, + "grad_norm": 0.42752760648727417, + "learning_rate": 4.290986422604087e-05, + "loss": 1.4442, + "num_input_tokens_seen": 218198560, + "step": 5540 + }, + { + "epoch": 0.2689328091970175, + "grad_norm": 0.4021061956882477, + "learning_rate": 4.288246701538936e-05, + "loss": 1.4238, + "num_input_tokens_seen": 218581244, + "step": 5550 + }, + { + "epoch": 0.2694173728171923, + "grad_norm": 0.3673078119754791, + "learning_rate": 4.285502575644206e-05, + "loss": 1.4188, + "num_input_tokens_seen": 218957676, + "step": 5560 + }, + { + "epoch": 0.2699019364373671, + "grad_norm": 0.40478530526161194, + "learning_rate": 4.282754051679256e-05, + "loss": 1.4707, + "num_input_tokens_seen": 219324792, + "step": 5570 + }, + { + "epoch": 0.27038650005754195, + "grad_norm": 0.41956064105033875, + "learning_rate": 4.280001136414283e-05, + "loss": 1.4583, + "num_input_tokens_seen": 219696608, + "step": 5580 + }, + { + "epoch": 0.27087106367771674, + "grad_norm": 0.4060947597026825, + "learning_rate": 4.2772438366303004e-05, + "loss": 1.4013, + "num_input_tokens_seen": 220080788, + "step": 5590 + }, + { + "epoch": 0.27135562729789153, + "grad_norm": 0.3806227445602417, + "learning_rate": 4.274482159119119e-05, + "loss": 1.4192, + "num_input_tokens_seen": 220493132, + "step": 5600 + }, + { + "epoch": 0.2718401909180663, + "grad_norm": 0.3854431211948395, + "learning_rate": 4.2717161106833336e-05, + "loss": 1.4181, + "num_input_tokens_seen": 220880964, + "step": 5610 + }, + { + "epoch": 0.2723247545382412, + "grad_norm": 0.40384823083877563, + "learning_rate": 4.2689456981363074e-05, + "loss": 1.4363, + "num_input_tokens_seen": 221269648, + "step": 5620 + }, + { + "epoch": 0.27280931815841597, + "grad_norm": 0.3799000680446625, + "learning_rate": 4.2661709283021514e-05, + "loss": 1.4415, + "num_input_tokens_seen": 221654464, + "step": 5630 + }, + { + "epoch": 0.27329388177859076, + "grad_norm": 0.4527069628238678, + "learning_rate": 4.26339180801571e-05, + "loss": 1.4193, + "num_input_tokens_seen": 222030672, + "step": 5640 + }, + { + "epoch": 0.27377844539876556, + "grad_norm": 0.41296154260635376, + "learning_rate": 4.260608344122544e-05, + "loss": 1.3981, + "num_input_tokens_seen": 222409816, + "step": 5650 + }, + { + "epoch": 0.2742630090189404, + "grad_norm": 0.37670767307281494, + "learning_rate": 4.257820543478913e-05, + "loss": 1.4171, + "num_input_tokens_seen": 222803080, + "step": 5660 + }, + { + "epoch": 0.2747475726391152, + "grad_norm": 0.3829585611820221, + "learning_rate": 4.255028412951761e-05, + "loss": 1.4174, + "num_input_tokens_seen": 223208420, + "step": 5670 + }, + { + "epoch": 0.27523213625929, + "grad_norm": 0.41849249601364136, + "learning_rate": 4.2522319594186934e-05, + "loss": 1.4399, + "num_input_tokens_seen": 223609392, + "step": 5680 + }, + { + "epoch": 0.2757166998794648, + "grad_norm": 0.39865440130233765, + "learning_rate": 4.2494311897679664e-05, + "loss": 1.4429, + "num_input_tokens_seen": 223997220, + "step": 5690 + }, + { + "epoch": 0.2762012634996396, + "grad_norm": 0.43166518211364746, + "learning_rate": 4.246626110898469e-05, + "loss": 1.4143, + "num_input_tokens_seen": 224395400, + "step": 5700 + }, + { + "epoch": 0.27668582711981443, + "grad_norm": 0.41193270683288574, + "learning_rate": 4.2438167297197027e-05, + "loss": 1.4019, + "num_input_tokens_seen": 224779756, + "step": 5710 + }, + { + "epoch": 0.2771703907399892, + "grad_norm": 0.38215872645378113, + "learning_rate": 4.2410030531517665e-05, + "loss": 1.4117, + "num_input_tokens_seen": 225155936, + "step": 5720 + }, + { + "epoch": 0.277654954360164, + "grad_norm": 0.3913954794406891, + "learning_rate": 4.2381850881253415e-05, + "loss": 1.4038, + "num_input_tokens_seen": 225541516, + "step": 5730 + }, + { + "epoch": 0.2781395179803388, + "grad_norm": 0.4006546139717102, + "learning_rate": 4.23536284158167e-05, + "loss": 1.4241, + "num_input_tokens_seen": 225902060, + "step": 5740 + }, + { + "epoch": 0.27862408160051366, + "grad_norm": 0.39373400807380676, + "learning_rate": 4.232536320472543e-05, + "loss": 1.422, + "num_input_tokens_seen": 226304908, + "step": 5750 + }, + { + "epoch": 0.27910864522068846, + "grad_norm": 0.40954354405403137, + "learning_rate": 4.2297055317602785e-05, + "loss": 1.4074, + "num_input_tokens_seen": 226677752, + "step": 5760 + }, + { + "epoch": 0.27959320884086325, + "grad_norm": 0.41480016708374023, + "learning_rate": 4.226870482417707e-05, + "loss": 1.4142, + "num_input_tokens_seen": 227050196, + "step": 5770 + }, + { + "epoch": 0.28007777246103804, + "grad_norm": 0.3755822479724884, + "learning_rate": 4.2240311794281564e-05, + "loss": 1.4397, + "num_input_tokens_seen": 227446460, + "step": 5780 + }, + { + "epoch": 0.28056233608121284, + "grad_norm": 0.44134095311164856, + "learning_rate": 4.221187629785428e-05, + "loss": 1.4487, + "num_input_tokens_seen": 227831552, + "step": 5790 + }, + { + "epoch": 0.2810468997013877, + "grad_norm": 0.4083091616630554, + "learning_rate": 4.218339840493786e-05, + "loss": 1.414, + "num_input_tokens_seen": 228221528, + "step": 5800 + }, + { + "epoch": 0.2815314633215625, + "grad_norm": 0.4252268671989441, + "learning_rate": 4.215487818567937e-05, + "loss": 1.3964, + "num_input_tokens_seen": 228590960, + "step": 5810 + }, + { + "epoch": 0.2820160269417373, + "grad_norm": 0.3926514983177185, + "learning_rate": 4.212631571033015e-05, + "loss": 1.3921, + "num_input_tokens_seen": 228975244, + "step": 5820 + }, + { + "epoch": 0.28250059056191207, + "grad_norm": 0.41333383321762085, + "learning_rate": 4.20977110492456e-05, + "loss": 1.4569, + "num_input_tokens_seen": 229370872, + "step": 5830 + }, + { + "epoch": 0.2829851541820869, + "grad_norm": 0.41047653555870056, + "learning_rate": 4.206906427288506e-05, + "loss": 1.465, + "num_input_tokens_seen": 229788924, + "step": 5840 + }, + { + "epoch": 0.2834697178022617, + "grad_norm": 0.4507644474506378, + "learning_rate": 4.204037545181158e-05, + "loss": 1.4222, + "num_input_tokens_seen": 230192956, + "step": 5850 + }, + { + "epoch": 0.2839542814224365, + "grad_norm": 0.4235022962093353, + "learning_rate": 4.201164465669179e-05, + "loss": 1.4538, + "num_input_tokens_seen": 230540380, + "step": 5860 + }, + { + "epoch": 0.2844388450426113, + "grad_norm": 0.3815031051635742, + "learning_rate": 4.1982871958295734e-05, + "loss": 1.44, + "num_input_tokens_seen": 230931500, + "step": 5870 + }, + { + "epoch": 0.2849234086627861, + "grad_norm": 0.4607156813144684, + "learning_rate": 4.1954057427496615e-05, + "loss": 1.3994, + "num_input_tokens_seen": 231318500, + "step": 5880 + }, + { + "epoch": 0.28540797228296094, + "grad_norm": 0.4481658637523651, + "learning_rate": 4.192520113527075e-05, + "loss": 1.4693, + "num_input_tokens_seen": 231716316, + "step": 5890 + }, + { + "epoch": 0.28589253590313574, + "grad_norm": 0.41151443123817444, + "learning_rate": 4.1896303152697254e-05, + "loss": 1.4007, + "num_input_tokens_seen": 232105992, + "step": 5900 + }, + { + "epoch": 0.28637709952331053, + "grad_norm": 0.3963596522808075, + "learning_rate": 4.186736355095798e-05, + "loss": 1.3917, + "num_input_tokens_seen": 232505400, + "step": 5910 + }, + { + "epoch": 0.2868616631434853, + "grad_norm": 0.42216241359710693, + "learning_rate": 4.183838240133728e-05, + "loss": 1.43, + "num_input_tokens_seen": 232922304, + "step": 5920 + }, + { + "epoch": 0.2873462267636602, + "grad_norm": 0.43040603399276733, + "learning_rate": 4.1809359775221854e-05, + "loss": 1.4388, + "num_input_tokens_seen": 233313888, + "step": 5930 + }, + { + "epoch": 0.28783079038383497, + "grad_norm": 0.4640505611896515, + "learning_rate": 4.178029574410056e-05, + "loss": 1.4517, + "num_input_tokens_seen": 233693256, + "step": 5940 + }, + { + "epoch": 0.28831535400400976, + "grad_norm": 0.40719953179359436, + "learning_rate": 4.175119037956425e-05, + "loss": 1.4023, + "num_input_tokens_seen": 234072408, + "step": 5950 + }, + { + "epoch": 0.28879991762418455, + "grad_norm": 0.3908650279045105, + "learning_rate": 4.17220437533056e-05, + "loss": 1.4172, + "num_input_tokens_seen": 234464800, + "step": 5960 + }, + { + "epoch": 0.2892844812443594, + "grad_norm": 0.4344137907028198, + "learning_rate": 4.169285593711889e-05, + "loss": 1.394, + "num_input_tokens_seen": 234870688, + "step": 5970 + }, + { + "epoch": 0.2897690448645342, + "grad_norm": 0.4607277512550354, + "learning_rate": 4.16636270028999e-05, + "loss": 1.428, + "num_input_tokens_seen": 235291024, + "step": 5980 + }, + { + "epoch": 0.290253608484709, + "grad_norm": 0.4087435007095337, + "learning_rate": 4.163435702264567e-05, + "loss": 1.4582, + "num_input_tokens_seen": 235698736, + "step": 5990 + }, + { + "epoch": 0.2907381721048838, + "grad_norm": 0.4173283576965332, + "learning_rate": 4.160504606845432e-05, + "loss": 1.4224, + "num_input_tokens_seen": 236103764, + "step": 6000 + }, + { + "epoch": 0.2907381721048838, + "eval_loss": 1.5239336490631104, + "eval_runtime": 4.4295, + "eval_samples_per_second": 33.864, + "eval_steps_per_second": 4.289, + "num_input_tokens_seen": 236103764, + "step": 6000 + }, + { + "epoch": 0.2912227357250586, + "grad_norm": 0.42691048979759216, + "learning_rate": 4.157569421252496e-05, + "loss": 1.4339, + "num_input_tokens_seen": 236508456, + "step": 6010 + }, + { + "epoch": 0.29170729934523343, + "grad_norm": 0.4078880548477173, + "learning_rate": 4.15463015271574e-05, + "loss": 1.4053, + "num_input_tokens_seen": 236915292, + "step": 6020 + }, + { + "epoch": 0.2921918629654082, + "grad_norm": 0.5139662623405457, + "learning_rate": 4.151686808475204e-05, + "loss": 1.4309, + "num_input_tokens_seen": 237303592, + "step": 6030 + }, + { + "epoch": 0.292676426585583, + "grad_norm": 0.40330371260643005, + "learning_rate": 4.1487393957809664e-05, + "loss": 1.4339, + "num_input_tokens_seen": 237688980, + "step": 6040 + }, + { + "epoch": 0.2931609902057578, + "grad_norm": 0.45068076252937317, + "learning_rate": 4.145787921893128e-05, + "loss": 1.4589, + "num_input_tokens_seen": 238057980, + "step": 6050 + }, + { + "epoch": 0.29364555382593266, + "grad_norm": 0.38412150740623474, + "learning_rate": 4.1428323940817933e-05, + "loss": 1.4576, + "num_input_tokens_seen": 238451220, + "step": 6060 + }, + { + "epoch": 0.29413011744610745, + "grad_norm": 0.385979026556015, + "learning_rate": 4.139872819627051e-05, + "loss": 1.4162, + "num_input_tokens_seen": 238829748, + "step": 6070 + }, + { + "epoch": 0.29461468106628225, + "grad_norm": 0.4156942069530487, + "learning_rate": 4.1369092058189586e-05, + "loss": 1.3992, + "num_input_tokens_seen": 239203752, + "step": 6080 + }, + { + "epoch": 0.29509924468645704, + "grad_norm": 0.4118397533893585, + "learning_rate": 4.133941559957524e-05, + "loss": 1.4029, + "num_input_tokens_seen": 239590056, + "step": 6090 + }, + { + "epoch": 0.29558380830663183, + "grad_norm": 0.47945645451545715, + "learning_rate": 4.130969889352686e-05, + "loss": 1.3959, + "num_input_tokens_seen": 239985824, + "step": 6100 + }, + { + "epoch": 0.2960683719268067, + "grad_norm": 0.36734700202941895, + "learning_rate": 4.1279942013242966e-05, + "loss": 1.3805, + "num_input_tokens_seen": 240361856, + "step": 6110 + }, + { + "epoch": 0.2965529355469815, + "grad_norm": 0.41420719027519226, + "learning_rate": 4.125014503202106e-05, + "loss": 1.3727, + "num_input_tokens_seen": 240780880, + "step": 6120 + }, + { + "epoch": 0.29703749916715627, + "grad_norm": 0.43215250968933105, + "learning_rate": 4.122030802325738e-05, + "loss": 1.4033, + "num_input_tokens_seen": 241189188, + "step": 6130 + }, + { + "epoch": 0.29752206278733107, + "grad_norm": 0.42517751455307007, + "learning_rate": 4.119043106044681e-05, + "loss": 1.4514, + "num_input_tokens_seen": 241610008, + "step": 6140 + }, + { + "epoch": 0.2980066264075059, + "grad_norm": 0.38953685760498047, + "learning_rate": 4.116051421718261e-05, + "loss": 1.4399, + "num_input_tokens_seen": 242006608, + "step": 6150 + }, + { + "epoch": 0.2984911900276807, + "grad_norm": 0.4071796238422394, + "learning_rate": 4.113055756715628e-05, + "loss": 1.3734, + "num_input_tokens_seen": 242416040, + "step": 6160 + }, + { + "epoch": 0.2989757536478555, + "grad_norm": 0.42148080468177795, + "learning_rate": 4.11005611841574e-05, + "loss": 1.4508, + "num_input_tokens_seen": 242794628, + "step": 6170 + }, + { + "epoch": 0.2994603172680303, + "grad_norm": 0.4180959463119507, + "learning_rate": 4.107052514207339e-05, + "loss": 1.4159, + "num_input_tokens_seen": 243164260, + "step": 6180 + }, + { + "epoch": 0.2999448808882051, + "grad_norm": 0.36293745040893555, + "learning_rate": 4.1040449514889375e-05, + "loss": 1.3633, + "num_input_tokens_seen": 243550040, + "step": 6190 + }, + { + "epoch": 0.30042944450837994, + "grad_norm": 0.4095574617385864, + "learning_rate": 4.1010334376687975e-05, + "loss": 1.4784, + "num_input_tokens_seen": 243908832, + "step": 6200 + }, + { + "epoch": 0.30091400812855473, + "grad_norm": 0.4131157100200653, + "learning_rate": 4.0980179801649146e-05, + "loss": 1.4184, + "num_input_tokens_seen": 244298944, + "step": 6210 + }, + { + "epoch": 0.3013985717487295, + "grad_norm": 0.39260920882225037, + "learning_rate": 4.094998586404998e-05, + "loss": 1.4639, + "num_input_tokens_seen": 244692220, + "step": 6220 + }, + { + "epoch": 0.3018831353689043, + "grad_norm": 0.43091267347335815, + "learning_rate": 4.0919752638264516e-05, + "loss": 1.4488, + "num_input_tokens_seen": 245050952, + "step": 6230 + }, + { + "epoch": 0.30236769898907917, + "grad_norm": 0.42741450667381287, + "learning_rate": 4.088948019876359e-05, + "loss": 1.4434, + "num_input_tokens_seen": 245465356, + "step": 6240 + }, + { + "epoch": 0.30285226260925396, + "grad_norm": 0.4109672009944916, + "learning_rate": 4.085916862011463e-05, + "loss": 1.4374, + "num_input_tokens_seen": 245880668, + "step": 6250 + }, + { + "epoch": 0.30333682622942876, + "grad_norm": 0.4069707691669464, + "learning_rate": 4.082881797698143e-05, + "loss": 1.3555, + "num_input_tokens_seen": 246284348, + "step": 6260 + }, + { + "epoch": 0.30382138984960355, + "grad_norm": 0.4159133732318878, + "learning_rate": 4.0798428344124064e-05, + "loss": 1.3921, + "num_input_tokens_seen": 246662396, + "step": 6270 + }, + { + "epoch": 0.30430595346977835, + "grad_norm": 0.42207905650138855, + "learning_rate": 4.07679997963986e-05, + "loss": 1.3996, + "num_input_tokens_seen": 247049844, + "step": 6280 + }, + { + "epoch": 0.3047905170899532, + "grad_norm": 0.3645390272140503, + "learning_rate": 4.0737532408757014e-05, + "loss": 1.4054, + "num_input_tokens_seen": 247441300, + "step": 6290 + }, + { + "epoch": 0.305275080710128, + "grad_norm": 0.41653063893318176, + "learning_rate": 4.0707026256246894e-05, + "loss": 1.3984, + "num_input_tokens_seen": 247851160, + "step": 6300 + }, + { + "epoch": 0.3057596443303028, + "grad_norm": 0.41180601716041565, + "learning_rate": 4.0676481414011345e-05, + "loss": 1.3664, + "num_input_tokens_seen": 248245884, + "step": 6310 + }, + { + "epoch": 0.3062442079504776, + "grad_norm": 0.4273717403411865, + "learning_rate": 4.064589795728878e-05, + "loss": 1.4068, + "num_input_tokens_seen": 248649064, + "step": 6320 + }, + { + "epoch": 0.3067287715706524, + "grad_norm": 0.3938922882080078, + "learning_rate": 4.06152759614127e-05, + "loss": 1.4184, + "num_input_tokens_seen": 249030280, + "step": 6330 + }, + { + "epoch": 0.3072133351908272, + "grad_norm": 0.4099946916103363, + "learning_rate": 4.0584615501811577e-05, + "loss": 1.3773, + "num_input_tokens_seen": 249430164, + "step": 6340 + }, + { + "epoch": 0.307697898811002, + "grad_norm": 0.4208473265171051, + "learning_rate": 4.055391665400858e-05, + "loss": 1.3534, + "num_input_tokens_seen": 249806996, + "step": 6350 + }, + { + "epoch": 0.3081824624311768, + "grad_norm": 0.4177727997303009, + "learning_rate": 4.052317949362147e-05, + "loss": 1.3888, + "num_input_tokens_seen": 250196904, + "step": 6360 + }, + { + "epoch": 0.30866702605135166, + "grad_norm": 0.4083334505558014, + "learning_rate": 4.049240409636237e-05, + "loss": 1.3849, + "num_input_tokens_seen": 250581560, + "step": 6370 + }, + { + "epoch": 0.30915158967152645, + "grad_norm": 0.4475148022174835, + "learning_rate": 4.046159053803758e-05, + "loss": 1.3855, + "num_input_tokens_seen": 250966948, + "step": 6380 + }, + { + "epoch": 0.30963615329170124, + "grad_norm": 0.39256733655929565, + "learning_rate": 4.0430738894547426e-05, + "loss": 1.4499, + "num_input_tokens_seen": 251353488, + "step": 6390 + }, + { + "epoch": 0.31012071691187604, + "grad_norm": 0.3913814127445221, + "learning_rate": 4.0399849241886e-05, + "loss": 1.3994, + "num_input_tokens_seen": 251742612, + "step": 6400 + }, + { + "epoch": 0.31060528053205083, + "grad_norm": 0.41286328434944153, + "learning_rate": 4.0368921656141065e-05, + "loss": 1.3918, + "num_input_tokens_seen": 252141148, + "step": 6410 + }, + { + "epoch": 0.3110898441522257, + "grad_norm": 0.39640408754348755, + "learning_rate": 4.03379562134938e-05, + "loss": 1.3757, + "num_input_tokens_seen": 252532656, + "step": 6420 + }, + { + "epoch": 0.3115744077724005, + "grad_norm": 0.40248820185661316, + "learning_rate": 4.030695299021863e-05, + "loss": 1.3941, + "num_input_tokens_seen": 252899616, + "step": 6430 + }, + { + "epoch": 0.31205897139257527, + "grad_norm": 0.3976843059062958, + "learning_rate": 4.027591206268304e-05, + "loss": 1.3953, + "num_input_tokens_seen": 253271420, + "step": 6440 + }, + { + "epoch": 0.31254353501275006, + "grad_norm": 0.42651301622390747, + "learning_rate": 4.02448335073474e-05, + "loss": 1.4118, + "num_input_tokens_seen": 253688464, + "step": 6450 + }, + { + "epoch": 0.3130280986329249, + "grad_norm": 0.4326588213443756, + "learning_rate": 4.0213717400764766e-05, + "loss": 1.3611, + "num_input_tokens_seen": 254091036, + "step": 6460 + }, + { + "epoch": 0.3135126622530997, + "grad_norm": 0.43498626351356506, + "learning_rate": 4.018256381958068e-05, + "loss": 1.3872, + "num_input_tokens_seen": 254494504, + "step": 6470 + }, + { + "epoch": 0.3139972258732745, + "grad_norm": 0.3556174039840698, + "learning_rate": 4.0151372840533e-05, + "loss": 1.4149, + "num_input_tokens_seen": 254877836, + "step": 6480 + }, + { + "epoch": 0.3144817894934493, + "grad_norm": 0.40494486689567566, + "learning_rate": 4.0120144540451706e-05, + "loss": 1.4471, + "num_input_tokens_seen": 255276796, + "step": 6490 + }, + { + "epoch": 0.3149663531136241, + "grad_norm": 0.42257654666900635, + "learning_rate": 4.008887899625868e-05, + "loss": 1.3782, + "num_input_tokens_seen": 255671500, + "step": 6500 + }, + { + "epoch": 0.31545091673379894, + "grad_norm": 0.4325736165046692, + "learning_rate": 4.005757628496759e-05, + "loss": 1.4032, + "num_input_tokens_seen": 256064776, + "step": 6510 + }, + { + "epoch": 0.31593548035397373, + "grad_norm": 0.4220854341983795, + "learning_rate": 4.002623648368361e-05, + "loss": 1.3634, + "num_input_tokens_seen": 256461304, + "step": 6520 + }, + { + "epoch": 0.3164200439741485, + "grad_norm": 0.38566797971725464, + "learning_rate": 3.9994859669603316e-05, + "loss": 1.3689, + "num_input_tokens_seen": 256882232, + "step": 6530 + }, + { + "epoch": 0.3169046075943233, + "grad_norm": 0.38369229435920715, + "learning_rate": 3.99634459200144e-05, + "loss": 1.4368, + "num_input_tokens_seen": 257264380, + "step": 6540 + }, + { + "epoch": 0.31738917121449817, + "grad_norm": 0.432253897190094, + "learning_rate": 3.9931995312295596e-05, + "loss": 1.4141, + "num_input_tokens_seen": 257641472, + "step": 6550 + }, + { + "epoch": 0.31787373483467296, + "grad_norm": 0.3968162536621094, + "learning_rate": 3.9900507923916394e-05, + "loss": 1.428, + "num_input_tokens_seen": 258040272, + "step": 6560 + }, + { + "epoch": 0.31835829845484775, + "grad_norm": 0.44724634289741516, + "learning_rate": 3.9868983832436876e-05, + "loss": 1.402, + "num_input_tokens_seen": 258430376, + "step": 6570 + }, + { + "epoch": 0.31884286207502255, + "grad_norm": 0.39584001898765564, + "learning_rate": 3.983742311550755e-05, + "loss": 1.3927, + "num_input_tokens_seen": 258847780, + "step": 6580 + }, + { + "epoch": 0.31932742569519734, + "grad_norm": 0.446869432926178, + "learning_rate": 3.9805825850869125e-05, + "loss": 1.4192, + "num_input_tokens_seen": 259228264, + "step": 6590 + }, + { + "epoch": 0.3198119893153722, + "grad_norm": 0.3873324394226074, + "learning_rate": 3.977419211635235e-05, + "loss": 1.37, + "num_input_tokens_seen": 259636616, + "step": 6600 + }, + { + "epoch": 0.320296552935547, + "grad_norm": 0.4095400869846344, + "learning_rate": 3.9742521989877795e-05, + "loss": 1.3664, + "num_input_tokens_seen": 260029232, + "step": 6610 + }, + { + "epoch": 0.3207811165557218, + "grad_norm": 0.3852662146091461, + "learning_rate": 3.971081554945568e-05, + "loss": 1.3546, + "num_input_tokens_seen": 260456224, + "step": 6620 + }, + { + "epoch": 0.3212656801758966, + "grad_norm": 0.3864857256412506, + "learning_rate": 3.967907287318566e-05, + "loss": 1.3756, + "num_input_tokens_seen": 260837824, + "step": 6630 + }, + { + "epoch": 0.3217502437960714, + "grad_norm": 0.3785705268383026, + "learning_rate": 3.964729403925666e-05, + "loss": 1.4206, + "num_input_tokens_seen": 261235928, + "step": 6640 + }, + { + "epoch": 0.3222348074162462, + "grad_norm": 0.37969115376472473, + "learning_rate": 3.961547912594667e-05, + "loss": 1.406, + "num_input_tokens_seen": 261622640, + "step": 6650 + }, + { + "epoch": 0.322719371036421, + "grad_norm": 0.40466952323913574, + "learning_rate": 3.958362821162254e-05, + "loss": 1.393, + "num_input_tokens_seen": 262061520, + "step": 6660 + }, + { + "epoch": 0.3232039346565958, + "grad_norm": 0.40950527787208557, + "learning_rate": 3.955174137473979e-05, + "loss": 1.4314, + "num_input_tokens_seen": 262449712, + "step": 6670 + }, + { + "epoch": 0.32368849827677065, + "grad_norm": 0.4362260699272156, + "learning_rate": 3.951981869384247e-05, + "loss": 1.4466, + "num_input_tokens_seen": 262842776, + "step": 6680 + }, + { + "epoch": 0.32417306189694545, + "grad_norm": 0.4015698730945587, + "learning_rate": 3.948786024756287e-05, + "loss": 1.4056, + "num_input_tokens_seen": 263186520, + "step": 6690 + }, + { + "epoch": 0.32465762551712024, + "grad_norm": 0.3874407112598419, + "learning_rate": 3.9455866114621396e-05, + "loss": 1.379, + "num_input_tokens_seen": 263570516, + "step": 6700 + }, + { + "epoch": 0.32514218913729503, + "grad_norm": 0.3963698744773865, + "learning_rate": 3.9423836373826375e-05, + "loss": 1.4146, + "num_input_tokens_seen": 263984720, + "step": 6710 + }, + { + "epoch": 0.32562675275746983, + "grad_norm": 0.38951539993286133, + "learning_rate": 3.9391771104073805e-05, + "loss": 1.3588, + "num_input_tokens_seen": 264376356, + "step": 6720 + }, + { + "epoch": 0.3261113163776447, + "grad_norm": 0.4249687194824219, + "learning_rate": 3.9359670384347244e-05, + "loss": 1.4055, + "num_input_tokens_seen": 264737708, + "step": 6730 + }, + { + "epoch": 0.32659587999781947, + "grad_norm": 0.38605302572250366, + "learning_rate": 3.9327534293717537e-05, + "loss": 1.3578, + "num_input_tokens_seen": 265138996, + "step": 6740 + }, + { + "epoch": 0.32708044361799427, + "grad_norm": 0.41331490874290466, + "learning_rate": 3.929536291134267e-05, + "loss": 1.3902, + "num_input_tokens_seen": 265509020, + "step": 6750 + }, + { + "epoch": 0.32756500723816906, + "grad_norm": 0.372742623090744, + "learning_rate": 3.926315631646756e-05, + "loss": 1.3639, + "num_input_tokens_seen": 265879672, + "step": 6760 + }, + { + "epoch": 0.3280495708583439, + "grad_norm": 0.5427610874176025, + "learning_rate": 3.9230914588423864e-05, + "loss": 1.3371, + "num_input_tokens_seen": 266275292, + "step": 6770 + }, + { + "epoch": 0.3285341344785187, + "grad_norm": 0.4112084209918976, + "learning_rate": 3.9198637806629756e-05, + "loss": 1.4216, + "num_input_tokens_seen": 266671844, + "step": 6780 + }, + { + "epoch": 0.3290186980986935, + "grad_norm": 0.3875449299812317, + "learning_rate": 3.916632605058978e-05, + "loss": 1.3905, + "num_input_tokens_seen": 267057556, + "step": 6790 + }, + { + "epoch": 0.3295032617188683, + "grad_norm": 0.3956202566623688, + "learning_rate": 3.913397939989461e-05, + "loss": 1.3913, + "num_input_tokens_seen": 267444300, + "step": 6800 + }, + { + "epoch": 0.3299878253390431, + "grad_norm": 0.41076987981796265, + "learning_rate": 3.910159793422091e-05, + "loss": 1.4032, + "num_input_tokens_seen": 267811660, + "step": 6810 + }, + { + "epoch": 0.33047238895921793, + "grad_norm": 0.41515880823135376, + "learning_rate": 3.9069181733331056e-05, + "loss": 1.4098, + "num_input_tokens_seen": 268195916, + "step": 6820 + }, + { + "epoch": 0.3309569525793927, + "grad_norm": 0.5097044110298157, + "learning_rate": 3.9036730877073e-05, + "loss": 1.3921, + "num_input_tokens_seen": 268578008, + "step": 6830 + }, + { + "epoch": 0.3314415161995675, + "grad_norm": 0.43721285462379456, + "learning_rate": 3.900424544538006e-05, + "loss": 1.3893, + "num_input_tokens_seen": 269003192, + "step": 6840 + }, + { + "epoch": 0.3319260798197423, + "grad_norm": 0.42796212434768677, + "learning_rate": 3.897172551827073e-05, + "loss": 1.4325, + "num_input_tokens_seen": 269393648, + "step": 6850 + }, + { + "epoch": 0.33241064343991716, + "grad_norm": 0.36507412791252136, + "learning_rate": 3.8939171175848447e-05, + "loss": 1.4346, + "num_input_tokens_seen": 269809016, + "step": 6860 + }, + { + "epoch": 0.33289520706009196, + "grad_norm": 0.39873602986335754, + "learning_rate": 3.8906582498301455e-05, + "loss": 1.4412, + "num_input_tokens_seen": 270180224, + "step": 6870 + }, + { + "epoch": 0.33337977068026675, + "grad_norm": 0.3825407326221466, + "learning_rate": 3.887395956590254e-05, + "loss": 1.3762, + "num_input_tokens_seen": 270581856, + "step": 6880 + }, + { + "epoch": 0.33386433430044155, + "grad_norm": 0.40756458044052124, + "learning_rate": 3.884130245900889e-05, + "loss": 1.4066, + "num_input_tokens_seen": 270962900, + "step": 6890 + }, + { + "epoch": 0.33434889792061634, + "grad_norm": 0.41066256165504456, + "learning_rate": 3.880861125806186e-05, + "loss": 1.3903, + "num_input_tokens_seen": 271374220, + "step": 6900 + }, + { + "epoch": 0.3348334615407912, + "grad_norm": 0.4207150340080261, + "learning_rate": 3.877588604358678e-05, + "loss": 1.3709, + "num_input_tokens_seen": 271765116, + "step": 6910 + }, + { + "epoch": 0.335318025160966, + "grad_norm": 0.4337269067764282, + "learning_rate": 3.8743126896192784e-05, + "loss": 1.3251, + "num_input_tokens_seen": 272147924, + "step": 6920 + }, + { + "epoch": 0.3358025887811408, + "grad_norm": 0.4367486536502838, + "learning_rate": 3.871033389657255e-05, + "loss": 1.3525, + "num_input_tokens_seen": 272511972, + "step": 6930 + }, + { + "epoch": 0.33628715240131557, + "grad_norm": 0.43631139397621155, + "learning_rate": 3.867750712550219e-05, + "loss": 1.3855, + "num_input_tokens_seen": 272876876, + "step": 6940 + }, + { + "epoch": 0.3367717160214904, + "grad_norm": 0.40306615829467773, + "learning_rate": 3.8644646663840976e-05, + "loss": 1.3868, + "num_input_tokens_seen": 273287016, + "step": 6950 + }, + { + "epoch": 0.3372562796416652, + "grad_norm": 0.3835316598415375, + "learning_rate": 3.861175259253117e-05, + "loss": 1.3667, + "num_input_tokens_seen": 273674052, + "step": 6960 + }, + { + "epoch": 0.33774084326184, + "grad_norm": 0.38782796263694763, + "learning_rate": 3.857882499259782e-05, + "loss": 1.353, + "num_input_tokens_seen": 274083796, + "step": 6970 + }, + { + "epoch": 0.3382254068820148, + "grad_norm": 0.395175039768219, + "learning_rate": 3.854586394514855e-05, + "loss": 1.3281, + "num_input_tokens_seen": 274493468, + "step": 6980 + }, + { + "epoch": 0.3387099705021896, + "grad_norm": 0.45938950777053833, + "learning_rate": 3.851286953137341e-05, + "loss": 1.4144, + "num_input_tokens_seen": 274858868, + "step": 6990 + }, + { + "epoch": 0.33919453412236444, + "grad_norm": 0.40472185611724854, + "learning_rate": 3.847984183254461e-05, + "loss": 1.3877, + "num_input_tokens_seen": 275233524, + "step": 7000 + }, + { + "epoch": 0.33967909774253924, + "grad_norm": 0.4177361726760864, + "learning_rate": 3.8446780930016336e-05, + "loss": 1.3834, + "num_input_tokens_seen": 275600504, + "step": 7010 + }, + { + "epoch": 0.34016366136271403, + "grad_norm": 0.4230976700782776, + "learning_rate": 3.8413686905224595e-05, + "loss": 1.3601, + "num_input_tokens_seen": 275991592, + "step": 7020 + }, + { + "epoch": 0.3406482249828888, + "grad_norm": 0.3965121805667877, + "learning_rate": 3.838055983968695e-05, + "loss": 1.3801, + "num_input_tokens_seen": 276355724, + "step": 7030 + }, + { + "epoch": 0.3411327886030637, + "grad_norm": 0.39479291439056396, + "learning_rate": 3.8347399815002385e-05, + "loss": 1.3976, + "num_input_tokens_seen": 276749052, + "step": 7040 + }, + { + "epoch": 0.34161735222323847, + "grad_norm": 0.37649762630462646, + "learning_rate": 3.8314206912851036e-05, + "loss": 1.3981, + "num_input_tokens_seen": 277163328, + "step": 7050 + }, + { + "epoch": 0.34210191584341326, + "grad_norm": 0.389952152967453, + "learning_rate": 3.828098121499404e-05, + "loss": 1.3929, + "num_input_tokens_seen": 277532872, + "step": 7060 + }, + { + "epoch": 0.34258647946358806, + "grad_norm": 0.4221923053264618, + "learning_rate": 3.82477228032733e-05, + "loss": 1.3894, + "num_input_tokens_seen": 277930776, + "step": 7070 + }, + { + "epoch": 0.3430710430837629, + "grad_norm": 0.4203970730304718, + "learning_rate": 3.821443175961134e-05, + "loss": 1.3611, + "num_input_tokens_seen": 278313072, + "step": 7080 + }, + { + "epoch": 0.3435556067039377, + "grad_norm": 0.44928234815597534, + "learning_rate": 3.818110816601101e-05, + "loss": 1.4055, + "num_input_tokens_seen": 278692232, + "step": 7090 + }, + { + "epoch": 0.3440401703241125, + "grad_norm": 0.40566375851631165, + "learning_rate": 3.814775210455538e-05, + "loss": 1.3874, + "num_input_tokens_seen": 279070076, + "step": 7100 + }, + { + "epoch": 0.3445247339442873, + "grad_norm": 0.4111347794532776, + "learning_rate": 3.811436365740748e-05, + "loss": 1.4077, + "num_input_tokens_seen": 279468284, + "step": 7110 + }, + { + "epoch": 0.3450092975644621, + "grad_norm": 0.3757016956806183, + "learning_rate": 3.808094290681011e-05, + "loss": 1.3623, + "num_input_tokens_seen": 279858588, + "step": 7120 + }, + { + "epoch": 0.34549386118463693, + "grad_norm": 0.41814279556274414, + "learning_rate": 3.8047489935085635e-05, + "loss": 1.3726, + "num_input_tokens_seen": 280269536, + "step": 7130 + }, + { + "epoch": 0.3459784248048117, + "grad_norm": 0.3829779624938965, + "learning_rate": 3.801400482463581e-05, + "loss": 1.3783, + "num_input_tokens_seen": 280677220, + "step": 7140 + }, + { + "epoch": 0.3464629884249865, + "grad_norm": 0.40414533019065857, + "learning_rate": 3.798048765794151e-05, + "loss": 1.3914, + "num_input_tokens_seen": 281062416, + "step": 7150 + }, + { + "epoch": 0.3469475520451613, + "grad_norm": 0.4087967574596405, + "learning_rate": 3.7946938517562635e-05, + "loss": 1.4013, + "num_input_tokens_seen": 281462400, + "step": 7160 + }, + { + "epoch": 0.34743211566533616, + "grad_norm": 0.3836404085159302, + "learning_rate": 3.791335748613779e-05, + "loss": 1.4126, + "num_input_tokens_seen": 281857064, + "step": 7170 + }, + { + "epoch": 0.34791667928551095, + "grad_norm": 0.4010114073753357, + "learning_rate": 3.7879744646384154e-05, + "loss": 1.4138, + "num_input_tokens_seen": 282243900, + "step": 7180 + }, + { + "epoch": 0.34840124290568575, + "grad_norm": 0.42484843730926514, + "learning_rate": 3.7846100081097255e-05, + "loss": 1.3773, + "num_input_tokens_seen": 282661384, + "step": 7190 + }, + { + "epoch": 0.34888580652586054, + "grad_norm": 0.42864614725112915, + "learning_rate": 3.7812423873150775e-05, + "loss": 1.4116, + "num_input_tokens_seen": 283074744, + "step": 7200 + }, + { + "epoch": 0.34937037014603534, + "grad_norm": 0.4581463634967804, + "learning_rate": 3.777871610549632e-05, + "loss": 1.355, + "num_input_tokens_seen": 283452024, + "step": 7210 + }, + { + "epoch": 0.3498549337662102, + "grad_norm": 0.4017016887664795, + "learning_rate": 3.774497686116327e-05, + "loss": 1.393, + "num_input_tokens_seen": 283843012, + "step": 7220 + }, + { + "epoch": 0.350339497386385, + "grad_norm": 0.387157142162323, + "learning_rate": 3.7711206223258493e-05, + "loss": 1.3497, + "num_input_tokens_seen": 284225640, + "step": 7230 + }, + { + "epoch": 0.3508240610065598, + "grad_norm": 0.42208153009414673, + "learning_rate": 3.767740427496621e-05, + "loss": 1.4008, + "num_input_tokens_seen": 284608708, + "step": 7240 + }, + { + "epoch": 0.35130862462673457, + "grad_norm": 0.4280160367488861, + "learning_rate": 3.764357109954777e-05, + "loss": 1.326, + "num_input_tokens_seen": 285029224, + "step": 7250 + }, + { + "epoch": 0.3517931882469094, + "grad_norm": 0.39940527081489563, + "learning_rate": 3.7609706780341425e-05, + "loss": 1.3688, + "num_input_tokens_seen": 285413332, + "step": 7260 + }, + { + "epoch": 0.3522777518670842, + "grad_norm": 0.40198463201522827, + "learning_rate": 3.757581140076217e-05, + "loss": 1.3835, + "num_input_tokens_seen": 285805028, + "step": 7270 + }, + { + "epoch": 0.352762315487259, + "grad_norm": 0.4302620589733124, + "learning_rate": 3.754188504430147e-05, + "loss": 1.3317, + "num_input_tokens_seen": 286217520, + "step": 7280 + }, + { + "epoch": 0.3532468791074338, + "grad_norm": 0.41558027267456055, + "learning_rate": 3.750792779452712e-05, + "loss": 1.3584, + "num_input_tokens_seen": 286618988, + "step": 7290 + }, + { + "epoch": 0.3537314427276086, + "grad_norm": 0.4010438621044159, + "learning_rate": 3.7473939735082995e-05, + "loss": 1.3638, + "num_input_tokens_seen": 286993540, + "step": 7300 + }, + { + "epoch": 0.35421600634778344, + "grad_norm": 0.4007185995578766, + "learning_rate": 3.743992094968888e-05, + "loss": 1.3662, + "num_input_tokens_seen": 287379072, + "step": 7310 + }, + { + "epoch": 0.35470056996795823, + "grad_norm": 0.4279842674732208, + "learning_rate": 3.740587152214022e-05, + "loss": 1.3124, + "num_input_tokens_seen": 287779160, + "step": 7320 + }, + { + "epoch": 0.35518513358813303, + "grad_norm": 0.4510299563407898, + "learning_rate": 3.737179153630797e-05, + "loss": 1.4197, + "num_input_tokens_seen": 288150168, + "step": 7330 + }, + { + "epoch": 0.3556696972083078, + "grad_norm": 0.4274813234806061, + "learning_rate": 3.733768107613832e-05, + "loss": 1.3953, + "num_input_tokens_seen": 288526312, + "step": 7340 + }, + { + "epoch": 0.35615426082848267, + "grad_norm": 0.4202810525894165, + "learning_rate": 3.730354022565257e-05, + "loss": 1.4117, + "num_input_tokens_seen": 288933160, + "step": 7350 + }, + { + "epoch": 0.35663882444865747, + "grad_norm": 0.38731688261032104, + "learning_rate": 3.7269369068946816e-05, + "loss": 1.3829, + "num_input_tokens_seen": 289312352, + "step": 7360 + }, + { + "epoch": 0.35712338806883226, + "grad_norm": 0.39851441979408264, + "learning_rate": 3.7235167690191856e-05, + "loss": 1.354, + "num_input_tokens_seen": 289728744, + "step": 7370 + }, + { + "epoch": 0.35760795168900705, + "grad_norm": 0.35968631505966187, + "learning_rate": 3.7200936173632915e-05, + "loss": 1.4089, + "num_input_tokens_seen": 290118632, + "step": 7380 + }, + { + "epoch": 0.3580925153091819, + "grad_norm": 0.41429969668388367, + "learning_rate": 3.716667460358945e-05, + "loss": 1.3695, + "num_input_tokens_seen": 290507048, + "step": 7390 + }, + { + "epoch": 0.3585770789293567, + "grad_norm": 0.37059974670410156, + "learning_rate": 3.7132383064454956e-05, + "loss": 1.3864, + "num_input_tokens_seen": 290871436, + "step": 7400 + }, + { + "epoch": 0.3590616425495315, + "grad_norm": 0.3918827474117279, + "learning_rate": 3.7098061640696734e-05, + "loss": 1.3879, + "num_input_tokens_seen": 291230652, + "step": 7410 + }, + { + "epoch": 0.3595462061697063, + "grad_norm": 0.36059802770614624, + "learning_rate": 3.706371041685571e-05, + "loss": 1.3579, + "num_input_tokens_seen": 291641780, + "step": 7420 + }, + { + "epoch": 0.3600307697898811, + "grad_norm": 0.4405132234096527, + "learning_rate": 3.70293294775462e-05, + "loss": 1.3851, + "num_input_tokens_seen": 292025908, + "step": 7430 + }, + { + "epoch": 0.3605153334100559, + "grad_norm": 0.4112297594547272, + "learning_rate": 3.6994918907455734e-05, + "loss": 1.3806, + "num_input_tokens_seen": 292393396, + "step": 7440 + }, + { + "epoch": 0.3609998970302307, + "grad_norm": 0.39005085825920105, + "learning_rate": 3.696047879134481e-05, + "loss": 1.3836, + "num_input_tokens_seen": 292779588, + "step": 7450 + }, + { + "epoch": 0.3614844606504055, + "grad_norm": 0.4453318417072296, + "learning_rate": 3.692600921404672e-05, + "loss": 1.37, + "num_input_tokens_seen": 293187468, + "step": 7460 + }, + { + "epoch": 0.3619690242705803, + "grad_norm": 0.4039851725101471, + "learning_rate": 3.689151026046732e-05, + "loss": 1.3835, + "num_input_tokens_seen": 293571348, + "step": 7470 + }, + { + "epoch": 0.36245358789075516, + "grad_norm": 0.4166390001773834, + "learning_rate": 3.685698201558482e-05, + "loss": 1.3745, + "num_input_tokens_seen": 293971468, + "step": 7480 + }, + { + "epoch": 0.36293815151092995, + "grad_norm": 0.41377830505371094, + "learning_rate": 3.6822424564449584e-05, + "loss": 1.3585, + "num_input_tokens_seen": 294316832, + "step": 7490 + }, + { + "epoch": 0.36342271513110475, + "grad_norm": 0.3810221254825592, + "learning_rate": 3.6787837992183916e-05, + "loss": 1.3045, + "num_input_tokens_seen": 294725576, + "step": 7500 + }, + { + "epoch": 0.36390727875127954, + "grad_norm": 0.3837166428565979, + "learning_rate": 3.675322238398186e-05, + "loss": 1.3864, + "num_input_tokens_seen": 295122992, + "step": 7510 + }, + { + "epoch": 0.36439184237145433, + "grad_norm": 0.4303184151649475, + "learning_rate": 3.671857782510897e-05, + "loss": 1.4116, + "num_input_tokens_seen": 295537492, + "step": 7520 + }, + { + "epoch": 0.3648764059916292, + "grad_norm": 0.4150845408439636, + "learning_rate": 3.668390440090212e-05, + "loss": 1.3549, + "num_input_tokens_seen": 295910304, + "step": 7530 + }, + { + "epoch": 0.365360969611804, + "grad_norm": 0.39363113045692444, + "learning_rate": 3.6649202196769284e-05, + "loss": 1.3322, + "num_input_tokens_seen": 296280276, + "step": 7540 + }, + { + "epoch": 0.36584553323197877, + "grad_norm": 0.395766943693161, + "learning_rate": 3.6614471298189323e-05, + "loss": 1.3772, + "num_input_tokens_seen": 296664476, + "step": 7550 + }, + { + "epoch": 0.36633009685215356, + "grad_norm": 0.3787136971950531, + "learning_rate": 3.6579711790711777e-05, + "loss": 1.3246, + "num_input_tokens_seen": 297062148, + "step": 7560 + }, + { + "epoch": 0.3668146604723284, + "grad_norm": 0.44149187207221985, + "learning_rate": 3.654492375995666e-05, + "loss": 1.3614, + "num_input_tokens_seen": 297468140, + "step": 7570 + }, + { + "epoch": 0.3672992240925032, + "grad_norm": 0.4064597189426422, + "learning_rate": 3.6510107291614254e-05, + "loss": 1.3648, + "num_input_tokens_seen": 297882592, + "step": 7580 + }, + { + "epoch": 0.367783787712678, + "grad_norm": 0.425382137298584, + "learning_rate": 3.647526247144486e-05, + "loss": 1.3475, + "num_input_tokens_seen": 298248144, + "step": 7590 + }, + { + "epoch": 0.3682683513328528, + "grad_norm": 0.4232633411884308, + "learning_rate": 3.644038938527866e-05, + "loss": 1.3726, + "num_input_tokens_seen": 298645072, + "step": 7600 + }, + { + "epoch": 0.3687529149530276, + "grad_norm": 0.4248599708080292, + "learning_rate": 3.640548811901541e-05, + "loss": 1.3719, + "num_input_tokens_seen": 299051544, + "step": 7610 + }, + { + "epoch": 0.36923747857320244, + "grad_norm": 0.4088067412376404, + "learning_rate": 3.637055875862433e-05, + "loss": 1.3693, + "num_input_tokens_seen": 299444432, + "step": 7620 + }, + { + "epoch": 0.36972204219337723, + "grad_norm": 0.4372110366821289, + "learning_rate": 3.6335601390143797e-05, + "loss": 1.3766, + "num_input_tokens_seen": 299838044, + "step": 7630 + }, + { + "epoch": 0.370206605813552, + "grad_norm": 0.391181081533432, + "learning_rate": 3.630061609968121e-05, + "loss": 1.3456, + "num_input_tokens_seen": 300211940, + "step": 7640 + }, + { + "epoch": 0.3706911694337268, + "grad_norm": 0.4158838391304016, + "learning_rate": 3.6265602973412736e-05, + "loss": 1.3756, + "num_input_tokens_seen": 300598508, + "step": 7650 + }, + { + "epoch": 0.37117573305390167, + "grad_norm": 0.39379456639289856, + "learning_rate": 3.623056209758309e-05, + "loss": 1.413, + "num_input_tokens_seen": 301012928, + "step": 7660 + }, + { + "epoch": 0.37166029667407646, + "grad_norm": 0.40179499983787537, + "learning_rate": 3.619549355850536e-05, + "loss": 1.367, + "num_input_tokens_seen": 301381712, + "step": 7670 + }, + { + "epoch": 0.37214486029425126, + "grad_norm": 0.3952077627182007, + "learning_rate": 3.616039744256078e-05, + "loss": 1.3613, + "num_input_tokens_seen": 301797060, + "step": 7680 + }, + { + "epoch": 0.37262942391442605, + "grad_norm": 0.41166162490844727, + "learning_rate": 3.61252738361985e-05, + "loss": 1.3913, + "num_input_tokens_seen": 302205268, + "step": 7690 + }, + { + "epoch": 0.37311398753460084, + "grad_norm": 0.38384559750556946, + "learning_rate": 3.609012282593538e-05, + "loss": 1.3422, + "num_input_tokens_seen": 302592424, + "step": 7700 + }, + { + "epoch": 0.3735985511547757, + "grad_norm": 0.4590049982070923, + "learning_rate": 3.605494449835578e-05, + "loss": 1.3665, + "num_input_tokens_seen": 302963964, + "step": 7710 + }, + { + "epoch": 0.3740831147749505, + "grad_norm": 0.43561649322509766, + "learning_rate": 3.601973894011137e-05, + "loss": 1.3385, + "num_input_tokens_seen": 303350216, + "step": 7720 + }, + { + "epoch": 0.3745676783951253, + "grad_norm": 0.42796000838279724, + "learning_rate": 3.598450623792088e-05, + "loss": 1.362, + "num_input_tokens_seen": 303774160, + "step": 7730 + }, + { + "epoch": 0.3750522420153001, + "grad_norm": 0.37895604968070984, + "learning_rate": 3.5949246478569885e-05, + "loss": 1.3766, + "num_input_tokens_seen": 304184292, + "step": 7740 + }, + { + "epoch": 0.3755368056354749, + "grad_norm": 0.3759874105453491, + "learning_rate": 3.591395974891065e-05, + "loss": 1.3544, + "num_input_tokens_seen": 304615976, + "step": 7750 + }, + { + "epoch": 0.3760213692556497, + "grad_norm": 0.4173840880393982, + "learning_rate": 3.5878646135861826e-05, + "loss": 1.3919, + "num_input_tokens_seen": 305014124, + "step": 7760 + }, + { + "epoch": 0.3765059328758245, + "grad_norm": 0.45354533195495605, + "learning_rate": 3.5843305726408323e-05, + "loss": 1.3588, + "num_input_tokens_seen": 305416284, + "step": 7770 + }, + { + "epoch": 0.3769904964959993, + "grad_norm": 0.3822251260280609, + "learning_rate": 3.580793860760103e-05, + "loss": 1.3708, + "num_input_tokens_seen": 305811776, + "step": 7780 + }, + { + "epoch": 0.37747506011617415, + "grad_norm": 0.3847945034503937, + "learning_rate": 3.5772544866556634e-05, + "loss": 1.3743, + "num_input_tokens_seen": 306210288, + "step": 7790 + }, + { + "epoch": 0.37795962373634895, + "grad_norm": 0.3933156132698059, + "learning_rate": 3.5737124590457404e-05, + "loss": 1.3817, + "num_input_tokens_seen": 306612992, + "step": 7800 + }, + { + "epoch": 0.37844418735652374, + "grad_norm": 0.42493316531181335, + "learning_rate": 3.570167786655096e-05, + "loss": 1.3504, + "num_input_tokens_seen": 306981668, + "step": 7810 + }, + { + "epoch": 0.37892875097669854, + "grad_norm": 0.4432273507118225, + "learning_rate": 3.566620478215008e-05, + "loss": 1.3901, + "num_input_tokens_seen": 307387500, + "step": 7820 + }, + { + "epoch": 0.37941331459687333, + "grad_norm": 0.4004375636577606, + "learning_rate": 3.5630705424632475e-05, + "loss": 1.4102, + "num_input_tokens_seen": 307810852, + "step": 7830 + }, + { + "epoch": 0.3798978782170482, + "grad_norm": 0.38001713156700134, + "learning_rate": 3.5595179881440554e-05, + "loss": 1.4202, + "num_input_tokens_seen": 308189112, + "step": 7840 + }, + { + "epoch": 0.380382441837223, + "grad_norm": 0.40970560908317566, + "learning_rate": 3.5559628240081244e-05, + "loss": 1.3796, + "num_input_tokens_seen": 308593308, + "step": 7850 + }, + { + "epoch": 0.38086700545739777, + "grad_norm": 0.4416448473930359, + "learning_rate": 3.5524050588125744e-05, + "loss": 1.3305, + "num_input_tokens_seen": 308977396, + "step": 7860 + }, + { + "epoch": 0.38135156907757256, + "grad_norm": 0.3899739682674408, + "learning_rate": 3.548844701320934e-05, + "loss": 1.3648, + "num_input_tokens_seen": 309358540, + "step": 7870 + }, + { + "epoch": 0.3818361326977474, + "grad_norm": 0.4026392698287964, + "learning_rate": 3.545281760303116e-05, + "loss": 1.3735, + "num_input_tokens_seen": 309781936, + "step": 7880 + }, + { + "epoch": 0.3823206963179222, + "grad_norm": 0.41612690687179565, + "learning_rate": 3.5417162445353965e-05, + "loss": 1.346, + "num_input_tokens_seen": 310174720, + "step": 7890 + }, + { + "epoch": 0.382805259938097, + "grad_norm": 0.3857744336128235, + "learning_rate": 3.5381481628003964e-05, + "loss": 1.386, + "num_input_tokens_seen": 310557832, + "step": 7900 + }, + { + "epoch": 0.3832898235582718, + "grad_norm": 0.4121117889881134, + "learning_rate": 3.534577523887053e-05, + "loss": 1.3303, + "num_input_tokens_seen": 310940032, + "step": 7910 + }, + { + "epoch": 0.3837743871784466, + "grad_norm": 0.45025011897087097, + "learning_rate": 3.5310043365906046e-05, + "loss": 1.3234, + "num_input_tokens_seen": 311335136, + "step": 7920 + }, + { + "epoch": 0.38425895079862143, + "grad_norm": 0.3884856700897217, + "learning_rate": 3.527428609712569e-05, + "loss": 1.3849, + "num_input_tokens_seen": 311699972, + "step": 7930 + }, + { + "epoch": 0.38474351441879623, + "grad_norm": 0.37907713651657104, + "learning_rate": 3.5238503520607144e-05, + "loss": 1.3602, + "num_input_tokens_seen": 312078664, + "step": 7940 + }, + { + "epoch": 0.385228078038971, + "grad_norm": 0.37585073709487915, + "learning_rate": 3.520269572449047e-05, + "loss": 1.3895, + "num_input_tokens_seen": 312453588, + "step": 7950 + }, + { + "epoch": 0.3857126416591458, + "grad_norm": 0.395000696182251, + "learning_rate": 3.516686279697784e-05, + "loss": 1.3542, + "num_input_tokens_seen": 312865692, + "step": 7960 + }, + { + "epoch": 0.38619720527932067, + "grad_norm": 0.40303224325180054, + "learning_rate": 3.513100482633332e-05, + "loss": 1.3497, + "num_input_tokens_seen": 313258996, + "step": 7970 + }, + { + "epoch": 0.38668176889949546, + "grad_norm": 0.4608016312122345, + "learning_rate": 3.509512190088269e-05, + "loss": 1.3596, + "num_input_tokens_seen": 313656952, + "step": 7980 + }, + { + "epoch": 0.38716633251967025, + "grad_norm": 0.393044114112854, + "learning_rate": 3.505921410901316e-05, + "loss": 1.3828, + "num_input_tokens_seen": 314048964, + "step": 7990 + }, + { + "epoch": 0.38765089613984505, + "grad_norm": 0.41909295320510864, + "learning_rate": 3.50232815391732e-05, + "loss": 1.3764, + "num_input_tokens_seen": 314442716, + "step": 8000 + }, + { + "epoch": 0.38765089613984505, + "eval_loss": 1.471493124961853, + "eval_runtime": 4.7345, + "eval_samples_per_second": 31.683, + "eval_steps_per_second": 4.013, + "num_input_tokens_seen": 314442716, + "step": 8000 + }, + { + "epoch": 0.38813545976001984, + "grad_norm": 0.4128382205963135, + "learning_rate": 3.498732427987236e-05, + "loss": 1.3531, + "num_input_tokens_seen": 314818980, + "step": 8010 + }, + { + "epoch": 0.3886200233801947, + "grad_norm": 0.4025222659111023, + "learning_rate": 3.4951342419680946e-05, + "loss": 1.3588, + "num_input_tokens_seen": 315239696, + "step": 8020 + }, + { + "epoch": 0.3891045870003695, + "grad_norm": 0.4064643383026123, + "learning_rate": 3.491533604722987e-05, + "loss": 1.3689, + "num_input_tokens_seen": 315645796, + "step": 8030 + }, + { + "epoch": 0.3895891506205443, + "grad_norm": 0.36004194617271423, + "learning_rate": 3.4879305251210474e-05, + "loss": 1.3705, + "num_input_tokens_seen": 316049168, + "step": 8040 + }, + { + "epoch": 0.39007371424071907, + "grad_norm": 0.36490750312805176, + "learning_rate": 3.4843250120374206e-05, + "loss": 1.3596, + "num_input_tokens_seen": 316455320, + "step": 8050 + }, + { + "epoch": 0.3905582778608939, + "grad_norm": 0.3944610357284546, + "learning_rate": 3.4807170743532466e-05, + "loss": 1.3421, + "num_input_tokens_seen": 316841708, + "step": 8060 + }, + { + "epoch": 0.3910428414810687, + "grad_norm": 0.3791704475879669, + "learning_rate": 3.4771067209556405e-05, + "loss": 1.3642, + "num_input_tokens_seen": 317231160, + "step": 8070 + }, + { + "epoch": 0.3915274051012435, + "grad_norm": 0.39615359902381897, + "learning_rate": 3.4734939607376635e-05, + "loss": 1.3042, + "num_input_tokens_seen": 317616996, + "step": 8080 + }, + { + "epoch": 0.3920119687214183, + "grad_norm": 0.3907454013824463, + "learning_rate": 3.469878802598308e-05, + "loss": 1.3768, + "num_input_tokens_seen": 318013328, + "step": 8090 + }, + { + "epoch": 0.39249653234159315, + "grad_norm": 0.46760374307632446, + "learning_rate": 3.466261255442473e-05, + "loss": 1.3638, + "num_input_tokens_seen": 318396864, + "step": 8100 + }, + { + "epoch": 0.39298109596176795, + "grad_norm": 0.38027724623680115, + "learning_rate": 3.4626413281809434e-05, + "loss": 1.3292, + "num_input_tokens_seen": 318748480, + "step": 8110 + }, + { + "epoch": 0.39346565958194274, + "grad_norm": 0.38730815052986145, + "learning_rate": 3.4590190297303623e-05, + "loss": 1.3718, + "num_input_tokens_seen": 319146300, + "step": 8120 + }, + { + "epoch": 0.39395022320211753, + "grad_norm": 0.39373862743377686, + "learning_rate": 3.455394369013218e-05, + "loss": 1.384, + "num_input_tokens_seen": 319537280, + "step": 8130 + }, + { + "epoch": 0.3944347868222923, + "grad_norm": 0.40906721353530884, + "learning_rate": 3.4517673549578154e-05, + "loss": 1.3672, + "num_input_tokens_seen": 319904616, + "step": 8140 + }, + { + "epoch": 0.3949193504424672, + "grad_norm": 0.4988723397254944, + "learning_rate": 3.448137996498258e-05, + "loss": 1.3505, + "num_input_tokens_seen": 320278336, + "step": 8150 + }, + { + "epoch": 0.39540391406264197, + "grad_norm": 0.4247932434082031, + "learning_rate": 3.44450630257442e-05, + "loss": 1.3632, + "num_input_tokens_seen": 320673012, + "step": 8160 + }, + { + "epoch": 0.39588847768281676, + "grad_norm": 0.37421149015426636, + "learning_rate": 3.440872282131934e-05, + "loss": 1.3142, + "num_input_tokens_seen": 321069804, + "step": 8170 + }, + { + "epoch": 0.39637304130299156, + "grad_norm": 0.41044512391090393, + "learning_rate": 3.4372359441221594e-05, + "loss": 1.3794, + "num_input_tokens_seen": 321501056, + "step": 8180 + }, + { + "epoch": 0.3968576049231664, + "grad_norm": 0.3953944742679596, + "learning_rate": 3.4335972975021646e-05, + "loss": 1.3451, + "num_input_tokens_seen": 321913760, + "step": 8190 + }, + { + "epoch": 0.3973421685433412, + "grad_norm": 0.44824719429016113, + "learning_rate": 3.429956351234705e-05, + "loss": 1.4203, + "num_input_tokens_seen": 322313132, + "step": 8200 + }, + { + "epoch": 0.397826732163516, + "grad_norm": 0.4102291166782379, + "learning_rate": 3.426313114288203e-05, + "loss": 1.3424, + "num_input_tokens_seen": 322675076, + "step": 8210 + }, + { + "epoch": 0.3983112957836908, + "grad_norm": 0.375482976436615, + "learning_rate": 3.4226675956367195e-05, + "loss": 1.3142, + "num_input_tokens_seen": 323090684, + "step": 8220 + }, + { + "epoch": 0.3987958594038656, + "grad_norm": 0.42846444249153137, + "learning_rate": 3.419019804259937e-05, + "loss": 1.3529, + "num_input_tokens_seen": 323463428, + "step": 8230 + }, + { + "epoch": 0.39928042302404043, + "grad_norm": 0.42404618859291077, + "learning_rate": 3.4153697491431375e-05, + "loss": 1.3657, + "num_input_tokens_seen": 323846056, + "step": 8240 + }, + { + "epoch": 0.3997649866442152, + "grad_norm": 0.40639814734458923, + "learning_rate": 3.411717439277178e-05, + "loss": 1.3446, + "num_input_tokens_seen": 324257868, + "step": 8250 + }, + { + "epoch": 0.40024955026439, + "grad_norm": 0.41451844573020935, + "learning_rate": 3.40806288365847e-05, + "loss": 1.379, + "num_input_tokens_seen": 324644548, + "step": 8260 + }, + { + "epoch": 0.4007341138845648, + "grad_norm": 0.42741525173187256, + "learning_rate": 3.404406091288956e-05, + "loss": 1.383, + "num_input_tokens_seen": 325051824, + "step": 8270 + }, + { + "epoch": 0.40121867750473966, + "grad_norm": 0.35797035694122314, + "learning_rate": 3.4007470711760885e-05, + "loss": 1.3184, + "num_input_tokens_seen": 325435196, + "step": 8280 + }, + { + "epoch": 0.40170324112491446, + "grad_norm": 0.44313570857048035, + "learning_rate": 3.397085832332808e-05, + "loss": 1.3761, + "num_input_tokens_seen": 325815308, + "step": 8290 + }, + { + "epoch": 0.40218780474508925, + "grad_norm": 0.4352870285511017, + "learning_rate": 3.393422383777518e-05, + "loss": 1.3331, + "num_input_tokens_seen": 326215248, + "step": 8300 + }, + { + "epoch": 0.40267236836526404, + "grad_norm": 0.427741140127182, + "learning_rate": 3.389756734534069e-05, + "loss": 1.3546, + "num_input_tokens_seen": 326587588, + "step": 8310 + }, + { + "epoch": 0.40315693198543884, + "grad_norm": 0.39280158281326294, + "learning_rate": 3.386088893631727e-05, + "loss": 1.3251, + "num_input_tokens_seen": 326969008, + "step": 8320 + }, + { + "epoch": 0.4036414956056137, + "grad_norm": 0.37476789951324463, + "learning_rate": 3.382418870105161e-05, + "loss": 1.3944, + "num_input_tokens_seen": 327337052, + "step": 8330 + }, + { + "epoch": 0.4041260592257885, + "grad_norm": 0.3965238332748413, + "learning_rate": 3.3787466729944156e-05, + "loss": 1.3314, + "num_input_tokens_seen": 327726100, + "step": 8340 + }, + { + "epoch": 0.4046106228459633, + "grad_norm": 0.3928758203983307, + "learning_rate": 3.375072311344887e-05, + "loss": 1.3753, + "num_input_tokens_seen": 328097960, + "step": 8350 + }, + { + "epoch": 0.40509518646613807, + "grad_norm": 0.4109261631965637, + "learning_rate": 3.371395794207304e-05, + "loss": 1.3061, + "num_input_tokens_seen": 328495520, + "step": 8360 + }, + { + "epoch": 0.4055797500863129, + "grad_norm": 0.3748038113117218, + "learning_rate": 3.3677171306377066e-05, + "loss": 1.3259, + "num_input_tokens_seen": 328901644, + "step": 8370 + }, + { + "epoch": 0.4060643137064877, + "grad_norm": 0.40184059739112854, + "learning_rate": 3.36403632969742e-05, + "loss": 1.372, + "num_input_tokens_seen": 329298316, + "step": 8380 + }, + { + "epoch": 0.4065488773266625, + "grad_norm": 0.42977675795555115, + "learning_rate": 3.360353400453035e-05, + "loss": 1.3228, + "num_input_tokens_seen": 329697248, + "step": 8390 + }, + { + "epoch": 0.4070334409468373, + "grad_norm": 0.42152881622314453, + "learning_rate": 3.356668351976385e-05, + "loss": 1.3153, + "num_input_tokens_seen": 330092604, + "step": 8400 + }, + { + "epoch": 0.4075180045670121, + "grad_norm": 0.42566007375717163, + "learning_rate": 3.352981193344523e-05, + "loss": 1.3555, + "num_input_tokens_seen": 330476288, + "step": 8410 + }, + { + "epoch": 0.40800256818718694, + "grad_norm": 0.4720443785190582, + "learning_rate": 3.349291933639701e-05, + "loss": 1.4095, + "num_input_tokens_seen": 330852500, + "step": 8420 + }, + { + "epoch": 0.40848713180736174, + "grad_norm": 0.4077743887901306, + "learning_rate": 3.345600581949344e-05, + "loss": 1.3875, + "num_input_tokens_seen": 331243588, + "step": 8430 + }, + { + "epoch": 0.40897169542753653, + "grad_norm": 0.37246859073638916, + "learning_rate": 3.3419071473660316e-05, + "loss": 1.34, + "num_input_tokens_seen": 331625036, + "step": 8440 + }, + { + "epoch": 0.4094562590477113, + "grad_norm": 0.40044400095939636, + "learning_rate": 3.338211638987475e-05, + "loss": 1.3883, + "num_input_tokens_seen": 332007224, + "step": 8450 + }, + { + "epoch": 0.4099408226678862, + "grad_norm": 0.40075942873954773, + "learning_rate": 3.33451406591649e-05, + "loss": 1.3145, + "num_input_tokens_seen": 332377820, + "step": 8460 + }, + { + "epoch": 0.41042538628806097, + "grad_norm": 0.41642826795578003, + "learning_rate": 3.330814437260983e-05, + "loss": 1.3723, + "num_input_tokens_seen": 332768892, + "step": 8470 + }, + { + "epoch": 0.41090994990823576, + "grad_norm": 0.45073971152305603, + "learning_rate": 3.32711276213392e-05, + "loss": 1.3439, + "num_input_tokens_seen": 333169136, + "step": 8480 + }, + { + "epoch": 0.41139451352841055, + "grad_norm": 0.4418103098869324, + "learning_rate": 3.32340904965331e-05, + "loss": 1.3355, + "num_input_tokens_seen": 333560236, + "step": 8490 + }, + { + "epoch": 0.4118790771485854, + "grad_norm": 0.38057634234428406, + "learning_rate": 3.3197033089421794e-05, + "loss": 1.3748, + "num_input_tokens_seen": 333908344, + "step": 8500 + }, + { + "epoch": 0.4123636407687602, + "grad_norm": 0.38415491580963135, + "learning_rate": 3.31599554912855e-05, + "loss": 1.3595, + "num_input_tokens_seen": 334331680, + "step": 8510 + }, + { + "epoch": 0.412848204388935, + "grad_norm": 0.4075462818145752, + "learning_rate": 3.3122857793454186e-05, + "loss": 1.349, + "num_input_tokens_seen": 334721924, + "step": 8520 + }, + { + "epoch": 0.4133327680091098, + "grad_norm": 0.4361826479434967, + "learning_rate": 3.308574008730732e-05, + "loss": 1.3445, + "num_input_tokens_seen": 335116152, + "step": 8530 + }, + { + "epoch": 0.4138173316292846, + "grad_norm": 0.4111301898956299, + "learning_rate": 3.304860246427366e-05, + "loss": 1.388, + "num_input_tokens_seen": 335498760, + "step": 8540 + }, + { + "epoch": 0.41430189524945943, + "grad_norm": 0.4673983156681061, + "learning_rate": 3.301144501583102e-05, + "loss": 1.3624, + "num_input_tokens_seen": 335894792, + "step": 8550 + }, + { + "epoch": 0.4147864588696342, + "grad_norm": 0.4051591157913208, + "learning_rate": 3.297426783350606e-05, + "loss": 1.2684, + "num_input_tokens_seen": 336281536, + "step": 8560 + }, + { + "epoch": 0.415271022489809, + "grad_norm": 0.4352741241455078, + "learning_rate": 3.293707100887401e-05, + "loss": 1.336, + "num_input_tokens_seen": 336691052, + "step": 8570 + }, + { + "epoch": 0.4157555861099838, + "grad_norm": 0.43101462721824646, + "learning_rate": 3.2899854633558534e-05, + "loss": 1.3345, + "num_input_tokens_seen": 337098468, + "step": 8580 + }, + { + "epoch": 0.41624014973015866, + "grad_norm": 0.43431204557418823, + "learning_rate": 3.2862618799231424e-05, + "loss": 1.3372, + "num_input_tokens_seen": 337498280, + "step": 8590 + }, + { + "epoch": 0.41672471335033345, + "grad_norm": 0.3821309804916382, + "learning_rate": 3.2825363597612405e-05, + "loss": 1.3787, + "num_input_tokens_seen": 337889884, + "step": 8600 + }, + { + "epoch": 0.41720927697050825, + "grad_norm": 0.39203059673309326, + "learning_rate": 3.2788089120468924e-05, + "loss": 1.3589, + "num_input_tokens_seen": 338259116, + "step": 8610 + }, + { + "epoch": 0.41769384059068304, + "grad_norm": 0.39438045024871826, + "learning_rate": 3.275079545961588e-05, + "loss": 1.2828, + "num_input_tokens_seen": 338648980, + "step": 8620 + }, + { + "epoch": 0.41817840421085783, + "grad_norm": 0.4104815423488617, + "learning_rate": 3.271348270691546e-05, + "loss": 1.3274, + "num_input_tokens_seen": 339078392, + "step": 8630 + }, + { + "epoch": 0.4186629678310327, + "grad_norm": 0.39212706685066223, + "learning_rate": 3.2676150954276846e-05, + "loss": 1.3259, + "num_input_tokens_seen": 339453980, + "step": 8640 + }, + { + "epoch": 0.4191475314512075, + "grad_norm": 0.3868629038333893, + "learning_rate": 3.263880029365604e-05, + "loss": 1.3381, + "num_input_tokens_seen": 339852160, + "step": 8650 + }, + { + "epoch": 0.41963209507138227, + "grad_norm": 0.3927762806415558, + "learning_rate": 3.260143081705561e-05, + "loss": 1.3366, + "num_input_tokens_seen": 340245664, + "step": 8660 + }, + { + "epoch": 0.42011665869155707, + "grad_norm": 0.4036239683628082, + "learning_rate": 3.256404261652449e-05, + "loss": 1.3157, + "num_input_tokens_seen": 340637260, + "step": 8670 + }, + { + "epoch": 0.4206012223117319, + "grad_norm": 0.41092178225517273, + "learning_rate": 3.2526635784157695e-05, + "loss": 1.3727, + "num_input_tokens_seen": 341032376, + "step": 8680 + }, + { + "epoch": 0.4210857859319067, + "grad_norm": 0.40871837735176086, + "learning_rate": 3.248921041209618e-05, + "loss": 1.3247, + "num_input_tokens_seen": 341418744, + "step": 8690 + }, + { + "epoch": 0.4215703495520815, + "grad_norm": 0.41571468114852905, + "learning_rate": 3.245176659252654e-05, + "loss": 1.3083, + "num_input_tokens_seen": 341804060, + "step": 8700 + }, + { + "epoch": 0.4220549131722563, + "grad_norm": 0.38524648547172546, + "learning_rate": 3.241430441768081e-05, + "loss": 1.2899, + "num_input_tokens_seen": 342183932, + "step": 8710 + }, + { + "epoch": 0.4225394767924311, + "grad_norm": 0.3792508542537689, + "learning_rate": 3.2376823979836256e-05, + "loss": 1.3544, + "num_input_tokens_seen": 342580412, + "step": 8720 + }, + { + "epoch": 0.42302404041260594, + "grad_norm": 0.3552215099334717, + "learning_rate": 3.233932537131511e-05, + "loss": 1.3571, + "num_input_tokens_seen": 342982212, + "step": 8730 + }, + { + "epoch": 0.42350860403278073, + "grad_norm": 0.40093865990638733, + "learning_rate": 3.230180868448437e-05, + "loss": 1.3159, + "num_input_tokens_seen": 343375004, + "step": 8740 + }, + { + "epoch": 0.4239931676529555, + "grad_norm": 0.42487427592277527, + "learning_rate": 3.2264274011755575e-05, + "loss": 1.3574, + "num_input_tokens_seen": 343760048, + "step": 8750 + }, + { + "epoch": 0.4244777312731303, + "grad_norm": 0.370417058467865, + "learning_rate": 3.222672144558455e-05, + "loss": 1.3853, + "num_input_tokens_seen": 344142832, + "step": 8760 + }, + { + "epoch": 0.42496229489330517, + "grad_norm": 0.42524397373199463, + "learning_rate": 3.21891510784712e-05, + "loss": 1.3497, + "num_input_tokens_seen": 344534952, + "step": 8770 + }, + { + "epoch": 0.42544685851347996, + "grad_norm": 0.36123722791671753, + "learning_rate": 3.215156300295928e-05, + "loss": 1.3211, + "num_input_tokens_seen": 344914096, + "step": 8780 + }, + { + "epoch": 0.42593142213365476, + "grad_norm": 0.38714513182640076, + "learning_rate": 3.2113957311636154e-05, + "loss": 1.3753, + "num_input_tokens_seen": 345315972, + "step": 8790 + }, + { + "epoch": 0.42641598575382955, + "grad_norm": 0.38927286863327026, + "learning_rate": 3.207633409713262e-05, + "loss": 1.3256, + "num_input_tokens_seen": 345704748, + "step": 8800 + }, + { + "epoch": 0.42690054937400435, + "grad_norm": 0.3460540771484375, + "learning_rate": 3.203869345212258e-05, + "loss": 1.358, + "num_input_tokens_seen": 346089592, + "step": 8810 + }, + { + "epoch": 0.4273851129941792, + "grad_norm": 0.39570000767707825, + "learning_rate": 3.20010354693229e-05, + "loss": 1.3397, + "num_input_tokens_seen": 346495852, + "step": 8820 + }, + { + "epoch": 0.427869676614354, + "grad_norm": 0.5098708271980286, + "learning_rate": 3.196336024149316e-05, + "loss": 1.3545, + "num_input_tokens_seen": 346912816, + "step": 8830 + }, + { + "epoch": 0.4283542402345288, + "grad_norm": 0.40923282504081726, + "learning_rate": 3.192566786143541e-05, + "loss": 1.3697, + "num_input_tokens_seen": 347337700, + "step": 8840 + }, + { + "epoch": 0.4288388038547036, + "grad_norm": 0.40802377462387085, + "learning_rate": 3.1887958421993944e-05, + "loss": 1.3471, + "num_input_tokens_seen": 347738832, + "step": 8850 + }, + { + "epoch": 0.4293233674748784, + "grad_norm": 0.4220910668373108, + "learning_rate": 3.185023201605508e-05, + "loss": 1.3619, + "num_input_tokens_seen": 348131200, + "step": 8860 + }, + { + "epoch": 0.4298079310950532, + "grad_norm": 0.4093485474586487, + "learning_rate": 3.181248873654693e-05, + "loss": 1.3157, + "num_input_tokens_seen": 348533868, + "step": 8870 + }, + { + "epoch": 0.430292494715228, + "grad_norm": 0.4146769940853119, + "learning_rate": 3.177472867643917e-05, + "loss": 1.3552, + "num_input_tokens_seen": 348915400, + "step": 8880 + }, + { + "epoch": 0.4307770583354028, + "grad_norm": 0.4098985493183136, + "learning_rate": 3.1736951928742804e-05, + "loss": 1.2922, + "num_input_tokens_seen": 349298476, + "step": 8890 + }, + { + "epoch": 0.43126162195557766, + "grad_norm": 0.3871714174747467, + "learning_rate": 3.169915858650996e-05, + "loss": 1.3978, + "num_input_tokens_seen": 349662284, + "step": 8900 + }, + { + "epoch": 0.43174618557575245, + "grad_norm": 0.4378563463687897, + "learning_rate": 3.166134874283361e-05, + "loss": 1.3481, + "num_input_tokens_seen": 350084716, + "step": 8910 + }, + { + "epoch": 0.43223074919592724, + "grad_norm": 0.3912814259529114, + "learning_rate": 3.16235224908474e-05, + "loss": 1.3555, + "num_input_tokens_seen": 350479224, + "step": 8920 + }, + { + "epoch": 0.43271531281610204, + "grad_norm": 0.41960766911506653, + "learning_rate": 3.158567992372538e-05, + "loss": 1.3159, + "num_input_tokens_seen": 350878796, + "step": 8930 + }, + { + "epoch": 0.43319987643627683, + "grad_norm": 0.3939763009548187, + "learning_rate": 3.154782113468179e-05, + "loss": 1.3948, + "num_input_tokens_seen": 351259368, + "step": 8940 + }, + { + "epoch": 0.4336844400564517, + "grad_norm": 0.37786418199539185, + "learning_rate": 3.1509946216970844e-05, + "loss": 1.3577, + "num_input_tokens_seen": 351642444, + "step": 8950 + }, + { + "epoch": 0.4341690036766265, + "grad_norm": 0.41558992862701416, + "learning_rate": 3.1472055263886443e-05, + "loss": 1.336, + "num_input_tokens_seen": 352081616, + "step": 8960 + }, + { + "epoch": 0.43465356729680127, + "grad_norm": 0.407986044883728, + "learning_rate": 3.143414836876204e-05, + "loss": 1.3638, + "num_input_tokens_seen": 352492396, + "step": 8970 + }, + { + "epoch": 0.43513813091697606, + "grad_norm": 0.3700858950614929, + "learning_rate": 3.13962256249703e-05, + "loss": 1.3362, + "num_input_tokens_seen": 352908204, + "step": 8980 + }, + { + "epoch": 0.4356226945371509, + "grad_norm": 0.43616652488708496, + "learning_rate": 3.1358287125922986e-05, + "loss": 1.3545, + "num_input_tokens_seen": 353322600, + "step": 8990 + }, + { + "epoch": 0.4361072581573257, + "grad_norm": 0.40746721625328064, + "learning_rate": 3.132033296507063e-05, + "loss": 1.3484, + "num_input_tokens_seen": 353715168, + "step": 9000 + }, + { + "epoch": 0.4365918217775005, + "grad_norm": 0.3873213529586792, + "learning_rate": 3.128236323590234e-05, + "loss": 1.338, + "num_input_tokens_seen": 354097720, + "step": 9010 + }, + { + "epoch": 0.4370763853976753, + "grad_norm": 0.37148579955101013, + "learning_rate": 3.1244378031945585e-05, + "loss": 1.3348, + "num_input_tokens_seen": 354523140, + "step": 9020 + }, + { + "epoch": 0.4375609490178501, + "grad_norm": 0.4213094711303711, + "learning_rate": 3.1206377446765966e-05, + "loss": 1.304, + "num_input_tokens_seen": 354914168, + "step": 9030 + }, + { + "epoch": 0.43804551263802494, + "grad_norm": 0.4143025875091553, + "learning_rate": 3.1168361573966945e-05, + "loss": 1.3531, + "num_input_tokens_seen": 355288384, + "step": 9040 + }, + { + "epoch": 0.43853007625819973, + "grad_norm": 0.4011501669883728, + "learning_rate": 3.113033050718966e-05, + "loss": 1.3663, + "num_input_tokens_seen": 355652940, + "step": 9050 + }, + { + "epoch": 0.4390146398783745, + "grad_norm": 0.37911659479141235, + "learning_rate": 3.109228434011265e-05, + "loss": 1.3144, + "num_input_tokens_seen": 356034868, + "step": 9060 + }, + { + "epoch": 0.4394992034985493, + "grad_norm": 0.4045577049255371, + "learning_rate": 3.105422316645169e-05, + "loss": 1.3398, + "num_input_tokens_seen": 356432868, + "step": 9070 + }, + { + "epoch": 0.43998376711872417, + "grad_norm": 0.4345623254776001, + "learning_rate": 3.101614707995948e-05, + "loss": 1.3056, + "num_input_tokens_seen": 356832276, + "step": 9080 + }, + { + "epoch": 0.44046833073889896, + "grad_norm": 0.42983555793762207, + "learning_rate": 3.097805617442546e-05, + "loss": 1.3681, + "num_input_tokens_seen": 357220024, + "step": 9090 + }, + { + "epoch": 0.44095289435907375, + "grad_norm": 0.3999558389186859, + "learning_rate": 3.09399505436756e-05, + "loss": 1.3268, + "num_input_tokens_seen": 357614528, + "step": 9100 + }, + { + "epoch": 0.44143745797924855, + "grad_norm": 0.4042194187641144, + "learning_rate": 3.090183028157211e-05, + "loss": 1.3262, + "num_input_tokens_seen": 357978768, + "step": 9110 + }, + { + "epoch": 0.44192202159942334, + "grad_norm": 0.4250580370426178, + "learning_rate": 3.086369548201326e-05, + "loss": 1.3547, + "num_input_tokens_seen": 358366236, + "step": 9120 + }, + { + "epoch": 0.4424065852195982, + "grad_norm": 0.394369900226593, + "learning_rate": 3.082554623893312e-05, + "loss": 1.3799, + "num_input_tokens_seen": 358751508, + "step": 9130 + }, + { + "epoch": 0.442891148839773, + "grad_norm": 0.38379648327827454, + "learning_rate": 3.0787382646301324e-05, + "loss": 1.3512, + "num_input_tokens_seen": 359143272, + "step": 9140 + }, + { + "epoch": 0.4433757124599478, + "grad_norm": 0.3755399286746979, + "learning_rate": 3.074920479812289e-05, + "loss": 1.3386, + "num_input_tokens_seen": 359526224, + "step": 9150 + }, + { + "epoch": 0.4438602760801226, + "grad_norm": 0.4327252209186554, + "learning_rate": 3.0711012788437916e-05, + "loss": 1.3552, + "num_input_tokens_seen": 359890840, + "step": 9160 + }, + { + "epoch": 0.4443448397002974, + "grad_norm": 0.4325873851776123, + "learning_rate": 3.067280671132139e-05, + "loss": 1.2917, + "num_input_tokens_seen": 360291064, + "step": 9170 + }, + { + "epoch": 0.4448294033204722, + "grad_norm": 0.45133915543556213, + "learning_rate": 3.063458666088296e-05, + "loss": 1.3375, + "num_input_tokens_seen": 360685988, + "step": 9180 + }, + { + "epoch": 0.445313966940647, + "grad_norm": 0.42375510931015015, + "learning_rate": 3.0596352731266684e-05, + "loss": 1.3736, + "num_input_tokens_seen": 361111956, + "step": 9190 + }, + { + "epoch": 0.4457985305608218, + "grad_norm": 0.3929903209209442, + "learning_rate": 3.055810501665082e-05, + "loss": 1.3429, + "num_input_tokens_seen": 361512340, + "step": 9200 + }, + { + "epoch": 0.44628309418099665, + "grad_norm": 0.4164799749851227, + "learning_rate": 3.051984361124756e-05, + "loss": 1.3907, + "num_input_tokens_seen": 361904800, + "step": 9210 + }, + { + "epoch": 0.44676765780117145, + "grad_norm": 0.39341410994529724, + "learning_rate": 3.0481568609302846e-05, + "loss": 1.3494, + "num_input_tokens_seen": 362296512, + "step": 9220 + }, + { + "epoch": 0.44725222142134624, + "grad_norm": 0.37080663442611694, + "learning_rate": 3.0443280105096096e-05, + "loss": 1.3476, + "num_input_tokens_seen": 362693456, + "step": 9230 + }, + { + "epoch": 0.44773678504152103, + "grad_norm": 0.436410129070282, + "learning_rate": 3.0404978192939974e-05, + "loss": 1.3445, + "num_input_tokens_seen": 363121352, + "step": 9240 + }, + { + "epoch": 0.44822134866169583, + "grad_norm": 0.39854753017425537, + "learning_rate": 3.0366662967180198e-05, + "loss": 1.3764, + "num_input_tokens_seen": 363511128, + "step": 9250 + }, + { + "epoch": 0.4487059122818707, + "grad_norm": 0.4296603202819824, + "learning_rate": 3.0328334522195262e-05, + "loss": 1.3258, + "num_input_tokens_seen": 363885672, + "step": 9260 + }, + { + "epoch": 0.44919047590204547, + "grad_norm": 0.42518150806427, + "learning_rate": 3.0289992952396234e-05, + "loss": 1.2693, + "num_input_tokens_seen": 364269780, + "step": 9270 + }, + { + "epoch": 0.44967503952222027, + "grad_norm": 0.4305862784385681, + "learning_rate": 3.0251638352226495e-05, + "loss": 1.3471, + "num_input_tokens_seen": 364672480, + "step": 9280 + }, + { + "epoch": 0.45015960314239506, + "grad_norm": 0.3815818727016449, + "learning_rate": 3.0213270816161536e-05, + "loss": 1.3163, + "num_input_tokens_seen": 365041504, + "step": 9290 + }, + { + "epoch": 0.4506441667625699, + "grad_norm": 0.40457382798194885, + "learning_rate": 3.0174890438708715e-05, + "loss": 1.3232, + "num_input_tokens_seen": 365438776, + "step": 9300 + }, + { + "epoch": 0.4511287303827447, + "grad_norm": 0.43254563212394714, + "learning_rate": 3.0136497314406992e-05, + "loss": 1.3224, + "num_input_tokens_seen": 365820180, + "step": 9310 + }, + { + "epoch": 0.4516132940029195, + "grad_norm": 0.39054685831069946, + "learning_rate": 3.0098091537826766e-05, + "loss": 1.3043, + "num_input_tokens_seen": 366203692, + "step": 9320 + }, + { + "epoch": 0.4520978576230943, + "grad_norm": 0.3737923502922058, + "learning_rate": 3.0059673203569572e-05, + "loss": 1.3191, + "num_input_tokens_seen": 366605944, + "step": 9330 + }, + { + "epoch": 0.4525824212432691, + "grad_norm": 0.47919556498527527, + "learning_rate": 3.0021242406267892e-05, + "loss": 1.3303, + "num_input_tokens_seen": 366992400, + "step": 9340 + }, + { + "epoch": 0.45306698486344393, + "grad_norm": 0.44770705699920654, + "learning_rate": 2.9982799240584907e-05, + "loss": 1.3101, + "num_input_tokens_seen": 367423000, + "step": 9350 + }, + { + "epoch": 0.4535515484836187, + "grad_norm": 0.3613060712814331, + "learning_rate": 2.9944343801214253e-05, + "loss": 1.3585, + "num_input_tokens_seen": 367812296, + "step": 9360 + }, + { + "epoch": 0.4540361121037935, + "grad_norm": 0.40847277641296387, + "learning_rate": 2.9905876182879806e-05, + "loss": 1.2867, + "num_input_tokens_seen": 368210368, + "step": 9370 + }, + { + "epoch": 0.4545206757239683, + "grad_norm": 0.4815332889556885, + "learning_rate": 2.986739648033544e-05, + "loss": 1.4084, + "num_input_tokens_seen": 368581832, + "step": 9380 + }, + { + "epoch": 0.45500523934414316, + "grad_norm": 0.4206528663635254, + "learning_rate": 2.9828904788364785e-05, + "loss": 1.3391, + "num_input_tokens_seen": 368993896, + "step": 9390 + }, + { + "epoch": 0.45548980296431796, + "grad_norm": 0.43159064650535583, + "learning_rate": 2.9790401201781037e-05, + "loss": 1.3808, + "num_input_tokens_seen": 369375788, + "step": 9400 + }, + { + "epoch": 0.45597436658449275, + "grad_norm": 0.43968769907951355, + "learning_rate": 2.975188581542665e-05, + "loss": 1.3115, + "num_input_tokens_seen": 369796992, + "step": 9410 + }, + { + "epoch": 0.45645893020466755, + "grad_norm": 0.40895915031433105, + "learning_rate": 2.9713358724173167e-05, + "loss": 1.3758, + "num_input_tokens_seen": 370171640, + "step": 9420 + }, + { + "epoch": 0.45694349382484234, + "grad_norm": 0.3732263147830963, + "learning_rate": 2.9674820022920953e-05, + "loss": 1.3431, + "num_input_tokens_seen": 370532156, + "step": 9430 + }, + { + "epoch": 0.4574280574450172, + "grad_norm": 0.38699179887771606, + "learning_rate": 2.963626980659898e-05, + "loss": 1.295, + "num_input_tokens_seen": 370950576, + "step": 9440 + }, + { + "epoch": 0.457912621065192, + "grad_norm": 0.4024085998535156, + "learning_rate": 2.9597708170164567e-05, + "loss": 1.3202, + "num_input_tokens_seen": 371331076, + "step": 9450 + }, + { + "epoch": 0.4583971846853668, + "grad_norm": 0.3956678509712219, + "learning_rate": 2.955913520860319e-05, + "loss": 1.3289, + "num_input_tokens_seen": 371714060, + "step": 9460 + }, + { + "epoch": 0.45888174830554157, + "grad_norm": 0.41495439410209656, + "learning_rate": 2.9520551016928193e-05, + "loss": 1.3285, + "num_input_tokens_seen": 372137132, + "step": 9470 + }, + { + "epoch": 0.4593663119257164, + "grad_norm": 0.3949216604232788, + "learning_rate": 2.9481955690180606e-05, + "loss": 1.2749, + "num_input_tokens_seen": 372529012, + "step": 9480 + }, + { + "epoch": 0.4598508755458912, + "grad_norm": 0.3913583755493164, + "learning_rate": 2.9443349323428876e-05, + "loss": 1.3173, + "num_input_tokens_seen": 372945044, + "step": 9490 + }, + { + "epoch": 0.460335439166066, + "grad_norm": 0.4193466901779175, + "learning_rate": 2.9404732011768632e-05, + "loss": 1.2946, + "num_input_tokens_seen": 373331840, + "step": 9500 + }, + { + "epoch": 0.4608200027862408, + "grad_norm": 0.4153105914592743, + "learning_rate": 2.936610385032249e-05, + "loss": 1.3241, + "num_input_tokens_seen": 373726396, + "step": 9510 + }, + { + "epoch": 0.4613045664064156, + "grad_norm": 0.41479748487472534, + "learning_rate": 2.932746493423976e-05, + "loss": 1.3622, + "num_input_tokens_seen": 374113504, + "step": 9520 + }, + { + "epoch": 0.46178913002659044, + "grad_norm": 0.4534710645675659, + "learning_rate": 2.9288815358696265e-05, + "loss": 1.3537, + "num_input_tokens_seen": 374495524, + "step": 9530 + }, + { + "epoch": 0.46227369364676524, + "grad_norm": 0.36162427067756653, + "learning_rate": 2.9250155218894083e-05, + "loss": 1.2787, + "num_input_tokens_seen": 374880396, + "step": 9540 + }, + { + "epoch": 0.46275825726694003, + "grad_norm": 0.36983734369277954, + "learning_rate": 2.9211484610061307e-05, + "loss": 1.3314, + "num_input_tokens_seen": 375287296, + "step": 9550 + }, + { + "epoch": 0.4632428208871148, + "grad_norm": 0.4066306948661804, + "learning_rate": 2.9172803627451817e-05, + "loss": 1.3169, + "num_input_tokens_seen": 375663144, + "step": 9560 + }, + { + "epoch": 0.4637273845072897, + "grad_norm": 0.3863866329193115, + "learning_rate": 2.9134112366345055e-05, + "loss": 1.3676, + "num_input_tokens_seen": 376030684, + "step": 9570 + }, + { + "epoch": 0.46421194812746447, + "grad_norm": 0.4485596716403961, + "learning_rate": 2.909541092204576e-05, + "loss": 1.3345, + "num_input_tokens_seen": 376376536, + "step": 9580 + }, + { + "epoch": 0.46469651174763926, + "grad_norm": 0.4069863259792328, + "learning_rate": 2.9056699389883783e-05, + "loss": 1.2957, + "num_input_tokens_seen": 376751920, + "step": 9590 + }, + { + "epoch": 0.46518107536781406, + "grad_norm": 0.41476011276245117, + "learning_rate": 2.9017977865213814e-05, + "loss": 1.323, + "num_input_tokens_seen": 377165456, + "step": 9600 + }, + { + "epoch": 0.4656656389879889, + "grad_norm": 0.38815972208976746, + "learning_rate": 2.8979246443415132e-05, + "loss": 1.2949, + "num_input_tokens_seen": 377554888, + "step": 9610 + }, + { + "epoch": 0.4661502026081637, + "grad_norm": 0.39285364747047424, + "learning_rate": 2.8940505219891432e-05, + "loss": 1.2969, + "num_input_tokens_seen": 377948944, + "step": 9620 + }, + { + "epoch": 0.4666347662283385, + "grad_norm": 0.43776214122772217, + "learning_rate": 2.890175429007054e-05, + "loss": 1.3136, + "num_input_tokens_seen": 378360736, + "step": 9630 + }, + { + "epoch": 0.4671193298485133, + "grad_norm": 0.3744518458843231, + "learning_rate": 2.8862993749404166e-05, + "loss": 1.3181, + "num_input_tokens_seen": 378760840, + "step": 9640 + }, + { + "epoch": 0.4676038934686881, + "grad_norm": 0.3794598877429962, + "learning_rate": 2.8824223693367724e-05, + "loss": 1.3395, + "num_input_tokens_seen": 379173080, + "step": 9650 + }, + { + "epoch": 0.46808845708886293, + "grad_norm": 0.37090277671813965, + "learning_rate": 2.8785444217460067e-05, + "loss": 1.315, + "num_input_tokens_seen": 379569804, + "step": 9660 + }, + { + "epoch": 0.4685730207090377, + "grad_norm": 0.4118629992008209, + "learning_rate": 2.8746655417203216e-05, + "loss": 1.3049, + "num_input_tokens_seen": 379963480, + "step": 9670 + }, + { + "epoch": 0.4690575843292125, + "grad_norm": 0.4254318177700043, + "learning_rate": 2.8707857388142212e-05, + "loss": 1.331, + "num_input_tokens_seen": 380393192, + "step": 9680 + }, + { + "epoch": 0.4695421479493873, + "grad_norm": 0.4114190638065338, + "learning_rate": 2.866905022584478e-05, + "loss": 1.2986, + "num_input_tokens_seen": 380779764, + "step": 9690 + }, + { + "epoch": 0.47002671156956216, + "grad_norm": 0.4168798625469208, + "learning_rate": 2.8630234025901175e-05, + "loss": 1.2768, + "num_input_tokens_seen": 381172796, + "step": 9700 + }, + { + "epoch": 0.47051127518973695, + "grad_norm": 0.39690959453582764, + "learning_rate": 2.8591408883923892e-05, + "loss": 1.2874, + "num_input_tokens_seen": 381548028, + "step": 9710 + }, + { + "epoch": 0.47099583880991175, + "grad_norm": 0.4413483440876007, + "learning_rate": 2.8552574895547468e-05, + "loss": 1.3259, + "num_input_tokens_seen": 381920092, + "step": 9720 + }, + { + "epoch": 0.47148040243008654, + "grad_norm": 0.3835400640964508, + "learning_rate": 2.8513732156428224e-05, + "loss": 1.3392, + "num_input_tokens_seen": 382302052, + "step": 9730 + }, + { + "epoch": 0.47196496605026134, + "grad_norm": 0.3888629972934723, + "learning_rate": 2.8474880762244034e-05, + "loss": 1.3213, + "num_input_tokens_seen": 382707108, + "step": 9740 + }, + { + "epoch": 0.4724495296704362, + "grad_norm": 0.42624571919441223, + "learning_rate": 2.8436020808694086e-05, + "loss": 1.3378, + "num_input_tokens_seen": 383075220, + "step": 9750 + }, + { + "epoch": 0.472934093290611, + "grad_norm": 0.40198907256126404, + "learning_rate": 2.8397152391498677e-05, + "loss": 1.3228, + "num_input_tokens_seen": 383491136, + "step": 9760 + }, + { + "epoch": 0.4734186569107858, + "grad_norm": 0.4753158390522003, + "learning_rate": 2.835827560639892e-05, + "loss": 1.3172, + "num_input_tokens_seen": 383869604, + "step": 9770 + }, + { + "epoch": 0.47390322053096057, + "grad_norm": 0.4025874733924866, + "learning_rate": 2.831939054915656e-05, + "loss": 1.3099, + "num_input_tokens_seen": 384263944, + "step": 9780 + }, + { + "epoch": 0.4743877841511354, + "grad_norm": 0.4010298550128937, + "learning_rate": 2.8280497315553705e-05, + "loss": 1.2988, + "num_input_tokens_seen": 384659820, + "step": 9790 + }, + { + "epoch": 0.4748723477713102, + "grad_norm": 0.3888191878795624, + "learning_rate": 2.8241596001392617e-05, + "loss": 1.3535, + "num_input_tokens_seen": 385047304, + "step": 9800 + }, + { + "epoch": 0.475356911391485, + "grad_norm": 0.4061359763145447, + "learning_rate": 2.8202686702495447e-05, + "loss": 1.3124, + "num_input_tokens_seen": 385440160, + "step": 9810 + }, + { + "epoch": 0.4758414750116598, + "grad_norm": 0.3900851607322693, + "learning_rate": 2.816376951470402e-05, + "loss": 1.3093, + "num_input_tokens_seen": 385848164, + "step": 9820 + }, + { + "epoch": 0.4763260386318346, + "grad_norm": 0.39700040221214294, + "learning_rate": 2.8124844533879607e-05, + "loss": 1.31, + "num_input_tokens_seen": 386286824, + "step": 9830 + }, + { + "epoch": 0.47681060225200944, + "grad_norm": 0.4107182025909424, + "learning_rate": 2.808591185590265e-05, + "loss": 1.2721, + "num_input_tokens_seen": 386666264, + "step": 9840 + }, + { + "epoch": 0.47729516587218423, + "grad_norm": 0.392866313457489, + "learning_rate": 2.8046971576672582e-05, + "loss": 1.3607, + "num_input_tokens_seen": 387054376, + "step": 9850 + }, + { + "epoch": 0.47777972949235903, + "grad_norm": 0.380993127822876, + "learning_rate": 2.8008023792107512e-05, + "loss": 1.3442, + "num_input_tokens_seen": 387450084, + "step": 9860 + }, + { + "epoch": 0.4782642931125338, + "grad_norm": 0.4111970365047455, + "learning_rate": 2.7969068598144095e-05, + "loss": 1.3341, + "num_input_tokens_seen": 387853932, + "step": 9870 + }, + { + "epoch": 0.47874885673270867, + "grad_norm": 0.385023832321167, + "learning_rate": 2.793010609073719e-05, + "loss": 1.3521, + "num_input_tokens_seen": 388260636, + "step": 9880 + }, + { + "epoch": 0.47923342035288347, + "grad_norm": 0.3947872519493103, + "learning_rate": 2.7891136365859683e-05, + "loss": 1.3337, + "num_input_tokens_seen": 388613256, + "step": 9890 + }, + { + "epoch": 0.47971798397305826, + "grad_norm": 0.36505380272865295, + "learning_rate": 2.7852159519502263e-05, + "loss": 1.3526, + "num_input_tokens_seen": 388996424, + "step": 9900 + }, + { + "epoch": 0.48020254759323305, + "grad_norm": 0.40470102429389954, + "learning_rate": 2.7813175647673123e-05, + "loss": 1.2916, + "num_input_tokens_seen": 389343832, + "step": 9910 + }, + { + "epoch": 0.4806871112134079, + "grad_norm": 0.44250667095184326, + "learning_rate": 2.777418484639779e-05, + "loss": 1.3489, + "num_input_tokens_seen": 389719140, + "step": 9920 + }, + { + "epoch": 0.4811716748335827, + "grad_norm": 0.42761561274528503, + "learning_rate": 2.773518721171884e-05, + "loss": 1.2865, + "num_input_tokens_seen": 390108812, + "step": 9930 + }, + { + "epoch": 0.4816562384537575, + "grad_norm": 0.3793517053127289, + "learning_rate": 2.769618283969569e-05, + "loss": 1.3011, + "num_input_tokens_seen": 390505056, + "step": 9940 + }, + { + "epoch": 0.4821408020739323, + "grad_norm": 0.39107024669647217, + "learning_rate": 2.765717182640436e-05, + "loss": 1.3088, + "num_input_tokens_seen": 390913668, + "step": 9950 + }, + { + "epoch": 0.4826253656941071, + "grad_norm": 0.3844979703426361, + "learning_rate": 2.7618154267937206e-05, + "loss": 1.2989, + "num_input_tokens_seen": 391319096, + "step": 9960 + }, + { + "epoch": 0.4831099293142819, + "grad_norm": 0.4240836203098297, + "learning_rate": 2.7579130260402736e-05, + "loss": 1.3159, + "num_input_tokens_seen": 391707568, + "step": 9970 + }, + { + "epoch": 0.4835944929344567, + "grad_norm": 0.3745054006576538, + "learning_rate": 2.7540099899925325e-05, + "loss": 1.2823, + "num_input_tokens_seen": 392088720, + "step": 9980 + }, + { + "epoch": 0.4840790565546315, + "grad_norm": 0.41763851046562195, + "learning_rate": 2.750106328264499e-05, + "loss": 1.3461, + "num_input_tokens_seen": 392488252, + "step": 9990 + }, + { + "epoch": 0.4845636201748063, + "grad_norm": 0.371481716632843, + "learning_rate": 2.746202050471719e-05, + "loss": 1.3553, + "num_input_tokens_seen": 392909044, + "step": 10000 + }, + { + "epoch": 0.4845636201748063, + "eval_loss": 1.4267897605895996, + "eval_runtime": 3.7798, + "eval_samples_per_second": 39.684, + "eval_steps_per_second": 5.027, + "num_input_tokens_seen": 392909044, + "step": 10000 + }, + { + "epoch": 0.48504818379498116, + "grad_norm": 0.4022182822227478, + "learning_rate": 2.742297166231252e-05, + "loss": 1.3079, + "num_input_tokens_seen": 393315936, + "step": 10010 + }, + { + "epoch": 0.48553274741515595, + "grad_norm": 0.3892512917518616, + "learning_rate": 2.738391685161654e-05, + "loss": 1.324, + "num_input_tokens_seen": 393713452, + "step": 10020 + }, + { + "epoch": 0.48601731103533075, + "grad_norm": 0.43548429012298584, + "learning_rate": 2.7344856168829502e-05, + "loss": 1.2849, + "num_input_tokens_seen": 394098204, + "step": 10030 + }, + { + "epoch": 0.48650187465550554, + "grad_norm": 0.3741033375263214, + "learning_rate": 2.7305789710166123e-05, + "loss": 1.256, + "num_input_tokens_seen": 394469340, + "step": 10040 + }, + { + "epoch": 0.48698643827568033, + "grad_norm": 0.3614325225353241, + "learning_rate": 2.726671757185535e-05, + "loss": 1.3384, + "num_input_tokens_seen": 394882124, + "step": 10050 + }, + { + "epoch": 0.4874710018958552, + "grad_norm": 0.4203681945800781, + "learning_rate": 2.7227639850140118e-05, + "loss": 1.3698, + "num_input_tokens_seen": 395262528, + "step": 10060 + }, + { + "epoch": 0.48795556551603, + "grad_norm": 0.42467623949050903, + "learning_rate": 2.7188556641277107e-05, + "loss": 1.3216, + "num_input_tokens_seen": 395660708, + "step": 10070 + }, + { + "epoch": 0.48844012913620477, + "grad_norm": 0.40550142526626587, + "learning_rate": 2.7149468041536535e-05, + "loss": 1.3211, + "num_input_tokens_seen": 396068784, + "step": 10080 + }, + { + "epoch": 0.48892469275637956, + "grad_norm": 0.4358334541320801, + "learning_rate": 2.711037414720187e-05, + "loss": 1.2643, + "num_input_tokens_seen": 396441268, + "step": 10090 + }, + { + "epoch": 0.4894092563765544, + "grad_norm": 0.38662582635879517, + "learning_rate": 2.7071275054569638e-05, + "loss": 1.2977, + "num_input_tokens_seen": 396849108, + "step": 10100 + }, + { + "epoch": 0.4898938199967292, + "grad_norm": 0.4204176664352417, + "learning_rate": 2.703217085994918e-05, + "loss": 1.2823, + "num_input_tokens_seen": 397241560, + "step": 10110 + }, + { + "epoch": 0.490378383616904, + "grad_norm": 0.41810983419418335, + "learning_rate": 2.699306165966238e-05, + "loss": 1.2884, + "num_input_tokens_seen": 397641548, + "step": 10120 + }, + { + "epoch": 0.4908629472370788, + "grad_norm": 0.43055158853530884, + "learning_rate": 2.695394755004347e-05, + "loss": 1.3254, + "num_input_tokens_seen": 398025916, + "step": 10130 + }, + { + "epoch": 0.4913475108572536, + "grad_norm": 0.43699413537979126, + "learning_rate": 2.691482862743877e-05, + "loss": 1.2906, + "num_input_tokens_seen": 398442156, + "step": 10140 + }, + { + "epoch": 0.49183207447742844, + "grad_norm": 0.461458295583725, + "learning_rate": 2.6875704988206457e-05, + "loss": 1.335, + "num_input_tokens_seen": 398846656, + "step": 10150 + }, + { + "epoch": 0.49231663809760323, + "grad_norm": 0.3919227123260498, + "learning_rate": 2.6836576728716313e-05, + "loss": 1.3533, + "num_input_tokens_seen": 399230044, + "step": 10160 + }, + { + "epoch": 0.492801201717778, + "grad_norm": 0.3971085548400879, + "learning_rate": 2.679744394534952e-05, + "loss": 1.2967, + "num_input_tokens_seen": 399626988, + "step": 10170 + }, + { + "epoch": 0.4932857653379528, + "grad_norm": 0.40311896800994873, + "learning_rate": 2.6758306734498383e-05, + "loss": 1.3335, + "num_input_tokens_seen": 399992148, + "step": 10180 + }, + { + "epoch": 0.49377032895812767, + "grad_norm": 0.4395916759967804, + "learning_rate": 2.6719165192566138e-05, + "loss": 1.2957, + "num_input_tokens_seen": 400375556, + "step": 10190 + }, + { + "epoch": 0.49425489257830246, + "grad_norm": 0.41459786891937256, + "learning_rate": 2.6680019415966673e-05, + "loss": 1.313, + "num_input_tokens_seen": 400752968, + "step": 10200 + }, + { + "epoch": 0.49473945619847726, + "grad_norm": 0.43075108528137207, + "learning_rate": 2.6640869501124305e-05, + "loss": 1.3392, + "num_input_tokens_seen": 401136408, + "step": 10210 + }, + { + "epoch": 0.49522401981865205, + "grad_norm": 0.43251320719718933, + "learning_rate": 2.660171554447355e-05, + "loss": 1.267, + "num_input_tokens_seen": 401538004, + "step": 10220 + }, + { + "epoch": 0.49570858343882684, + "grad_norm": 0.3789636492729187, + "learning_rate": 2.6562557642458872e-05, + "loss": 1.3561, + "num_input_tokens_seen": 401913896, + "step": 10230 + }, + { + "epoch": 0.4961931470590017, + "grad_norm": 0.410152405500412, + "learning_rate": 2.652339589153447e-05, + "loss": 1.3323, + "num_input_tokens_seen": 402303296, + "step": 10240 + }, + { + "epoch": 0.4966777106791765, + "grad_norm": 0.4302387237548828, + "learning_rate": 2.648423038816401e-05, + "loss": 1.3612, + "num_input_tokens_seen": 402673940, + "step": 10250 + }, + { + "epoch": 0.4971622742993513, + "grad_norm": 0.4012112021446228, + "learning_rate": 2.6445061228820406e-05, + "loss": 1.3009, + "num_input_tokens_seen": 403069012, + "step": 10260 + }, + { + "epoch": 0.4976468379195261, + "grad_norm": 0.42036718130111694, + "learning_rate": 2.6405888509985576e-05, + "loss": 1.3463, + "num_input_tokens_seen": 403464332, + "step": 10270 + }, + { + "epoch": 0.4981314015397009, + "grad_norm": 0.42190247774124146, + "learning_rate": 2.6366712328150205e-05, + "loss": 1.3038, + "num_input_tokens_seen": 403863168, + "step": 10280 + }, + { + "epoch": 0.4986159651598757, + "grad_norm": 0.4091307520866394, + "learning_rate": 2.6327532779813506e-05, + "loss": 1.2853, + "num_input_tokens_seen": 404242308, + "step": 10290 + }, + { + "epoch": 0.4991005287800505, + "grad_norm": 0.43492767214775085, + "learning_rate": 2.6288349961482993e-05, + "loss": 1.3446, + "num_input_tokens_seen": 404634860, + "step": 10300 + }, + { + "epoch": 0.4995850924002253, + "grad_norm": 0.4058671295642853, + "learning_rate": 2.624916396967423e-05, + "loss": 1.2642, + "num_input_tokens_seen": 405016164, + "step": 10310 + }, + { + "epoch": 0.5000696560204001, + "grad_norm": 0.3747016191482544, + "learning_rate": 2.620997490091058e-05, + "loss": 1.3063, + "num_input_tokens_seen": 405389624, + "step": 10320 + }, + { + "epoch": 0.5005542196405749, + "grad_norm": 0.4174031913280487, + "learning_rate": 2.617078285172302e-05, + "loss": 1.3075, + "num_input_tokens_seen": 405768816, + "step": 10330 + }, + { + "epoch": 0.5010387832607497, + "grad_norm": 0.42868369817733765, + "learning_rate": 2.6131587918649854e-05, + "loss": 1.3153, + "num_input_tokens_seen": 406177564, + "step": 10340 + }, + { + "epoch": 0.5015233468809246, + "grad_norm": 0.4055063724517822, + "learning_rate": 2.6092390198236468e-05, + "loss": 1.3084, + "num_input_tokens_seen": 406576960, + "step": 10350 + }, + { + "epoch": 0.5020079105010994, + "grad_norm": 0.36954593658447266, + "learning_rate": 2.6053189787035147e-05, + "loss": 1.2724, + "num_input_tokens_seen": 407003236, + "step": 10360 + }, + { + "epoch": 0.5024924741212742, + "grad_norm": 0.46568143367767334, + "learning_rate": 2.6013986781604782e-05, + "loss": 1.2893, + "num_input_tokens_seen": 407393488, + "step": 10370 + }, + { + "epoch": 0.502977037741449, + "grad_norm": 0.40515366196632385, + "learning_rate": 2.5974781278510656e-05, + "loss": 1.3126, + "num_input_tokens_seen": 407794284, + "step": 10380 + }, + { + "epoch": 0.5034616013616238, + "grad_norm": 0.4094693064689636, + "learning_rate": 2.5935573374324228e-05, + "loss": 1.288, + "num_input_tokens_seen": 408172776, + "step": 10390 + }, + { + "epoch": 0.5039461649817986, + "grad_norm": 0.4908318519592285, + "learning_rate": 2.5896363165622833e-05, + "loss": 1.3331, + "num_input_tokens_seen": 408581868, + "step": 10400 + }, + { + "epoch": 0.5044307286019734, + "grad_norm": 0.42689284682273865, + "learning_rate": 2.585715074898951e-05, + "loss": 1.2885, + "num_input_tokens_seen": 408941632, + "step": 10410 + }, + { + "epoch": 0.5049152922221481, + "grad_norm": 0.40442904829978943, + "learning_rate": 2.5817936221012733e-05, + "loss": 1.3529, + "num_input_tokens_seen": 409363076, + "step": 10420 + }, + { + "epoch": 0.5053998558423229, + "grad_norm": 0.38854020833969116, + "learning_rate": 2.5778719678286172e-05, + "loss": 1.2953, + "num_input_tokens_seen": 409759340, + "step": 10430 + }, + { + "epoch": 0.5058844194624978, + "grad_norm": 0.41985204815864563, + "learning_rate": 2.5739501217408457e-05, + "loss": 1.3149, + "num_input_tokens_seen": 410179976, + "step": 10440 + }, + { + "epoch": 0.5063689830826726, + "grad_norm": 0.42030394077301025, + "learning_rate": 2.5700280934982947e-05, + "loss": 1.3282, + "num_input_tokens_seen": 410588712, + "step": 10450 + }, + { + "epoch": 0.5068535467028474, + "grad_norm": 0.4288428723812103, + "learning_rate": 2.5661058927617476e-05, + "loss": 1.2864, + "num_input_tokens_seen": 410998856, + "step": 10460 + }, + { + "epoch": 0.5073381103230222, + "grad_norm": 0.4092228412628174, + "learning_rate": 2.5621835291924157e-05, + "loss": 1.2977, + "num_input_tokens_seen": 411403584, + "step": 10470 + }, + { + "epoch": 0.507822673943197, + "grad_norm": 0.40826088190078735, + "learning_rate": 2.5582610124519087e-05, + "loss": 1.2867, + "num_input_tokens_seen": 411806684, + "step": 10480 + }, + { + "epoch": 0.5083072375633718, + "grad_norm": 0.44615307450294495, + "learning_rate": 2.5543383522022137e-05, + "loss": 1.3498, + "num_input_tokens_seen": 412178432, + "step": 10490 + }, + { + "epoch": 0.5087918011835466, + "grad_norm": 0.382753849029541, + "learning_rate": 2.5504155581056734e-05, + "loss": 1.2994, + "num_input_tokens_seen": 412554676, + "step": 10500 + }, + { + "epoch": 0.5092763648037214, + "grad_norm": 0.4127599000930786, + "learning_rate": 2.546492639824957e-05, + "loss": 1.3228, + "num_input_tokens_seen": 412919116, + "step": 10510 + }, + { + "epoch": 0.5097609284238963, + "grad_norm": 0.3876650333404541, + "learning_rate": 2.542569607023042e-05, + "loss": 1.3474, + "num_input_tokens_seen": 413304516, + "step": 10520 + }, + { + "epoch": 0.5102454920440711, + "grad_norm": 0.3939809799194336, + "learning_rate": 2.5386464693631885e-05, + "loss": 1.3047, + "num_input_tokens_seen": 413691892, + "step": 10530 + }, + { + "epoch": 0.5107300556642459, + "grad_norm": 0.4118193984031677, + "learning_rate": 2.5347232365089125e-05, + "loss": 1.2911, + "num_input_tokens_seen": 414112144, + "step": 10540 + }, + { + "epoch": 0.5112146192844207, + "grad_norm": 0.40481314063072205, + "learning_rate": 2.530799918123966e-05, + "loss": 1.341, + "num_input_tokens_seen": 414501584, + "step": 10550 + }, + { + "epoch": 0.5116991829045955, + "grad_norm": 0.4449983537197113, + "learning_rate": 2.526876523872312e-05, + "loss": 1.3046, + "num_input_tokens_seen": 414863372, + "step": 10560 + }, + { + "epoch": 0.5121837465247703, + "grad_norm": 0.37632080912590027, + "learning_rate": 2.5229530634180986e-05, + "loss": 1.3226, + "num_input_tokens_seen": 415256912, + "step": 10570 + }, + { + "epoch": 0.5126683101449451, + "grad_norm": 0.4017525613307953, + "learning_rate": 2.519029546425639e-05, + "loss": 1.2999, + "num_input_tokens_seen": 415645068, + "step": 10580 + }, + { + "epoch": 0.5131528737651199, + "grad_norm": 0.405073881149292, + "learning_rate": 2.5151059825593847e-05, + "loss": 1.3299, + "num_input_tokens_seen": 416025852, + "step": 10590 + }, + { + "epoch": 0.5136374373852947, + "grad_norm": 0.3685546815395355, + "learning_rate": 2.511182381483902e-05, + "loss": 1.3465, + "num_input_tokens_seen": 416410872, + "step": 10600 + }, + { + "epoch": 0.5141220010054696, + "grad_norm": 0.40560224652290344, + "learning_rate": 2.507258752863851e-05, + "loss": 1.3055, + "num_input_tokens_seen": 416840448, + "step": 10610 + }, + { + "epoch": 0.5146065646256444, + "grad_norm": 0.402498334646225, + "learning_rate": 2.503335106363957e-05, + "loss": 1.3216, + "num_input_tokens_seen": 417239552, + "step": 10620 + }, + { + "epoch": 0.5150911282458192, + "grad_norm": 0.40907391905784607, + "learning_rate": 2.4994114516489917e-05, + "loss": 1.3089, + "num_input_tokens_seen": 417648304, + "step": 10630 + }, + { + "epoch": 0.515575691865994, + "grad_norm": 0.4574085474014282, + "learning_rate": 2.4954877983837446e-05, + "loss": 1.2659, + "num_input_tokens_seen": 418049068, + "step": 10640 + }, + { + "epoch": 0.5160602554861687, + "grad_norm": 0.4318368434906006, + "learning_rate": 2.491564156233005e-05, + "loss": 1.3243, + "num_input_tokens_seen": 418453168, + "step": 10650 + }, + { + "epoch": 0.5165448191063435, + "grad_norm": 0.3779464364051819, + "learning_rate": 2.4876405348615303e-05, + "loss": 1.3325, + "num_input_tokens_seen": 418845328, + "step": 10660 + }, + { + "epoch": 0.5170293827265183, + "grad_norm": 0.4640369713306427, + "learning_rate": 2.483716943934031e-05, + "loss": 1.3597, + "num_input_tokens_seen": 419240400, + "step": 10670 + }, + { + "epoch": 0.5175139463466931, + "grad_norm": 0.39406731724739075, + "learning_rate": 2.47979339311514e-05, + "loss": 1.348, + "num_input_tokens_seen": 419604196, + "step": 10680 + }, + { + "epoch": 0.5179985099668679, + "grad_norm": 0.45061835646629333, + "learning_rate": 2.4758698920693933e-05, + "loss": 1.3107, + "num_input_tokens_seen": 420001636, + "step": 10690 + }, + { + "epoch": 0.5184830735870428, + "grad_norm": 0.3857516050338745, + "learning_rate": 2.4719464504612015e-05, + "loss": 1.2952, + "num_input_tokens_seen": 420412924, + "step": 10700 + }, + { + "epoch": 0.5189676372072176, + "grad_norm": 0.40518441796302795, + "learning_rate": 2.4680230779548325e-05, + "loss": 1.3131, + "num_input_tokens_seen": 420817072, + "step": 10710 + }, + { + "epoch": 0.5194522008273924, + "grad_norm": 0.3917674422264099, + "learning_rate": 2.4640997842143797e-05, + "loss": 1.3628, + "num_input_tokens_seen": 421217132, + "step": 10720 + }, + { + "epoch": 0.5199367644475672, + "grad_norm": 0.41887742280960083, + "learning_rate": 2.4601765789037465e-05, + "loss": 1.2873, + "num_input_tokens_seen": 421590708, + "step": 10730 + }, + { + "epoch": 0.520421328067742, + "grad_norm": 0.4056337773799896, + "learning_rate": 2.456253471686617e-05, + "loss": 1.3099, + "num_input_tokens_seen": 421991544, + "step": 10740 + }, + { + "epoch": 0.5209058916879168, + "grad_norm": 0.4133414924144745, + "learning_rate": 2.452330472226432e-05, + "loss": 1.3146, + "num_input_tokens_seen": 422369440, + "step": 10750 + }, + { + "epoch": 0.5213904553080916, + "grad_norm": 0.39193689823150635, + "learning_rate": 2.4484075901863697e-05, + "loss": 1.3141, + "num_input_tokens_seen": 422783140, + "step": 10760 + }, + { + "epoch": 0.5218750189282664, + "grad_norm": 0.447322279214859, + "learning_rate": 2.444484835229316e-05, + "loss": 1.2537, + "num_input_tokens_seen": 423177384, + "step": 10770 + }, + { + "epoch": 0.5223595825484412, + "grad_norm": 0.3933646082878113, + "learning_rate": 2.4405622170178483e-05, + "loss": 1.3232, + "num_input_tokens_seen": 423569888, + "step": 10780 + }, + { + "epoch": 0.5228441461686161, + "grad_norm": 0.4285268187522888, + "learning_rate": 2.436639745214201e-05, + "loss": 1.2985, + "num_input_tokens_seen": 423969632, + "step": 10790 + }, + { + "epoch": 0.5233287097887909, + "grad_norm": 0.40538379549980164, + "learning_rate": 2.432717429480254e-05, + "loss": 1.3513, + "num_input_tokens_seen": 424379412, + "step": 10800 + }, + { + "epoch": 0.5238132734089657, + "grad_norm": 0.4205954670906067, + "learning_rate": 2.4287952794774972e-05, + "loss": 1.2476, + "num_input_tokens_seen": 424732692, + "step": 10810 + }, + { + "epoch": 0.5242978370291405, + "grad_norm": 0.4187707304954529, + "learning_rate": 2.424873304867018e-05, + "loss": 1.2784, + "num_input_tokens_seen": 425099744, + "step": 10820 + }, + { + "epoch": 0.5247824006493153, + "grad_norm": 0.4088019132614136, + "learning_rate": 2.420951515309466e-05, + "loss": 1.3619, + "num_input_tokens_seen": 425533784, + "step": 10830 + }, + { + "epoch": 0.52526696426949, + "grad_norm": 0.4790429472923279, + "learning_rate": 2.4170299204650402e-05, + "loss": 1.3588, + "num_input_tokens_seen": 425907244, + "step": 10840 + }, + { + "epoch": 0.5257515278896648, + "grad_norm": 0.39465224742889404, + "learning_rate": 2.4131085299934552e-05, + "loss": 1.2817, + "num_input_tokens_seen": 426303280, + "step": 10850 + }, + { + "epoch": 0.5262360915098396, + "grad_norm": 0.3853622376918793, + "learning_rate": 2.4091873535539263e-05, + "loss": 1.3173, + "num_input_tokens_seen": 426708880, + "step": 10860 + }, + { + "epoch": 0.5267206551300144, + "grad_norm": 0.47276604175567627, + "learning_rate": 2.40526640080514e-05, + "loss": 1.267, + "num_input_tokens_seen": 427072384, + "step": 10870 + }, + { + "epoch": 0.5272052187501893, + "grad_norm": 0.4207751452922821, + "learning_rate": 2.40134568140523e-05, + "loss": 1.3232, + "num_input_tokens_seen": 427483780, + "step": 10880 + }, + { + "epoch": 0.5276897823703641, + "grad_norm": 0.4074230194091797, + "learning_rate": 2.3974252050117578e-05, + "loss": 1.3059, + "num_input_tokens_seen": 427878960, + "step": 10890 + }, + { + "epoch": 0.5281743459905389, + "grad_norm": 0.4160650968551636, + "learning_rate": 2.3935049812816853e-05, + "loss": 1.284, + "num_input_tokens_seen": 428262904, + "step": 10900 + }, + { + "epoch": 0.5286589096107137, + "grad_norm": 0.4167976975440979, + "learning_rate": 2.3895850198713532e-05, + "loss": 1.3627, + "num_input_tokens_seen": 428669280, + "step": 10910 + }, + { + "epoch": 0.5291434732308885, + "grad_norm": 0.3857592046260834, + "learning_rate": 2.3856653304364528e-05, + "loss": 1.2677, + "num_input_tokens_seen": 429050584, + "step": 10920 + }, + { + "epoch": 0.5296280368510633, + "grad_norm": 0.39841026067733765, + "learning_rate": 2.38174592263201e-05, + "loss": 1.2863, + "num_input_tokens_seen": 429432948, + "step": 10930 + }, + { + "epoch": 0.5301126004712381, + "grad_norm": 0.37593045830726624, + "learning_rate": 2.377826806112352e-05, + "loss": 1.319, + "num_input_tokens_seen": 429843780, + "step": 10940 + }, + { + "epoch": 0.5305971640914129, + "grad_norm": 0.39701202511787415, + "learning_rate": 2.3739079905310925e-05, + "loss": 1.3137, + "num_input_tokens_seen": 430264816, + "step": 10950 + }, + { + "epoch": 0.5310817277115877, + "grad_norm": 0.4029529094696045, + "learning_rate": 2.3699894855411025e-05, + "loss": 1.3228, + "num_input_tokens_seen": 430690612, + "step": 10960 + }, + { + "epoch": 0.5315662913317626, + "grad_norm": 0.4053628742694855, + "learning_rate": 2.366071300794489e-05, + "loss": 1.3084, + "num_input_tokens_seen": 431088820, + "step": 10970 + }, + { + "epoch": 0.5320508549519374, + "grad_norm": 0.4208217263221741, + "learning_rate": 2.362153445942567e-05, + "loss": 1.2865, + "num_input_tokens_seen": 431510088, + "step": 10980 + }, + { + "epoch": 0.5325354185721122, + "grad_norm": 0.38513317704200745, + "learning_rate": 2.3582359306358425e-05, + "loss": 1.2924, + "num_input_tokens_seen": 431912904, + "step": 10990 + }, + { + "epoch": 0.533019982192287, + "grad_norm": 0.3804323077201843, + "learning_rate": 2.354318764523984e-05, + "loss": 1.3148, + "num_input_tokens_seen": 432316308, + "step": 11000 + }, + { + "epoch": 0.5335045458124618, + "grad_norm": 0.4341334104537964, + "learning_rate": 2.3504019572557978e-05, + "loss": 1.3025, + "num_input_tokens_seen": 432706392, + "step": 11010 + }, + { + "epoch": 0.5339891094326366, + "grad_norm": 0.3772577941417694, + "learning_rate": 2.3464855184792103e-05, + "loss": 1.3058, + "num_input_tokens_seen": 433114000, + "step": 11020 + }, + { + "epoch": 0.5344736730528113, + "grad_norm": 0.37960031628608704, + "learning_rate": 2.3425694578412357e-05, + "loss": 1.2901, + "num_input_tokens_seen": 433515624, + "step": 11030 + }, + { + "epoch": 0.5349582366729861, + "grad_norm": 0.3838542401790619, + "learning_rate": 2.338653784987961e-05, + "loss": 1.3182, + "num_input_tokens_seen": 433899616, + "step": 11040 + }, + { + "epoch": 0.5354428002931609, + "grad_norm": 0.40158811211586, + "learning_rate": 2.3347385095645143e-05, + "loss": 1.3337, + "num_input_tokens_seen": 434295476, + "step": 11050 + }, + { + "epoch": 0.5359273639133358, + "grad_norm": 0.41163861751556396, + "learning_rate": 2.3308236412150488e-05, + "loss": 1.2498, + "num_input_tokens_seen": 434692404, + "step": 11060 + }, + { + "epoch": 0.5364119275335106, + "grad_norm": 0.4120139181613922, + "learning_rate": 2.3269091895827096e-05, + "loss": 1.325, + "num_input_tokens_seen": 435084432, + "step": 11070 + }, + { + "epoch": 0.5368964911536854, + "grad_norm": 0.4136423170566559, + "learning_rate": 2.3229951643096215e-05, + "loss": 1.3232, + "num_input_tokens_seen": 435481608, + "step": 11080 + }, + { + "epoch": 0.5373810547738602, + "grad_norm": 0.3973861038684845, + "learning_rate": 2.3190815750368534e-05, + "loss": 1.2942, + "num_input_tokens_seen": 435879740, + "step": 11090 + }, + { + "epoch": 0.537865618394035, + "grad_norm": 0.3832756280899048, + "learning_rate": 2.3151684314044042e-05, + "loss": 1.291, + "num_input_tokens_seen": 436231608, + "step": 11100 + }, + { + "epoch": 0.5383501820142098, + "grad_norm": 0.446608304977417, + "learning_rate": 2.3112557430511734e-05, + "loss": 1.2808, + "num_input_tokens_seen": 436633972, + "step": 11110 + }, + { + "epoch": 0.5388347456343846, + "grad_norm": 0.41678836941719055, + "learning_rate": 2.3073435196149392e-05, + "loss": 1.3131, + "num_input_tokens_seen": 437031292, + "step": 11120 + }, + { + "epoch": 0.5393193092545594, + "grad_norm": 0.3977677822113037, + "learning_rate": 2.3034317707323364e-05, + "loss": 1.2948, + "num_input_tokens_seen": 437405436, + "step": 11130 + }, + { + "epoch": 0.5398038728747342, + "grad_norm": 0.4163575768470764, + "learning_rate": 2.2995205060388265e-05, + "loss": 1.3057, + "num_input_tokens_seen": 437781780, + "step": 11140 + }, + { + "epoch": 0.5402884364949091, + "grad_norm": 0.38908636569976807, + "learning_rate": 2.295609735168684e-05, + "loss": 1.323, + "num_input_tokens_seen": 438167672, + "step": 11150 + }, + { + "epoch": 0.5407730001150839, + "grad_norm": 0.3993242681026459, + "learning_rate": 2.2916994677549614e-05, + "loss": 1.2775, + "num_input_tokens_seen": 438551092, + "step": 11160 + }, + { + "epoch": 0.5412575637352587, + "grad_norm": 0.43453851342201233, + "learning_rate": 2.2877897134294755e-05, + "loss": 1.3369, + "num_input_tokens_seen": 438925056, + "step": 11170 + }, + { + "epoch": 0.5417421273554335, + "grad_norm": 0.38773950934410095, + "learning_rate": 2.2838804818227766e-05, + "loss": 1.3376, + "num_input_tokens_seen": 439315640, + "step": 11180 + }, + { + "epoch": 0.5422266909756083, + "grad_norm": 0.4047930836677551, + "learning_rate": 2.2799717825641297e-05, + "loss": 1.2998, + "num_input_tokens_seen": 439714968, + "step": 11190 + }, + { + "epoch": 0.5427112545957831, + "grad_norm": 0.385567307472229, + "learning_rate": 2.2760636252814858e-05, + "loss": 1.3205, + "num_input_tokens_seen": 440096324, + "step": 11200 + }, + { + "epoch": 0.5431958182159579, + "grad_norm": 0.3837738335132599, + "learning_rate": 2.2721560196014635e-05, + "loss": 1.2782, + "num_input_tokens_seen": 440489640, + "step": 11210 + }, + { + "epoch": 0.5436803818361327, + "grad_norm": 0.4107000231742859, + "learning_rate": 2.26824897514932e-05, + "loss": 1.356, + "num_input_tokens_seen": 440874496, + "step": 11220 + }, + { + "epoch": 0.5441649454563076, + "grad_norm": 0.4507005214691162, + "learning_rate": 2.2643425015489335e-05, + "loss": 1.2991, + "num_input_tokens_seen": 441274860, + "step": 11230 + }, + { + "epoch": 0.5446495090764824, + "grad_norm": 0.40049469470977783, + "learning_rate": 2.260436608422772e-05, + "loss": 1.3207, + "num_input_tokens_seen": 441672140, + "step": 11240 + }, + { + "epoch": 0.5451340726966571, + "grad_norm": 0.4210105836391449, + "learning_rate": 2.2565313053918764e-05, + "loss": 1.3396, + "num_input_tokens_seen": 442056044, + "step": 11250 + }, + { + "epoch": 0.5456186363168319, + "grad_norm": 0.40691545605659485, + "learning_rate": 2.252626602075835e-05, + "loss": 1.3008, + "num_input_tokens_seen": 442444840, + "step": 11260 + }, + { + "epoch": 0.5461031999370067, + "grad_norm": 0.377591073513031, + "learning_rate": 2.2487225080927553e-05, + "loss": 1.2803, + "num_input_tokens_seen": 442832056, + "step": 11270 + }, + { + "epoch": 0.5465877635571815, + "grad_norm": 0.36699971556663513, + "learning_rate": 2.244819033059248e-05, + "loss": 1.3336, + "num_input_tokens_seen": 443227492, + "step": 11280 + }, + { + "epoch": 0.5470723271773563, + "grad_norm": 0.3749854862689972, + "learning_rate": 2.2409161865903952e-05, + "loss": 1.31, + "num_input_tokens_seen": 443640864, + "step": 11290 + }, + { + "epoch": 0.5475568907975311, + "grad_norm": 0.5490464568138123, + "learning_rate": 2.2370139782997342e-05, + "loss": 1.3327, + "num_input_tokens_seen": 444016308, + "step": 11300 + }, + { + "epoch": 0.5480414544177059, + "grad_norm": 0.4179551601409912, + "learning_rate": 2.2331124177992274e-05, + "loss": 1.3102, + "num_input_tokens_seen": 444404972, + "step": 11310 + }, + { + "epoch": 0.5485260180378808, + "grad_norm": 0.3857383728027344, + "learning_rate": 2.2292115146992438e-05, + "loss": 1.2987, + "num_input_tokens_seen": 444771056, + "step": 11320 + }, + { + "epoch": 0.5490105816580556, + "grad_norm": 0.40112438797950745, + "learning_rate": 2.2253112786085313e-05, + "loss": 1.2942, + "num_input_tokens_seen": 445173356, + "step": 11330 + }, + { + "epoch": 0.5494951452782304, + "grad_norm": 0.4082011580467224, + "learning_rate": 2.2214117191341972e-05, + "loss": 1.2856, + "num_input_tokens_seen": 445519032, + "step": 11340 + }, + { + "epoch": 0.5499797088984052, + "grad_norm": 0.423109769821167, + "learning_rate": 2.2175128458816792e-05, + "loss": 1.3118, + "num_input_tokens_seen": 445906772, + "step": 11350 + }, + { + "epoch": 0.55046427251858, + "grad_norm": 0.42687487602233887, + "learning_rate": 2.213614668454728e-05, + "loss": 1.2962, + "num_input_tokens_seen": 446254804, + "step": 11360 + }, + { + "epoch": 0.5509488361387548, + "grad_norm": 0.3661687672138214, + "learning_rate": 2.2097171964553757e-05, + "loss": 1.3353, + "num_input_tokens_seen": 446614404, + "step": 11370 + }, + { + "epoch": 0.5514333997589296, + "grad_norm": 0.39668744802474976, + "learning_rate": 2.2058204394839217e-05, + "loss": 1.3309, + "num_input_tokens_seen": 447010280, + "step": 11380 + }, + { + "epoch": 0.5519179633791044, + "grad_norm": 0.3846248984336853, + "learning_rate": 2.201924407138902e-05, + "loss": 1.2841, + "num_input_tokens_seen": 447381468, + "step": 11390 + }, + { + "epoch": 0.5524025269992792, + "grad_norm": 0.39855462312698364, + "learning_rate": 2.1980291090170664e-05, + "loss": 1.3553, + "num_input_tokens_seen": 447788436, + "step": 11400 + }, + { + "epoch": 0.5528870906194541, + "grad_norm": 0.40765848755836487, + "learning_rate": 2.19413455471336e-05, + "loss": 1.2543, + "num_input_tokens_seen": 448189856, + "step": 11410 + }, + { + "epoch": 0.5533716542396289, + "grad_norm": 0.3901714086532593, + "learning_rate": 2.1902407538208897e-05, + "loss": 1.3356, + "num_input_tokens_seen": 448564344, + "step": 11420 + }, + { + "epoch": 0.5538562178598037, + "grad_norm": 0.3917088806629181, + "learning_rate": 2.1863477159309132e-05, + "loss": 1.3467, + "num_input_tokens_seen": 448945696, + "step": 11430 + }, + { + "epoch": 0.5543407814799785, + "grad_norm": 0.4396210312843323, + "learning_rate": 2.182455450632803e-05, + "loss": 1.3022, + "num_input_tokens_seen": 449313644, + "step": 11440 + }, + { + "epoch": 0.5548253451001532, + "grad_norm": 0.41703107953071594, + "learning_rate": 2.178563967514034e-05, + "loss": 1.2391, + "num_input_tokens_seen": 449716576, + "step": 11450 + }, + { + "epoch": 0.555309908720328, + "grad_norm": 0.405318945646286, + "learning_rate": 2.1746732761601486e-05, + "loss": 1.2382, + "num_input_tokens_seen": 450092104, + "step": 11460 + }, + { + "epoch": 0.5557944723405028, + "grad_norm": 0.37074992060661316, + "learning_rate": 2.1707833861547442e-05, + "loss": 1.2947, + "num_input_tokens_seen": 450487336, + "step": 11470 + }, + { + "epoch": 0.5562790359606776, + "grad_norm": 0.4145805835723877, + "learning_rate": 2.1668943070794407e-05, + "loss": 1.3364, + "num_input_tokens_seen": 450861056, + "step": 11480 + }, + { + "epoch": 0.5567635995808524, + "grad_norm": 0.42270269989967346, + "learning_rate": 2.163006048513863e-05, + "loss": 1.354, + "num_input_tokens_seen": 451258688, + "step": 11490 + }, + { + "epoch": 0.5572481632010273, + "grad_norm": 0.3963187336921692, + "learning_rate": 2.1591186200356122e-05, + "loss": 1.281, + "num_input_tokens_seen": 451641956, + "step": 11500 + }, + { + "epoch": 0.5577327268212021, + "grad_norm": 0.4037591814994812, + "learning_rate": 2.1552320312202485e-05, + "loss": 1.2815, + "num_input_tokens_seen": 452047412, + "step": 11510 + }, + { + "epoch": 0.5582172904413769, + "grad_norm": 0.3662770390510559, + "learning_rate": 2.1513462916412592e-05, + "loss": 1.3455, + "num_input_tokens_seen": 452448840, + "step": 11520 + }, + { + "epoch": 0.5587018540615517, + "grad_norm": 0.3583228290081024, + "learning_rate": 2.147461410870043e-05, + "loss": 1.2794, + "num_input_tokens_seen": 452843044, + "step": 11530 + }, + { + "epoch": 0.5591864176817265, + "grad_norm": 0.37632209062576294, + "learning_rate": 2.143577398475883e-05, + "loss": 1.2674, + "num_input_tokens_seen": 453264900, + "step": 11540 + }, + { + "epoch": 0.5596709813019013, + "grad_norm": 0.3996264636516571, + "learning_rate": 2.139694264025922e-05, + "loss": 1.2744, + "num_input_tokens_seen": 453667984, + "step": 11550 + }, + { + "epoch": 0.5601555449220761, + "grad_norm": 0.3832734525203705, + "learning_rate": 2.135812017085142e-05, + "loss": 1.2997, + "num_input_tokens_seen": 454063124, + "step": 11560 + }, + { + "epoch": 0.5606401085422509, + "grad_norm": 0.4112671911716461, + "learning_rate": 2.1319306672163355e-05, + "loss": 1.326, + "num_input_tokens_seen": 454468736, + "step": 11570 + }, + { + "epoch": 0.5611246721624257, + "grad_norm": 0.40387192368507385, + "learning_rate": 2.1280502239800905e-05, + "loss": 1.2733, + "num_input_tokens_seen": 454868724, + "step": 11580 + }, + { + "epoch": 0.5616092357826006, + "grad_norm": 0.4105452001094818, + "learning_rate": 2.1241706969347554e-05, + "loss": 1.2681, + "num_input_tokens_seen": 455262064, + "step": 11590 + }, + { + "epoch": 0.5620937994027754, + "grad_norm": 0.41394761204719543, + "learning_rate": 2.1202920956364282e-05, + "loss": 1.2333, + "num_input_tokens_seen": 455641940, + "step": 11600 + }, + { + "epoch": 0.5625783630229502, + "grad_norm": 0.47459670901298523, + "learning_rate": 2.116414429638922e-05, + "loss": 1.3239, + "num_input_tokens_seen": 456036468, + "step": 11610 + }, + { + "epoch": 0.563062926643125, + "grad_norm": 0.38530948758125305, + "learning_rate": 2.112537708493749e-05, + "loss": 1.3167, + "num_input_tokens_seen": 456420516, + "step": 11620 + }, + { + "epoch": 0.5635474902632998, + "grad_norm": 0.3913155794143677, + "learning_rate": 2.108661941750091e-05, + "loss": 1.3043, + "num_input_tokens_seen": 456824708, + "step": 11630 + }, + { + "epoch": 0.5640320538834745, + "grad_norm": 0.44334185123443604, + "learning_rate": 2.1047871389547826e-05, + "loss": 1.3133, + "num_input_tokens_seen": 457192180, + "step": 11640 + }, + { + "epoch": 0.5645166175036493, + "grad_norm": 0.41720449924468994, + "learning_rate": 2.1009133096522805e-05, + "loss": 1.2314, + "num_input_tokens_seen": 457554472, + "step": 11650 + }, + { + "epoch": 0.5650011811238241, + "grad_norm": 0.3976454734802246, + "learning_rate": 2.0970404633846453e-05, + "loss": 1.3181, + "num_input_tokens_seen": 457940752, + "step": 11660 + }, + { + "epoch": 0.5654857447439989, + "grad_norm": 0.3785874843597412, + "learning_rate": 2.0931686096915172e-05, + "loss": 1.2951, + "num_input_tokens_seen": 458329516, + "step": 11670 + }, + { + "epoch": 0.5659703083641738, + "grad_norm": 0.42193588614463806, + "learning_rate": 2.0892977581100884e-05, + "loss": 1.2895, + "num_input_tokens_seen": 458728512, + "step": 11680 + }, + { + "epoch": 0.5664548719843486, + "grad_norm": 0.4394080936908722, + "learning_rate": 2.085427918175086e-05, + "loss": 1.287, + "num_input_tokens_seen": 459138172, + "step": 11690 + }, + { + "epoch": 0.5669394356045234, + "grad_norm": 0.39886605739593506, + "learning_rate": 2.0815590994187416e-05, + "loss": 1.3094, + "num_input_tokens_seen": 459549264, + "step": 11700 + }, + { + "epoch": 0.5674239992246982, + "grad_norm": 0.37033554911613464, + "learning_rate": 2.0776913113707766e-05, + "loss": 1.3196, + "num_input_tokens_seen": 459931536, + "step": 11710 + }, + { + "epoch": 0.567908562844873, + "grad_norm": 0.38384774327278137, + "learning_rate": 2.0738245635583675e-05, + "loss": 1.2916, + "num_input_tokens_seen": 460349548, + "step": 11720 + }, + { + "epoch": 0.5683931264650478, + "grad_norm": 0.41543838381767273, + "learning_rate": 2.0699588655061337e-05, + "loss": 1.34, + "num_input_tokens_seen": 460751796, + "step": 11730 + }, + { + "epoch": 0.5688776900852226, + "grad_norm": 0.38962802290916443, + "learning_rate": 2.066094226736104e-05, + "loss": 1.2622, + "num_input_tokens_seen": 461166532, + "step": 11740 + }, + { + "epoch": 0.5693622537053974, + "grad_norm": 0.3820241391658783, + "learning_rate": 2.0622306567677026e-05, + "loss": 1.3371, + "num_input_tokens_seen": 461544700, + "step": 11750 + }, + { + "epoch": 0.5698468173255722, + "grad_norm": 0.37688279151916504, + "learning_rate": 2.0583681651177177e-05, + "loss": 1.3028, + "num_input_tokens_seen": 461923852, + "step": 11760 + }, + { + "epoch": 0.5703313809457471, + "grad_norm": 0.41758424043655396, + "learning_rate": 2.0545067613002844e-05, + "loss": 1.303, + "num_input_tokens_seen": 462284468, + "step": 11770 + }, + { + "epoch": 0.5708159445659219, + "grad_norm": 0.41535744071006775, + "learning_rate": 2.050646454826854e-05, + "loss": 1.2806, + "num_input_tokens_seen": 462662724, + "step": 11780 + }, + { + "epoch": 0.5713005081860967, + "grad_norm": 0.41372743248939514, + "learning_rate": 2.0467872552061785e-05, + "loss": 1.2763, + "num_input_tokens_seen": 463065836, + "step": 11790 + }, + { + "epoch": 0.5717850718062715, + "grad_norm": 0.4078621566295624, + "learning_rate": 2.042929171944283e-05, + "loss": 1.2739, + "num_input_tokens_seen": 463465784, + "step": 11800 + }, + { + "epoch": 0.5722696354264463, + "grad_norm": 0.4270489811897278, + "learning_rate": 2.03907221454444e-05, + "loss": 1.2816, + "num_input_tokens_seen": 463873784, + "step": 11810 + }, + { + "epoch": 0.5727541990466211, + "grad_norm": 0.40874189138412476, + "learning_rate": 2.0352163925071526e-05, + "loss": 1.3118, + "num_input_tokens_seen": 464264876, + "step": 11820 + }, + { + "epoch": 0.5732387626667959, + "grad_norm": 0.4195123016834259, + "learning_rate": 2.031361715330124e-05, + "loss": 1.28, + "num_input_tokens_seen": 464655612, + "step": 11830 + }, + { + "epoch": 0.5737233262869706, + "grad_norm": 0.42843663692474365, + "learning_rate": 2.0275081925082408e-05, + "loss": 1.2797, + "num_input_tokens_seen": 465021840, + "step": 11840 + }, + { + "epoch": 0.5742078899071454, + "grad_norm": 0.4026115834712982, + "learning_rate": 2.0236558335335418e-05, + "loss": 1.2662, + "num_input_tokens_seen": 465412176, + "step": 11850 + }, + { + "epoch": 0.5746924535273203, + "grad_norm": 0.4160013794898987, + "learning_rate": 2.0198046478952034e-05, + "loss": 1.3044, + "num_input_tokens_seen": 465798436, + "step": 11860 + }, + { + "epoch": 0.5751770171474951, + "grad_norm": 0.3887726068496704, + "learning_rate": 2.0159546450795076e-05, + "loss": 1.271, + "num_input_tokens_seen": 466194580, + "step": 11870 + }, + { + "epoch": 0.5756615807676699, + "grad_norm": 0.4644850492477417, + "learning_rate": 2.012105834569827e-05, + "loss": 1.3343, + "num_input_tokens_seen": 466583580, + "step": 11880 + }, + { + "epoch": 0.5761461443878447, + "grad_norm": 0.37905943393707275, + "learning_rate": 2.008258225846594e-05, + "loss": 1.2825, + "num_input_tokens_seen": 466995580, + "step": 11890 + }, + { + "epoch": 0.5766307080080195, + "grad_norm": 0.4146198034286499, + "learning_rate": 2.0044118283872842e-05, + "loss": 1.2915, + "num_input_tokens_seen": 467394060, + "step": 11900 + }, + { + "epoch": 0.5771152716281943, + "grad_norm": 0.40112000703811646, + "learning_rate": 2.0005666516663844e-05, + "loss": 1.2813, + "num_input_tokens_seen": 467762492, + "step": 11910 + }, + { + "epoch": 0.5775998352483691, + "grad_norm": 0.3847469091415405, + "learning_rate": 1.9967227051553798e-05, + "loss": 1.3034, + "num_input_tokens_seen": 468158852, + "step": 11920 + }, + { + "epoch": 0.5780843988685439, + "grad_norm": 0.39969122409820557, + "learning_rate": 1.992879998322723e-05, + "loss": 1.2962, + "num_input_tokens_seen": 468564816, + "step": 11930 + }, + { + "epoch": 0.5785689624887188, + "grad_norm": 0.42271485924720764, + "learning_rate": 1.9890385406338118e-05, + "loss": 1.2821, + "num_input_tokens_seen": 468965016, + "step": 11940 + }, + { + "epoch": 0.5790535261088936, + "grad_norm": 0.4109669029712677, + "learning_rate": 1.9851983415509704e-05, + "loss": 1.2964, + "num_input_tokens_seen": 469374000, + "step": 11950 + }, + { + "epoch": 0.5795380897290684, + "grad_norm": 0.38713210821151733, + "learning_rate": 1.981359410533418e-05, + "loss": 1.2827, + "num_input_tokens_seen": 469791140, + "step": 11960 + }, + { + "epoch": 0.5800226533492432, + "grad_norm": 0.39154934883117676, + "learning_rate": 1.9775217570372556e-05, + "loss": 1.2617, + "num_input_tokens_seen": 470165196, + "step": 11970 + }, + { + "epoch": 0.580507216969418, + "grad_norm": 0.4089168310165405, + "learning_rate": 1.9736853905154334e-05, + "loss": 1.2968, + "num_input_tokens_seen": 470550712, + "step": 11980 + }, + { + "epoch": 0.5809917805895928, + "grad_norm": 0.45818933844566345, + "learning_rate": 1.9698503204177342e-05, + "loss": 1.2931, + "num_input_tokens_seen": 470936976, + "step": 11990 + }, + { + "epoch": 0.5814763442097676, + "grad_norm": 0.42065727710723877, + "learning_rate": 1.9660165561907447e-05, + "loss": 1.3308, + "num_input_tokens_seen": 471314876, + "step": 12000 + }, + { + "epoch": 0.5814763442097676, + "eval_loss": 1.4008795022964478, + "eval_runtime": 3.8344, + "eval_samples_per_second": 39.12, + "eval_steps_per_second": 4.955, + "num_input_tokens_seen": 471314876, + "step": 12000 + }, + { + "epoch": 0.5819609078299424, + "grad_norm": 0.38812288641929626, + "learning_rate": 1.9621841072778387e-05, + "loss": 1.2928, + "num_input_tokens_seen": 471727100, + "step": 12010 + }, + { + "epoch": 0.5824454714501172, + "grad_norm": 0.402675062417984, + "learning_rate": 1.9583529831191448e-05, + "loss": 1.3312, + "num_input_tokens_seen": 472107148, + "step": 12020 + }, + { + "epoch": 0.5829300350702921, + "grad_norm": 0.4156324863433838, + "learning_rate": 1.954523193151534e-05, + "loss": 1.3157, + "num_input_tokens_seen": 472518180, + "step": 12030 + }, + { + "epoch": 0.5834145986904669, + "grad_norm": 0.40975773334503174, + "learning_rate": 1.9506947468085866e-05, + "loss": 1.2565, + "num_input_tokens_seen": 472894628, + "step": 12040 + }, + { + "epoch": 0.5838991623106417, + "grad_norm": 0.4518705904483795, + "learning_rate": 1.9468676535205767e-05, + "loss": 1.2686, + "num_input_tokens_seen": 473270452, + "step": 12050 + }, + { + "epoch": 0.5843837259308164, + "grad_norm": 0.4540398120880127, + "learning_rate": 1.9430419227144443e-05, + "loss": 1.2771, + "num_input_tokens_seen": 473644072, + "step": 12060 + }, + { + "epoch": 0.5848682895509912, + "grad_norm": 0.40955179929733276, + "learning_rate": 1.939217563813771e-05, + "loss": 1.3421, + "num_input_tokens_seen": 474010688, + "step": 12070 + }, + { + "epoch": 0.585352853171166, + "grad_norm": 0.4388551115989685, + "learning_rate": 1.935394586238763e-05, + "loss": 1.3129, + "num_input_tokens_seen": 474390028, + "step": 12080 + }, + { + "epoch": 0.5858374167913408, + "grad_norm": 0.41205498576164246, + "learning_rate": 1.93157299940622e-05, + "loss": 1.2662, + "num_input_tokens_seen": 474801152, + "step": 12090 + }, + { + "epoch": 0.5863219804115156, + "grad_norm": 0.41944679617881775, + "learning_rate": 1.92775281272952e-05, + "loss": 1.2786, + "num_input_tokens_seen": 475187368, + "step": 12100 + }, + { + "epoch": 0.5868065440316904, + "grad_norm": 0.4028654098510742, + "learning_rate": 1.9239340356185892e-05, + "loss": 1.2792, + "num_input_tokens_seen": 475599252, + "step": 12110 + }, + { + "epoch": 0.5872911076518653, + "grad_norm": 0.44524598121643066, + "learning_rate": 1.9201166774798833e-05, + "loss": 1.3175, + "num_input_tokens_seen": 475971476, + "step": 12120 + }, + { + "epoch": 0.5877756712720401, + "grad_norm": 0.3849859833717346, + "learning_rate": 1.9163007477163616e-05, + "loss": 1.2327, + "num_input_tokens_seen": 476379776, + "step": 12130 + }, + { + "epoch": 0.5882602348922149, + "grad_norm": 0.4216983914375305, + "learning_rate": 1.9124862557274668e-05, + "loss": 1.2934, + "num_input_tokens_seen": 476772296, + "step": 12140 + }, + { + "epoch": 0.5887447985123897, + "grad_norm": 0.4107081890106201, + "learning_rate": 1.9086732109090965e-05, + "loss": 1.3092, + "num_input_tokens_seen": 477178020, + "step": 12150 + }, + { + "epoch": 0.5892293621325645, + "grad_norm": 0.43871021270751953, + "learning_rate": 1.904861622653589e-05, + "loss": 1.3208, + "num_input_tokens_seen": 477570016, + "step": 12160 + }, + { + "epoch": 0.5897139257527393, + "grad_norm": 0.4565945565700531, + "learning_rate": 1.9010515003496892e-05, + "loss": 1.2592, + "num_input_tokens_seen": 477957740, + "step": 12170 + }, + { + "epoch": 0.5901984893729141, + "grad_norm": 0.4425964653491974, + "learning_rate": 1.8972428533825345e-05, + "loss": 1.2855, + "num_input_tokens_seen": 478348188, + "step": 12180 + }, + { + "epoch": 0.5906830529930889, + "grad_norm": 0.41607752442359924, + "learning_rate": 1.8934356911336283e-05, + "loss": 1.3222, + "num_input_tokens_seen": 478741832, + "step": 12190 + }, + { + "epoch": 0.5911676166132637, + "grad_norm": 0.43811753392219543, + "learning_rate": 1.8896300229808144e-05, + "loss": 1.2914, + "num_input_tokens_seen": 479136864, + "step": 12200 + }, + { + "epoch": 0.5916521802334386, + "grad_norm": 0.4176853895187378, + "learning_rate": 1.8858258582982597e-05, + "loss": 1.3503, + "num_input_tokens_seen": 479527104, + "step": 12210 + }, + { + "epoch": 0.5921367438536134, + "grad_norm": 0.3926706612110138, + "learning_rate": 1.8820232064564233e-05, + "loss": 1.286, + "num_input_tokens_seen": 479884732, + "step": 12220 + }, + { + "epoch": 0.5926213074737882, + "grad_norm": 0.38026413321495056, + "learning_rate": 1.878222076822043e-05, + "loss": 1.2862, + "num_input_tokens_seen": 480292660, + "step": 12230 + }, + { + "epoch": 0.593105871093963, + "grad_norm": 0.4443627595901489, + "learning_rate": 1.8744224787581024e-05, + "loss": 1.2863, + "num_input_tokens_seen": 480669036, + "step": 12240 + }, + { + "epoch": 0.5935904347141377, + "grad_norm": 0.3894060552120209, + "learning_rate": 1.870624421623816e-05, + "loss": 1.315, + "num_input_tokens_seen": 481033564, + "step": 12250 + }, + { + "epoch": 0.5940749983343125, + "grad_norm": 0.4227178692817688, + "learning_rate": 1.8668279147746e-05, + "loss": 1.3146, + "num_input_tokens_seen": 481449056, + "step": 12260 + }, + { + "epoch": 0.5945595619544873, + "grad_norm": 0.38117271661758423, + "learning_rate": 1.863032967562055e-05, + "loss": 1.2849, + "num_input_tokens_seen": 481832856, + "step": 12270 + }, + { + "epoch": 0.5950441255746621, + "grad_norm": 0.446122944355011, + "learning_rate": 1.859239589333936e-05, + "loss": 1.3231, + "num_input_tokens_seen": 482219828, + "step": 12280 + }, + { + "epoch": 0.5955286891948369, + "grad_norm": 0.4982196092605591, + "learning_rate": 1.855447789434137e-05, + "loss": 1.2839, + "num_input_tokens_seen": 482624804, + "step": 12290 + }, + { + "epoch": 0.5960132528150118, + "grad_norm": 0.44610318541526794, + "learning_rate": 1.851657577202661e-05, + "loss": 1.2712, + "num_input_tokens_seen": 483017968, + "step": 12300 + }, + { + "epoch": 0.5964978164351866, + "grad_norm": 0.4101884067058563, + "learning_rate": 1.8478689619756026e-05, + "loss": 1.3168, + "num_input_tokens_seen": 483408892, + "step": 12310 + }, + { + "epoch": 0.5969823800553614, + "grad_norm": 0.38893336057662964, + "learning_rate": 1.8440819530851225e-05, + "loss": 1.2848, + "num_input_tokens_seen": 483774232, + "step": 12320 + }, + { + "epoch": 0.5974669436755362, + "grad_norm": 0.4360596835613251, + "learning_rate": 1.8402965598594227e-05, + "loss": 1.3122, + "num_input_tokens_seen": 484172492, + "step": 12330 + }, + { + "epoch": 0.597951507295711, + "grad_norm": 0.3694493770599365, + "learning_rate": 1.8365127916227288e-05, + "loss": 1.2786, + "num_input_tokens_seen": 484548300, + "step": 12340 + }, + { + "epoch": 0.5984360709158858, + "grad_norm": 0.4072704315185547, + "learning_rate": 1.8327306576952592e-05, + "loss": 1.3197, + "num_input_tokens_seen": 484966920, + "step": 12350 + }, + { + "epoch": 0.5989206345360606, + "grad_norm": 0.43515607714653015, + "learning_rate": 1.828950167393211e-05, + "loss": 1.2959, + "num_input_tokens_seen": 485363496, + "step": 12360 + }, + { + "epoch": 0.5994051981562354, + "grad_norm": 0.3934329152107239, + "learning_rate": 1.8251713300287294e-05, + "loss": 1.2764, + "num_input_tokens_seen": 485781532, + "step": 12370 + }, + { + "epoch": 0.5998897617764102, + "grad_norm": 0.43624651432037354, + "learning_rate": 1.821394154909891e-05, + "loss": 1.2791, + "num_input_tokens_seen": 486165704, + "step": 12380 + }, + { + "epoch": 0.6003743253965851, + "grad_norm": 0.4212031960487366, + "learning_rate": 1.817618651340675e-05, + "loss": 1.2857, + "num_input_tokens_seen": 486559644, + "step": 12390 + }, + { + "epoch": 0.6008588890167599, + "grad_norm": 0.4342302680015564, + "learning_rate": 1.813844828620946e-05, + "loss": 1.265, + "num_input_tokens_seen": 486973628, + "step": 12400 + }, + { + "epoch": 0.6013434526369347, + "grad_norm": 0.35909196734428406, + "learning_rate": 1.8100726960464254e-05, + "loss": 1.2953, + "num_input_tokens_seen": 487389200, + "step": 12410 + }, + { + "epoch": 0.6018280162571095, + "grad_norm": 0.4191993772983551, + "learning_rate": 1.8063022629086752e-05, + "loss": 1.2548, + "num_input_tokens_seen": 487788196, + "step": 12420 + }, + { + "epoch": 0.6023125798772843, + "grad_norm": 0.4022323191165924, + "learning_rate": 1.8025335384950665e-05, + "loss": 1.2569, + "num_input_tokens_seen": 488152244, + "step": 12430 + }, + { + "epoch": 0.602797143497459, + "grad_norm": 0.3971206545829773, + "learning_rate": 1.7987665320887666e-05, + "loss": 1.3111, + "num_input_tokens_seen": 488555012, + "step": 12440 + }, + { + "epoch": 0.6032817071176338, + "grad_norm": 0.40955471992492676, + "learning_rate": 1.795001252968706e-05, + "loss": 1.2905, + "num_input_tokens_seen": 488963324, + "step": 12450 + }, + { + "epoch": 0.6037662707378086, + "grad_norm": 0.4047541320323944, + "learning_rate": 1.7912377104095645e-05, + "loss": 1.3335, + "num_input_tokens_seen": 489371688, + "step": 12460 + }, + { + "epoch": 0.6042508343579834, + "grad_norm": 0.4085901379585266, + "learning_rate": 1.787475913681743e-05, + "loss": 1.2776, + "num_input_tokens_seen": 489791296, + "step": 12470 + }, + { + "epoch": 0.6047353979781583, + "grad_norm": 0.40488049387931824, + "learning_rate": 1.783715872051341e-05, + "loss": 1.3007, + "num_input_tokens_seen": 490158224, + "step": 12480 + }, + { + "epoch": 0.6052199615983331, + "grad_norm": 0.37421244382858276, + "learning_rate": 1.7799575947801374e-05, + "loss": 1.3284, + "num_input_tokens_seen": 490589156, + "step": 12490 + }, + { + "epoch": 0.6057045252185079, + "grad_norm": 0.44858112931251526, + "learning_rate": 1.776201091125561e-05, + "loss": 1.2499, + "num_input_tokens_seen": 491003784, + "step": 12500 + }, + { + "epoch": 0.6061890888386827, + "grad_norm": 0.3810955286026001, + "learning_rate": 1.7724463703406766e-05, + "loss": 1.2935, + "num_input_tokens_seen": 491417140, + "step": 12510 + }, + { + "epoch": 0.6066736524588575, + "grad_norm": 0.4152781665325165, + "learning_rate": 1.768693441674153e-05, + "loss": 1.3209, + "num_input_tokens_seen": 491809688, + "step": 12520 + }, + { + "epoch": 0.6071582160790323, + "grad_norm": 0.4103911221027374, + "learning_rate": 1.764942314370248e-05, + "loss": 1.2924, + "num_input_tokens_seen": 492210364, + "step": 12530 + }, + { + "epoch": 0.6076427796992071, + "grad_norm": 0.4096033275127411, + "learning_rate": 1.761192997668781e-05, + "loss": 1.25, + "num_input_tokens_seen": 492627760, + "step": 12540 + }, + { + "epoch": 0.6081273433193819, + "grad_norm": 0.42093944549560547, + "learning_rate": 1.7574455008051115e-05, + "loss": 1.2955, + "num_input_tokens_seen": 493015580, + "step": 12550 + }, + { + "epoch": 0.6086119069395567, + "grad_norm": 0.43841052055358887, + "learning_rate": 1.753699833010115e-05, + "loss": 1.2778, + "num_input_tokens_seen": 493413392, + "step": 12560 + }, + { + "epoch": 0.6090964705597316, + "grad_norm": 0.3942921459674835, + "learning_rate": 1.7499560035101653e-05, + "loss": 1.2609, + "num_input_tokens_seen": 493804560, + "step": 12570 + }, + { + "epoch": 0.6095810341799064, + "grad_norm": 0.42653924226760864, + "learning_rate": 1.746214021527103e-05, + "loss": 1.2611, + "num_input_tokens_seen": 494185728, + "step": 12580 + }, + { + "epoch": 0.6100655978000812, + "grad_norm": 0.3946724236011505, + "learning_rate": 1.7424738962782222e-05, + "loss": 1.2609, + "num_input_tokens_seen": 494597784, + "step": 12590 + }, + { + "epoch": 0.610550161420256, + "grad_norm": 0.37868139147758484, + "learning_rate": 1.7387356369762426e-05, + "loss": 1.2714, + "num_input_tokens_seen": 494995584, + "step": 12600 + }, + { + "epoch": 0.6110347250404308, + "grad_norm": 0.42459574341773987, + "learning_rate": 1.734999252829285e-05, + "loss": 1.2983, + "num_input_tokens_seen": 495396436, + "step": 12610 + }, + { + "epoch": 0.6115192886606056, + "grad_norm": 0.3998323976993561, + "learning_rate": 1.7312647530408548e-05, + "loss": 1.2819, + "num_input_tokens_seen": 495790568, + "step": 12620 + }, + { + "epoch": 0.6120038522807804, + "grad_norm": 0.4067113995552063, + "learning_rate": 1.7275321468098133e-05, + "loss": 1.2594, + "num_input_tokens_seen": 496182596, + "step": 12630 + }, + { + "epoch": 0.6124884159009552, + "grad_norm": 0.41096609830856323, + "learning_rate": 1.72380144333036e-05, + "loss": 1.2807, + "num_input_tokens_seen": 496584560, + "step": 12640 + }, + { + "epoch": 0.6129729795211301, + "grad_norm": 0.448887437582016, + "learning_rate": 1.720072651792004e-05, + "loss": 1.2719, + "num_input_tokens_seen": 496979328, + "step": 12650 + }, + { + "epoch": 0.6134575431413049, + "grad_norm": 0.4053841829299927, + "learning_rate": 1.716345781379549e-05, + "loss": 1.3734, + "num_input_tokens_seen": 497342904, + "step": 12660 + }, + { + "epoch": 0.6139421067614796, + "grad_norm": 0.4214945137500763, + "learning_rate": 1.7126208412730628e-05, + "loss": 1.3149, + "num_input_tokens_seen": 497758472, + "step": 12670 + }, + { + "epoch": 0.6144266703816544, + "grad_norm": 0.42576828598976135, + "learning_rate": 1.708897840647861e-05, + "loss": 1.2727, + "num_input_tokens_seen": 498155548, + "step": 12680 + }, + { + "epoch": 0.6149112340018292, + "grad_norm": 0.3854653537273407, + "learning_rate": 1.7051767886744808e-05, + "loss": 1.2532, + "num_input_tokens_seen": 498540656, + "step": 12690 + }, + { + "epoch": 0.615395797622004, + "grad_norm": 0.42444708943367004, + "learning_rate": 1.701457694518661e-05, + "loss": 1.2709, + "num_input_tokens_seen": 498973360, + "step": 12700 + }, + { + "epoch": 0.6158803612421788, + "grad_norm": 0.3879097104072571, + "learning_rate": 1.697740567341314e-05, + "loss": 1.3207, + "num_input_tokens_seen": 499404028, + "step": 12710 + }, + { + "epoch": 0.6163649248623536, + "grad_norm": 0.3972528874874115, + "learning_rate": 1.694025416298511e-05, + "loss": 1.3065, + "num_input_tokens_seen": 499780220, + "step": 12720 + }, + { + "epoch": 0.6168494884825284, + "grad_norm": 0.4193166196346283, + "learning_rate": 1.6903122505414552e-05, + "loss": 1.3359, + "num_input_tokens_seen": 500185720, + "step": 12730 + }, + { + "epoch": 0.6173340521027033, + "grad_norm": 0.40607649087905884, + "learning_rate": 1.686601079216457e-05, + "loss": 1.2789, + "num_input_tokens_seen": 500562168, + "step": 12740 + }, + { + "epoch": 0.6178186157228781, + "grad_norm": 0.4222082197666168, + "learning_rate": 1.682891911464917e-05, + "loss": 1.2436, + "num_input_tokens_seen": 500990044, + "step": 12750 + }, + { + "epoch": 0.6183031793430529, + "grad_norm": 0.40841108560562134, + "learning_rate": 1.6791847564232982e-05, + "loss": 1.2466, + "num_input_tokens_seen": 501376276, + "step": 12760 + }, + { + "epoch": 0.6187877429632277, + "grad_norm": 0.4501042664051056, + "learning_rate": 1.6754796232231084e-05, + "loss": 1.3059, + "num_input_tokens_seen": 501734004, + "step": 12770 + }, + { + "epoch": 0.6192723065834025, + "grad_norm": 0.3940059244632721, + "learning_rate": 1.6717765209908722e-05, + "loss": 1.3457, + "num_input_tokens_seen": 502130812, + "step": 12780 + }, + { + "epoch": 0.6197568702035773, + "grad_norm": 0.372949481010437, + "learning_rate": 1.668075458848115e-05, + "loss": 1.2881, + "num_input_tokens_seen": 502544872, + "step": 12790 + }, + { + "epoch": 0.6202414338237521, + "grad_norm": 0.4232177138328552, + "learning_rate": 1.6643764459113324e-05, + "loss": 1.2777, + "num_input_tokens_seen": 502909916, + "step": 12800 + }, + { + "epoch": 0.6207259974439269, + "grad_norm": 0.40586578845977783, + "learning_rate": 1.6606794912919776e-05, + "loss": 1.2462, + "num_input_tokens_seen": 503313708, + "step": 12810 + }, + { + "epoch": 0.6212105610641017, + "grad_norm": 0.40091219544410706, + "learning_rate": 1.6569846040964293e-05, + "loss": 1.305, + "num_input_tokens_seen": 503697412, + "step": 12820 + }, + { + "epoch": 0.6216951246842766, + "grad_norm": 0.42310020327568054, + "learning_rate": 1.653291793425978e-05, + "loss": 1.2631, + "num_input_tokens_seen": 504107344, + "step": 12830 + }, + { + "epoch": 0.6221796883044514, + "grad_norm": 0.42501911520957947, + "learning_rate": 1.6496010683767936e-05, + "loss": 1.275, + "num_input_tokens_seen": 504491792, + "step": 12840 + }, + { + "epoch": 0.6226642519246262, + "grad_norm": 0.42510515451431274, + "learning_rate": 1.6459124380399144e-05, + "loss": 1.2764, + "num_input_tokens_seen": 504881504, + "step": 12850 + }, + { + "epoch": 0.623148815544801, + "grad_norm": 0.4044088125228882, + "learning_rate": 1.6422259115012165e-05, + "loss": 1.2925, + "num_input_tokens_seen": 505293712, + "step": 12860 + }, + { + "epoch": 0.6236333791649757, + "grad_norm": 0.4045158326625824, + "learning_rate": 1.638541497841392e-05, + "loss": 1.2921, + "num_input_tokens_seen": 505655600, + "step": 12870 + }, + { + "epoch": 0.6241179427851505, + "grad_norm": 0.42825374007225037, + "learning_rate": 1.6348592061359334e-05, + "loss": 1.2857, + "num_input_tokens_seen": 506040624, + "step": 12880 + }, + { + "epoch": 0.6246025064053253, + "grad_norm": 0.39818504452705383, + "learning_rate": 1.6311790454551e-05, + "loss": 1.2945, + "num_input_tokens_seen": 506464380, + "step": 12890 + }, + { + "epoch": 0.6250870700255001, + "grad_norm": 0.46967610716819763, + "learning_rate": 1.6275010248639085e-05, + "loss": 1.2733, + "num_input_tokens_seen": 506879284, + "step": 12900 + }, + { + "epoch": 0.6255716336456749, + "grad_norm": 0.46247559785842896, + "learning_rate": 1.6238251534220982e-05, + "loss": 1.2628, + "num_input_tokens_seen": 507255672, + "step": 12910 + }, + { + "epoch": 0.6260561972658498, + "grad_norm": 0.39192506670951843, + "learning_rate": 1.6201514401841204e-05, + "loss": 1.2749, + "num_input_tokens_seen": 507642376, + "step": 12920 + }, + { + "epoch": 0.6265407608860246, + "grad_norm": 0.41501137614250183, + "learning_rate": 1.6164798941991046e-05, + "loss": 1.2847, + "num_input_tokens_seen": 508025372, + "step": 12930 + }, + { + "epoch": 0.6270253245061994, + "grad_norm": 0.38921937346458435, + "learning_rate": 1.6128105245108464e-05, + "loss": 1.2737, + "num_input_tokens_seen": 508418732, + "step": 12940 + }, + { + "epoch": 0.6275098881263742, + "grad_norm": 0.3864487111568451, + "learning_rate": 1.609143340157777e-05, + "loss": 1.2822, + "num_input_tokens_seen": 508802832, + "step": 12950 + }, + { + "epoch": 0.627994451746549, + "grad_norm": 0.39535877108573914, + "learning_rate": 1.6054783501729488e-05, + "loss": 1.3428, + "num_input_tokens_seen": 509204136, + "step": 12960 + }, + { + "epoch": 0.6284790153667238, + "grad_norm": 0.47437265515327454, + "learning_rate": 1.6018155635840046e-05, + "loss": 1.261, + "num_input_tokens_seen": 509580876, + "step": 12970 + }, + { + "epoch": 0.6289635789868986, + "grad_norm": 0.40359655022621155, + "learning_rate": 1.5981549894131628e-05, + "loss": 1.2695, + "num_input_tokens_seen": 509961156, + "step": 12980 + }, + { + "epoch": 0.6294481426070734, + "grad_norm": 0.4515499770641327, + "learning_rate": 1.5944966366771928e-05, + "loss": 1.3089, + "num_input_tokens_seen": 510362116, + "step": 12990 + }, + { + "epoch": 0.6299327062272482, + "grad_norm": 0.384777307510376, + "learning_rate": 1.5908405143873878e-05, + "loss": 1.296, + "num_input_tokens_seen": 510738872, + "step": 13000 + }, + { + "epoch": 0.6304172698474231, + "grad_norm": 0.36402514576911926, + "learning_rate": 1.587186631549552e-05, + "loss": 1.2881, + "num_input_tokens_seen": 511139464, + "step": 13010 + }, + { + "epoch": 0.6309018334675979, + "grad_norm": 0.36122068762779236, + "learning_rate": 1.5835349971639694e-05, + "loss": 1.2815, + "num_input_tokens_seen": 511548508, + "step": 13020 + }, + { + "epoch": 0.6313863970877727, + "grad_norm": 0.4500305652618408, + "learning_rate": 1.5798856202253885e-05, + "loss": 1.2844, + "num_input_tokens_seen": 511919064, + "step": 13030 + }, + { + "epoch": 0.6318709607079475, + "grad_norm": 0.41978541016578674, + "learning_rate": 1.5762385097229952e-05, + "loss": 1.3016, + "num_input_tokens_seen": 512296380, + "step": 13040 + }, + { + "epoch": 0.6323555243281223, + "grad_norm": 0.40535783767700195, + "learning_rate": 1.5725936746403952e-05, + "loss": 1.3093, + "num_input_tokens_seen": 512698204, + "step": 13050 + }, + { + "epoch": 0.632840087948297, + "grad_norm": 0.41118356585502625, + "learning_rate": 1.568951123955585e-05, + "loss": 1.3026, + "num_input_tokens_seen": 513094244, + "step": 13060 + }, + { + "epoch": 0.6333246515684718, + "grad_norm": 0.4092055559158325, + "learning_rate": 1.565310866640939e-05, + "loss": 1.2814, + "num_input_tokens_seen": 513478972, + "step": 13070 + }, + { + "epoch": 0.6338092151886466, + "grad_norm": 0.42620420455932617, + "learning_rate": 1.561672911663179e-05, + "loss": 1.2906, + "num_input_tokens_seen": 513892816, + "step": 13080 + }, + { + "epoch": 0.6342937788088214, + "grad_norm": 0.40342986583709717, + "learning_rate": 1.558037267983358e-05, + "loss": 1.2528, + "num_input_tokens_seen": 514268756, + "step": 13090 + }, + { + "epoch": 0.6347783424289963, + "grad_norm": 0.44240349531173706, + "learning_rate": 1.554403944556834e-05, + "loss": 1.2711, + "num_input_tokens_seen": 514676816, + "step": 13100 + }, + { + "epoch": 0.6352629060491711, + "grad_norm": 0.38063910603523254, + "learning_rate": 1.5507729503332503e-05, + "loss": 1.2958, + "num_input_tokens_seen": 515102476, + "step": 13110 + }, + { + "epoch": 0.6357474696693459, + "grad_norm": 0.4031839072704315, + "learning_rate": 1.547144294256514e-05, + "loss": 1.3018, + "num_input_tokens_seen": 515484244, + "step": 13120 + }, + { + "epoch": 0.6362320332895207, + "grad_norm": 0.4612565338611603, + "learning_rate": 1.5435179852647712e-05, + "loss": 1.2909, + "num_input_tokens_seen": 515868796, + "step": 13130 + }, + { + "epoch": 0.6367165969096955, + "grad_norm": 0.4232649803161621, + "learning_rate": 1.539894032290389e-05, + "loss": 1.2522, + "num_input_tokens_seen": 516277108, + "step": 13140 + }, + { + "epoch": 0.6372011605298703, + "grad_norm": 0.4205443859100342, + "learning_rate": 1.5362724442599276e-05, + "loss": 1.3396, + "num_input_tokens_seen": 516659132, + "step": 13150 + }, + { + "epoch": 0.6376857241500451, + "grad_norm": 0.3871660828590393, + "learning_rate": 1.532653230094125e-05, + "loss": 1.312, + "num_input_tokens_seen": 517083020, + "step": 13160 + }, + { + "epoch": 0.6381702877702199, + "grad_norm": 0.42630356550216675, + "learning_rate": 1.529036398707869e-05, + "loss": 1.2414, + "num_input_tokens_seen": 517464668, + "step": 13170 + }, + { + "epoch": 0.6386548513903947, + "grad_norm": 0.40976426005363464, + "learning_rate": 1.5254219590101816e-05, + "loss": 1.2985, + "num_input_tokens_seen": 517860704, + "step": 13180 + }, + { + "epoch": 0.6391394150105696, + "grad_norm": 0.4328073561191559, + "learning_rate": 1.5218099199041902e-05, + "loss": 1.2917, + "num_input_tokens_seen": 518276916, + "step": 13190 + }, + { + "epoch": 0.6396239786307444, + "grad_norm": 0.3879329264163971, + "learning_rate": 1.5182002902871123e-05, + "loss": 1.3094, + "num_input_tokens_seen": 518654656, + "step": 13200 + }, + { + "epoch": 0.6401085422509192, + "grad_norm": 0.41468775272369385, + "learning_rate": 1.5145930790502267e-05, + "loss": 1.307, + "num_input_tokens_seen": 519033292, + "step": 13210 + }, + { + "epoch": 0.640593105871094, + "grad_norm": 0.40821224451065063, + "learning_rate": 1.5109882950788586e-05, + "loss": 1.2277, + "num_input_tokens_seen": 519406144, + "step": 13220 + }, + { + "epoch": 0.6410776694912688, + "grad_norm": 0.38090160489082336, + "learning_rate": 1.5073859472523514e-05, + "loss": 1.2876, + "num_input_tokens_seen": 519814676, + "step": 13230 + }, + { + "epoch": 0.6415622331114436, + "grad_norm": 0.39433416724205017, + "learning_rate": 1.50378604444405e-05, + "loss": 1.2361, + "num_input_tokens_seen": 520205984, + "step": 13240 + }, + { + "epoch": 0.6420467967316184, + "grad_norm": 0.3543625771999359, + "learning_rate": 1.5001885955212758e-05, + "loss": 1.276, + "num_input_tokens_seen": 520602212, + "step": 13250 + }, + { + "epoch": 0.6425313603517931, + "grad_norm": 0.43784913420677185, + "learning_rate": 1.4965936093453054e-05, + "loss": 1.307, + "num_input_tokens_seen": 520984044, + "step": 13260 + }, + { + "epoch": 0.6430159239719679, + "grad_norm": 0.38104525208473206, + "learning_rate": 1.4930010947713513e-05, + "loss": 1.287, + "num_input_tokens_seen": 521381332, + "step": 13270 + }, + { + "epoch": 0.6435004875921428, + "grad_norm": 0.42758843302726746, + "learning_rate": 1.4894110606485334e-05, + "loss": 1.3144, + "num_input_tokens_seen": 521758556, + "step": 13280 + }, + { + "epoch": 0.6439850512123176, + "grad_norm": 0.3898857831954956, + "learning_rate": 1.4858235158198675e-05, + "loss": 1.2667, + "num_input_tokens_seen": 522155024, + "step": 13290 + }, + { + "epoch": 0.6444696148324924, + "grad_norm": 0.406747967004776, + "learning_rate": 1.482238469122232e-05, + "loss": 1.277, + "num_input_tokens_seen": 522511408, + "step": 13300 + }, + { + "epoch": 0.6449541784526672, + "grad_norm": 0.3949616849422455, + "learning_rate": 1.4786559293863566e-05, + "loss": 1.2483, + "num_input_tokens_seen": 522886168, + "step": 13310 + }, + { + "epoch": 0.645438742072842, + "grad_norm": 0.4430643916130066, + "learning_rate": 1.4750759054367923e-05, + "loss": 1.2268, + "num_input_tokens_seen": 523282444, + "step": 13320 + }, + { + "epoch": 0.6459233056930168, + "grad_norm": 0.4223615825176239, + "learning_rate": 1.4714984060918962e-05, + "loss": 1.272, + "num_input_tokens_seen": 523654248, + "step": 13330 + }, + { + "epoch": 0.6464078693131916, + "grad_norm": 0.3976212739944458, + "learning_rate": 1.4679234401638043e-05, + "loss": 1.2618, + "num_input_tokens_seen": 524040568, + "step": 13340 + }, + { + "epoch": 0.6468924329333664, + "grad_norm": 0.40596914291381836, + "learning_rate": 1.464351016458414e-05, + "loss": 1.2804, + "num_input_tokens_seen": 524461668, + "step": 13350 + }, + { + "epoch": 0.6473769965535413, + "grad_norm": 0.3791384696960449, + "learning_rate": 1.460781143775359e-05, + "loss": 1.304, + "num_input_tokens_seen": 524855056, + "step": 13360 + }, + { + "epoch": 0.6478615601737161, + "grad_norm": 0.3824685215950012, + "learning_rate": 1.457213830907992e-05, + "loss": 1.288, + "num_input_tokens_seen": 525251636, + "step": 13370 + }, + { + "epoch": 0.6483461237938909, + "grad_norm": 0.39863574504852295, + "learning_rate": 1.453649086643356e-05, + "loss": 1.2836, + "num_input_tokens_seen": 525642284, + "step": 13380 + }, + { + "epoch": 0.6488306874140657, + "grad_norm": 0.4155414402484894, + "learning_rate": 1.4500869197621708e-05, + "loss": 1.2526, + "num_input_tokens_seen": 526045452, + "step": 13390 + }, + { + "epoch": 0.6493152510342405, + "grad_norm": 0.41306737065315247, + "learning_rate": 1.446527339038808e-05, + "loss": 1.2497, + "num_input_tokens_seen": 526454148, + "step": 13400 + }, + { + "epoch": 0.6497998146544153, + "grad_norm": 0.4045097529888153, + "learning_rate": 1.4429703532412642e-05, + "loss": 1.2635, + "num_input_tokens_seen": 526858372, + "step": 13410 + }, + { + "epoch": 0.6502843782745901, + "grad_norm": 0.39056453108787537, + "learning_rate": 1.4394159711311494e-05, + "loss": 1.3007, + "num_input_tokens_seen": 527262932, + "step": 13420 + }, + { + "epoch": 0.6507689418947649, + "grad_norm": 0.4115632474422455, + "learning_rate": 1.435864201463657e-05, + "loss": 1.2925, + "num_input_tokens_seen": 527689048, + "step": 13430 + }, + { + "epoch": 0.6512535055149397, + "grad_norm": 0.399703711271286, + "learning_rate": 1.4323150529875462e-05, + "loss": 1.2915, + "num_input_tokens_seen": 528076752, + "step": 13440 + }, + { + "epoch": 0.6517380691351146, + "grad_norm": 0.39391404390335083, + "learning_rate": 1.4287685344451202e-05, + "loss": 1.2642, + "num_input_tokens_seen": 528478876, + "step": 13450 + }, + { + "epoch": 0.6522226327552894, + "grad_norm": 0.41902291774749756, + "learning_rate": 1.4252246545722048e-05, + "loss": 1.2977, + "num_input_tokens_seen": 528884840, + "step": 13460 + }, + { + "epoch": 0.6527071963754641, + "grad_norm": 0.42969298362731934, + "learning_rate": 1.4216834220981235e-05, + "loss": 1.2721, + "num_input_tokens_seen": 529259760, + "step": 13470 + }, + { + "epoch": 0.6531917599956389, + "grad_norm": 0.41451066732406616, + "learning_rate": 1.4181448457456814e-05, + "loss": 1.2888, + "num_input_tokens_seen": 529676680, + "step": 13480 + }, + { + "epoch": 0.6536763236158137, + "grad_norm": 0.4217407703399658, + "learning_rate": 1.4146089342311391e-05, + "loss": 1.2666, + "num_input_tokens_seen": 530073596, + "step": 13490 + }, + { + "epoch": 0.6541608872359885, + "grad_norm": 0.3723660111427307, + "learning_rate": 1.4110756962641952e-05, + "loss": 1.2463, + "num_input_tokens_seen": 530447064, + "step": 13500 + }, + { + "epoch": 0.6546454508561633, + "grad_norm": 0.42004016041755676, + "learning_rate": 1.4075451405479598e-05, + "loss": 1.2802, + "num_input_tokens_seen": 530851496, + "step": 13510 + }, + { + "epoch": 0.6551300144763381, + "grad_norm": 0.4187595546245575, + "learning_rate": 1.4040172757789388e-05, + "loss": 1.2933, + "num_input_tokens_seen": 531264604, + "step": 13520 + }, + { + "epoch": 0.6556145780965129, + "grad_norm": 0.4250360429286957, + "learning_rate": 1.4004921106470098e-05, + "loss": 1.2751, + "num_input_tokens_seen": 531679120, + "step": 13530 + }, + { + "epoch": 0.6560991417166878, + "grad_norm": 0.37126293778419495, + "learning_rate": 1.3969696538353977e-05, + "loss": 1.284, + "num_input_tokens_seen": 532051596, + "step": 13540 + }, + { + "epoch": 0.6565837053368626, + "grad_norm": 0.3943636119365692, + "learning_rate": 1.3934499140206596e-05, + "loss": 1.2617, + "num_input_tokens_seen": 532444860, + "step": 13550 + }, + { + "epoch": 0.6570682689570374, + "grad_norm": 0.4207961857318878, + "learning_rate": 1.3899328998726574e-05, + "loss": 1.2434, + "num_input_tokens_seen": 532852472, + "step": 13560 + }, + { + "epoch": 0.6575528325772122, + "grad_norm": 0.41514161229133606, + "learning_rate": 1.3864186200545403e-05, + "loss": 1.2763, + "num_input_tokens_seen": 533252152, + "step": 13570 + }, + { + "epoch": 0.658037396197387, + "grad_norm": 0.422195702791214, + "learning_rate": 1.3829070832227234e-05, + "loss": 1.2649, + "num_input_tokens_seen": 533646968, + "step": 13580 + }, + { + "epoch": 0.6585219598175618, + "grad_norm": 0.3905238211154938, + "learning_rate": 1.3793982980268644e-05, + "loss": 1.2159, + "num_input_tokens_seen": 534000320, + "step": 13590 + }, + { + "epoch": 0.6590065234377366, + "grad_norm": 0.43043792247772217, + "learning_rate": 1.3758922731098406e-05, + "loss": 1.3498, + "num_input_tokens_seen": 534396632, + "step": 13600 + }, + { + "epoch": 0.6594910870579114, + "grad_norm": 0.4008113741874695, + "learning_rate": 1.372389017107735e-05, + "loss": 1.2894, + "num_input_tokens_seen": 534782340, + "step": 13610 + }, + { + "epoch": 0.6599756506780862, + "grad_norm": 0.3774920403957367, + "learning_rate": 1.3688885386498052e-05, + "loss": 1.2679, + "num_input_tokens_seen": 535198784, + "step": 13620 + }, + { + "epoch": 0.6604602142982611, + "grad_norm": 0.42692962288856506, + "learning_rate": 1.3653908463584717e-05, + "loss": 1.239, + "num_input_tokens_seen": 535613348, + "step": 13630 + }, + { + "epoch": 0.6609447779184359, + "grad_norm": 0.4580807387828827, + "learning_rate": 1.3618959488492875e-05, + "loss": 1.2745, + "num_input_tokens_seen": 536005904, + "step": 13640 + }, + { + "epoch": 0.6614293415386107, + "grad_norm": 0.4274024963378906, + "learning_rate": 1.3584038547309253e-05, + "loss": 1.2859, + "num_input_tokens_seen": 536409736, + "step": 13650 + }, + { + "epoch": 0.6619139051587855, + "grad_norm": 0.43948325514793396, + "learning_rate": 1.3549145726051514e-05, + "loss": 1.2886, + "num_input_tokens_seen": 536786188, + "step": 13660 + }, + { + "epoch": 0.6623984687789602, + "grad_norm": 0.39728376269340515, + "learning_rate": 1.3514281110668036e-05, + "loss": 1.2886, + "num_input_tokens_seen": 537166236, + "step": 13670 + }, + { + "epoch": 0.662883032399135, + "grad_norm": 0.39317986369132996, + "learning_rate": 1.3479444787037756e-05, + "loss": 1.3038, + "num_input_tokens_seen": 537555492, + "step": 13680 + }, + { + "epoch": 0.6633675960193098, + "grad_norm": 0.40623709559440613, + "learning_rate": 1.3444636840969882e-05, + "loss": 1.292, + "num_input_tokens_seen": 537970244, + "step": 13690 + }, + { + "epoch": 0.6638521596394846, + "grad_norm": 0.4472850263118744, + "learning_rate": 1.340985735820376e-05, + "loss": 1.2384, + "num_input_tokens_seen": 538370372, + "step": 13700 + }, + { + "epoch": 0.6643367232596594, + "grad_norm": 0.3942728638648987, + "learning_rate": 1.3375106424408584e-05, + "loss": 1.2675, + "num_input_tokens_seen": 538759172, + "step": 13710 + }, + { + "epoch": 0.6648212868798343, + "grad_norm": 0.4153252840042114, + "learning_rate": 1.3340384125183263e-05, + "loss": 1.3135, + "num_input_tokens_seen": 539150680, + "step": 13720 + }, + { + "epoch": 0.6653058505000091, + "grad_norm": 0.41260483860969543, + "learning_rate": 1.330569054605616e-05, + "loss": 1.2691, + "num_input_tokens_seen": 539546148, + "step": 13730 + }, + { + "epoch": 0.6657904141201839, + "grad_norm": 0.4087775647640228, + "learning_rate": 1.3271025772484897e-05, + "loss": 1.2561, + "num_input_tokens_seen": 539930372, + "step": 13740 + }, + { + "epoch": 0.6662749777403587, + "grad_norm": 0.40349823236465454, + "learning_rate": 1.3236389889856123e-05, + "loss": 1.2255, + "num_input_tokens_seen": 540338764, + "step": 13750 + }, + { + "epoch": 0.6667595413605335, + "grad_norm": 0.3926162123680115, + "learning_rate": 1.3201782983485356e-05, + "loss": 1.2884, + "num_input_tokens_seen": 540728228, + "step": 13760 + }, + { + "epoch": 0.6672441049807083, + "grad_norm": 0.4218115508556366, + "learning_rate": 1.3167205138616703e-05, + "loss": 1.2595, + "num_input_tokens_seen": 541134348, + "step": 13770 + }, + { + "epoch": 0.6677286686008831, + "grad_norm": 0.4189454913139343, + "learning_rate": 1.3132656440422711e-05, + "loss": 1.3104, + "num_input_tokens_seen": 541508532, + "step": 13780 + }, + { + "epoch": 0.6682132322210579, + "grad_norm": 0.4163365364074707, + "learning_rate": 1.3098136974004136e-05, + "loss": 1.2894, + "num_input_tokens_seen": 541904872, + "step": 13790 + }, + { + "epoch": 0.6686977958412327, + "grad_norm": 0.39865246415138245, + "learning_rate": 1.30636468243897e-05, + "loss": 1.2838, + "num_input_tokens_seen": 542327136, + "step": 13800 + }, + { + "epoch": 0.6691823594614076, + "grad_norm": 0.4374254047870636, + "learning_rate": 1.3029186076535948e-05, + "loss": 1.2899, + "num_input_tokens_seen": 542708876, + "step": 13810 + }, + { + "epoch": 0.6696669230815824, + "grad_norm": 0.39832600951194763, + "learning_rate": 1.2994754815326976e-05, + "loss": 1.3043, + "num_input_tokens_seen": 543083288, + "step": 13820 + }, + { + "epoch": 0.6701514867017572, + "grad_norm": 0.38928577303886414, + "learning_rate": 1.2960353125574264e-05, + "loss": 1.2409, + "num_input_tokens_seen": 543498476, + "step": 13830 + }, + { + "epoch": 0.670636050321932, + "grad_norm": 0.3987908363342285, + "learning_rate": 1.2925981092016434e-05, + "loss": 1.3112, + "num_input_tokens_seen": 543904852, + "step": 13840 + }, + { + "epoch": 0.6711206139421068, + "grad_norm": 0.43027713894844055, + "learning_rate": 1.2891638799319078e-05, + "loss": 1.2587, + "num_input_tokens_seen": 544297824, + "step": 13850 + }, + { + "epoch": 0.6716051775622816, + "grad_norm": 0.4249833822250366, + "learning_rate": 1.2857326332074516e-05, + "loss": 1.3004, + "num_input_tokens_seen": 544682872, + "step": 13860 + }, + { + "epoch": 0.6720897411824563, + "grad_norm": 0.42288899421691895, + "learning_rate": 1.2823043774801625e-05, + "loss": 1.3173, + "num_input_tokens_seen": 545074136, + "step": 13870 + }, + { + "epoch": 0.6725743048026311, + "grad_norm": 0.4400785267353058, + "learning_rate": 1.278879121194556e-05, + "loss": 1.225, + "num_input_tokens_seen": 545481540, + "step": 13880 + }, + { + "epoch": 0.6730588684228059, + "grad_norm": 0.42327094078063965, + "learning_rate": 1.275456872787765e-05, + "loss": 1.2532, + "num_input_tokens_seen": 545873512, + "step": 13890 + }, + { + "epoch": 0.6735434320429808, + "grad_norm": 0.39755240082740784, + "learning_rate": 1.2720376406895086e-05, + "loss": 1.2726, + "num_input_tokens_seen": 546300044, + "step": 13900 + }, + { + "epoch": 0.6740279956631556, + "grad_norm": 0.39591267704963684, + "learning_rate": 1.2686214333220787e-05, + "loss": 1.2871, + "num_input_tokens_seen": 546682376, + "step": 13910 + }, + { + "epoch": 0.6745125592833304, + "grad_norm": 0.3895485997200012, + "learning_rate": 1.2652082591003173e-05, + "loss": 1.2779, + "num_input_tokens_seen": 547085092, + "step": 13920 + }, + { + "epoch": 0.6749971229035052, + "grad_norm": 0.43440917134284973, + "learning_rate": 1.261798126431592e-05, + "loss": 1.2538, + "num_input_tokens_seen": 547501820, + "step": 13930 + }, + { + "epoch": 0.67548168652368, + "grad_norm": 0.39425063133239746, + "learning_rate": 1.2583910437157825e-05, + "loss": 1.3221, + "num_input_tokens_seen": 547901628, + "step": 13940 + }, + { + "epoch": 0.6759662501438548, + "grad_norm": 0.3926689922809601, + "learning_rate": 1.2549870193452513e-05, + "loss": 1.3095, + "num_input_tokens_seen": 548291540, + "step": 13950 + }, + { + "epoch": 0.6764508137640296, + "grad_norm": 0.4056353271007538, + "learning_rate": 1.2515860617048314e-05, + "loss": 1.2636, + "num_input_tokens_seen": 548701312, + "step": 13960 + }, + { + "epoch": 0.6769353773842044, + "grad_norm": 0.43241339921951294, + "learning_rate": 1.2481881791717996e-05, + "loss": 1.2654, + "num_input_tokens_seen": 549068240, + "step": 13970 + }, + { + "epoch": 0.6774199410043792, + "grad_norm": 0.3984728753566742, + "learning_rate": 1.2447933801158593e-05, + "loss": 1.213, + "num_input_tokens_seen": 549458460, + "step": 13980 + }, + { + "epoch": 0.6779045046245541, + "grad_norm": 0.39436209201812744, + "learning_rate": 1.2414016728991171e-05, + "loss": 1.2805, + "num_input_tokens_seen": 549845624, + "step": 13990 + }, + { + "epoch": 0.6783890682447289, + "grad_norm": 0.4029475450515747, + "learning_rate": 1.2380130658760653e-05, + "loss": 1.2622, + "num_input_tokens_seen": 550234352, + "step": 14000 + }, + { + "epoch": 0.6783890682447289, + "eval_loss": 1.3831204175949097, + "eval_runtime": 3.6695, + "eval_samples_per_second": 40.878, + "eval_steps_per_second": 5.178, + "num_input_tokens_seen": 550234352, + "step": 14000 + }, + { + "epoch": 0.6788736318649037, + "grad_norm": 0.3884779214859009, + "learning_rate": 1.2346275673935592e-05, + "loss": 1.2657, + "num_input_tokens_seen": 550605424, + "step": 14010 + }, + { + "epoch": 0.6793581954850785, + "grad_norm": 0.3887692391872406, + "learning_rate": 1.2312451857907983e-05, + "loss": 1.2375, + "num_input_tokens_seen": 551020804, + "step": 14020 + }, + { + "epoch": 0.6798427591052533, + "grad_norm": 0.44228610396385193, + "learning_rate": 1.2278659293993011e-05, + "loss": 1.2443, + "num_input_tokens_seen": 551415568, + "step": 14030 + }, + { + "epoch": 0.6803273227254281, + "grad_norm": 0.4105127155780792, + "learning_rate": 1.2244898065428918e-05, + "loss": 1.256, + "num_input_tokens_seen": 551790224, + "step": 14040 + }, + { + "epoch": 0.6808118863456029, + "grad_norm": 0.367942750453949, + "learning_rate": 1.2211168255376747e-05, + "loss": 1.313, + "num_input_tokens_seen": 552201712, + "step": 14050 + }, + { + "epoch": 0.6812964499657777, + "grad_norm": 0.43449220061302185, + "learning_rate": 1.217746994692014e-05, + "loss": 1.2926, + "num_input_tokens_seen": 552593300, + "step": 14060 + }, + { + "epoch": 0.6817810135859526, + "grad_norm": 0.3836462199687958, + "learning_rate": 1.2143803223065161e-05, + "loss": 1.3065, + "num_input_tokens_seen": 552995148, + "step": 14070 + }, + { + "epoch": 0.6822655772061273, + "grad_norm": 0.4054076373577118, + "learning_rate": 1.2110168166740057e-05, + "loss": 1.3149, + "num_input_tokens_seen": 553394712, + "step": 14080 + }, + { + "epoch": 0.6827501408263021, + "grad_norm": 0.39375001192092896, + "learning_rate": 1.2076564860795095e-05, + "loss": 1.2744, + "num_input_tokens_seen": 553807736, + "step": 14090 + }, + { + "epoch": 0.6832347044464769, + "grad_norm": 0.4356180727481842, + "learning_rate": 1.2042993388002302e-05, + "loss": 1.2847, + "num_input_tokens_seen": 554190868, + "step": 14100 + }, + { + "epoch": 0.6837192680666517, + "grad_norm": 0.4105941951274872, + "learning_rate": 1.2009453831055331e-05, + "loss": 1.2395, + "num_input_tokens_seen": 554599144, + "step": 14110 + }, + { + "epoch": 0.6842038316868265, + "grad_norm": 0.43507689237594604, + "learning_rate": 1.1975946272569177e-05, + "loss": 1.2725, + "num_input_tokens_seen": 554992164, + "step": 14120 + }, + { + "epoch": 0.6846883953070013, + "grad_norm": 0.4143732786178589, + "learning_rate": 1.194247079508006e-05, + "loss": 1.2748, + "num_input_tokens_seen": 555351352, + "step": 14130 + }, + { + "epoch": 0.6851729589271761, + "grad_norm": 0.4113134741783142, + "learning_rate": 1.1909027481045138e-05, + "loss": 1.2729, + "num_input_tokens_seen": 555746752, + "step": 14140 + }, + { + "epoch": 0.6856575225473509, + "grad_norm": 0.400814414024353, + "learning_rate": 1.1875616412842368e-05, + "loss": 1.2404, + "num_input_tokens_seen": 556135216, + "step": 14150 + }, + { + "epoch": 0.6861420861675258, + "grad_norm": 0.386210560798645, + "learning_rate": 1.1842237672770277e-05, + "loss": 1.2877, + "num_input_tokens_seen": 556532556, + "step": 14160 + }, + { + "epoch": 0.6866266497877006, + "grad_norm": 0.41966861486434937, + "learning_rate": 1.1808891343047754e-05, + "loss": 1.2242, + "num_input_tokens_seen": 556962628, + "step": 14170 + }, + { + "epoch": 0.6871112134078754, + "grad_norm": 0.4356541037559509, + "learning_rate": 1.1775577505813868e-05, + "loss": 1.2896, + "num_input_tokens_seen": 557340936, + "step": 14180 + }, + { + "epoch": 0.6875957770280502, + "grad_norm": 0.38502469658851624, + "learning_rate": 1.1742296243127621e-05, + "loss": 1.3201, + "num_input_tokens_seen": 557716816, + "step": 14190 + }, + { + "epoch": 0.688080340648225, + "grad_norm": 0.4457418620586395, + "learning_rate": 1.1709047636967812e-05, + "loss": 1.2624, + "num_input_tokens_seen": 558088256, + "step": 14200 + }, + { + "epoch": 0.6885649042683998, + "grad_norm": 0.43167582154273987, + "learning_rate": 1.1675831769232775e-05, + "loss": 1.274, + "num_input_tokens_seen": 558496660, + "step": 14210 + }, + { + "epoch": 0.6890494678885746, + "grad_norm": 0.42342841625213623, + "learning_rate": 1.1642648721740226e-05, + "loss": 1.2851, + "num_input_tokens_seen": 558846140, + "step": 14220 + }, + { + "epoch": 0.6895340315087494, + "grad_norm": 0.4272722005844116, + "learning_rate": 1.1609498576227008e-05, + "loss": 1.2644, + "num_input_tokens_seen": 559205772, + "step": 14230 + }, + { + "epoch": 0.6900185951289242, + "grad_norm": 0.4222122132778168, + "learning_rate": 1.1576381414348953e-05, + "loss": 1.287, + "num_input_tokens_seen": 559613824, + "step": 14240 + }, + { + "epoch": 0.6905031587490991, + "grad_norm": 0.5002584457397461, + "learning_rate": 1.1543297317680607e-05, + "loss": 1.3271, + "num_input_tokens_seen": 560013268, + "step": 14250 + }, + { + "epoch": 0.6909877223692739, + "grad_norm": 0.4087948501110077, + "learning_rate": 1.1510246367715122e-05, + "loss": 1.2673, + "num_input_tokens_seen": 560392376, + "step": 14260 + }, + { + "epoch": 0.6914722859894487, + "grad_norm": 0.41717860102653503, + "learning_rate": 1.1477228645863944e-05, + "loss": 1.2923, + "num_input_tokens_seen": 560761820, + "step": 14270 + }, + { + "epoch": 0.6919568496096234, + "grad_norm": 0.3851679563522339, + "learning_rate": 1.1444244233456717e-05, + "loss": 1.2749, + "num_input_tokens_seen": 561148148, + "step": 14280 + }, + { + "epoch": 0.6924414132297982, + "grad_norm": 0.41185683012008667, + "learning_rate": 1.1411293211741014e-05, + "loss": 1.234, + "num_input_tokens_seen": 561552512, + "step": 14290 + }, + { + "epoch": 0.692925976849973, + "grad_norm": 0.3968884348869324, + "learning_rate": 1.1378375661882181e-05, + "loss": 1.2464, + "num_input_tokens_seen": 561934928, + "step": 14300 + }, + { + "epoch": 0.6934105404701478, + "grad_norm": 0.38210412859916687, + "learning_rate": 1.1345491664963078e-05, + "loss": 1.247, + "num_input_tokens_seen": 562352708, + "step": 14310 + }, + { + "epoch": 0.6938951040903226, + "grad_norm": 0.3991890847682953, + "learning_rate": 1.1312641301983954e-05, + "loss": 1.2252, + "num_input_tokens_seen": 562733284, + "step": 14320 + }, + { + "epoch": 0.6943796677104974, + "grad_norm": 0.3969036340713501, + "learning_rate": 1.1279824653862197e-05, + "loss": 1.2681, + "num_input_tokens_seen": 563140896, + "step": 14330 + }, + { + "epoch": 0.6948642313306723, + "grad_norm": 0.42541587352752686, + "learning_rate": 1.1247041801432137e-05, + "loss": 1.2644, + "num_input_tokens_seen": 563495812, + "step": 14340 + }, + { + "epoch": 0.6953487949508471, + "grad_norm": 0.3810971677303314, + "learning_rate": 1.1214292825444883e-05, + "loss": 1.2588, + "num_input_tokens_seen": 563921972, + "step": 14350 + }, + { + "epoch": 0.6958333585710219, + "grad_norm": 0.4099074602127075, + "learning_rate": 1.1181577806568064e-05, + "loss": 1.2685, + "num_input_tokens_seen": 564303812, + "step": 14360 + }, + { + "epoch": 0.6963179221911967, + "grad_norm": 0.42169204354286194, + "learning_rate": 1.1148896825385707e-05, + "loss": 1.2461, + "num_input_tokens_seen": 564712192, + "step": 14370 + }, + { + "epoch": 0.6968024858113715, + "grad_norm": 0.42555099725723267, + "learning_rate": 1.111624996239796e-05, + "loss": 1.274, + "num_input_tokens_seen": 565108896, + "step": 14380 + }, + { + "epoch": 0.6972870494315463, + "grad_norm": 0.4312445819377899, + "learning_rate": 1.108363729802096e-05, + "loss": 1.272, + "num_input_tokens_seen": 565492580, + "step": 14390 + }, + { + "epoch": 0.6977716130517211, + "grad_norm": 0.40941333770751953, + "learning_rate": 1.1051058912586579e-05, + "loss": 1.2707, + "num_input_tokens_seen": 565921508, + "step": 14400 + }, + { + "epoch": 0.6982561766718959, + "grad_norm": 0.4134999215602875, + "learning_rate": 1.1018514886342279e-05, + "loss": 1.2187, + "num_input_tokens_seen": 566308100, + "step": 14410 + }, + { + "epoch": 0.6987407402920707, + "grad_norm": 0.3979948163032532, + "learning_rate": 1.0986005299450858e-05, + "loss": 1.2768, + "num_input_tokens_seen": 566705680, + "step": 14420 + }, + { + "epoch": 0.6992253039122456, + "grad_norm": 0.41298606991767883, + "learning_rate": 1.0953530231990311e-05, + "loss": 1.2341, + "num_input_tokens_seen": 567123900, + "step": 14430 + }, + { + "epoch": 0.6997098675324204, + "grad_norm": 0.42366859316825867, + "learning_rate": 1.0921089763953594e-05, + "loss": 1.2959, + "num_input_tokens_seen": 567534336, + "step": 14440 + }, + { + "epoch": 0.7001944311525952, + "grad_norm": 0.39607560634613037, + "learning_rate": 1.0888683975248431e-05, + "loss": 1.2823, + "num_input_tokens_seen": 567938980, + "step": 14450 + }, + { + "epoch": 0.70067899477277, + "grad_norm": 0.38363564014434814, + "learning_rate": 1.0856312945697142e-05, + "loss": 1.2062, + "num_input_tokens_seen": 568304904, + "step": 14460 + }, + { + "epoch": 0.7011635583929448, + "grad_norm": 0.4327394366264343, + "learning_rate": 1.0823976755036393e-05, + "loss": 1.2881, + "num_input_tokens_seen": 568703588, + "step": 14470 + }, + { + "epoch": 0.7016481220131195, + "grad_norm": 0.40510445833206177, + "learning_rate": 1.079167548291708e-05, + "loss": 1.2547, + "num_input_tokens_seen": 569104248, + "step": 14480 + }, + { + "epoch": 0.7021326856332943, + "grad_norm": 0.41055554151535034, + "learning_rate": 1.075940920890404e-05, + "loss": 1.2536, + "num_input_tokens_seen": 569485804, + "step": 14490 + }, + { + "epoch": 0.7026172492534691, + "grad_norm": 0.4175165295600891, + "learning_rate": 1.0727178012475944e-05, + "loss": 1.2334, + "num_input_tokens_seen": 569887012, + "step": 14500 + }, + { + "epoch": 0.7031018128736439, + "grad_norm": 0.43152111768722534, + "learning_rate": 1.0694981973025022e-05, + "loss": 1.2328, + "num_input_tokens_seen": 570274412, + "step": 14510 + }, + { + "epoch": 0.7035863764938188, + "grad_norm": 0.4261281490325928, + "learning_rate": 1.0662821169856948e-05, + "loss": 1.2766, + "num_input_tokens_seen": 570667476, + "step": 14520 + }, + { + "epoch": 0.7040709401139936, + "grad_norm": 0.4019312858581543, + "learning_rate": 1.0630695682190554e-05, + "loss": 1.2272, + "num_input_tokens_seen": 571069868, + "step": 14530 + }, + { + "epoch": 0.7045555037341684, + "grad_norm": 0.4646441638469696, + "learning_rate": 1.0598605589157726e-05, + "loss": 1.2188, + "num_input_tokens_seen": 571471600, + "step": 14540 + }, + { + "epoch": 0.7050400673543432, + "grad_norm": 0.40755367279052734, + "learning_rate": 1.0566550969803127e-05, + "loss": 1.286, + "num_input_tokens_seen": 571874376, + "step": 14550 + }, + { + "epoch": 0.705524630974518, + "grad_norm": 0.4092569947242737, + "learning_rate": 1.0534531903084065e-05, + "loss": 1.2766, + "num_input_tokens_seen": 572267796, + "step": 14560 + }, + { + "epoch": 0.7060091945946928, + "grad_norm": 0.443362295627594, + "learning_rate": 1.0502548467870284e-05, + "loss": 1.2842, + "num_input_tokens_seen": 572646180, + "step": 14570 + }, + { + "epoch": 0.7064937582148676, + "grad_norm": 0.39347755908966064, + "learning_rate": 1.0470600742943726e-05, + "loss": 1.2541, + "num_input_tokens_seen": 573058952, + "step": 14580 + }, + { + "epoch": 0.7069783218350424, + "grad_norm": 0.4157586395740509, + "learning_rate": 1.0438688806998395e-05, + "loss": 1.2842, + "num_input_tokens_seen": 573460344, + "step": 14590 + }, + { + "epoch": 0.7074628854552172, + "grad_norm": 0.45576024055480957, + "learning_rate": 1.0406812738640134e-05, + "loss": 1.2606, + "num_input_tokens_seen": 573848724, + "step": 14600 + }, + { + "epoch": 0.7079474490753921, + "grad_norm": 0.41932404041290283, + "learning_rate": 1.037497261638645e-05, + "loss": 1.2402, + "num_input_tokens_seen": 574233032, + "step": 14610 + }, + { + "epoch": 0.7084320126955669, + "grad_norm": 0.4327371120452881, + "learning_rate": 1.0343168518666272e-05, + "loss": 1.2565, + "num_input_tokens_seen": 574627952, + "step": 14620 + }, + { + "epoch": 0.7089165763157417, + "grad_norm": 0.4439123272895813, + "learning_rate": 1.0311400523819831e-05, + "loss": 1.2774, + "num_input_tokens_seen": 575040240, + "step": 14630 + }, + { + "epoch": 0.7094011399359165, + "grad_norm": 0.4237552583217621, + "learning_rate": 1.0279668710098401e-05, + "loss": 1.2613, + "num_input_tokens_seen": 575434676, + "step": 14640 + }, + { + "epoch": 0.7098857035560913, + "grad_norm": 0.4437905251979828, + "learning_rate": 1.0247973155664156e-05, + "loss": 1.2636, + "num_input_tokens_seen": 575812584, + "step": 14650 + }, + { + "epoch": 0.7103702671762661, + "grad_norm": 0.3982216715812683, + "learning_rate": 1.0216313938589936e-05, + "loss": 1.2758, + "num_input_tokens_seen": 576228776, + "step": 14660 + }, + { + "epoch": 0.7108548307964409, + "grad_norm": 0.36928027868270874, + "learning_rate": 1.0184691136859096e-05, + "loss": 1.2293, + "num_input_tokens_seen": 576617456, + "step": 14670 + }, + { + "epoch": 0.7113393944166156, + "grad_norm": 0.3991413712501526, + "learning_rate": 1.0153104828365261e-05, + "loss": 1.2731, + "num_input_tokens_seen": 576986340, + "step": 14680 + }, + { + "epoch": 0.7118239580367904, + "grad_norm": 0.4070897698402405, + "learning_rate": 1.0121555090912207e-05, + "loss": 1.2772, + "num_input_tokens_seen": 577384544, + "step": 14690 + }, + { + "epoch": 0.7123085216569653, + "grad_norm": 0.3921777904033661, + "learning_rate": 1.0090042002213587e-05, + "loss": 1.2332, + "num_input_tokens_seen": 577772956, + "step": 14700 + }, + { + "epoch": 0.7127930852771401, + "grad_norm": 0.37934672832489014, + "learning_rate": 1.0058565639892808e-05, + "loss": 1.2527, + "num_input_tokens_seen": 578188080, + "step": 14710 + }, + { + "epoch": 0.7132776488973149, + "grad_norm": 0.3901313841342926, + "learning_rate": 1.0027126081482801e-05, + "loss": 1.2523, + "num_input_tokens_seen": 578567732, + "step": 14720 + }, + { + "epoch": 0.7137622125174897, + "grad_norm": 0.4196103513240814, + "learning_rate": 9.995723404425845e-06, + "loss": 1.2586, + "num_input_tokens_seen": 578986928, + "step": 14730 + }, + { + "epoch": 0.7142467761376645, + "grad_norm": 0.40894681215286255, + "learning_rate": 9.964357686073378e-06, + "loss": 1.2532, + "num_input_tokens_seen": 579387500, + "step": 14740 + }, + { + "epoch": 0.7147313397578393, + "grad_norm": 0.38982832431793213, + "learning_rate": 9.933029003685778e-06, + "loss": 1.2751, + "num_input_tokens_seen": 579749804, + "step": 14750 + }, + { + "epoch": 0.7152159033780141, + "grad_norm": 0.3959210515022278, + "learning_rate": 9.901737434432226e-06, + "loss": 1.2404, + "num_input_tokens_seen": 580156476, + "step": 14760 + }, + { + "epoch": 0.7157004669981889, + "grad_norm": 0.4251919984817505, + "learning_rate": 9.870483055390456e-06, + "loss": 1.2346, + "num_input_tokens_seen": 580522596, + "step": 14770 + }, + { + "epoch": 0.7161850306183638, + "grad_norm": 0.48525333404541016, + "learning_rate": 9.839265943546627e-06, + "loss": 1.2809, + "num_input_tokens_seen": 580900272, + "step": 14780 + }, + { + "epoch": 0.7166695942385386, + "grad_norm": 0.3704177439212799, + "learning_rate": 9.808086175795061e-06, + "loss": 1.2318, + "num_input_tokens_seen": 581280140, + "step": 14790 + }, + { + "epoch": 0.7171541578587134, + "grad_norm": 0.4149393141269684, + "learning_rate": 9.77694382893814e-06, + "loss": 1.3142, + "num_input_tokens_seen": 581666396, + "step": 14800 + }, + { + "epoch": 0.7176387214788882, + "grad_norm": 0.4752131402492523, + "learning_rate": 9.745838979686026e-06, + "loss": 1.2692, + "num_input_tokens_seen": 582040104, + "step": 14810 + }, + { + "epoch": 0.718123285099063, + "grad_norm": 0.41852232813835144, + "learning_rate": 9.714771704656553e-06, + "loss": 1.2908, + "num_input_tokens_seen": 582431284, + "step": 14820 + }, + { + "epoch": 0.7186078487192378, + "grad_norm": 0.3744739592075348, + "learning_rate": 9.683742080374968e-06, + "loss": 1.2438, + "num_input_tokens_seen": 582835184, + "step": 14830 + }, + { + "epoch": 0.7190924123394126, + "grad_norm": 0.3988076448440552, + "learning_rate": 9.652750183273806e-06, + "loss": 1.2978, + "num_input_tokens_seen": 583230036, + "step": 14840 + }, + { + "epoch": 0.7195769759595874, + "grad_norm": 0.41156208515167236, + "learning_rate": 9.621796089692667e-06, + "loss": 1.2709, + "num_input_tokens_seen": 583626572, + "step": 14850 + }, + { + "epoch": 0.7200615395797622, + "grad_norm": 0.437142014503479, + "learning_rate": 9.59087987587801e-06, + "loss": 1.2503, + "num_input_tokens_seen": 584037012, + "step": 14860 + }, + { + "epoch": 0.7205461031999371, + "grad_norm": 0.42297014594078064, + "learning_rate": 9.560001617983005e-06, + "loss": 1.2808, + "num_input_tokens_seen": 584438176, + "step": 14870 + }, + { + "epoch": 0.7210306668201119, + "grad_norm": 0.4088895618915558, + "learning_rate": 9.529161392067336e-06, + "loss": 1.2203, + "num_input_tokens_seen": 584823036, + "step": 14880 + }, + { + "epoch": 0.7215152304402866, + "grad_norm": 0.4062732458114624, + "learning_rate": 9.498359274097002e-06, + "loss": 1.2212, + "num_input_tokens_seen": 585208164, + "step": 14890 + }, + { + "epoch": 0.7219997940604614, + "grad_norm": 0.4162246882915497, + "learning_rate": 9.467595339944116e-06, + "loss": 1.3, + "num_input_tokens_seen": 585566620, + "step": 14900 + }, + { + "epoch": 0.7224843576806362, + "grad_norm": 0.38531064987182617, + "learning_rate": 9.436869665386763e-06, + "loss": 1.2206, + "num_input_tokens_seen": 585980372, + "step": 14910 + }, + { + "epoch": 0.722968921300811, + "grad_norm": 0.42929065227508545, + "learning_rate": 9.40618232610876e-06, + "loss": 1.2744, + "num_input_tokens_seen": 586389084, + "step": 14920 + }, + { + "epoch": 0.7234534849209858, + "grad_norm": 0.4108678102493286, + "learning_rate": 9.375533397699523e-06, + "loss": 1.252, + "num_input_tokens_seen": 586790192, + "step": 14930 + }, + { + "epoch": 0.7239380485411606, + "grad_norm": 0.43513932824134827, + "learning_rate": 9.344922955653826e-06, + "loss": 1.2669, + "num_input_tokens_seen": 587176208, + "step": 14940 + }, + { + "epoch": 0.7244226121613354, + "grad_norm": 0.4044954776763916, + "learning_rate": 9.314351075371674e-06, + "loss": 1.2949, + "num_input_tokens_seen": 587570128, + "step": 14950 + }, + { + "epoch": 0.7249071757815103, + "grad_norm": 0.4217306077480316, + "learning_rate": 9.283817832158053e-06, + "loss": 1.3133, + "num_input_tokens_seen": 587951740, + "step": 14960 + }, + { + "epoch": 0.7253917394016851, + "grad_norm": 0.4031387269496918, + "learning_rate": 9.253323301222802e-06, + "loss": 1.2583, + "num_input_tokens_seen": 588343360, + "step": 14970 + }, + { + "epoch": 0.7258763030218599, + "grad_norm": 0.4108414649963379, + "learning_rate": 9.222867557680403e-06, + "loss": 1.2722, + "num_input_tokens_seen": 588756676, + "step": 14980 + }, + { + "epoch": 0.7263608666420347, + "grad_norm": 0.4133433699607849, + "learning_rate": 9.192450676549774e-06, + "loss": 1.2386, + "num_input_tokens_seen": 589152220, + "step": 14990 + }, + { + "epoch": 0.7268454302622095, + "grad_norm": 0.3888295590877533, + "learning_rate": 9.162072732754132e-06, + "loss": 1.2964, + "num_input_tokens_seen": 589548180, + "step": 15000 + }, + { + "epoch": 0.7273299938823843, + "grad_norm": 0.40823793411254883, + "learning_rate": 9.131733801120771e-06, + "loss": 1.249, + "num_input_tokens_seen": 589932416, + "step": 15010 + }, + { + "epoch": 0.7278145575025591, + "grad_norm": 0.41177472472190857, + "learning_rate": 9.1014339563809e-06, + "loss": 1.2636, + "num_input_tokens_seen": 590315064, + "step": 15020 + }, + { + "epoch": 0.7282991211227339, + "grad_norm": 0.4307875633239746, + "learning_rate": 9.071173273169428e-06, + "loss": 1.2832, + "num_input_tokens_seen": 590718608, + "step": 15030 + }, + { + "epoch": 0.7287836847429087, + "grad_norm": 0.3676307499408722, + "learning_rate": 9.040951826024824e-06, + "loss": 1.2616, + "num_input_tokens_seen": 591105428, + "step": 15040 + }, + { + "epoch": 0.7292682483630836, + "grad_norm": 0.4145369529724121, + "learning_rate": 9.010769689388885e-06, + "loss": 1.2932, + "num_input_tokens_seen": 591484840, + "step": 15050 + }, + { + "epoch": 0.7297528119832584, + "grad_norm": 0.38058140873908997, + "learning_rate": 8.980626937606612e-06, + "loss": 1.2451, + "num_input_tokens_seen": 591878184, + "step": 15060 + }, + { + "epoch": 0.7302373756034332, + "grad_norm": 0.41027316451072693, + "learning_rate": 8.950523644925954e-06, + "loss": 1.2548, + "num_input_tokens_seen": 592242680, + "step": 15070 + }, + { + "epoch": 0.730721939223608, + "grad_norm": 0.4113955497741699, + "learning_rate": 8.920459885497703e-06, + "loss": 1.2306, + "num_input_tokens_seen": 592616904, + "step": 15080 + }, + { + "epoch": 0.7312065028437827, + "grad_norm": 0.4587390422821045, + "learning_rate": 8.890435733375232e-06, + "loss": 1.2836, + "num_input_tokens_seen": 592980040, + "step": 15090 + }, + { + "epoch": 0.7316910664639575, + "grad_norm": 0.4060867726802826, + "learning_rate": 8.860451262514386e-06, + "loss": 1.2048, + "num_input_tokens_seen": 593376740, + "step": 15100 + }, + { + "epoch": 0.7321756300841323, + "grad_norm": 0.44127029180526733, + "learning_rate": 8.830506546773257e-06, + "loss": 1.2557, + "num_input_tokens_seen": 593757812, + "step": 15110 + }, + { + "epoch": 0.7326601937043071, + "grad_norm": 0.423213392496109, + "learning_rate": 8.800601659911998e-06, + "loss": 1.2426, + "num_input_tokens_seen": 594122040, + "step": 15120 + }, + { + "epoch": 0.7331447573244819, + "grad_norm": 0.41493481397628784, + "learning_rate": 8.770736675592678e-06, + "loss": 1.2721, + "num_input_tokens_seen": 594520000, + "step": 15130 + }, + { + "epoch": 0.7336293209446568, + "grad_norm": 0.46575504541397095, + "learning_rate": 8.740911667379053e-06, + "loss": 1.2268, + "num_input_tokens_seen": 594873824, + "step": 15140 + }, + { + "epoch": 0.7341138845648316, + "grad_norm": 0.4391193091869354, + "learning_rate": 8.711126708736426e-06, + "loss": 1.313, + "num_input_tokens_seen": 595233156, + "step": 15150 + }, + { + "epoch": 0.7345984481850064, + "grad_norm": 0.41552045941352844, + "learning_rate": 8.681381873031447e-06, + "loss": 1.2626, + "num_input_tokens_seen": 595644240, + "step": 15160 + }, + { + "epoch": 0.7350830118051812, + "grad_norm": 0.3735989034175873, + "learning_rate": 8.651677233531943e-06, + "loss": 1.2961, + "num_input_tokens_seen": 596014316, + "step": 15170 + }, + { + "epoch": 0.735567575425356, + "grad_norm": 0.45374736189842224, + "learning_rate": 8.6220128634067e-06, + "loss": 1.2512, + "num_input_tokens_seen": 596420640, + "step": 15180 + }, + { + "epoch": 0.7360521390455308, + "grad_norm": 0.4039424657821655, + "learning_rate": 8.592388835725352e-06, + "loss": 1.2505, + "num_input_tokens_seen": 596817584, + "step": 15190 + }, + { + "epoch": 0.7365367026657056, + "grad_norm": 0.4076104164123535, + "learning_rate": 8.56280522345812e-06, + "loss": 1.2252, + "num_input_tokens_seen": 597205152, + "step": 15200 + }, + { + "epoch": 0.7370212662858804, + "grad_norm": 0.4360331594944, + "learning_rate": 8.533262099475708e-06, + "loss": 1.2516, + "num_input_tokens_seen": 597588540, + "step": 15210 + }, + { + "epoch": 0.7375058299060552, + "grad_norm": 0.39837440848350525, + "learning_rate": 8.503759536549066e-06, + "loss": 1.2656, + "num_input_tokens_seen": 597989068, + "step": 15220 + }, + { + "epoch": 0.7379903935262301, + "grad_norm": 0.45183074474334717, + "learning_rate": 8.474297607349252e-06, + "loss": 1.3073, + "num_input_tokens_seen": 598396680, + "step": 15230 + }, + { + "epoch": 0.7384749571464049, + "grad_norm": 0.46322163939476013, + "learning_rate": 8.44487638444721e-06, + "loss": 1.281, + "num_input_tokens_seen": 598789612, + "step": 15240 + }, + { + "epoch": 0.7389595207665797, + "grad_norm": 0.38120391964912415, + "learning_rate": 8.415495940313637e-06, + "loss": 1.2602, + "num_input_tokens_seen": 599183232, + "step": 15250 + }, + { + "epoch": 0.7394440843867545, + "grad_norm": 0.3967757225036621, + "learning_rate": 8.386156347318785e-06, + "loss": 1.264, + "num_input_tokens_seen": 599587496, + "step": 15260 + }, + { + "epoch": 0.7399286480069293, + "grad_norm": 0.44284841418266296, + "learning_rate": 8.356857677732258e-06, + "loss": 1.2691, + "num_input_tokens_seen": 599970304, + "step": 15270 + }, + { + "epoch": 0.740413211627104, + "grad_norm": 0.42405474185943604, + "learning_rate": 8.327600003722887e-06, + "loss": 1.3021, + "num_input_tokens_seen": 600356376, + "step": 15280 + }, + { + "epoch": 0.7408977752472788, + "grad_norm": 0.3977855145931244, + "learning_rate": 8.298383397358494e-06, + "loss": 1.2675, + "num_input_tokens_seen": 600728604, + "step": 15290 + }, + { + "epoch": 0.7413823388674536, + "grad_norm": 0.41853830218315125, + "learning_rate": 8.269207930605757e-06, + "loss": 1.2954, + "num_input_tokens_seen": 601127264, + "step": 15300 + }, + { + "epoch": 0.7418669024876284, + "grad_norm": 0.4441795349121094, + "learning_rate": 8.240073675330023e-06, + "loss": 1.2563, + "num_input_tokens_seen": 601500628, + "step": 15310 + }, + { + "epoch": 0.7423514661078033, + "grad_norm": 0.37602853775024414, + "learning_rate": 8.210980703295126e-06, + "loss": 1.2418, + "num_input_tokens_seen": 601875864, + "step": 15320 + }, + { + "epoch": 0.7428360297279781, + "grad_norm": 0.4239734411239624, + "learning_rate": 8.181929086163186e-06, + "loss": 1.2742, + "num_input_tokens_seen": 602271004, + "step": 15330 + }, + { + "epoch": 0.7433205933481529, + "grad_norm": 0.39539673924446106, + "learning_rate": 8.1529188954945e-06, + "loss": 1.2483, + "num_input_tokens_seen": 602647156, + "step": 15340 + }, + { + "epoch": 0.7438051569683277, + "grad_norm": 0.3857654631137848, + "learning_rate": 8.123950202747274e-06, + "loss": 1.2448, + "num_input_tokens_seen": 603043708, + "step": 15350 + }, + { + "epoch": 0.7442897205885025, + "grad_norm": 0.41120001673698425, + "learning_rate": 8.095023079277541e-06, + "loss": 1.2468, + "num_input_tokens_seen": 603427072, + "step": 15360 + }, + { + "epoch": 0.7447742842086773, + "grad_norm": 0.38883382081985474, + "learning_rate": 8.066137596338908e-06, + "loss": 1.2563, + "num_input_tokens_seen": 603820012, + "step": 15370 + }, + { + "epoch": 0.7452588478288521, + "grad_norm": 0.3978869915008545, + "learning_rate": 8.037293825082423e-06, + "loss": 1.2982, + "num_input_tokens_seen": 604199356, + "step": 15380 + }, + { + "epoch": 0.7457434114490269, + "grad_norm": 0.39663106203079224, + "learning_rate": 8.008491836556408e-06, + "loss": 1.2548, + "num_input_tokens_seen": 604594724, + "step": 15390 + }, + { + "epoch": 0.7462279750692017, + "grad_norm": 0.3799634873867035, + "learning_rate": 7.979731701706231e-06, + "loss": 1.2978, + "num_input_tokens_seen": 604995016, + "step": 15400 + }, + { + "epoch": 0.7467125386893766, + "grad_norm": 0.424040824174881, + "learning_rate": 7.951013491374193e-06, + "loss": 1.2551, + "num_input_tokens_seen": 605385864, + "step": 15410 + }, + { + "epoch": 0.7471971023095514, + "grad_norm": 0.47143998742103577, + "learning_rate": 7.922337276299305e-06, + "loss": 1.3032, + "num_input_tokens_seen": 605778132, + "step": 15420 + }, + { + "epoch": 0.7476816659297262, + "grad_norm": 0.439042866230011, + "learning_rate": 7.89370312711715e-06, + "loss": 1.2905, + "num_input_tokens_seen": 606175808, + "step": 15430 + }, + { + "epoch": 0.748166229549901, + "grad_norm": 0.42106932401657104, + "learning_rate": 7.86511111435969e-06, + "loss": 1.251, + "num_input_tokens_seen": 606587468, + "step": 15440 + }, + { + "epoch": 0.7486507931700758, + "grad_norm": 0.434646874666214, + "learning_rate": 7.836561308455109e-06, + "loss": 1.2324, + "num_input_tokens_seen": 606989572, + "step": 15450 + }, + { + "epoch": 0.7491353567902506, + "grad_norm": 0.3873103857040405, + "learning_rate": 7.80805377972759e-06, + "loss": 1.249, + "num_input_tokens_seen": 607386504, + "step": 15460 + }, + { + "epoch": 0.7496199204104254, + "grad_norm": 0.42348745465278625, + "learning_rate": 7.779588598397222e-06, + "loss": 1.1877, + "num_input_tokens_seen": 607763740, + "step": 15470 + }, + { + "epoch": 0.7501044840306001, + "grad_norm": 0.46129289269447327, + "learning_rate": 7.751165834579744e-06, + "loss": 1.2666, + "num_input_tokens_seen": 608138336, + "step": 15480 + }, + { + "epoch": 0.750589047650775, + "grad_norm": 0.4692579507827759, + "learning_rate": 7.722785558286447e-06, + "loss": 1.2385, + "num_input_tokens_seen": 608572348, + "step": 15490 + }, + { + "epoch": 0.7510736112709498, + "grad_norm": 0.4252379536628723, + "learning_rate": 7.694447839423936e-06, + "loss": 1.238, + "num_input_tokens_seen": 608980572, + "step": 15500 + }, + { + "epoch": 0.7515581748911246, + "grad_norm": 0.37597569823265076, + "learning_rate": 7.666152747794006e-06, + "loss": 1.2893, + "num_input_tokens_seen": 609382708, + "step": 15510 + }, + { + "epoch": 0.7520427385112994, + "grad_norm": 0.39012569189071655, + "learning_rate": 7.63790035309346e-06, + "loss": 1.3176, + "num_input_tokens_seen": 609759336, + "step": 15520 + }, + { + "epoch": 0.7525273021314742, + "grad_norm": 0.398588091135025, + "learning_rate": 7.609690724913901e-06, + "loss": 1.2815, + "num_input_tokens_seen": 610153132, + "step": 15530 + }, + { + "epoch": 0.753011865751649, + "grad_norm": 0.4179481565952301, + "learning_rate": 7.581523932741619e-06, + "loss": 1.2689, + "num_input_tokens_seen": 610529384, + "step": 15540 + }, + { + "epoch": 0.7534964293718238, + "grad_norm": 0.3886168599128723, + "learning_rate": 7.553400045957362e-06, + "loss": 1.2504, + "num_input_tokens_seen": 610922988, + "step": 15550 + }, + { + "epoch": 0.7539809929919986, + "grad_norm": 0.3878910541534424, + "learning_rate": 7.525319133836223e-06, + "loss": 1.2506, + "num_input_tokens_seen": 611272416, + "step": 15560 + }, + { + "epoch": 0.7544655566121734, + "grad_norm": 0.4470667541027069, + "learning_rate": 7.497281265547406e-06, + "loss": 1.241, + "num_input_tokens_seen": 611675884, + "step": 15570 + }, + { + "epoch": 0.7549501202323483, + "grad_norm": 0.440876841545105, + "learning_rate": 7.469286510154116e-06, + "loss": 1.2208, + "num_input_tokens_seen": 612084480, + "step": 15580 + }, + { + "epoch": 0.7554346838525231, + "grad_norm": 0.4724419414997101, + "learning_rate": 7.441334936613353e-06, + "loss": 1.2509, + "num_input_tokens_seen": 612451128, + "step": 15590 + }, + { + "epoch": 0.7559192474726979, + "grad_norm": 0.4163588881492615, + "learning_rate": 7.413426613775759e-06, + "loss": 1.2477, + "num_input_tokens_seen": 612831956, + "step": 15600 + }, + { + "epoch": 0.7564038110928727, + "grad_norm": 0.42442935705184937, + "learning_rate": 7.385561610385414e-06, + "loss": 1.2763, + "num_input_tokens_seen": 613196848, + "step": 15610 + }, + { + "epoch": 0.7568883747130475, + "grad_norm": 0.4455620050430298, + "learning_rate": 7.357739995079724e-06, + "loss": 1.2434, + "num_input_tokens_seen": 613631540, + "step": 15620 + }, + { + "epoch": 0.7573729383332223, + "grad_norm": 0.3932070732116699, + "learning_rate": 7.329961836389198e-06, + "loss": 1.2842, + "num_input_tokens_seen": 614045632, + "step": 15630 + }, + { + "epoch": 0.7578575019533971, + "grad_norm": 0.40935105085372925, + "learning_rate": 7.302227202737316e-06, + "loss": 1.2179, + "num_input_tokens_seen": 614417140, + "step": 15640 + }, + { + "epoch": 0.7583420655735719, + "grad_norm": 0.3958241939544678, + "learning_rate": 7.274536162440351e-06, + "loss": 1.2848, + "num_input_tokens_seen": 614833672, + "step": 15650 + }, + { + "epoch": 0.7588266291937467, + "grad_norm": 0.43931275606155396, + "learning_rate": 7.246888783707173e-06, + "loss": 1.2335, + "num_input_tokens_seen": 615222276, + "step": 15660 + }, + { + "epoch": 0.7593111928139216, + "grad_norm": 0.4984539747238159, + "learning_rate": 7.219285134639134e-06, + "loss": 1.2827, + "num_input_tokens_seen": 615624440, + "step": 15670 + }, + { + "epoch": 0.7597957564340964, + "grad_norm": 0.40793275833129883, + "learning_rate": 7.191725283229839e-06, + "loss": 1.2562, + "num_input_tokens_seen": 616014768, + "step": 15680 + }, + { + "epoch": 0.7602803200542712, + "grad_norm": 0.41619017720222473, + "learning_rate": 7.164209297365043e-06, + "loss": 1.2754, + "num_input_tokens_seen": 616391320, + "step": 15690 + }, + { + "epoch": 0.760764883674446, + "grad_norm": 0.43882516026496887, + "learning_rate": 7.136737244822422e-06, + "loss": 1.2868, + "num_input_tokens_seen": 616766344, + "step": 15700 + }, + { + "epoch": 0.7612494472946207, + "grad_norm": 0.3904348611831665, + "learning_rate": 7.109309193271454e-06, + "loss": 1.2305, + "num_input_tokens_seen": 617152204, + "step": 15710 + }, + { + "epoch": 0.7617340109147955, + "grad_norm": 0.41161251068115234, + "learning_rate": 7.081925210273227e-06, + "loss": 1.2739, + "num_input_tokens_seen": 617555064, + "step": 15720 + }, + { + "epoch": 0.7622185745349703, + "grad_norm": 0.44097068905830383, + "learning_rate": 7.054585363280286e-06, + "loss": 1.2619, + "num_input_tokens_seen": 617932060, + "step": 15730 + }, + { + "epoch": 0.7627031381551451, + "grad_norm": 0.46235036849975586, + "learning_rate": 7.027289719636437e-06, + "loss": 1.2797, + "num_input_tokens_seen": 618306600, + "step": 15740 + }, + { + "epoch": 0.7631877017753199, + "grad_norm": 0.421164870262146, + "learning_rate": 7.0000383465766345e-06, + "loss": 1.2303, + "num_input_tokens_seen": 618670828, + "step": 15750 + }, + { + "epoch": 0.7636722653954948, + "grad_norm": 0.3979032039642334, + "learning_rate": 6.972831311226758e-06, + "loss": 1.2491, + "num_input_tokens_seen": 619060076, + "step": 15760 + }, + { + "epoch": 0.7641568290156696, + "grad_norm": 0.41532114148139954, + "learning_rate": 6.945668680603487e-06, + "loss": 1.2922, + "num_input_tokens_seen": 619436564, + "step": 15770 + }, + { + "epoch": 0.7646413926358444, + "grad_norm": 0.41047605872154236, + "learning_rate": 6.918550521614137e-06, + "loss": 1.2638, + "num_input_tokens_seen": 619817996, + "step": 15780 + }, + { + "epoch": 0.7651259562560192, + "grad_norm": 0.39262655377388, + "learning_rate": 6.891476901056445e-06, + "loss": 1.2943, + "num_input_tokens_seen": 620243736, + "step": 15790 + }, + { + "epoch": 0.765610519876194, + "grad_norm": 0.4346751272678375, + "learning_rate": 6.864447885618477e-06, + "loss": 1.2818, + "num_input_tokens_seen": 620638616, + "step": 15800 + }, + { + "epoch": 0.7660950834963688, + "grad_norm": 0.39605090022087097, + "learning_rate": 6.837463541878394e-06, + "loss": 1.2827, + "num_input_tokens_seen": 621007792, + "step": 15810 + }, + { + "epoch": 0.7665796471165436, + "grad_norm": 0.45163053274154663, + "learning_rate": 6.810523936304356e-06, + "loss": 1.2719, + "num_input_tokens_seen": 621383392, + "step": 15820 + }, + { + "epoch": 0.7670642107367184, + "grad_norm": 0.38712289929389954, + "learning_rate": 6.783629135254288e-06, + "loss": 1.2749, + "num_input_tokens_seen": 621788080, + "step": 15830 + }, + { + "epoch": 0.7675487743568932, + "grad_norm": 0.417074590921402, + "learning_rate": 6.756779204975785e-06, + "loss": 1.2551, + "num_input_tokens_seen": 622190428, + "step": 15840 + }, + { + "epoch": 0.7680333379770681, + "grad_norm": 0.4212455749511719, + "learning_rate": 6.729974211605888e-06, + "loss": 1.2964, + "num_input_tokens_seen": 622563676, + "step": 15850 + }, + { + "epoch": 0.7685179015972429, + "grad_norm": 0.43890535831451416, + "learning_rate": 6.703214221170961e-06, + "loss": 1.2982, + "num_input_tokens_seen": 622940944, + "step": 15860 + }, + { + "epoch": 0.7690024652174177, + "grad_norm": 0.4360447824001312, + "learning_rate": 6.676499299586525e-06, + "loss": 1.2862, + "num_input_tokens_seen": 623355512, + "step": 15870 + }, + { + "epoch": 0.7694870288375925, + "grad_norm": 0.41077935695648193, + "learning_rate": 6.649829512657082e-06, + "loss": 1.269, + "num_input_tokens_seen": 623743220, + "step": 15880 + }, + { + "epoch": 0.7699715924577673, + "grad_norm": 0.40932127833366394, + "learning_rate": 6.623204926075938e-06, + "loss": 1.2807, + "num_input_tokens_seen": 624121016, + "step": 15890 + }, + { + "epoch": 0.770456156077942, + "grad_norm": 0.42621883749961853, + "learning_rate": 6.596625605425083e-06, + "loss": 1.2542, + "num_input_tokens_seen": 624541652, + "step": 15900 + }, + { + "epoch": 0.7709407196981168, + "grad_norm": 0.4213699400424957, + "learning_rate": 6.570091616175014e-06, + "loss": 1.2228, + "num_input_tokens_seen": 624945824, + "step": 15910 + }, + { + "epoch": 0.7714252833182916, + "grad_norm": 0.4320572018623352, + "learning_rate": 6.543603023684536e-06, + "loss": 1.2724, + "num_input_tokens_seen": 625344196, + "step": 15920 + }, + { + "epoch": 0.7719098469384664, + "grad_norm": 0.403472363948822, + "learning_rate": 6.5171598932006665e-06, + "loss": 1.3135, + "num_input_tokens_seen": 625739548, + "step": 15930 + }, + { + "epoch": 0.7723944105586413, + "grad_norm": 0.4023309051990509, + "learning_rate": 6.49076228985841e-06, + "loss": 1.261, + "num_input_tokens_seen": 626125928, + "step": 15940 + }, + { + "epoch": 0.7728789741788161, + "grad_norm": 0.39641690254211426, + "learning_rate": 6.464410278680658e-06, + "loss": 1.2736, + "num_input_tokens_seen": 626532060, + "step": 15950 + }, + { + "epoch": 0.7733635377989909, + "grad_norm": 0.39902058243751526, + "learning_rate": 6.4381039245779675e-06, + "loss": 1.2552, + "num_input_tokens_seen": 626924212, + "step": 15960 + }, + { + "epoch": 0.7738481014191657, + "grad_norm": 0.4693625271320343, + "learning_rate": 6.411843292348465e-06, + "loss": 1.31, + "num_input_tokens_seen": 627344236, + "step": 15970 + }, + { + "epoch": 0.7743326650393405, + "grad_norm": 0.4080311357975006, + "learning_rate": 6.385628446677624e-06, + "loss": 1.2568, + "num_input_tokens_seen": 627754136, + "step": 15980 + }, + { + "epoch": 0.7748172286595153, + "grad_norm": 0.4102632403373718, + "learning_rate": 6.359459452138161e-06, + "loss": 1.3095, + "num_input_tokens_seen": 628131720, + "step": 15990 + }, + { + "epoch": 0.7753017922796901, + "grad_norm": 0.4116896986961365, + "learning_rate": 6.33333637318983e-06, + "loss": 1.2585, + "num_input_tokens_seen": 628560668, + "step": 16000 + }, + { + "epoch": 0.7753017922796901, + "eval_loss": 1.3683557510375977, + "eval_runtime": 3.6064, + "eval_samples_per_second": 41.592, + "eval_steps_per_second": 5.268, + "num_input_tokens_seen": 628560668, + "step": 16000 + }, + { + "epoch": 0.7757863558998649, + "grad_norm": 0.3886633813381195, + "learning_rate": 6.3072592741793e-06, + "loss": 1.2845, + "num_input_tokens_seen": 628949040, + "step": 16010 + }, + { + "epoch": 0.7762709195200397, + "grad_norm": 0.4156300127506256, + "learning_rate": 6.28122821933998e-06, + "loss": 1.3016, + "num_input_tokens_seen": 629334480, + "step": 16020 + }, + { + "epoch": 0.7767554831402146, + "grad_norm": 0.40398070216178894, + "learning_rate": 6.255243272791858e-06, + "loss": 1.2423, + "num_input_tokens_seen": 629704276, + "step": 16030 + }, + { + "epoch": 0.7772400467603894, + "grad_norm": 0.3824484944343567, + "learning_rate": 6.2293044985413555e-06, + "loss": 1.2772, + "num_input_tokens_seen": 630086976, + "step": 16040 + }, + { + "epoch": 0.7777246103805642, + "grad_norm": 0.39248552918434143, + "learning_rate": 6.203411960481145e-06, + "loss": 1.254, + "num_input_tokens_seen": 630477336, + "step": 16050 + }, + { + "epoch": 0.778209174000739, + "grad_norm": 0.37206169962882996, + "learning_rate": 6.17756572239003e-06, + "loss": 1.2717, + "num_input_tokens_seen": 630871612, + "step": 16060 + }, + { + "epoch": 0.7786937376209138, + "grad_norm": 0.40856611728668213, + "learning_rate": 6.151765847932747e-06, + "loss": 1.2494, + "num_input_tokens_seen": 631248400, + "step": 16070 + }, + { + "epoch": 0.7791783012410886, + "grad_norm": 0.40056514739990234, + "learning_rate": 6.126012400659856e-06, + "loss": 1.2542, + "num_input_tokens_seen": 631641360, + "step": 16080 + }, + { + "epoch": 0.7796628648612633, + "grad_norm": 0.4258357882499695, + "learning_rate": 6.1003054440075205e-06, + "loss": 1.2451, + "num_input_tokens_seen": 632006744, + "step": 16090 + }, + { + "epoch": 0.7801474284814381, + "grad_norm": 0.39988163113594055, + "learning_rate": 6.074645041297425e-06, + "loss": 1.2772, + "num_input_tokens_seen": 632371660, + "step": 16100 + }, + { + "epoch": 0.7806319921016129, + "grad_norm": 0.44167232513427734, + "learning_rate": 6.049031255736548e-06, + "loss": 1.2338, + "num_input_tokens_seen": 632740436, + "step": 16110 + }, + { + "epoch": 0.7811165557217878, + "grad_norm": 0.3898947536945343, + "learning_rate": 6.023464150417077e-06, + "loss": 1.2255, + "num_input_tokens_seen": 633086408, + "step": 16120 + }, + { + "epoch": 0.7816011193419626, + "grad_norm": 0.43151912093162537, + "learning_rate": 5.997943788316179e-06, + "loss": 1.2518, + "num_input_tokens_seen": 633479808, + "step": 16130 + }, + { + "epoch": 0.7820856829621374, + "grad_norm": 0.39515140652656555, + "learning_rate": 5.972470232295907e-06, + "loss": 1.2406, + "num_input_tokens_seen": 633863396, + "step": 16140 + }, + { + "epoch": 0.7825702465823122, + "grad_norm": 0.4262303411960602, + "learning_rate": 5.947043545103012e-06, + "loss": 1.2579, + "num_input_tokens_seen": 634250168, + "step": 16150 + }, + { + "epoch": 0.783054810202487, + "grad_norm": 0.40577808022499084, + "learning_rate": 5.921663789368806e-06, + "loss": 1.3019, + "num_input_tokens_seen": 634607024, + "step": 16160 + }, + { + "epoch": 0.7835393738226618, + "grad_norm": 0.42164546251296997, + "learning_rate": 5.896331027608978e-06, + "loss": 1.2479, + "num_input_tokens_seen": 635005404, + "step": 16170 + }, + { + "epoch": 0.7840239374428366, + "grad_norm": 0.4001612663269043, + "learning_rate": 5.871045322223481e-06, + "loss": 1.2902, + "num_input_tokens_seen": 635356840, + "step": 16180 + }, + { + "epoch": 0.7845085010630114, + "grad_norm": 0.40439048409461975, + "learning_rate": 5.845806735496362e-06, + "loss": 1.2674, + "num_input_tokens_seen": 635733620, + "step": 16190 + }, + { + "epoch": 0.7849930646831863, + "grad_norm": 0.37555184960365295, + "learning_rate": 5.820615329595575e-06, + "loss": 1.2414, + "num_input_tokens_seen": 636132896, + "step": 16200 + }, + { + "epoch": 0.7854776283033611, + "grad_norm": 0.4048125147819519, + "learning_rate": 5.795471166572894e-06, + "loss": 1.2734, + "num_input_tokens_seen": 636522108, + "step": 16210 + }, + { + "epoch": 0.7859621919235359, + "grad_norm": 0.3880002200603485, + "learning_rate": 5.770374308363693e-06, + "loss": 1.2619, + "num_input_tokens_seen": 636887204, + "step": 16220 + }, + { + "epoch": 0.7864467555437107, + "grad_norm": 0.4102485477924347, + "learning_rate": 5.745324816786854e-06, + "loss": 1.235, + "num_input_tokens_seen": 637306444, + "step": 16230 + }, + { + "epoch": 0.7869313191638855, + "grad_norm": 0.42181381583213806, + "learning_rate": 5.720322753544549e-06, + "loss": 1.2281, + "num_input_tokens_seen": 637683960, + "step": 16240 + }, + { + "epoch": 0.7874158827840603, + "grad_norm": 0.40080034732818604, + "learning_rate": 5.695368180222163e-06, + "loss": 1.1679, + "num_input_tokens_seen": 638089180, + "step": 16250 + }, + { + "epoch": 0.7879004464042351, + "grad_norm": 0.39268258213996887, + "learning_rate": 5.670461158288071e-06, + "loss": 1.2564, + "num_input_tokens_seen": 638480792, + "step": 16260 + }, + { + "epoch": 0.7883850100244099, + "grad_norm": 0.3921567499637604, + "learning_rate": 5.6456017490935405e-06, + "loss": 1.2426, + "num_input_tokens_seen": 638897224, + "step": 16270 + }, + { + "epoch": 0.7888695736445847, + "grad_norm": 0.3720284402370453, + "learning_rate": 5.620790013872543e-06, + "loss": 1.2322, + "num_input_tokens_seen": 639288872, + "step": 16280 + }, + { + "epoch": 0.7893541372647596, + "grad_norm": 0.41209957003593445, + "learning_rate": 5.596026013741631e-06, + "loss": 1.3017, + "num_input_tokens_seen": 639672792, + "step": 16290 + }, + { + "epoch": 0.7898387008849344, + "grad_norm": 0.43729230761528015, + "learning_rate": 5.571309809699771e-06, + "loss": 1.3003, + "num_input_tokens_seen": 640077948, + "step": 16300 + }, + { + "epoch": 0.7903232645051091, + "grad_norm": 0.3936404883861542, + "learning_rate": 5.546641462628194e-06, + "loss": 1.3, + "num_input_tokens_seen": 640466812, + "step": 16310 + }, + { + "epoch": 0.7908078281252839, + "grad_norm": 0.396609365940094, + "learning_rate": 5.522021033290265e-06, + "loss": 1.2776, + "num_input_tokens_seen": 640869644, + "step": 16320 + }, + { + "epoch": 0.7912923917454587, + "grad_norm": 0.39924660325050354, + "learning_rate": 5.4974485823312885e-06, + "loss": 1.2602, + "num_input_tokens_seen": 641267224, + "step": 16330 + }, + { + "epoch": 0.7917769553656335, + "grad_norm": 0.4168340563774109, + "learning_rate": 5.472924170278418e-06, + "loss": 1.2327, + "num_input_tokens_seen": 641650820, + "step": 16340 + }, + { + "epoch": 0.7922615189858083, + "grad_norm": 0.3994021415710449, + "learning_rate": 5.448447857540453e-06, + "loss": 1.257, + "num_input_tokens_seen": 642044792, + "step": 16350 + }, + { + "epoch": 0.7927460826059831, + "grad_norm": 0.40958625078201294, + "learning_rate": 5.424019704407735e-06, + "loss": 1.277, + "num_input_tokens_seen": 642438212, + "step": 16360 + }, + { + "epoch": 0.7932306462261579, + "grad_norm": 0.4128318428993225, + "learning_rate": 5.3996397710519565e-06, + "loss": 1.2549, + "num_input_tokens_seen": 642855308, + "step": 16370 + }, + { + "epoch": 0.7937152098463328, + "grad_norm": 0.38638776540756226, + "learning_rate": 5.37530811752606e-06, + "loss": 1.2651, + "num_input_tokens_seen": 643244872, + "step": 16380 + }, + { + "epoch": 0.7941997734665076, + "grad_norm": 0.41983383893966675, + "learning_rate": 5.351024803764035e-06, + "loss": 1.2892, + "num_input_tokens_seen": 643614276, + "step": 16390 + }, + { + "epoch": 0.7946843370866824, + "grad_norm": 0.41671621799468994, + "learning_rate": 5.32678988958083e-06, + "loss": 1.2707, + "num_input_tokens_seen": 644000564, + "step": 16400 + }, + { + "epoch": 0.7951689007068572, + "grad_norm": 0.4639870822429657, + "learning_rate": 5.302603434672149e-06, + "loss": 1.2279, + "num_input_tokens_seen": 644383116, + "step": 16410 + }, + { + "epoch": 0.795653464327032, + "grad_norm": 0.4280005693435669, + "learning_rate": 5.278465498614349e-06, + "loss": 1.2616, + "num_input_tokens_seen": 644805824, + "step": 16420 + }, + { + "epoch": 0.7961380279472068, + "grad_norm": 0.45124003291130066, + "learning_rate": 5.254376140864273e-06, + "loss": 1.2564, + "num_input_tokens_seen": 645207656, + "step": 16430 + }, + { + "epoch": 0.7966225915673816, + "grad_norm": 0.42907872796058655, + "learning_rate": 5.230335420759089e-06, + "loss": 1.2791, + "num_input_tokens_seen": 645619040, + "step": 16440 + }, + { + "epoch": 0.7971071551875564, + "grad_norm": 0.4425942599773407, + "learning_rate": 5.206343397516178e-06, + "loss": 1.2267, + "num_input_tokens_seen": 646010824, + "step": 16450 + }, + { + "epoch": 0.7975917188077312, + "grad_norm": 0.41618916392326355, + "learning_rate": 5.182400130232962e-06, + "loss": 1.2694, + "num_input_tokens_seen": 646440088, + "step": 16460 + }, + { + "epoch": 0.7980762824279061, + "grad_norm": 0.3770136833190918, + "learning_rate": 5.1585056778867766e-06, + "loss": 1.2545, + "num_input_tokens_seen": 646840628, + "step": 16470 + }, + { + "epoch": 0.7985608460480809, + "grad_norm": 0.39917248487472534, + "learning_rate": 5.134660099334699e-06, + "loss": 1.2714, + "num_input_tokens_seen": 647216352, + "step": 16480 + }, + { + "epoch": 0.7990454096682557, + "grad_norm": 0.41502079367637634, + "learning_rate": 5.110863453313436e-06, + "loss": 1.2083, + "num_input_tokens_seen": 647613200, + "step": 16490 + }, + { + "epoch": 0.7995299732884305, + "grad_norm": 0.43602079153060913, + "learning_rate": 5.087115798439146e-06, + "loss": 1.2545, + "num_input_tokens_seen": 647990596, + "step": 16500 + }, + { + "epoch": 0.8000145369086052, + "grad_norm": 0.3962024748325348, + "learning_rate": 5.063417193207337e-06, + "loss": 1.2527, + "num_input_tokens_seen": 648402756, + "step": 16510 + }, + { + "epoch": 0.80049910052878, + "grad_norm": 0.4318599998950958, + "learning_rate": 5.039767695992664e-06, + "loss": 1.2438, + "num_input_tokens_seen": 648791928, + "step": 16520 + }, + { + "epoch": 0.8009836641489548, + "grad_norm": 0.41063135862350464, + "learning_rate": 5.016167365048857e-06, + "loss": 1.2647, + "num_input_tokens_seen": 649194520, + "step": 16530 + }, + { + "epoch": 0.8014682277691296, + "grad_norm": 0.419166624546051, + "learning_rate": 4.992616258508501e-06, + "loss": 1.2353, + "num_input_tokens_seen": 649580868, + "step": 16540 + }, + { + "epoch": 0.8019527913893044, + "grad_norm": 0.4164583683013916, + "learning_rate": 4.969114434382966e-06, + "loss": 1.2328, + "num_input_tokens_seen": 649985136, + "step": 16550 + }, + { + "epoch": 0.8024373550094793, + "grad_norm": 0.41067010164260864, + "learning_rate": 4.945661950562195e-06, + "loss": 1.2201, + "num_input_tokens_seen": 650381492, + "step": 16560 + }, + { + "epoch": 0.8029219186296541, + "grad_norm": 0.38203445076942444, + "learning_rate": 4.922258864814619e-06, + "loss": 1.242, + "num_input_tokens_seen": 650781812, + "step": 16570 + }, + { + "epoch": 0.8034064822498289, + "grad_norm": 0.3730851709842682, + "learning_rate": 4.8989052347869876e-06, + "loss": 1.2403, + "num_input_tokens_seen": 651179148, + "step": 16580 + }, + { + "epoch": 0.8038910458700037, + "grad_norm": 0.38860711455345154, + "learning_rate": 4.875601118004228e-06, + "loss": 1.2151, + "num_input_tokens_seen": 651589328, + "step": 16590 + }, + { + "epoch": 0.8043756094901785, + "grad_norm": 0.437386691570282, + "learning_rate": 4.852346571869307e-06, + "loss": 1.2417, + "num_input_tokens_seen": 651973144, + "step": 16600 + }, + { + "epoch": 0.8048601731103533, + "grad_norm": 0.3950743079185486, + "learning_rate": 4.8291416536630805e-06, + "loss": 1.2422, + "num_input_tokens_seen": 652374852, + "step": 16610 + }, + { + "epoch": 0.8053447367305281, + "grad_norm": 0.4481166899204254, + "learning_rate": 4.805986420544173e-06, + "loss": 1.2728, + "num_input_tokens_seen": 652767160, + "step": 16620 + }, + { + "epoch": 0.8058293003507029, + "grad_norm": 0.4067986011505127, + "learning_rate": 4.782880929548808e-06, + "loss": 1.2141, + "num_input_tokens_seen": 653149252, + "step": 16630 + }, + { + "epoch": 0.8063138639708777, + "grad_norm": 0.44263729453086853, + "learning_rate": 4.7598252375907035e-06, + "loss": 1.2469, + "num_input_tokens_seen": 653534468, + "step": 16640 + }, + { + "epoch": 0.8067984275910526, + "grad_norm": 0.4220990836620331, + "learning_rate": 4.736819401460893e-06, + "loss": 1.2297, + "num_input_tokens_seen": 653904180, + "step": 16650 + }, + { + "epoch": 0.8072829912112274, + "grad_norm": 0.4025983512401581, + "learning_rate": 4.713863477827626e-06, + "loss": 1.2693, + "num_input_tokens_seen": 654309572, + "step": 16660 + }, + { + "epoch": 0.8077675548314022, + "grad_norm": 0.44055184721946716, + "learning_rate": 4.690957523236178e-06, + "loss": 1.2899, + "num_input_tokens_seen": 654738392, + "step": 16670 + }, + { + "epoch": 0.808252118451577, + "grad_norm": 0.43497493863105774, + "learning_rate": 4.66810159410877e-06, + "loss": 1.2421, + "num_input_tokens_seen": 655115200, + "step": 16680 + }, + { + "epoch": 0.8087366820717518, + "grad_norm": 0.38774144649505615, + "learning_rate": 4.645295746744374e-06, + "loss": 1.2939, + "num_input_tokens_seen": 655499648, + "step": 16690 + }, + { + "epoch": 0.8092212456919265, + "grad_norm": 0.41730353236198425, + "learning_rate": 4.622540037318618e-06, + "loss": 1.2312, + "num_input_tokens_seen": 655885096, + "step": 16700 + }, + { + "epoch": 0.8097058093121013, + "grad_norm": 0.4214309751987457, + "learning_rate": 4.5998345218836304e-06, + "loss": 1.2738, + "num_input_tokens_seen": 656267208, + "step": 16710 + }, + { + "epoch": 0.8101903729322761, + "grad_norm": 0.40911537408828735, + "learning_rate": 4.577179256367886e-06, + "loss": 1.2121, + "num_input_tokens_seen": 656652956, + "step": 16720 + }, + { + "epoch": 0.8106749365524509, + "grad_norm": 0.4146646559238434, + "learning_rate": 4.554574296576092e-06, + "loss": 1.2153, + "num_input_tokens_seen": 657038508, + "step": 16730 + }, + { + "epoch": 0.8111595001726258, + "grad_norm": 0.430595338344574, + "learning_rate": 4.532019698189044e-06, + "loss": 1.2613, + "num_input_tokens_seen": 657434028, + "step": 16740 + }, + { + "epoch": 0.8116440637928006, + "grad_norm": 0.3903404772281647, + "learning_rate": 4.509515516763493e-06, + "loss": 1.2283, + "num_input_tokens_seen": 657845788, + "step": 16750 + }, + { + "epoch": 0.8121286274129754, + "grad_norm": 0.4230201840400696, + "learning_rate": 4.487061807731982e-06, + "loss": 1.2466, + "num_input_tokens_seen": 658242124, + "step": 16760 + }, + { + "epoch": 0.8126131910331502, + "grad_norm": 0.39636996388435364, + "learning_rate": 4.464658626402751e-06, + "loss": 1.2527, + "num_input_tokens_seen": 658637400, + "step": 16770 + }, + { + "epoch": 0.813097754653325, + "grad_norm": 0.4210509657859802, + "learning_rate": 4.442306027959564e-06, + "loss": 1.2837, + "num_input_tokens_seen": 659032724, + "step": 16780 + }, + { + "epoch": 0.8135823182734998, + "grad_norm": 0.4173252582550049, + "learning_rate": 4.4200040674616e-06, + "loss": 1.2855, + "num_input_tokens_seen": 659452924, + "step": 16790 + }, + { + "epoch": 0.8140668818936746, + "grad_norm": 0.43643826246261597, + "learning_rate": 4.397752799843294e-06, + "loss": 1.2185, + "num_input_tokens_seen": 659860564, + "step": 16800 + }, + { + "epoch": 0.8145514455138494, + "grad_norm": 0.4406135380268097, + "learning_rate": 4.375552279914233e-06, + "loss": 1.3007, + "num_input_tokens_seen": 660247376, + "step": 16810 + }, + { + "epoch": 0.8150360091340242, + "grad_norm": 0.41418373584747314, + "learning_rate": 4.353402562358977e-06, + "loss": 1.2831, + "num_input_tokens_seen": 660644512, + "step": 16820 + }, + { + "epoch": 0.8155205727541991, + "grad_norm": 0.3968205749988556, + "learning_rate": 4.331303701736969e-06, + "loss": 1.2532, + "num_input_tokens_seen": 661014408, + "step": 16830 + }, + { + "epoch": 0.8160051363743739, + "grad_norm": 0.4220852255821228, + "learning_rate": 4.309255752482378e-06, + "loss": 1.2457, + "num_input_tokens_seen": 661407992, + "step": 16840 + }, + { + "epoch": 0.8164896999945487, + "grad_norm": 0.418720543384552, + "learning_rate": 4.2872587689039484e-06, + "loss": 1.2173, + "num_input_tokens_seen": 661784464, + "step": 16850 + }, + { + "epoch": 0.8169742636147235, + "grad_norm": 0.38268375396728516, + "learning_rate": 4.265312805184909e-06, + "loss": 1.237, + "num_input_tokens_seen": 662147388, + "step": 16860 + }, + { + "epoch": 0.8174588272348983, + "grad_norm": 0.40058982372283936, + "learning_rate": 4.24341791538281e-06, + "loss": 1.2639, + "num_input_tokens_seen": 662530408, + "step": 16870 + }, + { + "epoch": 0.8179433908550731, + "grad_norm": 0.4105454981327057, + "learning_rate": 4.221574153429392e-06, + "loss": 1.2431, + "num_input_tokens_seen": 662921336, + "step": 16880 + }, + { + "epoch": 0.8184279544752479, + "grad_norm": 0.41436707973480225, + "learning_rate": 4.1997815731304515e-06, + "loss": 1.2628, + "num_input_tokens_seen": 663310688, + "step": 16890 + }, + { + "epoch": 0.8189125180954226, + "grad_norm": 0.39193400740623474, + "learning_rate": 4.178040228165725e-06, + "loss": 1.2385, + "num_input_tokens_seen": 663687484, + "step": 16900 + }, + { + "epoch": 0.8193970817155974, + "grad_norm": 0.4802253246307373, + "learning_rate": 4.156350172088736e-06, + "loss": 1.2784, + "num_input_tokens_seen": 664078036, + "step": 16910 + }, + { + "epoch": 0.8198816453357723, + "grad_norm": 0.4015842080116272, + "learning_rate": 4.134711458326681e-06, + "loss": 1.2329, + "num_input_tokens_seen": 664483676, + "step": 16920 + }, + { + "epoch": 0.8203662089559471, + "grad_norm": 0.40182778239250183, + "learning_rate": 4.11312414018028e-06, + "loss": 1.2525, + "num_input_tokens_seen": 664851144, + "step": 16930 + }, + { + "epoch": 0.8208507725761219, + "grad_norm": 0.4445132315158844, + "learning_rate": 4.091588270823671e-06, + "loss": 1.2506, + "num_input_tokens_seen": 665274412, + "step": 16940 + }, + { + "epoch": 0.8213353361962967, + "grad_norm": 0.43962743878364563, + "learning_rate": 4.070103903304237e-06, + "loss": 1.299, + "num_input_tokens_seen": 665663080, + "step": 16950 + }, + { + "epoch": 0.8218198998164715, + "grad_norm": 0.43457552790641785, + "learning_rate": 4.048671090542522e-06, + "loss": 1.2759, + "num_input_tokens_seen": 666049536, + "step": 16960 + }, + { + "epoch": 0.8223044634366463, + "grad_norm": 0.44402769207954407, + "learning_rate": 4.0272898853320835e-06, + "loss": 1.2648, + "num_input_tokens_seen": 666424352, + "step": 16970 + }, + { + "epoch": 0.8227890270568211, + "grad_norm": 0.4204462468624115, + "learning_rate": 4.005960340339335e-06, + "loss": 1.2761, + "num_input_tokens_seen": 666812808, + "step": 16980 + }, + { + "epoch": 0.8232735906769959, + "grad_norm": 0.4030398428440094, + "learning_rate": 3.984682508103466e-06, + "loss": 1.2865, + "num_input_tokens_seen": 667198772, + "step": 16990 + }, + { + "epoch": 0.8237581542971708, + "grad_norm": 0.3986146152019501, + "learning_rate": 3.963456441036259e-06, + "loss": 1.247, + "num_input_tokens_seen": 667609040, + "step": 17000 + }, + { + "epoch": 0.8242427179173456, + "grad_norm": 0.4464845061302185, + "learning_rate": 3.942282191422017e-06, + "loss": 1.2972, + "num_input_tokens_seen": 668009280, + "step": 17010 + }, + { + "epoch": 0.8247272815375204, + "grad_norm": 0.37678518891334534, + "learning_rate": 3.9211598114173855e-06, + "loss": 1.2387, + "num_input_tokens_seen": 668416796, + "step": 17020 + }, + { + "epoch": 0.8252118451576952, + "grad_norm": 0.38661307096481323, + "learning_rate": 3.900089353051259e-06, + "loss": 1.2197, + "num_input_tokens_seen": 668795620, + "step": 17030 + }, + { + "epoch": 0.82569640877787, + "grad_norm": 0.3967253267765045, + "learning_rate": 3.879070868224616e-06, + "loss": 1.2319, + "num_input_tokens_seen": 669174880, + "step": 17040 + }, + { + "epoch": 0.8261809723980448, + "grad_norm": 0.45880743861198425, + "learning_rate": 3.858104408710445e-06, + "loss": 1.2629, + "num_input_tokens_seen": 669581256, + "step": 17050 + }, + { + "epoch": 0.8266655360182196, + "grad_norm": 0.40062230825424194, + "learning_rate": 3.837190026153548e-06, + "loss": 1.3047, + "num_input_tokens_seen": 669998664, + "step": 17060 + }, + { + "epoch": 0.8271500996383944, + "grad_norm": 0.42311957478523254, + "learning_rate": 3.816327772070483e-06, + "loss": 1.2558, + "num_input_tokens_seen": 670389116, + "step": 17070 + }, + { + "epoch": 0.8276346632585692, + "grad_norm": 0.41024044156074524, + "learning_rate": 3.7955176978493822e-06, + "loss": 1.2791, + "num_input_tokens_seen": 670805700, + "step": 17080 + }, + { + "epoch": 0.8281192268787441, + "grad_norm": 0.4341394305229187, + "learning_rate": 3.7747598547498682e-06, + "loss": 1.229, + "num_input_tokens_seen": 671163656, + "step": 17090 + }, + { + "epoch": 0.8286037904989189, + "grad_norm": 0.42642325162887573, + "learning_rate": 3.754054293902884e-06, + "loss": 1.2721, + "num_input_tokens_seen": 671556364, + "step": 17100 + }, + { + "epoch": 0.8290883541190937, + "grad_norm": 0.41226011514663696, + "learning_rate": 3.7334010663106044e-06, + "loss": 1.2968, + "num_input_tokens_seen": 671957808, + "step": 17110 + }, + { + "epoch": 0.8295729177392684, + "grad_norm": 0.4139423966407776, + "learning_rate": 3.712800222846302e-06, + "loss": 1.2627, + "num_input_tokens_seen": 672364408, + "step": 17120 + }, + { + "epoch": 0.8300574813594432, + "grad_norm": 0.4178198575973511, + "learning_rate": 3.6922518142541994e-06, + "loss": 1.2778, + "num_input_tokens_seen": 672770620, + "step": 17130 + }, + { + "epoch": 0.830542044979618, + "grad_norm": 0.43971824645996094, + "learning_rate": 3.6717558911493784e-06, + "loss": 1.1902, + "num_input_tokens_seen": 673162284, + "step": 17140 + }, + { + "epoch": 0.8310266085997928, + "grad_norm": 0.40751203894615173, + "learning_rate": 3.6513125040176205e-06, + "loss": 1.2592, + "num_input_tokens_seen": 673552816, + "step": 17150 + }, + { + "epoch": 0.8315111722199676, + "grad_norm": 0.3929533064365387, + "learning_rate": 3.6309217032153093e-06, + "loss": 1.2793, + "num_input_tokens_seen": 673963224, + "step": 17160 + }, + { + "epoch": 0.8319957358401424, + "grad_norm": 0.42938390374183655, + "learning_rate": 3.610583538969306e-06, + "loss": 1.2622, + "num_input_tokens_seen": 674355136, + "step": 17170 + }, + { + "epoch": 0.8324802994603173, + "grad_norm": 0.42667803168296814, + "learning_rate": 3.590298061376804e-06, + "loss": 1.3012, + "num_input_tokens_seen": 674750972, + "step": 17180 + }, + { + "epoch": 0.8329648630804921, + "grad_norm": 0.4376852214336395, + "learning_rate": 3.5700653204052167e-06, + "loss": 1.3022, + "num_input_tokens_seen": 675185324, + "step": 17190 + }, + { + "epoch": 0.8334494267006669, + "grad_norm": 0.40134289860725403, + "learning_rate": 3.5498853658920695e-06, + "loss": 1.2425, + "num_input_tokens_seen": 675605452, + "step": 17200 + }, + { + "epoch": 0.8339339903208417, + "grad_norm": 0.4355108439922333, + "learning_rate": 3.5297582475448483e-06, + "loss": 1.2516, + "num_input_tokens_seen": 676001164, + "step": 17210 + }, + { + "epoch": 0.8344185539410165, + "grad_norm": 0.3797595798969269, + "learning_rate": 3.509684014940906e-06, + "loss": 1.2325, + "num_input_tokens_seen": 676377212, + "step": 17220 + }, + { + "epoch": 0.8349031175611913, + "grad_norm": 0.42660579085350037, + "learning_rate": 3.489662717527312e-06, + "loss": 1.2337, + "num_input_tokens_seen": 676770320, + "step": 17230 + }, + { + "epoch": 0.8353876811813661, + "grad_norm": 0.39556610584259033, + "learning_rate": 3.4696944046207574e-06, + "loss": 1.3193, + "num_input_tokens_seen": 677180796, + "step": 17240 + }, + { + "epoch": 0.8358722448015409, + "grad_norm": 0.3952700197696686, + "learning_rate": 3.449779125407426e-06, + "loss": 1.2574, + "num_input_tokens_seen": 677593456, + "step": 17250 + }, + { + "epoch": 0.8363568084217157, + "grad_norm": 0.40727055072784424, + "learning_rate": 3.4299169289428446e-06, + "loss": 1.2599, + "num_input_tokens_seen": 677958328, + "step": 17260 + }, + { + "epoch": 0.8368413720418906, + "grad_norm": 0.4267754852771759, + "learning_rate": 3.4101078641518124e-06, + "loss": 1.2312, + "num_input_tokens_seen": 678375748, + "step": 17270 + }, + { + "epoch": 0.8373259356620654, + "grad_norm": 0.3855496644973755, + "learning_rate": 3.3903519798282353e-06, + "loss": 1.2167, + "num_input_tokens_seen": 678754312, + "step": 17280 + }, + { + "epoch": 0.8378104992822402, + "grad_norm": 0.39305436611175537, + "learning_rate": 3.3706493246350335e-06, + "loss": 1.2569, + "num_input_tokens_seen": 679131624, + "step": 17290 + }, + { + "epoch": 0.838295062902415, + "grad_norm": 0.395431786775589, + "learning_rate": 3.3509999471040136e-06, + "loss": 1.2482, + "num_input_tokens_seen": 679524344, + "step": 17300 + }, + { + "epoch": 0.8387796265225897, + "grad_norm": 0.4162997901439667, + "learning_rate": 3.3314038956357514e-06, + "loss": 1.2221, + "num_input_tokens_seen": 679924936, + "step": 17310 + }, + { + "epoch": 0.8392641901427645, + "grad_norm": 0.42025381326675415, + "learning_rate": 3.3118612184994485e-06, + "loss": 1.2243, + "num_input_tokens_seen": 680326700, + "step": 17320 + }, + { + "epoch": 0.8397487537629393, + "grad_norm": 0.46221596002578735, + "learning_rate": 3.292371963832863e-06, + "loss": 1.2707, + "num_input_tokens_seen": 680740540, + "step": 17330 + }, + { + "epoch": 0.8402333173831141, + "grad_norm": 0.40451374650001526, + "learning_rate": 3.2729361796421387e-06, + "loss": 1.2341, + "num_input_tokens_seen": 681129484, + "step": 17340 + }, + { + "epoch": 0.8407178810032889, + "grad_norm": 0.43298593163490295, + "learning_rate": 3.253553913801727e-06, + "loss": 1.305, + "num_input_tokens_seen": 681512368, + "step": 17350 + }, + { + "epoch": 0.8412024446234638, + "grad_norm": 0.4344673156738281, + "learning_rate": 3.2342252140542373e-06, + "loss": 1.3109, + "num_input_tokens_seen": 681938304, + "step": 17360 + }, + { + "epoch": 0.8416870082436386, + "grad_norm": 0.39718765020370483, + "learning_rate": 3.2149501280103466e-06, + "loss": 1.2778, + "num_input_tokens_seen": 682342636, + "step": 17370 + }, + { + "epoch": 0.8421715718638134, + "grad_norm": 0.4262661933898926, + "learning_rate": 3.195728703148673e-06, + "loss": 1.2626, + "num_input_tokens_seen": 682748112, + "step": 17380 + }, + { + "epoch": 0.8426561354839882, + "grad_norm": 0.4599827826023102, + "learning_rate": 3.1765609868156325e-06, + "loss": 1.285, + "num_input_tokens_seen": 683133464, + "step": 17390 + }, + { + "epoch": 0.843140699104163, + "grad_norm": 0.3915078639984131, + "learning_rate": 3.1574470262253795e-06, + "loss": 1.2881, + "num_input_tokens_seen": 683506720, + "step": 17400 + }, + { + "epoch": 0.8436252627243378, + "grad_norm": 0.4697626233100891, + "learning_rate": 3.138386868459622e-06, + "loss": 1.2346, + "num_input_tokens_seen": 683894420, + "step": 17410 + }, + { + "epoch": 0.8441098263445126, + "grad_norm": 0.4377289116382599, + "learning_rate": 3.119380560467572e-06, + "loss": 1.2772, + "num_input_tokens_seen": 684275120, + "step": 17420 + }, + { + "epoch": 0.8445943899646874, + "grad_norm": 0.4258815050125122, + "learning_rate": 3.1004281490657703e-06, + "loss": 1.2482, + "num_input_tokens_seen": 684658984, + "step": 17430 + }, + { + "epoch": 0.8450789535848622, + "grad_norm": 0.4437004625797272, + "learning_rate": 3.0815296809380167e-06, + "loss": 1.2674, + "num_input_tokens_seen": 685068552, + "step": 17440 + }, + { + "epoch": 0.8455635172050371, + "grad_norm": 0.3915651738643646, + "learning_rate": 3.0626852026352347e-06, + "loss": 1.2626, + "num_input_tokens_seen": 685445348, + "step": 17450 + }, + { + "epoch": 0.8460480808252119, + "grad_norm": 0.4123501777648926, + "learning_rate": 3.043894760575358e-06, + "loss": 1.2912, + "num_input_tokens_seen": 685845820, + "step": 17460 + }, + { + "epoch": 0.8465326444453867, + "grad_norm": 0.4246242344379425, + "learning_rate": 3.0251584010432127e-06, + "loss": 1.2455, + "num_input_tokens_seen": 686224024, + "step": 17470 + }, + { + "epoch": 0.8470172080655615, + "grad_norm": 0.43629172444343567, + "learning_rate": 3.00647617019042e-06, + "loss": 1.2333, + "num_input_tokens_seen": 686627324, + "step": 17480 + }, + { + "epoch": 0.8475017716857363, + "grad_norm": 0.3956005871295929, + "learning_rate": 2.9878481140352495e-06, + "loss": 1.2989, + "num_input_tokens_seen": 687019208, + "step": 17490 + }, + { + "epoch": 0.847986335305911, + "grad_norm": 0.434689462184906, + "learning_rate": 2.96927427846255e-06, + "loss": 1.2314, + "num_input_tokens_seen": 687411440, + "step": 17500 + }, + { + "epoch": 0.8484708989260858, + "grad_norm": 0.3737858831882477, + "learning_rate": 2.9507547092236075e-06, + "loss": 1.2527, + "num_input_tokens_seen": 687806800, + "step": 17510 + }, + { + "epoch": 0.8489554625462606, + "grad_norm": 0.4112042784690857, + "learning_rate": 2.9322894519360237e-06, + "loss": 1.249, + "num_input_tokens_seen": 688225164, + "step": 17520 + }, + { + "epoch": 0.8494400261664354, + "grad_norm": 0.41528812050819397, + "learning_rate": 2.913878552083646e-06, + "loss": 1.3127, + "num_input_tokens_seen": 688617932, + "step": 17530 + }, + { + "epoch": 0.8499245897866103, + "grad_norm": 0.43437185883522034, + "learning_rate": 2.895522055016395e-06, + "loss": 1.2433, + "num_input_tokens_seen": 689007680, + "step": 17540 + }, + { + "epoch": 0.8504091534067851, + "grad_norm": 0.39245957136154175, + "learning_rate": 2.8772200059502153e-06, + "loss": 1.2776, + "num_input_tokens_seen": 689376876, + "step": 17550 + }, + { + "epoch": 0.8508937170269599, + "grad_norm": 0.44739076495170593, + "learning_rate": 2.8589724499669122e-06, + "loss": 1.2425, + "num_input_tokens_seen": 689774952, + "step": 17560 + }, + { + "epoch": 0.8513782806471347, + "grad_norm": 0.400081604719162, + "learning_rate": 2.840779432014079e-06, + "loss": 1.253, + "num_input_tokens_seen": 690191364, + "step": 17570 + }, + { + "epoch": 0.8518628442673095, + "grad_norm": 0.43295279145240784, + "learning_rate": 2.8226409969049627e-06, + "loss": 1.24, + "num_input_tokens_seen": 690593376, + "step": 17580 + }, + { + "epoch": 0.8523474078874843, + "grad_norm": 0.4127790927886963, + "learning_rate": 2.804557189318366e-06, + "loss": 1.2606, + "num_input_tokens_seen": 691002516, + "step": 17590 + }, + { + "epoch": 0.8528319715076591, + "grad_norm": 0.40065881609916687, + "learning_rate": 2.7865280537985233e-06, + "loss": 1.2413, + "num_input_tokens_seen": 691404960, + "step": 17600 + }, + { + "epoch": 0.8533165351278339, + "grad_norm": 0.45817670226097107, + "learning_rate": 2.768553634755011e-06, + "loss": 1.2612, + "num_input_tokens_seen": 691797660, + "step": 17610 + }, + { + "epoch": 0.8538010987480087, + "grad_norm": 0.36435696482658386, + "learning_rate": 2.750633976462616e-06, + "loss": 1.2247, + "num_input_tokens_seen": 692207740, + "step": 17620 + }, + { + "epoch": 0.8542856623681836, + "grad_norm": 0.41439151763916016, + "learning_rate": 2.732769123061249e-06, + "loss": 1.271, + "num_input_tokens_seen": 692585708, + "step": 17630 + }, + { + "epoch": 0.8547702259883584, + "grad_norm": 0.4093399941921234, + "learning_rate": 2.714959118555821e-06, + "loss": 1.2442, + "num_input_tokens_seen": 692977756, + "step": 17640 + }, + { + "epoch": 0.8552547896085332, + "grad_norm": 0.44113224744796753, + "learning_rate": 2.697204006816131e-06, + "loss": 1.2601, + "num_input_tokens_seen": 693350796, + "step": 17650 + }, + { + "epoch": 0.855739353228708, + "grad_norm": 0.40980538725852966, + "learning_rate": 2.6795038315767824e-06, + "loss": 1.2668, + "num_input_tokens_seen": 693759588, + "step": 17660 + }, + { + "epoch": 0.8562239168488828, + "grad_norm": 0.42793384194374084, + "learning_rate": 2.661858636437034e-06, + "loss": 1.2285, + "num_input_tokens_seen": 694174680, + "step": 17670 + }, + { + "epoch": 0.8567084804690576, + "grad_norm": 0.4959481954574585, + "learning_rate": 2.644268464860741e-06, + "loss": 1.2491, + "num_input_tokens_seen": 694561320, + "step": 17680 + }, + { + "epoch": 0.8571930440892324, + "grad_norm": 0.40092337131500244, + "learning_rate": 2.6267333601762088e-06, + "loss": 1.2301, + "num_input_tokens_seen": 694959912, + "step": 17690 + }, + { + "epoch": 0.8576776077094072, + "grad_norm": 0.40096816420555115, + "learning_rate": 2.6092533655761144e-06, + "loss": 1.2886, + "num_input_tokens_seen": 695328628, + "step": 17700 + }, + { + "epoch": 0.8581621713295821, + "grad_norm": 0.4054926335811615, + "learning_rate": 2.591828524117365e-06, + "loss": 1.2309, + "num_input_tokens_seen": 695748200, + "step": 17710 + }, + { + "epoch": 0.8586467349497569, + "grad_norm": 0.38719359040260315, + "learning_rate": 2.5744588787210366e-06, + "loss": 1.2367, + "num_input_tokens_seen": 696133424, + "step": 17720 + }, + { + "epoch": 0.8591312985699316, + "grad_norm": 0.386705219745636, + "learning_rate": 2.557144472172235e-06, + "loss": 1.2824, + "num_input_tokens_seen": 696536372, + "step": 17730 + }, + { + "epoch": 0.8596158621901064, + "grad_norm": 0.4401761293411255, + "learning_rate": 2.5398853471200105e-06, + "loss": 1.2132, + "num_input_tokens_seen": 696913248, + "step": 17740 + }, + { + "epoch": 0.8601004258102812, + "grad_norm": 0.43798384070396423, + "learning_rate": 2.522681546077224e-06, + "loss": 1.2504, + "num_input_tokens_seen": 697311808, + "step": 17750 + }, + { + "epoch": 0.860584989430456, + "grad_norm": 0.4158650040626526, + "learning_rate": 2.5055331114204798e-06, + "loss": 1.2485, + "num_input_tokens_seen": 697710452, + "step": 17760 + }, + { + "epoch": 0.8610695530506308, + "grad_norm": 0.4461188018321991, + "learning_rate": 2.4884400853900034e-06, + "loss": 1.2431, + "num_input_tokens_seen": 698116700, + "step": 17770 + }, + { + "epoch": 0.8615541166708056, + "grad_norm": 0.36897483468055725, + "learning_rate": 2.4714025100895155e-06, + "loss": 1.2314, + "num_input_tokens_seen": 698502984, + "step": 17780 + }, + { + "epoch": 0.8620386802909804, + "grad_norm": 0.38578104972839355, + "learning_rate": 2.4544204274861785e-06, + "loss": 1.2852, + "num_input_tokens_seen": 698869344, + "step": 17790 + }, + { + "epoch": 0.8625232439111553, + "grad_norm": 0.3806360363960266, + "learning_rate": 2.4374938794104407e-06, + "loss": 1.2656, + "num_input_tokens_seen": 699268952, + "step": 17800 + }, + { + "epoch": 0.8630078075313301, + "grad_norm": 0.43705320358276367, + "learning_rate": 2.420622907555975e-06, + "loss": 1.2623, + "num_input_tokens_seen": 699655576, + "step": 17810 + }, + { + "epoch": 0.8634923711515049, + "grad_norm": 0.41822245717048645, + "learning_rate": 2.403807553479548e-06, + "loss": 1.2524, + "num_input_tokens_seen": 700027896, + "step": 17820 + }, + { + "epoch": 0.8639769347716797, + "grad_norm": 0.4171805679798126, + "learning_rate": 2.3870478586009325e-06, + "loss": 1.2786, + "num_input_tokens_seen": 700417852, + "step": 17830 + }, + { + "epoch": 0.8644614983918545, + "grad_norm": 0.4310646057128906, + "learning_rate": 2.3703438642027927e-06, + "loss": 1.2672, + "num_input_tokens_seen": 700843676, + "step": 17840 + }, + { + "epoch": 0.8649460620120293, + "grad_norm": 0.41659244894981384, + "learning_rate": 2.353695611430609e-06, + "loss": 1.2511, + "num_input_tokens_seen": 701217408, + "step": 17850 + }, + { + "epoch": 0.8654306256322041, + "grad_norm": 0.3900809586048126, + "learning_rate": 2.337103141292535e-06, + "loss": 1.2216, + "num_input_tokens_seen": 701584684, + "step": 17860 + }, + { + "epoch": 0.8659151892523789, + "grad_norm": 0.4181850552558899, + "learning_rate": 2.3205664946593348e-06, + "loss": 1.3174, + "num_input_tokens_seen": 701960304, + "step": 17870 + }, + { + "epoch": 0.8663997528725537, + "grad_norm": 0.4225742220878601, + "learning_rate": 2.3040857122642674e-06, + "loss": 1.229, + "num_input_tokens_seen": 702347808, + "step": 17880 + }, + { + "epoch": 0.8668843164927286, + "grad_norm": 0.3922450542449951, + "learning_rate": 2.2876608347029816e-06, + "loss": 1.2615, + "num_input_tokens_seen": 702743944, + "step": 17890 + }, + { + "epoch": 0.8673688801129034, + "grad_norm": 0.3811093866825104, + "learning_rate": 2.2712919024334257e-06, + "loss": 1.2735, + "num_input_tokens_seen": 703123092, + "step": 17900 + }, + { + "epoch": 0.8678534437330782, + "grad_norm": 0.45579883456230164, + "learning_rate": 2.2549789557757327e-06, + "loss": 1.2724, + "num_input_tokens_seen": 703523388, + "step": 17910 + }, + { + "epoch": 0.868338007353253, + "grad_norm": 0.44050538539886475, + "learning_rate": 2.238722034912144e-06, + "loss": 1.2293, + "num_input_tokens_seen": 703903936, + "step": 17920 + }, + { + "epoch": 0.8688225709734277, + "grad_norm": 0.3877395987510681, + "learning_rate": 2.222521179886888e-06, + "loss": 1.2697, + "num_input_tokens_seen": 704304692, + "step": 17930 + }, + { + "epoch": 0.8693071345936025, + "grad_norm": 0.427232027053833, + "learning_rate": 2.206376430606097e-06, + "loss": 1.2374, + "num_input_tokens_seen": 704700272, + "step": 17940 + }, + { + "epoch": 0.8697916982137773, + "grad_norm": 0.39587920904159546, + "learning_rate": 2.1902878268376975e-06, + "loss": 1.2735, + "num_input_tokens_seen": 705078452, + "step": 17950 + }, + { + "epoch": 0.8702762618339521, + "grad_norm": 0.37119200825691223, + "learning_rate": 2.174255408211326e-06, + "loss": 1.3054, + "num_input_tokens_seen": 705480336, + "step": 17960 + }, + { + "epoch": 0.8707608254541269, + "grad_norm": 0.4567846357822418, + "learning_rate": 2.1582792142182117e-06, + "loss": 1.2335, + "num_input_tokens_seen": 705869500, + "step": 17970 + }, + { + "epoch": 0.8712453890743018, + "grad_norm": 0.39730075001716614, + "learning_rate": 2.1423592842111066e-06, + "loss": 1.2125, + "num_input_tokens_seen": 706254284, + "step": 17980 + }, + { + "epoch": 0.8717299526944766, + "grad_norm": 0.41465237736701965, + "learning_rate": 2.1264956574041513e-06, + "loss": 1.2287, + "num_input_tokens_seen": 706671516, + "step": 17990 + }, + { + "epoch": 0.8722145163146514, + "grad_norm": 0.3785446584224701, + "learning_rate": 2.1106883728728155e-06, + "loss": 1.2477, + "num_input_tokens_seen": 707047904, + "step": 18000 + }, + { + "epoch": 0.8722145163146514, + "eval_loss": 1.3607689142227173, + "eval_runtime": 3.6634, + "eval_samples_per_second": 40.946, + "eval_steps_per_second": 5.186, + "num_input_tokens_seen": 707047904, + "step": 18000 + }, + { + "epoch": 0.8726990799348262, + "grad_norm": 0.3779754042625427, + "learning_rate": 2.094937469553787e-06, + "loss": 1.2501, + "num_input_tokens_seen": 707448824, + "step": 18010 + }, + { + "epoch": 0.873183643555001, + "grad_norm": 0.40831905603408813, + "learning_rate": 2.079242986244867e-06, + "loss": 1.3242, + "num_input_tokens_seen": 707830972, + "step": 18020 + }, + { + "epoch": 0.8736682071751758, + "grad_norm": 0.4000668525695801, + "learning_rate": 2.063604961604884e-06, + "loss": 1.2698, + "num_input_tokens_seen": 708228628, + "step": 18030 + }, + { + "epoch": 0.8741527707953506, + "grad_norm": 0.4057333469390869, + "learning_rate": 2.0480234341535952e-06, + "loss": 1.3053, + "num_input_tokens_seen": 708602260, + "step": 18040 + }, + { + "epoch": 0.8746373344155254, + "grad_norm": 0.44601795077323914, + "learning_rate": 2.0324984422716046e-06, + "loss": 1.2743, + "num_input_tokens_seen": 709010156, + "step": 18050 + }, + { + "epoch": 0.8751218980357002, + "grad_norm": 0.4290080964565277, + "learning_rate": 2.0170300242002365e-06, + "loss": 1.2339, + "num_input_tokens_seen": 709382372, + "step": 18060 + }, + { + "epoch": 0.8756064616558751, + "grad_norm": 0.40737321972846985, + "learning_rate": 2.001618218041487e-06, + "loss": 1.2312, + "num_input_tokens_seen": 709780940, + "step": 18070 + }, + { + "epoch": 0.8760910252760499, + "grad_norm": 0.42168939113616943, + "learning_rate": 1.9862630617578816e-06, + "loss": 1.2517, + "num_input_tokens_seen": 710158796, + "step": 18080 + }, + { + "epoch": 0.8765755888962247, + "grad_norm": 0.4381299912929535, + "learning_rate": 1.9709645931724225e-06, + "loss": 1.2813, + "num_input_tokens_seen": 710561412, + "step": 18090 + }, + { + "epoch": 0.8770601525163995, + "grad_norm": 0.44368723034858704, + "learning_rate": 1.955722849968464e-06, + "loss": 1.3046, + "num_input_tokens_seen": 710960164, + "step": 18100 + }, + { + "epoch": 0.8775447161365743, + "grad_norm": 0.41541314125061035, + "learning_rate": 1.9405378696896487e-06, + "loss": 1.2637, + "num_input_tokens_seen": 711347332, + "step": 18110 + }, + { + "epoch": 0.878029279756749, + "grad_norm": 0.3992341160774231, + "learning_rate": 1.925409689739785e-06, + "loss": 1.2553, + "num_input_tokens_seen": 711716796, + "step": 18120 + }, + { + "epoch": 0.8785138433769238, + "grad_norm": 0.47702735662460327, + "learning_rate": 1.910338347382787e-06, + "loss": 1.2407, + "num_input_tokens_seen": 712071888, + "step": 18130 + }, + { + "epoch": 0.8789984069970986, + "grad_norm": 0.38431301712989807, + "learning_rate": 1.8953238797425442e-06, + "loss": 1.235, + "num_input_tokens_seen": 712445676, + "step": 18140 + }, + { + "epoch": 0.8794829706172734, + "grad_norm": 0.4030941128730774, + "learning_rate": 1.8803663238028707e-06, + "loss": 1.2388, + "num_input_tokens_seen": 712841240, + "step": 18150 + }, + { + "epoch": 0.8799675342374483, + "grad_norm": 0.3907192349433899, + "learning_rate": 1.8654657164073884e-06, + "loss": 1.2495, + "num_input_tokens_seen": 713199460, + "step": 18160 + }, + { + "epoch": 0.8804520978576231, + "grad_norm": 0.4238632917404175, + "learning_rate": 1.850622094259441e-06, + "loss": 1.329, + "num_input_tokens_seen": 713586584, + "step": 18170 + }, + { + "epoch": 0.8809366614777979, + "grad_norm": 0.43450093269348145, + "learning_rate": 1.835835493922014e-06, + "loss": 1.2552, + "num_input_tokens_seen": 713936208, + "step": 18180 + }, + { + "epoch": 0.8814212250979727, + "grad_norm": 0.36312198638916016, + "learning_rate": 1.821105951817617e-06, + "loss": 1.2135, + "num_input_tokens_seen": 714345664, + "step": 18190 + }, + { + "epoch": 0.8819057887181475, + "grad_norm": 0.3843260407447815, + "learning_rate": 1.8064335042282387e-06, + "loss": 1.2077, + "num_input_tokens_seen": 714749968, + "step": 18200 + }, + { + "epoch": 0.8823903523383223, + "grad_norm": 0.43907880783081055, + "learning_rate": 1.791818187295205e-06, + "loss": 1.2502, + "num_input_tokens_seen": 715139224, + "step": 18210 + }, + { + "epoch": 0.8828749159584971, + "grad_norm": 0.42311954498291016, + "learning_rate": 1.7772600370191433e-06, + "loss": 1.2607, + "num_input_tokens_seen": 715522824, + "step": 18220 + }, + { + "epoch": 0.8833594795786719, + "grad_norm": 0.41037461161613464, + "learning_rate": 1.762759089259844e-06, + "loss": 1.2311, + "num_input_tokens_seen": 715920216, + "step": 18230 + }, + { + "epoch": 0.8838440431988467, + "grad_norm": 0.4087775945663452, + "learning_rate": 1.7483153797362123e-06, + "loss": 1.2339, + "num_input_tokens_seen": 716298856, + "step": 18240 + }, + { + "epoch": 0.8843286068190216, + "grad_norm": 0.43544670939445496, + "learning_rate": 1.733928944026153e-06, + "loss": 1.258, + "num_input_tokens_seen": 716698420, + "step": 18250 + }, + { + "epoch": 0.8848131704391964, + "grad_norm": 0.4397813081741333, + "learning_rate": 1.7195998175665057e-06, + "loss": 1.2538, + "num_input_tokens_seen": 717109980, + "step": 18260 + }, + { + "epoch": 0.8852977340593712, + "grad_norm": 0.4167066812515259, + "learning_rate": 1.7053280356529283e-06, + "loss": 1.2635, + "num_input_tokens_seen": 717495908, + "step": 18270 + }, + { + "epoch": 0.885782297679546, + "grad_norm": 0.3873845040798187, + "learning_rate": 1.691113633439842e-06, + "loss": 1.2667, + "num_input_tokens_seen": 717871192, + "step": 18280 + }, + { + "epoch": 0.8862668612997208, + "grad_norm": 0.41001996397972107, + "learning_rate": 1.6769566459403224e-06, + "loss": 1.2516, + "num_input_tokens_seen": 718259268, + "step": 18290 + }, + { + "epoch": 0.8867514249198956, + "grad_norm": 0.454142689704895, + "learning_rate": 1.6628571080260196e-06, + "loss": 1.2868, + "num_input_tokens_seen": 718662496, + "step": 18300 + }, + { + "epoch": 0.8872359885400704, + "grad_norm": 0.433021605014801, + "learning_rate": 1.6488150544270776e-06, + "loss": 1.2646, + "num_input_tokens_seen": 719043384, + "step": 18310 + }, + { + "epoch": 0.8877205521602451, + "grad_norm": 0.3922811448574066, + "learning_rate": 1.6348305197320417e-06, + "loss": 1.3003, + "num_input_tokens_seen": 719419832, + "step": 18320 + }, + { + "epoch": 0.8882051157804199, + "grad_norm": 0.41497138142585754, + "learning_rate": 1.6209035383877803e-06, + "loss": 1.2812, + "num_input_tokens_seen": 719822572, + "step": 18330 + }, + { + "epoch": 0.8886896794005948, + "grad_norm": 0.3779890239238739, + "learning_rate": 1.6070341446993875e-06, + "loss": 1.1811, + "num_input_tokens_seen": 720208548, + "step": 18340 + }, + { + "epoch": 0.8891742430207696, + "grad_norm": 0.4199633300304413, + "learning_rate": 1.5932223728301138e-06, + "loss": 1.2312, + "num_input_tokens_seen": 720585180, + "step": 18350 + }, + { + "epoch": 0.8896588066409444, + "grad_norm": 0.40874430537223816, + "learning_rate": 1.579468256801267e-06, + "loss": 1.1845, + "num_input_tokens_seen": 720970764, + "step": 18360 + }, + { + "epoch": 0.8901433702611192, + "grad_norm": 0.37856462597846985, + "learning_rate": 1.5657718304921492e-06, + "loss": 1.2665, + "num_input_tokens_seen": 721361356, + "step": 18370 + }, + { + "epoch": 0.890627933881294, + "grad_norm": 0.37885743379592896, + "learning_rate": 1.5521331276399488e-06, + "loss": 1.2479, + "num_input_tokens_seen": 721724800, + "step": 18380 + }, + { + "epoch": 0.8911124975014688, + "grad_norm": 0.40030255913734436, + "learning_rate": 1.538552181839678e-06, + "loss": 1.2585, + "num_input_tokens_seen": 722116248, + "step": 18390 + }, + { + "epoch": 0.8915970611216436, + "grad_norm": 0.43325039744377136, + "learning_rate": 1.525029026544067e-06, + "loss": 1.2446, + "num_input_tokens_seen": 722489540, + "step": 18400 + }, + { + "epoch": 0.8920816247418184, + "grad_norm": 0.3845292627811432, + "learning_rate": 1.511563695063517e-06, + "loss": 1.258, + "num_input_tokens_seen": 722864464, + "step": 18410 + }, + { + "epoch": 0.8925661883619933, + "grad_norm": 0.4011618196964264, + "learning_rate": 1.4981562205659772e-06, + "loss": 1.1975, + "num_input_tokens_seen": 723261460, + "step": 18420 + }, + { + "epoch": 0.8930507519821681, + "grad_norm": 0.4230853021144867, + "learning_rate": 1.4848066360768935e-06, + "loss": 1.2705, + "num_input_tokens_seen": 723627044, + "step": 18430 + }, + { + "epoch": 0.8935353156023429, + "grad_norm": 0.41304829716682434, + "learning_rate": 1.4715149744791156e-06, + "loss": 1.2206, + "num_input_tokens_seen": 724019484, + "step": 18440 + }, + { + "epoch": 0.8940198792225177, + "grad_norm": 0.40190738439559937, + "learning_rate": 1.458281268512815e-06, + "loss": 1.2595, + "num_input_tokens_seen": 724395804, + "step": 18450 + }, + { + "epoch": 0.8945044428426925, + "grad_norm": 0.40311187505722046, + "learning_rate": 1.445105550775408e-06, + "loss": 1.2737, + "num_input_tokens_seen": 724797948, + "step": 18460 + }, + { + "epoch": 0.8949890064628673, + "grad_norm": 0.42915475368499756, + "learning_rate": 1.431987853721467e-06, + "loss": 1.2391, + "num_input_tokens_seen": 725183560, + "step": 18470 + }, + { + "epoch": 0.8954735700830421, + "grad_norm": 0.406745046377182, + "learning_rate": 1.4189282096626593e-06, + "loss": 1.2801, + "num_input_tokens_seen": 725560960, + "step": 18480 + }, + { + "epoch": 0.8959581337032169, + "grad_norm": 0.43271973729133606, + "learning_rate": 1.405926650767639e-06, + "loss": 1.2748, + "num_input_tokens_seen": 725936388, + "step": 18490 + }, + { + "epoch": 0.8964426973233917, + "grad_norm": 0.41202259063720703, + "learning_rate": 1.3929832090620043e-06, + "loss": 1.2548, + "num_input_tokens_seen": 726331564, + "step": 18500 + }, + { + "epoch": 0.8969272609435666, + "grad_norm": 0.4043997526168823, + "learning_rate": 1.3800979164281775e-06, + "loss": 1.2473, + "num_input_tokens_seen": 726700276, + "step": 18510 + }, + { + "epoch": 0.8974118245637414, + "grad_norm": 0.38937658071517944, + "learning_rate": 1.3672708046053668e-06, + "loss": 1.2406, + "num_input_tokens_seen": 727097284, + "step": 18520 + }, + { + "epoch": 0.8978963881839161, + "grad_norm": 0.4417135715484619, + "learning_rate": 1.3545019051894537e-06, + "loss": 1.2141, + "num_input_tokens_seen": 727502228, + "step": 18530 + }, + { + "epoch": 0.8983809518040909, + "grad_norm": 0.393096923828125, + "learning_rate": 1.34179124963294e-06, + "loss": 1.2459, + "num_input_tokens_seen": 727852368, + "step": 18540 + }, + { + "epoch": 0.8988655154242657, + "grad_norm": 0.42874211072921753, + "learning_rate": 1.3291388692448503e-06, + "loss": 1.2397, + "num_input_tokens_seen": 728254728, + "step": 18550 + }, + { + "epoch": 0.8993500790444405, + "grad_norm": 0.40823593735694885, + "learning_rate": 1.3165447951906773e-06, + "loss": 1.2552, + "num_input_tokens_seen": 728648556, + "step": 18560 + }, + { + "epoch": 0.8998346426646153, + "grad_norm": 0.37802520394325256, + "learning_rate": 1.3040090584922921e-06, + "loss": 1.2287, + "num_input_tokens_seen": 729028520, + "step": 18570 + }, + { + "epoch": 0.9003192062847901, + "grad_norm": 0.39629054069519043, + "learning_rate": 1.29153169002785e-06, + "loss": 1.2521, + "num_input_tokens_seen": 729403340, + "step": 18580 + }, + { + "epoch": 0.9008037699049649, + "grad_norm": 0.38545021414756775, + "learning_rate": 1.2791127205317583e-06, + "loss": 1.2657, + "num_input_tokens_seen": 729803816, + "step": 18590 + }, + { + "epoch": 0.9012883335251398, + "grad_norm": 0.457553893327713, + "learning_rate": 1.2667521805945577e-06, + "loss": 1.2545, + "num_input_tokens_seen": 730207652, + "step": 18600 + }, + { + "epoch": 0.9017728971453146, + "grad_norm": 0.42532676458358765, + "learning_rate": 1.2544501006628768e-06, + "loss": 1.2469, + "num_input_tokens_seen": 730598644, + "step": 18610 + }, + { + "epoch": 0.9022574607654894, + "grad_norm": 0.4223385155200958, + "learning_rate": 1.2422065110393317e-06, + "loss": 1.2694, + "num_input_tokens_seen": 730997112, + "step": 18620 + }, + { + "epoch": 0.9027420243856642, + "grad_norm": 0.404644638299942, + "learning_rate": 1.2300214418824756e-06, + "loss": 1.2535, + "num_input_tokens_seen": 731392988, + "step": 18630 + }, + { + "epoch": 0.903226588005839, + "grad_norm": 0.39279234409332275, + "learning_rate": 1.2178949232067056e-06, + "loss": 1.2253, + "num_input_tokens_seen": 731765160, + "step": 18640 + }, + { + "epoch": 0.9037111516260138, + "grad_norm": 0.41039055585861206, + "learning_rate": 1.2058269848822052e-06, + "loss": 1.2272, + "num_input_tokens_seen": 732135724, + "step": 18650 + }, + { + "epoch": 0.9041957152461886, + "grad_norm": 0.38321247696876526, + "learning_rate": 1.1938176566348518e-06, + "loss": 1.2478, + "num_input_tokens_seen": 732558172, + "step": 18660 + }, + { + "epoch": 0.9046802788663634, + "grad_norm": 0.47206875681877136, + "learning_rate": 1.1818669680461636e-06, + "loss": 1.2375, + "num_input_tokens_seen": 732961984, + "step": 18670 + }, + { + "epoch": 0.9051648424865382, + "grad_norm": 0.3947557210922241, + "learning_rate": 1.169974948553207e-06, + "loss": 1.2745, + "num_input_tokens_seen": 733342248, + "step": 18680 + }, + { + "epoch": 0.9056494061067131, + "grad_norm": 0.3839742839336395, + "learning_rate": 1.1581416274485447e-06, + "loss": 1.2215, + "num_input_tokens_seen": 733722272, + "step": 18690 + }, + { + "epoch": 0.9061339697268879, + "grad_norm": 0.4032100737094879, + "learning_rate": 1.146367033880147e-06, + "loss": 1.2589, + "num_input_tokens_seen": 734138572, + "step": 18700 + }, + { + "epoch": 0.9066185333470627, + "grad_norm": 0.42585238814353943, + "learning_rate": 1.1346511968513218e-06, + "loss": 1.2102, + "num_input_tokens_seen": 734513304, + "step": 18710 + }, + { + "epoch": 0.9071030969672375, + "grad_norm": 0.4173504412174225, + "learning_rate": 1.122994145220657e-06, + "loss": 1.2864, + "num_input_tokens_seen": 734919300, + "step": 18720 + }, + { + "epoch": 0.9075876605874122, + "grad_norm": 0.4611232876777649, + "learning_rate": 1.1113959077019315e-06, + "loss": 1.2136, + "num_input_tokens_seen": 735332452, + "step": 18730 + }, + { + "epoch": 0.908072224207587, + "grad_norm": 0.410653680562973, + "learning_rate": 1.0998565128640615e-06, + "loss": 1.2745, + "num_input_tokens_seen": 735714372, + "step": 18740 + }, + { + "epoch": 0.9085567878277618, + "grad_norm": 0.39335983991622925, + "learning_rate": 1.0883759891310047e-06, + "loss": 1.2148, + "num_input_tokens_seen": 736147668, + "step": 18750 + }, + { + "epoch": 0.9090413514479366, + "grad_norm": 0.4009684920310974, + "learning_rate": 1.0769543647817293e-06, + "loss": 1.255, + "num_input_tokens_seen": 736533020, + "step": 18760 + }, + { + "epoch": 0.9095259150681114, + "grad_norm": 0.39060232043266296, + "learning_rate": 1.0655916679501026e-06, + "loss": 1.28, + "num_input_tokens_seen": 736930600, + "step": 18770 + }, + { + "epoch": 0.9100104786882863, + "grad_norm": 0.3796159327030182, + "learning_rate": 1.0542879266248501e-06, + "loss": 1.295, + "num_input_tokens_seen": 737329028, + "step": 18780 + }, + { + "epoch": 0.9104950423084611, + "grad_norm": 0.41085562109947205, + "learning_rate": 1.0430431686494768e-06, + "loss": 1.2471, + "num_input_tokens_seen": 737729856, + "step": 18790 + }, + { + "epoch": 0.9109796059286359, + "grad_norm": 0.38016316294670105, + "learning_rate": 1.031857421722196e-06, + "loss": 1.2601, + "num_input_tokens_seen": 738134952, + "step": 18800 + }, + { + "epoch": 0.9114641695488107, + "grad_norm": 0.42966845631599426, + "learning_rate": 1.0207307133958676e-06, + "loss": 1.2398, + "num_input_tokens_seen": 738559280, + "step": 18810 + }, + { + "epoch": 0.9119487331689855, + "grad_norm": 0.39582687616348267, + "learning_rate": 1.0096630710779264e-06, + "loss": 1.2218, + "num_input_tokens_seen": 738942248, + "step": 18820 + }, + { + "epoch": 0.9124332967891603, + "grad_norm": 0.3957637846469879, + "learning_rate": 9.98654522030318e-07, + "loss": 1.2582, + "num_input_tokens_seen": 739314768, + "step": 18830 + }, + { + "epoch": 0.9129178604093351, + "grad_norm": 0.4404972493648529, + "learning_rate": 9.877050933694176e-07, + "loss": 1.2526, + "num_input_tokens_seen": 739693344, + "step": 18840 + }, + { + "epoch": 0.9134024240295099, + "grad_norm": 0.4136838912963867, + "learning_rate": 9.768148120659903e-07, + "loss": 1.2503, + "num_input_tokens_seen": 740095948, + "step": 18850 + }, + { + "epoch": 0.9138869876496847, + "grad_norm": 0.4641477167606354, + "learning_rate": 9.65983704945092e-07, + "loss": 1.2321, + "num_input_tokens_seen": 740509384, + "step": 18860 + }, + { + "epoch": 0.9143715512698596, + "grad_norm": 0.37187913060188293, + "learning_rate": 9.55211798686037e-07, + "loss": 1.2635, + "num_input_tokens_seen": 740899344, + "step": 18870 + }, + { + "epoch": 0.9148561148900344, + "grad_norm": 0.4150676131248474, + "learning_rate": 9.444991198223008e-07, + "loss": 1.2356, + "num_input_tokens_seen": 741314140, + "step": 18880 + }, + { + "epoch": 0.9153406785102092, + "grad_norm": 0.413786917924881, + "learning_rate": 9.338456947414837e-07, + "loss": 1.2555, + "num_input_tokens_seen": 741697844, + "step": 18890 + }, + { + "epoch": 0.915825242130384, + "grad_norm": 0.49498802423477173, + "learning_rate": 9.23251549685214e-07, + "loss": 1.2668, + "num_input_tokens_seen": 742079296, + "step": 18900 + }, + { + "epoch": 0.9163098057505588, + "grad_norm": 0.39548417925834656, + "learning_rate": 9.127167107491174e-07, + "loss": 1.2235, + "num_input_tokens_seen": 742477492, + "step": 18910 + }, + { + "epoch": 0.9167943693707336, + "grad_norm": 0.3953016698360443, + "learning_rate": 9.022412038827227e-07, + "loss": 1.2328, + "num_input_tokens_seen": 742879824, + "step": 18920 + }, + { + "epoch": 0.9172789329909083, + "grad_norm": 0.422567754983902, + "learning_rate": 8.918250548894225e-07, + "loss": 1.2207, + "num_input_tokens_seen": 743283828, + "step": 18930 + }, + { + "epoch": 0.9177634966110831, + "grad_norm": 0.4394066333770752, + "learning_rate": 8.814682894263904e-07, + "loss": 1.2525, + "num_input_tokens_seen": 743671012, + "step": 18940 + }, + { + "epoch": 0.9182480602312579, + "grad_norm": 0.3766193389892578, + "learning_rate": 8.711709330045309e-07, + "loss": 1.2441, + "num_input_tokens_seen": 744042668, + "step": 18950 + }, + { + "epoch": 0.9187326238514328, + "grad_norm": 0.4037720561027527, + "learning_rate": 8.609330109884045e-07, + "loss": 1.3088, + "num_input_tokens_seen": 744414936, + "step": 18960 + }, + { + "epoch": 0.9192171874716076, + "grad_norm": 0.38858434557914734, + "learning_rate": 8.507545485961804e-07, + "loss": 1.2561, + "num_input_tokens_seen": 744849212, + "step": 18970 + }, + { + "epoch": 0.9197017510917824, + "grad_norm": 0.44780433177948, + "learning_rate": 8.406355708995672e-07, + "loss": 1.2863, + "num_input_tokens_seen": 745242736, + "step": 18980 + }, + { + "epoch": 0.9201863147119572, + "grad_norm": 0.4300279915332794, + "learning_rate": 8.305761028237353e-07, + "loss": 1.2229, + "num_input_tokens_seen": 745670052, + "step": 18990 + }, + { + "epoch": 0.920670878332132, + "grad_norm": 0.4177449345588684, + "learning_rate": 8.205761691472913e-07, + "loss": 1.2178, + "num_input_tokens_seen": 746084620, + "step": 19000 + }, + { + "epoch": 0.9211554419523068, + "grad_norm": 0.38727110624313354, + "learning_rate": 8.106357945021765e-07, + "loss": 1.2613, + "num_input_tokens_seen": 746466352, + "step": 19010 + }, + { + "epoch": 0.9216400055724816, + "grad_norm": 0.4461565315723419, + "learning_rate": 8.007550033736405e-07, + "loss": 1.2423, + "num_input_tokens_seen": 746860664, + "step": 19020 + }, + { + "epoch": 0.9221245691926564, + "grad_norm": 0.4176614582538605, + "learning_rate": 7.909338201001564e-07, + "loss": 1.2664, + "num_input_tokens_seen": 747238188, + "step": 19030 + }, + { + "epoch": 0.9226091328128312, + "grad_norm": 0.4129350781440735, + "learning_rate": 7.811722688733786e-07, + "loss": 1.2481, + "num_input_tokens_seen": 747640208, + "step": 19040 + }, + { + "epoch": 0.9230936964330061, + "grad_norm": 0.40543922781944275, + "learning_rate": 7.714703737380674e-07, + "loss": 1.2575, + "num_input_tokens_seen": 748033864, + "step": 19050 + }, + { + "epoch": 0.9235782600531809, + "grad_norm": 0.4024973511695862, + "learning_rate": 7.618281585920456e-07, + "loss": 1.2477, + "num_input_tokens_seen": 748427680, + "step": 19060 + }, + { + "epoch": 0.9240628236733557, + "grad_norm": 0.3884994387626648, + "learning_rate": 7.522456471861172e-07, + "loss": 1.2325, + "num_input_tokens_seen": 748812864, + "step": 19070 + }, + { + "epoch": 0.9245473872935305, + "grad_norm": 0.38652002811431885, + "learning_rate": 7.427228631240457e-07, + "loss": 1.2592, + "num_input_tokens_seen": 749169748, + "step": 19080 + }, + { + "epoch": 0.9250319509137053, + "grad_norm": 0.4229782521724701, + "learning_rate": 7.33259829862451e-07, + "loss": 1.2767, + "num_input_tokens_seen": 749552652, + "step": 19090 + }, + { + "epoch": 0.9255165145338801, + "grad_norm": 0.3931877315044403, + "learning_rate": 7.238565707107875e-07, + "loss": 1.2596, + "num_input_tokens_seen": 749938204, + "step": 19100 + }, + { + "epoch": 0.9260010781540549, + "grad_norm": 0.42200735211372375, + "learning_rate": 7.145131088312745e-07, + "loss": 1.2536, + "num_input_tokens_seen": 750322924, + "step": 19110 + }, + { + "epoch": 0.9264856417742297, + "grad_norm": 0.40737777948379517, + "learning_rate": 7.052294672388271e-07, + "loss": 1.2143, + "num_input_tokens_seen": 750735180, + "step": 19120 + }, + { + "epoch": 0.9269702053944046, + "grad_norm": 0.41996297240257263, + "learning_rate": 6.960056688010197e-07, + "loss": 1.2552, + "num_input_tokens_seen": 751134712, + "step": 19130 + }, + { + "epoch": 0.9274547690145793, + "grad_norm": 0.40312203764915466, + "learning_rate": 6.868417362380114e-07, + "loss": 1.2409, + "num_input_tokens_seen": 751509996, + "step": 19140 + }, + { + "epoch": 0.9279393326347541, + "grad_norm": 0.3921646773815155, + "learning_rate": 6.777376921225125e-07, + "loss": 1.2459, + "num_input_tokens_seen": 751937076, + "step": 19150 + }, + { + "epoch": 0.9284238962549289, + "grad_norm": 0.4335014522075653, + "learning_rate": 6.686935588797072e-07, + "loss": 1.2383, + "num_input_tokens_seen": 752333820, + "step": 19160 + }, + { + "epoch": 0.9289084598751037, + "grad_norm": 0.42503783106803894, + "learning_rate": 6.597093587872055e-07, + "loss": 1.2457, + "num_input_tokens_seen": 752732964, + "step": 19170 + }, + { + "epoch": 0.9293930234952785, + "grad_norm": 0.4474683701992035, + "learning_rate": 6.507851139749888e-07, + "loss": 1.2576, + "num_input_tokens_seen": 753114736, + "step": 19180 + }, + { + "epoch": 0.9298775871154533, + "grad_norm": 0.40870001912117004, + "learning_rate": 6.419208464253618e-07, + "loss": 1.2785, + "num_input_tokens_seen": 753496580, + "step": 19190 + }, + { + "epoch": 0.9303621507356281, + "grad_norm": 0.4168875217437744, + "learning_rate": 6.331165779728865e-07, + "loss": 1.2859, + "num_input_tokens_seen": 753875144, + "step": 19200 + }, + { + "epoch": 0.9308467143558029, + "grad_norm": 0.3911028504371643, + "learning_rate": 6.243723303043403e-07, + "loss": 1.2791, + "num_input_tokens_seen": 754260416, + "step": 19210 + }, + { + "epoch": 0.9313312779759778, + "grad_norm": 0.4180554151535034, + "learning_rate": 6.156881249586493e-07, + "loss": 1.2427, + "num_input_tokens_seen": 754653980, + "step": 19220 + }, + { + "epoch": 0.9318158415961526, + "grad_norm": 0.4038565158843994, + "learning_rate": 6.070639833268471e-07, + "loss": 1.2551, + "num_input_tokens_seen": 755023452, + "step": 19230 + }, + { + "epoch": 0.9323004052163274, + "grad_norm": 0.433095246553421, + "learning_rate": 5.984999266520214e-07, + "loss": 1.2713, + "num_input_tokens_seen": 755440128, + "step": 19240 + }, + { + "epoch": 0.9327849688365022, + "grad_norm": 0.4306170642375946, + "learning_rate": 5.899959760292478e-07, + "loss": 1.2995, + "num_input_tokens_seen": 755840776, + "step": 19250 + }, + { + "epoch": 0.933269532456677, + "grad_norm": 0.4218454658985138, + "learning_rate": 5.815521524055623e-07, + "loss": 1.2378, + "num_input_tokens_seen": 756263932, + "step": 19260 + }, + { + "epoch": 0.9337540960768518, + "grad_norm": 0.3932402729988098, + "learning_rate": 5.731684765798772e-07, + "loss": 1.2506, + "num_input_tokens_seen": 756648804, + "step": 19270 + }, + { + "epoch": 0.9342386596970266, + "grad_norm": 0.4214298129081726, + "learning_rate": 5.648449692029656e-07, + "loss": 1.2264, + "num_input_tokens_seen": 757063844, + "step": 19280 + }, + { + "epoch": 0.9347232233172014, + "grad_norm": 0.4255639612674713, + "learning_rate": 5.565816507773797e-07, + "loss": 1.2217, + "num_input_tokens_seen": 757437036, + "step": 19290 + }, + { + "epoch": 0.9352077869373762, + "grad_norm": 0.3967702090740204, + "learning_rate": 5.483785416574239e-07, + "loss": 1.3114, + "num_input_tokens_seen": 757815468, + "step": 19300 + }, + { + "epoch": 0.9356923505575511, + "grad_norm": 0.4092276096343994, + "learning_rate": 5.402356620490878e-07, + "loss": 1.2753, + "num_input_tokens_seen": 758208344, + "step": 19310 + }, + { + "epoch": 0.9361769141777259, + "grad_norm": 0.40975773334503174, + "learning_rate": 5.321530320100076e-07, + "loss": 1.2785, + "num_input_tokens_seen": 758618456, + "step": 19320 + }, + { + "epoch": 0.9366614777979007, + "grad_norm": 0.44307002425193787, + "learning_rate": 5.241306714494021e-07, + "loss": 1.2863, + "num_input_tokens_seen": 759009464, + "step": 19330 + }, + { + "epoch": 0.9371460414180754, + "grad_norm": 0.39563581347465515, + "learning_rate": 5.161686001280503e-07, + "loss": 1.2356, + "num_input_tokens_seen": 759418492, + "step": 19340 + }, + { + "epoch": 0.9376306050382502, + "grad_norm": 0.4311593472957611, + "learning_rate": 5.082668376582111e-07, + "loss": 1.2691, + "num_input_tokens_seen": 759767128, + "step": 19350 + }, + { + "epoch": 0.938115168658425, + "grad_norm": 0.3757501542568207, + "learning_rate": 5.00425403503596e-07, + "loss": 1.2662, + "num_input_tokens_seen": 760168828, + "step": 19360 + }, + { + "epoch": 0.9385997322785998, + "grad_norm": 0.4447803795337677, + "learning_rate": 4.926443169793154e-07, + "loss": 1.3027, + "num_input_tokens_seen": 760588676, + "step": 19370 + }, + { + "epoch": 0.9390842958987746, + "grad_norm": 0.4607948958873749, + "learning_rate": 4.849235972518295e-07, + "loss": 1.2566, + "num_input_tokens_seen": 760956864, + "step": 19380 + }, + { + "epoch": 0.9395688595189494, + "grad_norm": 0.3940238058567047, + "learning_rate": 4.772632633389063e-07, + "loss": 1.2854, + "num_input_tokens_seen": 761318912, + "step": 19390 + }, + { + "epoch": 0.9400534231391243, + "grad_norm": 0.4033295810222626, + "learning_rate": 4.6966333410956023e-07, + "loss": 1.2041, + "num_input_tokens_seen": 761704264, + "step": 19400 + }, + { + "epoch": 0.9405379867592991, + "grad_norm": 0.4064282178878784, + "learning_rate": 4.621238282840279e-07, + "loss": 1.24, + "num_input_tokens_seen": 762085876, + "step": 19410 + }, + { + "epoch": 0.9410225503794739, + "grad_norm": 0.39992567896842957, + "learning_rate": 4.546447644337065e-07, + "loss": 1.2397, + "num_input_tokens_seen": 762474368, + "step": 19420 + }, + { + "epoch": 0.9415071139996487, + "grad_norm": 0.4709777235984802, + "learning_rate": 4.4722616098110684e-07, + "loss": 1.2858, + "num_input_tokens_seen": 762874704, + "step": 19430 + }, + { + "epoch": 0.9419916776198235, + "grad_norm": 0.4245252311229706, + "learning_rate": 4.3986803619981973e-07, + "loss": 1.2707, + "num_input_tokens_seen": 763268244, + "step": 19440 + }, + { + "epoch": 0.9424762412399983, + "grad_norm": 0.44352298974990845, + "learning_rate": 4.325704082144666e-07, + "loss": 1.2645, + "num_input_tokens_seen": 763643580, + "step": 19450 + }, + { + "epoch": 0.9429608048601731, + "grad_norm": 0.39565107226371765, + "learning_rate": 4.2533329500063776e-07, + "loss": 1.2278, + "num_input_tokens_seen": 764054380, + "step": 19460 + }, + { + "epoch": 0.9434453684803479, + "grad_norm": 0.3986281454563141, + "learning_rate": 4.181567143848819e-07, + "loss": 1.2794, + "num_input_tokens_seen": 764458228, + "step": 19470 + }, + { + "epoch": 0.9439299321005227, + "grad_norm": 0.4145820140838623, + "learning_rate": 4.1104068404462514e-07, + "loss": 1.242, + "num_input_tokens_seen": 764816404, + "step": 19480 + }, + { + "epoch": 0.9444144957206976, + "grad_norm": 0.42509499192237854, + "learning_rate": 4.039852215081602e-07, + "loss": 1.2862, + "num_input_tokens_seen": 765190032, + "step": 19490 + }, + { + "epoch": 0.9448990593408724, + "grad_norm": 0.38184764981269836, + "learning_rate": 3.969903441545825e-07, + "loss": 1.2225, + "num_input_tokens_seen": 765565508, + "step": 19500 + }, + { + "epoch": 0.9453836229610472, + "grad_norm": 0.460821270942688, + "learning_rate": 3.9005606921375117e-07, + "loss": 1.2652, + "num_input_tokens_seen": 765974548, + "step": 19510 + }, + { + "epoch": 0.945868186581222, + "grad_norm": 0.4286702573299408, + "learning_rate": 3.831824137662504e-07, + "loss": 1.2459, + "num_input_tokens_seen": 766399956, + "step": 19520 + }, + { + "epoch": 0.9463527502013968, + "grad_norm": 0.38720691204071045, + "learning_rate": 3.7636939474334775e-07, + "loss": 1.2141, + "num_input_tokens_seen": 766793544, + "step": 19530 + }, + { + "epoch": 0.9468373138215715, + "grad_norm": 0.3998976945877075, + "learning_rate": 3.696170289269524e-07, + "loss": 1.2611, + "num_input_tokens_seen": 767186876, + "step": 19540 + }, + { + "epoch": 0.9473218774417463, + "grad_norm": 0.4127962291240692, + "learning_rate": 3.6292533294955966e-07, + "loss": 1.2499, + "num_input_tokens_seen": 767600456, + "step": 19550 + }, + { + "epoch": 0.9478064410619211, + "grad_norm": 0.4086759388446808, + "learning_rate": 3.5629432329424006e-07, + "loss": 1.2433, + "num_input_tokens_seen": 768032396, + "step": 19560 + }, + { + "epoch": 0.9482910046820959, + "grad_norm": 0.3765683174133301, + "learning_rate": 3.49724016294567e-07, + "loss": 1.2663, + "num_input_tokens_seen": 768403312, + "step": 19570 + }, + { + "epoch": 0.9487755683022708, + "grad_norm": 0.47519636154174805, + "learning_rate": 3.432144281345973e-07, + "loss": 1.2692, + "num_input_tokens_seen": 768763468, + "step": 19580 + }, + { + "epoch": 0.9492601319224456, + "grad_norm": 0.41748225688934326, + "learning_rate": 3.3676557484881855e-07, + "loss": 1.2243, + "num_input_tokens_seen": 769110720, + "step": 19590 + }, + { + "epoch": 0.9497446955426204, + "grad_norm": 0.4386734962463379, + "learning_rate": 3.30377472322127e-07, + "loss": 1.2672, + "num_input_tokens_seen": 769514668, + "step": 19600 + }, + { + "epoch": 0.9502292591627952, + "grad_norm": 0.42562198638916016, + "learning_rate": 3.2405013628976077e-07, + "loss": 1.2504, + "num_input_tokens_seen": 769901384, + "step": 19610 + }, + { + "epoch": 0.95071382278297, + "grad_norm": 0.4268267750740051, + "learning_rate": 3.1778358233729154e-07, + "loss": 1.2657, + "num_input_tokens_seen": 770308688, + "step": 19620 + }, + { + "epoch": 0.9511983864031448, + "grad_norm": 0.4298213720321655, + "learning_rate": 3.1157782590056637e-07, + "loss": 1.1705, + "num_input_tokens_seen": 770709140, + "step": 19630 + }, + { + "epoch": 0.9516829500233196, + "grad_norm": 0.4501436650753021, + "learning_rate": 3.054328822656688e-07, + "loss": 1.2964, + "num_input_tokens_seen": 771100080, + "step": 19640 + }, + { + "epoch": 0.9521675136434944, + "grad_norm": 0.43033567070961, + "learning_rate": 2.9934876656890207e-07, + "loss": 1.2118, + "num_input_tokens_seen": 771491372, + "step": 19650 + }, + { + "epoch": 0.9526520772636692, + "grad_norm": 0.4044438302516937, + "learning_rate": 2.933254937967228e-07, + "loss": 1.2532, + "num_input_tokens_seen": 771854668, + "step": 19660 + }, + { + "epoch": 0.9531366408838441, + "grad_norm": 0.38256967067718506, + "learning_rate": 2.8736307878572957e-07, + "loss": 1.2422, + "num_input_tokens_seen": 772233856, + "step": 19670 + }, + { + "epoch": 0.9536212045040189, + "grad_norm": 0.4290720224380493, + "learning_rate": 2.8146153622260475e-07, + "loss": 1.2342, + "num_input_tokens_seen": 772629696, + "step": 19680 + }, + { + "epoch": 0.9541057681241937, + "grad_norm": 0.3771139681339264, + "learning_rate": 2.7562088064410354e-07, + "loss": 1.2594, + "num_input_tokens_seen": 773032252, + "step": 19690 + }, + { + "epoch": 0.9545903317443685, + "grad_norm": 0.4023270606994629, + "learning_rate": 2.6984112643698434e-07, + "loss": 1.2354, + "num_input_tokens_seen": 773437064, + "step": 19700 + }, + { + "epoch": 0.9550748953645433, + "grad_norm": 0.42096689343452454, + "learning_rate": 2.641222878380117e-07, + "loss": 1.2892, + "num_input_tokens_seen": 773816632, + "step": 19710 + }, + { + "epoch": 0.9555594589847181, + "grad_norm": 0.3777530789375305, + "learning_rate": 2.5846437893388977e-07, + "loss": 1.2892, + "num_input_tokens_seen": 774228364, + "step": 19720 + }, + { + "epoch": 0.9560440226048929, + "grad_norm": 0.43396878242492676, + "learning_rate": 2.52867413661248e-07, + "loss": 1.2451, + "num_input_tokens_seen": 774601636, + "step": 19730 + }, + { + "epoch": 0.9565285862250676, + "grad_norm": 0.39147117733955383, + "learning_rate": 2.4733140580658897e-07, + "loss": 1.2891, + "num_input_tokens_seen": 774994136, + "step": 19740 + }, + { + "epoch": 0.9570131498452424, + "grad_norm": 0.43016576766967773, + "learning_rate": 2.4185636900627417e-07, + "loss": 1.2161, + "num_input_tokens_seen": 775389680, + "step": 19750 + }, + { + "epoch": 0.9574977134654173, + "grad_norm": 0.4033438265323639, + "learning_rate": 2.3644231674647688e-07, + "loss": 1.2664, + "num_input_tokens_seen": 775778568, + "step": 19760 + }, + { + "epoch": 0.9579822770855921, + "grad_norm": 0.4349776804447174, + "learning_rate": 2.3108926236314887e-07, + "loss": 1.2115, + "num_input_tokens_seen": 776157176, + "step": 19770 + }, + { + "epoch": 0.9584668407057669, + "grad_norm": 0.4078141748905182, + "learning_rate": 2.2579721904199824e-07, + "loss": 1.2701, + "num_input_tokens_seen": 776547680, + "step": 19780 + }, + { + "epoch": 0.9589514043259417, + "grad_norm": 0.4067167639732361, + "learning_rate": 2.2056619981844495e-07, + "loss": 1.2334, + "num_input_tokens_seen": 776966012, + "step": 19790 + }, + { + "epoch": 0.9594359679461165, + "grad_norm": 0.41261714696884155, + "learning_rate": 2.153962175775931e-07, + "loss": 1.2612, + "num_input_tokens_seen": 777377344, + "step": 19800 + }, + { + "epoch": 0.9599205315662913, + "grad_norm": 0.3794651925563812, + "learning_rate": 2.1028728505420313e-07, + "loss": 1.2746, + "num_input_tokens_seen": 777740524, + "step": 19810 + }, + { + "epoch": 0.9604050951864661, + "grad_norm": 0.3992972671985626, + "learning_rate": 2.0523941483265586e-07, + "loss": 1.2302, + "num_input_tokens_seen": 778140684, + "step": 19820 + }, + { + "epoch": 0.9608896588066409, + "grad_norm": 0.4393966495990753, + "learning_rate": 2.0025261934691897e-07, + "loss": 1.2686, + "num_input_tokens_seen": 778528860, + "step": 19830 + }, + { + "epoch": 0.9613742224268158, + "grad_norm": 0.4146067798137665, + "learning_rate": 1.9532691088053056e-07, + "loss": 1.2656, + "num_input_tokens_seen": 778900016, + "step": 19840 + }, + { + "epoch": 0.9618587860469906, + "grad_norm": 0.43496647477149963, + "learning_rate": 1.904623015665463e-07, + "loss": 1.2421, + "num_input_tokens_seen": 779282552, + "step": 19850 + }, + { + "epoch": 0.9623433496671654, + "grad_norm": 0.43942153453826904, + "learning_rate": 1.8565880338752838e-07, + "loss": 1.2166, + "num_input_tokens_seen": 779634656, + "step": 19860 + }, + { + "epoch": 0.9628279132873402, + "grad_norm": 0.41181692481040955, + "learning_rate": 1.8091642817550935e-07, + "loss": 1.2809, + "num_input_tokens_seen": 780011584, + "step": 19870 + }, + { + "epoch": 0.963312476907515, + "grad_norm": 0.44306066632270813, + "learning_rate": 1.762351876119589e-07, + "loss": 1.2406, + "num_input_tokens_seen": 780398180, + "step": 19880 + }, + { + "epoch": 0.9637970405276898, + "grad_norm": 0.38546454906463623, + "learning_rate": 1.7161509322776437e-07, + "loss": 1.2775, + "num_input_tokens_seen": 780787352, + "step": 19890 + }, + { + "epoch": 0.9642816041478646, + "grad_norm": 0.425279825925827, + "learning_rate": 1.6705615640319472e-07, + "loss": 1.2662, + "num_input_tokens_seen": 781181080, + "step": 19900 + }, + { + "epoch": 0.9647661677680394, + "grad_norm": 0.4292610287666321, + "learning_rate": 1.625583883678755e-07, + "loss": 1.2033, + "num_input_tokens_seen": 781590820, + "step": 19910 + }, + { + "epoch": 0.9652507313882142, + "grad_norm": 0.44324782490730286, + "learning_rate": 1.5812180020075563e-07, + "loss": 1.2729, + "num_input_tokens_seen": 782000476, + "step": 19920 + }, + { + "epoch": 0.9657352950083891, + "grad_norm": 0.4017219543457031, + "learning_rate": 1.537464028300961e-07, + "loss": 1.2573, + "num_input_tokens_seen": 782384008, + "step": 19930 + }, + { + "epoch": 0.9662198586285639, + "grad_norm": 0.4076748490333557, + "learning_rate": 1.4943220703342031e-07, + "loss": 1.2051, + "num_input_tokens_seen": 782770540, + "step": 19940 + }, + { + "epoch": 0.9667044222487386, + "grad_norm": 0.4016420841217041, + "learning_rate": 1.4517922343750546e-07, + "loss": 1.2357, + "num_input_tokens_seen": 783161236, + "step": 19950 + }, + { + "epoch": 0.9671889858689134, + "grad_norm": 0.4644124507904053, + "learning_rate": 1.4098746251834938e-07, + "loss": 1.3148, + "num_input_tokens_seen": 783549976, + "step": 19960 + }, + { + "epoch": 0.9676735494890882, + "grad_norm": 0.4020436108112335, + "learning_rate": 1.3685693460114835e-07, + "loss": 1.2423, + "num_input_tokens_seen": 783956176, + "step": 19970 + }, + { + "epoch": 0.968158113109263, + "grad_norm": 0.42544135451316833, + "learning_rate": 1.3278764986025816e-07, + "loss": 1.2435, + "num_input_tokens_seen": 784370068, + "step": 19980 + }, + { + "epoch": 0.9686426767294378, + "grad_norm": 0.40704938769340515, + "learning_rate": 1.2877961831919416e-07, + "loss": 1.1991, + "num_input_tokens_seen": 784774032, + "step": 19990 + }, + { + "epoch": 0.9691272403496126, + "grad_norm": 0.45113271474838257, + "learning_rate": 1.248328498505813e-07, + "loss": 1.216, + "num_input_tokens_seen": 785148304, + "step": 20000 + }, + { + "epoch": 0.9691272403496126, + "eval_loss": 1.3588515520095825, + "eval_runtime": 4.2882, + "eval_samples_per_second": 34.98, + "eval_steps_per_second": 4.431, + "num_input_tokens_seen": 785148304, + "step": 20000 + }, + { + "epoch": 0.9696118039697874, + "grad_norm": 0.4246367812156677, + "learning_rate": 1.2094735417614579e-07, + "loss": 1.3037, + "num_input_tokens_seen": 785532360, + "step": 20010 + }, + { + "epoch": 0.9700963675899623, + "grad_norm": 0.40048691630363464, + "learning_rate": 1.1712314086668452e-07, + "loss": 1.2873, + "num_input_tokens_seen": 785948652, + "step": 20020 + }, + { + "epoch": 0.9705809312101371, + "grad_norm": 0.39899271726608276, + "learning_rate": 1.13360219342043e-07, + "loss": 1.2443, + "num_input_tokens_seen": 786340648, + "step": 20030 + }, + { + "epoch": 0.9710654948303119, + "grad_norm": 0.41089197993278503, + "learning_rate": 1.0965859887109575e-07, + "loss": 1.2696, + "num_input_tokens_seen": 786729636, + "step": 20040 + }, + { + "epoch": 0.9715500584504867, + "grad_norm": 0.3835326135158539, + "learning_rate": 1.0601828857171037e-07, + "loss": 1.2485, + "num_input_tokens_seen": 787122652, + "step": 20050 + }, + { + "epoch": 0.9720346220706615, + "grad_norm": 0.43811744451522827, + "learning_rate": 1.0243929741074465e-07, + "loss": 1.2809, + "num_input_tokens_seen": 787529572, + "step": 20060 + }, + { + "epoch": 0.9725191856908363, + "grad_norm": 0.4033362865447998, + "learning_rate": 9.892163420400779e-08, + "loss": 1.2607, + "num_input_tokens_seen": 787926640, + "step": 20070 + }, + { + "epoch": 0.9730037493110111, + "grad_norm": 0.4612255096435547, + "learning_rate": 9.546530761624928e-08, + "loss": 1.2536, + "num_input_tokens_seen": 788343440, + "step": 20080 + }, + { + "epoch": 0.9734883129311859, + "grad_norm": 0.4130823314189911, + "learning_rate": 9.20703261611311e-08, + "loss": 1.2822, + "num_input_tokens_seen": 788759176, + "step": 20090 + }, + { + "epoch": 0.9739728765513607, + "grad_norm": 0.46311095356941223, + "learning_rate": 8.873669820121111e-08, + "loss": 1.2476, + "num_input_tokens_seen": 789162040, + "step": 20100 + }, + { + "epoch": 0.9744574401715356, + "grad_norm": 0.3985438942909241, + "learning_rate": 8.546443194791808e-08, + "loss": 1.2453, + "num_input_tokens_seen": 789567068, + "step": 20110 + }, + { + "epoch": 0.9749420037917104, + "grad_norm": 0.40944841504096985, + "learning_rate": 8.22535354615378e-08, + "loss": 1.2376, + "num_input_tokens_seen": 789989960, + "step": 20120 + }, + { + "epoch": 0.9754265674118852, + "grad_norm": 0.41747674345970154, + "learning_rate": 7.910401665118528e-08, + "loss": 1.2289, + "num_input_tokens_seen": 790375840, + "step": 20130 + }, + { + "epoch": 0.97591113103206, + "grad_norm": 0.41349682211875916, + "learning_rate": 7.601588327479092e-08, + "loss": 1.2703, + "num_input_tokens_seen": 790758920, + "step": 20140 + }, + { + "epoch": 0.9763956946522347, + "grad_norm": 0.4096575677394867, + "learning_rate": 7.298914293907833e-08, + "loss": 1.2, + "num_input_tokens_seen": 791138556, + "step": 20150 + }, + { + "epoch": 0.9768802582724095, + "grad_norm": 0.39834490418434143, + "learning_rate": 7.002380309955314e-08, + "loss": 1.2784, + "num_input_tokens_seen": 791537516, + "step": 20160 + }, + { + "epoch": 0.9773648218925843, + "grad_norm": 0.41982364654541016, + "learning_rate": 6.711987106046979e-08, + "loss": 1.2283, + "num_input_tokens_seen": 791914936, + "step": 20170 + }, + { + "epoch": 0.9778493855127591, + "grad_norm": 0.4055969715118408, + "learning_rate": 6.427735397483148e-08, + "loss": 1.243, + "num_input_tokens_seen": 792312344, + "step": 20180 + }, + { + "epoch": 0.9783339491329339, + "grad_norm": 0.490959495306015, + "learning_rate": 6.149625884435407e-08, + "loss": 1.2525, + "num_input_tokens_seen": 792710660, + "step": 20190 + }, + { + "epoch": 0.9788185127531088, + "grad_norm": 0.4319050908088684, + "learning_rate": 5.877659251946332e-08, + "loss": 1.2896, + "num_input_tokens_seen": 793093572, + "step": 20200 + }, + { + "epoch": 0.9793030763732836, + "grad_norm": 0.42025476694107056, + "learning_rate": 5.611836169927276e-08, + "loss": 1.2736, + "num_input_tokens_seen": 793485640, + "step": 20210 + }, + { + "epoch": 0.9797876399934584, + "grad_norm": 0.47515881061553955, + "learning_rate": 5.35215729315669e-08, + "loss": 1.3133, + "num_input_tokens_seen": 793896728, + "step": 20220 + }, + { + "epoch": 0.9802722036136332, + "grad_norm": 0.41248735785484314, + "learning_rate": 5.0986232612787453e-08, + "loss": 1.2762, + "num_input_tokens_seen": 794295484, + "step": 20230 + }, + { + "epoch": 0.980756767233808, + "grad_norm": 0.4190135896205902, + "learning_rate": 4.851234698800833e-08, + "loss": 1.2217, + "num_input_tokens_seen": 794691580, + "step": 20240 + }, + { + "epoch": 0.9812413308539828, + "grad_norm": 0.41553497314453125, + "learning_rate": 4.609992215093839e-08, + "loss": 1.2819, + "num_input_tokens_seen": 795077668, + "step": 20250 + }, + { + "epoch": 0.9817258944741576, + "grad_norm": 0.42695561051368713, + "learning_rate": 4.374896404388818e-08, + "loss": 1.252, + "num_input_tokens_seen": 795454924, + "step": 20260 + }, + { + "epoch": 0.9822104580943324, + "grad_norm": 0.3926050066947937, + "learning_rate": 4.145947845776155e-08, + "loss": 1.2739, + "num_input_tokens_seen": 795863124, + "step": 20270 + }, + { + "epoch": 0.9826950217145072, + "grad_norm": 0.4227527678012848, + "learning_rate": 3.923147103204738e-08, + "loss": 1.2595, + "num_input_tokens_seen": 796247640, + "step": 20280 + }, + { + "epoch": 0.9831795853346821, + "grad_norm": 0.40944164991378784, + "learning_rate": 3.7064947254797365e-08, + "loss": 1.3064, + "num_input_tokens_seen": 796646028, + "step": 20290 + }, + { + "epoch": 0.9836641489548569, + "grad_norm": 0.39912155270576477, + "learning_rate": 3.4959912462620426e-08, + "loss": 1.1981, + "num_input_tokens_seen": 797037764, + "step": 20300 + }, + { + "epoch": 0.9841487125750317, + "grad_norm": 0.4131697118282318, + "learning_rate": 3.2916371840660544e-08, + "loss": 1.2223, + "num_input_tokens_seen": 797429064, + "step": 20310 + }, + { + "epoch": 0.9846332761952065, + "grad_norm": 0.4304356873035431, + "learning_rate": 3.093433042259119e-08, + "loss": 1.2225, + "num_input_tokens_seen": 797810616, + "step": 20320 + }, + { + "epoch": 0.9851178398153813, + "grad_norm": 0.4309372007846832, + "learning_rate": 2.9013793090598708e-08, + "loss": 1.2713, + "num_input_tokens_seen": 798207544, + "step": 20330 + }, + { + "epoch": 0.985602403435556, + "grad_norm": 0.4649990200996399, + "learning_rate": 2.715476457537114e-08, + "loss": 1.2487, + "num_input_tokens_seen": 798598064, + "step": 20340 + }, + { + "epoch": 0.9860869670557308, + "grad_norm": 0.4426388740539551, + "learning_rate": 2.535724945608997e-08, + "loss": 1.2796, + "num_input_tokens_seen": 798970944, + "step": 20350 + }, + { + "epoch": 0.9865715306759056, + "grad_norm": 0.4075920879840851, + "learning_rate": 2.3621252160413443e-08, + "loss": 1.2605, + "num_input_tokens_seen": 799356380, + "step": 20360 + }, + { + "epoch": 0.9870560942960804, + "grad_norm": 0.40336674451828003, + "learning_rate": 2.194677696447378e-08, + "loss": 1.2305, + "num_input_tokens_seen": 799753192, + "step": 20370 + }, + { + "epoch": 0.9875406579162553, + "grad_norm": 0.4300762712955475, + "learning_rate": 2.0333827992852217e-08, + "loss": 1.22, + "num_input_tokens_seen": 800120392, + "step": 20380 + }, + { + "epoch": 0.9880252215364301, + "grad_norm": 0.4360959827899933, + "learning_rate": 1.878240921858454e-08, + "loss": 1.2784, + "num_input_tokens_seen": 800495740, + "step": 20390 + }, + { + "epoch": 0.9885097851566049, + "grad_norm": 0.4353114664554596, + "learning_rate": 1.7292524463144445e-08, + "loss": 1.2539, + "num_input_tokens_seen": 800902016, + "step": 20400 + }, + { + "epoch": 0.9889943487767797, + "grad_norm": 0.43556949496269226, + "learning_rate": 1.5864177396432422e-08, + "loss": 1.227, + "num_input_tokens_seen": 801289824, + "step": 20410 + }, + { + "epoch": 0.9894789123969545, + "grad_norm": 0.4041503071784973, + "learning_rate": 1.4497371536770221e-08, + "loss": 1.2591, + "num_input_tokens_seen": 801688324, + "step": 20420 + }, + { + "epoch": 0.9899634760171293, + "grad_norm": 0.42014312744140625, + "learning_rate": 1.3192110250886957e-08, + "loss": 1.1945, + "num_input_tokens_seen": 802094388, + "step": 20430 + }, + { + "epoch": 0.9904480396373041, + "grad_norm": 0.44136643409729004, + "learning_rate": 1.1948396753919123e-08, + "loss": 1.274, + "num_input_tokens_seen": 802477196, + "step": 20440 + }, + { + "epoch": 0.9909326032574789, + "grad_norm": 0.4420247972011566, + "learning_rate": 1.0766234109393925e-08, + "loss": 1.2007, + "num_input_tokens_seen": 802877584, + "step": 20450 + }, + { + "epoch": 0.9914171668776537, + "grad_norm": 0.4464600086212158, + "learning_rate": 9.645625229232069e-09, + "loss": 1.219, + "num_input_tokens_seen": 803263948, + "step": 20460 + }, + { + "epoch": 0.9919017304978286, + "grad_norm": 0.4290953278541565, + "learning_rate": 8.586572873725551e-09, + "loss": 1.2345, + "num_input_tokens_seen": 803671476, + "step": 20470 + }, + { + "epoch": 0.9923862941180034, + "grad_norm": 0.40474194288253784, + "learning_rate": 7.589079651543207e-09, + "loss": 1.2742, + "num_input_tokens_seen": 804084040, + "step": 20480 + }, + { + "epoch": 0.9928708577381782, + "grad_norm": 0.41104060411453247, + "learning_rate": 6.653148019727939e-09, + "loss": 1.2496, + "num_input_tokens_seen": 804500276, + "step": 20490 + }, + { + "epoch": 0.993355421358353, + "grad_norm": 0.3800486922264099, + "learning_rate": 5.778780283671736e-09, + "loss": 1.2768, + "num_input_tokens_seen": 804875212, + "step": 20500 + }, + { + "epoch": 0.9938399849785278, + "grad_norm": 0.421836256980896, + "learning_rate": 4.96597859712955e-09, + "loss": 1.2106, + "num_input_tokens_seen": 805248472, + "step": 20510 + }, + { + "epoch": 0.9943245485987026, + "grad_norm": 0.4047603905200958, + "learning_rate": 4.2147449622026435e-09, + "loss": 1.2057, + "num_input_tokens_seen": 805647944, + "step": 20520 + }, + { + "epoch": 0.9948091122188774, + "grad_norm": 0.3970736265182495, + "learning_rate": 3.525081229338589e-09, + "loss": 1.2238, + "num_input_tokens_seen": 806023600, + "step": 20530 + }, + { + "epoch": 0.9952936758390521, + "grad_norm": 0.4089398682117462, + "learning_rate": 2.8969890973257198e-09, + "loss": 1.2181, + "num_input_tokens_seen": 806426272, + "step": 20540 + }, + { + "epoch": 0.995778239459227, + "grad_norm": 0.39923080801963806, + "learning_rate": 2.3304701132847996e-09, + "loss": 1.2999, + "num_input_tokens_seen": 806826016, + "step": 20550 + }, + { + "epoch": 0.9962628030794018, + "grad_norm": 0.3768922686576843, + "learning_rate": 1.8255256726745772e-09, + "loss": 1.2206, + "num_input_tokens_seen": 807236380, + "step": 20560 + }, + { + "epoch": 0.9967473666995766, + "grad_norm": 0.39100906252861023, + "learning_rate": 1.3821570192806832e-09, + "loss": 1.266, + "num_input_tokens_seen": 807644712, + "step": 20570 + }, + { + "epoch": 0.9972319303197514, + "grad_norm": 0.40794339776039124, + "learning_rate": 1.0003652452128532e-09, + "loss": 1.2559, + "num_input_tokens_seen": 808025808, + "step": 20580 + }, + { + "epoch": 0.9977164939399262, + "grad_norm": 0.4562262296676636, + "learning_rate": 6.801512909021535e-10, + "loss": 1.2459, + "num_input_tokens_seen": 808411128, + "step": 20590 + }, + { + "epoch": 0.998201057560101, + "grad_norm": 0.41534632444381714, + "learning_rate": 4.21515945106532e-10, + "loss": 1.2734, + "num_input_tokens_seen": 808793980, + "step": 20600 + }, + { + "epoch": 0.9986856211802758, + "grad_norm": 0.4286879897117615, + "learning_rate": 2.2445984489971593e-10, + "loss": 1.2695, + "num_input_tokens_seen": 809188808, + "step": 20610 + }, + { + "epoch": 0.9991701848004506, + "grad_norm": 0.4052944779396057, + "learning_rate": 8.898347567121157e-11, + "loss": 1.2466, + "num_input_tokens_seen": 809544636, + "step": 20620 + }, + { + "epoch": 0.9996547484206254, + "grad_norm": 0.4112201929092407, + "learning_rate": 1.5087171129080092e-11, + "loss": 1.231, + "num_input_tokens_seen": 809914964, + "step": 20630 + }, + { + "epoch": 0.9999939429547479, + "num_input_tokens_seen": 810173896, + "step": 20637, + "total_flos": 4.073925813718745e+18, + "train_loss": 1.2524831550218467, + "train_runtime": 185903.2881, + "train_samples_per_second": 14.209, + "train_steps_per_second": 0.111 + } + ], + "logging_steps": 10, + "max_steps": 20637, + "num_input_tokens_seen": 810173896, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.073925813718745e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}