diff --git "a/QLoRA_french_sft/trainer_state.json" "b/QLoRA_french_sft/trainer_state.json" new file mode 100644--- /dev/null +++ "b/QLoRA_french_sft/trainer_state.json" @@ -0,0 +1,18601 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23217, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001292156609381057, + "grad_norm": 0.9450203776359558, + "learning_rate": 4.999997711254574e-05, + "loss": 1.1858, + "num_input_tokens_seen": 81920, + "step": 10 + }, + { + "epoch": 0.002584313218762114, + "grad_norm": 0.6408165097236633, + "learning_rate": 4.999990845022488e-05, + "loss": 1.5568, + "num_input_tokens_seen": 163840, + "step": 20 + }, + { + "epoch": 0.003876469828143171, + "grad_norm": 0.5522646307945251, + "learning_rate": 4.999979401316311e-05, + "loss": 1.2099, + "num_input_tokens_seen": 245760, + "step": 30 + }, + { + "epoch": 0.005168626437524228, + "grad_norm": 0.5057278871536255, + "learning_rate": 4.999963380156999e-05, + "loss": 1.1902, + "num_input_tokens_seen": 327680, + "step": 40 + }, + { + "epoch": 0.006460783046905285, + "grad_norm": 0.40978753566741943, + "learning_rate": 4.9999427815738856e-05, + "loss": 1.0802, + "num_input_tokens_seen": 409600, + "step": 50 + }, + { + "epoch": 0.007752939656286342, + "grad_norm": 0.4875193238258362, + "learning_rate": 4.999917605604688e-05, + "loss": 1.0393, + "num_input_tokens_seen": 491520, + "step": 60 + }, + { + "epoch": 0.009045096265667399, + "grad_norm": 0.5074454545974731, + "learning_rate": 4.999887852295502e-05, + "loss": 1.1473, + "num_input_tokens_seen": 573440, + "step": 70 + }, + { + "epoch": 0.010337252875048455, + "grad_norm": 0.6053259968757629, + "learning_rate": 4.9998535217008054e-05, + "loss": 0.8868, + "num_input_tokens_seen": 655360, + "step": 80 + }, + { + "epoch": 0.011629409484429512, + "grad_norm": 0.5783246159553528, + "learning_rate": 4.999814613883459e-05, + "loss": 0.9585, + "num_input_tokens_seen": 737280, + "step": 90 + }, + { + "epoch": 0.01292156609381057, + "grad_norm": 0.5514815449714661, + "learning_rate": 4.999771128914701e-05, + "loss": 1.0241, + "num_input_tokens_seen": 819200, + "step": 100 + }, + { + "epoch": 0.014213722703191626, + "grad_norm": 0.6433015465736389, + "learning_rate": 4.999723066874154e-05, + "loss": 1.034, + "num_input_tokens_seen": 901120, + "step": 110 + }, + { + "epoch": 0.015505879312572683, + "grad_norm": 0.5091115832328796, + "learning_rate": 4.9996704278498185e-05, + "loss": 1.2244, + "num_input_tokens_seen": 983040, + "step": 120 + }, + { + "epoch": 0.016798035921953742, + "grad_norm": 0.40583527088165283, + "learning_rate": 4.9996132119380764e-05, + "loss": 1.266, + "num_input_tokens_seen": 1064960, + "step": 130 + }, + { + "epoch": 0.018090192531334797, + "grad_norm": 0.6629675626754761, + "learning_rate": 4.999551419243691e-05, + "loss": 1.1523, + "num_input_tokens_seen": 1146880, + "step": 140 + }, + { + "epoch": 0.019382349140715856, + "grad_norm": 0.6925719976425171, + "learning_rate": 4.9994850498798026e-05, + "loss": 1.1496, + "num_input_tokens_seen": 1228800, + "step": 150 + }, + { + "epoch": 0.02067450575009691, + "grad_norm": 0.43508079648017883, + "learning_rate": 4.999414103967934e-05, + "loss": 0.9302, + "num_input_tokens_seen": 1310720, + "step": 160 + }, + { + "epoch": 0.02196666235947797, + "grad_norm": 0.6177295446395874, + "learning_rate": 4.9993385816379876e-05, + "loss": 0.9475, + "num_input_tokens_seen": 1392640, + "step": 170 + }, + { + "epoch": 0.023258818968859025, + "grad_norm": 0.43350470066070557, + "learning_rate": 4.999258483028243e-05, + "loss": 1.0703, + "num_input_tokens_seen": 1474560, + "step": 180 + }, + { + "epoch": 0.024550975578240083, + "grad_norm": 0.8424047231674194, + "learning_rate": 4.999173808285362e-05, + "loss": 1.1235, + "num_input_tokens_seen": 1556480, + "step": 190 + }, + { + "epoch": 0.02584313218762114, + "grad_norm": 0.5010475516319275, + "learning_rate": 4.999084557564383e-05, + "loss": 1.3278, + "num_input_tokens_seen": 1638400, + "step": 200 + }, + { + "epoch": 0.027135288797002197, + "grad_norm": 0.5917531847953796, + "learning_rate": 4.9989907310287243e-05, + "loss": 1.1928, + "num_input_tokens_seen": 1720320, + "step": 210 + }, + { + "epoch": 0.028427445406383253, + "grad_norm": 0.40859103202819824, + "learning_rate": 4.998892328850181e-05, + "loss": 1.215, + "num_input_tokens_seen": 1802240, + "step": 220 + }, + { + "epoch": 0.02971960201576431, + "grad_norm": 0.7122620344161987, + "learning_rate": 4.9987893512089276e-05, + "loss": 1.3318, + "num_input_tokens_seen": 1884160, + "step": 230 + }, + { + "epoch": 0.031011758625145366, + "grad_norm": 0.6804201602935791, + "learning_rate": 4.998681798293516e-05, + "loss": 1.3226, + "num_input_tokens_seen": 1966080, + "step": 240 + }, + { + "epoch": 0.03230391523452642, + "grad_norm": 0.5440483093261719, + "learning_rate": 4.998569670300876e-05, + "loss": 1.1795, + "num_input_tokens_seen": 2048000, + "step": 250 + }, + { + "epoch": 0.033596071843907484, + "grad_norm": 1.6514626741409302, + "learning_rate": 4.9984529674363114e-05, + "loss": 0.9646, + "num_input_tokens_seen": 2129920, + "step": 260 + }, + { + "epoch": 0.03488822845328854, + "grad_norm": 0.46813198924064636, + "learning_rate": 4.998331689913506e-05, + "loss": 1.1236, + "num_input_tokens_seen": 2211840, + "step": 270 + }, + { + "epoch": 0.036180385062669594, + "grad_norm": 0.6068572998046875, + "learning_rate": 4.998205837954518e-05, + "loss": 0.9761, + "num_input_tokens_seen": 2293760, + "step": 280 + }, + { + "epoch": 0.03747254167205065, + "grad_norm": 0.4501520097255707, + "learning_rate": 4.998075411789783e-05, + "loss": 1.0846, + "num_input_tokens_seen": 2375680, + "step": 290 + }, + { + "epoch": 0.03876469828143171, + "grad_norm": 0.6213204264640808, + "learning_rate": 4.9979404116581104e-05, + "loss": 0.9604, + "num_input_tokens_seen": 2457600, + "step": 300 + }, + { + "epoch": 0.04005685489081277, + "grad_norm": 0.5889080166816711, + "learning_rate": 4.9978008378066844e-05, + "loss": 1.135, + "num_input_tokens_seen": 2539520, + "step": 310 + }, + { + "epoch": 0.04134901150019382, + "grad_norm": 0.5029394030570984, + "learning_rate": 4.997656690491064e-05, + "loss": 1.0162, + "num_input_tokens_seen": 2621440, + "step": 320 + }, + { + "epoch": 0.04264116810957488, + "grad_norm": 0.4100414216518402, + "learning_rate": 4.9975079699751825e-05, + "loss": 1.0928, + "num_input_tokens_seen": 2703360, + "step": 330 + }, + { + "epoch": 0.04393332471895594, + "grad_norm": 0.5180526971817017, + "learning_rate": 4.997354676531348e-05, + "loss": 1.4001, + "num_input_tokens_seen": 2785280, + "step": 340 + }, + { + "epoch": 0.045225481328336994, + "grad_norm": 0.47328633069992065, + "learning_rate": 4.997196810440239e-05, + "loss": 1.2969, + "num_input_tokens_seen": 2867200, + "step": 350 + }, + { + "epoch": 0.04651763793771805, + "grad_norm": 0.4311719834804535, + "learning_rate": 4.997034371990907e-05, + "loss": 1.0298, + "num_input_tokens_seen": 2949120, + "step": 360 + }, + { + "epoch": 0.04780979454709911, + "grad_norm": 0.8229668736457825, + "learning_rate": 4.9968673614807787e-05, + "loss": 1.1618, + "num_input_tokens_seen": 3031040, + "step": 370 + }, + { + "epoch": 0.04910195115648017, + "grad_norm": 0.4691760540008545, + "learning_rate": 4.9966957792156475e-05, + "loss": 0.8982, + "num_input_tokens_seen": 3112960, + "step": 380 + }, + { + "epoch": 0.05039410776586122, + "grad_norm": 0.539076566696167, + "learning_rate": 4.99651962550968e-05, + "loss": 1.15, + "num_input_tokens_seen": 3194880, + "step": 390 + }, + { + "epoch": 0.05168626437524228, + "grad_norm": 0.3812846541404724, + "learning_rate": 4.996338900685414e-05, + "loss": 1.3498, + "num_input_tokens_seen": 3276800, + "step": 400 + }, + { + "epoch": 0.05297842098462334, + "grad_norm": 0.4904411733150482, + "learning_rate": 4.996153605073756e-05, + "loss": 0.8624, + "num_input_tokens_seen": 3358720, + "step": 410 + }, + { + "epoch": 0.054270577594004395, + "grad_norm": 0.49068745970726013, + "learning_rate": 4.9959637390139814e-05, + "loss": 1.1246, + "num_input_tokens_seen": 3440640, + "step": 420 + }, + { + "epoch": 0.05556273420338545, + "grad_norm": 0.47231003642082214, + "learning_rate": 4.995769302853733e-05, + "loss": 0.8941, + "num_input_tokens_seen": 3522560, + "step": 430 + }, + { + "epoch": 0.056854890812766505, + "grad_norm": 0.4713283181190491, + "learning_rate": 4.995570296949024e-05, + "loss": 1.1223, + "num_input_tokens_seen": 3604480, + "step": 440 + }, + { + "epoch": 0.05814704742214757, + "grad_norm": 0.9544013738632202, + "learning_rate": 4.995366721664234e-05, + "loss": 0.7063, + "num_input_tokens_seen": 3686400, + "step": 450 + }, + { + "epoch": 0.05943920403152862, + "grad_norm": 0.43465104699134827, + "learning_rate": 4.995158577372107e-05, + "loss": 0.8703, + "num_input_tokens_seen": 3768320, + "step": 460 + }, + { + "epoch": 0.06073136064090968, + "grad_norm": 6.194292068481445, + "learning_rate": 4.9949458644537556e-05, + "loss": 0.8798, + "num_input_tokens_seen": 3850240, + "step": 470 + }, + { + "epoch": 0.06202351725029073, + "grad_norm": 0.4813326895236969, + "learning_rate": 4.9947285832986553e-05, + "loss": 1.154, + "num_input_tokens_seen": 3932160, + "step": 480 + }, + { + "epoch": 0.0633156738596718, + "grad_norm": 0.37111344933509827, + "learning_rate": 4.9945067343046494e-05, + "loss": 1.179, + "num_input_tokens_seen": 4014080, + "step": 490 + }, + { + "epoch": 0.06460783046905284, + "grad_norm": 0.473379909992218, + "learning_rate": 4.9942803178779396e-05, + "loss": 1.3203, + "num_input_tokens_seen": 4096000, + "step": 500 + }, + { + "epoch": 0.0658999870784339, + "grad_norm": 0.31630370020866394, + "learning_rate": 4.994049334433095e-05, + "loss": 0.95, + "num_input_tokens_seen": 4177920, + "step": 510 + }, + { + "epoch": 0.06719214368781497, + "grad_norm": 0.6394200921058655, + "learning_rate": 4.9938137843930466e-05, + "loss": 0.721, + "num_input_tokens_seen": 4259840, + "step": 520 + }, + { + "epoch": 0.06848430029719602, + "grad_norm": 0.4896668493747711, + "learning_rate": 4.993573668189083e-05, + "loss": 1.1505, + "num_input_tokens_seen": 4341760, + "step": 530 + }, + { + "epoch": 0.06977645690657708, + "grad_norm": 0.4696863293647766, + "learning_rate": 4.9933289862608584e-05, + "loss": 1.1169, + "num_input_tokens_seen": 4423680, + "step": 540 + }, + { + "epoch": 0.07106861351595814, + "grad_norm": 0.4316001236438751, + "learning_rate": 4.9930797390563834e-05, + "loss": 1.0893, + "num_input_tokens_seen": 4505600, + "step": 550 + }, + { + "epoch": 0.07236077012533919, + "grad_norm": 0.47244027256965637, + "learning_rate": 4.9928259270320295e-05, + "loss": 0.6521, + "num_input_tokens_seen": 4587520, + "step": 560 + }, + { + "epoch": 0.07365292673472025, + "grad_norm": 0.4212469160556793, + "learning_rate": 4.992567550652525e-05, + "loss": 0.9833, + "num_input_tokens_seen": 4669440, + "step": 570 + }, + { + "epoch": 0.0749450833441013, + "grad_norm": 0.49635788798332214, + "learning_rate": 4.992304610390955e-05, + "loss": 1.1048, + "num_input_tokens_seen": 4751360, + "step": 580 + }, + { + "epoch": 0.07623723995348236, + "grad_norm": 0.46898409724235535, + "learning_rate": 4.9920371067287645e-05, + "loss": 0.8898, + "num_input_tokens_seen": 4833280, + "step": 590 + }, + { + "epoch": 0.07752939656286342, + "grad_norm": 0.7145845293998718, + "learning_rate": 4.9917650401557505e-05, + "loss": 0.9906, + "num_input_tokens_seen": 4915200, + "step": 600 + }, + { + "epoch": 0.07882155317224447, + "grad_norm": 7.353865623474121, + "learning_rate": 4.9914884111700656e-05, + "loss": 0.9923, + "num_input_tokens_seen": 4997120, + "step": 610 + }, + { + "epoch": 0.08011370978162553, + "grad_norm": 0.7385011911392212, + "learning_rate": 4.991207220278217e-05, + "loss": 0.8434, + "num_input_tokens_seen": 5079040, + "step": 620 + }, + { + "epoch": 0.0814058663910066, + "grad_norm": 0.6693135499954224, + "learning_rate": 4.990921467995064e-05, + "loss": 0.6813, + "num_input_tokens_seen": 5160960, + "step": 630 + }, + { + "epoch": 0.08269802300038764, + "grad_norm": 0.4748353958129883, + "learning_rate": 4.9906311548438184e-05, + "loss": 0.8735, + "num_input_tokens_seen": 5242880, + "step": 640 + }, + { + "epoch": 0.0839901796097687, + "grad_norm": 0.5311540365219116, + "learning_rate": 4.990336281356042e-05, + "loss": 1.1035, + "num_input_tokens_seen": 5324800, + "step": 650 + }, + { + "epoch": 0.08528233621914975, + "grad_norm": 0.4842514991760254, + "learning_rate": 4.9900368480716466e-05, + "loss": 0.8329, + "num_input_tokens_seen": 5406720, + "step": 660 + }, + { + "epoch": 0.08657449282853082, + "grad_norm": 0.6414400935173035, + "learning_rate": 4.9897328555388943e-05, + "loss": 1.1088, + "num_input_tokens_seen": 5488640, + "step": 670 + }, + { + "epoch": 0.08786664943791188, + "grad_norm": 0.39652836322784424, + "learning_rate": 4.989424304314395e-05, + "loss": 1.0048, + "num_input_tokens_seen": 5570560, + "step": 680 + }, + { + "epoch": 0.08915880604729293, + "grad_norm": 0.6044394969940186, + "learning_rate": 4.9891111949631023e-05, + "loss": 0.9396, + "num_input_tokens_seen": 5652480, + "step": 690 + }, + { + "epoch": 0.09045096265667399, + "grad_norm": 0.4656500816345215, + "learning_rate": 4.988793528058321e-05, + "loss": 1.1961, + "num_input_tokens_seen": 5734400, + "step": 700 + }, + { + "epoch": 0.09174311926605505, + "grad_norm": 0.5340274572372437, + "learning_rate": 4.988471304181697e-05, + "loss": 0.9563, + "num_input_tokens_seen": 5816320, + "step": 710 + }, + { + "epoch": 0.0930352758754361, + "grad_norm": 0.4175763726234436, + "learning_rate": 4.988144523923221e-05, + "loss": 0.9487, + "num_input_tokens_seen": 5898240, + "step": 720 + }, + { + "epoch": 0.09432743248481716, + "grad_norm": 0.4877130389213562, + "learning_rate": 4.987813187881226e-05, + "loss": 0.9847, + "num_input_tokens_seen": 5980160, + "step": 730 + }, + { + "epoch": 0.09561958909419822, + "grad_norm": 0.5183830261230469, + "learning_rate": 4.987477296662387e-05, + "loss": 1.3118, + "num_input_tokens_seen": 6062080, + "step": 740 + }, + { + "epoch": 0.09691174570357927, + "grad_norm": 0.4533514678478241, + "learning_rate": 4.987136850881721e-05, + "loss": 1.0996, + "num_input_tokens_seen": 6144000, + "step": 750 + }, + { + "epoch": 0.09820390231296033, + "grad_norm": 0.4317517578601837, + "learning_rate": 4.986791851162582e-05, + "loss": 0.9023, + "num_input_tokens_seen": 6225920, + "step": 760 + }, + { + "epoch": 0.09949605892234138, + "grad_norm": 0.7972986698150635, + "learning_rate": 4.986442298136663e-05, + "loss": 1.0904, + "num_input_tokens_seen": 6307840, + "step": 770 + }, + { + "epoch": 0.10078821553172244, + "grad_norm": 1.0747971534729004, + "learning_rate": 4.986088192443995e-05, + "loss": 0.9374, + "num_input_tokens_seen": 6389760, + "step": 780 + }, + { + "epoch": 0.1020803721411035, + "grad_norm": 0.3895331919193268, + "learning_rate": 4.985729534732944e-05, + "loss": 0.8173, + "num_input_tokens_seen": 6471680, + "step": 790 + }, + { + "epoch": 0.10337252875048455, + "grad_norm": 0.6985827684402466, + "learning_rate": 4.98536632566021e-05, + "loss": 0.9162, + "num_input_tokens_seen": 6553600, + "step": 800 + }, + { + "epoch": 0.10466468535986562, + "grad_norm": 0.4658990502357483, + "learning_rate": 4.9849985658908296e-05, + "loss": 0.7986, + "num_input_tokens_seen": 6635520, + "step": 810 + }, + { + "epoch": 0.10595684196924668, + "grad_norm": 0.3981533944606781, + "learning_rate": 4.9846262560981674e-05, + "loss": 0.7034, + "num_input_tokens_seen": 6717440, + "step": 820 + }, + { + "epoch": 0.10724899857862773, + "grad_norm": 0.6919500231742859, + "learning_rate": 4.9842493969639215e-05, + "loss": 0.9466, + "num_input_tokens_seen": 6799360, + "step": 830 + }, + { + "epoch": 0.10854115518800879, + "grad_norm": 0.5051977038383484, + "learning_rate": 4.9838679891781214e-05, + "loss": 1.2844, + "num_input_tokens_seen": 6881280, + "step": 840 + }, + { + "epoch": 0.10983331179738984, + "grad_norm": 0.49048569798469543, + "learning_rate": 4.983482033439122e-05, + "loss": 0.7703, + "num_input_tokens_seen": 6963200, + "step": 850 + }, + { + "epoch": 0.1111254684067709, + "grad_norm": 0.29757964611053467, + "learning_rate": 4.9830915304536065e-05, + "loss": 1.2312, + "num_input_tokens_seen": 7045120, + "step": 860 + }, + { + "epoch": 0.11241762501615196, + "grad_norm": 0.4364021122455597, + "learning_rate": 4.982696480936586e-05, + "loss": 1.3658, + "num_input_tokens_seen": 7127040, + "step": 870 + }, + { + "epoch": 0.11370978162553301, + "grad_norm": 0.7192522287368774, + "learning_rate": 4.9822968856113926e-05, + "loss": 1.0453, + "num_input_tokens_seen": 7208960, + "step": 880 + }, + { + "epoch": 0.11500193823491407, + "grad_norm": 0.5548785924911499, + "learning_rate": 4.9818927452096855e-05, + "loss": 1.1852, + "num_input_tokens_seen": 7290880, + "step": 890 + }, + { + "epoch": 0.11629409484429513, + "grad_norm": 0.4914171099662781, + "learning_rate": 4.981484060471444e-05, + "loss": 1.2232, + "num_input_tokens_seen": 7372800, + "step": 900 + }, + { + "epoch": 0.11758625145367618, + "grad_norm": 0.4073733687400818, + "learning_rate": 4.981070832144967e-05, + "loss": 1.1144, + "num_input_tokens_seen": 7454720, + "step": 910 + }, + { + "epoch": 0.11887840806305724, + "grad_norm": 0.7221460342407227, + "learning_rate": 4.980653060986877e-05, + "loss": 1.0344, + "num_input_tokens_seen": 7536640, + "step": 920 + }, + { + "epoch": 0.1201705646724383, + "grad_norm": 0.5011032223701477, + "learning_rate": 4.9802307477621084e-05, + "loss": 1.1751, + "num_input_tokens_seen": 7618560, + "step": 930 + }, + { + "epoch": 0.12146272128181936, + "grad_norm": 0.4509833753108978, + "learning_rate": 4.9798038932439175e-05, + "loss": 1.1844, + "num_input_tokens_seen": 7700480, + "step": 940 + }, + { + "epoch": 0.12275487789120042, + "grad_norm": 0.6086503863334656, + "learning_rate": 4.979372498213871e-05, + "loss": 1.0034, + "num_input_tokens_seen": 7782400, + "step": 950 + }, + { + "epoch": 0.12404703450058147, + "grad_norm": 0.9258208274841309, + "learning_rate": 4.978936563461854e-05, + "loss": 0.9102, + "num_input_tokens_seen": 7864320, + "step": 960 + }, + { + "epoch": 0.12533919110996253, + "grad_norm": 0.5192177295684814, + "learning_rate": 4.97849608978606e-05, + "loss": 0.8154, + "num_input_tokens_seen": 7946240, + "step": 970 + }, + { + "epoch": 0.1266313477193436, + "grad_norm": 0.37925633788108826, + "learning_rate": 4.978051077992994e-05, + "loss": 1.0143, + "num_input_tokens_seen": 8028160, + "step": 980 + }, + { + "epoch": 0.12792350432872465, + "grad_norm": 2.0898265838623047, + "learning_rate": 4.9776015288974736e-05, + "loss": 0.7723, + "num_input_tokens_seen": 8110080, + "step": 990 + }, + { + "epoch": 0.1292156609381057, + "grad_norm": 0.667012631893158, + "learning_rate": 4.9771474433226194e-05, + "loss": 1.2866, + "num_input_tokens_seen": 8192000, + "step": 1000 + }, + { + "epoch": 0.13050781754748675, + "grad_norm": 0.4613621234893799, + "learning_rate": 4.976688822099861e-05, + "loss": 1.4743, + "num_input_tokens_seen": 8273920, + "step": 1010 + }, + { + "epoch": 0.1317999741568678, + "grad_norm": 0.7849400043487549, + "learning_rate": 4.976225666068932e-05, + "loss": 0.904, + "num_input_tokens_seen": 8355840, + "step": 1020 + }, + { + "epoch": 0.13309213076624887, + "grad_norm": 0.6418063044548035, + "learning_rate": 4.9757579760778697e-05, + "loss": 0.9119, + "num_input_tokens_seen": 8437760, + "step": 1030 + }, + { + "epoch": 0.13438428737562993, + "grad_norm": 0.47972187399864197, + "learning_rate": 4.9752857529830125e-05, + "loss": 1.1209, + "num_input_tokens_seen": 8519680, + "step": 1040 + }, + { + "epoch": 0.135676443985011, + "grad_norm": 0.4660983681678772, + "learning_rate": 4.9748089976489996e-05, + "loss": 1.2202, + "num_input_tokens_seen": 8601600, + "step": 1050 + }, + { + "epoch": 0.13696860059439203, + "grad_norm": 0.4959300756454468, + "learning_rate": 4.9743277109487674e-05, + "loss": 1.2102, + "num_input_tokens_seen": 8683520, + "step": 1060 + }, + { + "epoch": 0.1382607572037731, + "grad_norm": 0.541445791721344, + "learning_rate": 4.973841893763551e-05, + "loss": 0.9835, + "num_input_tokens_seen": 8765440, + "step": 1070 + }, + { + "epoch": 0.13955291381315416, + "grad_norm": 0.48562517762184143, + "learning_rate": 4.9733515469828795e-05, + "loss": 0.9155, + "num_input_tokens_seen": 8847360, + "step": 1080 + }, + { + "epoch": 0.14084507042253522, + "grad_norm": 0.4447590708732605, + "learning_rate": 4.972856671504576e-05, + "loss": 1.196, + "num_input_tokens_seen": 8929280, + "step": 1090 + }, + { + "epoch": 0.14213722703191628, + "grad_norm": 0.5334303379058838, + "learning_rate": 4.9723572682347566e-05, + "loss": 0.8913, + "num_input_tokens_seen": 9011200, + "step": 1100 + }, + { + "epoch": 0.14342938364129731, + "grad_norm": 0.376122385263443, + "learning_rate": 4.971853338087825e-05, + "loss": 1.1649, + "num_input_tokens_seen": 9093120, + "step": 1110 + }, + { + "epoch": 0.14472154025067838, + "grad_norm": 0.5344268083572388, + "learning_rate": 4.971344881986477e-05, + "loss": 1.084, + "num_input_tokens_seen": 9175040, + "step": 1120 + }, + { + "epoch": 0.14601369686005944, + "grad_norm": 0.8270778059959412, + "learning_rate": 4.9708319008616926e-05, + "loss": 0.9945, + "num_input_tokens_seen": 9256960, + "step": 1130 + }, + { + "epoch": 0.1473058534694405, + "grad_norm": 0.6520871520042419, + "learning_rate": 4.97031439565274e-05, + "loss": 0.8976, + "num_input_tokens_seen": 9338880, + "step": 1140 + }, + { + "epoch": 0.14859801007882156, + "grad_norm": 0.4764862656593323, + "learning_rate": 4.969792367307168e-05, + "loss": 1.063, + "num_input_tokens_seen": 9420800, + "step": 1150 + }, + { + "epoch": 0.1498901666882026, + "grad_norm": 0.48043620586395264, + "learning_rate": 4.9692658167808094e-05, + "loss": 0.8809, + "num_input_tokens_seen": 9502720, + "step": 1160 + }, + { + "epoch": 0.15118232329758366, + "grad_norm": 0.44339507818222046, + "learning_rate": 4.9687347450377755e-05, + "loss": 1.3418, + "num_input_tokens_seen": 9584640, + "step": 1170 + }, + { + "epoch": 0.15247447990696472, + "grad_norm": 0.40918466448783875, + "learning_rate": 4.968199153050457e-05, + "loss": 0.6324, + "num_input_tokens_seen": 9666560, + "step": 1180 + }, + { + "epoch": 0.15376663651634578, + "grad_norm": 0.5043272972106934, + "learning_rate": 4.967659041799522e-05, + "loss": 0.9144, + "num_input_tokens_seen": 9748480, + "step": 1190 + }, + { + "epoch": 0.15505879312572685, + "grad_norm": 0.47593727707862854, + "learning_rate": 4.9671144122739106e-05, + "loss": 0.8396, + "num_input_tokens_seen": 9830400, + "step": 1200 + }, + { + "epoch": 0.1563509497351079, + "grad_norm": 0.3913171887397766, + "learning_rate": 4.966565265470838e-05, + "loss": 1.0608, + "num_input_tokens_seen": 9912320, + "step": 1210 + }, + { + "epoch": 0.15764310634448894, + "grad_norm": 0.8421456813812256, + "learning_rate": 4.9660116023957906e-05, + "loss": 0.9964, + "num_input_tokens_seen": 9994240, + "step": 1220 + }, + { + "epoch": 0.15893526295387, + "grad_norm": 0.46809521317481995, + "learning_rate": 4.9654534240625225e-05, + "loss": 0.8427, + "num_input_tokens_seen": 10076160, + "step": 1230 + }, + { + "epoch": 0.16022741956325107, + "grad_norm": 0.5621390342712402, + "learning_rate": 4.964890731493057e-05, + "loss": 1.2353, + "num_input_tokens_seen": 10158080, + "step": 1240 + }, + { + "epoch": 0.16151957617263213, + "grad_norm": 0.4590226113796234, + "learning_rate": 4.964323525717681e-05, + "loss": 1.0439, + "num_input_tokens_seen": 10240000, + "step": 1250 + }, + { + "epoch": 0.1628117327820132, + "grad_norm": 0.40506160259246826, + "learning_rate": 4.9637518077749476e-05, + "loss": 0.914, + "num_input_tokens_seen": 10321920, + "step": 1260 + }, + { + "epoch": 0.16410388939139423, + "grad_norm": 0.8865328431129456, + "learning_rate": 4.96317557871167e-05, + "loss": 1.0189, + "num_input_tokens_seen": 10403840, + "step": 1270 + }, + { + "epoch": 0.1653960460007753, + "grad_norm": 0.37955254316329956, + "learning_rate": 4.9625948395829216e-05, + "loss": 1.0049, + "num_input_tokens_seen": 10485760, + "step": 1280 + }, + { + "epoch": 0.16668820261015635, + "grad_norm": 0.5044606328010559, + "learning_rate": 4.962009591452032e-05, + "loss": 0.941, + "num_input_tokens_seen": 10567680, + "step": 1290 + }, + { + "epoch": 0.1679803592195374, + "grad_norm": 0.6987674236297607, + "learning_rate": 4.96141983539059e-05, + "loss": 1.1209, + "num_input_tokens_seen": 10649600, + "step": 1300 + }, + { + "epoch": 0.16927251582891847, + "grad_norm": 1.0083003044128418, + "learning_rate": 4.960825572478436e-05, + "loss": 1.0478, + "num_input_tokens_seen": 10731520, + "step": 1310 + }, + { + "epoch": 0.1705646724382995, + "grad_norm": 0.3237228989601135, + "learning_rate": 4.960226803803664e-05, + "loss": 0.9342, + "num_input_tokens_seen": 10813440, + "step": 1320 + }, + { + "epoch": 0.17185682904768057, + "grad_norm": 1.4129977226257324, + "learning_rate": 4.959623530462617e-05, + "loss": 1.0727, + "num_input_tokens_seen": 10895360, + "step": 1330 + }, + { + "epoch": 0.17314898565706163, + "grad_norm": 0.3343133330345154, + "learning_rate": 4.9590157535598855e-05, + "loss": 1.0719, + "num_input_tokens_seen": 10977280, + "step": 1340 + }, + { + "epoch": 0.1744411422664427, + "grad_norm": 0.3956698775291443, + "learning_rate": 4.958403474208308e-05, + "loss": 1.0195, + "num_input_tokens_seen": 11059200, + "step": 1350 + }, + { + "epoch": 0.17573329887582376, + "grad_norm": 0.45558327436447144, + "learning_rate": 4.957786693528965e-05, + "loss": 1.1703, + "num_input_tokens_seen": 11141120, + "step": 1360 + }, + { + "epoch": 0.17702545548520482, + "grad_norm": 0.4159637689590454, + "learning_rate": 4.95716541265118e-05, + "loss": 0.8547, + "num_input_tokens_seen": 11223040, + "step": 1370 + }, + { + "epoch": 0.17831761209458585, + "grad_norm": 0.510883629322052, + "learning_rate": 4.9565396327125155e-05, + "loss": 1.056, + "num_input_tokens_seen": 11304960, + "step": 1380 + }, + { + "epoch": 0.17960976870396692, + "grad_norm": 0.4751374125480652, + "learning_rate": 4.955909354858772e-05, + "loss": 0.8731, + "num_input_tokens_seen": 11386880, + "step": 1390 + }, + { + "epoch": 0.18090192531334798, + "grad_norm": 0.3273867070674896, + "learning_rate": 4.955274580243987e-05, + "loss": 0.7233, + "num_input_tokens_seen": 11468800, + "step": 1400 + }, + { + "epoch": 0.18219408192272904, + "grad_norm": 0.38997891545295715, + "learning_rate": 4.95463531003043e-05, + "loss": 0.8429, + "num_input_tokens_seen": 11550720, + "step": 1410 + }, + { + "epoch": 0.1834862385321101, + "grad_norm": 0.4781230092048645, + "learning_rate": 4.953991545388603e-05, + "loss": 0.9205, + "num_input_tokens_seen": 11632640, + "step": 1420 + }, + { + "epoch": 0.18477839514149114, + "grad_norm": 0.39320600032806396, + "learning_rate": 4.9533432874972366e-05, + "loss": 1.0245, + "num_input_tokens_seen": 11714560, + "step": 1430 + }, + { + "epoch": 0.1860705517508722, + "grad_norm": 1.2111022472381592, + "learning_rate": 4.952690537543287e-05, + "loss": 1.161, + "num_input_tokens_seen": 11796480, + "step": 1440 + }, + { + "epoch": 0.18736270836025326, + "grad_norm": 0.42389845848083496, + "learning_rate": 4.952033296721938e-05, + "loss": 1.4307, + "num_input_tokens_seen": 11878400, + "step": 1450 + }, + { + "epoch": 0.18865486496963432, + "grad_norm": 0.3861338496208191, + "learning_rate": 4.951371566236597e-05, + "loss": 0.9983, + "num_input_tokens_seen": 11960320, + "step": 1460 + }, + { + "epoch": 0.18994702157901538, + "grad_norm": 0.39819157123565674, + "learning_rate": 4.9507053472988867e-05, + "loss": 0.927, + "num_input_tokens_seen": 12042240, + "step": 1470 + }, + { + "epoch": 0.19123917818839645, + "grad_norm": 0.47134819626808167, + "learning_rate": 4.9500346411286534e-05, + "loss": 1.0184, + "num_input_tokens_seen": 12124160, + "step": 1480 + }, + { + "epoch": 0.19253133479777748, + "grad_norm": 0.6282069683074951, + "learning_rate": 4.949359448953959e-05, + "loss": 0.9956, + "num_input_tokens_seen": 12206080, + "step": 1490 + }, + { + "epoch": 0.19382349140715854, + "grad_norm": 0.48138657212257385, + "learning_rate": 4.9486797720110746e-05, + "loss": 0.7369, + "num_input_tokens_seen": 12288000, + "step": 1500 + }, + { + "epoch": 0.1951156480165396, + "grad_norm": 0.42543670535087585, + "learning_rate": 4.947995611544489e-05, + "loss": 0.9916, + "num_input_tokens_seen": 12369920, + "step": 1510 + }, + { + "epoch": 0.19640780462592067, + "grad_norm": 0.6821240782737732, + "learning_rate": 4.947306968806896e-05, + "loss": 1.0502, + "num_input_tokens_seen": 12451840, + "step": 1520 + }, + { + "epoch": 0.19769996123530173, + "grad_norm": 0.7128183245658875, + "learning_rate": 4.946613845059199e-05, + "loss": 1.1209, + "num_input_tokens_seen": 12533760, + "step": 1530 + }, + { + "epoch": 0.19899211784468276, + "grad_norm": 0.5544474720954895, + "learning_rate": 4.945916241570504e-05, + "loss": 1.152, + "num_input_tokens_seen": 12615680, + "step": 1540 + }, + { + "epoch": 0.20028427445406383, + "grad_norm": 0.4839724600315094, + "learning_rate": 4.945214159618121e-05, + "loss": 1.1884, + "num_input_tokens_seen": 12697600, + "step": 1550 + }, + { + "epoch": 0.2015764310634449, + "grad_norm": 0.381164014339447, + "learning_rate": 4.9445076004875596e-05, + "loss": 1.1069, + "num_input_tokens_seen": 12779520, + "step": 1560 + }, + { + "epoch": 0.20286858767282595, + "grad_norm": 0.5115939974784851, + "learning_rate": 4.9437965654725264e-05, + "loss": 1.0457, + "num_input_tokens_seen": 12861440, + "step": 1570 + }, + { + "epoch": 0.204160744282207, + "grad_norm": 0.45792338252067566, + "learning_rate": 4.943081055874925e-05, + "loss": 0.8206, + "num_input_tokens_seen": 12943360, + "step": 1580 + }, + { + "epoch": 0.20545290089158805, + "grad_norm": 0.5342550873756409, + "learning_rate": 4.9423610730048495e-05, + "loss": 1.2474, + "num_input_tokens_seen": 13025280, + "step": 1590 + }, + { + "epoch": 0.2067450575009691, + "grad_norm": 0.6011587381362915, + "learning_rate": 4.941636618180586e-05, + "loss": 0.9485, + "num_input_tokens_seen": 13107200, + "step": 1600 + }, + { + "epoch": 0.20803721411035017, + "grad_norm": 1.3028993606567383, + "learning_rate": 4.94090769272861e-05, + "loss": 1.0329, + "num_input_tokens_seen": 13189120, + "step": 1610 + }, + { + "epoch": 0.20932937071973123, + "grad_norm": 0.4599340260028839, + "learning_rate": 4.940174297983581e-05, + "loss": 1.1409, + "num_input_tokens_seen": 13271040, + "step": 1620 + }, + { + "epoch": 0.2106215273291123, + "grad_norm": 0.4517963230609894, + "learning_rate": 4.93943643528834e-05, + "loss": 0.6724, + "num_input_tokens_seen": 13352960, + "step": 1630 + }, + { + "epoch": 0.21191368393849336, + "grad_norm": 0.4794328212738037, + "learning_rate": 4.938694105993914e-05, + "loss": 0.9835, + "num_input_tokens_seen": 13434880, + "step": 1640 + }, + { + "epoch": 0.2132058405478744, + "grad_norm": 0.43061280250549316, + "learning_rate": 4.937947311459503e-05, + "loss": 1.2099, + "num_input_tokens_seen": 13516800, + "step": 1650 + }, + { + "epoch": 0.21449799715725545, + "grad_norm": 0.4864104986190796, + "learning_rate": 4.937196053052486e-05, + "loss": 0.9276, + "num_input_tokens_seen": 13598720, + "step": 1660 + }, + { + "epoch": 0.21579015376663652, + "grad_norm": 0.40697136521339417, + "learning_rate": 4.9364403321484145e-05, + "loss": 1.4808, + "num_input_tokens_seen": 13680640, + "step": 1670 + }, + { + "epoch": 0.21708231037601758, + "grad_norm": 0.613544762134552, + "learning_rate": 4.9356801501310105e-05, + "loss": 1.248, + "num_input_tokens_seen": 13762560, + "step": 1680 + }, + { + "epoch": 0.21837446698539864, + "grad_norm": 0.25986501574516296, + "learning_rate": 4.934915508392164e-05, + "loss": 0.8261, + "num_input_tokens_seen": 13844480, + "step": 1690 + }, + { + "epoch": 0.21966662359477968, + "grad_norm": 0.4318770468235016, + "learning_rate": 4.9341464083319314e-05, + "loss": 1.2263, + "num_input_tokens_seen": 13926400, + "step": 1700 + }, + { + "epoch": 0.22095878020416074, + "grad_norm": 0.32169216871261597, + "learning_rate": 4.933372851358532e-05, + "loss": 0.8885, + "num_input_tokens_seen": 14008320, + "step": 1710 + }, + { + "epoch": 0.2222509368135418, + "grad_norm": 0.507994532585144, + "learning_rate": 4.932594838888347e-05, + "loss": 1.369, + "num_input_tokens_seen": 14090240, + "step": 1720 + }, + { + "epoch": 0.22354309342292286, + "grad_norm": 0.5168066620826721, + "learning_rate": 4.931812372345913e-05, + "loss": 1.1812, + "num_input_tokens_seen": 14172160, + "step": 1730 + }, + { + "epoch": 0.22483525003230392, + "grad_norm": 0.5198150873184204, + "learning_rate": 4.9310254531639235e-05, + "loss": 0.7136, + "num_input_tokens_seen": 14254080, + "step": 1740 + }, + { + "epoch": 0.22612740664168499, + "grad_norm": 0.4334443509578705, + "learning_rate": 4.930234082783225e-05, + "loss": 1.4446, + "num_input_tokens_seen": 14336000, + "step": 1750 + }, + { + "epoch": 0.22741956325106602, + "grad_norm": 0.4047377109527588, + "learning_rate": 4.9294382626528144e-05, + "loss": 0.7729, + "num_input_tokens_seen": 14417920, + "step": 1760 + }, + { + "epoch": 0.22871171986044708, + "grad_norm": 1.014719009399414, + "learning_rate": 4.928637994229834e-05, + "loss": 0.9155, + "num_input_tokens_seen": 14499840, + "step": 1770 + }, + { + "epoch": 0.23000387646982814, + "grad_norm": 0.5884613394737244, + "learning_rate": 4.9278332789795746e-05, + "loss": 1.0874, + "num_input_tokens_seen": 14581760, + "step": 1780 + }, + { + "epoch": 0.2312960330792092, + "grad_norm": 0.4626663029193878, + "learning_rate": 4.9270241183754637e-05, + "loss": 0.7714, + "num_input_tokens_seen": 14663680, + "step": 1790 + }, + { + "epoch": 0.23258818968859027, + "grad_norm": 0.42093148827552795, + "learning_rate": 4.9262105138990745e-05, + "loss": 1.3797, + "num_input_tokens_seen": 14745600, + "step": 1800 + }, + { + "epoch": 0.2338803462979713, + "grad_norm": 0.3722878396511078, + "learning_rate": 4.925392467040112e-05, + "loss": 1.2271, + "num_input_tokens_seen": 14827520, + "step": 1810 + }, + { + "epoch": 0.23517250290735237, + "grad_norm": 0.4904583990573883, + "learning_rate": 4.924569979296417e-05, + "loss": 0.9982, + "num_input_tokens_seen": 14909440, + "step": 1820 + }, + { + "epoch": 0.23646465951673343, + "grad_norm": 0.5454955101013184, + "learning_rate": 4.9237430521739626e-05, + "loss": 0.9374, + "num_input_tokens_seen": 14991360, + "step": 1830 + }, + { + "epoch": 0.2377568161261145, + "grad_norm": 0.9513818621635437, + "learning_rate": 4.9229116871868485e-05, + "loss": 0.9464, + "num_input_tokens_seen": 15073280, + "step": 1840 + }, + { + "epoch": 0.23904897273549555, + "grad_norm": 1.1613630056381226, + "learning_rate": 4.922075885857301e-05, + "loss": 0.8913, + "num_input_tokens_seen": 15155200, + "step": 1850 + }, + { + "epoch": 0.2403411293448766, + "grad_norm": 0.5714117288589478, + "learning_rate": 4.92123564971567e-05, + "loss": 1.3031, + "num_input_tokens_seen": 15237120, + "step": 1860 + }, + { + "epoch": 0.24163328595425765, + "grad_norm": 0.417524516582489, + "learning_rate": 4.9203909803004245e-05, + "loss": 1.0151, + "num_input_tokens_seen": 15319040, + "step": 1870 + }, + { + "epoch": 0.2429254425636387, + "grad_norm": 0.43594273924827576, + "learning_rate": 4.9195418791581504e-05, + "loss": 1.0122, + "num_input_tokens_seen": 15400960, + "step": 1880 + }, + { + "epoch": 0.24421759917301977, + "grad_norm": 0.5096331238746643, + "learning_rate": 4.918688347843549e-05, + "loss": 1.3312, + "num_input_tokens_seen": 15482880, + "step": 1890 + }, + { + "epoch": 0.24550975578240083, + "grad_norm": 0.4702720046043396, + "learning_rate": 4.917830387919434e-05, + "loss": 1.4312, + "num_input_tokens_seen": 15564800, + "step": 1900 + }, + { + "epoch": 0.2468019123917819, + "grad_norm": 0.49178382754325867, + "learning_rate": 4.9169680009567254e-05, + "loss": 0.8297, + "num_input_tokens_seen": 15646720, + "step": 1910 + }, + { + "epoch": 0.24809406900116293, + "grad_norm": 0.4930388033390045, + "learning_rate": 4.916101188534452e-05, + "loss": 0.9553, + "num_input_tokens_seen": 15728640, + "step": 1920 + }, + { + "epoch": 0.249386225610544, + "grad_norm": 0.44128185510635376, + "learning_rate": 4.9152299522397424e-05, + "loss": 1.2107, + "num_input_tokens_seen": 15810560, + "step": 1930 + }, + { + "epoch": 0.25067838221992506, + "grad_norm": 0.45994389057159424, + "learning_rate": 4.91435429366783e-05, + "loss": 1.3879, + "num_input_tokens_seen": 15892480, + "step": 1940 + }, + { + "epoch": 0.2519705388293061, + "grad_norm": 0.46608877182006836, + "learning_rate": 4.9134742144220394e-05, + "loss": 1.2823, + "num_input_tokens_seen": 15974400, + "step": 1950 + }, + { + "epoch": 0.2532626954386872, + "grad_norm": 0.45161423087120056, + "learning_rate": 4.912589716113794e-05, + "loss": 0.9816, + "num_input_tokens_seen": 16056320, + "step": 1960 + }, + { + "epoch": 0.2545548520480682, + "grad_norm": 0.5524086952209473, + "learning_rate": 4.9117008003626066e-05, + "loss": 0.8746, + "num_input_tokens_seen": 16138240, + "step": 1970 + }, + { + "epoch": 0.2558470086574493, + "grad_norm": 0.5476992726325989, + "learning_rate": 4.910807468796079e-05, + "loss": 0.8788, + "num_input_tokens_seen": 16220160, + "step": 1980 + }, + { + "epoch": 0.25713916526683034, + "grad_norm": 0.5770102739334106, + "learning_rate": 4.9099097230498974e-05, + "loss": 0.9458, + "num_input_tokens_seen": 16302080, + "step": 1990 + }, + { + "epoch": 0.2584313218762114, + "grad_norm": 0.4459473490715027, + "learning_rate": 4.909007564767831e-05, + "loss": 0.7193, + "num_input_tokens_seen": 16384000, + "step": 2000 + }, + { + "epoch": 0.25972347848559246, + "grad_norm": 0.43885576725006104, + "learning_rate": 4.90810099560173e-05, + "loss": 1.1239, + "num_input_tokens_seen": 16465920, + "step": 2010 + }, + { + "epoch": 0.2610156350949735, + "grad_norm": 0.42829430103302, + "learning_rate": 4.907190017211517e-05, + "loss": 1.2067, + "num_input_tokens_seen": 16547840, + "step": 2020 + }, + { + "epoch": 0.2623077917043546, + "grad_norm": 0.47339022159576416, + "learning_rate": 4.906274631265191e-05, + "loss": 1.0478, + "num_input_tokens_seen": 16629760, + "step": 2030 + }, + { + "epoch": 0.2635999483137356, + "grad_norm": 0.4274667799472809, + "learning_rate": 4.90535483943882e-05, + "loss": 1.0864, + "num_input_tokens_seen": 16711680, + "step": 2040 + }, + { + "epoch": 0.26489210492311666, + "grad_norm": 0.508808434009552, + "learning_rate": 4.904430643416541e-05, + "loss": 0.7331, + "num_input_tokens_seen": 16793600, + "step": 2050 + }, + { + "epoch": 0.26618426153249775, + "grad_norm": 0.39238959550857544, + "learning_rate": 4.903502044890551e-05, + "loss": 0.8874, + "num_input_tokens_seen": 16875520, + "step": 2060 + }, + { + "epoch": 0.2674764181418788, + "grad_norm": 0.5183860659599304, + "learning_rate": 4.902569045561113e-05, + "loss": 1.1042, + "num_input_tokens_seen": 16957440, + "step": 2070 + }, + { + "epoch": 0.26876857475125987, + "grad_norm": 0.45784205198287964, + "learning_rate": 4.901631647136543e-05, + "loss": 1.0121, + "num_input_tokens_seen": 17039360, + "step": 2080 + }, + { + "epoch": 0.2700607313606409, + "grad_norm": 0.43935465812683105, + "learning_rate": 4.900689851333216e-05, + "loss": 1.027, + "num_input_tokens_seen": 17121280, + "step": 2090 + }, + { + "epoch": 0.271352887970022, + "grad_norm": 0.3319588303565979, + "learning_rate": 4.899743659875556e-05, + "loss": 1.1295, + "num_input_tokens_seen": 17203200, + "step": 2100 + }, + { + "epoch": 0.27264504457940303, + "grad_norm": 0.5109902024269104, + "learning_rate": 4.8987930744960355e-05, + "loss": 1.3633, + "num_input_tokens_seen": 17285120, + "step": 2110 + }, + { + "epoch": 0.27393720118878406, + "grad_norm": 0.5351001620292664, + "learning_rate": 4.897838096935174e-05, + "loss": 1.1889, + "num_input_tokens_seen": 17367040, + "step": 2120 + }, + { + "epoch": 0.27522935779816515, + "grad_norm": 0.4509984850883484, + "learning_rate": 4.896878728941531e-05, + "loss": 0.927, + "num_input_tokens_seen": 17448960, + "step": 2130 + }, + { + "epoch": 0.2765215144075462, + "grad_norm": 0.7170317769050598, + "learning_rate": 4.8959149722717057e-05, + "loss": 0.7892, + "num_input_tokens_seen": 17530880, + "step": 2140 + }, + { + "epoch": 0.2778136710169273, + "grad_norm": 0.5630227327346802, + "learning_rate": 4.894946828690334e-05, + "loss": 1.0423, + "num_input_tokens_seen": 17612800, + "step": 2150 + }, + { + "epoch": 0.2791058276263083, + "grad_norm": 0.6323631405830383, + "learning_rate": 4.893974299970082e-05, + "loss": 1.2965, + "num_input_tokens_seen": 17694720, + "step": 2160 + }, + { + "epoch": 0.28039798423568935, + "grad_norm": 0.5983838438987732, + "learning_rate": 4.892997387891648e-05, + "loss": 0.9905, + "num_input_tokens_seen": 17776640, + "step": 2170 + }, + { + "epoch": 0.28169014084507044, + "grad_norm": 0.4904590845108032, + "learning_rate": 4.892016094243753e-05, + "loss": 1.1746, + "num_input_tokens_seen": 17858560, + "step": 2180 + }, + { + "epoch": 0.28298229745445147, + "grad_norm": 0.39486417174339294, + "learning_rate": 4.891030420823142e-05, + "loss": 0.818, + "num_input_tokens_seen": 17940480, + "step": 2190 + }, + { + "epoch": 0.28427445406383256, + "grad_norm": 0.48404547572135925, + "learning_rate": 4.89004036943458e-05, + "loss": 0.997, + "num_input_tokens_seen": 18022400, + "step": 2200 + }, + { + "epoch": 0.2855666106732136, + "grad_norm": 0.5132070183753967, + "learning_rate": 4.8890459418908476e-05, + "loss": 1.3072, + "num_input_tokens_seen": 18104320, + "step": 2210 + }, + { + "epoch": 0.28685876728259463, + "grad_norm": 0.5053073167800903, + "learning_rate": 4.888047140012737e-05, + "loss": 0.8763, + "num_input_tokens_seen": 18186240, + "step": 2220 + }, + { + "epoch": 0.2881509238919757, + "grad_norm": 0.4907422363758087, + "learning_rate": 4.8870439656290525e-05, + "loss": 0.8944, + "num_input_tokens_seen": 18268160, + "step": 2230 + }, + { + "epoch": 0.28944308050135675, + "grad_norm": 0.4988420307636261, + "learning_rate": 4.8860364205766006e-05, + "loss": 0.8613, + "num_input_tokens_seen": 18350080, + "step": 2240 + }, + { + "epoch": 0.29073523711073784, + "grad_norm": 0.6583264470100403, + "learning_rate": 4.885024506700195e-05, + "loss": 0.9404, + "num_input_tokens_seen": 18432000, + "step": 2250 + }, + { + "epoch": 0.2920273937201189, + "grad_norm": 0.4961051344871521, + "learning_rate": 4.884008225852644e-05, + "loss": 1.1465, + "num_input_tokens_seen": 18513920, + "step": 2260 + }, + { + "epoch": 0.2933195503294999, + "grad_norm": 0.62845778465271, + "learning_rate": 4.8829875798947554e-05, + "loss": 1.3089, + "num_input_tokens_seen": 18595840, + "step": 2270 + }, + { + "epoch": 0.294611706938881, + "grad_norm": 0.557864248752594, + "learning_rate": 4.8819625706953286e-05, + "loss": 1.0963, + "num_input_tokens_seen": 18677760, + "step": 2280 + }, + { + "epoch": 0.29590386354826204, + "grad_norm": 0.4075135886669159, + "learning_rate": 4.88093320013115e-05, + "loss": 0.6548, + "num_input_tokens_seen": 18759680, + "step": 2290 + }, + { + "epoch": 0.2971960201576431, + "grad_norm": 0.4397824704647064, + "learning_rate": 4.879899470086995e-05, + "loss": 0.7479, + "num_input_tokens_seen": 18841600, + "step": 2300 + }, + { + "epoch": 0.29848817676702416, + "grad_norm": 0.2952675521373749, + "learning_rate": 4.8788613824556194e-05, + "loss": 1.0112, + "num_input_tokens_seen": 18923520, + "step": 2310 + }, + { + "epoch": 0.2997803333764052, + "grad_norm": 0.46947795152664185, + "learning_rate": 4.8778189391377574e-05, + "loss": 1.3439, + "num_input_tokens_seen": 19005440, + "step": 2320 + }, + { + "epoch": 0.3010724899857863, + "grad_norm": 0.5178149342536926, + "learning_rate": 4.876772142042117e-05, + "loss": 1.0994, + "num_input_tokens_seen": 19087360, + "step": 2330 + }, + { + "epoch": 0.3023646465951673, + "grad_norm": 0.45673584938049316, + "learning_rate": 4.875720993085384e-05, + "loss": 1.2049, + "num_input_tokens_seen": 19169280, + "step": 2340 + }, + { + "epoch": 0.3036568032045484, + "grad_norm": 1.184935212135315, + "learning_rate": 4.874665494192206e-05, + "loss": 0.6961, + "num_input_tokens_seen": 19251200, + "step": 2350 + }, + { + "epoch": 0.30494895981392944, + "grad_norm": 0.4711746275424957, + "learning_rate": 4.8736056472951955e-05, + "loss": 0.8927, + "num_input_tokens_seen": 19333120, + "step": 2360 + }, + { + "epoch": 0.3062411164233105, + "grad_norm": 0.34465235471725464, + "learning_rate": 4.8725414543349326e-05, + "loss": 0.9728, + "num_input_tokens_seen": 19415040, + "step": 2370 + }, + { + "epoch": 0.30753327303269157, + "grad_norm": 0.7332857847213745, + "learning_rate": 4.871472917259947e-05, + "loss": 1.3036, + "num_input_tokens_seen": 19496960, + "step": 2380 + }, + { + "epoch": 0.3088254296420726, + "grad_norm": 0.4272123873233795, + "learning_rate": 4.870400038026728e-05, + "loss": 0.9335, + "num_input_tokens_seen": 19578880, + "step": 2390 + }, + { + "epoch": 0.3101175862514537, + "grad_norm": 0.47827640175819397, + "learning_rate": 4.869322818599714e-05, + "loss": 1.1586, + "num_input_tokens_seen": 19660800, + "step": 2400 + }, + { + "epoch": 0.3114097428608347, + "grad_norm": 0.5260847806930542, + "learning_rate": 4.868241260951289e-05, + "loss": 1.2544, + "num_input_tokens_seen": 19742720, + "step": 2410 + }, + { + "epoch": 0.3127018994702158, + "grad_norm": 0.5642371773719788, + "learning_rate": 4.867155367061781e-05, + "loss": 1.1045, + "num_input_tokens_seen": 19824640, + "step": 2420 + }, + { + "epoch": 0.31399405607959685, + "grad_norm": 0.2645663321018219, + "learning_rate": 4.8660651389194576e-05, + "loss": 0.5473, + "num_input_tokens_seen": 19906560, + "step": 2430 + }, + { + "epoch": 0.3152862126889779, + "grad_norm": 0.47711700201034546, + "learning_rate": 4.8649705785205224e-05, + "loss": 1.2962, + "num_input_tokens_seen": 19988480, + "step": 2440 + }, + { + "epoch": 0.316578369298359, + "grad_norm": 0.46864113211631775, + "learning_rate": 4.8638716878691125e-05, + "loss": 0.8808, + "num_input_tokens_seen": 20070400, + "step": 2450 + }, + { + "epoch": 0.31787052590774, + "grad_norm": 0.45676669478416443, + "learning_rate": 4.862768468977293e-05, + "loss": 1.2679, + "num_input_tokens_seen": 20152320, + "step": 2460 + }, + { + "epoch": 0.3191626825171211, + "grad_norm": 1.7806425094604492, + "learning_rate": 4.861660923865052e-05, + "loss": 0.9461, + "num_input_tokens_seen": 20234240, + "step": 2470 + }, + { + "epoch": 0.32045483912650213, + "grad_norm": 0.491797536611557, + "learning_rate": 4.860549054560301e-05, + "loss": 0.7236, + "num_input_tokens_seen": 20316160, + "step": 2480 + }, + { + "epoch": 0.32174699573588317, + "grad_norm": 0.46949175000190735, + "learning_rate": 4.8594328630988696e-05, + "loss": 1.2098, + "num_input_tokens_seen": 20398080, + "step": 2490 + }, + { + "epoch": 0.32303915234526426, + "grad_norm": 0.4671134948730469, + "learning_rate": 4.858312351524499e-05, + "loss": 1.2859, + "num_input_tokens_seen": 20480000, + "step": 2500 + }, + { + "epoch": 0.3243313089546453, + "grad_norm": 0.6649695634841919, + "learning_rate": 4.857187521888843e-05, + "loss": 0.9768, + "num_input_tokens_seen": 20561920, + "step": 2510 + }, + { + "epoch": 0.3256234655640264, + "grad_norm": 0.42521458864212036, + "learning_rate": 4.8560583762514594e-05, + "loss": 0.8317, + "num_input_tokens_seen": 20643840, + "step": 2520 + }, + { + "epoch": 0.3269156221734074, + "grad_norm": 0.6050983667373657, + "learning_rate": 4.854924916679811e-05, + "loss": 0.6581, + "num_input_tokens_seen": 20725760, + "step": 2530 + }, + { + "epoch": 0.32820777878278845, + "grad_norm": 0.6202778220176697, + "learning_rate": 4.8537871452492565e-05, + "loss": 1.248, + "num_input_tokens_seen": 20807680, + "step": 2540 + }, + { + "epoch": 0.32949993539216954, + "grad_norm": 0.4647495746612549, + "learning_rate": 4.852645064043053e-05, + "loss": 0.9416, + "num_input_tokens_seen": 20889600, + "step": 2550 + }, + { + "epoch": 0.3307920920015506, + "grad_norm": 0.5066781044006348, + "learning_rate": 4.851498675152346e-05, + "loss": 1.0115, + "num_input_tokens_seen": 20971520, + "step": 2560 + }, + { + "epoch": 0.33208424861093166, + "grad_norm": 0.252456933259964, + "learning_rate": 4.8503479806761684e-05, + "loss": 0.8474, + "num_input_tokens_seen": 21053440, + "step": 2570 + }, + { + "epoch": 0.3333764052203127, + "grad_norm": 0.4478939175605774, + "learning_rate": 4.84919298272144e-05, + "loss": 1.0834, + "num_input_tokens_seen": 21135360, + "step": 2580 + }, + { + "epoch": 0.33466856182969373, + "grad_norm": 0.6947191953659058, + "learning_rate": 4.848033683402956e-05, + "loss": 1.3049, + "num_input_tokens_seen": 21217280, + "step": 2590 + }, + { + "epoch": 0.3359607184390748, + "grad_norm": 0.7486204504966736, + "learning_rate": 4.84687008484339e-05, + "loss": 1.0012, + "num_input_tokens_seen": 21299200, + "step": 2600 + }, + { + "epoch": 0.33725287504845586, + "grad_norm": 0.2647557854652405, + "learning_rate": 4.8457021891732866e-05, + "loss": 0.6511, + "num_input_tokens_seen": 21381120, + "step": 2610 + }, + { + "epoch": 0.33854503165783695, + "grad_norm": 0.7009825110435486, + "learning_rate": 4.844529998531058e-05, + "loss": 0.715, + "num_input_tokens_seen": 21463040, + "step": 2620 + }, + { + "epoch": 0.339837188267218, + "grad_norm": 0.5106743574142456, + "learning_rate": 4.843353515062982e-05, + "loss": 0.9274, + "num_input_tokens_seen": 21544960, + "step": 2630 + }, + { + "epoch": 0.341129344876599, + "grad_norm": 0.26822468638420105, + "learning_rate": 4.842172740923194e-05, + "loss": 0.6468, + "num_input_tokens_seen": 21626880, + "step": 2640 + }, + { + "epoch": 0.3424215014859801, + "grad_norm": 0.7012847065925598, + "learning_rate": 4.840987678273688e-05, + "loss": 0.9614, + "num_input_tokens_seen": 21708800, + "step": 2650 + }, + { + "epoch": 0.34371365809536114, + "grad_norm": 0.4316296875476837, + "learning_rate": 4.8397983292843095e-05, + "loss": 0.8697, + "num_input_tokens_seen": 21790720, + "step": 2660 + }, + { + "epoch": 0.34500581470474223, + "grad_norm": 0.3868614137172699, + "learning_rate": 4.838604696132753e-05, + "loss": 0.7511, + "num_input_tokens_seen": 21872640, + "step": 2670 + }, + { + "epoch": 0.34629797131412327, + "grad_norm": 0.3558228313922882, + "learning_rate": 4.837406781004554e-05, + "loss": 1.081, + "num_input_tokens_seen": 21954560, + "step": 2680 + }, + { + "epoch": 0.34759012792350436, + "grad_norm": 0.47105276584625244, + "learning_rate": 4.836204586093092e-05, + "loss": 0.9772, + "num_input_tokens_seen": 22036480, + "step": 2690 + }, + { + "epoch": 0.3488822845328854, + "grad_norm": 0.3401098847389221, + "learning_rate": 4.8349981135995826e-05, + "loss": 0.7102, + "num_input_tokens_seen": 22118400, + "step": 2700 + }, + { + "epoch": 0.3501744411422664, + "grad_norm": 0.4399671256542206, + "learning_rate": 4.833787365733071e-05, + "loss": 0.8293, + "num_input_tokens_seen": 22200320, + "step": 2710 + }, + { + "epoch": 0.3514665977516475, + "grad_norm": 0.27914759516716003, + "learning_rate": 4.832572344710433e-05, + "loss": 0.6742, + "num_input_tokens_seen": 22282240, + "step": 2720 + }, + { + "epoch": 0.35275875436102855, + "grad_norm": 0.6157001256942749, + "learning_rate": 4.831353052756367e-05, + "loss": 1.1822, + "num_input_tokens_seen": 22364160, + "step": 2730 + }, + { + "epoch": 0.35405091097040964, + "grad_norm": 0.45960766077041626, + "learning_rate": 4.830129492103392e-05, + "loss": 1.0566, + "num_input_tokens_seen": 22446080, + "step": 2740 + }, + { + "epoch": 0.3553430675797907, + "grad_norm": 0.35881564021110535, + "learning_rate": 4.828901664991845e-05, + "loss": 0.8509, + "num_input_tokens_seen": 22528000, + "step": 2750 + }, + { + "epoch": 0.3566352241891717, + "grad_norm": 0.5361536145210266, + "learning_rate": 4.8276695736698704e-05, + "loss": 0.9067, + "num_input_tokens_seen": 22609920, + "step": 2760 + }, + { + "epoch": 0.3579273807985528, + "grad_norm": 0.39405712485313416, + "learning_rate": 4.826433220393424e-05, + "loss": 0.7381, + "num_input_tokens_seen": 22691840, + "step": 2770 + }, + { + "epoch": 0.35921953740793383, + "grad_norm": 0.42842912673950195, + "learning_rate": 4.825192607426264e-05, + "loss": 0.9375, + "num_input_tokens_seen": 22773760, + "step": 2780 + }, + { + "epoch": 0.3605116940173149, + "grad_norm": 0.5455546975135803, + "learning_rate": 4.823947737039948e-05, + "loss": 1.1993, + "num_input_tokens_seen": 22855680, + "step": 2790 + }, + { + "epoch": 0.36180385062669596, + "grad_norm": 0.6831130981445312, + "learning_rate": 4.82269861151383e-05, + "loss": 0.7907, + "num_input_tokens_seen": 22937600, + "step": 2800 + }, + { + "epoch": 0.363096007236077, + "grad_norm": 0.5363448858261108, + "learning_rate": 4.821445233135053e-05, + "loss": 1.0263, + "num_input_tokens_seen": 23019520, + "step": 2810 + }, + { + "epoch": 0.3643881638454581, + "grad_norm": 0.48520973324775696, + "learning_rate": 4.8201876041985496e-05, + "loss": 0.8387, + "num_input_tokens_seen": 23101440, + "step": 2820 + }, + { + "epoch": 0.3656803204548391, + "grad_norm": 0.48314452171325684, + "learning_rate": 4.8189257270070335e-05, + "loss": 1.1519, + "num_input_tokens_seen": 23183360, + "step": 2830 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 0.8397269248962402, + "learning_rate": 4.817659603870995e-05, + "loss": 0.8801, + "num_input_tokens_seen": 23265280, + "step": 2840 + }, + { + "epoch": 0.36826463367360124, + "grad_norm": 0.5130114555358887, + "learning_rate": 4.8163892371087045e-05, + "loss": 0.8017, + "num_input_tokens_seen": 23347200, + "step": 2850 + }, + { + "epoch": 0.3695567902829823, + "grad_norm": 0.653531551361084, + "learning_rate": 4.815114629046196e-05, + "loss": 0.9614, + "num_input_tokens_seen": 23429120, + "step": 2860 + }, + { + "epoch": 0.37084894689236336, + "grad_norm": 0.39914658665657043, + "learning_rate": 4.813835782017274e-05, + "loss": 0.93, + "num_input_tokens_seen": 23511040, + "step": 2870 + }, + { + "epoch": 0.3721411035017444, + "grad_norm": 0.6135099530220032, + "learning_rate": 4.812552698363502e-05, + "loss": 0.9083, + "num_input_tokens_seen": 23592960, + "step": 2880 + }, + { + "epoch": 0.3734332601111255, + "grad_norm": 0.3109150826931, + "learning_rate": 4.8112653804342015e-05, + "loss": 0.486, + "num_input_tokens_seen": 23674880, + "step": 2890 + }, + { + "epoch": 0.3747254167205065, + "grad_norm": 0.4596582353115082, + "learning_rate": 4.809973830586446e-05, + "loss": 1.176, + "num_input_tokens_seen": 23756800, + "step": 2900 + }, + { + "epoch": 0.37601757332988756, + "grad_norm": 0.4620276391506195, + "learning_rate": 4.8086780511850606e-05, + "loss": 1.007, + "num_input_tokens_seen": 23838720, + "step": 2910 + }, + { + "epoch": 0.37730972993926865, + "grad_norm": 0.5108826160430908, + "learning_rate": 4.807378044602611e-05, + "loss": 1.0682, + "num_input_tokens_seen": 23920640, + "step": 2920 + }, + { + "epoch": 0.3786018865486497, + "grad_norm": 0.48416051268577576, + "learning_rate": 4.806073813219404e-05, + "loss": 0.9145, + "num_input_tokens_seen": 24002560, + "step": 2930 + }, + { + "epoch": 0.37989404315803077, + "grad_norm": 0.4877118766307831, + "learning_rate": 4.8047653594234855e-05, + "loss": 0.6768, + "num_input_tokens_seen": 24084480, + "step": 2940 + }, + { + "epoch": 0.3811861997674118, + "grad_norm": 0.7045567631721497, + "learning_rate": 4.803452685610626e-05, + "loss": 1.3005, + "num_input_tokens_seen": 24166400, + "step": 2950 + }, + { + "epoch": 0.3824783563767929, + "grad_norm": 0.6394911408424377, + "learning_rate": 4.802135794184329e-05, + "loss": 0.9298, + "num_input_tokens_seen": 24248320, + "step": 2960 + }, + { + "epoch": 0.38377051298617393, + "grad_norm": 0.5416907072067261, + "learning_rate": 4.800814687555817e-05, + "loss": 0.7336, + "num_input_tokens_seen": 24330240, + "step": 2970 + }, + { + "epoch": 0.38506266959555496, + "grad_norm": 0.47221288084983826, + "learning_rate": 4.799489368144031e-05, + "loss": 1.221, + "num_input_tokens_seen": 24412160, + "step": 2980 + }, + { + "epoch": 0.38635482620493605, + "grad_norm": 0.49521034955978394, + "learning_rate": 4.798159838375626e-05, + "loss": 0.9109, + "num_input_tokens_seen": 24494080, + "step": 2990 + }, + { + "epoch": 0.3876469828143171, + "grad_norm": 0.4463275671005249, + "learning_rate": 4.796826100684967e-05, + "loss": 0.6901, + "num_input_tokens_seen": 24576000, + "step": 3000 + }, + { + "epoch": 0.3889391394236982, + "grad_norm": 0.46501803398132324, + "learning_rate": 4.795488157514122e-05, + "loss": 1.0645, + "num_input_tokens_seen": 24657920, + "step": 3010 + }, + { + "epoch": 0.3902312960330792, + "grad_norm": 0.40190380811691284, + "learning_rate": 4.794146011312861e-05, + "loss": 1.0953, + "num_input_tokens_seen": 24739840, + "step": 3020 + }, + { + "epoch": 0.39152345264246025, + "grad_norm": 0.5038986206054688, + "learning_rate": 4.7927996645386476e-05, + "loss": 1.2322, + "num_input_tokens_seen": 24821760, + "step": 3030 + }, + { + "epoch": 0.39281560925184134, + "grad_norm": 0.4735121428966522, + "learning_rate": 4.791449119656638e-05, + "loss": 0.7708, + "num_input_tokens_seen": 24903680, + "step": 3040 + }, + { + "epoch": 0.39410776586122237, + "grad_norm": 0.7169963121414185, + "learning_rate": 4.790094379139676e-05, + "loss": 0.8159, + "num_input_tokens_seen": 24985600, + "step": 3050 + }, + { + "epoch": 0.39539992247060346, + "grad_norm": 0.9569246172904968, + "learning_rate": 4.7887354454682854e-05, + "loss": 0.7697, + "num_input_tokens_seen": 25067520, + "step": 3060 + }, + { + "epoch": 0.3966920790799845, + "grad_norm": 0.5772688388824463, + "learning_rate": 4.78737232113067e-05, + "loss": 1.0574, + "num_input_tokens_seen": 25149440, + "step": 3070 + }, + { + "epoch": 0.39798423568936553, + "grad_norm": 0.4956931173801422, + "learning_rate": 4.7860050086227035e-05, + "loss": 1.0678, + "num_input_tokens_seen": 25231360, + "step": 3080 + }, + { + "epoch": 0.3992763922987466, + "grad_norm": 0.39392775297164917, + "learning_rate": 4.784633510447932e-05, + "loss": 0.9143, + "num_input_tokens_seen": 25313280, + "step": 3090 + }, + { + "epoch": 0.40056854890812765, + "grad_norm": 0.40444672107696533, + "learning_rate": 4.7832578291175626e-05, + "loss": 0.8812, + "num_input_tokens_seen": 25395200, + "step": 3100 + }, + { + "epoch": 0.40186070551750874, + "grad_norm": 0.7210461497306824, + "learning_rate": 4.781877967150463e-05, + "loss": 0.6671, + "num_input_tokens_seen": 25477120, + "step": 3110 + }, + { + "epoch": 0.4031528621268898, + "grad_norm": 0.6195704340934753, + "learning_rate": 4.7804939270731564e-05, + "loss": 1.0019, + "num_input_tokens_seen": 25559040, + "step": 3120 + }, + { + "epoch": 0.4044450187362708, + "grad_norm": 0.42882412672042847, + "learning_rate": 4.7791057114198133e-05, + "loss": 0.799, + "num_input_tokens_seen": 25640960, + "step": 3130 + }, + { + "epoch": 0.4057371753456519, + "grad_norm": 0.3896157741546631, + "learning_rate": 4.7777133227322525e-05, + "loss": 1.0606, + "num_input_tokens_seen": 25722880, + "step": 3140 + }, + { + "epoch": 0.40702933195503294, + "grad_norm": 0.3924260139465332, + "learning_rate": 4.776316763559933e-05, + "loss": 0.8224, + "num_input_tokens_seen": 25804800, + "step": 3150 + }, + { + "epoch": 0.408321488564414, + "grad_norm": 0.44281336665153503, + "learning_rate": 4.774916036459949e-05, + "loss": 0.8995, + "num_input_tokens_seen": 25886720, + "step": 3160 + }, + { + "epoch": 0.40961364517379506, + "grad_norm": 0.5028753280639648, + "learning_rate": 4.773511143997026e-05, + "loss": 1.1617, + "num_input_tokens_seen": 25968640, + "step": 3170 + }, + { + "epoch": 0.4109058017831761, + "grad_norm": 0.6678476929664612, + "learning_rate": 4.7721020887435186e-05, + "loss": 0.8537, + "num_input_tokens_seen": 26050560, + "step": 3180 + }, + { + "epoch": 0.4121979583925572, + "grad_norm": 0.6059293746948242, + "learning_rate": 4.7706888732793996e-05, + "loss": 1.5184, + "num_input_tokens_seen": 26132480, + "step": 3190 + }, + { + "epoch": 0.4134901150019382, + "grad_norm": 0.46364808082580566, + "learning_rate": 4.769271500192264e-05, + "loss": 1.1, + "num_input_tokens_seen": 26214400, + "step": 3200 + }, + { + "epoch": 0.4147822716113193, + "grad_norm": 0.6045968532562256, + "learning_rate": 4.767849972077315e-05, + "loss": 1.0147, + "num_input_tokens_seen": 26296320, + "step": 3210 + }, + { + "epoch": 0.41607442822070034, + "grad_norm": 0.6123097538948059, + "learning_rate": 4.766424291537366e-05, + "loss": 1.0684, + "num_input_tokens_seen": 26378240, + "step": 3220 + }, + { + "epoch": 0.41736658483008143, + "grad_norm": 0.3833664059638977, + "learning_rate": 4.7649944611828316e-05, + "loss": 0.7629, + "num_input_tokens_seen": 26460160, + "step": 3230 + }, + { + "epoch": 0.41865874143946247, + "grad_norm": 0.5847949981689453, + "learning_rate": 4.763560483631728e-05, + "loss": 0.8127, + "num_input_tokens_seen": 26542080, + "step": 3240 + }, + { + "epoch": 0.4199508980488435, + "grad_norm": 0.5139411687850952, + "learning_rate": 4.762122361509662e-05, + "loss": 1.0787, + "num_input_tokens_seen": 26624000, + "step": 3250 + }, + { + "epoch": 0.4212430546582246, + "grad_norm": 0.43457046151161194, + "learning_rate": 4.7606800974498287e-05, + "loss": 1.089, + "num_input_tokens_seen": 26705920, + "step": 3260 + }, + { + "epoch": 0.4225352112676056, + "grad_norm": 0.5299356579780579, + "learning_rate": 4.75923369409301e-05, + "loss": 0.7698, + "num_input_tokens_seen": 26787840, + "step": 3270 + }, + { + "epoch": 0.4238273678769867, + "grad_norm": 0.5959427952766418, + "learning_rate": 4.757783154087564e-05, + "loss": 1.2242, + "num_input_tokens_seen": 26869760, + "step": 3280 + }, + { + "epoch": 0.42511952448636775, + "grad_norm": 0.5723779201507568, + "learning_rate": 4.756328480089425e-05, + "loss": 1.2536, + "num_input_tokens_seen": 26951680, + "step": 3290 + }, + { + "epoch": 0.4264116810957488, + "grad_norm": 0.4997044503688812, + "learning_rate": 4.7548696747620956e-05, + "loss": 1.0486, + "num_input_tokens_seen": 27033600, + "step": 3300 + }, + { + "epoch": 0.4277038377051299, + "grad_norm": 0.4314401149749756, + "learning_rate": 4.753406740776643e-05, + "loss": 1.1756, + "num_input_tokens_seen": 27115520, + "step": 3310 + }, + { + "epoch": 0.4289959943145109, + "grad_norm": 0.5475839376449585, + "learning_rate": 4.7519396808116933e-05, + "loss": 1.2511, + "num_input_tokens_seen": 27197440, + "step": 3320 + }, + { + "epoch": 0.430288150923892, + "grad_norm": 0.4945213794708252, + "learning_rate": 4.750468497553429e-05, + "loss": 0.8987, + "num_input_tokens_seen": 27279360, + "step": 3330 + }, + { + "epoch": 0.43158030753327303, + "grad_norm": 0.4221036434173584, + "learning_rate": 4.74899319369558e-05, + "loss": 1.0922, + "num_input_tokens_seen": 27361280, + "step": 3340 + }, + { + "epoch": 0.43287246414265407, + "grad_norm": 0.40295591950416565, + "learning_rate": 4.7475137719394234e-05, + "loss": 1.028, + "num_input_tokens_seen": 27443200, + "step": 3350 + }, + { + "epoch": 0.43416462075203516, + "grad_norm": 0.42078787088394165, + "learning_rate": 4.746030234993775e-05, + "loss": 0.7594, + "num_input_tokens_seen": 27525120, + "step": 3360 + }, + { + "epoch": 0.4354567773614162, + "grad_norm": 0.5644673109054565, + "learning_rate": 4.7445425855749844e-05, + "loss": 1.21, + "num_input_tokens_seen": 27607040, + "step": 3370 + }, + { + "epoch": 0.4367489339707973, + "grad_norm": 0.9208412766456604, + "learning_rate": 4.743050826406934e-05, + "loss": 0.8709, + "num_input_tokens_seen": 27688960, + "step": 3380 + }, + { + "epoch": 0.4380410905801783, + "grad_norm": 0.43149518966674805, + "learning_rate": 4.741554960221027e-05, + "loss": 0.8737, + "num_input_tokens_seen": 27770880, + "step": 3390 + }, + { + "epoch": 0.43933324718955935, + "grad_norm": 0.45487073063850403, + "learning_rate": 4.7400549897561914e-05, + "loss": 1.2981, + "num_input_tokens_seen": 27852800, + "step": 3400 + }, + { + "epoch": 0.44062540379894044, + "grad_norm": 0.587581217288971, + "learning_rate": 4.7385509177588664e-05, + "loss": 0.7498, + "num_input_tokens_seen": 27934720, + "step": 3410 + }, + { + "epoch": 0.4419175604083215, + "grad_norm": 15.474299430847168, + "learning_rate": 4.7370427469830016e-05, + "loss": 1.5585, + "num_input_tokens_seen": 28016640, + "step": 3420 + }, + { + "epoch": 0.44320971701770256, + "grad_norm": 0.41457799077033997, + "learning_rate": 4.735530480190053e-05, + "loss": 1.2198, + "num_input_tokens_seen": 28098560, + "step": 3430 + }, + { + "epoch": 0.4445018736270836, + "grad_norm": 0.41844090819358826, + "learning_rate": 4.734014120148976e-05, + "loss": 0.7208, + "num_input_tokens_seen": 28180480, + "step": 3440 + }, + { + "epoch": 0.44579403023646463, + "grad_norm": 0.3628556728363037, + "learning_rate": 4.73249366963622e-05, + "loss": 0.8981, + "num_input_tokens_seen": 28262400, + "step": 3450 + }, + { + "epoch": 0.4470861868458457, + "grad_norm": 0.6861358284950256, + "learning_rate": 4.730969131435724e-05, + "loss": 0.9347, + "num_input_tokens_seen": 28344320, + "step": 3460 + }, + { + "epoch": 0.44837834345522676, + "grad_norm": 0.48623812198638916, + "learning_rate": 4.729440508338911e-05, + "loss": 1.1323, + "num_input_tokens_seen": 28426240, + "step": 3470 + }, + { + "epoch": 0.44967050006460785, + "grad_norm": 0.41574108600616455, + "learning_rate": 4.727907803144686e-05, + "loss": 1.1095, + "num_input_tokens_seen": 28508160, + "step": 3480 + }, + { + "epoch": 0.4509626566739889, + "grad_norm": 0.5009231567382812, + "learning_rate": 4.726371018659427e-05, + "loss": 1.2198, + "num_input_tokens_seen": 28590080, + "step": 3490 + }, + { + "epoch": 0.45225481328336997, + "grad_norm": 0.48607850074768066, + "learning_rate": 4.724830157696979e-05, + "loss": 1.078, + "num_input_tokens_seen": 28672000, + "step": 3500 + }, + { + "epoch": 0.453546969892751, + "grad_norm": 0.8269723653793335, + "learning_rate": 4.723285223078653e-05, + "loss": 0.9332, + "num_input_tokens_seen": 28753920, + "step": 3510 + }, + { + "epoch": 0.45483912650213204, + "grad_norm": 0.5304045677185059, + "learning_rate": 4.721736217633219e-05, + "loss": 1.3173, + "num_input_tokens_seen": 28835840, + "step": 3520 + }, + { + "epoch": 0.45613128311151313, + "grad_norm": 0.4949178695678711, + "learning_rate": 4.7201831441969016e-05, + "loss": 0.9488, + "num_input_tokens_seen": 28917760, + "step": 3530 + }, + { + "epoch": 0.45742343972089416, + "grad_norm": 0.742077112197876, + "learning_rate": 4.71862600561337e-05, + "loss": 0.8988, + "num_input_tokens_seen": 28999680, + "step": 3540 + }, + { + "epoch": 0.45871559633027525, + "grad_norm": 0.6357890963554382, + "learning_rate": 4.7170648047337415e-05, + "loss": 1.401, + "num_input_tokens_seen": 29081600, + "step": 3550 + }, + { + "epoch": 0.4600077529396563, + "grad_norm": 0.512546956539154, + "learning_rate": 4.7154995444165685e-05, + "loss": 0.768, + "num_input_tokens_seen": 29163520, + "step": 3560 + }, + { + "epoch": 0.4612999095490373, + "grad_norm": 0.4622116684913635, + "learning_rate": 4.713930227527836e-05, + "loss": 0.8524, + "num_input_tokens_seen": 29245440, + "step": 3570 + }, + { + "epoch": 0.4625920661584184, + "grad_norm": 0.620832085609436, + "learning_rate": 4.712356856940958e-05, + "loss": 0.8993, + "num_input_tokens_seen": 29327360, + "step": 3580 + }, + { + "epoch": 0.46388422276779945, + "grad_norm": 0.4882233142852783, + "learning_rate": 4.710779435536772e-05, + "loss": 0.7759, + "num_input_tokens_seen": 29409280, + "step": 3590 + }, + { + "epoch": 0.46517637937718054, + "grad_norm": 0.39704984426498413, + "learning_rate": 4.709197966203528e-05, + "loss": 0.7109, + "num_input_tokens_seen": 29491200, + "step": 3600 + }, + { + "epoch": 0.46646853598656157, + "grad_norm": 0.6212629079818726, + "learning_rate": 4.707612451836892e-05, + "loss": 1.2732, + "num_input_tokens_seen": 29573120, + "step": 3610 + }, + { + "epoch": 0.4677606925959426, + "grad_norm": 0.4958352744579315, + "learning_rate": 4.706022895339936e-05, + "loss": 1.0464, + "num_input_tokens_seen": 29655040, + "step": 3620 + }, + { + "epoch": 0.4690528492053237, + "grad_norm": 0.5100958943367004, + "learning_rate": 4.704429299623129e-05, + "loss": 0.8741, + "num_input_tokens_seen": 29736960, + "step": 3630 + }, + { + "epoch": 0.47034500581470473, + "grad_norm": 0.4343372583389282, + "learning_rate": 4.7028316676043425e-05, + "loss": 0.9055, + "num_input_tokens_seen": 29818880, + "step": 3640 + }, + { + "epoch": 0.4716371624240858, + "grad_norm": 0.5606836676597595, + "learning_rate": 4.7012300022088326e-05, + "loss": 0.9934, + "num_input_tokens_seen": 29900800, + "step": 3650 + }, + { + "epoch": 0.47292931903346686, + "grad_norm": 0.5697008371353149, + "learning_rate": 4.6996243063692446e-05, + "loss": 1.0764, + "num_input_tokens_seen": 29982720, + "step": 3660 + }, + { + "epoch": 0.4742214756428479, + "grad_norm": 0.9284478425979614, + "learning_rate": 4.6980145830255993e-05, + "loss": 0.5566, + "num_input_tokens_seen": 30064640, + "step": 3670 + }, + { + "epoch": 0.475513632252229, + "grad_norm": 0.48587319254875183, + "learning_rate": 4.6964008351252964e-05, + "loss": 0.9627, + "num_input_tokens_seen": 30146560, + "step": 3680 + }, + { + "epoch": 0.47680578886161, + "grad_norm": 0.6687888503074646, + "learning_rate": 4.694783065623102e-05, + "loss": 0.8859, + "num_input_tokens_seen": 30228480, + "step": 3690 + }, + { + "epoch": 0.4780979454709911, + "grad_norm": 0.6682938933372498, + "learning_rate": 4.6931612774811445e-05, + "loss": 1.1395, + "num_input_tokens_seen": 30310400, + "step": 3700 + }, + { + "epoch": 0.47939010208037214, + "grad_norm": 0.5708907246589661, + "learning_rate": 4.691535473668914e-05, + "loss": 0.7113, + "num_input_tokens_seen": 30392320, + "step": 3710 + }, + { + "epoch": 0.4806822586897532, + "grad_norm": 0.5752365589141846, + "learning_rate": 4.68990565716325e-05, + "loss": 1.049, + "num_input_tokens_seen": 30474240, + "step": 3720 + }, + { + "epoch": 0.48197441529913426, + "grad_norm": 0.5052396059036255, + "learning_rate": 4.688271830948342e-05, + "loss": 1.1382, + "num_input_tokens_seen": 30556160, + "step": 3730 + }, + { + "epoch": 0.4832665719085153, + "grad_norm": 0.587308406829834, + "learning_rate": 4.686633998015718e-05, + "loss": 0.8307, + "num_input_tokens_seen": 30638080, + "step": 3740 + }, + { + "epoch": 0.4845587285178964, + "grad_norm": 0.6255913972854614, + "learning_rate": 4.6849921613642456e-05, + "loss": 1.0814, + "num_input_tokens_seen": 30720000, + "step": 3750 + }, + { + "epoch": 0.4858508851272774, + "grad_norm": 0.5788549184799194, + "learning_rate": 4.683346324000122e-05, + "loss": 1.0745, + "num_input_tokens_seen": 30801920, + "step": 3760 + }, + { + "epoch": 0.48714304173665846, + "grad_norm": 0.45166078209877014, + "learning_rate": 4.6816964889368674e-05, + "loss": 1.0342, + "num_input_tokens_seen": 30883840, + "step": 3770 + }, + { + "epoch": 0.48843519834603955, + "grad_norm": 0.38736027479171753, + "learning_rate": 4.680042659195325e-05, + "loss": 1.5249, + "num_input_tokens_seen": 30965760, + "step": 3780 + }, + { + "epoch": 0.4897273549554206, + "grad_norm": 0.5221673846244812, + "learning_rate": 4.678384837803651e-05, + "loss": 0.9, + "num_input_tokens_seen": 31047680, + "step": 3790 + }, + { + "epoch": 0.49101951156480167, + "grad_norm": 0.5739164352416992, + "learning_rate": 4.67672302779731e-05, + "loss": 0.835, + "num_input_tokens_seen": 31129600, + "step": 3800 + }, + { + "epoch": 0.4923116681741827, + "grad_norm": 0.6319538950920105, + "learning_rate": 4.6750572322190716e-05, + "loss": 1.2393, + "num_input_tokens_seen": 31211520, + "step": 3810 + }, + { + "epoch": 0.4936038247835638, + "grad_norm": 0.8037749528884888, + "learning_rate": 4.673387454118999e-05, + "loss": 0.5902, + "num_input_tokens_seen": 31293440, + "step": 3820 + }, + { + "epoch": 0.49489598139294483, + "grad_norm": 0.5781430006027222, + "learning_rate": 4.671713696554452e-05, + "loss": 0.8908, + "num_input_tokens_seen": 31375360, + "step": 3830 + }, + { + "epoch": 0.49618813800232586, + "grad_norm": 0.5275561213493347, + "learning_rate": 4.6700359625900724e-05, + "loss": 0.9977, + "num_input_tokens_seen": 31457280, + "step": 3840 + }, + { + "epoch": 0.49748029461170695, + "grad_norm": 0.7814369201660156, + "learning_rate": 4.668354255297785e-05, + "loss": 0.8617, + "num_input_tokens_seen": 31539200, + "step": 3850 + }, + { + "epoch": 0.498772451221088, + "grad_norm": 0.26794329285621643, + "learning_rate": 4.666668577756793e-05, + "loss": 0.7011, + "num_input_tokens_seen": 31621120, + "step": 3860 + }, + { + "epoch": 0.500064607830469, + "grad_norm": 0.4963877201080322, + "learning_rate": 4.664978933053562e-05, + "loss": 0.8713, + "num_input_tokens_seen": 31703040, + "step": 3870 + }, + { + "epoch": 0.5013567644398501, + "grad_norm": 0.748673141002655, + "learning_rate": 4.6632853242818274e-05, + "loss": 1.0985, + "num_input_tokens_seen": 31784960, + "step": 3880 + }, + { + "epoch": 0.5026489210492312, + "grad_norm": 0.2532234489917755, + "learning_rate": 4.66158775454258e-05, + "loss": 0.8102, + "num_input_tokens_seen": 31866880, + "step": 3890 + }, + { + "epoch": 0.5039410776586122, + "grad_norm": 0.4093916416168213, + "learning_rate": 4.659886226944063e-05, + "loss": 1.0378, + "num_input_tokens_seen": 31948800, + "step": 3900 + }, + { + "epoch": 0.5052332342679933, + "grad_norm": 0.656688392162323, + "learning_rate": 4.658180744601769e-05, + "loss": 0.9426, + "num_input_tokens_seen": 32030720, + "step": 3910 + }, + { + "epoch": 0.5065253908773744, + "grad_norm": 0.7212385535240173, + "learning_rate": 4.6564713106384296e-05, + "loss": 1.1089, + "num_input_tokens_seen": 32112640, + "step": 3920 + }, + { + "epoch": 0.5078175474867554, + "grad_norm": 0.5661940574645996, + "learning_rate": 4.65475792818401e-05, + "loss": 1.0339, + "num_input_tokens_seen": 32194560, + "step": 3930 + }, + { + "epoch": 0.5091097040961364, + "grad_norm": 0.5363628268241882, + "learning_rate": 4.653040600375709e-05, + "loss": 1.1407, + "num_input_tokens_seen": 32276480, + "step": 3940 + }, + { + "epoch": 0.5104018607055175, + "grad_norm": 0.29208219051361084, + "learning_rate": 4.6513193303579476e-05, + "loss": 1.1492, + "num_input_tokens_seen": 32358400, + "step": 3950 + }, + { + "epoch": 0.5116940173148986, + "grad_norm": 0.7336292862892151, + "learning_rate": 4.6495941212823644e-05, + "loss": 0.8435, + "num_input_tokens_seen": 32440320, + "step": 3960 + }, + { + "epoch": 0.5129861739242796, + "grad_norm": 0.45039644837379456, + "learning_rate": 4.647864976307811e-05, + "loss": 0.6948, + "num_input_tokens_seen": 32522240, + "step": 3970 + }, + { + "epoch": 0.5142783305336607, + "grad_norm": 0.4214424788951874, + "learning_rate": 4.646131898600345e-05, + "loss": 0.8107, + "num_input_tokens_seen": 32604160, + "step": 3980 + }, + { + "epoch": 0.5155704871430418, + "grad_norm": 0.5167589783668518, + "learning_rate": 4.644394891333227e-05, + "loss": 1.0497, + "num_input_tokens_seen": 32686080, + "step": 3990 + }, + { + "epoch": 0.5168626437524227, + "grad_norm": 0.5262385010719299, + "learning_rate": 4.64265395768691e-05, + "loss": 0.988, + "num_input_tokens_seen": 32768000, + "step": 4000 + }, + { + "epoch": 0.5181548003618038, + "grad_norm": 0.464677631855011, + "learning_rate": 4.6409091008490365e-05, + "loss": 0.9792, + "num_input_tokens_seen": 32849920, + "step": 4010 + }, + { + "epoch": 0.5194469569711849, + "grad_norm": 0.4901339113712311, + "learning_rate": 4.639160324014433e-05, + "loss": 0.7922, + "num_input_tokens_seen": 32931840, + "step": 4020 + }, + { + "epoch": 0.520739113580566, + "grad_norm": 0.47087928652763367, + "learning_rate": 4.637407630385104e-05, + "loss": 1.2068, + "num_input_tokens_seen": 33013760, + "step": 4030 + }, + { + "epoch": 0.522031270189947, + "grad_norm": 0.510374128818512, + "learning_rate": 4.6356510231702254e-05, + "loss": 0.9503, + "num_input_tokens_seen": 33095680, + "step": 4040 + }, + { + "epoch": 0.5233234267993281, + "grad_norm": 0.5987057685852051, + "learning_rate": 4.633890505586139e-05, + "loss": 0.8793, + "num_input_tokens_seen": 33177600, + "step": 4050 + }, + { + "epoch": 0.5246155834087092, + "grad_norm": 0.5550726652145386, + "learning_rate": 4.6321260808563445e-05, + "loss": 1.0048, + "num_input_tokens_seen": 33259520, + "step": 4060 + }, + { + "epoch": 0.5259077400180902, + "grad_norm": 0.727328896522522, + "learning_rate": 4.630357752211498e-05, + "loss": 0.8116, + "num_input_tokens_seen": 33341440, + "step": 4070 + }, + { + "epoch": 0.5271998966274712, + "grad_norm": 1.1229157447814941, + "learning_rate": 4.6285855228894025e-05, + "loss": 1.202, + "num_input_tokens_seen": 33423360, + "step": 4080 + }, + { + "epoch": 0.5284920532368523, + "grad_norm": 0.8320699334144592, + "learning_rate": 4.626809396135003e-05, + "loss": 1.2678, + "num_input_tokens_seen": 33505280, + "step": 4090 + }, + { + "epoch": 0.5297842098462333, + "grad_norm": 0.3679317533969879, + "learning_rate": 4.6250293752003834e-05, + "loss": 1.052, + "num_input_tokens_seen": 33587200, + "step": 4100 + }, + { + "epoch": 0.5310763664556144, + "grad_norm": 0.5202254056930542, + "learning_rate": 4.623245463344753e-05, + "loss": 0.9749, + "num_input_tokens_seen": 33669120, + "step": 4110 + }, + { + "epoch": 0.5323685230649955, + "grad_norm": 0.6384007930755615, + "learning_rate": 4.6214576638344484e-05, + "loss": 0.9361, + "num_input_tokens_seen": 33751040, + "step": 4120 + }, + { + "epoch": 0.5336606796743766, + "grad_norm": 0.39264151453971863, + "learning_rate": 4.619665979942924e-05, + "loss": 1.0865, + "num_input_tokens_seen": 33832960, + "step": 4130 + }, + { + "epoch": 0.5349528362837576, + "grad_norm": 0.5233849287033081, + "learning_rate": 4.617870414950748e-05, + "loss": 0.8994, + "num_input_tokens_seen": 33914880, + "step": 4140 + }, + { + "epoch": 0.5362449928931387, + "grad_norm": 0.4549662172794342, + "learning_rate": 4.616070972145591e-05, + "loss": 1.0843, + "num_input_tokens_seen": 33996800, + "step": 4150 + }, + { + "epoch": 0.5375371495025197, + "grad_norm": 0.5111933350563049, + "learning_rate": 4.614267654822228e-05, + "loss": 1.1799, + "num_input_tokens_seen": 34078720, + "step": 4160 + }, + { + "epoch": 0.5388293061119007, + "grad_norm": 0.49319344758987427, + "learning_rate": 4.612460466282525e-05, + "loss": 1.018, + "num_input_tokens_seen": 34160640, + "step": 4170 + }, + { + "epoch": 0.5401214627212818, + "grad_norm": 0.5339105725288391, + "learning_rate": 4.610649409835438e-05, + "loss": 1.2698, + "num_input_tokens_seen": 34242560, + "step": 4180 + }, + { + "epoch": 0.5414136193306629, + "grad_norm": 0.6269609928131104, + "learning_rate": 4.608834488797006e-05, + "loss": 1.3887, + "num_input_tokens_seen": 34324480, + "step": 4190 + }, + { + "epoch": 0.542705775940044, + "grad_norm": 0.5303708910942078, + "learning_rate": 4.607015706490341e-05, + "loss": 0.983, + "num_input_tokens_seen": 34406400, + "step": 4200 + }, + { + "epoch": 0.543997932549425, + "grad_norm": 0.3554169237613678, + "learning_rate": 4.6051930662456276e-05, + "loss": 0.8547, + "num_input_tokens_seen": 34488320, + "step": 4210 + }, + { + "epoch": 0.5452900891588061, + "grad_norm": 0.6087794899940491, + "learning_rate": 4.603366571400114e-05, + "loss": 1.1971, + "num_input_tokens_seen": 34570240, + "step": 4220 + }, + { + "epoch": 0.5465822457681871, + "grad_norm": 0.47708624601364136, + "learning_rate": 4.601536225298104e-05, + "loss": 0.927, + "num_input_tokens_seen": 34652160, + "step": 4230 + }, + { + "epoch": 0.5478744023775681, + "grad_norm": 0.2548501789569855, + "learning_rate": 4.5997020312909565e-05, + "loss": 0.8928, + "num_input_tokens_seen": 34734080, + "step": 4240 + }, + { + "epoch": 0.5491665589869492, + "grad_norm": 0.3759148418903351, + "learning_rate": 4.597863992737072e-05, + "loss": 0.9546, + "num_input_tokens_seen": 34816000, + "step": 4250 + }, + { + "epoch": 0.5504587155963303, + "grad_norm": 0.5236682891845703, + "learning_rate": 4.5960221130018946e-05, + "loss": 1.1029, + "num_input_tokens_seen": 34897920, + "step": 4260 + }, + { + "epoch": 0.5517508722057113, + "grad_norm": 0.5889397263526917, + "learning_rate": 4.594176395457897e-05, + "loss": 1.1309, + "num_input_tokens_seen": 34979840, + "step": 4270 + }, + { + "epoch": 0.5530430288150924, + "grad_norm": 0.5856964588165283, + "learning_rate": 4.592326843484583e-05, + "loss": 0.8424, + "num_input_tokens_seen": 35061760, + "step": 4280 + }, + { + "epoch": 0.5543351854244735, + "grad_norm": 0.5260830521583557, + "learning_rate": 4.590473460468475e-05, + "loss": 1.2382, + "num_input_tokens_seen": 35143680, + "step": 4290 + }, + { + "epoch": 0.5556273420338546, + "grad_norm": 0.5527287721633911, + "learning_rate": 4.58861624980311e-05, + "loss": 1.0987, + "num_input_tokens_seen": 35225600, + "step": 4300 + }, + { + "epoch": 0.5569194986432355, + "grad_norm": 0.6066960096359253, + "learning_rate": 4.586755214889035e-05, + "loss": 0.6929, + "num_input_tokens_seen": 35307520, + "step": 4310 + }, + { + "epoch": 0.5582116552526166, + "grad_norm": 0.8027804493904114, + "learning_rate": 4.584890359133797e-05, + "loss": 0.9671, + "num_input_tokens_seen": 35389440, + "step": 4320 + }, + { + "epoch": 0.5595038118619977, + "grad_norm": 0.5972751975059509, + "learning_rate": 4.58302168595194e-05, + "loss": 0.7893, + "num_input_tokens_seen": 35471360, + "step": 4330 + }, + { + "epoch": 0.5607959684713787, + "grad_norm": 0.36377179622650146, + "learning_rate": 4.5811491987649994e-05, + "loss": 0.8821, + "num_input_tokens_seen": 35553280, + "step": 4340 + }, + { + "epoch": 0.5620881250807598, + "grad_norm": 0.5296213626861572, + "learning_rate": 4.579272901001491e-05, + "loss": 0.8082, + "num_input_tokens_seen": 35635200, + "step": 4350 + }, + { + "epoch": 0.5633802816901409, + "grad_norm": 0.4613911509513855, + "learning_rate": 4.57739279609691e-05, + "loss": 0.9638, + "num_input_tokens_seen": 35717120, + "step": 4360 + }, + { + "epoch": 0.5646724382995219, + "grad_norm": 0.5832899212837219, + "learning_rate": 4.57550888749372e-05, + "loss": 1.0087, + "num_input_tokens_seen": 35799040, + "step": 4370 + }, + { + "epoch": 0.5659645949089029, + "grad_norm": 0.461342453956604, + "learning_rate": 4.5736211786413524e-05, + "loss": 1.0578, + "num_input_tokens_seen": 35880960, + "step": 4380 + }, + { + "epoch": 0.567256751518284, + "grad_norm": 0.5198622941970825, + "learning_rate": 4.571729672996195e-05, + "loss": 0.5186, + "num_input_tokens_seen": 35962880, + "step": 4390 + }, + { + "epoch": 0.5685489081276651, + "grad_norm": 0.4189557135105133, + "learning_rate": 4.5698343740215865e-05, + "loss": 1.0137, + "num_input_tokens_seen": 36044800, + "step": 4400 + }, + { + "epoch": 0.5698410647370461, + "grad_norm": 0.4138655364513397, + "learning_rate": 4.5679352851878135e-05, + "loss": 0.91, + "num_input_tokens_seen": 36126720, + "step": 4410 + }, + { + "epoch": 0.5711332213464272, + "grad_norm": 0.327007532119751, + "learning_rate": 4.5660324099721005e-05, + "loss": 0.8943, + "num_input_tokens_seen": 36208640, + "step": 4420 + }, + { + "epoch": 0.5724253779558083, + "grad_norm": 0.48592689633369446, + "learning_rate": 4.5641257518586044e-05, + "loss": 0.8402, + "num_input_tokens_seen": 36290560, + "step": 4430 + }, + { + "epoch": 0.5737175345651893, + "grad_norm": 0.5149102210998535, + "learning_rate": 4.562215314338411e-05, + "loss": 0.8945, + "num_input_tokens_seen": 36372480, + "step": 4440 + }, + { + "epoch": 0.5750096911745703, + "grad_norm": 0.47241947054862976, + "learning_rate": 4.560301100909522e-05, + "loss": 1.3013, + "num_input_tokens_seen": 36454400, + "step": 4450 + }, + { + "epoch": 0.5763018477839514, + "grad_norm": 0.6244943737983704, + "learning_rate": 4.558383115076857e-05, + "loss": 0.8028, + "num_input_tokens_seen": 36536320, + "step": 4460 + }, + { + "epoch": 0.5775940043933324, + "grad_norm": 0.42100536823272705, + "learning_rate": 4.556461360352241e-05, + "loss": 1.1116, + "num_input_tokens_seen": 36618240, + "step": 4470 + }, + { + "epoch": 0.5788861610027135, + "grad_norm": 0.5363230109214783, + "learning_rate": 4.554535840254398e-05, + "loss": 1.094, + "num_input_tokens_seen": 36700160, + "step": 4480 + }, + { + "epoch": 0.5801783176120946, + "grad_norm": 0.6321589350700378, + "learning_rate": 4.552606558308951e-05, + "loss": 0.9717, + "num_input_tokens_seen": 36782080, + "step": 4490 + }, + { + "epoch": 0.5814704742214757, + "grad_norm": 0.7036007046699524, + "learning_rate": 4.550673518048405e-05, + "loss": 1.0816, + "num_input_tokens_seen": 36864000, + "step": 4500 + }, + { + "epoch": 0.5827626308308567, + "grad_norm": 0.4853123724460602, + "learning_rate": 4.548736723012153e-05, + "loss": 0.9271, + "num_input_tokens_seen": 36945920, + "step": 4510 + }, + { + "epoch": 0.5840547874402378, + "grad_norm": 0.7495065331459045, + "learning_rate": 4.5467961767464575e-05, + "loss": 0.9318, + "num_input_tokens_seen": 37027840, + "step": 4520 + }, + { + "epoch": 0.5853469440496188, + "grad_norm": 0.43084239959716797, + "learning_rate": 4.5448518828044515e-05, + "loss": 0.7255, + "num_input_tokens_seen": 37109760, + "step": 4530 + }, + { + "epoch": 0.5866391006589998, + "grad_norm": 0.560870885848999, + "learning_rate": 4.5429038447461315e-05, + "loss": 1.1148, + "num_input_tokens_seen": 37191680, + "step": 4540 + }, + { + "epoch": 0.5879312572683809, + "grad_norm": 0.5736927390098572, + "learning_rate": 4.540952066138347e-05, + "loss": 1.1077, + "num_input_tokens_seen": 37273600, + "step": 4550 + }, + { + "epoch": 0.589223413877762, + "grad_norm": 0.27289411425590515, + "learning_rate": 4.538996550554798e-05, + "loss": 0.923, + "num_input_tokens_seen": 37355520, + "step": 4560 + }, + { + "epoch": 0.5905155704871431, + "grad_norm": 0.4996427893638611, + "learning_rate": 4.537037301576026e-05, + "loss": 0.9954, + "num_input_tokens_seen": 37437440, + "step": 4570 + }, + { + "epoch": 0.5918077270965241, + "grad_norm": 0.4833759367465973, + "learning_rate": 4.535074322789408e-05, + "loss": 0.9237, + "num_input_tokens_seen": 37519360, + "step": 4580 + }, + { + "epoch": 0.5930998837059052, + "grad_norm": 0.20252840220928192, + "learning_rate": 4.5331076177891527e-05, + "loss": 1.0009, + "num_input_tokens_seen": 37601280, + "step": 4590 + }, + { + "epoch": 0.5943920403152863, + "grad_norm": 0.49745073914527893, + "learning_rate": 4.531137190176289e-05, + "loss": 1.0067, + "num_input_tokens_seen": 37683200, + "step": 4600 + }, + { + "epoch": 0.5956841969246672, + "grad_norm": 0.5191871523857117, + "learning_rate": 4.529163043558662e-05, + "loss": 0.7292, + "num_input_tokens_seen": 37765120, + "step": 4610 + }, + { + "epoch": 0.5969763535340483, + "grad_norm": 0.7261500954627991, + "learning_rate": 4.527185181550928e-05, + "loss": 1.0878, + "num_input_tokens_seen": 37847040, + "step": 4620 + }, + { + "epoch": 0.5982685101434294, + "grad_norm": 0.6851155757904053, + "learning_rate": 4.525203607774544e-05, + "loss": 0.741, + "num_input_tokens_seen": 37928960, + "step": 4630 + }, + { + "epoch": 0.5995606667528104, + "grad_norm": 0.5459215044975281, + "learning_rate": 4.5232183258577655e-05, + "loss": 0.498, + "num_input_tokens_seen": 38010880, + "step": 4640 + }, + { + "epoch": 0.6008528233621915, + "grad_norm": 0.3943910300731659, + "learning_rate": 4.5212293394356356e-05, + "loss": 0.854, + "num_input_tokens_seen": 38092800, + "step": 4650 + }, + { + "epoch": 0.6021449799715726, + "grad_norm": 0.402898907661438, + "learning_rate": 4.519236652149981e-05, + "loss": 0.8904, + "num_input_tokens_seen": 38174720, + "step": 4660 + }, + { + "epoch": 0.6034371365809537, + "grad_norm": 0.23560667037963867, + "learning_rate": 4.517240267649405e-05, + "loss": 0.6613, + "num_input_tokens_seen": 38256640, + "step": 4670 + }, + { + "epoch": 0.6047292931903346, + "grad_norm": 0.3227959871292114, + "learning_rate": 4.515240189589282e-05, + "loss": 0.679, + "num_input_tokens_seen": 38338560, + "step": 4680 + }, + { + "epoch": 0.6060214497997157, + "grad_norm": 0.29034727811813354, + "learning_rate": 4.5132364216317446e-05, + "loss": 0.7213, + "num_input_tokens_seen": 38420480, + "step": 4690 + }, + { + "epoch": 0.6073136064090968, + "grad_norm": 0.5373560190200806, + "learning_rate": 4.5112289674456864e-05, + "loss": 1.0668, + "num_input_tokens_seen": 38502400, + "step": 4700 + }, + { + "epoch": 0.6086057630184778, + "grad_norm": 0.48361408710479736, + "learning_rate": 4.509217830706749e-05, + "loss": 0.6861, + "num_input_tokens_seen": 38584320, + "step": 4710 + }, + { + "epoch": 0.6098979196278589, + "grad_norm": 0.4311217963695526, + "learning_rate": 4.5072030150973154e-05, + "loss": 0.9106, + "num_input_tokens_seen": 38666240, + "step": 4720 + }, + { + "epoch": 0.61119007623724, + "grad_norm": 0.47615599632263184, + "learning_rate": 4.505184524306506e-05, + "loss": 0.73, + "num_input_tokens_seen": 38748160, + "step": 4730 + }, + { + "epoch": 0.612482232846621, + "grad_norm": 0.5375292897224426, + "learning_rate": 4.50316236203017e-05, + "loss": 1.3151, + "num_input_tokens_seen": 38830080, + "step": 4740 + }, + { + "epoch": 0.613774389456002, + "grad_norm": 0.42838889360427856, + "learning_rate": 4.5011365319708796e-05, + "loss": 1.0097, + "num_input_tokens_seen": 38912000, + "step": 4750 + }, + { + "epoch": 0.6150665460653831, + "grad_norm": 0.5048761367797852, + "learning_rate": 4.499107037837922e-05, + "loss": 1.0672, + "num_input_tokens_seen": 38993920, + "step": 4760 + }, + { + "epoch": 0.6163587026747642, + "grad_norm": 0.4704453647136688, + "learning_rate": 4.497073883347293e-05, + "loss": 0.8592, + "num_input_tokens_seen": 39075840, + "step": 4770 + }, + { + "epoch": 0.6176508592841452, + "grad_norm": 0.5054408311843872, + "learning_rate": 4.495037072221692e-05, + "loss": 0.979, + "num_input_tokens_seen": 39157760, + "step": 4780 + }, + { + "epoch": 0.6189430158935263, + "grad_norm": 0.4307916462421417, + "learning_rate": 4.49299660819051e-05, + "loss": 1.1349, + "num_input_tokens_seen": 39239680, + "step": 4790 + }, + { + "epoch": 0.6202351725029074, + "grad_norm": 0.7351888418197632, + "learning_rate": 4.490952494989834e-05, + "loss": 0.8819, + "num_input_tokens_seen": 39321600, + "step": 4800 + }, + { + "epoch": 0.6215273291122884, + "grad_norm": 0.7164278030395508, + "learning_rate": 4.4889047363624236e-05, + "loss": 1.0527, + "num_input_tokens_seen": 39403520, + "step": 4810 + }, + { + "epoch": 0.6228194857216695, + "grad_norm": 0.5169610381126404, + "learning_rate": 4.486853336057719e-05, + "loss": 1.1608, + "num_input_tokens_seen": 39485440, + "step": 4820 + }, + { + "epoch": 0.6241116423310505, + "grad_norm": 0.9453770518302917, + "learning_rate": 4.484798297831826e-05, + "loss": 0.9382, + "num_input_tokens_seen": 39567360, + "step": 4830 + }, + { + "epoch": 0.6254037989404316, + "grad_norm": 0.49842312932014465, + "learning_rate": 4.482739625447514e-05, + "loss": 0.9578, + "num_input_tokens_seen": 39649280, + "step": 4840 + }, + { + "epoch": 0.6266959555498126, + "grad_norm": 0.5001227855682373, + "learning_rate": 4.480677322674202e-05, + "loss": 0.5964, + "num_input_tokens_seen": 39731200, + "step": 4850 + }, + { + "epoch": 0.6279881121591937, + "grad_norm": 0.42781203985214233, + "learning_rate": 4.4786113932879605e-05, + "loss": 1.0508, + "num_input_tokens_seen": 39813120, + "step": 4860 + }, + { + "epoch": 0.6292802687685748, + "grad_norm": 0.3244008719921112, + "learning_rate": 4.476541841071498e-05, + "loss": 0.7638, + "num_input_tokens_seen": 39895040, + "step": 4870 + }, + { + "epoch": 0.6305724253779558, + "grad_norm": 0.3672725558280945, + "learning_rate": 4.4744686698141564e-05, + "loss": 0.5432, + "num_input_tokens_seen": 39976960, + "step": 4880 + }, + { + "epoch": 0.6318645819873369, + "grad_norm": 0.44159212708473206, + "learning_rate": 4.472391883311906e-05, + "loss": 1.0626, + "num_input_tokens_seen": 40058880, + "step": 4890 + }, + { + "epoch": 0.633156738596718, + "grad_norm": 0.5819474458694458, + "learning_rate": 4.470311485367335e-05, + "loss": 0.8067, + "num_input_tokens_seen": 40140800, + "step": 4900 + }, + { + "epoch": 0.6344488952060989, + "grad_norm": 0.618045449256897, + "learning_rate": 4.468227479789644e-05, + "loss": 0.8431, + "num_input_tokens_seen": 40222720, + "step": 4910 + }, + { + "epoch": 0.63574105181548, + "grad_norm": 0.6308985948562622, + "learning_rate": 4.4661398703946396e-05, + "loss": 0.92, + "num_input_tokens_seen": 40304640, + "step": 4920 + }, + { + "epoch": 0.6370332084248611, + "grad_norm": 0.49686864018440247, + "learning_rate": 4.464048661004727e-05, + "loss": 1.0994, + "num_input_tokens_seen": 40386560, + "step": 4930 + }, + { + "epoch": 0.6383253650342422, + "grad_norm": 0.5074322819709778, + "learning_rate": 4.461953855448903e-05, + "loss": 1.0147, + "num_input_tokens_seen": 40468480, + "step": 4940 + }, + { + "epoch": 0.6396175216436232, + "grad_norm": 0.7457041144371033, + "learning_rate": 4.4598554575627495e-05, + "loss": 1.1607, + "num_input_tokens_seen": 40550400, + "step": 4950 + }, + { + "epoch": 0.6409096782530043, + "grad_norm": 0.5672173500061035, + "learning_rate": 4.4577534711884244e-05, + "loss": 1.1159, + "num_input_tokens_seen": 40632320, + "step": 4960 + }, + { + "epoch": 0.6422018348623854, + "grad_norm": 0.3177810609340668, + "learning_rate": 4.455647900174658e-05, + "loss": 1.0039, + "num_input_tokens_seen": 40714240, + "step": 4970 + }, + { + "epoch": 0.6434939914717663, + "grad_norm": 0.6089414358139038, + "learning_rate": 4.453538748376742e-05, + "loss": 0.8848, + "num_input_tokens_seen": 40796160, + "step": 4980 + }, + { + "epoch": 0.6447861480811474, + "grad_norm": 0.44886818528175354, + "learning_rate": 4.451426019656526e-05, + "loss": 0.624, + "num_input_tokens_seen": 40878080, + "step": 4990 + }, + { + "epoch": 0.6460783046905285, + "grad_norm": 0.336834192276001, + "learning_rate": 4.449309717882409e-05, + "loss": 1.0137, + "num_input_tokens_seen": 40960000, + "step": 5000 + }, + { + "epoch": 0.6473704612999095, + "grad_norm": 0.8568751215934753, + "learning_rate": 4.4471898469293324e-05, + "loss": 0.9478, + "num_input_tokens_seen": 41041920, + "step": 5010 + }, + { + "epoch": 0.6486626179092906, + "grad_norm": 0.6555817127227783, + "learning_rate": 4.4450664106787706e-05, + "loss": 0.8829, + "num_input_tokens_seen": 41123840, + "step": 5020 + }, + { + "epoch": 0.6499547745186717, + "grad_norm": 0.656535267829895, + "learning_rate": 4.442939413018728e-05, + "loss": 1.0126, + "num_input_tokens_seen": 41205760, + "step": 5030 + }, + { + "epoch": 0.6512469311280528, + "grad_norm": 0.5044434666633606, + "learning_rate": 4.44080885784373e-05, + "loss": 0.8302, + "num_input_tokens_seen": 41287680, + "step": 5040 + }, + { + "epoch": 0.6525390877374337, + "grad_norm": 1.4959832429885864, + "learning_rate": 4.4386747490548156e-05, + "loss": 1.0349, + "num_input_tokens_seen": 41369600, + "step": 5050 + }, + { + "epoch": 0.6538312443468148, + "grad_norm": 0.623151421546936, + "learning_rate": 4.43653709055953e-05, + "loss": 1.1468, + "num_input_tokens_seen": 41451520, + "step": 5060 + }, + { + "epoch": 0.6551234009561959, + "grad_norm": 0.606468141078949, + "learning_rate": 4.434395886271917e-05, + "loss": 1.1647, + "num_input_tokens_seen": 41533440, + "step": 5070 + }, + { + "epoch": 0.6564155575655769, + "grad_norm": 0.5442026853561401, + "learning_rate": 4.4322511401125156e-05, + "loss": 1.2088, + "num_input_tokens_seen": 41615360, + "step": 5080 + }, + { + "epoch": 0.657707714174958, + "grad_norm": 0.5820073485374451, + "learning_rate": 4.430102856008347e-05, + "loss": 0.8394, + "num_input_tokens_seen": 41697280, + "step": 5090 + }, + { + "epoch": 0.6589998707843391, + "grad_norm": 0.6123668551445007, + "learning_rate": 4.427951037892911e-05, + "loss": 0.7364, + "num_input_tokens_seen": 41779200, + "step": 5100 + }, + { + "epoch": 0.6602920273937202, + "grad_norm": 0.4739777743816376, + "learning_rate": 4.4257956897061805e-05, + "loss": 0.8559, + "num_input_tokens_seen": 41861120, + "step": 5110 + }, + { + "epoch": 0.6615841840031012, + "grad_norm": 0.5354482531547546, + "learning_rate": 4.423636815394588e-05, + "loss": 0.9424, + "num_input_tokens_seen": 41943040, + "step": 5120 + }, + { + "epoch": 0.6628763406124822, + "grad_norm": 0.5862747430801392, + "learning_rate": 4.4214744189110266e-05, + "loss": 1.1937, + "num_input_tokens_seen": 42024960, + "step": 5130 + }, + { + "epoch": 0.6641684972218633, + "grad_norm": 0.4908977150917053, + "learning_rate": 4.4193085042148354e-05, + "loss": 1.0227, + "num_input_tokens_seen": 42106880, + "step": 5140 + }, + { + "epoch": 0.6654606538312443, + "grad_norm": 0.8534203171730042, + "learning_rate": 4.417139075271796e-05, + "loss": 1.3091, + "num_input_tokens_seen": 42188800, + "step": 5150 + }, + { + "epoch": 0.6667528104406254, + "grad_norm": 0.47156473994255066, + "learning_rate": 4.414966136054125e-05, + "loss": 0.6186, + "num_input_tokens_seen": 42270720, + "step": 5160 + }, + { + "epoch": 0.6680449670500065, + "grad_norm": 0.5299347043037415, + "learning_rate": 4.412789690540466e-05, + "loss": 0.7974, + "num_input_tokens_seen": 42352640, + "step": 5170 + }, + { + "epoch": 0.6693371236593875, + "grad_norm": 0.5082312226295471, + "learning_rate": 4.410609742715883e-05, + "loss": 0.8996, + "num_input_tokens_seen": 42434560, + "step": 5180 + }, + { + "epoch": 0.6706292802687686, + "grad_norm": 0.48898372054100037, + "learning_rate": 4.408426296571852e-05, + "loss": 1.0341, + "num_input_tokens_seen": 42516480, + "step": 5190 + }, + { + "epoch": 0.6719214368781496, + "grad_norm": 0.7845025658607483, + "learning_rate": 4.406239356106257e-05, + "loss": 0.9729, + "num_input_tokens_seen": 42598400, + "step": 5200 + }, + { + "epoch": 0.6732135934875307, + "grad_norm": 0.3541695475578308, + "learning_rate": 4.404048925323375e-05, + "loss": 0.6879, + "num_input_tokens_seen": 42680320, + "step": 5210 + }, + { + "epoch": 0.6745057500969117, + "grad_norm": 0.6555191874504089, + "learning_rate": 4.401855008233879e-05, + "loss": 1.2569, + "num_input_tokens_seen": 42762240, + "step": 5220 + }, + { + "epoch": 0.6757979067062928, + "grad_norm": 0.44755908846855164, + "learning_rate": 4.3996576088548214e-05, + "loss": 1.1061, + "num_input_tokens_seen": 42844160, + "step": 5230 + }, + { + "epoch": 0.6770900633156739, + "grad_norm": 0.4899725615978241, + "learning_rate": 4.397456731209634e-05, + "loss": 1.0198, + "num_input_tokens_seen": 42926080, + "step": 5240 + }, + { + "epoch": 0.6783822199250549, + "grad_norm": 0.30107763409614563, + "learning_rate": 4.395252379328115e-05, + "loss": 0.7051, + "num_input_tokens_seen": 43008000, + "step": 5250 + }, + { + "epoch": 0.679674376534436, + "grad_norm": 0.24033145606517792, + "learning_rate": 4.393044557246424e-05, + "loss": 0.853, + "num_input_tokens_seen": 43089920, + "step": 5260 + }, + { + "epoch": 0.680966533143817, + "grad_norm": 0.5303036570549011, + "learning_rate": 4.3908332690070765e-05, + "loss": 0.9311, + "num_input_tokens_seen": 43171840, + "step": 5270 + }, + { + "epoch": 0.682258689753198, + "grad_norm": 0.4759383499622345, + "learning_rate": 4.388618518658932e-05, + "loss": 1.3277, + "num_input_tokens_seen": 43253760, + "step": 5280 + }, + { + "epoch": 0.6835508463625791, + "grad_norm": 0.4998660385608673, + "learning_rate": 4.3864003102571916e-05, + "loss": 0.8287, + "num_input_tokens_seen": 43335680, + "step": 5290 + }, + { + "epoch": 0.6848430029719602, + "grad_norm": 0.47711753845214844, + "learning_rate": 4.384178647863385e-05, + "loss": 0.6284, + "num_input_tokens_seen": 43417600, + "step": 5300 + }, + { + "epoch": 0.6861351595813413, + "grad_norm": 0.6608704328536987, + "learning_rate": 4.381953535545369e-05, + "loss": 1.1245, + "num_input_tokens_seen": 43499520, + "step": 5310 + }, + { + "epoch": 0.6874273161907223, + "grad_norm": 0.6293668746948242, + "learning_rate": 4.3797249773773165e-05, + "loss": 0.8103, + "num_input_tokens_seen": 43581440, + "step": 5320 + }, + { + "epoch": 0.6887194728001034, + "grad_norm": 0.7585543990135193, + "learning_rate": 4.3774929774397086e-05, + "loss": 0.6222, + "num_input_tokens_seen": 43663360, + "step": 5330 + }, + { + "epoch": 0.6900116294094845, + "grad_norm": 0.6242161393165588, + "learning_rate": 4.375257539819328e-05, + "loss": 1.0592, + "num_input_tokens_seen": 43745280, + "step": 5340 + }, + { + "epoch": 0.6913037860188654, + "grad_norm": 0.47205057740211487, + "learning_rate": 4.373018668609256e-05, + "loss": 1.1099, + "num_input_tokens_seen": 43827200, + "step": 5350 + }, + { + "epoch": 0.6925959426282465, + "grad_norm": 0.6397084593772888, + "learning_rate": 4.370776367908854e-05, + "loss": 1.0354, + "num_input_tokens_seen": 43909120, + "step": 5360 + }, + { + "epoch": 0.6938880992376276, + "grad_norm": 0.5018695592880249, + "learning_rate": 4.368530641823769e-05, + "loss": 1.2438, + "num_input_tokens_seen": 43991040, + "step": 5370 + }, + { + "epoch": 0.6951802558470087, + "grad_norm": 0.5705529451370239, + "learning_rate": 4.3662814944659156e-05, + "loss": 0.9602, + "num_input_tokens_seen": 44072960, + "step": 5380 + }, + { + "epoch": 0.6964724124563897, + "grad_norm": 0.5414160490036011, + "learning_rate": 4.364028929953476e-05, + "loss": 0.9233, + "num_input_tokens_seen": 44154880, + "step": 5390 + }, + { + "epoch": 0.6977645690657708, + "grad_norm": 0.5052191615104675, + "learning_rate": 4.361772952410886e-05, + "loss": 1.1027, + "num_input_tokens_seen": 44236800, + "step": 5400 + }, + { + "epoch": 0.6990567256751519, + "grad_norm": 0.5868683457374573, + "learning_rate": 4.359513565968832e-05, + "loss": 1.0273, + "num_input_tokens_seen": 44318720, + "step": 5410 + }, + { + "epoch": 0.7003488822845328, + "grad_norm": 0.5551066994667053, + "learning_rate": 4.357250774764245e-05, + "loss": 0.8502, + "num_input_tokens_seen": 44400640, + "step": 5420 + }, + { + "epoch": 0.7016410388939139, + "grad_norm": 0.36578068137168884, + "learning_rate": 4.354984582940285e-05, + "loss": 0.7773, + "num_input_tokens_seen": 44482560, + "step": 5430 + }, + { + "epoch": 0.702933195503295, + "grad_norm": 0.8225210309028625, + "learning_rate": 4.35271499464634e-05, + "loss": 0.9399, + "num_input_tokens_seen": 44564480, + "step": 5440 + }, + { + "epoch": 0.704225352112676, + "grad_norm": 0.5141643285751343, + "learning_rate": 4.350442014038021e-05, + "loss": 0.9732, + "num_input_tokens_seen": 44646400, + "step": 5450 + }, + { + "epoch": 0.7055175087220571, + "grad_norm": 0.42161181569099426, + "learning_rate": 4.348165645277145e-05, + "loss": 0.9438, + "num_input_tokens_seen": 44728320, + "step": 5460 + }, + { + "epoch": 0.7068096653314382, + "grad_norm": 0.44062143564224243, + "learning_rate": 4.345885892531735e-05, + "loss": 1.3828, + "num_input_tokens_seen": 44810240, + "step": 5470 + }, + { + "epoch": 0.7081018219408193, + "grad_norm": 0.44696852564811707, + "learning_rate": 4.343602759976011e-05, + "loss": 1.1361, + "num_input_tokens_seen": 44892160, + "step": 5480 + }, + { + "epoch": 0.7093939785502003, + "grad_norm": 0.7456682920455933, + "learning_rate": 4.34131625179038e-05, + "loss": 0.9972, + "num_input_tokens_seen": 44974080, + "step": 5490 + }, + { + "epoch": 0.7106861351595813, + "grad_norm": 0.4710252285003662, + "learning_rate": 4.3390263721614286e-05, + "loss": 1.1289, + "num_input_tokens_seen": 45056000, + "step": 5500 + }, + { + "epoch": 0.7119782917689624, + "grad_norm": 0.4541804790496826, + "learning_rate": 4.33673312528192e-05, + "loss": 0.699, + "num_input_tokens_seen": 45137920, + "step": 5510 + }, + { + "epoch": 0.7132704483783434, + "grad_norm": 0.5204092860221863, + "learning_rate": 4.334436515350779e-05, + "loss": 1.1252, + "num_input_tokens_seen": 45219840, + "step": 5520 + }, + { + "epoch": 0.7145626049877245, + "grad_norm": 0.5446497201919556, + "learning_rate": 4.332136546573092e-05, + "loss": 1.0118, + "num_input_tokens_seen": 45301760, + "step": 5530 + }, + { + "epoch": 0.7158547615971056, + "grad_norm": 0.5484737753868103, + "learning_rate": 4.3298332231600925e-05, + "loss": 0.8958, + "num_input_tokens_seen": 45383680, + "step": 5540 + }, + { + "epoch": 0.7171469182064866, + "grad_norm": 0.4673605263233185, + "learning_rate": 4.327526549329157e-05, + "loss": 0.7937, + "num_input_tokens_seen": 45465600, + "step": 5550 + }, + { + "epoch": 0.7184390748158677, + "grad_norm": 0.4332837760448456, + "learning_rate": 4.325216529303798e-05, + "loss": 0.7012, + "num_input_tokens_seen": 45547520, + "step": 5560 + }, + { + "epoch": 0.7197312314252488, + "grad_norm": 0.5795179009437561, + "learning_rate": 4.3229031673136514e-05, + "loss": 0.7965, + "num_input_tokens_seen": 45629440, + "step": 5570 + }, + { + "epoch": 0.7210233880346298, + "grad_norm": 0.5822563767433167, + "learning_rate": 4.320586467594476e-05, + "loss": 0.9927, + "num_input_tokens_seen": 45711360, + "step": 5580 + }, + { + "epoch": 0.7223155446440108, + "grad_norm": 0.3716464340686798, + "learning_rate": 4.3182664343881415e-05, + "loss": 0.8541, + "num_input_tokens_seen": 45793280, + "step": 5590 + }, + { + "epoch": 0.7236077012533919, + "grad_norm": 0.6027759313583374, + "learning_rate": 4.315943071942619e-05, + "loss": 0.989, + "num_input_tokens_seen": 45875200, + "step": 5600 + }, + { + "epoch": 0.724899857862773, + "grad_norm": 0.4732694923877716, + "learning_rate": 4.313616384511976e-05, + "loss": 1.0818, + "num_input_tokens_seen": 45957120, + "step": 5610 + }, + { + "epoch": 0.726192014472154, + "grad_norm": 0.5868046283721924, + "learning_rate": 4.3112863763563695e-05, + "loss": 1.0183, + "num_input_tokens_seen": 46039040, + "step": 5620 + }, + { + "epoch": 0.7274841710815351, + "grad_norm": 0.5000009536743164, + "learning_rate": 4.308953051742036e-05, + "loss": 1.1861, + "num_input_tokens_seen": 46120960, + "step": 5630 + }, + { + "epoch": 0.7287763276909162, + "grad_norm": 0.5284093022346497, + "learning_rate": 4.3066164149412844e-05, + "loss": 0.8941, + "num_input_tokens_seen": 46202880, + "step": 5640 + }, + { + "epoch": 0.7300684843002972, + "grad_norm": 0.4090714454650879, + "learning_rate": 4.304276470232488e-05, + "loss": 0.8618, + "num_input_tokens_seen": 46284800, + "step": 5650 + }, + { + "epoch": 0.7313606409096782, + "grad_norm": 0.6819685697555542, + "learning_rate": 4.3019332219000766e-05, + "loss": 0.8772, + "num_input_tokens_seen": 46366720, + "step": 5660 + }, + { + "epoch": 0.7326527975190593, + "grad_norm": 0.8263905048370361, + "learning_rate": 4.299586674234529e-05, + "loss": 1.0889, + "num_input_tokens_seen": 46448640, + "step": 5670 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 0.5756688714027405, + "learning_rate": 4.2972368315323676e-05, + "loss": 0.7212, + "num_input_tokens_seen": 46530560, + "step": 5680 + }, + { + "epoch": 0.7352371107378214, + "grad_norm": 0.5566821694374084, + "learning_rate": 4.294883698096143e-05, + "loss": 0.8605, + "num_input_tokens_seen": 46612480, + "step": 5690 + }, + { + "epoch": 0.7365292673472025, + "grad_norm": 0.5805408358573914, + "learning_rate": 4.292527278234435e-05, + "loss": 0.9687, + "num_input_tokens_seen": 46694400, + "step": 5700 + }, + { + "epoch": 0.7378214239565836, + "grad_norm": 0.19487954676151276, + "learning_rate": 4.290167576261841e-05, + "loss": 0.5747, + "num_input_tokens_seen": 46776320, + "step": 5710 + }, + { + "epoch": 0.7391135805659645, + "grad_norm": 0.5343565940856934, + "learning_rate": 4.2878045964989646e-05, + "loss": 0.7948, + "num_input_tokens_seen": 46858240, + "step": 5720 + }, + { + "epoch": 0.7404057371753456, + "grad_norm": 2.30495023727417, + "learning_rate": 4.285438343272414e-05, + "loss": 1.0718, + "num_input_tokens_seen": 46940160, + "step": 5730 + }, + { + "epoch": 0.7416978937847267, + "grad_norm": 0.5874922871589661, + "learning_rate": 4.283068820914791e-05, + "loss": 0.9519, + "num_input_tokens_seen": 47022080, + "step": 5740 + }, + { + "epoch": 0.7429900503941078, + "grad_norm": 0.4819450378417969, + "learning_rate": 4.2806960337646804e-05, + "loss": 1.015, + "num_input_tokens_seen": 47104000, + "step": 5750 + }, + { + "epoch": 0.7442822070034888, + "grad_norm": 0.6047049164772034, + "learning_rate": 4.278319986166649e-05, + "loss": 0.8983, + "num_input_tokens_seen": 47185920, + "step": 5760 + }, + { + "epoch": 0.7455743636128699, + "grad_norm": 0.32134854793548584, + "learning_rate": 4.27594068247123e-05, + "loss": 0.8726, + "num_input_tokens_seen": 47267840, + "step": 5770 + }, + { + "epoch": 0.746866520222251, + "grad_norm": 0.7429246306419373, + "learning_rate": 4.27355812703492e-05, + "loss": 0.7588, + "num_input_tokens_seen": 47349760, + "step": 5780 + }, + { + "epoch": 0.748158676831632, + "grad_norm": 0.5023065805435181, + "learning_rate": 4.2711723242201695e-05, + "loss": 1.1223, + "num_input_tokens_seen": 47431680, + "step": 5790 + }, + { + "epoch": 0.749450833441013, + "grad_norm": 0.38930198550224304, + "learning_rate": 4.268783278395374e-05, + "loss": 0.7947, + "num_input_tokens_seen": 47513600, + "step": 5800 + }, + { + "epoch": 0.7507429900503941, + "grad_norm": 0.488908976316452, + "learning_rate": 4.2663909939348684e-05, + "loss": 0.9494, + "num_input_tokens_seen": 47595520, + "step": 5810 + }, + { + "epoch": 0.7520351466597751, + "grad_norm": 0.4929217994213104, + "learning_rate": 4.263995475218917e-05, + "loss": 0.7431, + "num_input_tokens_seen": 47677440, + "step": 5820 + }, + { + "epoch": 0.7533273032691562, + "grad_norm": 0.4818821847438812, + "learning_rate": 4.2615967266337045e-05, + "loss": 0.8616, + "num_input_tokens_seen": 47759360, + "step": 5830 + }, + { + "epoch": 0.7546194598785373, + "grad_norm": 0.5090234875679016, + "learning_rate": 4.2591947525713326e-05, + "loss": 1.1937, + "num_input_tokens_seen": 47841280, + "step": 5840 + }, + { + "epoch": 0.7559116164879184, + "grad_norm": 0.6463255882263184, + "learning_rate": 4.256789557429806e-05, + "loss": 0.8848, + "num_input_tokens_seen": 47923200, + "step": 5850 + }, + { + "epoch": 0.7572037730972994, + "grad_norm": 0.5930004119873047, + "learning_rate": 4.254381145613027e-05, + "loss": 0.9741, + "num_input_tokens_seen": 48005120, + "step": 5860 + }, + { + "epoch": 0.7584959297066804, + "grad_norm": 0.5355798602104187, + "learning_rate": 4.251969521530791e-05, + "loss": 1.1395, + "num_input_tokens_seen": 48087040, + "step": 5870 + }, + { + "epoch": 0.7597880863160615, + "grad_norm": 0.4238862693309784, + "learning_rate": 4.2495546895987724e-05, + "loss": 1.074, + "num_input_tokens_seen": 48168960, + "step": 5880 + }, + { + "epoch": 0.7610802429254425, + "grad_norm": 0.5069129467010498, + "learning_rate": 4.2471366542385196e-05, + "loss": 0.9192, + "num_input_tokens_seen": 48250880, + "step": 5890 + }, + { + "epoch": 0.7623723995348236, + "grad_norm": 0.436614066362381, + "learning_rate": 4.2447154198774445e-05, + "loss": 0.999, + "num_input_tokens_seen": 48332800, + "step": 5900 + }, + { + "epoch": 0.7636645561442047, + "grad_norm": 0.21864083409309387, + "learning_rate": 4.242290990948821e-05, + "loss": 1.0004, + "num_input_tokens_seen": 48414720, + "step": 5910 + }, + { + "epoch": 0.7649567127535858, + "grad_norm": 0.5218377113342285, + "learning_rate": 4.2398633718917684e-05, + "loss": 0.8302, + "num_input_tokens_seen": 48496640, + "step": 5920 + }, + { + "epoch": 0.7662488693629668, + "grad_norm": 0.6044358611106873, + "learning_rate": 4.237432567151248e-05, + "loss": 1.2325, + "num_input_tokens_seen": 48578560, + "step": 5930 + }, + { + "epoch": 0.7675410259723479, + "grad_norm": 1.3460071086883545, + "learning_rate": 4.234998581178056e-05, + "loss": 0.636, + "num_input_tokens_seen": 48660480, + "step": 5940 + }, + { + "epoch": 0.768833182581729, + "grad_norm": 0.33106595277786255, + "learning_rate": 4.2325614184288096e-05, + "loss": 0.633, + "num_input_tokens_seen": 48742400, + "step": 5950 + }, + { + "epoch": 0.7701253391911099, + "grad_norm": 0.6000261306762695, + "learning_rate": 4.2301210833659464e-05, + "loss": 0.8583, + "num_input_tokens_seen": 48824320, + "step": 5960 + }, + { + "epoch": 0.771417495800491, + "grad_norm": 0.545335054397583, + "learning_rate": 4.227677580457711e-05, + "loss": 0.9888, + "num_input_tokens_seen": 48906240, + "step": 5970 + }, + { + "epoch": 0.7727096524098721, + "grad_norm": 0.5507743954658508, + "learning_rate": 4.2252309141781464e-05, + "loss": 1.0736, + "num_input_tokens_seen": 48988160, + "step": 5980 + }, + { + "epoch": 0.7740018090192531, + "grad_norm": 0.6265859007835388, + "learning_rate": 4.222781089007092e-05, + "loss": 0.995, + "num_input_tokens_seen": 49070080, + "step": 5990 + }, + { + "epoch": 0.7752939656286342, + "grad_norm": 1014203.875, + "learning_rate": 4.220328109430167e-05, + "loss": 0.8812, + "num_input_tokens_seen": 49152000, + "step": 6000 + }, + { + "epoch": 0.7765861222380153, + "grad_norm": 0.7153500318527222, + "learning_rate": 4.217871979938769e-05, + "loss": 1.3075, + "num_input_tokens_seen": 49233920, + "step": 6010 + }, + { + "epoch": 0.7778782788473964, + "grad_norm": 0.7981336712837219, + "learning_rate": 4.215412705030063e-05, + "loss": 0.8665, + "num_input_tokens_seen": 49315840, + "step": 6020 + }, + { + "epoch": 0.7791704354567773, + "grad_norm": 0.6426622867584229, + "learning_rate": 4.21295028920697e-05, + "loss": 0.852, + "num_input_tokens_seen": 49397760, + "step": 6030 + }, + { + "epoch": 0.7804625920661584, + "grad_norm": 0.6837995052337646, + "learning_rate": 4.210484736978166e-05, + "loss": 0.9422, + "num_input_tokens_seen": 49479680, + "step": 6040 + }, + { + "epoch": 0.7817547486755395, + "grad_norm": 0.4627484679222107, + "learning_rate": 4.208016052858067e-05, + "loss": 0.8523, + "num_input_tokens_seen": 49561600, + "step": 6050 + }, + { + "epoch": 0.7830469052849205, + "grad_norm": 0.29264596104621887, + "learning_rate": 4.2055442413668264e-05, + "loss": 0.7916, + "num_input_tokens_seen": 49643520, + "step": 6060 + }, + { + "epoch": 0.7843390618943016, + "grad_norm": 0.4580781161785126, + "learning_rate": 4.2030693070303204e-05, + "loss": 1.1109, + "num_input_tokens_seen": 49725440, + "step": 6070 + }, + { + "epoch": 0.7856312185036827, + "grad_norm": 0.24061749875545502, + "learning_rate": 4.2005912543801444e-05, + "loss": 0.7227, + "num_input_tokens_seen": 49807360, + "step": 6080 + }, + { + "epoch": 0.7869233751130637, + "grad_norm": 0.4084072709083557, + "learning_rate": 4.198110087953606e-05, + "loss": 0.7407, + "num_input_tokens_seen": 49889280, + "step": 6090 + }, + { + "epoch": 0.7882155317224447, + "grad_norm": 0.5031906962394714, + "learning_rate": 4.195625812293709e-05, + "loss": 1.4336, + "num_input_tokens_seen": 49971200, + "step": 6100 + }, + { + "epoch": 0.7895076883318258, + "grad_norm": 0.21128413081169128, + "learning_rate": 4.193138431949155e-05, + "loss": 0.9719, + "num_input_tokens_seen": 50053120, + "step": 6110 + }, + { + "epoch": 0.7907998449412069, + "grad_norm": 0.44993460178375244, + "learning_rate": 4.190647951474328e-05, + "loss": 1.0592, + "num_input_tokens_seen": 50135040, + "step": 6120 + }, + { + "epoch": 0.7920920015505879, + "grad_norm": 0.5219196677207947, + "learning_rate": 4.188154375429288e-05, + "loss": 0.8778, + "num_input_tokens_seen": 50216960, + "step": 6130 + }, + { + "epoch": 0.793384158159969, + "grad_norm": 0.5350174307823181, + "learning_rate": 4.1856577083797646e-05, + "loss": 1.329, + "num_input_tokens_seen": 50298880, + "step": 6140 + }, + { + "epoch": 0.7946763147693501, + "grad_norm": 0.6473702192306519, + "learning_rate": 4.183157954897144e-05, + "loss": 1.2239, + "num_input_tokens_seen": 50380800, + "step": 6150 + }, + { + "epoch": 0.7959684713787311, + "grad_norm": 0.5353970527648926, + "learning_rate": 4.1806551195584685e-05, + "loss": 1.0103, + "num_input_tokens_seen": 50462720, + "step": 6160 + }, + { + "epoch": 0.7972606279881121, + "grad_norm": 0.6963522434234619, + "learning_rate": 4.178149206946419e-05, + "loss": 0.8958, + "num_input_tokens_seen": 50544640, + "step": 6170 + }, + { + "epoch": 0.7985527845974932, + "grad_norm": 0.6284576654434204, + "learning_rate": 4.1756402216493115e-05, + "loss": 1.0327, + "num_input_tokens_seen": 50626560, + "step": 6180 + }, + { + "epoch": 0.7998449412068743, + "grad_norm": 0.49156904220581055, + "learning_rate": 4.17312816826109e-05, + "loss": 0.9924, + "num_input_tokens_seen": 50708480, + "step": 6190 + }, + { + "epoch": 0.8011370978162553, + "grad_norm": 0.8656590580940247, + "learning_rate": 4.1706130513813146e-05, + "loss": 0.7853, + "num_input_tokens_seen": 50790400, + "step": 6200 + }, + { + "epoch": 0.8024292544256364, + "grad_norm": 0.5040183067321777, + "learning_rate": 4.1680948756151564e-05, + "loss": 0.9263, + "num_input_tokens_seen": 50872320, + "step": 6210 + }, + { + "epoch": 0.8037214110350175, + "grad_norm": 0.28626561164855957, + "learning_rate": 4.165573645573384e-05, + "loss": 0.9224, + "num_input_tokens_seen": 50954240, + "step": 6220 + }, + { + "epoch": 0.8050135676443985, + "grad_norm": 0.5348039865493774, + "learning_rate": 4.1630493658723606e-05, + "loss": 1.2382, + "num_input_tokens_seen": 51036160, + "step": 6230 + }, + { + "epoch": 0.8063057242537796, + "grad_norm": 0.4930530786514282, + "learning_rate": 4.160522041134035e-05, + "loss": 1.0701, + "num_input_tokens_seen": 51118080, + "step": 6240 + }, + { + "epoch": 0.8075978808631606, + "grad_norm": 0.27241694927215576, + "learning_rate": 4.1579916759859286e-05, + "loss": 0.9246, + "num_input_tokens_seen": 51200000, + "step": 6250 + }, + { + "epoch": 0.8088900374725416, + "grad_norm": 0.8106761574745178, + "learning_rate": 4.155458275061129e-05, + "loss": 1.1799, + "num_input_tokens_seen": 51281920, + "step": 6260 + }, + { + "epoch": 0.8101821940819227, + "grad_norm": 0.6330926418304443, + "learning_rate": 4.152921842998287e-05, + "loss": 1.2025, + "num_input_tokens_seen": 51363840, + "step": 6270 + }, + { + "epoch": 0.8114743506913038, + "grad_norm": 0.5390750169754028, + "learning_rate": 4.150382384441598e-05, + "loss": 0.7594, + "num_input_tokens_seen": 51445760, + "step": 6280 + }, + { + "epoch": 0.8127665073006849, + "grad_norm": 0.645873486995697, + "learning_rate": 4.147839904040803e-05, + "loss": 0.8714, + "num_input_tokens_seen": 51527680, + "step": 6290 + }, + { + "epoch": 0.8140586639100659, + "grad_norm": 0.2540639638900757, + "learning_rate": 4.145294406451173e-05, + "loss": 1.2455, + "num_input_tokens_seen": 51609600, + "step": 6300 + }, + { + "epoch": 0.815350820519447, + "grad_norm": 0.75770103931427, + "learning_rate": 4.142745896333505e-05, + "loss": 1.1417, + "num_input_tokens_seen": 51691520, + "step": 6310 + }, + { + "epoch": 0.816642977128828, + "grad_norm": 0.505504310131073, + "learning_rate": 4.140194378354113e-05, + "loss": 1.1736, + "num_input_tokens_seen": 51773440, + "step": 6320 + }, + { + "epoch": 0.817935133738209, + "grad_norm": 0.47906693816185, + "learning_rate": 4.137639857184815e-05, + "loss": 0.8252, + "num_input_tokens_seen": 51855360, + "step": 6330 + }, + { + "epoch": 0.8192272903475901, + "grad_norm": 0.7896102666854858, + "learning_rate": 4.1350823375029326e-05, + "loss": 1.2724, + "num_input_tokens_seen": 51937280, + "step": 6340 + }, + { + "epoch": 0.8205194469569712, + "grad_norm": 0.6090170741081238, + "learning_rate": 4.132521823991272e-05, + "loss": 0.6208, + "num_input_tokens_seen": 52019200, + "step": 6350 + }, + { + "epoch": 0.8218116035663522, + "grad_norm": 0.4670509099960327, + "learning_rate": 4.129958321338127e-05, + "loss": 0.8883, + "num_input_tokens_seen": 52101120, + "step": 6360 + }, + { + "epoch": 0.8231037601757333, + "grad_norm": 0.7980589270591736, + "learning_rate": 4.127391834237258e-05, + "loss": 0.7734, + "num_input_tokens_seen": 52183040, + "step": 6370 + }, + { + "epoch": 0.8243959167851144, + "grad_norm": 0.47597405314445496, + "learning_rate": 4.124822367387897e-05, + "loss": 0.8842, + "num_input_tokens_seen": 52264960, + "step": 6380 + }, + { + "epoch": 0.8256880733944955, + "grad_norm": 0.28550368547439575, + "learning_rate": 4.122249925494726e-05, + "loss": 0.8806, + "num_input_tokens_seen": 52346880, + "step": 6390 + }, + { + "epoch": 0.8269802300038764, + "grad_norm": 0.5419972538948059, + "learning_rate": 4.119674513267878e-05, + "loss": 0.9348, + "num_input_tokens_seen": 52428800, + "step": 6400 + }, + { + "epoch": 0.8282723866132575, + "grad_norm": 0.5108693838119507, + "learning_rate": 4.117096135422923e-05, + "loss": 0.7648, + "num_input_tokens_seen": 52510720, + "step": 6410 + }, + { + "epoch": 0.8295645432226386, + "grad_norm": 0.22679108381271362, + "learning_rate": 4.114514796680862e-05, + "loss": 0.6926, + "num_input_tokens_seen": 52592640, + "step": 6420 + }, + { + "epoch": 0.8308566998320196, + "grad_norm": 0.5265064239501953, + "learning_rate": 4.111930501768116e-05, + "loss": 1.0296, + "num_input_tokens_seen": 52674560, + "step": 6430 + }, + { + "epoch": 0.8321488564414007, + "grad_norm": 0.41228848695755005, + "learning_rate": 4.1093432554165196e-05, + "loss": 0.9102, + "num_input_tokens_seen": 52756480, + "step": 6440 + }, + { + "epoch": 0.8334410130507818, + "grad_norm": 0.35673919320106506, + "learning_rate": 4.106753062363311e-05, + "loss": 0.8159, + "num_input_tokens_seen": 52838400, + "step": 6450 + }, + { + "epoch": 0.8347331696601629, + "grad_norm": 0.5186113715171814, + "learning_rate": 4.104159927351125e-05, + "loss": 1.1631, + "num_input_tokens_seen": 52920320, + "step": 6460 + }, + { + "epoch": 0.8360253262695438, + "grad_norm": 0.45875003933906555, + "learning_rate": 4.1015638551279825e-05, + "loss": 0.8534, + "num_input_tokens_seen": 53002240, + "step": 6470 + }, + { + "epoch": 0.8373174828789249, + "grad_norm": 0.5119809508323669, + "learning_rate": 4.098964850447281e-05, + "loss": 1.0583, + "num_input_tokens_seen": 53084160, + "step": 6480 + }, + { + "epoch": 0.838609639488306, + "grad_norm": 0.35101157426834106, + "learning_rate": 4.0963629180677896e-05, + "loss": 0.8959, + "num_input_tokens_seen": 53166080, + "step": 6490 + }, + { + "epoch": 0.839901796097687, + "grad_norm": 0.5710439085960388, + "learning_rate": 4.093758062753638e-05, + "loss": 1.1624, + "num_input_tokens_seen": 53248000, + "step": 6500 + }, + { + "epoch": 0.8411939527070681, + "grad_norm": 0.34572339057922363, + "learning_rate": 4.0911502892743035e-05, + "loss": 1.2004, + "num_input_tokens_seen": 53329920, + "step": 6510 + }, + { + "epoch": 0.8424861093164492, + "grad_norm": 0.5202426910400391, + "learning_rate": 4.088539602404613e-05, + "loss": 1.2094, + "num_input_tokens_seen": 53411840, + "step": 6520 + }, + { + "epoch": 0.8437782659258302, + "grad_norm": 0.46138355135917664, + "learning_rate": 4.085926006924723e-05, + "loss": 1.0853, + "num_input_tokens_seen": 53493760, + "step": 6530 + }, + { + "epoch": 0.8450704225352113, + "grad_norm": 0.5127395391464233, + "learning_rate": 4.083309507620118e-05, + "loss": 0.8684, + "num_input_tokens_seen": 53575680, + "step": 6540 + }, + { + "epoch": 0.8463625791445923, + "grad_norm": 0.5306764245033264, + "learning_rate": 4.080690109281597e-05, + "loss": 1.1792, + "num_input_tokens_seen": 53657600, + "step": 6550 + }, + { + "epoch": 0.8476547357539734, + "grad_norm": 0.48974373936653137, + "learning_rate": 4.078067816705272e-05, + "loss": 0.8527, + "num_input_tokens_seen": 53739520, + "step": 6560 + }, + { + "epoch": 0.8489468923633544, + "grad_norm": 0.4819824993610382, + "learning_rate": 4.075442634692548e-05, + "loss": 0.9558, + "num_input_tokens_seen": 53821440, + "step": 6570 + }, + { + "epoch": 0.8502390489727355, + "grad_norm": 0.5945402979850769, + "learning_rate": 4.072814568050125e-05, + "loss": 1.0556, + "num_input_tokens_seen": 53903360, + "step": 6580 + }, + { + "epoch": 0.8515312055821166, + "grad_norm": 0.26948097348213196, + "learning_rate": 4.070183621589983e-05, + "loss": 0.9564, + "num_input_tokens_seen": 53985280, + "step": 6590 + }, + { + "epoch": 0.8528233621914976, + "grad_norm": 0.5950406789779663, + "learning_rate": 4.067549800129375e-05, + "loss": 1.2202, + "num_input_tokens_seen": 54067200, + "step": 6600 + }, + { + "epoch": 0.8541155188008787, + "grad_norm": 1.1179322004318237, + "learning_rate": 4.06491310849082e-05, + "loss": 0.9542, + "num_input_tokens_seen": 54149120, + "step": 6610 + }, + { + "epoch": 0.8554076754102597, + "grad_norm": 0.5142911076545715, + "learning_rate": 4.0622735515020896e-05, + "loss": 1.0358, + "num_input_tokens_seen": 54231040, + "step": 6620 + }, + { + "epoch": 0.8566998320196407, + "grad_norm": 0.6860795021057129, + "learning_rate": 4.059631133996203e-05, + "loss": 1.2331, + "num_input_tokens_seen": 54312960, + "step": 6630 + }, + { + "epoch": 0.8579919886290218, + "grad_norm": 0.47902247309684753, + "learning_rate": 4.0569858608114177e-05, + "loss": 0.9423, + "num_input_tokens_seen": 54394880, + "step": 6640 + }, + { + "epoch": 0.8592841452384029, + "grad_norm": 0.46857351064682007, + "learning_rate": 4.054337736791218e-05, + "loss": 1.1609, + "num_input_tokens_seen": 54476800, + "step": 6650 + }, + { + "epoch": 0.860576301847784, + "grad_norm": 0.5907356142997742, + "learning_rate": 4.05168676678431e-05, + "loss": 0.9658, + "num_input_tokens_seen": 54558720, + "step": 6660 + }, + { + "epoch": 0.861868458457165, + "grad_norm": 0.26403316855430603, + "learning_rate": 4.04903295564461e-05, + "loss": 0.7992, + "num_input_tokens_seen": 54640640, + "step": 6670 + }, + { + "epoch": 0.8631606150665461, + "grad_norm": 0.6622225642204285, + "learning_rate": 4.046376308231237e-05, + "loss": 0.9918, + "num_input_tokens_seen": 54722560, + "step": 6680 + }, + { + "epoch": 0.8644527716759272, + "grad_norm": 0.16060075163841248, + "learning_rate": 4.0437168294085013e-05, + "loss": 0.9523, + "num_input_tokens_seen": 54804480, + "step": 6690 + }, + { + "epoch": 0.8657449282853081, + "grad_norm": 0.5184667706489563, + "learning_rate": 4.0410545240459005e-05, + "loss": 1.1628, + "num_input_tokens_seen": 54886400, + "step": 6700 + }, + { + "epoch": 0.8670370848946892, + "grad_norm": 0.4965847134590149, + "learning_rate": 4.0383893970181054e-05, + "loss": 1.0468, + "num_input_tokens_seen": 54968320, + "step": 6710 + }, + { + "epoch": 0.8683292415040703, + "grad_norm": 6.185298442840576, + "learning_rate": 4.0357214532049535e-05, + "loss": 1.2028, + "num_input_tokens_seen": 55050240, + "step": 6720 + }, + { + "epoch": 0.8696213981134514, + "grad_norm": 0.43425482511520386, + "learning_rate": 4.03305069749144e-05, + "loss": 0.503, + "num_input_tokens_seen": 55132160, + "step": 6730 + }, + { + "epoch": 0.8709135547228324, + "grad_norm": 0.2627141773700714, + "learning_rate": 4.03037713476771e-05, + "loss": 1.0739, + "num_input_tokens_seen": 55214080, + "step": 6740 + }, + { + "epoch": 0.8722057113322135, + "grad_norm": 0.5449861288070679, + "learning_rate": 4.027700769929046e-05, + "loss": 0.7428, + "num_input_tokens_seen": 55296000, + "step": 6750 + }, + { + "epoch": 0.8734978679415946, + "grad_norm": 0.5020452737808228, + "learning_rate": 4.025021607875862e-05, + "loss": 1.0242, + "num_input_tokens_seen": 55377920, + "step": 6760 + }, + { + "epoch": 0.8747900245509755, + "grad_norm": 0.5593128800392151, + "learning_rate": 4.0223396535136945e-05, + "loss": 1.2703, + "num_input_tokens_seen": 55459840, + "step": 6770 + }, + { + "epoch": 0.8760821811603566, + "grad_norm": 0.4105585813522339, + "learning_rate": 4.019654911753193e-05, + "loss": 0.8773, + "num_input_tokens_seen": 55541760, + "step": 6780 + }, + { + "epoch": 0.8773743377697377, + "grad_norm": 0.37367942929267883, + "learning_rate": 4.016967387510108e-05, + "loss": 1.094, + "num_input_tokens_seen": 55623680, + "step": 6790 + }, + { + "epoch": 0.8786664943791187, + "grad_norm": 0.5563257932662964, + "learning_rate": 4.014277085705288e-05, + "loss": 0.9265, + "num_input_tokens_seen": 55705600, + "step": 6800 + }, + { + "epoch": 0.8799586509884998, + "grad_norm": 0.7399642467498779, + "learning_rate": 4.011584011264665e-05, + "loss": 0.9153, + "num_input_tokens_seen": 55787520, + "step": 6810 + }, + { + "epoch": 0.8812508075978809, + "grad_norm": 0.6172972321510315, + "learning_rate": 4.0088881691192474e-05, + "loss": 0.7931, + "num_input_tokens_seen": 55869440, + "step": 6820 + }, + { + "epoch": 0.882542964207262, + "grad_norm": 0.4733666181564331, + "learning_rate": 4.006189564205115e-05, + "loss": 0.9963, + "num_input_tokens_seen": 55951360, + "step": 6830 + }, + { + "epoch": 0.883835120816643, + "grad_norm": 0.6193355321884155, + "learning_rate": 4.0034882014634015e-05, + "loss": 0.7636, + "num_input_tokens_seen": 56033280, + "step": 6840 + }, + { + "epoch": 0.885127277426024, + "grad_norm": 0.4271906018257141, + "learning_rate": 4.000784085840293e-05, + "loss": 0.7192, + "num_input_tokens_seen": 56115200, + "step": 6850 + }, + { + "epoch": 0.8864194340354051, + "grad_norm": 0.5514196157455444, + "learning_rate": 3.9980772222870156e-05, + "loss": 0.9082, + "num_input_tokens_seen": 56197120, + "step": 6860 + }, + { + "epoch": 0.8877115906447861, + "grad_norm": 0.5211493968963623, + "learning_rate": 3.995367615759825e-05, + "loss": 1.2218, + "num_input_tokens_seen": 56279040, + "step": 6870 + }, + { + "epoch": 0.8890037472541672, + "grad_norm": 0.5780483484268188, + "learning_rate": 3.992655271220003e-05, + "loss": 1.0894, + "num_input_tokens_seen": 56360960, + "step": 6880 + }, + { + "epoch": 0.8902959038635483, + "grad_norm": 0.537337601184845, + "learning_rate": 3.98994019363384e-05, + "loss": 1.0208, + "num_input_tokens_seen": 56442880, + "step": 6890 + }, + { + "epoch": 0.8915880604729293, + "grad_norm": 0.4431285858154297, + "learning_rate": 3.9872223879726356e-05, + "loss": 0.7955, + "num_input_tokens_seen": 56524800, + "step": 6900 + }, + { + "epoch": 0.8928802170823104, + "grad_norm": 0.44692462682724, + "learning_rate": 3.98450185921268e-05, + "loss": 1.262, + "num_input_tokens_seen": 56606720, + "step": 6910 + }, + { + "epoch": 0.8941723736916914, + "grad_norm": 0.5244356989860535, + "learning_rate": 3.981778612335253e-05, + "loss": 1.1836, + "num_input_tokens_seen": 56688640, + "step": 6920 + }, + { + "epoch": 0.8954645303010725, + "grad_norm": 0.5537554025650024, + "learning_rate": 3.979052652326609e-05, + "loss": 0.7662, + "num_input_tokens_seen": 56770560, + "step": 6930 + }, + { + "epoch": 0.8967566869104535, + "grad_norm": 0.5707337856292725, + "learning_rate": 3.976323984177971e-05, + "loss": 0.7414, + "num_input_tokens_seen": 56852480, + "step": 6940 + }, + { + "epoch": 0.8980488435198346, + "grad_norm": 0.4836041331291199, + "learning_rate": 3.97359261288552e-05, + "loss": 1.0458, + "num_input_tokens_seen": 56934400, + "step": 6950 + }, + { + "epoch": 0.8993410001292157, + "grad_norm": 0.47009721398353577, + "learning_rate": 3.970858543450387e-05, + "loss": 1.0642, + "num_input_tokens_seen": 57016320, + "step": 6960 + }, + { + "epoch": 0.9006331567385967, + "grad_norm": 0.6852318644523621, + "learning_rate": 3.968121780878643e-05, + "loss": 1.3093, + "num_input_tokens_seen": 57098240, + "step": 6970 + }, + { + "epoch": 0.9019253133479778, + "grad_norm": 0.24057495594024658, + "learning_rate": 3.965382330181291e-05, + "loss": 0.6589, + "num_input_tokens_seen": 57180160, + "step": 6980 + }, + { + "epoch": 0.9032174699573589, + "grad_norm": 0.33139023184776306, + "learning_rate": 3.962640196374254e-05, + "loss": 0.6929, + "num_input_tokens_seen": 57262080, + "step": 6990 + }, + { + "epoch": 0.9045096265667399, + "grad_norm": 0.5860159993171692, + "learning_rate": 3.9598953844783705e-05, + "loss": 1.1313, + "num_input_tokens_seen": 57344000, + "step": 7000 + }, + { + "epoch": 0.9058017831761209, + "grad_norm": 0.36473211646080017, + "learning_rate": 3.957147899519379e-05, + "loss": 0.8748, + "num_input_tokens_seen": 57425920, + "step": 7010 + }, + { + "epoch": 0.907093939785502, + "grad_norm": 0.547675371170044, + "learning_rate": 3.954397746527916e-05, + "loss": 0.3491, + "num_input_tokens_seen": 57507840, + "step": 7020 + }, + { + "epoch": 0.9083860963948831, + "grad_norm": 0.3887852430343628, + "learning_rate": 3.951644930539502e-05, + "loss": 0.7414, + "num_input_tokens_seen": 57589760, + "step": 7030 + }, + { + "epoch": 0.9096782530042641, + "grad_norm": 0.5362502932548523, + "learning_rate": 3.9488894565945305e-05, + "loss": 0.8839, + "num_input_tokens_seen": 57671680, + "step": 7040 + }, + { + "epoch": 0.9109704096136452, + "grad_norm": 0.578965425491333, + "learning_rate": 3.9461313297382666e-05, + "loss": 0.5389, + "num_input_tokens_seen": 57753600, + "step": 7050 + }, + { + "epoch": 0.9122625662230263, + "grad_norm": 0.6477794051170349, + "learning_rate": 3.94337055502083e-05, + "loss": 1.0182, + "num_input_tokens_seen": 57835520, + "step": 7060 + }, + { + "epoch": 0.9135547228324072, + "grad_norm": 0.505005955696106, + "learning_rate": 3.9406071374971887e-05, + "loss": 1.0376, + "num_input_tokens_seen": 57917440, + "step": 7070 + }, + { + "epoch": 0.9148468794417883, + "grad_norm": 1.0744434595108032, + "learning_rate": 3.93784108222715e-05, + "loss": 0.6095, + "num_input_tokens_seen": 57999360, + "step": 7080 + }, + { + "epoch": 0.9161390360511694, + "grad_norm": 0.518518328666687, + "learning_rate": 3.935072394275352e-05, + "loss": 1.0714, + "num_input_tokens_seen": 58081280, + "step": 7090 + }, + { + "epoch": 0.9174311926605505, + "grad_norm": 0.5842453241348267, + "learning_rate": 3.9323010787112505e-05, + "loss": 1.116, + "num_input_tokens_seen": 58163200, + "step": 7100 + }, + { + "epoch": 0.9187233492699315, + "grad_norm": 0.43083658814430237, + "learning_rate": 3.929527140609115e-05, + "loss": 1.1167, + "num_input_tokens_seen": 58245120, + "step": 7110 + }, + { + "epoch": 0.9200155058793126, + "grad_norm": 2.2373194694519043, + "learning_rate": 3.926750585048016e-05, + "loss": 0.7755, + "num_input_tokens_seen": 58327040, + "step": 7120 + }, + { + "epoch": 0.9213076624886937, + "grad_norm": 0.5515663623809814, + "learning_rate": 3.9239714171118167e-05, + "loss": 0.7525, + "num_input_tokens_seen": 58408960, + "step": 7130 + }, + { + "epoch": 0.9225998190980746, + "grad_norm": 0.5348473191261292, + "learning_rate": 3.921189641889163e-05, + "loss": 0.9602, + "num_input_tokens_seen": 58490880, + "step": 7140 + }, + { + "epoch": 0.9238919757074557, + "grad_norm": 0.4317019581794739, + "learning_rate": 3.918405264473476e-05, + "loss": 0.7652, + "num_input_tokens_seen": 58572800, + "step": 7150 + }, + { + "epoch": 0.9251841323168368, + "grad_norm": 0.48419129848480225, + "learning_rate": 3.9156182899629404e-05, + "loss": 0.5639, + "num_input_tokens_seen": 58654720, + "step": 7160 + }, + { + "epoch": 0.9264762889262178, + "grad_norm": 0.33040711283683777, + "learning_rate": 3.912828723460495e-05, + "loss": 0.8571, + "num_input_tokens_seen": 58736640, + "step": 7170 + }, + { + "epoch": 0.9277684455355989, + "grad_norm": 0.46214792132377625, + "learning_rate": 3.9100365700738275e-05, + "loss": 0.747, + "num_input_tokens_seen": 58818560, + "step": 7180 + }, + { + "epoch": 0.92906060214498, + "grad_norm": 0.4923725426197052, + "learning_rate": 3.907241834915359e-05, + "loss": 0.8309, + "num_input_tokens_seen": 58900480, + "step": 7190 + }, + { + "epoch": 0.9303527587543611, + "grad_norm": 0.5313102006912231, + "learning_rate": 3.904444523102242e-05, + "loss": 0.9932, + "num_input_tokens_seen": 58982400, + "step": 7200 + }, + { + "epoch": 0.931644915363742, + "grad_norm": 0.5704349875450134, + "learning_rate": 3.901644639756342e-05, + "loss": 0.9913, + "num_input_tokens_seen": 59064320, + "step": 7210 + }, + { + "epoch": 0.9329370719731231, + "grad_norm": 0.48948806524276733, + "learning_rate": 3.898842190004235e-05, + "loss": 0.9762, + "num_input_tokens_seen": 59146240, + "step": 7220 + }, + { + "epoch": 0.9342292285825042, + "grad_norm": 0.6252711415290833, + "learning_rate": 3.896037178977196e-05, + "loss": 0.6812, + "num_input_tokens_seen": 59228160, + "step": 7230 + }, + { + "epoch": 0.9355213851918852, + "grad_norm": 0.7012498378753662, + "learning_rate": 3.893229611811192e-05, + "loss": 1.3312, + "num_input_tokens_seen": 59310080, + "step": 7240 + }, + { + "epoch": 0.9368135418012663, + "grad_norm": 0.5096800923347473, + "learning_rate": 3.8904194936468665e-05, + "loss": 1.0935, + "num_input_tokens_seen": 59392000, + "step": 7250 + }, + { + "epoch": 0.9381056984106474, + "grad_norm": 0.47949451208114624, + "learning_rate": 3.887606829629536e-05, + "loss": 0.7918, + "num_input_tokens_seen": 59473920, + "step": 7260 + }, + { + "epoch": 0.9393978550200285, + "grad_norm": 0.256607323884964, + "learning_rate": 3.884791624909178e-05, + "loss": 0.8013, + "num_input_tokens_seen": 59555840, + "step": 7270 + }, + { + "epoch": 0.9406900116294095, + "grad_norm": 0.45952385663986206, + "learning_rate": 3.881973884640422e-05, + "loss": 1.0558, + "num_input_tokens_seen": 59637760, + "step": 7280 + }, + { + "epoch": 0.9419821682387906, + "grad_norm": 0.8293887376785278, + "learning_rate": 3.87915361398254e-05, + "loss": 1.1583, + "num_input_tokens_seen": 59719680, + "step": 7290 + }, + { + "epoch": 0.9432743248481716, + "grad_norm": 0.6213748455047607, + "learning_rate": 3.8763308180994384e-05, + "loss": 0.4953, + "num_input_tokens_seen": 59801600, + "step": 7300 + }, + { + "epoch": 0.9445664814575526, + "grad_norm": 0.7034590244293213, + "learning_rate": 3.873505502159645e-05, + "loss": 1.1762, + "num_input_tokens_seen": 59883520, + "step": 7310 + }, + { + "epoch": 0.9458586380669337, + "grad_norm": 0.4893646240234375, + "learning_rate": 3.8706776713363025e-05, + "loss": 0.7593, + "num_input_tokens_seen": 59965440, + "step": 7320 + }, + { + "epoch": 0.9471507946763148, + "grad_norm": 0.7708596587181091, + "learning_rate": 3.86784733080716e-05, + "loss": 0.6101, + "num_input_tokens_seen": 60047360, + "step": 7330 + }, + { + "epoch": 0.9484429512856958, + "grad_norm": 0.541378915309906, + "learning_rate": 3.86501448575456e-05, + "loss": 1.1621, + "num_input_tokens_seen": 60129280, + "step": 7340 + }, + { + "epoch": 0.9497351078950769, + "grad_norm": 0.46613985300064087, + "learning_rate": 3.862179141365431e-05, + "loss": 0.6934, + "num_input_tokens_seen": 60211200, + "step": 7350 + }, + { + "epoch": 0.951027264504458, + "grad_norm": 0.37554696202278137, + "learning_rate": 3.859341302831279e-05, + "loss": 1.064, + "num_input_tokens_seen": 60293120, + "step": 7360 + }, + { + "epoch": 0.952319421113839, + "grad_norm": 0.7480294108390808, + "learning_rate": 3.856500975348176e-05, + "loss": 0.9418, + "num_input_tokens_seen": 60375040, + "step": 7370 + }, + { + "epoch": 0.95361157772322, + "grad_norm": 0.5717000961303711, + "learning_rate": 3.8536581641167506e-05, + "loss": 0.8951, + "num_input_tokens_seen": 60456960, + "step": 7380 + }, + { + "epoch": 0.9549037343326011, + "grad_norm": 0.45350784063339233, + "learning_rate": 3.85081287434218e-05, + "loss": 0.9263, + "num_input_tokens_seen": 60538880, + "step": 7390 + }, + { + "epoch": 0.9561958909419822, + "grad_norm": 0.839464545249939, + "learning_rate": 3.84796511123418e-05, + "loss": 1.1756, + "num_input_tokens_seen": 60620800, + "step": 7400 + }, + { + "epoch": 0.9574880475513632, + "grad_norm": 0.5966989994049072, + "learning_rate": 3.845114880006994e-05, + "loss": 0.9992, + "num_input_tokens_seen": 60702720, + "step": 7410 + }, + { + "epoch": 0.9587802041607443, + "grad_norm": 0.5134586095809937, + "learning_rate": 3.842262185879384e-05, + "loss": 1.1923, + "num_input_tokens_seen": 60784640, + "step": 7420 + }, + { + "epoch": 0.9600723607701254, + "grad_norm": 0.2532576024532318, + "learning_rate": 3.8394070340746234e-05, + "loss": 0.828, + "num_input_tokens_seen": 60866560, + "step": 7430 + }, + { + "epoch": 0.9613645173795063, + "grad_norm": 0.4352121949195862, + "learning_rate": 3.836549429820485e-05, + "loss": 0.9765, + "num_input_tokens_seen": 60948480, + "step": 7440 + }, + { + "epoch": 0.9626566739888874, + "grad_norm": 0.5217087864875793, + "learning_rate": 3.833689378349231e-05, + "loss": 0.9062, + "num_input_tokens_seen": 61030400, + "step": 7450 + }, + { + "epoch": 0.9639488305982685, + "grad_norm": 0.5556167960166931, + "learning_rate": 3.830826884897606e-05, + "loss": 1.0222, + "num_input_tokens_seen": 61112320, + "step": 7460 + }, + { + "epoch": 0.9652409872076496, + "grad_norm": 0.4434933662414551, + "learning_rate": 3.827961954706825e-05, + "loss": 1.118, + "num_input_tokens_seen": 61194240, + "step": 7470 + }, + { + "epoch": 0.9665331438170306, + "grad_norm": 0.8783986568450928, + "learning_rate": 3.825094593022563e-05, + "loss": 1.078, + "num_input_tokens_seen": 61276160, + "step": 7480 + }, + { + "epoch": 0.9678253004264117, + "grad_norm": 0.26476526260375977, + "learning_rate": 3.8222248050949505e-05, + "loss": 0.665, + "num_input_tokens_seen": 61358080, + "step": 7490 + }, + { + "epoch": 0.9691174570357928, + "grad_norm": 0.5986683368682861, + "learning_rate": 3.8193525961785584e-05, + "loss": 0.9969, + "num_input_tokens_seen": 61440000, + "step": 7500 + }, + { + "epoch": 0.9704096136451738, + "grad_norm": 0.2270795851945877, + "learning_rate": 3.8164779715323905e-05, + "loss": 0.9114, + "num_input_tokens_seen": 61521920, + "step": 7510 + }, + { + "epoch": 0.9717017702545548, + "grad_norm": 0.780129075050354, + "learning_rate": 3.813600936419874e-05, + "loss": 0.7856, + "num_input_tokens_seen": 61603840, + "step": 7520 + }, + { + "epoch": 0.9729939268639359, + "grad_norm": 0.5222942233085632, + "learning_rate": 3.81072149610885e-05, + "loss": 1.0216, + "num_input_tokens_seen": 61685760, + "step": 7530 + }, + { + "epoch": 0.9742860834733169, + "grad_norm": 0.5253265500068665, + "learning_rate": 3.807839655871563e-05, + "loss": 0.7388, + "num_input_tokens_seen": 61767680, + "step": 7540 + }, + { + "epoch": 0.975578240082698, + "grad_norm": 0.46493276953697205, + "learning_rate": 3.8049554209846514e-05, + "loss": 0.6208, + "num_input_tokens_seen": 61849600, + "step": 7550 + }, + { + "epoch": 0.9768703966920791, + "grad_norm": 0.7733160853385925, + "learning_rate": 3.802068796729139e-05, + "loss": 1.1114, + "num_input_tokens_seen": 61931520, + "step": 7560 + }, + { + "epoch": 0.9781625533014602, + "grad_norm": 0.21577703952789307, + "learning_rate": 3.7991797883904254e-05, + "loss": 0.9243, + "num_input_tokens_seen": 62013440, + "step": 7570 + }, + { + "epoch": 0.9794547099108412, + "grad_norm": 0.6388958692550659, + "learning_rate": 3.796288401258272e-05, + "loss": 0.8196, + "num_input_tokens_seen": 62095360, + "step": 7580 + }, + { + "epoch": 0.9807468665202222, + "grad_norm": 0.2966822683811188, + "learning_rate": 3.7933946406268e-05, + "loss": 0.8933, + "num_input_tokens_seen": 62177280, + "step": 7590 + }, + { + "epoch": 0.9820390231296033, + "grad_norm": 0.5425664186477661, + "learning_rate": 3.790498511794473e-05, + "loss": 0.9035, + "num_input_tokens_seen": 62259200, + "step": 7600 + }, + { + "epoch": 0.9833311797389843, + "grad_norm": 0.47928035259246826, + "learning_rate": 3.787600020064095e-05, + "loss": 0.9369, + "num_input_tokens_seen": 62341120, + "step": 7610 + }, + { + "epoch": 0.9846233363483654, + "grad_norm": 0.4411972463130951, + "learning_rate": 3.7846991707427905e-05, + "loss": 0.7782, + "num_input_tokens_seen": 62423040, + "step": 7620 + }, + { + "epoch": 0.9859154929577465, + "grad_norm": 0.5929402112960815, + "learning_rate": 3.7817959691420056e-05, + "loss": 0.8775, + "num_input_tokens_seen": 62504960, + "step": 7630 + }, + { + "epoch": 0.9872076495671276, + "grad_norm": 0.5183133482933044, + "learning_rate": 3.778890420577492e-05, + "loss": 0.7959, + "num_input_tokens_seen": 62586880, + "step": 7640 + }, + { + "epoch": 0.9884998061765086, + "grad_norm": 0.3291929364204407, + "learning_rate": 3.775982530369298e-05, + "loss": 1.0962, + "num_input_tokens_seen": 62668800, + "step": 7650 + }, + { + "epoch": 0.9897919627858897, + "grad_norm": 0.5607266426086426, + "learning_rate": 3.77307230384176e-05, + "loss": 1.1062, + "num_input_tokens_seen": 62750720, + "step": 7660 + }, + { + "epoch": 0.9910841193952707, + "grad_norm": 0.6233817338943481, + "learning_rate": 3.7701597463234916e-05, + "loss": 0.6531, + "num_input_tokens_seen": 62832640, + "step": 7670 + }, + { + "epoch": 0.9923762760046517, + "grad_norm": 0.5039672255516052, + "learning_rate": 3.767244863147377e-05, + "loss": 0.8184, + "num_input_tokens_seen": 62914560, + "step": 7680 + }, + { + "epoch": 0.9936684326140328, + "grad_norm": 0.5114099979400635, + "learning_rate": 3.764327659650553e-05, + "loss": 1.1191, + "num_input_tokens_seen": 62996480, + "step": 7690 + }, + { + "epoch": 0.9949605892234139, + "grad_norm": 0.6352154612541199, + "learning_rate": 3.7614081411744116e-05, + "loss": 1.1411, + "num_input_tokens_seen": 63078400, + "step": 7700 + }, + { + "epoch": 0.9962527458327949, + "grad_norm": 0.5263626575469971, + "learning_rate": 3.75848631306458e-05, + "loss": 1.0581, + "num_input_tokens_seen": 63160320, + "step": 7710 + }, + { + "epoch": 0.997544902442176, + "grad_norm": 0.5167150497436523, + "learning_rate": 3.755562180670914e-05, + "loss": 0.8535, + "num_input_tokens_seen": 63242240, + "step": 7720 + }, + { + "epoch": 0.9988370590515571, + "grad_norm": 0.5089849829673767, + "learning_rate": 3.75263574934749e-05, + "loss": 0.7162, + "num_input_tokens_seen": 63324160, + "step": 7730 + }, + { + "epoch": 1.000129215660938, + "grad_norm": 0.412061870098114, + "learning_rate": 3.7497070244525925e-05, + "loss": 0.6882, + "num_input_tokens_seen": 63406080, + "step": 7740 + }, + { + "epoch": 1.0014213722703191, + "grad_norm": 0.6777093410491943, + "learning_rate": 3.746776011348706e-05, + "loss": 1.1799, + "num_input_tokens_seen": 63488000, + "step": 7750 + }, + { + "epoch": 1.0027135288797002, + "grad_norm": 0.5311883091926575, + "learning_rate": 3.7438427154025045e-05, + "loss": 0.5552, + "num_input_tokens_seen": 63569920, + "step": 7760 + }, + { + "epoch": 1.0040056854890813, + "grad_norm": 0.5286181569099426, + "learning_rate": 3.7409071419848436e-05, + "loss": 1.1324, + "num_input_tokens_seen": 63651840, + "step": 7770 + }, + { + "epoch": 1.0052978420984624, + "grad_norm": 0.5085585117340088, + "learning_rate": 3.7379692964707456e-05, + "loss": 0.9001, + "num_input_tokens_seen": 63733760, + "step": 7780 + }, + { + "epoch": 1.0065899987078435, + "grad_norm": 0.3858306109905243, + "learning_rate": 3.735029184239396e-05, + "loss": 1.1056, + "num_input_tokens_seen": 63815680, + "step": 7790 + }, + { + "epoch": 1.0078821553172244, + "grad_norm": 0.5353860855102539, + "learning_rate": 3.73208681067413e-05, + "loss": 0.9637, + "num_input_tokens_seen": 63897600, + "step": 7800 + }, + { + "epoch": 1.0091743119266054, + "grad_norm": 0.3585314452648163, + "learning_rate": 3.7291421811624216e-05, + "loss": 0.6649, + "num_input_tokens_seen": 63979520, + "step": 7810 + }, + { + "epoch": 1.0104664685359865, + "grad_norm": 0.515984296798706, + "learning_rate": 3.726195301095877e-05, + "loss": 0.5021, + "num_input_tokens_seen": 64061440, + "step": 7820 + }, + { + "epoch": 1.0117586251453676, + "grad_norm": 0.5484461784362793, + "learning_rate": 3.7232461758702244e-05, + "loss": 0.9135, + "num_input_tokens_seen": 64143360, + "step": 7830 + }, + { + "epoch": 1.0130507817547487, + "grad_norm": 0.6354055404663086, + "learning_rate": 3.7202948108852984e-05, + "loss": 0.6548, + "num_input_tokens_seen": 64225280, + "step": 7840 + }, + { + "epoch": 1.0143429383641298, + "grad_norm": 0.6655207276344299, + "learning_rate": 3.717341211545039e-05, + "loss": 0.6679, + "num_input_tokens_seen": 64307200, + "step": 7850 + }, + { + "epoch": 1.015635094973511, + "grad_norm": 0.5719141364097595, + "learning_rate": 3.714385383257477e-05, + "loss": 1.115, + "num_input_tokens_seen": 64389120, + "step": 7860 + }, + { + "epoch": 1.0169272515828918, + "grad_norm": 0.6638296842575073, + "learning_rate": 3.711427331434721e-05, + "loss": 0.7203, + "num_input_tokens_seen": 64471040, + "step": 7870 + }, + { + "epoch": 1.0182194081922729, + "grad_norm": 0.5564717650413513, + "learning_rate": 3.7084670614929554e-05, + "loss": 0.7494, + "num_input_tokens_seen": 64552960, + "step": 7880 + }, + { + "epoch": 1.019511564801654, + "grad_norm": 0.42404186725616455, + "learning_rate": 3.7055045788524214e-05, + "loss": 0.7702, + "num_input_tokens_seen": 64634880, + "step": 7890 + }, + { + "epoch": 1.020803721411035, + "grad_norm": 0.5771876573562622, + "learning_rate": 3.702539888937414e-05, + "loss": 1.0498, + "num_input_tokens_seen": 64716800, + "step": 7900 + }, + { + "epoch": 1.0220958780204161, + "grad_norm": 0.6172158718109131, + "learning_rate": 3.699572997176272e-05, + "loss": 0.8016, + "num_input_tokens_seen": 64798720, + "step": 7910 + }, + { + "epoch": 1.0233880346297972, + "grad_norm": 0.6441952586174011, + "learning_rate": 3.696603909001361e-05, + "loss": 0.741, + "num_input_tokens_seen": 64880640, + "step": 7920 + }, + { + "epoch": 1.024680191239178, + "grad_norm": 0.5712997913360596, + "learning_rate": 3.69363262984907e-05, + "loss": 0.7687, + "num_input_tokens_seen": 64962560, + "step": 7930 + }, + { + "epoch": 1.0259723478485592, + "grad_norm": 0.5301061272621155, + "learning_rate": 3.690659165159803e-05, + "loss": 1.0326, + "num_input_tokens_seen": 65044480, + "step": 7940 + }, + { + "epoch": 1.0272645044579403, + "grad_norm": 0.4970210790634155, + "learning_rate": 3.6876835203779615e-05, + "loss": 0.961, + "num_input_tokens_seen": 65126400, + "step": 7950 + }, + { + "epoch": 1.0285566610673214, + "grad_norm": 0.5584379434585571, + "learning_rate": 3.68470570095194e-05, + "loss": 0.8982, + "num_input_tokens_seen": 65208320, + "step": 7960 + }, + { + "epoch": 1.0298488176767024, + "grad_norm": 0.6428322792053223, + "learning_rate": 3.681725712334115e-05, + "loss": 0.8534, + "num_input_tokens_seen": 65290240, + "step": 7970 + }, + { + "epoch": 1.0311409742860835, + "grad_norm": 0.2095443606376648, + "learning_rate": 3.678743559980835e-05, + "loss": 0.6313, + "num_input_tokens_seen": 65372160, + "step": 7980 + }, + { + "epoch": 1.0324331308954646, + "grad_norm": 0.6237980127334595, + "learning_rate": 3.67575924935241e-05, + "loss": 0.9277, + "num_input_tokens_seen": 65454080, + "step": 7990 + }, + { + "epoch": 1.0337252875048455, + "grad_norm": 0.5029511451721191, + "learning_rate": 3.672772785913102e-05, + "loss": 0.8789, + "num_input_tokens_seen": 65536000, + "step": 8000 + }, + { + "epoch": 1.0350174441142266, + "grad_norm": 0.2065262496471405, + "learning_rate": 3.669784175131115e-05, + "loss": 0.4837, + "num_input_tokens_seen": 65617920, + "step": 8010 + }, + { + "epoch": 1.0363096007236077, + "grad_norm": 0.44190657138824463, + "learning_rate": 3.666793422478583e-05, + "loss": 0.7244, + "num_input_tokens_seen": 65699840, + "step": 8020 + }, + { + "epoch": 1.0376017573329888, + "grad_norm": 0.49034252762794495, + "learning_rate": 3.663800533431564e-05, + "loss": 0.63, + "num_input_tokens_seen": 65781760, + "step": 8030 + }, + { + "epoch": 1.0388939139423699, + "grad_norm": 0.6004568934440613, + "learning_rate": 3.660805513470027e-05, + "loss": 1.0153, + "num_input_tokens_seen": 65863680, + "step": 8040 + }, + { + "epoch": 1.040186070551751, + "grad_norm": 0.49091774225234985, + "learning_rate": 3.657808368077843e-05, + "loss": 0.7835, + "num_input_tokens_seen": 65945600, + "step": 8050 + }, + { + "epoch": 1.041478227161132, + "grad_norm": 0.46571245789527893, + "learning_rate": 3.654809102742773e-05, + "loss": 0.7988, + "num_input_tokens_seen": 66027520, + "step": 8060 + }, + { + "epoch": 1.042770383770513, + "grad_norm": 0.7933652400970459, + "learning_rate": 3.651807722956462e-05, + "loss": 1.3038, + "num_input_tokens_seen": 66109440, + "step": 8070 + }, + { + "epoch": 1.044062540379894, + "grad_norm": 0.584967851638794, + "learning_rate": 3.648804234214425e-05, + "loss": 0.7774, + "num_input_tokens_seen": 66191360, + "step": 8080 + }, + { + "epoch": 1.045354696989275, + "grad_norm": 0.49486130475997925, + "learning_rate": 3.645798642016039e-05, + "loss": 0.7951, + "num_input_tokens_seen": 66273280, + "step": 8090 + }, + { + "epoch": 1.0466468535986562, + "grad_norm": 0.5906932353973389, + "learning_rate": 3.642790951864532e-05, + "loss": 0.7105, + "num_input_tokens_seen": 66355200, + "step": 8100 + }, + { + "epoch": 1.0479390102080373, + "grad_norm": 0.6410627961158752, + "learning_rate": 3.639781169266975e-05, + "loss": 0.644, + "num_input_tokens_seen": 66437120, + "step": 8110 + }, + { + "epoch": 1.0492311668174183, + "grad_norm": 0.8290396332740784, + "learning_rate": 3.636769299734267e-05, + "loss": 0.7002, + "num_input_tokens_seen": 66519040, + "step": 8120 + }, + { + "epoch": 1.0505233234267992, + "grad_norm": 0.5405248403549194, + "learning_rate": 3.63375534878113e-05, + "loss": 0.9435, + "num_input_tokens_seen": 66600960, + "step": 8130 + }, + { + "epoch": 1.0518154800361803, + "grad_norm": 0.4574602544307709, + "learning_rate": 3.6307393219261e-05, + "loss": 0.8839, + "num_input_tokens_seen": 66682880, + "step": 8140 + }, + { + "epoch": 1.0531076366455614, + "grad_norm": 0.5063459873199463, + "learning_rate": 3.627721224691507e-05, + "loss": 0.8676, + "num_input_tokens_seen": 66764800, + "step": 8150 + }, + { + "epoch": 1.0543997932549425, + "grad_norm": 0.5998566150665283, + "learning_rate": 3.6247010626034795e-05, + "loss": 0.6555, + "num_input_tokens_seen": 66846720, + "step": 8160 + }, + { + "epoch": 1.0556919498643236, + "grad_norm": 0.6061252951622009, + "learning_rate": 3.621678841191922e-05, + "loss": 0.8207, + "num_input_tokens_seen": 66928640, + "step": 8170 + }, + { + "epoch": 1.0569841064737047, + "grad_norm": 0.6531092524528503, + "learning_rate": 3.618654565990511e-05, + "loss": 1.0707, + "num_input_tokens_seen": 67010560, + "step": 8180 + }, + { + "epoch": 1.0582762630830858, + "grad_norm": 0.7719940543174744, + "learning_rate": 3.615628242536682e-05, + "loss": 0.7523, + "num_input_tokens_seen": 67092480, + "step": 8190 + }, + { + "epoch": 1.0595684196924666, + "grad_norm": 0.28535163402557373, + "learning_rate": 3.612599876371625e-05, + "loss": 0.7847, + "num_input_tokens_seen": 67174400, + "step": 8200 + }, + { + "epoch": 1.0608605763018477, + "grad_norm": 0.5150948166847229, + "learning_rate": 3.609569473040265e-05, + "loss": 1.0002, + "num_input_tokens_seen": 67256320, + "step": 8210 + }, + { + "epoch": 1.0621527329112288, + "grad_norm": 0.47803130745887756, + "learning_rate": 3.6065370380912587e-05, + "loss": 0.9216, + "num_input_tokens_seen": 67338240, + "step": 8220 + }, + { + "epoch": 1.06344488952061, + "grad_norm": 0.5799218416213989, + "learning_rate": 3.603502577076986e-05, + "loss": 0.9941, + "num_input_tokens_seen": 67420160, + "step": 8230 + }, + { + "epoch": 1.064737046129991, + "grad_norm": 0.23019935190677643, + "learning_rate": 3.600466095553532e-05, + "loss": 0.6576, + "num_input_tokens_seen": 67502080, + "step": 8240 + }, + { + "epoch": 1.066029202739372, + "grad_norm": 0.4924595057964325, + "learning_rate": 3.5974275990806846e-05, + "loss": 1.2263, + "num_input_tokens_seen": 67584000, + "step": 8250 + }, + { + "epoch": 1.0673213593487532, + "grad_norm": 0.6706606149673462, + "learning_rate": 3.5943870932219184e-05, + "loss": 0.7686, + "num_input_tokens_seen": 67665920, + "step": 8260 + }, + { + "epoch": 1.068613515958134, + "grad_norm": 0.4998605251312256, + "learning_rate": 3.59134458354439e-05, + "loss": 0.9912, + "num_input_tokens_seen": 67747840, + "step": 8270 + }, + { + "epoch": 1.0699056725675151, + "grad_norm": 0.5857017040252686, + "learning_rate": 3.588300075618922e-05, + "loss": 0.7774, + "num_input_tokens_seen": 67829760, + "step": 8280 + }, + { + "epoch": 1.0711978291768962, + "grad_norm": 0.23385043442249298, + "learning_rate": 3.5852535750199977e-05, + "loss": 1.002, + "num_input_tokens_seen": 67911680, + "step": 8290 + }, + { + "epoch": 1.0724899857862773, + "grad_norm": 0.5875013470649719, + "learning_rate": 3.5822050873257494e-05, + "loss": 0.767, + "num_input_tokens_seen": 67993600, + "step": 8300 + }, + { + "epoch": 1.0737821423956584, + "grad_norm": 0.5456327795982361, + "learning_rate": 3.579154618117946e-05, + "loss": 1.0006, + "num_input_tokens_seen": 68075520, + "step": 8310 + }, + { + "epoch": 1.0750742990050395, + "grad_norm": 0.2565229535102844, + "learning_rate": 3.576102172981986e-05, + "loss": 0.4659, + "num_input_tokens_seen": 68157440, + "step": 8320 + }, + { + "epoch": 1.0763664556144206, + "grad_norm": 0.510844886302948, + "learning_rate": 3.5730477575068845e-05, + "loss": 0.9332, + "num_input_tokens_seen": 68239360, + "step": 8330 + }, + { + "epoch": 1.0776586122238014, + "grad_norm": 0.5561206936836243, + "learning_rate": 3.5699913772852664e-05, + "loss": 0.7617, + "num_input_tokens_seen": 68321280, + "step": 8340 + }, + { + "epoch": 1.0789507688331825, + "grad_norm": 0.6247846484184265, + "learning_rate": 3.566933037913351e-05, + "loss": 1.1367, + "num_input_tokens_seen": 68403200, + "step": 8350 + }, + { + "epoch": 1.0802429254425636, + "grad_norm": 0.8798884749412537, + "learning_rate": 3.5638727449909473e-05, + "loss": 0.6604, + "num_input_tokens_seen": 68485120, + "step": 8360 + }, + { + "epoch": 1.0815350820519447, + "grad_norm": 0.44991663098335266, + "learning_rate": 3.560810504121441e-05, + "loss": 1.0806, + "num_input_tokens_seen": 68567040, + "step": 8370 + }, + { + "epoch": 1.0828272386613258, + "grad_norm": 0.3702957332134247, + "learning_rate": 3.5577463209117833e-05, + "loss": 0.7424, + "num_input_tokens_seen": 68648960, + "step": 8380 + }, + { + "epoch": 1.0841193952707069, + "grad_norm": 0.8632538318634033, + "learning_rate": 3.554680200972482e-05, + "loss": 0.78, + "num_input_tokens_seen": 68730880, + "step": 8390 + }, + { + "epoch": 1.085411551880088, + "grad_norm": 0.5223381519317627, + "learning_rate": 3.551612149917593e-05, + "loss": 1.0331, + "num_input_tokens_seen": 68812800, + "step": 8400 + }, + { + "epoch": 1.0867037084894688, + "grad_norm": 0.6399882435798645, + "learning_rate": 3.548542173364705e-05, + "loss": 1.3448, + "num_input_tokens_seen": 68894720, + "step": 8410 + }, + { + "epoch": 1.08799586509885, + "grad_norm": 0.8672426342964172, + "learning_rate": 3.545470276934934e-05, + "loss": 0.9444, + "num_input_tokens_seen": 68976640, + "step": 8420 + }, + { + "epoch": 1.089288021708231, + "grad_norm": 0.998540461063385, + "learning_rate": 3.542396466252913e-05, + "loss": 0.6718, + "num_input_tokens_seen": 69058560, + "step": 8430 + }, + { + "epoch": 1.0905801783176121, + "grad_norm": 0.8492835760116577, + "learning_rate": 3.539320746946775e-05, + "loss": 0.8492, + "num_input_tokens_seen": 69140480, + "step": 8440 + }, + { + "epoch": 1.0918723349269932, + "grad_norm": 0.47759363055229187, + "learning_rate": 3.5362431246481536e-05, + "loss": 0.5818, + "num_input_tokens_seen": 69222400, + "step": 8450 + }, + { + "epoch": 1.0931644915363743, + "grad_norm": 0.5898222327232361, + "learning_rate": 3.533163604992163e-05, + "loss": 0.8846, + "num_input_tokens_seen": 69304320, + "step": 8460 + }, + { + "epoch": 1.0944566481457552, + "grad_norm": 0.672615110874176, + "learning_rate": 3.5300821936173926e-05, + "loss": 0.8992, + "num_input_tokens_seen": 69386240, + "step": 8470 + }, + { + "epoch": 1.0957488047551363, + "grad_norm": 0.265041321516037, + "learning_rate": 3.526998896165894e-05, + "loss": 0.9501, + "num_input_tokens_seen": 69468160, + "step": 8480 + }, + { + "epoch": 1.0970409613645173, + "grad_norm": 0.5370000600814819, + "learning_rate": 3.523913718283175e-05, + "loss": 1.1183, + "num_input_tokens_seen": 69550080, + "step": 8490 + }, + { + "epoch": 1.0983331179738984, + "grad_norm": 0.6693633794784546, + "learning_rate": 3.520826665618184e-05, + "loss": 0.9322, + "num_input_tokens_seen": 69632000, + "step": 8500 + }, + { + "epoch": 1.0996252745832795, + "grad_norm": 0.7608528733253479, + "learning_rate": 3.5177377438233044e-05, + "loss": 0.6564, + "num_input_tokens_seen": 69713920, + "step": 8510 + }, + { + "epoch": 1.1009174311926606, + "grad_norm": 0.5762624740600586, + "learning_rate": 3.514646958554339e-05, + "loss": 1.0945, + "num_input_tokens_seen": 69795840, + "step": 8520 + }, + { + "epoch": 1.1022095878020417, + "grad_norm": 0.7134038209915161, + "learning_rate": 3.511554315470507e-05, + "loss": 0.8922, + "num_input_tokens_seen": 69877760, + "step": 8530 + }, + { + "epoch": 1.1035017444114226, + "grad_norm": 0.6153433322906494, + "learning_rate": 3.508459820234423e-05, + "loss": 0.9603, + "num_input_tokens_seen": 69959680, + "step": 8540 + }, + { + "epoch": 1.1047939010208037, + "grad_norm": 0.765910267829895, + "learning_rate": 3.5053634785121e-05, + "loss": 0.8906, + "num_input_tokens_seen": 70041600, + "step": 8550 + }, + { + "epoch": 1.1060860576301847, + "grad_norm": 0.4776328206062317, + "learning_rate": 3.5022652959729266e-05, + "loss": 0.7746, + "num_input_tokens_seen": 70123520, + "step": 8560 + }, + { + "epoch": 1.1073782142395658, + "grad_norm": 0.5070037245750427, + "learning_rate": 3.499165278289663e-05, + "loss": 0.7997, + "num_input_tokens_seen": 70205440, + "step": 8570 + }, + { + "epoch": 1.108670370848947, + "grad_norm": 0.759103000164032, + "learning_rate": 3.496063431138431e-05, + "loss": 0.7416, + "num_input_tokens_seen": 70287360, + "step": 8580 + }, + { + "epoch": 1.109962527458328, + "grad_norm": 0.6531251668930054, + "learning_rate": 3.492959760198702e-05, + "loss": 0.7489, + "num_input_tokens_seen": 70369280, + "step": 8590 + }, + { + "epoch": 1.111254684067709, + "grad_norm": 0.34367960691452026, + "learning_rate": 3.489854271153285e-05, + "loss": 0.8175, + "num_input_tokens_seen": 70451200, + "step": 8600 + }, + { + "epoch": 1.11254684067709, + "grad_norm": 0.7481246590614319, + "learning_rate": 3.4867469696883204e-05, + "loss": 0.6624, + "num_input_tokens_seen": 70533120, + "step": 8610 + }, + { + "epoch": 1.113838997286471, + "grad_norm": 0.6177128553390503, + "learning_rate": 3.483637861493264e-05, + "loss": 0.8943, + "num_input_tokens_seen": 70615040, + "step": 8620 + }, + { + "epoch": 1.1151311538958522, + "grad_norm": 0.699070930480957, + "learning_rate": 3.480526952260884e-05, + "loss": 0.9308, + "num_input_tokens_seen": 70696960, + "step": 8630 + }, + { + "epoch": 1.1164233105052332, + "grad_norm": 0.6072255969047546, + "learning_rate": 3.477414247687241e-05, + "loss": 0.6464, + "num_input_tokens_seen": 70778880, + "step": 8640 + }, + { + "epoch": 1.1177154671146143, + "grad_norm": 0.5646083950996399, + "learning_rate": 3.4742997534716884e-05, + "loss": 0.6793, + "num_input_tokens_seen": 70860800, + "step": 8650 + }, + { + "epoch": 1.1190076237239954, + "grad_norm": 0.571111798286438, + "learning_rate": 3.471183475316851e-05, + "loss": 0.95, + "num_input_tokens_seen": 70942720, + "step": 8660 + }, + { + "epoch": 1.1202997803333763, + "grad_norm": 0.2792120575904846, + "learning_rate": 3.468065418928625e-05, + "loss": 0.8991, + "num_input_tokens_seen": 71024640, + "step": 8670 + }, + { + "epoch": 1.1215919369427574, + "grad_norm": 0.8360409736633301, + "learning_rate": 3.4649455900161596e-05, + "loss": 1.0195, + "num_input_tokens_seen": 71106560, + "step": 8680 + }, + { + "epoch": 1.1228840935521385, + "grad_norm": 0.5351372957229614, + "learning_rate": 3.461823994291849e-05, + "loss": 1.0533, + "num_input_tokens_seen": 71188480, + "step": 8690 + }, + { + "epoch": 1.1241762501615196, + "grad_norm": 1.5099799633026123, + "learning_rate": 3.458700637471325e-05, + "loss": 0.7323, + "num_input_tokens_seen": 71270400, + "step": 8700 + }, + { + "epoch": 1.1254684067709007, + "grad_norm": 0.3392684757709503, + "learning_rate": 3.455575525273442e-05, + "loss": 0.4897, + "num_input_tokens_seen": 71352320, + "step": 8710 + }, + { + "epoch": 1.1267605633802817, + "grad_norm": 0.29004672169685364, + "learning_rate": 3.4524486634202685e-05, + "loss": 0.9862, + "num_input_tokens_seen": 71434240, + "step": 8720 + }, + { + "epoch": 1.1280527199896628, + "grad_norm": 0.9040658473968506, + "learning_rate": 3.4493200576370776e-05, + "loss": 0.7261, + "num_input_tokens_seen": 71516160, + "step": 8730 + }, + { + "epoch": 1.1293448765990437, + "grad_norm": 0.509054958820343, + "learning_rate": 3.4461897136523356e-05, + "loss": 0.8157, + "num_input_tokens_seen": 71598080, + "step": 8740 + }, + { + "epoch": 1.1306370332084248, + "grad_norm": 0.5921317338943481, + "learning_rate": 3.44305763719769e-05, + "loss": 1.0178, + "num_input_tokens_seen": 71680000, + "step": 8750 + }, + { + "epoch": 1.1319291898178059, + "grad_norm": 0.7024086117744446, + "learning_rate": 3.4399238340079607e-05, + "loss": 0.9631, + "num_input_tokens_seen": 71761920, + "step": 8760 + }, + { + "epoch": 1.133221346427187, + "grad_norm": 0.2952291667461395, + "learning_rate": 3.4367883098211316e-05, + "loss": 0.7918, + "num_input_tokens_seen": 71843840, + "step": 8770 + }, + { + "epoch": 1.134513503036568, + "grad_norm": 0.6267214417457581, + "learning_rate": 3.4336510703783345e-05, + "loss": 0.8197, + "num_input_tokens_seen": 71925760, + "step": 8780 + }, + { + "epoch": 1.1358056596459492, + "grad_norm": 0.2442624568939209, + "learning_rate": 3.4305121214238446e-05, + "loss": 0.6943, + "num_input_tokens_seen": 72007680, + "step": 8790 + }, + { + "epoch": 1.1370978162553302, + "grad_norm": 0.6830674409866333, + "learning_rate": 3.427371468705065e-05, + "loss": 0.9242, + "num_input_tokens_seen": 72089600, + "step": 8800 + }, + { + "epoch": 1.138389972864711, + "grad_norm": 0.41818463802337646, + "learning_rate": 3.42422911797252e-05, + "loss": 0.8938, + "num_input_tokens_seen": 72171520, + "step": 8810 + }, + { + "epoch": 1.1396821294740922, + "grad_norm": 0.5191003680229187, + "learning_rate": 3.4210850749798415e-05, + "loss": 1.0008, + "num_input_tokens_seen": 72253440, + "step": 8820 + }, + { + "epoch": 1.1409742860834733, + "grad_norm": 0.6296160817146301, + "learning_rate": 3.417939345483762e-05, + "loss": 0.6786, + "num_input_tokens_seen": 72335360, + "step": 8830 + }, + { + "epoch": 1.1422664426928544, + "grad_norm": 0.4805864989757538, + "learning_rate": 3.4147919352440995e-05, + "loss": 0.9551, + "num_input_tokens_seen": 72417280, + "step": 8840 + }, + { + "epoch": 1.1435585993022355, + "grad_norm": 0.64635169506073, + "learning_rate": 3.411642850023751e-05, + "loss": 0.7622, + "num_input_tokens_seen": 72499200, + "step": 8850 + }, + { + "epoch": 1.1448507559116166, + "grad_norm": 0.37850773334503174, + "learning_rate": 3.40849209558868e-05, + "loss": 0.5537, + "num_input_tokens_seen": 72581120, + "step": 8860 + }, + { + "epoch": 1.1461429125209976, + "grad_norm": 0.836184561252594, + "learning_rate": 3.405339677707906e-05, + "loss": 0.6828, + "num_input_tokens_seen": 72663040, + "step": 8870 + }, + { + "epoch": 1.1474350691303785, + "grad_norm": 0.5654054880142212, + "learning_rate": 3.402185602153495e-05, + "loss": 0.8754, + "num_input_tokens_seen": 72744960, + "step": 8880 + }, + { + "epoch": 1.1487272257397596, + "grad_norm": 0.6414130926132202, + "learning_rate": 3.3990298747005485e-05, + "loss": 1.1836, + "num_input_tokens_seen": 72826880, + "step": 8890 + }, + { + "epoch": 1.1500193823491407, + "grad_norm": 0.5052874088287354, + "learning_rate": 3.395872501127191e-05, + "loss": 0.9686, + "num_input_tokens_seen": 72908800, + "step": 8900 + }, + { + "epoch": 1.1513115389585218, + "grad_norm": 0.6401844620704651, + "learning_rate": 3.392713487214561e-05, + "loss": 0.9425, + "num_input_tokens_seen": 72990720, + "step": 8910 + }, + { + "epoch": 1.1526036955679029, + "grad_norm": 0.49744102358818054, + "learning_rate": 3.389552838746804e-05, + "loss": 0.6426, + "num_input_tokens_seen": 73072640, + "step": 8920 + }, + { + "epoch": 1.153895852177284, + "grad_norm": 0.6056483387947083, + "learning_rate": 3.386390561511055e-05, + "loss": 0.9311, + "num_input_tokens_seen": 73154560, + "step": 8930 + }, + { + "epoch": 1.155188008786665, + "grad_norm": 0.7190436720848083, + "learning_rate": 3.38322666129743e-05, + "loss": 0.94, + "num_input_tokens_seen": 73236480, + "step": 8940 + }, + { + "epoch": 1.156480165396046, + "grad_norm": 0.6182407736778259, + "learning_rate": 3.380061143899021e-05, + "loss": 0.8698, + "num_input_tokens_seen": 73318400, + "step": 8950 + }, + { + "epoch": 1.157772322005427, + "grad_norm": 0.22560502588748932, + "learning_rate": 3.376894015111876e-05, + "loss": 1.0229, + "num_input_tokens_seen": 73400320, + "step": 8960 + }, + { + "epoch": 1.159064478614808, + "grad_norm": 0.4778119921684265, + "learning_rate": 3.373725280735e-05, + "loss": 0.7057, + "num_input_tokens_seen": 73482240, + "step": 8970 + }, + { + "epoch": 1.1603566352241892, + "grad_norm": 0.5372010469436646, + "learning_rate": 3.3705549465703314e-05, + "loss": 0.8812, + "num_input_tokens_seen": 73564160, + "step": 8980 + }, + { + "epoch": 1.1616487918335703, + "grad_norm": 0.6138181090354919, + "learning_rate": 3.3673830184227414e-05, + "loss": 0.9767, + "num_input_tokens_seen": 73646080, + "step": 8990 + }, + { + "epoch": 1.1629409484429514, + "grad_norm": 0.7862790822982788, + "learning_rate": 3.3642095021000184e-05, + "loss": 1.0073, + "num_input_tokens_seen": 73728000, + "step": 9000 + }, + { + "epoch": 1.1642331050523325, + "grad_norm": 0.6123846769332886, + "learning_rate": 3.36103440341286e-05, + "loss": 0.5677, + "num_input_tokens_seen": 73809920, + "step": 9010 + }, + { + "epoch": 1.1655252616617133, + "grad_norm": 0.6137543320655823, + "learning_rate": 3.35785772817486e-05, + "loss": 0.5504, + "num_input_tokens_seen": 73891840, + "step": 9020 + }, + { + "epoch": 1.1668174182710944, + "grad_norm": 0.6806734204292297, + "learning_rate": 3.3546794822024976e-05, + "loss": 0.949, + "num_input_tokens_seen": 73973760, + "step": 9030 + }, + { + "epoch": 1.1681095748804755, + "grad_norm": 0.7158621549606323, + "learning_rate": 3.351499671315131e-05, + "loss": 0.9297, + "num_input_tokens_seen": 74055680, + "step": 9040 + }, + { + "epoch": 1.1694017314898566, + "grad_norm": 0.9826081395149231, + "learning_rate": 3.348318301334983e-05, + "loss": 0.993, + "num_input_tokens_seen": 74137600, + "step": 9050 + }, + { + "epoch": 1.1706938880992377, + "grad_norm": 0.6197934746742249, + "learning_rate": 3.3451353780871286e-05, + "loss": 0.6176, + "num_input_tokens_seen": 74219520, + "step": 9060 + }, + { + "epoch": 1.1719860447086188, + "grad_norm": 0.22937451303005219, + "learning_rate": 3.341950907399489e-05, + "loss": 0.5138, + "num_input_tokens_seen": 74301440, + "step": 9070 + }, + { + "epoch": 1.1732782013179996, + "grad_norm": 0.591966450214386, + "learning_rate": 3.338764895102821e-05, + "loss": 0.7563, + "num_input_tokens_seen": 74383360, + "step": 9080 + }, + { + "epoch": 1.1745703579273807, + "grad_norm": 0.6814236640930176, + "learning_rate": 3.335577347030697e-05, + "loss": 1.3017, + "num_input_tokens_seen": 74465280, + "step": 9090 + }, + { + "epoch": 1.1758625145367618, + "grad_norm": 0.9694181084632874, + "learning_rate": 3.33238826901951e-05, + "loss": 1.1248, + "num_input_tokens_seen": 74547200, + "step": 9100 + }, + { + "epoch": 1.177154671146143, + "grad_norm": 0.24693933129310608, + "learning_rate": 3.329197666908447e-05, + "loss": 0.6756, + "num_input_tokens_seen": 74629120, + "step": 9110 + }, + { + "epoch": 1.178446827755524, + "grad_norm": 0.5937339663505554, + "learning_rate": 3.32600554653949e-05, + "loss": 0.9454, + "num_input_tokens_seen": 74711040, + "step": 9120 + }, + { + "epoch": 1.179738984364905, + "grad_norm": 0.7767700552940369, + "learning_rate": 3.322811913757401e-05, + "loss": 0.863, + "num_input_tokens_seen": 74792960, + "step": 9130 + }, + { + "epoch": 1.181031140974286, + "grad_norm": 0.6188927292823792, + "learning_rate": 3.319616774409709e-05, + "loss": 0.8522, + "num_input_tokens_seen": 74874880, + "step": 9140 + }, + { + "epoch": 1.182323297583667, + "grad_norm": 0.5785146355628967, + "learning_rate": 3.316420134346701e-05, + "loss": 0.8277, + "num_input_tokens_seen": 74956800, + "step": 9150 + }, + { + "epoch": 1.1836154541930481, + "grad_norm": 0.6327735781669617, + "learning_rate": 3.313221999421415e-05, + "loss": 0.8846, + "num_input_tokens_seen": 75038720, + "step": 9160 + }, + { + "epoch": 1.1849076108024292, + "grad_norm": 0.338527113199234, + "learning_rate": 3.310022375489623e-05, + "loss": 0.6351, + "num_input_tokens_seen": 75120640, + "step": 9170 + }, + { + "epoch": 1.1861997674118103, + "grad_norm": 1.0302643775939941, + "learning_rate": 3.306821268409827e-05, + "loss": 1.023, + "num_input_tokens_seen": 75202560, + "step": 9180 + }, + { + "epoch": 1.1874919240211914, + "grad_norm": 0.4798928499221802, + "learning_rate": 3.30361868404324e-05, + "loss": 1.0803, + "num_input_tokens_seen": 75284480, + "step": 9190 + }, + { + "epoch": 1.1887840806305725, + "grad_norm": 0.48873379826545715, + "learning_rate": 3.300414628253783e-05, + "loss": 0.7852, + "num_input_tokens_seen": 75366400, + "step": 9200 + }, + { + "epoch": 1.1900762372399534, + "grad_norm": 0.8409120440483093, + "learning_rate": 3.297209106908072e-05, + "loss": 0.8063, + "num_input_tokens_seen": 75448320, + "step": 9210 + }, + { + "epoch": 1.1913683938493345, + "grad_norm": 0.26400497555732727, + "learning_rate": 3.294002125875402e-05, + "loss": 0.6504, + "num_input_tokens_seen": 75530240, + "step": 9220 + }, + { + "epoch": 1.1926605504587156, + "grad_norm": 0.6408680081367493, + "learning_rate": 3.290793691027746e-05, + "loss": 0.7654, + "num_input_tokens_seen": 75612160, + "step": 9230 + }, + { + "epoch": 1.1939527070680966, + "grad_norm": 0.8105805516242981, + "learning_rate": 3.287583808239735e-05, + "loss": 0.7852, + "num_input_tokens_seen": 75694080, + "step": 9240 + }, + { + "epoch": 1.1952448636774777, + "grad_norm": 0.5738030076026917, + "learning_rate": 3.284372483388652e-05, + "loss": 0.5354, + "num_input_tokens_seen": 75776000, + "step": 9250 + }, + { + "epoch": 1.1965370202868588, + "grad_norm": 0.5625064373016357, + "learning_rate": 3.2811597223544234e-05, + "loss": 0.8227, + "num_input_tokens_seen": 75857920, + "step": 9260 + }, + { + "epoch": 1.19782917689624, + "grad_norm": 0.5138773918151855, + "learning_rate": 3.277945531019601e-05, + "loss": 0.9902, + "num_input_tokens_seen": 75939840, + "step": 9270 + }, + { + "epoch": 1.1991213335056208, + "grad_norm": 0.5785451531410217, + "learning_rate": 3.274729915269358e-05, + "loss": 0.6786, + "num_input_tokens_seen": 76021760, + "step": 9280 + }, + { + "epoch": 1.2004134901150019, + "grad_norm": 0.3385809063911438, + "learning_rate": 3.271512880991476e-05, + "loss": 0.7933, + "num_input_tokens_seen": 76103680, + "step": 9290 + }, + { + "epoch": 1.201705646724383, + "grad_norm": 0.2932916581630707, + "learning_rate": 3.268294434076332e-05, + "loss": 0.8867, + "num_input_tokens_seen": 76185600, + "step": 9300 + }, + { + "epoch": 1.202997803333764, + "grad_norm": 0.5734339952468872, + "learning_rate": 3.26507458041689e-05, + "loss": 0.9081, + "num_input_tokens_seen": 76267520, + "step": 9310 + }, + { + "epoch": 1.2042899599431451, + "grad_norm": 0.6236972808837891, + "learning_rate": 3.261853325908691e-05, + "loss": 1.1582, + "num_input_tokens_seen": 76349440, + "step": 9320 + }, + { + "epoch": 1.2055821165525262, + "grad_norm": 0.6559344530105591, + "learning_rate": 3.2586306764498395e-05, + "loss": 1.1172, + "num_input_tokens_seen": 76431360, + "step": 9330 + }, + { + "epoch": 1.2068742731619073, + "grad_norm": 0.5605185627937317, + "learning_rate": 3.255406637940996e-05, + "loss": 0.8069, + "num_input_tokens_seen": 76513280, + "step": 9340 + }, + { + "epoch": 1.2081664297712882, + "grad_norm": 0.697979748249054, + "learning_rate": 3.252181216285363e-05, + "loss": 1.0322, + "num_input_tokens_seen": 76595200, + "step": 9350 + }, + { + "epoch": 1.2094585863806693, + "grad_norm": 0.7321066856384277, + "learning_rate": 3.2489544173886745e-05, + "loss": 1.1227, + "num_input_tokens_seen": 76677120, + "step": 9360 + }, + { + "epoch": 1.2107507429900504, + "grad_norm": 0.623622715473175, + "learning_rate": 3.245726247159189e-05, + "loss": 0.9295, + "num_input_tokens_seen": 76759040, + "step": 9370 + }, + { + "epoch": 1.2120428995994315, + "grad_norm": 0.35311391949653625, + "learning_rate": 3.242496711507673e-05, + "loss": 1.0272, + "num_input_tokens_seen": 76840960, + "step": 9380 + }, + { + "epoch": 1.2133350562088125, + "grad_norm": 0.5266684889793396, + "learning_rate": 3.239265816347397e-05, + "loss": 0.9163, + "num_input_tokens_seen": 76922880, + "step": 9390 + }, + { + "epoch": 1.2146272128181936, + "grad_norm": 0.4497738182544708, + "learning_rate": 3.236033567594115e-05, + "loss": 0.8623, + "num_input_tokens_seen": 77004800, + "step": 9400 + }, + { + "epoch": 1.2159193694275747, + "grad_norm": 0.5825258493423462, + "learning_rate": 3.232799971166064e-05, + "loss": 0.9241, + "num_input_tokens_seen": 77086720, + "step": 9410 + }, + { + "epoch": 1.2172115260369556, + "grad_norm": 0.9179586172103882, + "learning_rate": 3.2295650329839474e-05, + "loss": 0.75, + "num_input_tokens_seen": 77168640, + "step": 9420 + }, + { + "epoch": 1.2185036826463367, + "grad_norm": 0.2291688621044159, + "learning_rate": 3.2263287589709255e-05, + "loss": 0.6456, + "num_input_tokens_seen": 77250560, + "step": 9430 + }, + { + "epoch": 1.2197958392557178, + "grad_norm": 0.4688994288444519, + "learning_rate": 3.2230911550526035e-05, + "loss": 0.8976, + "num_input_tokens_seen": 77332480, + "step": 9440 + }, + { + "epoch": 1.2210879958650989, + "grad_norm": 0.5401532649993896, + "learning_rate": 3.219852227157022e-05, + "loss": 0.9984, + "num_input_tokens_seen": 77414400, + "step": 9450 + }, + { + "epoch": 1.22238015247448, + "grad_norm": 0.7293322086334229, + "learning_rate": 3.216611981214648e-05, + "loss": 1.0182, + "num_input_tokens_seen": 77496320, + "step": 9460 + }, + { + "epoch": 1.223672309083861, + "grad_norm": 0.19243964552879333, + "learning_rate": 3.2133704231583576e-05, + "loss": 0.6102, + "num_input_tokens_seen": 77578240, + "step": 9470 + }, + { + "epoch": 1.2249644656932421, + "grad_norm": 0.556209146976471, + "learning_rate": 3.210127558923434e-05, + "loss": 0.868, + "num_input_tokens_seen": 77660160, + "step": 9480 + }, + { + "epoch": 1.226256622302623, + "grad_norm": 0.29714149236679077, + "learning_rate": 3.206883394447547e-05, + "loss": 0.9574, + "num_input_tokens_seen": 77742080, + "step": 9490 + }, + { + "epoch": 1.227548778912004, + "grad_norm": 0.4670780599117279, + "learning_rate": 3.203637935670752e-05, + "loss": 0.852, + "num_input_tokens_seen": 77824000, + "step": 9500 + }, + { + "epoch": 1.2288409355213852, + "grad_norm": 0.5203260779380798, + "learning_rate": 3.200391188535472e-05, + "loss": 0.7199, + "num_input_tokens_seen": 77905920, + "step": 9510 + }, + { + "epoch": 1.2301330921307663, + "grad_norm": 0.28679078817367554, + "learning_rate": 3.197143158986489e-05, + "loss": 0.7419, + "num_input_tokens_seen": 77987840, + "step": 9520 + }, + { + "epoch": 1.2314252487401474, + "grad_norm": 0.5256805419921875, + "learning_rate": 3.193893852970932e-05, + "loss": 0.7953, + "num_input_tokens_seen": 78069760, + "step": 9530 + }, + { + "epoch": 1.2327174053495285, + "grad_norm": 0.8118154406547546, + "learning_rate": 3.1906432764382695e-05, + "loss": 0.8027, + "num_input_tokens_seen": 78151680, + "step": 9540 + }, + { + "epoch": 1.2340095619589095, + "grad_norm": 0.3833126425743103, + "learning_rate": 3.187391435340295e-05, + "loss": 0.7832, + "num_input_tokens_seen": 78233600, + "step": 9550 + }, + { + "epoch": 1.2353017185682904, + "grad_norm": 0.3153989315032959, + "learning_rate": 3.184138335631118e-05, + "loss": 0.7582, + "num_input_tokens_seen": 78315520, + "step": 9560 + }, + { + "epoch": 1.2365938751776715, + "grad_norm": 0.5784737467765808, + "learning_rate": 3.1808839832671523e-05, + "loss": 1.1442, + "num_input_tokens_seen": 78397440, + "step": 9570 + }, + { + "epoch": 1.2378860317870526, + "grad_norm": 0.6450271010398865, + "learning_rate": 3.1776283842071045e-05, + "loss": 0.9673, + "num_input_tokens_seen": 78479360, + "step": 9580 + }, + { + "epoch": 1.2391781883964337, + "grad_norm": 0.5860177874565125, + "learning_rate": 3.174371544411964e-05, + "loss": 0.8106, + "num_input_tokens_seen": 78561280, + "step": 9590 + }, + { + "epoch": 1.2404703450058148, + "grad_norm": 0.6213310956954956, + "learning_rate": 3.1711134698449946e-05, + "loss": 0.9658, + "num_input_tokens_seen": 78643200, + "step": 9600 + }, + { + "epoch": 1.2417625016151959, + "grad_norm": 0.5813376903533936, + "learning_rate": 3.167854166471717e-05, + "loss": 0.9531, + "num_input_tokens_seen": 78725120, + "step": 9610 + }, + { + "epoch": 1.2430546582245767, + "grad_norm": 0.3016079366207123, + "learning_rate": 3.164593640259904e-05, + "loss": 0.8195, + "num_input_tokens_seen": 78807040, + "step": 9620 + }, + { + "epoch": 1.2443468148339578, + "grad_norm": 0.7248777747154236, + "learning_rate": 3.161331897179568e-05, + "loss": 0.9972, + "num_input_tokens_seen": 78888960, + "step": 9630 + }, + { + "epoch": 1.245638971443339, + "grad_norm": 0.19406473636627197, + "learning_rate": 3.1580689432029484e-05, + "loss": 0.5308, + "num_input_tokens_seen": 78970880, + "step": 9640 + }, + { + "epoch": 1.24693112805272, + "grad_norm": 0.8860630989074707, + "learning_rate": 3.154804784304502e-05, + "loss": 1.0639, + "num_input_tokens_seen": 79052800, + "step": 9650 + }, + { + "epoch": 1.248223284662101, + "grad_norm": 0.792770504951477, + "learning_rate": 3.151539426460892e-05, + "loss": 1.2022, + "num_input_tokens_seen": 79134720, + "step": 9660 + }, + { + "epoch": 1.2495154412714822, + "grad_norm": 0.7376882433891296, + "learning_rate": 3.148272875650976e-05, + "loss": 0.8717, + "num_input_tokens_seen": 79216640, + "step": 9670 + }, + { + "epoch": 1.250807597880863, + "grad_norm": 0.5317556262016296, + "learning_rate": 3.145005137855796e-05, + "loss": 0.9134, + "num_input_tokens_seen": 79298560, + "step": 9680 + }, + { + "epoch": 1.2520997544902444, + "grad_norm": 0.2553159296512604, + "learning_rate": 3.14173621905857e-05, + "loss": 0.671, + "num_input_tokens_seen": 79380480, + "step": 9690 + }, + { + "epoch": 1.2533919110996252, + "grad_norm": 0.6660608649253845, + "learning_rate": 3.138466125244674e-05, + "loss": 1.0354, + "num_input_tokens_seen": 79462400, + "step": 9700 + }, + { + "epoch": 1.2546840677090063, + "grad_norm": 0.5901172757148743, + "learning_rate": 3.13519486240164e-05, + "loss": 1.3345, + "num_input_tokens_seen": 79544320, + "step": 9710 + }, + { + "epoch": 1.2559762243183874, + "grad_norm": 0.5691861510276794, + "learning_rate": 3.1319224365191366e-05, + "loss": 0.7637, + "num_input_tokens_seen": 79626240, + "step": 9720 + }, + { + "epoch": 1.2572683809277685, + "grad_norm": 0.5517555475234985, + "learning_rate": 3.128648853588965e-05, + "loss": 0.5803, + "num_input_tokens_seen": 79708160, + "step": 9730 + }, + { + "epoch": 1.2585605375371496, + "grad_norm": 0.9708176255226135, + "learning_rate": 3.1253741196050425e-05, + "loss": 0.4912, + "num_input_tokens_seen": 79790080, + "step": 9740 + }, + { + "epoch": 1.2598526941465304, + "grad_norm": 0.689449667930603, + "learning_rate": 3.122098240563396e-05, + "loss": 0.7291, + "num_input_tokens_seen": 79872000, + "step": 9750 + }, + { + "epoch": 1.2611448507559115, + "grad_norm": 0.8461759686470032, + "learning_rate": 3.118821222462147e-05, + "loss": 1.0072, + "num_input_tokens_seen": 79953920, + "step": 9760 + }, + { + "epoch": 1.2624370073652926, + "grad_norm": 0.9328972697257996, + "learning_rate": 3.1155430713015034e-05, + "loss": 0.8663, + "num_input_tokens_seen": 80035840, + "step": 9770 + }, + { + "epoch": 1.2637291639746737, + "grad_norm": 0.19583484530448914, + "learning_rate": 3.1122637930837486e-05, + "loss": 0.8375, + "num_input_tokens_seen": 80117760, + "step": 9780 + }, + { + "epoch": 1.2650213205840548, + "grad_norm": 0.22281506657600403, + "learning_rate": 3.10898339381323e-05, + "loss": 0.7423, + "num_input_tokens_seen": 80199680, + "step": 9790 + }, + { + "epoch": 1.266313477193436, + "grad_norm": 0.6488370299339294, + "learning_rate": 3.1057018794963454e-05, + "loss": 1.3639, + "num_input_tokens_seen": 80281600, + "step": 9800 + }, + { + "epoch": 1.267605633802817, + "grad_norm": 1.4011627435684204, + "learning_rate": 3.102419256141536e-05, + "loss": 0.7074, + "num_input_tokens_seen": 80363520, + "step": 9810 + }, + { + "epoch": 1.2688977904121979, + "grad_norm": 0.5548164248466492, + "learning_rate": 3.0991355297592734e-05, + "loss": 1.1645, + "num_input_tokens_seen": 80445440, + "step": 9820 + }, + { + "epoch": 1.270189947021579, + "grad_norm": 0.8125051856040955, + "learning_rate": 3.095850706362047e-05, + "loss": 0.699, + "num_input_tokens_seen": 80527360, + "step": 9830 + }, + { + "epoch": 1.27148210363096, + "grad_norm": 0.8188523650169373, + "learning_rate": 3.092564791964358e-05, + "loss": 1.1145, + "num_input_tokens_seen": 80609280, + "step": 9840 + }, + { + "epoch": 1.2727742602403411, + "grad_norm": 0.553065836429596, + "learning_rate": 3.089277792582704e-05, + "loss": 0.9243, + "num_input_tokens_seen": 80691200, + "step": 9850 + }, + { + "epoch": 1.2740664168497222, + "grad_norm": 0.6267945170402527, + "learning_rate": 3.085989714235568e-05, + "loss": 0.8664, + "num_input_tokens_seen": 80773120, + "step": 9860 + }, + { + "epoch": 1.2753585734591033, + "grad_norm": 0.551201343536377, + "learning_rate": 3.082700562943409e-05, + "loss": 0.8113, + "num_input_tokens_seen": 80855040, + "step": 9870 + }, + { + "epoch": 1.2766507300684844, + "grad_norm": 0.39583033323287964, + "learning_rate": 3.079410344728652e-05, + "loss": 1.0342, + "num_input_tokens_seen": 80936960, + "step": 9880 + }, + { + "epoch": 1.2779428866778653, + "grad_norm": 0.7506781816482544, + "learning_rate": 3.076119065615674e-05, + "loss": 0.8064, + "num_input_tokens_seen": 81018880, + "step": 9890 + }, + { + "epoch": 1.2792350432872464, + "grad_norm": 0.6547626852989197, + "learning_rate": 3.0728267316307945e-05, + "loss": 0.7267, + "num_input_tokens_seen": 81100800, + "step": 9900 + }, + { + "epoch": 1.2805271998966274, + "grad_norm": 0.8237736225128174, + "learning_rate": 3.069533348802266e-05, + "loss": 0.6364, + "num_input_tokens_seen": 81182720, + "step": 9910 + }, + { + "epoch": 1.2818193565060085, + "grad_norm": 0.5278318524360657, + "learning_rate": 3.0662389231602595e-05, + "loss": 0.7681, + "num_input_tokens_seen": 81264640, + "step": 9920 + }, + { + "epoch": 1.2831115131153896, + "grad_norm": 0.48821374773979187, + "learning_rate": 3.062943460736857e-05, + "loss": 0.6366, + "num_input_tokens_seen": 81346560, + "step": 9930 + }, + { + "epoch": 1.2844036697247707, + "grad_norm": 0.6118844747543335, + "learning_rate": 3.059646967566038e-05, + "loss": 0.8101, + "num_input_tokens_seen": 81428480, + "step": 9940 + }, + { + "epoch": 1.2856958263341518, + "grad_norm": 0.7095397710800171, + "learning_rate": 3.0563494496836686e-05, + "loss": 0.5936, + "num_input_tokens_seen": 81510400, + "step": 9950 + }, + { + "epoch": 1.2869879829435327, + "grad_norm": 0.8946405053138733, + "learning_rate": 3.0530509131274935e-05, + "loss": 0.8716, + "num_input_tokens_seen": 81592320, + "step": 9960 + }, + { + "epoch": 1.2882801395529138, + "grad_norm": 1.0983461141586304, + "learning_rate": 3.0497513639371195e-05, + "loss": 0.7882, + "num_input_tokens_seen": 81674240, + "step": 9970 + }, + { + "epoch": 1.2895722961622949, + "grad_norm": 0.9059731364250183, + "learning_rate": 3.04645080815401e-05, + "loss": 0.8302, + "num_input_tokens_seen": 81756160, + "step": 9980 + }, + { + "epoch": 1.290864452771676, + "grad_norm": 0.6056944131851196, + "learning_rate": 3.04314925182147e-05, + "loss": 0.9569, + "num_input_tokens_seen": 81838080, + "step": 9990 + }, + { + "epoch": 1.292156609381057, + "grad_norm": 0.536361575126648, + "learning_rate": 3.0398467009846375e-05, + "loss": 0.9173, + "num_input_tokens_seen": 81920000, + "step": 10000 + }, + { + "epoch": 1.2934487659904381, + "grad_norm": 0.6062700152397156, + "learning_rate": 3.0365431616904714e-05, + "loss": 0.9092, + "num_input_tokens_seen": 82001920, + "step": 10010 + }, + { + "epoch": 1.2947409225998192, + "grad_norm": 0.5536112189292908, + "learning_rate": 3.03323863998774e-05, + "loss": 0.8183, + "num_input_tokens_seen": 82083840, + "step": 10020 + }, + { + "epoch": 1.2960330792092, + "grad_norm": 0.6913787126541138, + "learning_rate": 3.02993314192701e-05, + "loss": 1.2083, + "num_input_tokens_seen": 82165760, + "step": 10030 + }, + { + "epoch": 1.2973252358185812, + "grad_norm": 0.8038882613182068, + "learning_rate": 3.0266266735606358e-05, + "loss": 1.1272, + "num_input_tokens_seen": 82247680, + "step": 10040 + }, + { + "epoch": 1.2986173924279623, + "grad_norm": 0.5819315910339355, + "learning_rate": 3.0233192409427492e-05, + "loss": 0.6936, + "num_input_tokens_seen": 82329600, + "step": 10050 + }, + { + "epoch": 1.2999095490373433, + "grad_norm": 0.5388988852500916, + "learning_rate": 3.0200108501292466e-05, + "loss": 1.0776, + "num_input_tokens_seen": 82411520, + "step": 10060 + }, + { + "epoch": 1.3012017056467244, + "grad_norm": 0.2774782180786133, + "learning_rate": 3.0167015071777815e-05, + "loss": 0.5866, + "num_input_tokens_seen": 82493440, + "step": 10070 + }, + { + "epoch": 1.3024938622561053, + "grad_norm": 0.20768149197101593, + "learning_rate": 3.0133912181477475e-05, + "loss": 0.903, + "num_input_tokens_seen": 82575360, + "step": 10080 + }, + { + "epoch": 1.3037860188654866, + "grad_norm": 0.5745561718940735, + "learning_rate": 3.010079989100271e-05, + "loss": 1.1334, + "num_input_tokens_seen": 82657280, + "step": 10090 + }, + { + "epoch": 1.3050781754748675, + "grad_norm": 0.5522057414054871, + "learning_rate": 3.0067678260982018e-05, + "loss": 0.708, + "num_input_tokens_seen": 82739200, + "step": 10100 + }, + { + "epoch": 1.3063703320842486, + "grad_norm": 0.5716668367385864, + "learning_rate": 3.003454735206097e-05, + "loss": 1.094, + "num_input_tokens_seen": 82821120, + "step": 10110 + }, + { + "epoch": 1.3076624886936297, + "grad_norm": 0.5987653732299805, + "learning_rate": 3.000140722490215e-05, + "loss": 0.7914, + "num_input_tokens_seen": 82903040, + "step": 10120 + }, + { + "epoch": 1.3089546453030108, + "grad_norm": 0.754988431930542, + "learning_rate": 2.9968257940184997e-05, + "loss": 0.882, + "num_input_tokens_seen": 82984960, + "step": 10130 + }, + { + "epoch": 1.3102468019123918, + "grad_norm": 0.2810363471508026, + "learning_rate": 2.9935099558605728e-05, + "loss": 0.7434, + "num_input_tokens_seen": 83066880, + "step": 10140 + }, + { + "epoch": 1.3115389585217727, + "grad_norm": 0.6438339352607727, + "learning_rate": 2.9901932140877232e-05, + "loss": 1.0257, + "num_input_tokens_seen": 83148800, + "step": 10150 + }, + { + "epoch": 1.312831115131154, + "grad_norm": 0.8908247947692871, + "learning_rate": 2.9868755747728927e-05, + "loss": 0.6914, + "num_input_tokens_seen": 83230720, + "step": 10160 + }, + { + "epoch": 1.314123271740535, + "grad_norm": 0.5516972541809082, + "learning_rate": 2.9835570439906657e-05, + "loss": 0.9318, + "num_input_tokens_seen": 83312640, + "step": 10170 + }, + { + "epoch": 1.315415428349916, + "grad_norm": 0.6137295961380005, + "learning_rate": 2.9802376278172612e-05, + "loss": 0.9633, + "num_input_tokens_seen": 83394560, + "step": 10180 + }, + { + "epoch": 1.316707584959297, + "grad_norm": 0.8933919668197632, + "learning_rate": 2.976917332330517e-05, + "loss": 1.1187, + "num_input_tokens_seen": 83476480, + "step": 10190 + }, + { + "epoch": 1.3179997415686782, + "grad_norm": 0.5550206303596497, + "learning_rate": 2.973596163609883e-05, + "loss": 0.9031, + "num_input_tokens_seen": 83558400, + "step": 10200 + }, + { + "epoch": 1.3192918981780593, + "grad_norm": 0.7092847228050232, + "learning_rate": 2.970274127736406e-05, + "loss": 0.8322, + "num_input_tokens_seen": 83640320, + "step": 10210 + }, + { + "epoch": 1.3205840547874401, + "grad_norm": 0.9514045119285583, + "learning_rate": 2.966951230792722e-05, + "loss": 0.7939, + "num_input_tokens_seen": 83722240, + "step": 10220 + }, + { + "epoch": 1.3218762113968214, + "grad_norm": 0.6002041697502136, + "learning_rate": 2.9636274788630437e-05, + "loss": 1.0287, + "num_input_tokens_seen": 83804160, + "step": 10230 + }, + { + "epoch": 1.3231683680062023, + "grad_norm": 0.3773179054260254, + "learning_rate": 2.9603028780331475e-05, + "loss": 0.8743, + "num_input_tokens_seen": 83886080, + "step": 10240 + }, + { + "epoch": 1.3244605246155834, + "grad_norm": 0.5780991911888123, + "learning_rate": 2.9569774343903662e-05, + "loss": 1.089, + "num_input_tokens_seen": 83968000, + "step": 10250 + }, + { + "epoch": 1.3257526812249645, + "grad_norm": 0.7904821634292603, + "learning_rate": 2.9536511540235744e-05, + "loss": 0.9047, + "num_input_tokens_seen": 84049920, + "step": 10260 + }, + { + "epoch": 1.3270448378343456, + "grad_norm": 0.6269042491912842, + "learning_rate": 2.9503240430231803e-05, + "loss": 0.7258, + "num_input_tokens_seen": 84131840, + "step": 10270 + }, + { + "epoch": 1.3283369944437267, + "grad_norm": 0.5256585478782654, + "learning_rate": 2.9469961074811103e-05, + "loss": 0.6985, + "num_input_tokens_seen": 84213760, + "step": 10280 + }, + { + "epoch": 1.3296291510531075, + "grad_norm": 0.6166103482246399, + "learning_rate": 2.9436673534908044e-05, + "loss": 0.8227, + "num_input_tokens_seen": 84295680, + "step": 10290 + }, + { + "epoch": 1.3309213076624886, + "grad_norm": 0.9073558449745178, + "learning_rate": 2.940337787147197e-05, + "loss": 0.883, + "num_input_tokens_seen": 84377600, + "step": 10300 + }, + { + "epoch": 1.3322134642718697, + "grad_norm": 0.6241338849067688, + "learning_rate": 2.9370074145467132e-05, + "loss": 1.2221, + "num_input_tokens_seen": 84459520, + "step": 10310 + }, + { + "epoch": 1.3335056208812508, + "grad_norm": 0.6946704983711243, + "learning_rate": 2.9336762417872516e-05, + "loss": 0.7841, + "num_input_tokens_seen": 84541440, + "step": 10320 + }, + { + "epoch": 1.3347977774906319, + "grad_norm": 1.0090748071670532, + "learning_rate": 2.9303442749681787e-05, + "loss": 0.8774, + "num_input_tokens_seen": 84623360, + "step": 10330 + }, + { + "epoch": 1.336089934100013, + "grad_norm": 0.5861837863922119, + "learning_rate": 2.927011520190313e-05, + "loss": 0.7681, + "num_input_tokens_seen": 84705280, + "step": 10340 + }, + { + "epoch": 1.337382090709394, + "grad_norm": 0.5563804507255554, + "learning_rate": 2.9236779835559165e-05, + "loss": 0.9598, + "num_input_tokens_seen": 84787200, + "step": 10350 + }, + { + "epoch": 1.338674247318775, + "grad_norm": 0.6404061317443848, + "learning_rate": 2.9203436711686817e-05, + "loss": 0.6958, + "num_input_tokens_seen": 84869120, + "step": 10360 + }, + { + "epoch": 1.339966403928156, + "grad_norm": 0.7478158473968506, + "learning_rate": 2.917008589133724e-05, + "loss": 1.0247, + "num_input_tokens_seen": 84951040, + "step": 10370 + }, + { + "epoch": 1.3412585605375371, + "grad_norm": 0.5483884811401367, + "learning_rate": 2.913672743557565e-05, + "loss": 0.5671, + "num_input_tokens_seen": 85032960, + "step": 10380 + }, + { + "epoch": 1.3425507171469182, + "grad_norm": 0.7392348647117615, + "learning_rate": 2.9103361405481272e-05, + "loss": 0.6446, + "num_input_tokens_seen": 85114880, + "step": 10390 + }, + { + "epoch": 1.3438428737562993, + "grad_norm": 1.0634355545043945, + "learning_rate": 2.906998786214717e-05, + "loss": 1.0231, + "num_input_tokens_seen": 85196800, + "step": 10400 + }, + { + "epoch": 1.3451350303656804, + "grad_norm": 0.5788999199867249, + "learning_rate": 2.9036606866680187e-05, + "loss": 0.8076, + "num_input_tokens_seen": 85278720, + "step": 10410 + }, + { + "epoch": 1.3464271869750615, + "grad_norm": 0.5242962837219238, + "learning_rate": 2.90032184802008e-05, + "loss": 1.0485, + "num_input_tokens_seen": 85360640, + "step": 10420 + }, + { + "epoch": 1.3477193435844423, + "grad_norm": 0.7182193994522095, + "learning_rate": 2.8969822763843018e-05, + "loss": 1.0753, + "num_input_tokens_seen": 85442560, + "step": 10430 + }, + { + "epoch": 1.3490115001938234, + "grad_norm": 1.0175756216049194, + "learning_rate": 2.8936419778754294e-05, + "loss": 0.9629, + "num_input_tokens_seen": 85524480, + "step": 10440 + }, + { + "epoch": 1.3503036568032045, + "grad_norm": 2.7674241065979004, + "learning_rate": 2.8903009586095353e-05, + "loss": 0.7803, + "num_input_tokens_seen": 85606400, + "step": 10450 + }, + { + "epoch": 1.3515958134125856, + "grad_norm": 0.27844467759132385, + "learning_rate": 2.8869592247040138e-05, + "loss": 0.9991, + "num_input_tokens_seen": 85688320, + "step": 10460 + }, + { + "epoch": 1.3528879700219667, + "grad_norm": 0.18930500745773315, + "learning_rate": 2.883616782277569e-05, + "loss": 0.7521, + "num_input_tokens_seen": 85770240, + "step": 10470 + }, + { + "epoch": 1.3541801266313478, + "grad_norm": 0.9251275658607483, + "learning_rate": 2.8802736374501994e-05, + "loss": 1.186, + "num_input_tokens_seen": 85852160, + "step": 10480 + }, + { + "epoch": 1.3554722832407289, + "grad_norm": 0.6336624026298523, + "learning_rate": 2.8769297963431908e-05, + "loss": 1.1393, + "num_input_tokens_seen": 85934080, + "step": 10490 + }, + { + "epoch": 1.3567644398501097, + "grad_norm": 0.6428319811820984, + "learning_rate": 2.8735852650791035e-05, + "loss": 0.6548, + "num_input_tokens_seen": 86016000, + "step": 10500 + }, + { + "epoch": 1.3580565964594908, + "grad_norm": 0.6062332391738892, + "learning_rate": 2.870240049781764e-05, + "loss": 0.9466, + "num_input_tokens_seen": 86097920, + "step": 10510 + }, + { + "epoch": 1.359348753068872, + "grad_norm": 0.6514495611190796, + "learning_rate": 2.8668941565762475e-05, + "loss": 0.9157, + "num_input_tokens_seen": 86179840, + "step": 10520 + }, + { + "epoch": 1.360640909678253, + "grad_norm": 0.593705952167511, + "learning_rate": 2.8635475915888732e-05, + "loss": 0.8647, + "num_input_tokens_seen": 86261760, + "step": 10530 + }, + { + "epoch": 1.361933066287634, + "grad_norm": 0.8103840947151184, + "learning_rate": 2.8602003609471888e-05, + "loss": 0.8976, + "num_input_tokens_seen": 86343680, + "step": 10540 + }, + { + "epoch": 1.3632252228970152, + "grad_norm": 0.28389742970466614, + "learning_rate": 2.856852470779962e-05, + "loss": 1.0414, + "num_input_tokens_seen": 86425600, + "step": 10550 + }, + { + "epoch": 1.3645173795063963, + "grad_norm": 0.7390335202217102, + "learning_rate": 2.853503927217167e-05, + "loss": 0.9523, + "num_input_tokens_seen": 86507520, + "step": 10560 + }, + { + "epoch": 1.3658095361157772, + "grad_norm": 0.5908385515213013, + "learning_rate": 2.8501547363899744e-05, + "loss": 0.7521, + "num_input_tokens_seen": 86589440, + "step": 10570 + }, + { + "epoch": 1.3671016927251582, + "grad_norm": 0.8718129992485046, + "learning_rate": 2.846804904430741e-05, + "loss": 0.9591, + "num_input_tokens_seen": 86671360, + "step": 10580 + }, + { + "epoch": 1.3683938493345393, + "grad_norm": 0.6367455124855042, + "learning_rate": 2.8434544374729965e-05, + "loss": 0.9854, + "num_input_tokens_seen": 86753280, + "step": 10590 + }, + { + "epoch": 1.3696860059439204, + "grad_norm": 0.7115527391433716, + "learning_rate": 2.8401033416514345e-05, + "loss": 1.0439, + "num_input_tokens_seen": 86835200, + "step": 10600 + }, + { + "epoch": 1.3709781625533015, + "grad_norm": 0.5063609480857849, + "learning_rate": 2.8367516231018976e-05, + "loss": 0.6884, + "num_input_tokens_seen": 86917120, + "step": 10610 + }, + { + "epoch": 1.3722703191626824, + "grad_norm": 1.5641584396362305, + "learning_rate": 2.8333992879613712e-05, + "loss": 0.9114, + "num_input_tokens_seen": 86999040, + "step": 10620 + }, + { + "epoch": 1.3735624757720637, + "grad_norm": 0.7738798260688782, + "learning_rate": 2.830046342367969e-05, + "loss": 0.8389, + "num_input_tokens_seen": 87080960, + "step": 10630 + }, + { + "epoch": 1.3748546323814446, + "grad_norm": 0.6683527827262878, + "learning_rate": 2.826692792460921e-05, + "loss": 0.5361, + "num_input_tokens_seen": 87162880, + "step": 10640 + }, + { + "epoch": 1.3761467889908257, + "grad_norm": 0.26369839906692505, + "learning_rate": 2.823338644380566e-05, + "loss": 1.0727, + "num_input_tokens_seen": 87244800, + "step": 10650 + }, + { + "epoch": 1.3774389456002067, + "grad_norm": 0.6391986012458801, + "learning_rate": 2.8199839042683363e-05, + "loss": 0.5925, + "num_input_tokens_seen": 87326720, + "step": 10660 + }, + { + "epoch": 1.3787311022095878, + "grad_norm": 0.6280574798583984, + "learning_rate": 2.8166285782667483e-05, + "loss": 0.9665, + "num_input_tokens_seen": 87408640, + "step": 10670 + }, + { + "epoch": 1.380023258818969, + "grad_norm": 0.8266412615776062, + "learning_rate": 2.8132726725193926e-05, + "loss": 0.9418, + "num_input_tokens_seen": 87490560, + "step": 10680 + }, + { + "epoch": 1.3813154154283498, + "grad_norm": 0.26182371377944946, + "learning_rate": 2.8099161931709195e-05, + "loss": 0.6926, + "num_input_tokens_seen": 87572480, + "step": 10690 + }, + { + "epoch": 1.382607572037731, + "grad_norm": 0.6281419992446899, + "learning_rate": 2.806559146367031e-05, + "loss": 1.0825, + "num_input_tokens_seen": 87654400, + "step": 10700 + }, + { + "epoch": 1.383899728647112, + "grad_norm": 0.5953611135482788, + "learning_rate": 2.803201538254467e-05, + "loss": 0.63, + "num_input_tokens_seen": 87736320, + "step": 10710 + }, + { + "epoch": 1.385191885256493, + "grad_norm": 0.5723059177398682, + "learning_rate": 2.799843374980996e-05, + "loss": 0.8605, + "num_input_tokens_seen": 87818240, + "step": 10720 + }, + { + "epoch": 1.3864840418658742, + "grad_norm": 0.2304965704679489, + "learning_rate": 2.796484662695402e-05, + "loss": 1.0539, + "num_input_tokens_seen": 87900160, + "step": 10730 + }, + { + "epoch": 1.3877761984752552, + "grad_norm": 0.6087478399276733, + "learning_rate": 2.7931254075474768e-05, + "loss": 0.7748, + "num_input_tokens_seen": 87982080, + "step": 10740 + }, + { + "epoch": 1.3890683550846363, + "grad_norm": 3.60947322845459, + "learning_rate": 2.789765615688003e-05, + "loss": 0.9568, + "num_input_tokens_seen": 88064000, + "step": 10750 + }, + { + "epoch": 1.3903605116940172, + "grad_norm": 0.5466760993003845, + "learning_rate": 2.786405293268747e-05, + "loss": 1.0008, + "num_input_tokens_seen": 88145920, + "step": 10760 + }, + { + "epoch": 1.3916526683033985, + "grad_norm": 0.8872881531715393, + "learning_rate": 2.7830444464424466e-05, + "loss": 0.8518, + "num_input_tokens_seen": 88227840, + "step": 10770 + }, + { + "epoch": 1.3929448249127794, + "grad_norm": 0.9006865620613098, + "learning_rate": 2.7796830813628004e-05, + "loss": 0.8596, + "num_input_tokens_seen": 88309760, + "step": 10780 + }, + { + "epoch": 1.3942369815221605, + "grad_norm": 0.2589983344078064, + "learning_rate": 2.776321204184456e-05, + "loss": 0.9577, + "num_input_tokens_seen": 88391680, + "step": 10790 + }, + { + "epoch": 1.3955291381315416, + "grad_norm": 0.2515423893928528, + "learning_rate": 2.772958821062997e-05, + "loss": 0.9272, + "num_input_tokens_seen": 88473600, + "step": 10800 + }, + { + "epoch": 1.3968212947409226, + "grad_norm": 0.8317649364471436, + "learning_rate": 2.7695959381549364e-05, + "loss": 0.8736, + "num_input_tokens_seen": 88555520, + "step": 10810 + }, + { + "epoch": 1.3981134513503037, + "grad_norm": 0.8371811509132385, + "learning_rate": 2.7662325616176993e-05, + "loss": 0.7469, + "num_input_tokens_seen": 88637440, + "step": 10820 + }, + { + "epoch": 1.3994056079596846, + "grad_norm": 0.5307440161705017, + "learning_rate": 2.7628686976096164e-05, + "loss": 0.9801, + "num_input_tokens_seen": 88719360, + "step": 10830 + }, + { + "epoch": 1.4006977645690657, + "grad_norm": 0.5864846110343933, + "learning_rate": 2.7595043522899093e-05, + "loss": 1.1059, + "num_input_tokens_seen": 88801280, + "step": 10840 + }, + { + "epoch": 1.4019899211784468, + "grad_norm": 0.8047347068786621, + "learning_rate": 2.756139531818684e-05, + "loss": 1.1236, + "num_input_tokens_seen": 88883200, + "step": 10850 + }, + { + "epoch": 1.4032820777878279, + "grad_norm": 0.6610074043273926, + "learning_rate": 2.7527742423569124e-05, + "loss": 0.7606, + "num_input_tokens_seen": 88965120, + "step": 10860 + }, + { + "epoch": 1.404574234397209, + "grad_norm": 0.7933541536331177, + "learning_rate": 2.7494084900664273e-05, + "loss": 0.7754, + "num_input_tokens_seen": 89047040, + "step": 10870 + }, + { + "epoch": 1.40586639100659, + "grad_norm": 0.6643183827400208, + "learning_rate": 2.746042281109911e-05, + "loss": 0.9668, + "num_input_tokens_seen": 89128960, + "step": 10880 + }, + { + "epoch": 1.4071585476159711, + "grad_norm": 0.6505457758903503, + "learning_rate": 2.7426756216508776e-05, + "loss": 0.9552, + "num_input_tokens_seen": 89210880, + "step": 10890 + }, + { + "epoch": 1.408450704225352, + "grad_norm": 0.7870205640792847, + "learning_rate": 2.7393085178536686e-05, + "loss": 0.8455, + "num_input_tokens_seen": 89292800, + "step": 10900 + }, + { + "epoch": 1.409742860834733, + "grad_norm": 0.5975139737129211, + "learning_rate": 2.7359409758834397e-05, + "loss": 0.515, + "num_input_tokens_seen": 89374720, + "step": 10910 + }, + { + "epoch": 1.4110350174441142, + "grad_norm": 0.8741236329078674, + "learning_rate": 2.7325730019061474e-05, + "loss": 0.5575, + "num_input_tokens_seen": 89456640, + "step": 10920 + }, + { + "epoch": 1.4123271740534953, + "grad_norm": 0.6208974719047546, + "learning_rate": 2.729204602088539e-05, + "loss": 0.7631, + "num_input_tokens_seen": 89538560, + "step": 10930 + }, + { + "epoch": 1.4136193306628764, + "grad_norm": 0.6551584005355835, + "learning_rate": 2.7258357825981433e-05, + "loss": 0.8117, + "num_input_tokens_seen": 89620480, + "step": 10940 + }, + { + "epoch": 1.4149114872722575, + "grad_norm": 1.0060181617736816, + "learning_rate": 2.7224665496032565e-05, + "loss": 0.7802, + "num_input_tokens_seen": 89702400, + "step": 10950 + }, + { + "epoch": 1.4162036438816386, + "grad_norm": 0.6730552315711975, + "learning_rate": 2.7190969092729308e-05, + "loss": 0.8345, + "num_input_tokens_seen": 89784320, + "step": 10960 + }, + { + "epoch": 1.4174958004910194, + "grad_norm": 0.5672726035118103, + "learning_rate": 2.7157268677769666e-05, + "loss": 1.1491, + "num_input_tokens_seen": 89866240, + "step": 10970 + }, + { + "epoch": 1.4187879571004005, + "grad_norm": 0.6032657623291016, + "learning_rate": 2.712356431285896e-05, + "loss": 1.3288, + "num_input_tokens_seen": 89948160, + "step": 10980 + }, + { + "epoch": 1.4200801137097816, + "grad_norm": 0.6020485758781433, + "learning_rate": 2.7089856059709774e-05, + "loss": 0.8851, + "num_input_tokens_seen": 90030080, + "step": 10990 + }, + { + "epoch": 1.4213722703191627, + "grad_norm": 0.7019742727279663, + "learning_rate": 2.7056143980041787e-05, + "loss": 0.7634, + "num_input_tokens_seen": 90112000, + "step": 11000 + }, + { + "epoch": 1.4226644269285438, + "grad_norm": 0.6318424940109253, + "learning_rate": 2.70224281355817e-05, + "loss": 0.8556, + "num_input_tokens_seen": 90193920, + "step": 11010 + }, + { + "epoch": 1.4239565835379249, + "grad_norm": 0.5350080132484436, + "learning_rate": 2.6988708588063093e-05, + "loss": 0.7443, + "num_input_tokens_seen": 90275840, + "step": 11020 + }, + { + "epoch": 1.425248740147306, + "grad_norm": 0.6152655482292175, + "learning_rate": 2.695498539922634e-05, + "loss": 0.9481, + "num_input_tokens_seen": 90357760, + "step": 11030 + }, + { + "epoch": 1.4265408967566868, + "grad_norm": 0.9493053555488586, + "learning_rate": 2.6921258630818475e-05, + "loss": 0.817, + "num_input_tokens_seen": 90439680, + "step": 11040 + }, + { + "epoch": 1.427833053366068, + "grad_norm": 8.474701881408691, + "learning_rate": 2.6887528344593087e-05, + "loss": 0.6403, + "num_input_tokens_seen": 90521600, + "step": 11050 + }, + { + "epoch": 1.429125209975449, + "grad_norm": 0.1764688491821289, + "learning_rate": 2.685379460231021e-05, + "loss": 0.8245, + "num_input_tokens_seen": 90603520, + "step": 11060 + }, + { + "epoch": 1.43041736658483, + "grad_norm": 1.4876654148101807, + "learning_rate": 2.6820057465736197e-05, + "loss": 1.2493, + "num_input_tokens_seen": 90685440, + "step": 11070 + }, + { + "epoch": 1.4317095231942112, + "grad_norm": 0.8256916403770447, + "learning_rate": 2.6786316996643623e-05, + "loss": 0.9927, + "num_input_tokens_seen": 90767360, + "step": 11080 + }, + { + "epoch": 1.4330016798035923, + "grad_norm": 0.5362711548805237, + "learning_rate": 2.6752573256811165e-05, + "loss": 0.9965, + "num_input_tokens_seen": 90849280, + "step": 11090 + }, + { + "epoch": 1.4342938364129734, + "grad_norm": 0.6876575350761414, + "learning_rate": 2.6718826308023487e-05, + "loss": 0.8742, + "num_input_tokens_seen": 90931200, + "step": 11100 + }, + { + "epoch": 1.4355859930223542, + "grad_norm": 0.5102024674415588, + "learning_rate": 2.668507621207113e-05, + "loss": 0.9033, + "num_input_tokens_seen": 91013120, + "step": 11110 + }, + { + "epoch": 1.4368781496317353, + "grad_norm": 1.2666608095169067, + "learning_rate": 2.6651323030750396e-05, + "loss": 0.7038, + "num_input_tokens_seen": 91095040, + "step": 11120 + }, + { + "epoch": 1.4381703062411164, + "grad_norm": 0.5506048202514648, + "learning_rate": 2.6617566825863237e-05, + "loss": 0.7839, + "num_input_tokens_seen": 91176960, + "step": 11130 + }, + { + "epoch": 1.4394624628504975, + "grad_norm": 0.5495603084564209, + "learning_rate": 2.6583807659217137e-05, + "loss": 0.7871, + "num_input_tokens_seen": 91258880, + "step": 11140 + }, + { + "epoch": 1.4407546194598786, + "grad_norm": 0.5707017183303833, + "learning_rate": 2.6550045592625007e-05, + "loss": 1.0343, + "num_input_tokens_seen": 91340800, + "step": 11150 + }, + { + "epoch": 1.4420467760692595, + "grad_norm": 0.6786594390869141, + "learning_rate": 2.651628068790507e-05, + "loss": 1.1501, + "num_input_tokens_seen": 91422720, + "step": 11160 + }, + { + "epoch": 1.4433389326786408, + "grad_norm": 1.807134747505188, + "learning_rate": 2.648251300688073e-05, + "loss": 0.8592, + "num_input_tokens_seen": 91504640, + "step": 11170 + }, + { + "epoch": 1.4446310892880216, + "grad_norm": 0.5651780366897583, + "learning_rate": 2.6448742611380515e-05, + "loss": 0.8201, + "num_input_tokens_seen": 91586560, + "step": 11180 + }, + { + "epoch": 1.4459232458974027, + "grad_norm": 0.630124568939209, + "learning_rate": 2.6414969563237874e-05, + "loss": 1.074, + "num_input_tokens_seen": 91668480, + "step": 11190 + }, + { + "epoch": 1.4472154025067838, + "grad_norm": 0.7884289026260376, + "learning_rate": 2.6381193924291143e-05, + "loss": 0.8012, + "num_input_tokens_seen": 91750400, + "step": 11200 + }, + { + "epoch": 1.448507559116165, + "grad_norm": 0.6598234176635742, + "learning_rate": 2.63474157563834e-05, + "loss": 1.1103, + "num_input_tokens_seen": 91832320, + "step": 11210 + }, + { + "epoch": 1.449799715725546, + "grad_norm": 0.7381977438926697, + "learning_rate": 2.6313635121362322e-05, + "loss": 0.6353, + "num_input_tokens_seen": 91914240, + "step": 11220 + }, + { + "epoch": 1.4510918723349269, + "grad_norm": 0.29279613494873047, + "learning_rate": 2.6279852081080153e-05, + "loss": 0.8131, + "num_input_tokens_seen": 91996160, + "step": 11230 + }, + { + "epoch": 1.4523840289443082, + "grad_norm": 0.6259168386459351, + "learning_rate": 2.6246066697393494e-05, + "loss": 0.8539, + "num_input_tokens_seen": 92078080, + "step": 11240 + }, + { + "epoch": 1.453676185553689, + "grad_norm": 0.2948238253593445, + "learning_rate": 2.6212279032163283e-05, + "loss": 0.799, + "num_input_tokens_seen": 92160000, + "step": 11250 + }, + { + "epoch": 1.4549683421630701, + "grad_norm": 0.8171486854553223, + "learning_rate": 2.6178489147254598e-05, + "loss": 0.5722, + "num_input_tokens_seen": 92241920, + "step": 11260 + }, + { + "epoch": 1.4562604987724512, + "grad_norm": 0.7403898239135742, + "learning_rate": 2.6144697104536597e-05, + "loss": 0.8796, + "num_input_tokens_seen": 92323840, + "step": 11270 + }, + { + "epoch": 1.4575526553818323, + "grad_norm": 0.6193887591362, + "learning_rate": 2.6110902965882383e-05, + "loss": 1.1459, + "num_input_tokens_seen": 92405760, + "step": 11280 + }, + { + "epoch": 1.4588448119912134, + "grad_norm": 0.9331411719322205, + "learning_rate": 2.607710679316891e-05, + "loss": 0.7492, + "num_input_tokens_seen": 92487680, + "step": 11290 + }, + { + "epoch": 1.4601369686005943, + "grad_norm": 0.6176254153251648, + "learning_rate": 2.6043308648276833e-05, + "loss": 0.9672, + "num_input_tokens_seen": 92569600, + "step": 11300 + }, + { + "epoch": 1.4614291252099754, + "grad_norm": 0.836600661277771, + "learning_rate": 2.6009508593090448e-05, + "loss": 0.606, + "num_input_tokens_seen": 92651520, + "step": 11310 + }, + { + "epoch": 1.4627212818193565, + "grad_norm": 0.6364132165908813, + "learning_rate": 2.5975706689497513e-05, + "loss": 0.7642, + "num_input_tokens_seen": 92733440, + "step": 11320 + }, + { + "epoch": 1.4640134384287375, + "grad_norm": 0.6128113865852356, + "learning_rate": 2.59419029993892e-05, + "loss": 0.8852, + "num_input_tokens_seen": 92815360, + "step": 11330 + }, + { + "epoch": 1.4653055950381186, + "grad_norm": 0.6446326375007629, + "learning_rate": 2.590809758465995e-05, + "loss": 1.1366, + "num_input_tokens_seen": 92897280, + "step": 11340 + }, + { + "epoch": 1.4665977516474997, + "grad_norm": 0.9294217824935913, + "learning_rate": 2.5874290507207337e-05, + "loss": 1.004, + "num_input_tokens_seen": 92979200, + "step": 11350 + }, + { + "epoch": 1.4678899082568808, + "grad_norm": 0.5920690298080444, + "learning_rate": 2.584048182893201e-05, + "loss": 1.1198, + "num_input_tokens_seen": 93061120, + "step": 11360 + }, + { + "epoch": 1.4691820648662617, + "grad_norm": 0.5159122943878174, + "learning_rate": 2.580667161173753e-05, + "loss": 0.7811, + "num_input_tokens_seen": 93143040, + "step": 11370 + }, + { + "epoch": 1.4704742214756428, + "grad_norm": 0.8452631235122681, + "learning_rate": 2.577285991753028e-05, + "loss": 1.0665, + "num_input_tokens_seen": 93224960, + "step": 11380 + }, + { + "epoch": 1.4717663780850239, + "grad_norm": 0.6538841724395752, + "learning_rate": 2.5739046808219348e-05, + "loss": 0.9232, + "num_input_tokens_seen": 93306880, + "step": 11390 + }, + { + "epoch": 1.473058534694405, + "grad_norm": 0.7873169779777527, + "learning_rate": 2.570523234571642e-05, + "loss": 0.8771, + "num_input_tokens_seen": 93388800, + "step": 11400 + }, + { + "epoch": 1.474350691303786, + "grad_norm": 0.36491021513938904, + "learning_rate": 2.5671416591935636e-05, + "loss": 0.7952, + "num_input_tokens_seen": 93470720, + "step": 11410 + }, + { + "epoch": 1.4756428479131671, + "grad_norm": 0.5437069535255432, + "learning_rate": 2.563759960879354e-05, + "loss": 0.8034, + "num_input_tokens_seen": 93552640, + "step": 11420 + }, + { + "epoch": 1.4769350045225482, + "grad_norm": 0.3457854986190796, + "learning_rate": 2.5603781458208885e-05, + "loss": 0.9861, + "num_input_tokens_seen": 93634560, + "step": 11430 + }, + { + "epoch": 1.478227161131929, + "grad_norm": 0.6639708280563354, + "learning_rate": 2.55699622021026e-05, + "loss": 0.9154, + "num_input_tokens_seen": 93716480, + "step": 11440 + }, + { + "epoch": 1.4795193177413102, + "grad_norm": 0.5724432468414307, + "learning_rate": 2.55361419023976e-05, + "loss": 0.696, + "num_input_tokens_seen": 93798400, + "step": 11450 + }, + { + "epoch": 1.4808114743506913, + "grad_norm": 0.8275769948959351, + "learning_rate": 2.5502320621018732e-05, + "loss": 1.0897, + "num_input_tokens_seen": 93880320, + "step": 11460 + }, + { + "epoch": 1.4821036309600724, + "grad_norm": 0.6131581664085388, + "learning_rate": 2.5468498419892656e-05, + "loss": 0.9171, + "num_input_tokens_seen": 93962240, + "step": 11470 + }, + { + "epoch": 1.4833957875694535, + "grad_norm": 1.2641193866729736, + "learning_rate": 2.5434675360947692e-05, + "loss": 0.875, + "num_input_tokens_seen": 94044160, + "step": 11480 + }, + { + "epoch": 1.4846879441788345, + "grad_norm": 0.8318386673927307, + "learning_rate": 2.5400851506113728e-05, + "loss": 0.7646, + "num_input_tokens_seen": 94126080, + "step": 11490 + }, + { + "epoch": 1.4859801007882156, + "grad_norm": 0.6216618418693542, + "learning_rate": 2.5367026917322117e-05, + "loss": 0.9129, + "num_input_tokens_seen": 94208000, + "step": 11500 + }, + { + "epoch": 1.4872722573975965, + "grad_norm": 0.6390239000320435, + "learning_rate": 2.5333201656505567e-05, + "loss": 1.2751, + "num_input_tokens_seen": 94289920, + "step": 11510 + }, + { + "epoch": 1.4885644140069776, + "grad_norm": 0.8883216381072998, + "learning_rate": 2.5299375785598005e-05, + "loss": 0.8298, + "num_input_tokens_seen": 94371840, + "step": 11520 + }, + { + "epoch": 1.4898565706163587, + "grad_norm": 0.4036071300506592, + "learning_rate": 2.5265549366534475e-05, + "loss": 1.0023, + "num_input_tokens_seen": 94453760, + "step": 11530 + }, + { + "epoch": 1.4911487272257398, + "grad_norm": 0.9503811597824097, + "learning_rate": 2.5231722461251017e-05, + "loss": 0.8267, + "num_input_tokens_seen": 94535680, + "step": 11540 + }, + { + "epoch": 1.4924408838351209, + "grad_norm": 0.9378893971443176, + "learning_rate": 2.519789513168459e-05, + "loss": 0.6509, + "num_input_tokens_seen": 94617600, + "step": 11550 + }, + { + "epoch": 1.493733040444502, + "grad_norm": 0.6294723153114319, + "learning_rate": 2.5164067439772898e-05, + "loss": 0.7988, + "num_input_tokens_seen": 94699520, + "step": 11560 + }, + { + "epoch": 1.495025197053883, + "grad_norm": 2.2070484161376953, + "learning_rate": 2.5130239447454328e-05, + "loss": 0.7822, + "num_input_tokens_seen": 94781440, + "step": 11570 + }, + { + "epoch": 1.496317353663264, + "grad_norm": 0.5767642259597778, + "learning_rate": 2.509641121666781e-05, + "loss": 0.9319, + "num_input_tokens_seen": 94863360, + "step": 11580 + }, + { + "epoch": 1.497609510272645, + "grad_norm": 0.5216675400733948, + "learning_rate": 2.5062582809352704e-05, + "loss": 0.8723, + "num_input_tokens_seen": 94945280, + "step": 11590 + }, + { + "epoch": 1.498901666882026, + "grad_norm": 0.5917890667915344, + "learning_rate": 2.5028754287448695e-05, + "loss": 1.0985, + "num_input_tokens_seen": 95027200, + "step": 11600 + }, + { + "epoch": 1.5001938234914072, + "grad_norm": 0.5695708394050598, + "learning_rate": 2.4994925712895697e-05, + "loss": 0.898, + "num_input_tokens_seen": 95109120, + "step": 11610 + }, + { + "epoch": 1.5014859801007883, + "grad_norm": 0.6653333902359009, + "learning_rate": 2.4961097147633698e-05, + "loss": 1.2631, + "num_input_tokens_seen": 95191040, + "step": 11620 + }, + { + "epoch": 1.5027781367101691, + "grad_norm": 0.7980930209159851, + "learning_rate": 2.4927268653602684e-05, + "loss": 0.9865, + "num_input_tokens_seen": 95272960, + "step": 11630 + }, + { + "epoch": 1.5040702933195504, + "grad_norm": 0.27931126952171326, + "learning_rate": 2.489344029274249e-05, + "loss": 1.063, + "num_input_tokens_seen": 95354880, + "step": 11640 + }, + { + "epoch": 1.5053624499289313, + "grad_norm": 0.4017459452152252, + "learning_rate": 2.4859612126992737e-05, + "loss": 0.4969, + "num_input_tokens_seen": 95436800, + "step": 11650 + }, + { + "epoch": 1.5066546065383124, + "grad_norm": 0.9155080318450928, + "learning_rate": 2.4825784218292664e-05, + "loss": 0.6354, + "num_input_tokens_seen": 95518720, + "step": 11660 + }, + { + "epoch": 1.5079467631476935, + "grad_norm": 0.7411128282546997, + "learning_rate": 2.479195662858105e-05, + "loss": 0.9183, + "num_input_tokens_seen": 95600640, + "step": 11670 + }, + { + "epoch": 1.5092389197570746, + "grad_norm": 1.0255645513534546, + "learning_rate": 2.4758129419796094e-05, + "loss": 0.8669, + "num_input_tokens_seen": 95682560, + "step": 11680 + }, + { + "epoch": 1.5105310763664557, + "grad_norm": 0.612771213054657, + "learning_rate": 2.4724302653875275e-05, + "loss": 1.1856, + "num_input_tokens_seen": 95764480, + "step": 11690 + }, + { + "epoch": 1.5118232329758365, + "grad_norm": 0.7555944919586182, + "learning_rate": 2.4690476392755298e-05, + "loss": 0.7345, + "num_input_tokens_seen": 95846400, + "step": 11700 + }, + { + "epoch": 1.5131153895852179, + "grad_norm": 0.7884678244590759, + "learning_rate": 2.4656650698371903e-05, + "loss": 0.5009, + "num_input_tokens_seen": 95928320, + "step": 11710 + }, + { + "epoch": 1.5144075461945987, + "grad_norm": 0.5397049188613892, + "learning_rate": 2.462282563265982e-05, + "loss": 0.7891, + "num_input_tokens_seen": 96010240, + "step": 11720 + }, + { + "epoch": 1.5156997028039798, + "grad_norm": 0.6234694123268127, + "learning_rate": 2.4589001257552637e-05, + "loss": 0.6393, + "num_input_tokens_seen": 96092160, + "step": 11730 + }, + { + "epoch": 1.516991859413361, + "grad_norm": 0.6878401637077332, + "learning_rate": 2.455517763498264e-05, + "loss": 0.8309, + "num_input_tokens_seen": 96174080, + "step": 11740 + }, + { + "epoch": 1.518284016022742, + "grad_norm": 0.23810110986232758, + "learning_rate": 2.452135482688077e-05, + "loss": 0.7155, + "num_input_tokens_seen": 96256000, + "step": 11750 + }, + { + "epoch": 1.519576172632123, + "grad_norm": 0.5660243630409241, + "learning_rate": 2.4487532895176457e-05, + "loss": 0.8101, + "num_input_tokens_seen": 96337920, + "step": 11760 + }, + { + "epoch": 1.520868329241504, + "grad_norm": 0.9258086085319519, + "learning_rate": 2.4453711901797543e-05, + "loss": 0.5187, + "num_input_tokens_seen": 96419840, + "step": 11770 + }, + { + "epoch": 1.5221604858508853, + "grad_norm": 0.5753411054611206, + "learning_rate": 2.4419891908670127e-05, + "loss": 1.1635, + "num_input_tokens_seen": 96501760, + "step": 11780 + }, + { + "epoch": 1.5234526424602661, + "grad_norm": 0.6174433827400208, + "learning_rate": 2.4386072977718503e-05, + "loss": 0.8433, + "num_input_tokens_seen": 96583680, + "step": 11790 + }, + { + "epoch": 1.5247447990696472, + "grad_norm": 0.5415393710136414, + "learning_rate": 2.4352255170865025e-05, + "loss": 0.9885, + "num_input_tokens_seen": 96665600, + "step": 11800 + }, + { + "epoch": 1.5260369556790283, + "grad_norm": 0.5386336445808411, + "learning_rate": 2.4318438550029946e-05, + "loss": 1.0425, + "num_input_tokens_seen": 96747520, + "step": 11810 + }, + { + "epoch": 1.5273291122884094, + "grad_norm": 0.721867561340332, + "learning_rate": 2.4284623177131395e-05, + "loss": 0.6342, + "num_input_tokens_seen": 96829440, + "step": 11820 + }, + { + "epoch": 1.5286212688977905, + "grad_norm": 0.5259259343147278, + "learning_rate": 2.4250809114085183e-05, + "loss": 1.1709, + "num_input_tokens_seen": 96911360, + "step": 11830 + }, + { + "epoch": 1.5299134255071714, + "grad_norm": 0.7975210547447205, + "learning_rate": 2.421699642280475e-05, + "loss": 1.0094, + "num_input_tokens_seen": 96993280, + "step": 11840 + }, + { + "epoch": 1.5312055821165527, + "grad_norm": 0.5395787954330444, + "learning_rate": 2.4183185165200998e-05, + "loss": 0.7204, + "num_input_tokens_seen": 97075200, + "step": 11850 + }, + { + "epoch": 1.5324977387259335, + "grad_norm": 0.6701288223266602, + "learning_rate": 2.4149375403182216e-05, + "loss": 0.7901, + "num_input_tokens_seen": 97157120, + "step": 11860 + }, + { + "epoch": 1.5337898953353146, + "grad_norm": 0.7676398158073425, + "learning_rate": 2.4115567198653963e-05, + "loss": 1.0571, + "num_input_tokens_seen": 97239040, + "step": 11870 + }, + { + "epoch": 1.5350820519446957, + "grad_norm": 0.38611501455307007, + "learning_rate": 2.4081760613518924e-05, + "loss": 0.5656, + "num_input_tokens_seen": 97320960, + "step": 11880 + }, + { + "epoch": 1.5363742085540768, + "grad_norm": 0.5478717088699341, + "learning_rate": 2.4047955709676852e-05, + "loss": 1.2245, + "num_input_tokens_seen": 97402880, + "step": 11890 + }, + { + "epoch": 1.537666365163458, + "grad_norm": 0.5873267650604248, + "learning_rate": 2.401415254902438e-05, + "loss": 0.9675, + "num_input_tokens_seen": 97484800, + "step": 11900 + }, + { + "epoch": 1.5389585217728388, + "grad_norm": 0.5424181222915649, + "learning_rate": 2.3980351193455e-05, + "loss": 1.0065, + "num_input_tokens_seen": 97566720, + "step": 11910 + }, + { + "epoch": 1.54025067838222, + "grad_norm": 0.47638383507728577, + "learning_rate": 2.3946551704858838e-05, + "loss": 0.8522, + "num_input_tokens_seen": 97648640, + "step": 11920 + }, + { + "epoch": 1.541542834991601, + "grad_norm": 0.6472703218460083, + "learning_rate": 2.3912754145122663e-05, + "loss": 0.9128, + "num_input_tokens_seen": 97730560, + "step": 11930 + }, + { + "epoch": 1.542834991600982, + "grad_norm": 0.6344046592712402, + "learning_rate": 2.3878958576129664e-05, + "loss": 0.8387, + "num_input_tokens_seen": 97812480, + "step": 11940 + }, + { + "epoch": 1.5441271482103631, + "grad_norm": 0.8018627762794495, + "learning_rate": 2.3845165059759402e-05, + "loss": 0.9149, + "num_input_tokens_seen": 97894400, + "step": 11950 + }, + { + "epoch": 1.545419304819744, + "grad_norm": 0.398837685585022, + "learning_rate": 2.3811373657887705e-05, + "loss": 0.8989, + "num_input_tokens_seen": 97976320, + "step": 11960 + }, + { + "epoch": 1.5467114614291253, + "grad_norm": 0.6712442636489868, + "learning_rate": 2.3777584432386474e-05, + "loss": 0.8954, + "num_input_tokens_seen": 98058240, + "step": 11970 + }, + { + "epoch": 1.5480036180385062, + "grad_norm": 0.685006856918335, + "learning_rate": 2.3743797445123688e-05, + "loss": 0.9203, + "num_input_tokens_seen": 98140160, + "step": 11980 + }, + { + "epoch": 1.5492957746478875, + "grad_norm": 1.1985323429107666, + "learning_rate": 2.3710012757963175e-05, + "loss": 1.0436, + "num_input_tokens_seen": 98222080, + "step": 11990 + }, + { + "epoch": 1.5505879312572683, + "grad_norm": 0.6150766015052795, + "learning_rate": 2.367623043276459e-05, + "loss": 1.0798, + "num_input_tokens_seen": 98304000, + "step": 12000 + }, + { + "epoch": 1.5518800878666494, + "grad_norm": 0.5276364684104919, + "learning_rate": 2.364245053138323e-05, + "loss": 1.0669, + "num_input_tokens_seen": 98385920, + "step": 12010 + }, + { + "epoch": 1.5531722444760305, + "grad_norm": 0.291031152009964, + "learning_rate": 2.3608673115669978e-05, + "loss": 0.8337, + "num_input_tokens_seen": 98467840, + "step": 12020 + }, + { + "epoch": 1.5544644010854114, + "grad_norm": 0.4851202964782715, + "learning_rate": 2.3574898247471167e-05, + "loss": 1.2425, + "num_input_tokens_seen": 98549760, + "step": 12030 + }, + { + "epoch": 1.5557565576947927, + "grad_norm": 0.6985998153686523, + "learning_rate": 2.354112598862845e-05, + "loss": 0.9847, + "num_input_tokens_seen": 98631680, + "step": 12040 + }, + { + "epoch": 1.5570487143041736, + "grad_norm": 0.26096710562705994, + "learning_rate": 2.350735640097871e-05, + "loss": 0.79, + "num_input_tokens_seen": 98713600, + "step": 12050 + }, + { + "epoch": 1.5583408709135549, + "grad_norm": 0.830028772354126, + "learning_rate": 2.347358954635393e-05, + "loss": 0.8655, + "num_input_tokens_seen": 98795520, + "step": 12060 + }, + { + "epoch": 1.5596330275229358, + "grad_norm": 0.9304326772689819, + "learning_rate": 2.3439825486581116e-05, + "loss": 0.7855, + "num_input_tokens_seen": 98877440, + "step": 12070 + }, + { + "epoch": 1.5609251841323168, + "grad_norm": 0.505251407623291, + "learning_rate": 2.3406064283482115e-05, + "loss": 0.911, + "num_input_tokens_seen": 98959360, + "step": 12080 + }, + { + "epoch": 1.562217340741698, + "grad_norm": 0.6220946907997131, + "learning_rate": 2.337230599887358e-05, + "loss": 0.6987, + "num_input_tokens_seen": 99041280, + "step": 12090 + }, + { + "epoch": 1.5635094973510788, + "grad_norm": 1.1672903299331665, + "learning_rate": 2.3338550694566817e-05, + "loss": 0.7693, + "num_input_tokens_seen": 99123200, + "step": 12100 + }, + { + "epoch": 1.5648016539604601, + "grad_norm": 0.5260895490646362, + "learning_rate": 2.3304798432367645e-05, + "loss": 0.5875, + "num_input_tokens_seen": 99205120, + "step": 12110 + }, + { + "epoch": 1.566093810569841, + "grad_norm": 0.7243925333023071, + "learning_rate": 2.327104927407634e-05, + "loss": 1.1252, + "num_input_tokens_seen": 99287040, + "step": 12120 + }, + { + "epoch": 1.567385967179222, + "grad_norm": 0.5413958430290222, + "learning_rate": 2.3237303281487487e-05, + "loss": 0.8912, + "num_input_tokens_seen": 99368960, + "step": 12130 + }, + { + "epoch": 1.5686781237886032, + "grad_norm": 0.6078057885169983, + "learning_rate": 2.3203560516389882e-05, + "loss": 0.6356, + "num_input_tokens_seen": 99450880, + "step": 12140 + }, + { + "epoch": 1.5699702803979843, + "grad_norm": 0.9383816719055176, + "learning_rate": 2.3169821040566387e-05, + "loss": 1.1298, + "num_input_tokens_seen": 99532800, + "step": 12150 + }, + { + "epoch": 1.5712624370073653, + "grad_norm": 0.6377266049385071, + "learning_rate": 2.313608491579387e-05, + "loss": 1.0096, + "num_input_tokens_seen": 99614720, + "step": 12160 + }, + { + "epoch": 1.5725545936167462, + "grad_norm": 0.5839458703994751, + "learning_rate": 2.3102352203843063e-05, + "loss": 0.7456, + "num_input_tokens_seen": 99696640, + "step": 12170 + }, + { + "epoch": 1.5738467502261275, + "grad_norm": 0.5719941854476929, + "learning_rate": 2.306862296647841e-05, + "loss": 0.8698, + "num_input_tokens_seen": 99778560, + "step": 12180 + }, + { + "epoch": 1.5751389068355084, + "grad_norm": 0.7049387693405151, + "learning_rate": 2.3034897265458056e-05, + "loss": 0.6883, + "num_input_tokens_seen": 99860480, + "step": 12190 + }, + { + "epoch": 1.5764310634448895, + "grad_norm": 0.5691668391227722, + "learning_rate": 2.3001175162533606e-05, + "loss": 0.8952, + "num_input_tokens_seen": 99942400, + "step": 12200 + }, + { + "epoch": 1.5777232200542706, + "grad_norm": 0.9318535327911377, + "learning_rate": 2.2967456719450127e-05, + "loss": 0.7274, + "num_input_tokens_seen": 100024320, + "step": 12210 + }, + { + "epoch": 1.5790153766636517, + "grad_norm": 0.4153442680835724, + "learning_rate": 2.2933741997945954e-05, + "loss": 0.8773, + "num_input_tokens_seen": 100106240, + "step": 12220 + }, + { + "epoch": 1.5803075332730327, + "grad_norm": 0.7407699227333069, + "learning_rate": 2.290003105975262e-05, + "loss": 0.9009, + "num_input_tokens_seen": 100188160, + "step": 12230 + }, + { + "epoch": 1.5815996898824136, + "grad_norm": 0.6650556325912476, + "learning_rate": 2.2866323966594736e-05, + "loss": 0.6566, + "num_input_tokens_seen": 100270080, + "step": 12240 + }, + { + "epoch": 1.582891846491795, + "grad_norm": 0.2326425462961197, + "learning_rate": 2.283262078018985e-05, + "loss": 0.7216, + "num_input_tokens_seen": 100352000, + "step": 12250 + }, + { + "epoch": 1.5841840031011758, + "grad_norm": 0.9894224405288696, + "learning_rate": 2.27989215622484e-05, + "loss": 0.9039, + "num_input_tokens_seen": 100433920, + "step": 12260 + }, + { + "epoch": 1.5854761597105569, + "grad_norm": 0.6276862621307373, + "learning_rate": 2.2765226374473504e-05, + "loss": 0.9027, + "num_input_tokens_seen": 100515840, + "step": 12270 + }, + { + "epoch": 1.586768316319938, + "grad_norm": 0.33717378973960876, + "learning_rate": 2.2731535278560944e-05, + "loss": 0.7404, + "num_input_tokens_seen": 100597760, + "step": 12280 + }, + { + "epoch": 1.588060472929319, + "grad_norm": 0.8220341205596924, + "learning_rate": 2.269784833619898e-05, + "loss": 0.8567, + "num_input_tokens_seen": 100679680, + "step": 12290 + }, + { + "epoch": 1.5893526295387002, + "grad_norm": 0.7151164412498474, + "learning_rate": 2.2664165609068304e-05, + "loss": 0.9177, + "num_input_tokens_seen": 100761600, + "step": 12300 + }, + { + "epoch": 1.590644786148081, + "grad_norm": 0.5890063047409058, + "learning_rate": 2.263048715884184e-05, + "loss": 0.8511, + "num_input_tokens_seen": 100843520, + "step": 12310 + }, + { + "epoch": 1.5919369427574623, + "grad_norm": 0.919540286064148, + "learning_rate": 2.2596813047184715e-05, + "loss": 0.8876, + "num_input_tokens_seen": 100925440, + "step": 12320 + }, + { + "epoch": 1.5932290993668432, + "grad_norm": 0.8143252730369568, + "learning_rate": 2.2563143335754118e-05, + "loss": 0.9139, + "num_input_tokens_seen": 101007360, + "step": 12330 + }, + { + "epoch": 1.5945212559762243, + "grad_norm": 0.48788896203041077, + "learning_rate": 2.252947808619914e-05, + "loss": 0.5727, + "num_input_tokens_seen": 101089280, + "step": 12340 + }, + { + "epoch": 1.5958134125856054, + "grad_norm": 0.5264637470245361, + "learning_rate": 2.249581736016076e-05, + "loss": 0.7672, + "num_input_tokens_seen": 101171200, + "step": 12350 + }, + { + "epoch": 1.5971055691949865, + "grad_norm": 0.5975572466850281, + "learning_rate": 2.2462161219271622e-05, + "loss": 0.8705, + "num_input_tokens_seen": 101253120, + "step": 12360 + }, + { + "epoch": 1.5983977258043676, + "grad_norm": 0.2425682097673416, + "learning_rate": 2.242850972515601e-05, + "loss": 0.8957, + "num_input_tokens_seen": 101335040, + "step": 12370 + }, + { + "epoch": 1.5996898824137484, + "grad_norm": 0.5979612469673157, + "learning_rate": 2.2394862939429677e-05, + "loss": 1.0392, + "num_input_tokens_seen": 101416960, + "step": 12380 + }, + { + "epoch": 1.6009820390231297, + "grad_norm": 0.6197012066841125, + "learning_rate": 2.236122092369977e-05, + "loss": 0.7616, + "num_input_tokens_seen": 101498880, + "step": 12390 + }, + { + "epoch": 1.6022741956325106, + "grad_norm": 0.734671413898468, + "learning_rate": 2.2327583739564696e-05, + "loss": 1.1416, + "num_input_tokens_seen": 101580800, + "step": 12400 + }, + { + "epoch": 1.6035663522418917, + "grad_norm": 0.4598730504512787, + "learning_rate": 2.229395144861402e-05, + "loss": 0.8276, + "num_input_tokens_seen": 101662720, + "step": 12410 + }, + { + "epoch": 1.6048585088512728, + "grad_norm": 0.7918890714645386, + "learning_rate": 2.2260324112428336e-05, + "loss": 1.044, + "num_input_tokens_seen": 101744640, + "step": 12420 + }, + { + "epoch": 1.6061506654606539, + "grad_norm": 1.4990414381027222, + "learning_rate": 2.2226701792579176e-05, + "loss": 0.5725, + "num_input_tokens_seen": 101826560, + "step": 12430 + }, + { + "epoch": 1.607442822070035, + "grad_norm": 0.7049417495727539, + "learning_rate": 2.219308455062889e-05, + "loss": 0.8819, + "num_input_tokens_seen": 101908480, + "step": 12440 + }, + { + "epoch": 1.6087349786794158, + "grad_norm": 1.0287914276123047, + "learning_rate": 2.2159472448130513e-05, + "loss": 0.6064, + "num_input_tokens_seen": 101990400, + "step": 12450 + }, + { + "epoch": 1.6100271352887972, + "grad_norm": 0.6095555424690247, + "learning_rate": 2.212586554662769e-05, + "loss": 0.7968, + "num_input_tokens_seen": 102072320, + "step": 12460 + }, + { + "epoch": 1.611319291898178, + "grad_norm": 0.7661923170089722, + "learning_rate": 2.2092263907654544e-05, + "loss": 0.6481, + "num_input_tokens_seen": 102154240, + "step": 12470 + }, + { + "epoch": 1.612611448507559, + "grad_norm": 0.6737602353096008, + "learning_rate": 2.2058667592735532e-05, + "loss": 0.722, + "num_input_tokens_seen": 102236160, + "step": 12480 + }, + { + "epoch": 1.6139036051169402, + "grad_norm": 0.5054346919059753, + "learning_rate": 2.20250766633854e-05, + "loss": 1.321, + "num_input_tokens_seen": 102318080, + "step": 12490 + }, + { + "epoch": 1.615195761726321, + "grad_norm": 0.5641369819641113, + "learning_rate": 2.199149118110901e-05, + "loss": 0.6964, + "num_input_tokens_seen": 102400000, + "step": 12500 + }, + { + "epoch": 1.6164879183357024, + "grad_norm": 0.27639254927635193, + "learning_rate": 2.1957911207401267e-05, + "loss": 0.965, + "num_input_tokens_seen": 102481920, + "step": 12510 + }, + { + "epoch": 1.6177800749450832, + "grad_norm": 0.8085689544677734, + "learning_rate": 2.192433680374696e-05, + "loss": 0.6768, + "num_input_tokens_seen": 102563840, + "step": 12520 + }, + { + "epoch": 1.6190722315544646, + "grad_norm": 0.7601416110992432, + "learning_rate": 2.1890768031620705e-05, + "loss": 0.975, + "num_input_tokens_seen": 102645760, + "step": 12530 + }, + { + "epoch": 1.6203643881638454, + "grad_norm": 0.615585207939148, + "learning_rate": 2.1857204952486824e-05, + "loss": 1.0603, + "num_input_tokens_seen": 102727680, + "step": 12540 + }, + { + "epoch": 1.6216565447732265, + "grad_norm": 0.7139294147491455, + "learning_rate": 2.182364762779916e-05, + "loss": 0.737, + "num_input_tokens_seen": 102809600, + "step": 12550 + }, + { + "epoch": 1.6229487013826076, + "grad_norm": 0.38794225454330444, + "learning_rate": 2.1790096119001077e-05, + "loss": 0.8256, + "num_input_tokens_seen": 102891520, + "step": 12560 + }, + { + "epoch": 1.6242408579919885, + "grad_norm": 0.8240430951118469, + "learning_rate": 2.1756550487525247e-05, + "loss": 1.0425, + "num_input_tokens_seen": 102973440, + "step": 12570 + }, + { + "epoch": 1.6255330146013698, + "grad_norm": 0.7485008239746094, + "learning_rate": 2.1723010794793612e-05, + "loss": 0.6268, + "num_input_tokens_seen": 103055360, + "step": 12580 + }, + { + "epoch": 1.6268251712107507, + "grad_norm": 0.5244811177253723, + "learning_rate": 2.168947710221722e-05, + "loss": 0.9757, + "num_input_tokens_seen": 103137280, + "step": 12590 + }, + { + "epoch": 1.628117327820132, + "grad_norm": 0.6735374927520752, + "learning_rate": 2.165594947119613e-05, + "loss": 0.7146, + "num_input_tokens_seen": 103219200, + "step": 12600 + }, + { + "epoch": 1.6294094844295128, + "grad_norm": 0.7091389894485474, + "learning_rate": 2.1622427963119337e-05, + "loss": 1.4099, + "num_input_tokens_seen": 103301120, + "step": 12610 + }, + { + "epoch": 1.630701641038894, + "grad_norm": 0.21646621823310852, + "learning_rate": 2.1588912639364567e-05, + "loss": 1.0245, + "num_input_tokens_seen": 103383040, + "step": 12620 + }, + { + "epoch": 1.631993797648275, + "grad_norm": 0.4036228358745575, + "learning_rate": 2.1555403561298287e-05, + "loss": 1.1632, + "num_input_tokens_seen": 103464960, + "step": 12630 + }, + { + "epoch": 1.6332859542576559, + "grad_norm": 0.5068750977516174, + "learning_rate": 2.152190079027547e-05, + "loss": 0.747, + "num_input_tokens_seen": 103546880, + "step": 12640 + }, + { + "epoch": 1.6345781108670372, + "grad_norm": 0.657037079334259, + "learning_rate": 2.148840438763959e-05, + "loss": 1.056, + "num_input_tokens_seen": 103628800, + "step": 12650 + }, + { + "epoch": 1.635870267476418, + "grad_norm": 0.7527886033058167, + "learning_rate": 2.1454914414722417e-05, + "loss": 0.9465, + "num_input_tokens_seen": 103710720, + "step": 12660 + }, + { + "epoch": 1.6371624240857992, + "grad_norm": 0.504432201385498, + "learning_rate": 2.1421430932843988e-05, + "loss": 1.1174, + "num_input_tokens_seen": 103792640, + "step": 12670 + }, + { + "epoch": 1.6384545806951802, + "grad_norm": 0.23135700821876526, + "learning_rate": 2.138795400331242e-05, + "loss": 0.6262, + "num_input_tokens_seen": 103874560, + "step": 12680 + }, + { + "epoch": 1.6397467373045613, + "grad_norm": 0.5520825982093811, + "learning_rate": 2.135448368742385e-05, + "loss": 1.066, + "num_input_tokens_seen": 103956480, + "step": 12690 + }, + { + "epoch": 1.6410388939139424, + "grad_norm": 0.7319164276123047, + "learning_rate": 2.1321020046462318e-05, + "loss": 0.9554, + "num_input_tokens_seen": 104038400, + "step": 12700 + }, + { + "epoch": 1.6423310505233233, + "grad_norm": 0.40483906865119934, + "learning_rate": 2.128756314169961e-05, + "loss": 1.0322, + "num_input_tokens_seen": 104120320, + "step": 12710 + }, + { + "epoch": 1.6436232071327046, + "grad_norm": 0.683261513710022, + "learning_rate": 2.1254113034395212e-05, + "loss": 0.6685, + "num_input_tokens_seen": 104202240, + "step": 12720 + }, + { + "epoch": 1.6449153637420855, + "grad_norm": 0.6625344157218933, + "learning_rate": 2.122066978579613e-05, + "loss": 0.7232, + "num_input_tokens_seen": 104284160, + "step": 12730 + }, + { + "epoch": 1.6462075203514666, + "grad_norm": 0.5251320600509644, + "learning_rate": 2.1187233457136858e-05, + "loss": 0.8379, + "num_input_tokens_seen": 104366080, + "step": 12740 + }, + { + "epoch": 1.6474996769608476, + "grad_norm": 0.37905430793762207, + "learning_rate": 2.1153804109639157e-05, + "loss": 0.7044, + "num_input_tokens_seen": 104448000, + "step": 12750 + }, + { + "epoch": 1.6487918335702287, + "grad_norm": 0.7667315006256104, + "learning_rate": 2.1120381804512066e-05, + "loss": 0.7293, + "num_input_tokens_seen": 104529920, + "step": 12760 + }, + { + "epoch": 1.6500839901796098, + "grad_norm": 0.8801140785217285, + "learning_rate": 2.1086966602951696e-05, + "loss": 0.9354, + "num_input_tokens_seen": 104611840, + "step": 12770 + }, + { + "epoch": 1.6513761467889907, + "grad_norm": 0.6519145369529724, + "learning_rate": 2.105355856614115e-05, + "loss": 0.699, + "num_input_tokens_seen": 104693760, + "step": 12780 + }, + { + "epoch": 1.652668303398372, + "grad_norm": 0.5816752910614014, + "learning_rate": 2.1020157755250437e-05, + "loss": 0.6137, + "num_input_tokens_seen": 104775680, + "step": 12790 + }, + { + "epoch": 1.6539604600077529, + "grad_norm": 0.8572224378585815, + "learning_rate": 2.09867642314363e-05, + "loss": 0.8616, + "num_input_tokens_seen": 104857600, + "step": 12800 + }, + { + "epoch": 1.655252616617134, + "grad_norm": 0.367939293384552, + "learning_rate": 2.0953378055842183e-05, + "loss": 0.668, + "num_input_tokens_seen": 104939520, + "step": 12810 + }, + { + "epoch": 1.656544773226515, + "grad_norm": 0.5176908373832703, + "learning_rate": 2.0919999289598027e-05, + "loss": 0.961, + "num_input_tokens_seen": 105021440, + "step": 12820 + }, + { + "epoch": 1.6578369298358961, + "grad_norm": 0.21786247193813324, + "learning_rate": 2.088662799382024e-05, + "loss": 0.6605, + "num_input_tokens_seen": 105103360, + "step": 12830 + }, + { + "epoch": 1.6591290864452772, + "grad_norm": 0.7281939387321472, + "learning_rate": 2.0853264229611557e-05, + "loss": 1.176, + "num_input_tokens_seen": 105185280, + "step": 12840 + }, + { + "epoch": 1.660421243054658, + "grad_norm": 0.6157118678092957, + "learning_rate": 2.081990805806089e-05, + "loss": 0.9159, + "num_input_tokens_seen": 105267200, + "step": 12850 + }, + { + "epoch": 1.6617133996640394, + "grad_norm": 0.3668000102043152, + "learning_rate": 2.078655954024327e-05, + "loss": 0.6293, + "num_input_tokens_seen": 105349120, + "step": 12860 + }, + { + "epoch": 1.6630055562734203, + "grad_norm": 0.6818766593933105, + "learning_rate": 2.075321873721972e-05, + "loss": 0.7573, + "num_input_tokens_seen": 105431040, + "step": 12870 + }, + { + "epoch": 1.6642977128828014, + "grad_norm": 0.6922422647476196, + "learning_rate": 2.0719885710037122e-05, + "loss": 0.4979, + "num_input_tokens_seen": 105512960, + "step": 12880 + }, + { + "epoch": 1.6655898694921825, + "grad_norm": 0.6830812692642212, + "learning_rate": 2.0686560519728117e-05, + "loss": 1.026, + "num_input_tokens_seen": 105594880, + "step": 12890 + }, + { + "epoch": 1.6668820261015636, + "grad_norm": 0.574186384677887, + "learning_rate": 2.0653243227311014e-05, + "loss": 0.5754, + "num_input_tokens_seen": 105676800, + "step": 12900 + }, + { + "epoch": 1.6681741827109446, + "grad_norm": 1.0729289054870605, + "learning_rate": 2.0619933893789673e-05, + "loss": 0.8647, + "num_input_tokens_seen": 105758720, + "step": 12910 + }, + { + "epoch": 1.6694663393203255, + "grad_norm": 0.6160525679588318, + "learning_rate": 2.0586632580153328e-05, + "loss": 1.2784, + "num_input_tokens_seen": 105840640, + "step": 12920 + }, + { + "epoch": 1.6707584959297068, + "grad_norm": 0.5736418962478638, + "learning_rate": 2.0553339347376592e-05, + "loss": 0.8836, + "num_input_tokens_seen": 105922560, + "step": 12930 + }, + { + "epoch": 1.6720506525390877, + "grad_norm": 0.743789553642273, + "learning_rate": 2.0520054256419236e-05, + "loss": 0.909, + "num_input_tokens_seen": 106004480, + "step": 12940 + }, + { + "epoch": 1.6733428091484688, + "grad_norm": 0.6692883372306824, + "learning_rate": 2.0486777368226143e-05, + "loss": 0.7516, + "num_input_tokens_seen": 106086400, + "step": 12950 + }, + { + "epoch": 1.6746349657578499, + "grad_norm": 1.0037450790405273, + "learning_rate": 2.045350874372717e-05, + "loss": 0.977, + "num_input_tokens_seen": 106168320, + "step": 12960 + }, + { + "epoch": 1.675927122367231, + "grad_norm": 0.8744035363197327, + "learning_rate": 2.0420248443837048e-05, + "loss": 0.8461, + "num_input_tokens_seen": 106250240, + "step": 12970 + }, + { + "epoch": 1.677219278976612, + "grad_norm": 0.7862047553062439, + "learning_rate": 2.0386996529455276e-05, + "loss": 0.7468, + "num_input_tokens_seen": 106332160, + "step": 12980 + }, + { + "epoch": 1.678511435585993, + "grad_norm": 0.8968716263771057, + "learning_rate": 2.0353753061465972e-05, + "loss": 0.7406, + "num_input_tokens_seen": 106414080, + "step": 12990 + }, + { + "epoch": 1.6798035921953742, + "grad_norm": 0.9072631001472473, + "learning_rate": 2.0320518100737817e-05, + "loss": 0.9977, + "num_input_tokens_seen": 106496000, + "step": 13000 + }, + { + "epoch": 1.681095748804755, + "grad_norm": 0.5240357518196106, + "learning_rate": 2.0287291708123888e-05, + "loss": 0.867, + "num_input_tokens_seen": 106577920, + "step": 13010 + }, + { + "epoch": 1.6823879054141362, + "grad_norm": 0.6482962369918823, + "learning_rate": 2.0254073944461603e-05, + "loss": 0.86, + "num_input_tokens_seen": 106659840, + "step": 13020 + }, + { + "epoch": 1.6836800620235173, + "grad_norm": 0.7693918347358704, + "learning_rate": 2.0220864870572555e-05, + "loss": 0.9676, + "num_input_tokens_seen": 106741760, + "step": 13030 + }, + { + "epoch": 1.6849722186328981, + "grad_norm": 0.6665438413619995, + "learning_rate": 2.0187664547262446e-05, + "loss": 0.7239, + "num_input_tokens_seen": 106823680, + "step": 13040 + }, + { + "epoch": 1.6862643752422795, + "grad_norm": 0.2982095777988434, + "learning_rate": 2.0154473035320936e-05, + "loss": 0.9287, + "num_input_tokens_seen": 106905600, + "step": 13050 + }, + { + "epoch": 1.6875565318516603, + "grad_norm": 14.455750465393066, + "learning_rate": 2.0121290395521566e-05, + "loss": 0.7107, + "num_input_tokens_seen": 106987520, + "step": 13060 + }, + { + "epoch": 1.6888486884610416, + "grad_norm": 0.7520779371261597, + "learning_rate": 2.008811668862164e-05, + "loss": 0.7824, + "num_input_tokens_seen": 107069440, + "step": 13070 + }, + { + "epoch": 1.6901408450704225, + "grad_norm": 0.6052248477935791, + "learning_rate": 2.0054951975362067e-05, + "loss": 0.6419, + "num_input_tokens_seen": 107151360, + "step": 13080 + }, + { + "epoch": 1.6914330016798036, + "grad_norm": 0.2903338372707367, + "learning_rate": 2.0021796316467346e-05, + "loss": 0.5254, + "num_input_tokens_seen": 107233280, + "step": 13090 + }, + { + "epoch": 1.6927251582891847, + "grad_norm": 0.5044610500335693, + "learning_rate": 1.9988649772645346e-05, + "loss": 0.6578, + "num_input_tokens_seen": 107315200, + "step": 13100 + }, + { + "epoch": 1.6940173148985656, + "grad_norm": 0.6834781169891357, + "learning_rate": 1.995551240458728e-05, + "loss": 0.7578, + "num_input_tokens_seen": 107397120, + "step": 13110 + }, + { + "epoch": 1.6953094715079469, + "grad_norm": 0.47251078486442566, + "learning_rate": 1.9922384272967535e-05, + "loss": 0.6271, + "num_input_tokens_seen": 107479040, + "step": 13120 + }, + { + "epoch": 1.6966016281173277, + "grad_norm": 0.27154985070228577, + "learning_rate": 1.9889265438443607e-05, + "loss": 0.6214, + "num_input_tokens_seen": 107560960, + "step": 13130 + }, + { + "epoch": 1.697893784726709, + "grad_norm": 0.7411037683486938, + "learning_rate": 1.985615596165597e-05, + "loss": 1.0424, + "num_input_tokens_seen": 107642880, + "step": 13140 + }, + { + "epoch": 1.69918594133609, + "grad_norm": 0.7935945391654968, + "learning_rate": 1.982305590322793e-05, + "loss": 1.0063, + "num_input_tokens_seen": 107724800, + "step": 13150 + }, + { + "epoch": 1.700478097945471, + "grad_norm": 0.5809916257858276, + "learning_rate": 1.97899653237656e-05, + "loss": 0.796, + "num_input_tokens_seen": 107806720, + "step": 13160 + }, + { + "epoch": 1.701770254554852, + "grad_norm": 0.7981992363929749, + "learning_rate": 1.9756884283857685e-05, + "loss": 0.8084, + "num_input_tokens_seen": 107888640, + "step": 13170 + }, + { + "epoch": 1.703062411164233, + "grad_norm": 0.6108459234237671, + "learning_rate": 1.9723812844075473e-05, + "loss": 1.0913, + "num_input_tokens_seen": 107970560, + "step": 13180 + }, + { + "epoch": 1.7043545677736143, + "grad_norm": 0.6195822954177856, + "learning_rate": 1.9690751064972625e-05, + "loss": 1.1137, + "num_input_tokens_seen": 108052480, + "step": 13190 + }, + { + "epoch": 1.7056467243829951, + "grad_norm": 0.6808088421821594, + "learning_rate": 1.965769900708515e-05, + "loss": 0.9466, + "num_input_tokens_seen": 108134400, + "step": 13200 + }, + { + "epoch": 1.7069388809923762, + "grad_norm": 0.6025552749633789, + "learning_rate": 1.9624656730931258e-05, + "loss": 1.0154, + "num_input_tokens_seen": 108216320, + "step": 13210 + }, + { + "epoch": 1.7082310376017573, + "grad_norm": 0.6096053123474121, + "learning_rate": 1.959162429701121e-05, + "loss": 0.9086, + "num_input_tokens_seen": 108298240, + "step": 13220 + }, + { + "epoch": 1.7095231942111384, + "grad_norm": 0.7634369730949402, + "learning_rate": 1.955860176580729e-05, + "loss": 0.9669, + "num_input_tokens_seen": 108380160, + "step": 13230 + }, + { + "epoch": 1.7108153508205195, + "grad_norm": 5.748688220977783, + "learning_rate": 1.9525589197783618e-05, + "loss": 0.8869, + "num_input_tokens_seen": 108462080, + "step": 13240 + }, + { + "epoch": 1.7121075074299004, + "grad_norm": 0.9097843170166016, + "learning_rate": 1.9492586653386103e-05, + "loss": 1.3718, + "num_input_tokens_seen": 108544000, + "step": 13250 + }, + { + "epoch": 1.7133996640392817, + "grad_norm": 0.6823452711105347, + "learning_rate": 1.945959419304226e-05, + "loss": 1.1025, + "num_input_tokens_seen": 108625920, + "step": 13260 + }, + { + "epoch": 1.7146918206486625, + "grad_norm": 0.8677635192871094, + "learning_rate": 1.942661187716118e-05, + "loss": 0.7662, + "num_input_tokens_seen": 108707840, + "step": 13270 + }, + { + "epoch": 1.7159839772580436, + "grad_norm": 0.8804795742034912, + "learning_rate": 1.9393639766133363e-05, + "loss": 0.9356, + "num_input_tokens_seen": 108789760, + "step": 13280 + }, + { + "epoch": 1.7172761338674247, + "grad_norm": 0.6428921222686768, + "learning_rate": 1.936067792033061e-05, + "loss": 0.608, + "num_input_tokens_seen": 108871680, + "step": 13290 + }, + { + "epoch": 1.7185682904768058, + "grad_norm": 0.5811804533004761, + "learning_rate": 1.9327726400105963e-05, + "loss": 1.0139, + "num_input_tokens_seen": 108953600, + "step": 13300 + }, + { + "epoch": 1.719860447086187, + "grad_norm": 1.0158241987228394, + "learning_rate": 1.9294785265793514e-05, + "loss": 0.7744, + "num_input_tokens_seen": 109035520, + "step": 13310 + }, + { + "epoch": 1.7211526036955678, + "grad_norm": 1.1166173219680786, + "learning_rate": 1.9261854577708366e-05, + "loss": 0.7847, + "num_input_tokens_seen": 109117440, + "step": 13320 + }, + { + "epoch": 1.722444760304949, + "grad_norm": 0.5615746974945068, + "learning_rate": 1.9228934396146486e-05, + "loss": 0.9334, + "num_input_tokens_seen": 109199360, + "step": 13330 + }, + { + "epoch": 1.72373691691433, + "grad_norm": 0.474103182554245, + "learning_rate": 1.9196024781384607e-05, + "loss": 0.8011, + "num_input_tokens_seen": 109281280, + "step": 13340 + }, + { + "epoch": 1.725029073523711, + "grad_norm": 0.5429681539535522, + "learning_rate": 1.9163125793680125e-05, + "loss": 0.8737, + "num_input_tokens_seen": 109363200, + "step": 13350 + }, + { + "epoch": 1.7263212301330921, + "grad_norm": 0.22766174376010895, + "learning_rate": 1.9130237493270948e-05, + "loss": 0.5359, + "num_input_tokens_seen": 109445120, + "step": 13360 + }, + { + "epoch": 1.7276133867424732, + "grad_norm": 0.9762837290763855, + "learning_rate": 1.9097359940375452e-05, + "loss": 0.6703, + "num_input_tokens_seen": 109527040, + "step": 13370 + }, + { + "epoch": 1.7289055433518543, + "grad_norm": 0.3081243932247162, + "learning_rate": 1.9064493195192293e-05, + "loss": 0.8231, + "num_input_tokens_seen": 109608960, + "step": 13380 + }, + { + "epoch": 1.7301976999612352, + "grad_norm": 0.8469387292861938, + "learning_rate": 1.9031637317900386e-05, + "loss": 0.7302, + "num_input_tokens_seen": 109690880, + "step": 13390 + }, + { + "epoch": 1.7314898565706165, + "grad_norm": 0.8543357253074646, + "learning_rate": 1.8998792368658703e-05, + "loss": 0.7282, + "num_input_tokens_seen": 109772800, + "step": 13400 + }, + { + "epoch": 1.7327820131799974, + "grad_norm": 0.5329307317733765, + "learning_rate": 1.8965958407606236e-05, + "loss": 0.8997, + "num_input_tokens_seen": 109854720, + "step": 13410 + }, + { + "epoch": 1.7340741697893785, + "grad_norm": 0.7571797966957092, + "learning_rate": 1.893313549486184e-05, + "loss": 0.8703, + "num_input_tokens_seen": 109936640, + "step": 13420 + }, + { + "epoch": 1.7353663263987595, + "grad_norm": 0.6506255865097046, + "learning_rate": 1.890032369052415e-05, + "loss": 0.5685, + "num_input_tokens_seen": 110018560, + "step": 13430 + }, + { + "epoch": 1.7366584830081406, + "grad_norm": 0.2349756509065628, + "learning_rate": 1.8867523054671475e-05, + "loss": 0.9621, + "num_input_tokens_seen": 110100480, + "step": 13440 + }, + { + "epoch": 1.7379506396175217, + "grad_norm": 0.5002673864364624, + "learning_rate": 1.8834733647361635e-05, + "loss": 0.8279, + "num_input_tokens_seen": 110182400, + "step": 13450 + }, + { + "epoch": 1.7392427962269026, + "grad_norm": 0.4068397581577301, + "learning_rate": 1.880195552863194e-05, + "loss": 0.6358, + "num_input_tokens_seen": 110264320, + "step": 13460 + }, + { + "epoch": 1.740534952836284, + "grad_norm": 0.6473715901374817, + "learning_rate": 1.8769188758498973e-05, + "loss": 0.8022, + "num_input_tokens_seen": 110346240, + "step": 13470 + }, + { + "epoch": 1.7418271094456648, + "grad_norm": 0.29056867957115173, + "learning_rate": 1.8736433396958605e-05, + "loss": 0.6169, + "num_input_tokens_seen": 110428160, + "step": 13480 + }, + { + "epoch": 1.7431192660550459, + "grad_norm": 0.49378782510757446, + "learning_rate": 1.8703689503985754e-05, + "loss": 0.6387, + "num_input_tokens_seen": 110510080, + "step": 13490 + }, + { + "epoch": 1.744411422664427, + "grad_norm": 0.570563554763794, + "learning_rate": 1.867095713953439e-05, + "loss": 1.0567, + "num_input_tokens_seen": 110592000, + "step": 13500 + }, + { + "epoch": 1.745703579273808, + "grad_norm": 0.7375537753105164, + "learning_rate": 1.8638236363537348e-05, + "loss": 0.5526, + "num_input_tokens_seen": 110673920, + "step": 13510 + }, + { + "epoch": 1.7469957358831891, + "grad_norm": 0.6009176969528198, + "learning_rate": 1.8605527235906235e-05, + "loss": 0.987, + "num_input_tokens_seen": 110755840, + "step": 13520 + }, + { + "epoch": 1.74828789249257, + "grad_norm": 0.41143083572387695, + "learning_rate": 1.8572829816531364e-05, + "loss": 0.604, + "num_input_tokens_seen": 110837760, + "step": 13530 + }, + { + "epoch": 1.7495800491019513, + "grad_norm": 1.1467337608337402, + "learning_rate": 1.854014416528157e-05, + "loss": 0.6048, + "num_input_tokens_seen": 110919680, + "step": 13540 + }, + { + "epoch": 1.7508722057113322, + "grad_norm": 0.6407749652862549, + "learning_rate": 1.8507470342004182e-05, + "loss": 0.8796, + "num_input_tokens_seen": 111001600, + "step": 13550 + }, + { + "epoch": 1.7521643623207133, + "grad_norm": 0.8775169253349304, + "learning_rate": 1.847480840652483e-05, + "loss": 0.8039, + "num_input_tokens_seen": 111083520, + "step": 13560 + }, + { + "epoch": 1.7534565189300944, + "grad_norm": 0.9216033220291138, + "learning_rate": 1.844215841864741e-05, + "loss": 0.6875, + "num_input_tokens_seen": 111165440, + "step": 13570 + }, + { + "epoch": 1.7547486755394752, + "grad_norm": 0.7694590091705322, + "learning_rate": 1.8409520438153933e-05, + "loss": 0.9024, + "num_input_tokens_seen": 111247360, + "step": 13580 + }, + { + "epoch": 1.7560408321488565, + "grad_norm": 0.25773632526397705, + "learning_rate": 1.8376894524804416e-05, + "loss": 0.6588, + "num_input_tokens_seen": 111329280, + "step": 13590 + }, + { + "epoch": 1.7573329887582374, + "grad_norm": 0.8692495226860046, + "learning_rate": 1.8344280738336796e-05, + "loss": 0.9931, + "num_input_tokens_seen": 111411200, + "step": 13600 + }, + { + "epoch": 1.7586251453676187, + "grad_norm": 0.17592206597328186, + "learning_rate": 1.8311679138466772e-05, + "loss": 0.9949, + "num_input_tokens_seen": 111493120, + "step": 13610 + }, + { + "epoch": 1.7599173019769996, + "grad_norm": 0.8434010148048401, + "learning_rate": 1.827908978488779e-05, + "loss": 1.0308, + "num_input_tokens_seen": 111575040, + "step": 13620 + }, + { + "epoch": 1.7612094585863807, + "grad_norm": 1.8556253910064697, + "learning_rate": 1.8246512737270798e-05, + "loss": 0.996, + "num_input_tokens_seen": 111656960, + "step": 13630 + }, + { + "epoch": 1.7625016151957618, + "grad_norm": 4.145997047424316, + "learning_rate": 1.8213948055264278e-05, + "loss": 1.0672, + "num_input_tokens_seen": 111738880, + "step": 13640 + }, + { + "epoch": 1.7637937718051426, + "grad_norm": 0.3059723377227783, + "learning_rate": 1.8181395798494048e-05, + "loss": 0.5107, + "num_input_tokens_seen": 111820800, + "step": 13650 + }, + { + "epoch": 1.765085928414524, + "grad_norm": 0.9706294536590576, + "learning_rate": 1.8148856026563148e-05, + "loss": 1.0283, + "num_input_tokens_seen": 111902720, + "step": 13660 + }, + { + "epoch": 1.7663780850239048, + "grad_norm": 0.6483558416366577, + "learning_rate": 1.81163287990518e-05, + "loss": 0.9913, + "num_input_tokens_seen": 111984640, + "step": 13670 + }, + { + "epoch": 1.767670241633286, + "grad_norm": 0.5920456647872925, + "learning_rate": 1.8083814175517234e-05, + "loss": 0.5184, + "num_input_tokens_seen": 112066560, + "step": 13680 + }, + { + "epoch": 1.768962398242667, + "grad_norm": 0.6596719622612, + "learning_rate": 1.80513122154936e-05, + "loss": 0.962, + "num_input_tokens_seen": 112148480, + "step": 13690 + }, + { + "epoch": 1.770254554852048, + "grad_norm": 0.7937291264533997, + "learning_rate": 1.8018822978491872e-05, + "loss": 0.6034, + "num_input_tokens_seen": 112230400, + "step": 13700 + }, + { + "epoch": 1.7715467114614292, + "grad_norm": 0.5990145206451416, + "learning_rate": 1.798634652399972e-05, + "loss": 0.8974, + "num_input_tokens_seen": 112312320, + "step": 13710 + }, + { + "epoch": 1.77283886807081, + "grad_norm": 0.6401026844978333, + "learning_rate": 1.795388291148143e-05, + "loss": 1.0047, + "num_input_tokens_seen": 112394240, + "step": 13720 + }, + { + "epoch": 1.7741310246801913, + "grad_norm": 0.6857889294624329, + "learning_rate": 1.7921432200377734e-05, + "loss": 0.608, + "num_input_tokens_seen": 112476160, + "step": 13730 + }, + { + "epoch": 1.7754231812895722, + "grad_norm": 0.6856977939605713, + "learning_rate": 1.7888994450105788e-05, + "loss": 1.2077, + "num_input_tokens_seen": 112558080, + "step": 13740 + }, + { + "epoch": 1.7767153378989533, + "grad_norm": 0.4340798258781433, + "learning_rate": 1.785656972005897e-05, + "loss": 1.0667, + "num_input_tokens_seen": 112640000, + "step": 13750 + }, + { + "epoch": 1.7780074945083344, + "grad_norm": 0.6757582426071167, + "learning_rate": 1.7824158069606867e-05, + "loss": 0.8964, + "num_input_tokens_seen": 112721920, + "step": 13760 + }, + { + "epoch": 1.7792996511177155, + "grad_norm": 0.8289663791656494, + "learning_rate": 1.7791759558095077e-05, + "loss": 0.6691, + "num_input_tokens_seen": 112803840, + "step": 13770 + }, + { + "epoch": 1.7805918077270966, + "grad_norm": 0.620309054851532, + "learning_rate": 1.775937424484515e-05, + "loss": 0.9059, + "num_input_tokens_seen": 112885760, + "step": 13780 + }, + { + "epoch": 1.7818839643364774, + "grad_norm": 0.7046340703964233, + "learning_rate": 1.7727002189154502e-05, + "loss": 0.724, + "num_input_tokens_seen": 112967680, + "step": 13790 + }, + { + "epoch": 1.7831761209458588, + "grad_norm": 0.7934147715568542, + "learning_rate": 1.7694643450296216e-05, + "loss": 1.064, + "num_input_tokens_seen": 113049600, + "step": 13800 + }, + { + "epoch": 1.7844682775552396, + "grad_norm": 0.5995201468467712, + "learning_rate": 1.7662298087519052e-05, + "loss": 0.9433, + "num_input_tokens_seen": 113131520, + "step": 13810 + }, + { + "epoch": 1.7857604341646207, + "grad_norm": 0.8406663537025452, + "learning_rate": 1.762996616004723e-05, + "loss": 0.8747, + "num_input_tokens_seen": 113213440, + "step": 13820 + }, + { + "epoch": 1.7870525907740018, + "grad_norm": 0.5735936760902405, + "learning_rate": 1.7597647727080408e-05, + "loss": 0.9011, + "num_input_tokens_seen": 113295360, + "step": 13830 + }, + { + "epoch": 1.788344747383383, + "grad_norm": 0.7899253368377686, + "learning_rate": 1.7565342847793502e-05, + "loss": 0.9851, + "num_input_tokens_seen": 113377280, + "step": 13840 + }, + { + "epoch": 1.789636903992764, + "grad_norm": 0.9319769740104675, + "learning_rate": 1.7533051581336644e-05, + "loss": 0.9077, + "num_input_tokens_seen": 113459200, + "step": 13850 + }, + { + "epoch": 1.7909290606021449, + "grad_norm": 0.6628711819648743, + "learning_rate": 1.7500773986835013e-05, + "loss": 0.9925, + "num_input_tokens_seen": 113541120, + "step": 13860 + }, + { + "epoch": 1.7922212172115262, + "grad_norm": 0.373602032661438, + "learning_rate": 1.7468510123388775e-05, + "loss": 0.7818, + "num_input_tokens_seen": 113623040, + "step": 13870 + }, + { + "epoch": 1.793513373820907, + "grad_norm": 0.574948787689209, + "learning_rate": 1.743626005007294e-05, + "loss": 0.9479, + "num_input_tokens_seen": 113704960, + "step": 13880 + }, + { + "epoch": 1.7948055304302881, + "grad_norm": 0.7120422124862671, + "learning_rate": 1.740402382593727e-05, + "loss": 0.9624, + "num_input_tokens_seen": 113786880, + "step": 13890 + }, + { + "epoch": 1.7960976870396692, + "grad_norm": 0.5288504958152771, + "learning_rate": 1.7371801510006193e-05, + "loss": 0.9844, + "num_input_tokens_seen": 113868800, + "step": 13900 + }, + { + "epoch": 1.7973898436490503, + "grad_norm": 0.5549502372741699, + "learning_rate": 1.733959316127862e-05, + "loss": 0.8112, + "num_input_tokens_seen": 113950720, + "step": 13910 + }, + { + "epoch": 1.7986820002584314, + "grad_norm": 0.6821995377540588, + "learning_rate": 1.730739883872795e-05, + "loss": 0.9197, + "num_input_tokens_seen": 114032640, + "step": 13920 + }, + { + "epoch": 1.7999741568678123, + "grad_norm": 0.44002169370651245, + "learning_rate": 1.7275218601301848e-05, + "loss": 0.6208, + "num_input_tokens_seen": 114114560, + "step": 13930 + }, + { + "epoch": 1.8012663134771936, + "grad_norm": 0.7106473445892334, + "learning_rate": 1.7243052507922226e-05, + "loss": 0.9374, + "num_input_tokens_seen": 114196480, + "step": 13940 + }, + { + "epoch": 1.8025584700865744, + "grad_norm": 0.6388192772865295, + "learning_rate": 1.7210900617485075e-05, + "loss": 1.0343, + "num_input_tokens_seen": 114278400, + "step": 13950 + }, + { + "epoch": 1.8038506266959555, + "grad_norm": 0.644347071647644, + "learning_rate": 1.7178762988860393e-05, + "loss": 0.8684, + "num_input_tokens_seen": 114360320, + "step": 13960 + }, + { + "epoch": 1.8051427833053366, + "grad_norm": 0.5687951445579529, + "learning_rate": 1.7146639680892062e-05, + "loss": 1.0918, + "num_input_tokens_seen": 114442240, + "step": 13970 + }, + { + "epoch": 1.8064349399147177, + "grad_norm": 0.775614321231842, + "learning_rate": 1.711453075239773e-05, + "loss": 0.7429, + "num_input_tokens_seen": 114524160, + "step": 13980 + }, + { + "epoch": 1.8077270965240988, + "grad_norm": 0.25670918822288513, + "learning_rate": 1.7082436262168745e-05, + "loss": 0.7727, + "num_input_tokens_seen": 114606080, + "step": 13990 + }, + { + "epoch": 1.8090192531334797, + "grad_norm": 0.9331316351890564, + "learning_rate": 1.705035626896998e-05, + "loss": 1.0647, + "num_input_tokens_seen": 114688000, + "step": 14000 + }, + { + "epoch": 1.810311409742861, + "grad_norm": 1.6310906410217285, + "learning_rate": 1.7018290831539795e-05, + "loss": 0.7367, + "num_input_tokens_seen": 114769920, + "step": 14010 + }, + { + "epoch": 1.8116035663522418, + "grad_norm": 0.9030694961547852, + "learning_rate": 1.6986240008589903e-05, + "loss": 0.9311, + "num_input_tokens_seen": 114851840, + "step": 14020 + }, + { + "epoch": 1.812895722961623, + "grad_norm": 0.6895557045936584, + "learning_rate": 1.695420385880522e-05, + "loss": 0.9413, + "num_input_tokens_seen": 114933760, + "step": 14030 + }, + { + "epoch": 1.814187879571004, + "grad_norm": 0.7710975408554077, + "learning_rate": 1.6922182440843843e-05, + "loss": 0.7935, + "num_input_tokens_seen": 115015680, + "step": 14040 + }, + { + "epoch": 1.8154800361803851, + "grad_norm": 0.979817807674408, + "learning_rate": 1.689017581333685e-05, + "loss": 0.9507, + "num_input_tokens_seen": 115097600, + "step": 14050 + }, + { + "epoch": 1.8167721927897662, + "grad_norm": 0.2595027983188629, + "learning_rate": 1.685818403488827e-05, + "loss": 1.1626, + "num_input_tokens_seen": 115179520, + "step": 14060 + }, + { + "epoch": 1.818064349399147, + "grad_norm": 0.20249100029468536, + "learning_rate": 1.6826207164074924e-05, + "loss": 0.4731, + "num_input_tokens_seen": 115261440, + "step": 14070 + }, + { + "epoch": 1.8193565060085284, + "grad_norm": 0.5475636124610901, + "learning_rate": 1.6794245259446347e-05, + "loss": 0.8077, + "num_input_tokens_seen": 115343360, + "step": 14080 + }, + { + "epoch": 1.8206486626179093, + "grad_norm": 0.6575671434402466, + "learning_rate": 1.6762298379524684e-05, + "loss": 0.8489, + "num_input_tokens_seen": 115425280, + "step": 14090 + }, + { + "epoch": 1.8219408192272903, + "grad_norm": 0.5988595485687256, + "learning_rate": 1.6730366582804535e-05, + "loss": 0.5274, + "num_input_tokens_seen": 115507200, + "step": 14100 + }, + { + "epoch": 1.8232329758366714, + "grad_norm": 0.2947266399860382, + "learning_rate": 1.6698449927752924e-05, + "loss": 0.659, + "num_input_tokens_seen": 115589120, + "step": 14110 + }, + { + "epoch": 1.8245251324460523, + "grad_norm": 0.7184574604034424, + "learning_rate": 1.6666548472809104e-05, + "loss": 1.1145, + "num_input_tokens_seen": 115671040, + "step": 14120 + }, + { + "epoch": 1.8258172890554336, + "grad_norm": 0.8786848783493042, + "learning_rate": 1.6634662276384548e-05, + "loss": 0.8391, + "num_input_tokens_seen": 115752960, + "step": 14130 + }, + { + "epoch": 1.8271094456648145, + "grad_norm": 0.6880916357040405, + "learning_rate": 1.660279139686275e-05, + "loss": 0.802, + "num_input_tokens_seen": 115834880, + "step": 14140 + }, + { + "epoch": 1.8284016022741958, + "grad_norm": 0.8588114380836487, + "learning_rate": 1.657093589259917e-05, + "loss": 0.7398, + "num_input_tokens_seen": 115916800, + "step": 14150 + }, + { + "epoch": 1.8296937588835767, + "grad_norm": 0.6294015049934387, + "learning_rate": 1.6539095821921136e-05, + "loss": 0.8751, + "num_input_tokens_seen": 115998720, + "step": 14160 + }, + { + "epoch": 1.8309859154929577, + "grad_norm": 0.8277295231819153, + "learning_rate": 1.650727124312768e-05, + "loss": 0.7839, + "num_input_tokens_seen": 116080640, + "step": 14170 + }, + { + "epoch": 1.8322780721023388, + "grad_norm": 0.5361806154251099, + "learning_rate": 1.6475462214489513e-05, + "loss": 0.8393, + "num_input_tokens_seen": 116162560, + "step": 14180 + }, + { + "epoch": 1.8335702287117197, + "grad_norm": 0.5568910241127014, + "learning_rate": 1.6443668794248828e-05, + "loss": 0.6922, + "num_input_tokens_seen": 116244480, + "step": 14190 + }, + { + "epoch": 1.834862385321101, + "grad_norm": 0.615774393081665, + "learning_rate": 1.641189104061928e-05, + "loss": 0.9365, + "num_input_tokens_seen": 116326400, + "step": 14200 + }, + { + "epoch": 1.8361545419304819, + "grad_norm": 0.5470173358917236, + "learning_rate": 1.63801290117858e-05, + "loss": 0.8715, + "num_input_tokens_seen": 116408320, + "step": 14210 + }, + { + "epoch": 1.837446698539863, + "grad_norm": 0.7241597771644592, + "learning_rate": 1.6348382765904567e-05, + "loss": 0.8787, + "num_input_tokens_seen": 116490240, + "step": 14220 + }, + { + "epoch": 1.838738855149244, + "grad_norm": 0.7052979469299316, + "learning_rate": 1.631665236110283e-05, + "loss": 0.8164, + "num_input_tokens_seen": 116572160, + "step": 14230 + }, + { + "epoch": 1.8400310117586252, + "grad_norm": 0.6551203727722168, + "learning_rate": 1.6284937855478837e-05, + "loss": 1.0631, + "num_input_tokens_seen": 116654080, + "step": 14240 + }, + { + "epoch": 1.8413231683680062, + "grad_norm": 0.6758942604064941, + "learning_rate": 1.6253239307101748e-05, + "loss": 1.008, + "num_input_tokens_seen": 116736000, + "step": 14250 + }, + { + "epoch": 1.8426153249773871, + "grad_norm": 0.588173508644104, + "learning_rate": 1.6221556774011474e-05, + "loss": 1.006, + "num_input_tokens_seen": 116817920, + "step": 14260 + }, + { + "epoch": 1.8439074815867684, + "grad_norm": 0.2741510272026062, + "learning_rate": 1.6189890314218634e-05, + "loss": 0.8024, + "num_input_tokens_seen": 116899840, + "step": 14270 + }, + { + "epoch": 1.8451996381961493, + "grad_norm": 0.38996413350105286, + "learning_rate": 1.6158239985704378e-05, + "loss": 0.9059, + "num_input_tokens_seen": 116981760, + "step": 14280 + }, + { + "epoch": 1.8464917948055304, + "grad_norm": 0.6522138118743896, + "learning_rate": 1.6126605846420366e-05, + "loss": 0.7783, + "num_input_tokens_seen": 117063680, + "step": 14290 + }, + { + "epoch": 1.8477839514149115, + "grad_norm": 0.5174360275268555, + "learning_rate": 1.609498795428857e-05, + "loss": 0.8708, + "num_input_tokens_seen": 117145600, + "step": 14300 + }, + { + "epoch": 1.8490761080242926, + "grad_norm": 0.47565585374832153, + "learning_rate": 1.606338636720125e-05, + "loss": 0.4722, + "num_input_tokens_seen": 117227520, + "step": 14310 + }, + { + "epoch": 1.8503682646336737, + "grad_norm": 0.5831138491630554, + "learning_rate": 1.6031801143020785e-05, + "loss": 0.9373, + "num_input_tokens_seen": 117309440, + "step": 14320 + }, + { + "epoch": 1.8516604212430545, + "grad_norm": 0.5426406860351562, + "learning_rate": 1.6000232339579616e-05, + "loss": 0.9453, + "num_input_tokens_seen": 117391360, + "step": 14330 + }, + { + "epoch": 1.8529525778524358, + "grad_norm": 0.899085283279419, + "learning_rate": 1.5968680014680105e-05, + "loss": 0.8859, + "num_input_tokens_seen": 117473280, + "step": 14340 + }, + { + "epoch": 1.8542447344618167, + "grad_norm": 0.5543519258499146, + "learning_rate": 1.5937144226094426e-05, + "loss": 1.0594, + "num_input_tokens_seen": 117555200, + "step": 14350 + }, + { + "epoch": 1.8555368910711978, + "grad_norm": 0.8025951385498047, + "learning_rate": 1.590562503156452e-05, + "loss": 0.8882, + "num_input_tokens_seen": 117637120, + "step": 14360 + }, + { + "epoch": 1.8568290476805789, + "grad_norm": 0.6252597570419312, + "learning_rate": 1.5874122488801888e-05, + "loss": 0.6613, + "num_input_tokens_seen": 117719040, + "step": 14370 + }, + { + "epoch": 1.85812120428996, + "grad_norm": 0.5692026615142822, + "learning_rate": 1.5842636655487585e-05, + "loss": 0.6339, + "num_input_tokens_seen": 117800960, + "step": 14380 + }, + { + "epoch": 1.859413360899341, + "grad_norm": 0.651740550994873, + "learning_rate": 1.5811167589272068e-05, + "loss": 1.0055, + "num_input_tokens_seen": 117882880, + "step": 14390 + }, + { + "epoch": 1.860705517508722, + "grad_norm": 0.6472187638282776, + "learning_rate": 1.577971534777507e-05, + "loss": 0.7919, + "num_input_tokens_seen": 117964800, + "step": 14400 + }, + { + "epoch": 1.8619976741181032, + "grad_norm": 0.5527770519256592, + "learning_rate": 1.5748279988585528e-05, + "loss": 0.7019, + "num_input_tokens_seen": 118046720, + "step": 14410 + }, + { + "epoch": 1.863289830727484, + "grad_norm": 0.3149762451648712, + "learning_rate": 1.571686156926147e-05, + "loss": 0.7455, + "num_input_tokens_seen": 118128640, + "step": 14420 + }, + { + "epoch": 1.8645819873368652, + "grad_norm": 0.6503821015357971, + "learning_rate": 1.5685460147329917e-05, + "loss": 0.68, + "num_input_tokens_seen": 118210560, + "step": 14430 + }, + { + "epoch": 1.8658741439462463, + "grad_norm": 0.6401498913764954, + "learning_rate": 1.5654075780286742e-05, + "loss": 0.9481, + "num_input_tokens_seen": 118292480, + "step": 14440 + }, + { + "epoch": 1.8671663005556274, + "grad_norm": 0.5408946871757507, + "learning_rate": 1.562270852559661e-05, + "loss": 1.0229, + "num_input_tokens_seen": 118374400, + "step": 14450 + }, + { + "epoch": 1.8684584571650085, + "grad_norm": 0.7171612977981567, + "learning_rate": 1.5591358440692865e-05, + "loss": 1.0381, + "num_input_tokens_seen": 118456320, + "step": 14460 + }, + { + "epoch": 1.8697506137743893, + "grad_norm": 0.7857020497322083, + "learning_rate": 1.5560025582977377e-05, + "loss": 0.988, + "num_input_tokens_seen": 118538240, + "step": 14470 + }, + { + "epoch": 1.8710427703837706, + "grad_norm": 0.3020257353782654, + "learning_rate": 1.5528710009820513e-05, + "loss": 1.0819, + "num_input_tokens_seen": 118620160, + "step": 14480 + }, + { + "epoch": 1.8723349269931515, + "grad_norm": 0.6323096752166748, + "learning_rate": 1.5497411778560954e-05, + "loss": 1.2201, + "num_input_tokens_seen": 118702080, + "step": 14490 + }, + { + "epoch": 1.8736270836025326, + "grad_norm": 0.7573959827423096, + "learning_rate": 1.5466130946505664e-05, + "loss": 0.7508, + "num_input_tokens_seen": 118784000, + "step": 14500 + }, + { + "epoch": 1.8749192402119137, + "grad_norm": 0.48801571130752563, + "learning_rate": 1.5434867570929724e-05, + "loss": 0.8983, + "num_input_tokens_seen": 118865920, + "step": 14510 + }, + { + "epoch": 1.8762113968212948, + "grad_norm": 0.34075334668159485, + "learning_rate": 1.5403621709076247e-05, + "loss": 0.6051, + "num_input_tokens_seen": 118947840, + "step": 14520 + }, + { + "epoch": 1.8775035534306759, + "grad_norm": 0.27687662839889526, + "learning_rate": 1.5372393418156323e-05, + "loss": 0.7321, + "num_input_tokens_seen": 119029760, + "step": 14530 + }, + { + "epoch": 1.8787957100400567, + "grad_norm": 0.7669917941093445, + "learning_rate": 1.5341182755348806e-05, + "loss": 0.887, + "num_input_tokens_seen": 119111680, + "step": 14540 + }, + { + "epoch": 1.880087866649438, + "grad_norm": 0.30796754360198975, + "learning_rate": 1.530998977780033e-05, + "loss": 0.7769, + "num_input_tokens_seen": 119193600, + "step": 14550 + }, + { + "epoch": 1.881380023258819, + "grad_norm": 0.6400482654571533, + "learning_rate": 1.5278814542625107e-05, + "loss": 0.7488, + "num_input_tokens_seen": 119275520, + "step": 14560 + }, + { + "epoch": 1.8826721798682, + "grad_norm": 0.6054263114929199, + "learning_rate": 1.5247657106904891e-05, + "loss": 0.8852, + "num_input_tokens_seen": 119357440, + "step": 14570 + }, + { + "epoch": 1.883964336477581, + "grad_norm": 0.19290970265865326, + "learning_rate": 1.5216517527688818e-05, + "loss": 0.8831, + "num_input_tokens_seen": 119439360, + "step": 14580 + }, + { + "epoch": 1.8852564930869622, + "grad_norm": 0.5725274085998535, + "learning_rate": 1.5185395861993353e-05, + "loss": 0.8191, + "num_input_tokens_seen": 119521280, + "step": 14590 + }, + { + "epoch": 1.8865486496963433, + "grad_norm": 0.7249799370765686, + "learning_rate": 1.515429216680216e-05, + "loss": 0.9853, + "num_input_tokens_seen": 119603200, + "step": 14600 + }, + { + "epoch": 1.8878408063057242, + "grad_norm": 0.7958240509033203, + "learning_rate": 1.5123206499065967e-05, + "loss": 0.9514, + "num_input_tokens_seen": 119685120, + "step": 14610 + }, + { + "epoch": 1.8891329629151055, + "grad_norm": 0.6748232245445251, + "learning_rate": 1.5092138915702545e-05, + "loss": 1.2975, + "num_input_tokens_seen": 119767040, + "step": 14620 + }, + { + "epoch": 1.8904251195244863, + "grad_norm": 0.9683583974838257, + "learning_rate": 1.5061089473596501e-05, + "loss": 0.8597, + "num_input_tokens_seen": 119848960, + "step": 14630 + }, + { + "epoch": 1.8917172761338674, + "grad_norm": 0.6267436742782593, + "learning_rate": 1.5030058229599275e-05, + "loss": 0.7876, + "num_input_tokens_seen": 119930880, + "step": 14640 + }, + { + "epoch": 1.8930094327432485, + "grad_norm": 0.18290306627750397, + "learning_rate": 1.4999045240528935e-05, + "loss": 0.4448, + "num_input_tokens_seen": 120012800, + "step": 14650 + }, + { + "epoch": 1.8943015893526294, + "grad_norm": 0.908818244934082, + "learning_rate": 1.4968050563170177e-05, + "loss": 0.9315, + "num_input_tokens_seen": 120094720, + "step": 14660 + }, + { + "epoch": 1.8955937459620107, + "grad_norm": 0.8734130859375, + "learning_rate": 1.4937074254274117e-05, + "loss": 1.0145, + "num_input_tokens_seen": 120176640, + "step": 14670 + }, + { + "epoch": 1.8968859025713916, + "grad_norm": 0.6656931042671204, + "learning_rate": 1.4906116370558276e-05, + "loss": 0.6827, + "num_input_tokens_seen": 120258560, + "step": 14680 + }, + { + "epoch": 1.8981780591807729, + "grad_norm": 0.612580418586731, + "learning_rate": 1.4875176968706434e-05, + "loss": 0.8853, + "num_input_tokens_seen": 120340480, + "step": 14690 + }, + { + "epoch": 1.8994702157901537, + "grad_norm": 0.30025026202201843, + "learning_rate": 1.4844256105368504e-05, + "loss": 0.7183, + "num_input_tokens_seen": 120422400, + "step": 14700 + }, + { + "epoch": 1.9007623723995348, + "grad_norm": 0.3702358603477478, + "learning_rate": 1.4813353837160488e-05, + "loss": 0.5776, + "num_input_tokens_seen": 120504320, + "step": 14710 + }, + { + "epoch": 1.902054529008916, + "grad_norm": 0.5429428219795227, + "learning_rate": 1.4782470220664313e-05, + "loss": 1.1882, + "num_input_tokens_seen": 120586240, + "step": 14720 + }, + { + "epoch": 1.9033466856182968, + "grad_norm": 0.8188362717628479, + "learning_rate": 1.4751605312427786e-05, + "loss": 0.9617, + "num_input_tokens_seen": 120668160, + "step": 14730 + }, + { + "epoch": 1.904638842227678, + "grad_norm": 0.8028152585029602, + "learning_rate": 1.472075916896442e-05, + "loss": 0.8622, + "num_input_tokens_seen": 120750080, + "step": 14740 + }, + { + "epoch": 1.905930998837059, + "grad_norm": 0.839156985282898, + "learning_rate": 1.4689931846753402e-05, + "loss": 0.9725, + "num_input_tokens_seen": 120832000, + "step": 14750 + }, + { + "epoch": 1.90722315544644, + "grad_norm": 0.7380186319351196, + "learning_rate": 1.4659123402239454e-05, + "loss": 1.2234, + "num_input_tokens_seen": 120913920, + "step": 14760 + }, + { + "epoch": 1.9085153120558211, + "grad_norm": 0.5705264806747437, + "learning_rate": 1.4628333891832713e-05, + "loss": 0.8926, + "num_input_tokens_seen": 120995840, + "step": 14770 + }, + { + "epoch": 1.9098074686652022, + "grad_norm": 0.7131834626197815, + "learning_rate": 1.4597563371908663e-05, + "loss": 1.0101, + "num_input_tokens_seen": 121077760, + "step": 14780 + }, + { + "epoch": 1.9110996252745833, + "grad_norm": 0.9028184413909912, + "learning_rate": 1.4566811898808013e-05, + "loss": 0.9358, + "num_input_tokens_seen": 121159680, + "step": 14790 + }, + { + "epoch": 1.9123917818839642, + "grad_norm": 0.5184687972068787, + "learning_rate": 1.4536079528836605e-05, + "loss": 0.7874, + "num_input_tokens_seen": 121241600, + "step": 14800 + }, + { + "epoch": 1.9136839384933455, + "grad_norm": 0.6445053815841675, + "learning_rate": 1.4505366318265278e-05, + "loss": 0.6709, + "num_input_tokens_seen": 121323520, + "step": 14810 + }, + { + "epoch": 1.9149760951027264, + "grad_norm": 0.3584889769554138, + "learning_rate": 1.4474672323329819e-05, + "loss": 0.6944, + "num_input_tokens_seen": 121405440, + "step": 14820 + }, + { + "epoch": 1.9162682517121075, + "grad_norm": 0.5627390742301941, + "learning_rate": 1.4443997600230832e-05, + "loss": 0.8012, + "num_input_tokens_seen": 121487360, + "step": 14830 + }, + { + "epoch": 1.9175604083214886, + "grad_norm": 0.6850423216819763, + "learning_rate": 1.4413342205133604e-05, + "loss": 0.9204, + "num_input_tokens_seen": 121569280, + "step": 14840 + }, + { + "epoch": 1.9188525649308696, + "grad_norm": 0.7621904611587524, + "learning_rate": 1.4382706194168066e-05, + "loss": 0.6648, + "num_input_tokens_seen": 121651200, + "step": 14850 + }, + { + "epoch": 1.9201447215402507, + "grad_norm": 0.636451244354248, + "learning_rate": 1.4352089623428627e-05, + "loss": 0.8368, + "num_input_tokens_seen": 121733120, + "step": 14860 + }, + { + "epoch": 1.9214368781496316, + "grad_norm": 0.5977798700332642, + "learning_rate": 1.4321492548974137e-05, + "loss": 1.248, + "num_input_tokens_seen": 121815040, + "step": 14870 + }, + { + "epoch": 1.922729034759013, + "grad_norm": 0.24377988278865814, + "learning_rate": 1.42909150268277e-05, + "loss": 1.0932, + "num_input_tokens_seen": 121896960, + "step": 14880 + }, + { + "epoch": 1.9240211913683938, + "grad_norm": 0.6114721894264221, + "learning_rate": 1.4260357112976664e-05, + "loss": 0.8562, + "num_input_tokens_seen": 121978880, + "step": 14890 + }, + { + "epoch": 1.9253133479777749, + "grad_norm": 0.5565524101257324, + "learning_rate": 1.4229818863372463e-05, + "loss": 0.9967, + "num_input_tokens_seen": 122060800, + "step": 14900 + }, + { + "epoch": 1.926605504587156, + "grad_norm": 0.9591652154922485, + "learning_rate": 1.4199300333930515e-05, + "loss": 1.2298, + "num_input_tokens_seen": 122142720, + "step": 14910 + }, + { + "epoch": 1.927897661196537, + "grad_norm": 0.7166655659675598, + "learning_rate": 1.4168801580530119e-05, + "loss": 0.6193, + "num_input_tokens_seen": 122224640, + "step": 14920 + }, + { + "epoch": 1.9291898178059181, + "grad_norm": 0.5449065566062927, + "learning_rate": 1.4138322659014408e-05, + "loss": 0.9493, + "num_input_tokens_seen": 122306560, + "step": 14930 + }, + { + "epoch": 1.930481974415299, + "grad_norm": 1.1408071517944336, + "learning_rate": 1.4107863625190163e-05, + "loss": 0.8788, + "num_input_tokens_seen": 122388480, + "step": 14940 + }, + { + "epoch": 1.9317741310246803, + "grad_norm": 0.3808801770210266, + "learning_rate": 1.4077424534827752e-05, + "loss": 0.7922, + "num_input_tokens_seen": 122470400, + "step": 14950 + }, + { + "epoch": 1.9330662876340612, + "grad_norm": 0.7727448344230652, + "learning_rate": 1.4047005443661048e-05, + "loss": 1.057, + "num_input_tokens_seen": 122552320, + "step": 14960 + }, + { + "epoch": 1.9343584442434423, + "grad_norm": 0.7547351121902466, + "learning_rate": 1.4016606407387312e-05, + "loss": 0.9366, + "num_input_tokens_seen": 122634240, + "step": 14970 + }, + { + "epoch": 1.9356506008528234, + "grad_norm": 0.9177370667457581, + "learning_rate": 1.398622748166704e-05, + "loss": 0.3856, + "num_input_tokens_seen": 122716160, + "step": 14980 + }, + { + "epoch": 1.9369427574622045, + "grad_norm": 0.519792914390564, + "learning_rate": 1.3955868722123955e-05, + "loss": 0.7705, + "num_input_tokens_seen": 122798080, + "step": 14990 + }, + { + "epoch": 1.9382349140715855, + "grad_norm": 0.8138200044631958, + "learning_rate": 1.3925530184344818e-05, + "loss": 0.7423, + "num_input_tokens_seen": 122880000, + "step": 15000 + }, + { + "epoch": 1.9395270706809664, + "grad_norm": 0.6865666508674622, + "learning_rate": 1.3895211923879397e-05, + "loss": 1.0697, + "num_input_tokens_seen": 122961920, + "step": 15010 + }, + { + "epoch": 1.9408192272903477, + "grad_norm": 0.6570258736610413, + "learning_rate": 1.3864913996240304e-05, + "loss": 0.967, + "num_input_tokens_seen": 123043840, + "step": 15020 + }, + { + "epoch": 1.9421113838997286, + "grad_norm": 1.129082441329956, + "learning_rate": 1.3834636456902944e-05, + "loss": 0.8996, + "num_input_tokens_seen": 123125760, + "step": 15030 + }, + { + "epoch": 1.9434035405091097, + "grad_norm": 0.1828085333108902, + "learning_rate": 1.3804379361305363e-05, + "loss": 0.5727, + "num_input_tokens_seen": 123207680, + "step": 15040 + }, + { + "epoch": 1.9446956971184908, + "grad_norm": 0.5514014363288879, + "learning_rate": 1.3774142764848207e-05, + "loss": 0.9822, + "num_input_tokens_seen": 123289600, + "step": 15050 + }, + { + "epoch": 1.9459878537278719, + "grad_norm": 0.51252681016922, + "learning_rate": 1.3743926722894579e-05, + "loss": 0.7735, + "num_input_tokens_seen": 123371520, + "step": 15060 + }, + { + "epoch": 1.947280010337253, + "grad_norm": 0.6373499631881714, + "learning_rate": 1.3713731290769921e-05, + "loss": 1.0148, + "num_input_tokens_seen": 123453440, + "step": 15070 + }, + { + "epoch": 1.9485721669466338, + "grad_norm": 0.7882586121559143, + "learning_rate": 1.3683556523761981e-05, + "loss": 0.8874, + "num_input_tokens_seen": 123535360, + "step": 15080 + }, + { + "epoch": 1.9498643235560151, + "grad_norm": 0.8441190719604492, + "learning_rate": 1.365340247712064e-05, + "loss": 0.9097, + "num_input_tokens_seen": 123617280, + "step": 15090 + }, + { + "epoch": 1.951156480165396, + "grad_norm": 0.6517019271850586, + "learning_rate": 1.362326920605783e-05, + "loss": 0.8473, + "num_input_tokens_seen": 123699200, + "step": 15100 + }, + { + "epoch": 1.952448636774777, + "grad_norm": 0.4873802363872528, + "learning_rate": 1.3593156765747483e-05, + "loss": 0.7964, + "num_input_tokens_seen": 123781120, + "step": 15110 + }, + { + "epoch": 1.9537407933841582, + "grad_norm": 0.6770155429840088, + "learning_rate": 1.3563065211325349e-05, + "loss": 0.795, + "num_input_tokens_seen": 123863040, + "step": 15120 + }, + { + "epoch": 1.9550329499935393, + "grad_norm": 0.4398241937160492, + "learning_rate": 1.3532994597888971e-05, + "loss": 0.6041, + "num_input_tokens_seen": 123944960, + "step": 15130 + }, + { + "epoch": 1.9563251066029204, + "grad_norm": 0.2949928939342499, + "learning_rate": 1.3502944980497514e-05, + "loss": 0.7019, + "num_input_tokens_seen": 124026880, + "step": 15140 + }, + { + "epoch": 1.9576172632123012, + "grad_norm": 0.7733688354492188, + "learning_rate": 1.3472916414171738e-05, + "loss": 1.2308, + "num_input_tokens_seen": 124108800, + "step": 15150 + }, + { + "epoch": 1.9589094198216825, + "grad_norm": 0.6464848518371582, + "learning_rate": 1.3442908953893816e-05, + "loss": 0.8531, + "num_input_tokens_seen": 124190720, + "step": 15160 + }, + { + "epoch": 1.9602015764310634, + "grad_norm": 0.7353049516677856, + "learning_rate": 1.3412922654607318e-05, + "loss": 0.8579, + "num_input_tokens_seen": 124272640, + "step": 15170 + }, + { + "epoch": 1.9614937330404445, + "grad_norm": 0.8654274344444275, + "learning_rate": 1.338295757121703e-05, + "loss": 0.7652, + "num_input_tokens_seen": 124354560, + "step": 15180 + }, + { + "epoch": 1.9627858896498256, + "grad_norm": 0.5775660872459412, + "learning_rate": 1.3353013758588923e-05, + "loss": 0.8972, + "num_input_tokens_seen": 124436480, + "step": 15190 + }, + { + "epoch": 1.9640780462592065, + "grad_norm": 0.5791903138160706, + "learning_rate": 1.3323091271550011e-05, + "loss": 0.8894, + "num_input_tokens_seen": 124518400, + "step": 15200 + }, + { + "epoch": 1.9653702028685878, + "grad_norm": 0.8218627572059631, + "learning_rate": 1.3293190164888242e-05, + "loss": 1.0825, + "num_input_tokens_seen": 124600320, + "step": 15210 + }, + { + "epoch": 1.9666623594779686, + "grad_norm": 0.3494950234889984, + "learning_rate": 1.3263310493352454e-05, + "loss": 0.8476, + "num_input_tokens_seen": 124682240, + "step": 15220 + }, + { + "epoch": 1.96795451608735, + "grad_norm": 0.5643335580825806, + "learning_rate": 1.3233452311652197e-05, + "loss": 0.6004, + "num_input_tokens_seen": 124764160, + "step": 15230 + }, + { + "epoch": 1.9692466726967308, + "grad_norm": 0.6971800923347473, + "learning_rate": 1.3203615674457709e-05, + "loss": 0.4468, + "num_input_tokens_seen": 124846080, + "step": 15240 + }, + { + "epoch": 1.970538829306112, + "grad_norm": 0.690518319606781, + "learning_rate": 1.3173800636399744e-05, + "loss": 0.704, + "num_input_tokens_seen": 124928000, + "step": 15250 + }, + { + "epoch": 1.971830985915493, + "grad_norm": 0.568469226360321, + "learning_rate": 1.3144007252069552e-05, + "loss": 0.5738, + "num_input_tokens_seen": 125009920, + "step": 15260 + }, + { + "epoch": 1.9731231425248739, + "grad_norm": 0.6457722783088684, + "learning_rate": 1.3114235576018686e-05, + "loss": 0.6915, + "num_input_tokens_seen": 125091840, + "step": 15270 + }, + { + "epoch": 1.9744152991342552, + "grad_norm": 0.6253653168678284, + "learning_rate": 1.3084485662758994e-05, + "loss": 0.6731, + "num_input_tokens_seen": 125173760, + "step": 15280 + }, + { + "epoch": 1.975707455743636, + "grad_norm": 0.6036949753761292, + "learning_rate": 1.3054757566762454e-05, + "loss": 0.8532, + "num_input_tokens_seen": 125255680, + "step": 15290 + }, + { + "epoch": 1.9769996123530171, + "grad_norm": 1.014550805091858, + "learning_rate": 1.3025051342461087e-05, + "loss": 1.3152, + "num_input_tokens_seen": 125337600, + "step": 15300 + }, + { + "epoch": 1.9782917689623982, + "grad_norm": 0.5557976961135864, + "learning_rate": 1.2995367044246903e-05, + "loss": 0.6177, + "num_input_tokens_seen": 125419520, + "step": 15310 + }, + { + "epoch": 1.9795839255717793, + "grad_norm": 1.1027765274047852, + "learning_rate": 1.2965704726471729e-05, + "loss": 0.7341, + "num_input_tokens_seen": 125501440, + "step": 15320 + }, + { + "epoch": 1.9808760821811604, + "grad_norm": 0.8762277364730835, + "learning_rate": 1.2936064443447157e-05, + "loss": 0.7912, + "num_input_tokens_seen": 125583360, + "step": 15330 + }, + { + "epoch": 1.9821682387905413, + "grad_norm": 0.5100016593933105, + "learning_rate": 1.2906446249444457e-05, + "loss": 1.0051, + "num_input_tokens_seen": 125665280, + "step": 15340 + }, + { + "epoch": 1.9834603953999226, + "grad_norm": 0.583299994468689, + "learning_rate": 1.2876850198694409e-05, + "loss": 0.4944, + "num_input_tokens_seen": 125747200, + "step": 15350 + }, + { + "epoch": 1.9847525520093035, + "grad_norm": 0.277055025100708, + "learning_rate": 1.2847276345387299e-05, + "loss": 0.7472, + "num_input_tokens_seen": 125829120, + "step": 15360 + }, + { + "epoch": 1.9860447086186845, + "grad_norm": 1.0343480110168457, + "learning_rate": 1.2817724743672715e-05, + "loss": 0.6023, + "num_input_tokens_seen": 125911040, + "step": 15370 + }, + { + "epoch": 1.9873368652280656, + "grad_norm": 0.6616178750991821, + "learning_rate": 1.2788195447659562e-05, + "loss": 1.1141, + "num_input_tokens_seen": 125992960, + "step": 15380 + }, + { + "epoch": 1.9886290218374467, + "grad_norm": 0.5394881367683411, + "learning_rate": 1.2758688511415848e-05, + "loss": 0.9213, + "num_input_tokens_seen": 126074880, + "step": 15390 + }, + { + "epoch": 1.9899211784468278, + "grad_norm": 0.7356629967689514, + "learning_rate": 1.2729203988968674e-05, + "loss": 0.7153, + "num_input_tokens_seen": 126156800, + "step": 15400 + }, + { + "epoch": 1.9912133350562087, + "grad_norm": 0.22516459226608276, + "learning_rate": 1.2699741934304104e-05, + "loss": 0.5926, + "num_input_tokens_seen": 126238720, + "step": 15410 + }, + { + "epoch": 1.99250549166559, + "grad_norm": 0.6779276728630066, + "learning_rate": 1.2670302401367035e-05, + "loss": 0.9406, + "num_input_tokens_seen": 126320640, + "step": 15420 + }, + { + "epoch": 1.9937976482749709, + "grad_norm": 0.5746923089027405, + "learning_rate": 1.2640885444061163e-05, + "loss": 0.9174, + "num_input_tokens_seen": 126402560, + "step": 15430 + }, + { + "epoch": 1.995089804884352, + "grad_norm": 0.6525429487228394, + "learning_rate": 1.2611491116248802e-05, + "loss": 1.254, + "num_input_tokens_seen": 126484480, + "step": 15440 + }, + { + "epoch": 1.996381961493733, + "grad_norm": 0.30466407537460327, + "learning_rate": 1.2582119471750888e-05, + "loss": 0.7516, + "num_input_tokens_seen": 126566400, + "step": 15450 + }, + { + "epoch": 1.9976741181031141, + "grad_norm": 0.6706373691558838, + "learning_rate": 1.2552770564346781e-05, + "loss": 0.9337, + "num_input_tokens_seen": 126648320, + "step": 15460 + }, + { + "epoch": 1.9989662747124952, + "grad_norm": 0.7642003297805786, + "learning_rate": 1.2523444447774213e-05, + "loss": 0.9429, + "num_input_tokens_seen": 126730240, + "step": 15470 + }, + { + "epoch": 2.000258431321876, + "grad_norm": 0.71830153465271, + "learning_rate": 1.2494141175729216e-05, + "loss": 0.9215, + "num_input_tokens_seen": 126812160, + "step": 15480 + }, + { + "epoch": 2.0015505879312574, + "grad_norm": 0.5191543698310852, + "learning_rate": 1.2464860801865954e-05, + "loss": 0.611, + "num_input_tokens_seen": 126894080, + "step": 15490 + }, + { + "epoch": 2.0028427445406383, + "grad_norm": 0.8038392066955566, + "learning_rate": 1.2435603379796704e-05, + "loss": 1.1059, + "num_input_tokens_seen": 126976000, + "step": 15500 + }, + { + "epoch": 2.0041349011500196, + "grad_norm": 0.5426739454269409, + "learning_rate": 1.240636896309168e-05, + "loss": 0.9048, + "num_input_tokens_seen": 127057920, + "step": 15510 + }, + { + "epoch": 2.0054270577594004, + "grad_norm": 0.6243565082550049, + "learning_rate": 1.237715760527901e-05, + "loss": 0.7904, + "num_input_tokens_seen": 127139840, + "step": 15520 + }, + { + "epoch": 2.0067192143687813, + "grad_norm": 0.35670045018196106, + "learning_rate": 1.2347969359844566e-05, + "loss": 0.7196, + "num_input_tokens_seen": 127221760, + "step": 15530 + }, + { + "epoch": 2.0080113709781626, + "grad_norm": 0.9198890924453735, + "learning_rate": 1.2318804280231939e-05, + "loss": 1.102, + "num_input_tokens_seen": 127303680, + "step": 15540 + }, + { + "epoch": 2.0093035275875435, + "grad_norm": 0.5498625636100769, + "learning_rate": 1.2289662419842258e-05, + "loss": 0.5505, + "num_input_tokens_seen": 127385600, + "step": 15550 + }, + { + "epoch": 2.010595684196925, + "grad_norm": 0.872312068939209, + "learning_rate": 1.2260543832034177e-05, + "loss": 0.6824, + "num_input_tokens_seen": 127467520, + "step": 15560 + }, + { + "epoch": 2.0118878408063057, + "grad_norm": 0.6140616536140442, + "learning_rate": 1.2231448570123732e-05, + "loss": 0.9454, + "num_input_tokens_seen": 127549440, + "step": 15570 + }, + { + "epoch": 2.013179997415687, + "grad_norm": 0.32139700651168823, + "learning_rate": 1.2202376687384223e-05, + "loss": 0.9467, + "num_input_tokens_seen": 127631360, + "step": 15580 + }, + { + "epoch": 2.014472154025068, + "grad_norm": 0.5901627540588379, + "learning_rate": 1.2173328237046178e-05, + "loss": 0.6482, + "num_input_tokens_seen": 127713280, + "step": 15590 + }, + { + "epoch": 2.0157643106344487, + "grad_norm": 0.9374563694000244, + "learning_rate": 1.2144303272297186e-05, + "loss": 0.547, + "num_input_tokens_seen": 127795200, + "step": 15600 + }, + { + "epoch": 2.01705646724383, + "grad_norm": 0.6503939032554626, + "learning_rate": 1.2115301846281871e-05, + "loss": 0.9459, + "num_input_tokens_seen": 127877120, + "step": 15610 + }, + { + "epoch": 2.018348623853211, + "grad_norm": 0.645089864730835, + "learning_rate": 1.2086324012101716e-05, + "loss": 0.8437, + "num_input_tokens_seen": 127959040, + "step": 15620 + }, + { + "epoch": 2.019640780462592, + "grad_norm": 0.6811534762382507, + "learning_rate": 1.2057369822815051e-05, + "loss": 0.9276, + "num_input_tokens_seen": 128040960, + "step": 15630 + }, + { + "epoch": 2.020932937071973, + "grad_norm": 0.5539160370826721, + "learning_rate": 1.2028439331436869e-05, + "loss": 0.8422, + "num_input_tokens_seen": 128122880, + "step": 15640 + }, + { + "epoch": 2.0222250936813544, + "grad_norm": 0.5917775630950928, + "learning_rate": 1.1999532590938817e-05, + "loss": 0.9474, + "num_input_tokens_seen": 128204800, + "step": 15650 + }, + { + "epoch": 2.0235172502907353, + "grad_norm": 0.35007065534591675, + "learning_rate": 1.1970649654249017e-05, + "loss": 0.5438, + "num_input_tokens_seen": 128286720, + "step": 15660 + }, + { + "epoch": 2.024809406900116, + "grad_norm": 0.8497019410133362, + "learning_rate": 1.1941790574252013e-05, + "loss": 0.5885, + "num_input_tokens_seen": 128368640, + "step": 15670 + }, + { + "epoch": 2.0261015635094974, + "grad_norm": 0.6658693552017212, + "learning_rate": 1.1912955403788695e-05, + "loss": 0.878, + "num_input_tokens_seen": 128450560, + "step": 15680 + }, + { + "epoch": 2.0273937201188783, + "grad_norm": 0.6294447779655457, + "learning_rate": 1.1884144195656133e-05, + "loss": 0.7301, + "num_input_tokens_seen": 128532480, + "step": 15690 + }, + { + "epoch": 2.0286858767282596, + "grad_norm": 1.484786868095398, + "learning_rate": 1.1855357002607556e-05, + "loss": 0.3667, + "num_input_tokens_seen": 128614400, + "step": 15700 + }, + { + "epoch": 2.0299780333376405, + "grad_norm": 1.5291662216186523, + "learning_rate": 1.1826593877352216e-05, + "loss": 0.6625, + "num_input_tokens_seen": 128696320, + "step": 15710 + }, + { + "epoch": 2.031270189947022, + "grad_norm": 0.41958874464035034, + "learning_rate": 1.1797854872555272e-05, + "loss": 0.8176, + "num_input_tokens_seen": 128778240, + "step": 15720 + }, + { + "epoch": 2.0325623465564027, + "grad_norm": 0.7266905307769775, + "learning_rate": 1.1769140040837755e-05, + "loss": 0.8307, + "num_input_tokens_seen": 128860160, + "step": 15730 + }, + { + "epoch": 2.0338545031657835, + "grad_norm": 0.606711208820343, + "learning_rate": 1.1740449434776402e-05, + "loss": 0.4084, + "num_input_tokens_seen": 128942080, + "step": 15740 + }, + { + "epoch": 2.035146659775165, + "grad_norm": 0.7246209979057312, + "learning_rate": 1.171178310690362e-05, + "loss": 0.664, + "num_input_tokens_seen": 129024000, + "step": 15750 + }, + { + "epoch": 2.0364388163845457, + "grad_norm": 0.588646650314331, + "learning_rate": 1.1683141109707339e-05, + "loss": 0.7079, + "num_input_tokens_seen": 129105920, + "step": 15760 + }, + { + "epoch": 2.037730972993927, + "grad_norm": 0.6580897569656372, + "learning_rate": 1.165452349563095e-05, + "loss": 0.9028, + "num_input_tokens_seen": 129187840, + "step": 15770 + }, + { + "epoch": 2.039023129603308, + "grad_norm": 0.7622004747390747, + "learning_rate": 1.1625930317073221e-05, + "loss": 0.6014, + "num_input_tokens_seen": 129269760, + "step": 15780 + }, + { + "epoch": 2.0403152862126888, + "grad_norm": 0.7260866761207581, + "learning_rate": 1.159736162638813e-05, + "loss": 0.7052, + "num_input_tokens_seen": 129351680, + "step": 15790 + }, + { + "epoch": 2.04160744282207, + "grad_norm": 0.6975520849227905, + "learning_rate": 1.1568817475884868e-05, + "loss": 0.5514, + "num_input_tokens_seen": 129433600, + "step": 15800 + }, + { + "epoch": 2.042899599431451, + "grad_norm": 1.0300558805465698, + "learning_rate": 1.154029791782765e-05, + "loss": 0.9024, + "num_input_tokens_seen": 129515520, + "step": 15810 + }, + { + "epoch": 2.0441917560408323, + "grad_norm": 0.5849384069442749, + "learning_rate": 1.1511803004435704e-05, + "loss": 0.8982, + "num_input_tokens_seen": 129597440, + "step": 15820 + }, + { + "epoch": 2.045483912650213, + "grad_norm": 0.6071699857711792, + "learning_rate": 1.1483332787883096e-05, + "loss": 0.8733, + "num_input_tokens_seen": 129679360, + "step": 15830 + }, + { + "epoch": 2.0467760692595944, + "grad_norm": 0.26067107915878296, + "learning_rate": 1.1454887320298686e-05, + "loss": 0.7505, + "num_input_tokens_seen": 129761280, + "step": 15840 + }, + { + "epoch": 2.0480682258689753, + "grad_norm": 0.8173131942749023, + "learning_rate": 1.1426466653766036e-05, + "loss": 0.6831, + "num_input_tokens_seen": 129843200, + "step": 15850 + }, + { + "epoch": 2.049360382478356, + "grad_norm": 0.7379295229911804, + "learning_rate": 1.1398070840323264e-05, + "loss": 0.8099, + "num_input_tokens_seen": 129925120, + "step": 15860 + }, + { + "epoch": 2.0506525390877375, + "grad_norm": 0.6924763917922974, + "learning_rate": 1.1369699931963018e-05, + "loss": 0.7674, + "num_input_tokens_seen": 130007040, + "step": 15870 + }, + { + "epoch": 2.0519446956971183, + "grad_norm": 0.8446022272109985, + "learning_rate": 1.1341353980632313e-05, + "loss": 0.9515, + "num_input_tokens_seen": 130088960, + "step": 15880 + }, + { + "epoch": 2.0532368523064997, + "grad_norm": 0.9424275755882263, + "learning_rate": 1.1313033038232498e-05, + "loss": 0.9029, + "num_input_tokens_seen": 130170880, + "step": 15890 + }, + { + "epoch": 2.0545290089158805, + "grad_norm": 0.9581369161605835, + "learning_rate": 1.1284737156619096e-05, + "loss": 0.7374, + "num_input_tokens_seen": 130252800, + "step": 15900 + }, + { + "epoch": 2.055821165525262, + "grad_norm": 0.6615799069404602, + "learning_rate": 1.1256466387601782e-05, + "loss": 0.7558, + "num_input_tokens_seen": 130334720, + "step": 15910 + }, + { + "epoch": 2.0571133221346427, + "grad_norm": 0.3666354715824127, + "learning_rate": 1.1228220782944212e-05, + "loss": 0.7905, + "num_input_tokens_seen": 130416640, + "step": 15920 + }, + { + "epoch": 2.0584054787440236, + "grad_norm": 0.6414427757263184, + "learning_rate": 1.1200000394363996e-05, + "loss": 0.9914, + "num_input_tokens_seen": 130498560, + "step": 15930 + }, + { + "epoch": 2.059697635353405, + "grad_norm": 0.23354114592075348, + "learning_rate": 1.1171805273532567e-05, + "loss": 0.7977, + "num_input_tokens_seen": 130580480, + "step": 15940 + }, + { + "epoch": 2.0609897919627858, + "grad_norm": 0.4102308750152588, + "learning_rate": 1.1143635472075074e-05, + "loss": 0.8725, + "num_input_tokens_seen": 130662400, + "step": 15950 + }, + { + "epoch": 2.062281948572167, + "grad_norm": 1.2794528007507324, + "learning_rate": 1.1115491041570337e-05, + "loss": 1.0455, + "num_input_tokens_seen": 130744320, + "step": 15960 + }, + { + "epoch": 2.063574105181548, + "grad_norm": 0.6293236613273621, + "learning_rate": 1.1087372033550685e-05, + "loss": 0.8646, + "num_input_tokens_seen": 130826240, + "step": 15970 + }, + { + "epoch": 2.0648662617909292, + "grad_norm": 0.7114168405532837, + "learning_rate": 1.105927849950194e-05, + "loss": 0.9453, + "num_input_tokens_seen": 130908160, + "step": 15980 + }, + { + "epoch": 2.06615841840031, + "grad_norm": 0.9560719728469849, + "learning_rate": 1.103121049086324e-05, + "loss": 0.822, + "num_input_tokens_seen": 130990080, + "step": 15990 + }, + { + "epoch": 2.067450575009691, + "grad_norm": 0.36301013827323914, + "learning_rate": 1.1003168059027025e-05, + "loss": 0.8453, + "num_input_tokens_seen": 131072000, + "step": 16000 + }, + { + "epoch": 2.0687427316190723, + "grad_norm": 0.5861186385154724, + "learning_rate": 1.0975151255338867e-05, + "loss": 0.9873, + "num_input_tokens_seen": 131153920, + "step": 16010 + }, + { + "epoch": 2.070034888228453, + "grad_norm": 0.6049230098724365, + "learning_rate": 1.094716013109745e-05, + "loss": 0.6615, + "num_input_tokens_seen": 131235840, + "step": 16020 + }, + { + "epoch": 2.0713270448378345, + "grad_norm": 0.3231169283390045, + "learning_rate": 1.0919194737554409e-05, + "loss": 0.5598, + "num_input_tokens_seen": 131317760, + "step": 16030 + }, + { + "epoch": 2.0726192014472153, + "grad_norm": 0.7107923030853271, + "learning_rate": 1.0891255125914269e-05, + "loss": 0.8139, + "num_input_tokens_seen": 131399680, + "step": 16040 + }, + { + "epoch": 2.0739113580565967, + "grad_norm": 0.196472629904747, + "learning_rate": 1.0863341347334376e-05, + "loss": 0.7574, + "num_input_tokens_seen": 131481600, + "step": 16050 + }, + { + "epoch": 2.0752035146659775, + "grad_norm": 0.9992514848709106, + "learning_rate": 1.0835453452924737e-05, + "loss": 0.578, + "num_input_tokens_seen": 131563520, + "step": 16060 + }, + { + "epoch": 2.0764956712753584, + "grad_norm": 0.8521533608436584, + "learning_rate": 1.0807591493747992e-05, + "loss": 0.6804, + "num_input_tokens_seen": 131645440, + "step": 16070 + }, + { + "epoch": 2.0777878278847397, + "grad_norm": 0.5789138078689575, + "learning_rate": 1.0779755520819302e-05, + "loss": 0.5871, + "num_input_tokens_seen": 131727360, + "step": 16080 + }, + { + "epoch": 2.0790799844941206, + "grad_norm": 0.27690398693084717, + "learning_rate": 1.0751945585106205e-05, + "loss": 0.8846, + "num_input_tokens_seen": 131809280, + "step": 16090 + }, + { + "epoch": 2.080372141103502, + "grad_norm": 0.5583693981170654, + "learning_rate": 1.0724161737528616e-05, + "loss": 0.9714, + "num_input_tokens_seen": 131891200, + "step": 16100 + }, + { + "epoch": 2.0816642977128827, + "grad_norm": 1.2210655212402344, + "learning_rate": 1.0696404028958634e-05, + "loss": 0.7469, + "num_input_tokens_seen": 131973120, + "step": 16110 + }, + { + "epoch": 2.082956454322264, + "grad_norm": 1.0592528581619263, + "learning_rate": 1.0668672510220548e-05, + "loss": 1.0468, + "num_input_tokens_seen": 132055040, + "step": 16120 + }, + { + "epoch": 2.084248610931645, + "grad_norm": 0.7899770140647888, + "learning_rate": 1.0640967232090643e-05, + "loss": 0.4886, + "num_input_tokens_seen": 132136960, + "step": 16130 + }, + { + "epoch": 2.085540767541026, + "grad_norm": 0.5646101832389832, + "learning_rate": 1.0613288245297193e-05, + "loss": 0.9022, + "num_input_tokens_seen": 132218880, + "step": 16140 + }, + { + "epoch": 2.086832924150407, + "grad_norm": 0.5926666855812073, + "learning_rate": 1.0585635600520327e-05, + "loss": 0.9351, + "num_input_tokens_seen": 132300800, + "step": 16150 + }, + { + "epoch": 2.088125080759788, + "grad_norm": 1.35215163230896, + "learning_rate": 1.0558009348391926e-05, + "loss": 0.7735, + "num_input_tokens_seen": 132382720, + "step": 16160 + }, + { + "epoch": 2.0894172373691693, + "grad_norm": 0.7683168649673462, + "learning_rate": 1.053040953949557e-05, + "loss": 0.6433, + "num_input_tokens_seen": 132464640, + "step": 16170 + }, + { + "epoch": 2.09070939397855, + "grad_norm": 0.49856263399124146, + "learning_rate": 1.0502836224366389e-05, + "loss": 0.9383, + "num_input_tokens_seen": 132546560, + "step": 16180 + }, + { + "epoch": 2.0920015505879315, + "grad_norm": 1.046005368232727, + "learning_rate": 1.0475289453491038e-05, + "loss": 0.8762, + "num_input_tokens_seen": 132628480, + "step": 16190 + }, + { + "epoch": 2.0932937071973123, + "grad_norm": 0.6279125213623047, + "learning_rate": 1.0447769277307554e-05, + "loss": 0.7354, + "num_input_tokens_seen": 132710400, + "step": 16200 + }, + { + "epoch": 2.094585863806693, + "grad_norm": 0.666612982749939, + "learning_rate": 1.042027574620526e-05, + "loss": 0.6087, + "num_input_tokens_seen": 132792320, + "step": 16210 + }, + { + "epoch": 2.0958780204160745, + "grad_norm": 0.6725567579269409, + "learning_rate": 1.0392808910524735e-05, + "loss": 0.8064, + "num_input_tokens_seen": 132874240, + "step": 16220 + }, + { + "epoch": 2.0971701770254554, + "grad_norm": 0.7392178773880005, + "learning_rate": 1.0365368820557633e-05, + "loss": 0.6069, + "num_input_tokens_seen": 132956160, + "step": 16230 + }, + { + "epoch": 2.0984623336348367, + "grad_norm": 0.7892594337463379, + "learning_rate": 1.0337955526546678e-05, + "loss": 0.6704, + "num_input_tokens_seen": 133038080, + "step": 16240 + }, + { + "epoch": 2.0997544902442176, + "grad_norm": 0.6627844572067261, + "learning_rate": 1.0310569078685494e-05, + "loss": 0.764, + "num_input_tokens_seen": 133120000, + "step": 16250 + }, + { + "epoch": 2.1010466468535984, + "grad_norm": 0.22952121496200562, + "learning_rate": 1.0283209527118584e-05, + "loss": 0.7597, + "num_input_tokens_seen": 133201920, + "step": 16260 + }, + { + "epoch": 2.1023388034629797, + "grad_norm": 1.4370572566986084, + "learning_rate": 1.0255876921941165e-05, + "loss": 0.8361, + "num_input_tokens_seen": 133283840, + "step": 16270 + }, + { + "epoch": 2.1036309600723606, + "grad_norm": 1.0919463634490967, + "learning_rate": 1.0228571313199161e-05, + "loss": 0.7833, + "num_input_tokens_seen": 133365760, + "step": 16280 + }, + { + "epoch": 2.104923116681742, + "grad_norm": 0.7926573753356934, + "learning_rate": 1.0201292750889022e-05, + "loss": 0.6994, + "num_input_tokens_seen": 133447680, + "step": 16290 + }, + { + "epoch": 2.106215273291123, + "grad_norm": 0.636835515499115, + "learning_rate": 1.0174041284957703e-05, + "loss": 0.549, + "num_input_tokens_seen": 133529600, + "step": 16300 + }, + { + "epoch": 2.107507429900504, + "grad_norm": 0.7572881579399109, + "learning_rate": 1.0146816965302546e-05, + "loss": 0.966, + "num_input_tokens_seen": 133611520, + "step": 16310 + }, + { + "epoch": 2.108799586509885, + "grad_norm": 0.6216884255409241, + "learning_rate": 1.011961984177117e-05, + "loss": 0.8817, + "num_input_tokens_seen": 133693440, + "step": 16320 + }, + { + "epoch": 2.1100917431192663, + "grad_norm": 1.0052217245101929, + "learning_rate": 1.0092449964161416e-05, + "loss": 0.7589, + "num_input_tokens_seen": 133775360, + "step": 16330 + }, + { + "epoch": 2.111383899728647, + "grad_norm": 0.8983517289161682, + "learning_rate": 1.006530738222122e-05, + "loss": 0.7143, + "num_input_tokens_seen": 133857280, + "step": 16340 + }, + { + "epoch": 2.112676056338028, + "grad_norm": 0.5024335384368896, + "learning_rate": 1.0038192145648567e-05, + "loss": 0.9325, + "num_input_tokens_seen": 133939200, + "step": 16350 + }, + { + "epoch": 2.1139682129474093, + "grad_norm": 1.1501423120498657, + "learning_rate": 1.001110430409134e-05, + "loss": 0.8195, + "num_input_tokens_seen": 134021120, + "step": 16360 + }, + { + "epoch": 2.11526036955679, + "grad_norm": 0.6280581951141357, + "learning_rate": 9.98404390714729e-06, + "loss": 0.6386, + "num_input_tokens_seen": 134103040, + "step": 16370 + }, + { + "epoch": 2.1165525261661715, + "grad_norm": 0.6069371104240417, + "learning_rate": 9.95701100436389e-06, + "loss": 0.8842, + "num_input_tokens_seen": 134184960, + "step": 16380 + }, + { + "epoch": 2.1178446827755524, + "grad_norm": 0.8397451639175415, + "learning_rate": 9.930005645238302e-06, + "loss": 1.0033, + "num_input_tokens_seen": 134266880, + "step": 16390 + }, + { + "epoch": 2.1191368393849332, + "grad_norm": 0.9031828045845032, + "learning_rate": 9.903027879217237e-06, + "loss": 0.7303, + "num_input_tokens_seen": 134348800, + "step": 16400 + }, + { + "epoch": 2.1204289959943146, + "grad_norm": 0.8852018117904663, + "learning_rate": 9.876077755696868e-06, + "loss": 0.4911, + "num_input_tokens_seen": 134430720, + "step": 16410 + }, + { + "epoch": 2.1217211526036954, + "grad_norm": 0.6135576963424683, + "learning_rate": 9.849155324022799e-06, + "loss": 0.8788, + "num_input_tokens_seen": 134512640, + "step": 16420 + }, + { + "epoch": 2.1230133092130767, + "grad_norm": 0.9423490762710571, + "learning_rate": 9.82226063348988e-06, + "loss": 0.7269, + "num_input_tokens_seen": 134594560, + "step": 16430 + }, + { + "epoch": 2.1243054658224576, + "grad_norm": 0.6482061147689819, + "learning_rate": 9.795393733342203e-06, + "loss": 0.8314, + "num_input_tokens_seen": 134676480, + "step": 16440 + }, + { + "epoch": 2.125597622431839, + "grad_norm": 1.0169423818588257, + "learning_rate": 9.76855467277297e-06, + "loss": 0.7974, + "num_input_tokens_seen": 134758400, + "step": 16450 + }, + { + "epoch": 2.12688977904122, + "grad_norm": 0.5455058813095093, + "learning_rate": 9.741743500924388e-06, + "loss": 0.8748, + "num_input_tokens_seen": 134840320, + "step": 16460 + }, + { + "epoch": 2.1281819356506007, + "grad_norm": 0.3267827033996582, + "learning_rate": 9.71496026688763e-06, + "loss": 0.6756, + "num_input_tokens_seen": 134922240, + "step": 16470 + }, + { + "epoch": 2.129474092259982, + "grad_norm": 0.6928475499153137, + "learning_rate": 9.688205019702684e-06, + "loss": 1.01, + "num_input_tokens_seen": 135004160, + "step": 16480 + }, + { + "epoch": 2.130766248869363, + "grad_norm": 0.7541202306747437, + "learning_rate": 9.661477808358323e-06, + "loss": 0.8252, + "num_input_tokens_seen": 135086080, + "step": 16490 + }, + { + "epoch": 2.132058405478744, + "grad_norm": 1.1462032794952393, + "learning_rate": 9.634778681791962e-06, + "loss": 0.8617, + "num_input_tokens_seen": 135168000, + "step": 16500 + }, + { + "epoch": 2.133350562088125, + "grad_norm": 0.8049100041389465, + "learning_rate": 9.608107688889609e-06, + "loss": 1.2436, + "num_input_tokens_seen": 135249920, + "step": 16510 + }, + { + "epoch": 2.1346427186975063, + "grad_norm": 0.6773203015327454, + "learning_rate": 9.581464878485764e-06, + "loss": 0.7527, + "num_input_tokens_seen": 135331840, + "step": 16520 + }, + { + "epoch": 2.135934875306887, + "grad_norm": 0.743137776851654, + "learning_rate": 9.554850299363294e-06, + "loss": 0.7333, + "num_input_tokens_seen": 135413760, + "step": 16530 + }, + { + "epoch": 2.137227031916268, + "grad_norm": 0.5760340094566345, + "learning_rate": 9.52826400025342e-06, + "loss": 1.0707, + "num_input_tokens_seen": 135495680, + "step": 16540 + }, + { + "epoch": 2.1385191885256494, + "grad_norm": 0.5763425230979919, + "learning_rate": 9.501706029835544e-06, + "loss": 0.7467, + "num_input_tokens_seen": 135577600, + "step": 16550 + }, + { + "epoch": 2.1398113451350302, + "grad_norm": 1.1024504899978638, + "learning_rate": 9.47517643673721e-06, + "loss": 0.6344, + "num_input_tokens_seen": 135659520, + "step": 16560 + }, + { + "epoch": 2.1411035017444116, + "grad_norm": 0.5545995235443115, + "learning_rate": 9.448675269534015e-06, + "loss": 0.844, + "num_input_tokens_seen": 135741440, + "step": 16570 + }, + { + "epoch": 2.1423956583537924, + "grad_norm": 0.710102915763855, + "learning_rate": 9.422202576749492e-06, + "loss": 0.9002, + "num_input_tokens_seen": 135823360, + "step": 16580 + }, + { + "epoch": 2.1436878149631737, + "grad_norm": 0.7064651846885681, + "learning_rate": 9.395758406855053e-06, + "loss": 0.9066, + "num_input_tokens_seen": 135905280, + "step": 16590 + }, + { + "epoch": 2.1449799715725546, + "grad_norm": 0.34538978338241577, + "learning_rate": 9.369342808269862e-06, + "loss": 0.8606, + "num_input_tokens_seen": 135987200, + "step": 16600 + }, + { + "epoch": 2.1462721281819355, + "grad_norm": 0.7004348635673523, + "learning_rate": 9.342955829360806e-06, + "loss": 0.8238, + "num_input_tokens_seen": 136069120, + "step": 16610 + }, + { + "epoch": 2.147564284791317, + "grad_norm": 0.664341390132904, + "learning_rate": 9.31659751844232e-06, + "loss": 0.571, + "num_input_tokens_seen": 136151040, + "step": 16620 + }, + { + "epoch": 2.1488564414006976, + "grad_norm": 0.9971612691879272, + "learning_rate": 9.290267923776397e-06, + "loss": 0.7521, + "num_input_tokens_seen": 136232960, + "step": 16630 + }, + { + "epoch": 2.150148598010079, + "grad_norm": 0.39424338936805725, + "learning_rate": 9.263967093572412e-06, + "loss": 0.3768, + "num_input_tokens_seen": 136314880, + "step": 16640 + }, + { + "epoch": 2.15144075461946, + "grad_norm": 0.7910518646240234, + "learning_rate": 9.237695075987106e-06, + "loss": 0.727, + "num_input_tokens_seen": 136396800, + "step": 16650 + }, + { + "epoch": 2.152732911228841, + "grad_norm": 0.6754958629608154, + "learning_rate": 9.211451919124429e-06, + "loss": 0.8798, + "num_input_tokens_seen": 136478720, + "step": 16660 + }, + { + "epoch": 2.154025067838222, + "grad_norm": 0.6639232039451599, + "learning_rate": 9.185237671035512e-06, + "loss": 1.1385, + "num_input_tokens_seen": 136560640, + "step": 16670 + }, + { + "epoch": 2.155317224447603, + "grad_norm": 0.8237486481666565, + "learning_rate": 9.15905237971856e-06, + "loss": 0.8854, + "num_input_tokens_seen": 136642560, + "step": 16680 + }, + { + "epoch": 2.156609381056984, + "grad_norm": 0.863280713558197, + "learning_rate": 9.132896093118726e-06, + "loss": 0.6467, + "num_input_tokens_seen": 136724480, + "step": 16690 + }, + { + "epoch": 2.157901537666365, + "grad_norm": 0.24760442972183228, + "learning_rate": 9.10676885912809e-06, + "loss": 0.7871, + "num_input_tokens_seen": 136806400, + "step": 16700 + }, + { + "epoch": 2.1591936942757464, + "grad_norm": 0.5246381759643555, + "learning_rate": 9.080670725585511e-06, + "loss": 0.3123, + "num_input_tokens_seen": 136888320, + "step": 16710 + }, + { + "epoch": 2.1604858508851272, + "grad_norm": 0.6992424726486206, + "learning_rate": 9.054601740276586e-06, + "loss": 0.7407, + "num_input_tokens_seen": 136970240, + "step": 16720 + }, + { + "epoch": 2.161778007494508, + "grad_norm": 0.7333977222442627, + "learning_rate": 9.028561950933517e-06, + "loss": 0.6363, + "num_input_tokens_seen": 137052160, + "step": 16730 + }, + { + "epoch": 2.1630701641038894, + "grad_norm": 0.5862302184104919, + "learning_rate": 9.002551405235082e-06, + "loss": 0.7121, + "num_input_tokens_seen": 137134080, + "step": 16740 + }, + { + "epoch": 2.1643623207132703, + "grad_norm": 0.36307257413864136, + "learning_rate": 8.976570150806486e-06, + "loss": 0.9597, + "num_input_tokens_seen": 137216000, + "step": 16750 + }, + { + "epoch": 2.1656544773226516, + "grad_norm": 0.8547699451446533, + "learning_rate": 8.950618235219302e-06, + "loss": 0.7746, + "num_input_tokens_seen": 137297920, + "step": 16760 + }, + { + "epoch": 2.1669466339320325, + "grad_norm": 0.5659860372543335, + "learning_rate": 8.924695705991407e-06, + "loss": 0.4788, + "num_input_tokens_seen": 137379840, + "step": 16770 + }, + { + "epoch": 2.1682387905414138, + "grad_norm": 0.7544788122177124, + "learning_rate": 8.898802610586843e-06, + "loss": 0.6243, + "num_input_tokens_seen": 137461760, + "step": 16780 + }, + { + "epoch": 2.1695309471507946, + "grad_norm": 0.728805661201477, + "learning_rate": 8.872938996415791e-06, + "loss": 0.9596, + "num_input_tokens_seen": 137543680, + "step": 16790 + }, + { + "epoch": 2.170823103760176, + "grad_norm": 0.7234178185462952, + "learning_rate": 8.847104910834414e-06, + "loss": 0.83, + "num_input_tokens_seen": 137625600, + "step": 16800 + }, + { + "epoch": 2.172115260369557, + "grad_norm": 0.29097720980644226, + "learning_rate": 8.821300401144836e-06, + "loss": 0.8546, + "num_input_tokens_seen": 137707520, + "step": 16810 + }, + { + "epoch": 2.1734074169789377, + "grad_norm": 0.6291516423225403, + "learning_rate": 8.795525514595032e-06, + "loss": 0.7418, + "num_input_tokens_seen": 137789440, + "step": 16820 + }, + { + "epoch": 2.174699573588319, + "grad_norm": 0.7936385273933411, + "learning_rate": 8.769780298378705e-06, + "loss": 0.9961, + "num_input_tokens_seen": 137871360, + "step": 16830 + }, + { + "epoch": 2.1759917301977, + "grad_norm": 0.8866328001022339, + "learning_rate": 8.74406479963527e-06, + "loss": 0.5871, + "num_input_tokens_seen": 137953280, + "step": 16840 + }, + { + "epoch": 2.177283886807081, + "grad_norm": 1.1491591930389404, + "learning_rate": 8.718379065449694e-06, + "loss": 1.1771, + "num_input_tokens_seen": 138035200, + "step": 16850 + }, + { + "epoch": 2.178576043416462, + "grad_norm": 0.9200372099876404, + "learning_rate": 8.69272314285248e-06, + "loss": 0.5562, + "num_input_tokens_seen": 138117120, + "step": 16860 + }, + { + "epoch": 2.179868200025843, + "grad_norm": 0.7428059577941895, + "learning_rate": 8.667097078819511e-06, + "loss": 1.0242, + "num_input_tokens_seen": 138199040, + "step": 16870 + }, + { + "epoch": 2.1811603566352242, + "grad_norm": 0.7725750803947449, + "learning_rate": 8.641500920272022e-06, + "loss": 0.8353, + "num_input_tokens_seen": 138280960, + "step": 16880 + }, + { + "epoch": 2.182452513244605, + "grad_norm": 0.8504687547683716, + "learning_rate": 8.6159347140765e-06, + "loss": 0.7933, + "num_input_tokens_seen": 138362880, + "step": 16890 + }, + { + "epoch": 2.1837446698539864, + "grad_norm": 0.8972610831260681, + "learning_rate": 8.59039850704455e-06, + "loss": 0.9106, + "num_input_tokens_seen": 138444800, + "step": 16900 + }, + { + "epoch": 2.1850368264633673, + "grad_norm": 0.9138893485069275, + "learning_rate": 8.564892345932899e-06, + "loss": 0.9258, + "num_input_tokens_seen": 138526720, + "step": 16910 + }, + { + "epoch": 2.1863289830727486, + "grad_norm": 0.8643404841423035, + "learning_rate": 8.539416277443218e-06, + "loss": 0.6456, + "num_input_tokens_seen": 138608640, + "step": 16920 + }, + { + "epoch": 2.1876211396821295, + "grad_norm": 0.5902085304260254, + "learning_rate": 8.513970348222095e-06, + "loss": 0.7899, + "num_input_tokens_seen": 138690560, + "step": 16930 + }, + { + "epoch": 2.1889132962915103, + "grad_norm": 0.7431287169456482, + "learning_rate": 8.488554604860947e-06, + "loss": 1.0066, + "num_input_tokens_seen": 138772480, + "step": 16940 + }, + { + "epoch": 2.1902054529008916, + "grad_norm": 0.330143541097641, + "learning_rate": 8.463169093895887e-06, + "loss": 0.5788, + "num_input_tokens_seen": 138854400, + "step": 16950 + }, + { + "epoch": 2.1914976095102725, + "grad_norm": 0.7542316913604736, + "learning_rate": 8.437813861807712e-06, + "loss": 0.6761, + "num_input_tokens_seen": 138936320, + "step": 16960 + }, + { + "epoch": 2.192789766119654, + "grad_norm": 0.6555983424186707, + "learning_rate": 8.412488955021744e-06, + "loss": 1.3443, + "num_input_tokens_seen": 139018240, + "step": 16970 + }, + { + "epoch": 2.1940819227290347, + "grad_norm": 0.9010647535324097, + "learning_rate": 8.38719441990781e-06, + "loss": 0.8272, + "num_input_tokens_seen": 139100160, + "step": 16980 + }, + { + "epoch": 2.195374079338416, + "grad_norm": 0.4530339539051056, + "learning_rate": 8.361930302780091e-06, + "loss": 0.6814, + "num_input_tokens_seen": 139182080, + "step": 16990 + }, + { + "epoch": 2.196666235947797, + "grad_norm": 0.5172061324119568, + "learning_rate": 8.336696649897116e-06, + "loss": 0.5844, + "num_input_tokens_seen": 139264000, + "step": 17000 + }, + { + "epoch": 2.1979583925571777, + "grad_norm": 1.0065109729766846, + "learning_rate": 8.311493507461593e-06, + "loss": 0.6501, + "num_input_tokens_seen": 139345920, + "step": 17010 + }, + { + "epoch": 2.199250549166559, + "grad_norm": 0.3245972990989685, + "learning_rate": 8.286320921620394e-06, + "loss": 0.6703, + "num_input_tokens_seen": 139427840, + "step": 17020 + }, + { + "epoch": 2.20054270577594, + "grad_norm": 0.8792958855628967, + "learning_rate": 8.261178938464422e-06, + "loss": 0.83, + "num_input_tokens_seen": 139509760, + "step": 17030 + }, + { + "epoch": 2.2018348623853212, + "grad_norm": 0.7680603265762329, + "learning_rate": 8.236067604028563e-06, + "loss": 0.6316, + "num_input_tokens_seen": 139591680, + "step": 17040 + }, + { + "epoch": 2.203127018994702, + "grad_norm": 0.8535128831863403, + "learning_rate": 8.210986964291587e-06, + "loss": 0.7002, + "num_input_tokens_seen": 139673600, + "step": 17050 + }, + { + "epoch": 2.2044191756040834, + "grad_norm": 0.7545835375785828, + "learning_rate": 8.185937065176033e-06, + "loss": 0.9392, + "num_input_tokens_seen": 139755520, + "step": 17060 + }, + { + "epoch": 2.2057113322134643, + "grad_norm": 0.5624322891235352, + "learning_rate": 8.160917952548197e-06, + "loss": 0.828, + "num_input_tokens_seen": 139837440, + "step": 17070 + }, + { + "epoch": 2.207003488822845, + "grad_norm": 1.1098763942718506, + "learning_rate": 8.13592967221796e-06, + "loss": 0.9871, + "num_input_tokens_seen": 139919360, + "step": 17080 + }, + { + "epoch": 2.2082956454322265, + "grad_norm": 0.5870392322540283, + "learning_rate": 8.110972269938793e-06, + "loss": 1.0047, + "num_input_tokens_seen": 140001280, + "step": 17090 + }, + { + "epoch": 2.2095878020416073, + "grad_norm": 0.8012586236000061, + "learning_rate": 8.08604579140759e-06, + "loss": 1.1041, + "num_input_tokens_seen": 140083200, + "step": 17100 + }, + { + "epoch": 2.2108799586509886, + "grad_norm": 0.7019762992858887, + "learning_rate": 8.06115028226466e-06, + "loss": 1.116, + "num_input_tokens_seen": 140165120, + "step": 17110 + }, + { + "epoch": 2.2121721152603695, + "grad_norm": 0.646838366985321, + "learning_rate": 8.036285788093578e-06, + "loss": 0.82, + "num_input_tokens_seen": 140247040, + "step": 17120 + }, + { + "epoch": 2.213464271869751, + "grad_norm": 0.8307018876075745, + "learning_rate": 8.011452354421136e-06, + "loss": 0.6641, + "num_input_tokens_seen": 140328960, + "step": 17130 + }, + { + "epoch": 2.2147564284791317, + "grad_norm": 0.39130905270576477, + "learning_rate": 7.986650026717277e-06, + "loss": 0.5812, + "num_input_tokens_seen": 140410880, + "step": 17140 + }, + { + "epoch": 2.2160485850885125, + "grad_norm": 0.7408241033554077, + "learning_rate": 7.961878850394952e-06, + "loss": 0.8512, + "num_input_tokens_seen": 140492800, + "step": 17150 + }, + { + "epoch": 2.217340741697894, + "grad_norm": 0.6536303758621216, + "learning_rate": 7.937138870810115e-06, + "loss": 0.7622, + "num_input_tokens_seen": 140574720, + "step": 17160 + }, + { + "epoch": 2.2186328983072747, + "grad_norm": 0.9935400485992432, + "learning_rate": 7.912430133261562e-06, + "loss": 0.7604, + "num_input_tokens_seen": 140656640, + "step": 17170 + }, + { + "epoch": 2.219925054916656, + "grad_norm": 0.6916456818580627, + "learning_rate": 7.887752682990903e-06, + "loss": 0.7567, + "num_input_tokens_seen": 140738560, + "step": 17180 + }, + { + "epoch": 2.221217211526037, + "grad_norm": 0.7032532095909119, + "learning_rate": 7.863106565182474e-06, + "loss": 1.0241, + "num_input_tokens_seen": 140820480, + "step": 17190 + }, + { + "epoch": 2.222509368135418, + "grad_norm": 0.6725844740867615, + "learning_rate": 7.838491824963207e-06, + "loss": 0.8592, + "num_input_tokens_seen": 140902400, + "step": 17200 + }, + { + "epoch": 2.223801524744799, + "grad_norm": 0.851714551448822, + "learning_rate": 7.81390850740262e-06, + "loss": 0.9155, + "num_input_tokens_seen": 140984320, + "step": 17210 + }, + { + "epoch": 2.22509368135418, + "grad_norm": 0.7544275522232056, + "learning_rate": 7.78935665751266e-06, + "loss": 1.1792, + "num_input_tokens_seen": 141066240, + "step": 17220 + }, + { + "epoch": 2.2263858379635613, + "grad_norm": 1.2108078002929688, + "learning_rate": 7.764836320247686e-06, + "loss": 0.7382, + "num_input_tokens_seen": 141148160, + "step": 17230 + }, + { + "epoch": 2.227677994572942, + "grad_norm": 0.5292445421218872, + "learning_rate": 7.740347540504336e-06, + "loss": 0.7866, + "num_input_tokens_seen": 141230080, + "step": 17240 + }, + { + "epoch": 2.2289701511823234, + "grad_norm": 0.5671383738517761, + "learning_rate": 7.715890363121484e-06, + "loss": 0.5538, + "num_input_tokens_seen": 141312000, + "step": 17250 + }, + { + "epoch": 2.2302623077917043, + "grad_norm": 1.1066720485687256, + "learning_rate": 7.691464832880135e-06, + "loss": 0.7333, + "num_input_tokens_seen": 141393920, + "step": 17260 + }, + { + "epoch": 2.2315544644010856, + "grad_norm": 0.3376465439796448, + "learning_rate": 7.667070994503334e-06, + "loss": 0.7558, + "num_input_tokens_seen": 141475840, + "step": 17270 + }, + { + "epoch": 2.2328466210104665, + "grad_norm": 0.6462758779525757, + "learning_rate": 7.642708892656125e-06, + "loss": 0.4585, + "num_input_tokens_seen": 141557760, + "step": 17280 + }, + { + "epoch": 2.2341387776198474, + "grad_norm": 0.7883977890014648, + "learning_rate": 7.618378571945417e-06, + "loss": 0.602, + "num_input_tokens_seen": 141639680, + "step": 17290 + }, + { + "epoch": 2.2354309342292287, + "grad_norm": 0.7147501707077026, + "learning_rate": 7.5940800769199345e-06, + "loss": 0.7789, + "num_input_tokens_seen": 141721600, + "step": 17300 + }, + { + "epoch": 2.2367230908386095, + "grad_norm": 0.7497975826263428, + "learning_rate": 7.569813452070146e-06, + "loss": 0.5047, + "num_input_tokens_seen": 141803520, + "step": 17310 + }, + { + "epoch": 2.238015247447991, + "grad_norm": 0.6466274261474609, + "learning_rate": 7.545578741828136e-06, + "loss": 0.6986, + "num_input_tokens_seen": 141885440, + "step": 17320 + }, + { + "epoch": 2.2393074040573717, + "grad_norm": 0.7558254599571228, + "learning_rate": 7.521375990567589e-06, + "loss": 0.8236, + "num_input_tokens_seen": 141967360, + "step": 17330 + }, + { + "epoch": 2.2405995606667526, + "grad_norm": 0.7305789589881897, + "learning_rate": 7.497205242603636e-06, + "loss": 0.469, + "num_input_tokens_seen": 142049280, + "step": 17340 + }, + { + "epoch": 2.241891717276134, + "grad_norm": 0.29660215973854065, + "learning_rate": 7.4730665421928445e-06, + "loss": 0.7266, + "num_input_tokens_seen": 142131200, + "step": 17350 + }, + { + "epoch": 2.2431838738855148, + "grad_norm": 0.26684778928756714, + "learning_rate": 7.4489599335330704e-06, + "loss": 1.0225, + "num_input_tokens_seen": 142213120, + "step": 17360 + }, + { + "epoch": 2.244476030494896, + "grad_norm": 0.6525532007217407, + "learning_rate": 7.424885460763442e-06, + "loss": 0.5151, + "num_input_tokens_seen": 142295040, + "step": 17370 + }, + { + "epoch": 2.245768187104277, + "grad_norm": 0.7799077033996582, + "learning_rate": 7.4008431679642165e-06, + "loss": 0.5883, + "num_input_tokens_seen": 142376960, + "step": 17380 + }, + { + "epoch": 2.2470603437136583, + "grad_norm": 0.9852105975151062, + "learning_rate": 7.3768330991567495e-06, + "loss": 0.6615, + "num_input_tokens_seen": 142458880, + "step": 17390 + }, + { + "epoch": 2.248352500323039, + "grad_norm": 0.7310599684715271, + "learning_rate": 7.3528552983033985e-06, + "loss": 0.7732, + "num_input_tokens_seen": 142540800, + "step": 17400 + }, + { + "epoch": 2.2496446569324204, + "grad_norm": 0.7955625653266907, + "learning_rate": 7.328909809307413e-06, + "loss": 0.954, + "num_input_tokens_seen": 142622720, + "step": 17410 + }, + { + "epoch": 2.2509368135418013, + "grad_norm": 0.8421788215637207, + "learning_rate": 7.304996676012913e-06, + "loss": 0.5088, + "num_input_tokens_seen": 142704640, + "step": 17420 + }, + { + "epoch": 2.252228970151182, + "grad_norm": 0.772693932056427, + "learning_rate": 7.281115942204739e-06, + "loss": 0.812, + "num_input_tokens_seen": 142786560, + "step": 17430 + }, + { + "epoch": 2.2535211267605635, + "grad_norm": 0.9953277707099915, + "learning_rate": 7.257267651608446e-06, + "loss": 0.9194, + "num_input_tokens_seen": 142868480, + "step": 17440 + }, + { + "epoch": 2.2548132833699444, + "grad_norm": 0.6572209000587463, + "learning_rate": 7.233451847890149e-06, + "loss": 0.6278, + "num_input_tokens_seen": 142950400, + "step": 17450 + }, + { + "epoch": 2.2561054399793257, + "grad_norm": 0.3987307846546173, + "learning_rate": 7.209668574656514e-06, + "loss": 0.7189, + "num_input_tokens_seen": 143032320, + "step": 17460 + }, + { + "epoch": 2.2573975965887065, + "grad_norm": 0.6912767291069031, + "learning_rate": 7.185917875454615e-06, + "loss": 0.8608, + "num_input_tokens_seen": 143114240, + "step": 17470 + }, + { + "epoch": 2.2586897531980874, + "grad_norm": 0.6624774932861328, + "learning_rate": 7.162199793771904e-06, + "loss": 0.8747, + "num_input_tokens_seen": 143196160, + "step": 17480 + }, + { + "epoch": 2.2599819098074687, + "grad_norm": 0.3295046091079712, + "learning_rate": 7.138514373036098e-06, + "loss": 0.8996, + "num_input_tokens_seen": 143278080, + "step": 17490 + }, + { + "epoch": 2.2612740664168496, + "grad_norm": 0.7381219267845154, + "learning_rate": 7.11486165661511e-06, + "loss": 0.6536, + "num_input_tokens_seen": 143360000, + "step": 17500 + }, + { + "epoch": 2.262566223026231, + "grad_norm": 0.9237964749336243, + "learning_rate": 7.091241687816988e-06, + "loss": 0.9105, + "num_input_tokens_seen": 143441920, + "step": 17510 + }, + { + "epoch": 2.2638583796356118, + "grad_norm": 0.6693175435066223, + "learning_rate": 7.0676545098897956e-06, + "loss": 0.7303, + "num_input_tokens_seen": 143523840, + "step": 17520 + }, + { + "epoch": 2.265150536244993, + "grad_norm": 0.803263247013092, + "learning_rate": 7.044100166021583e-06, + "loss": 0.6937, + "num_input_tokens_seen": 143605760, + "step": 17530 + }, + { + "epoch": 2.266442692854374, + "grad_norm": 1.2088638544082642, + "learning_rate": 7.020578699340255e-06, + "loss": 0.7094, + "num_input_tokens_seen": 143687680, + "step": 17540 + }, + { + "epoch": 2.2677348494637553, + "grad_norm": 0.6141706109046936, + "learning_rate": 6.997090152913535e-06, + "loss": 1.0294, + "num_input_tokens_seen": 143769600, + "step": 17550 + }, + { + "epoch": 2.269027006073136, + "grad_norm": 0.8863146305084229, + "learning_rate": 6.97363456974888e-06, + "loss": 0.5169, + "num_input_tokens_seen": 143851520, + "step": 17560 + }, + { + "epoch": 2.270319162682517, + "grad_norm": 0.5809889435768127, + "learning_rate": 6.950211992793354e-06, + "loss": 0.8054, + "num_input_tokens_seen": 143933440, + "step": 17570 + }, + { + "epoch": 2.2716113192918983, + "grad_norm": 0.6235867142677307, + "learning_rate": 6.92682246493363e-06, + "loss": 1.089, + "num_input_tokens_seen": 144015360, + "step": 17580 + }, + { + "epoch": 2.272903475901279, + "grad_norm": 0.7494005560874939, + "learning_rate": 6.903466028995828e-06, + "loss": 0.5911, + "num_input_tokens_seen": 144097280, + "step": 17590 + }, + { + "epoch": 2.2741956325106605, + "grad_norm": 0.3730751574039459, + "learning_rate": 6.880142727745517e-06, + "loss": 0.4274, + "num_input_tokens_seen": 144179200, + "step": 17600 + }, + { + "epoch": 2.2754877891200413, + "grad_norm": 0.6000313758850098, + "learning_rate": 6.856852603887556e-06, + "loss": 0.5875, + "num_input_tokens_seen": 144261120, + "step": 17610 + }, + { + "epoch": 2.276779945729422, + "grad_norm": 0.8631287217140198, + "learning_rate": 6.8335957000660925e-06, + "loss": 1.0119, + "num_input_tokens_seen": 144343040, + "step": 17620 + }, + { + "epoch": 2.2780721023388035, + "grad_norm": 0.606353223323822, + "learning_rate": 6.810372058864429e-06, + "loss": 0.8559, + "num_input_tokens_seen": 144424960, + "step": 17630 + }, + { + "epoch": 2.2793642589481844, + "grad_norm": 0.7653380036354065, + "learning_rate": 6.787181722804959e-06, + "loss": 0.9657, + "num_input_tokens_seen": 144506880, + "step": 17640 + }, + { + "epoch": 2.2806564155575657, + "grad_norm": 0.706365168094635, + "learning_rate": 6.764024734349117e-06, + "loss": 0.9287, + "num_input_tokens_seen": 144588800, + "step": 17650 + }, + { + "epoch": 2.2819485721669466, + "grad_norm": 1.115876317024231, + "learning_rate": 6.740901135897257e-06, + "loss": 0.8438, + "num_input_tokens_seen": 144670720, + "step": 17660 + }, + { + "epoch": 2.2832407287763274, + "grad_norm": 0.9500836133956909, + "learning_rate": 6.717810969788596e-06, + "loss": 0.8497, + "num_input_tokens_seen": 144752640, + "step": 17670 + }, + { + "epoch": 2.2845328853857088, + "grad_norm": 0.8140472769737244, + "learning_rate": 6.694754278301154e-06, + "loss": 0.8295, + "num_input_tokens_seen": 144834560, + "step": 17680 + }, + { + "epoch": 2.2858250419950896, + "grad_norm": 0.36906084418296814, + "learning_rate": 6.671731103651641e-06, + "loss": 0.6225, + "num_input_tokens_seen": 144916480, + "step": 17690 + }, + { + "epoch": 2.287117198604471, + "grad_norm": 0.9518002271652222, + "learning_rate": 6.648741487995416e-06, + "loss": 0.7609, + "num_input_tokens_seen": 144998400, + "step": 17700 + }, + { + "epoch": 2.288409355213852, + "grad_norm": 0.6692580580711365, + "learning_rate": 6.625785473426369e-06, + "loss": 0.9989, + "num_input_tokens_seen": 145080320, + "step": 17710 + }, + { + "epoch": 2.289701511823233, + "grad_norm": 1.0394073724746704, + "learning_rate": 6.602863101976886e-06, + "loss": 0.9415, + "num_input_tokens_seen": 145162240, + "step": 17720 + }, + { + "epoch": 2.290993668432614, + "grad_norm": 0.7470195889472961, + "learning_rate": 6.57997441561774e-06, + "loss": 0.8093, + "num_input_tokens_seen": 145244160, + "step": 17730 + }, + { + "epoch": 2.2922858250419953, + "grad_norm": 0.5699132084846497, + "learning_rate": 6.557119456258043e-06, + "loss": 0.6653, + "num_input_tokens_seen": 145326080, + "step": 17740 + }, + { + "epoch": 2.293577981651376, + "grad_norm": 0.5270111560821533, + "learning_rate": 6.534298265745128e-06, + "loss": 0.4557, + "num_input_tokens_seen": 145408000, + "step": 17750 + }, + { + "epoch": 2.294870138260757, + "grad_norm": 1.350411057472229, + "learning_rate": 6.511510885864516e-06, + "loss": 0.4692, + "num_input_tokens_seen": 145489920, + "step": 17760 + }, + { + "epoch": 2.2961622948701383, + "grad_norm": 0.2834679186344147, + "learning_rate": 6.4887573583398255e-06, + "loss": 0.6354, + "num_input_tokens_seen": 145571840, + "step": 17770 + }, + { + "epoch": 2.297454451479519, + "grad_norm": 0.27980419993400574, + "learning_rate": 6.466037724832666e-06, + "loss": 0.4408, + "num_input_tokens_seen": 145653760, + "step": 17780 + }, + { + "epoch": 2.2987466080889005, + "grad_norm": 0.7254844307899475, + "learning_rate": 6.44335202694262e-06, + "loss": 0.9355, + "num_input_tokens_seen": 145735680, + "step": 17790 + }, + { + "epoch": 2.3000387646982814, + "grad_norm": 0.7839716076850891, + "learning_rate": 6.420700306207103e-06, + "loss": 1.1966, + "num_input_tokens_seen": 145817600, + "step": 17800 + }, + { + "epoch": 2.3013309213076623, + "grad_norm": 0.8338444232940674, + "learning_rate": 6.3980826041013464e-06, + "loss": 0.3919, + "num_input_tokens_seen": 145899520, + "step": 17810 + }, + { + "epoch": 2.3026230779170436, + "grad_norm": 0.608752429485321, + "learning_rate": 6.375498962038265e-06, + "loss": 0.9223, + "num_input_tokens_seen": 145981440, + "step": 17820 + }, + { + "epoch": 2.3039152345264244, + "grad_norm": 0.5790471434593201, + "learning_rate": 6.35294942136844e-06, + "loss": 0.5995, + "num_input_tokens_seen": 146063360, + "step": 17830 + }, + { + "epoch": 2.3052073911358058, + "grad_norm": 0.9189697504043579, + "learning_rate": 6.3304340233799805e-06, + "loss": 0.6963, + "num_input_tokens_seen": 146145280, + "step": 17840 + }, + { + "epoch": 2.3064995477451866, + "grad_norm": 0.6007638573646545, + "learning_rate": 6.307952809298517e-06, + "loss": 0.6846, + "num_input_tokens_seen": 146227200, + "step": 17850 + }, + { + "epoch": 2.307791704354568, + "grad_norm": 0.8957340717315674, + "learning_rate": 6.28550582028706e-06, + "loss": 0.542, + "num_input_tokens_seen": 146309120, + "step": 17860 + }, + { + "epoch": 2.309083860963949, + "grad_norm": 0.8787885904312134, + "learning_rate": 6.263093097445957e-06, + "loss": 0.9602, + "num_input_tokens_seen": 146391040, + "step": 17870 + }, + { + "epoch": 2.31037601757333, + "grad_norm": 0.655099630355835, + "learning_rate": 6.240714681812837e-06, + "loss": 0.8196, + "num_input_tokens_seen": 146472960, + "step": 17880 + }, + { + "epoch": 2.311668174182711, + "grad_norm": 1.1180391311645508, + "learning_rate": 6.218370614362484e-06, + "loss": 0.6883, + "num_input_tokens_seen": 146554880, + "step": 17890 + }, + { + "epoch": 2.312960330792092, + "grad_norm": 0.5654224753379822, + "learning_rate": 6.196060936006817e-06, + "loss": 0.5604, + "num_input_tokens_seen": 146636800, + "step": 17900 + }, + { + "epoch": 2.314252487401473, + "grad_norm": 0.617138147354126, + "learning_rate": 6.173785687594761e-06, + "loss": 1.045, + "num_input_tokens_seen": 146718720, + "step": 17910 + }, + { + "epoch": 2.315544644010854, + "grad_norm": 0.2932003140449524, + "learning_rate": 6.1515449099122185e-06, + "loss": 0.8279, + "num_input_tokens_seen": 146800640, + "step": 17920 + }, + { + "epoch": 2.3168368006202353, + "grad_norm": 0.7684862017631531, + "learning_rate": 6.129338643681984e-06, + "loss": 1.0301, + "num_input_tokens_seen": 146882560, + "step": 17930 + }, + { + "epoch": 2.318128957229616, + "grad_norm": 0.6417585015296936, + "learning_rate": 6.107166929563629e-06, + "loss": 0.6595, + "num_input_tokens_seen": 146964480, + "step": 17940 + }, + { + "epoch": 2.319421113838997, + "grad_norm": 0.4829157292842865, + "learning_rate": 6.085029808153503e-06, + "loss": 0.672, + "num_input_tokens_seen": 147046400, + "step": 17950 + }, + { + "epoch": 2.3207132704483784, + "grad_norm": 0.6301985383033752, + "learning_rate": 6.062927319984576e-06, + "loss": 0.7306, + "num_input_tokens_seen": 147128320, + "step": 17960 + }, + { + "epoch": 2.3220054270577593, + "grad_norm": 0.5591002106666565, + "learning_rate": 6.040859505526439e-06, + "loss": 1.0703, + "num_input_tokens_seen": 147210240, + "step": 17970 + }, + { + "epoch": 2.3232975836671406, + "grad_norm": 0.6914545893669128, + "learning_rate": 6.018826405185163e-06, + "loss": 0.9107, + "num_input_tokens_seen": 147292160, + "step": 17980 + }, + { + "epoch": 2.3245897402765214, + "grad_norm": 0.6147554516792297, + "learning_rate": 5.99682805930328e-06, + "loss": 0.9921, + "num_input_tokens_seen": 147374080, + "step": 17990 + }, + { + "epoch": 2.3258818968859027, + "grad_norm": 0.8620073795318604, + "learning_rate": 5.974864508159692e-06, + "loss": 1.121, + "num_input_tokens_seen": 147456000, + "step": 18000 + }, + { + "epoch": 2.3271740534952836, + "grad_norm": 0.7225034236907959, + "learning_rate": 5.952935791969574e-06, + "loss": 0.7416, + "num_input_tokens_seen": 147537920, + "step": 18010 + }, + { + "epoch": 2.328466210104665, + "grad_norm": 0.9885997772216797, + "learning_rate": 5.931041950884314e-06, + "loss": 0.8224, + "num_input_tokens_seen": 147619840, + "step": 18020 + }, + { + "epoch": 2.329758366714046, + "grad_norm": 0.8367599844932556, + "learning_rate": 5.9091830249914685e-06, + "loss": 0.4575, + "num_input_tokens_seen": 147701760, + "step": 18030 + }, + { + "epoch": 2.3310505233234267, + "grad_norm": 0.685955286026001, + "learning_rate": 5.887359054314648e-06, + "loss": 0.8888, + "num_input_tokens_seen": 147783680, + "step": 18040 + }, + { + "epoch": 2.332342679932808, + "grad_norm": 0.8157021403312683, + "learning_rate": 5.8655700788134535e-06, + "loss": 1.0576, + "num_input_tokens_seen": 147865600, + "step": 18050 + }, + { + "epoch": 2.333634836542189, + "grad_norm": 0.7296263575553894, + "learning_rate": 5.843816138383429e-06, + "loss": 0.967, + "num_input_tokens_seen": 147947520, + "step": 18060 + }, + { + "epoch": 2.33492699315157, + "grad_norm": 0.6871107816696167, + "learning_rate": 5.822097272855964e-06, + "loss": 0.5175, + "num_input_tokens_seen": 148029440, + "step": 18070 + }, + { + "epoch": 2.336219149760951, + "grad_norm": 0.9326339364051819, + "learning_rate": 5.800413521998208e-06, + "loss": 0.8177, + "num_input_tokens_seen": 148111360, + "step": 18080 + }, + { + "epoch": 2.337511306370332, + "grad_norm": 0.6087706089019775, + "learning_rate": 5.778764925513045e-06, + "loss": 0.9179, + "num_input_tokens_seen": 148193280, + "step": 18090 + }, + { + "epoch": 2.338803462979713, + "grad_norm": 1.1223865747451782, + "learning_rate": 5.7571515230389586e-06, + "loss": 0.9355, + "num_input_tokens_seen": 148275200, + "step": 18100 + }, + { + "epoch": 2.340095619589094, + "grad_norm": 0.6350956559181213, + "learning_rate": 5.7355733541500285e-06, + "loss": 0.9177, + "num_input_tokens_seen": 148357120, + "step": 18110 + }, + { + "epoch": 2.3413877761984754, + "grad_norm": 0.543329656124115, + "learning_rate": 5.714030458355784e-06, + "loss": 0.8673, + "num_input_tokens_seen": 148439040, + "step": 18120 + }, + { + "epoch": 2.3426799328078562, + "grad_norm": 0.8380277752876282, + "learning_rate": 5.692522875101203e-06, + "loss": 0.8191, + "num_input_tokens_seen": 148520960, + "step": 18130 + }, + { + "epoch": 2.3439720894172376, + "grad_norm": 0.6148279905319214, + "learning_rate": 5.67105064376659e-06, + "loss": 0.4773, + "num_input_tokens_seen": 148602880, + "step": 18140 + }, + { + "epoch": 2.3452642460266184, + "grad_norm": 0.766385555267334, + "learning_rate": 5.649613803667511e-06, + "loss": 0.8423, + "num_input_tokens_seen": 148684800, + "step": 18150 + }, + { + "epoch": 2.3465564026359993, + "grad_norm": 0.24485957622528076, + "learning_rate": 5.628212394054758e-06, + "loss": 0.7039, + "num_input_tokens_seen": 148766720, + "step": 18160 + }, + { + "epoch": 2.3478485592453806, + "grad_norm": 0.30911335349082947, + "learning_rate": 5.606846454114218e-06, + "loss": 0.4886, + "num_input_tokens_seen": 148848640, + "step": 18170 + }, + { + "epoch": 2.3491407158547615, + "grad_norm": 1.1630913019180298, + "learning_rate": 5.5855160229668636e-06, + "loss": 0.8072, + "num_input_tokens_seen": 148930560, + "step": 18180 + }, + { + "epoch": 2.350432872464143, + "grad_norm": 0.8715269565582275, + "learning_rate": 5.564221139668621e-06, + "loss": 0.7343, + "num_input_tokens_seen": 149012480, + "step": 18190 + }, + { + "epoch": 2.3517250290735237, + "grad_norm": 0.7479017376899719, + "learning_rate": 5.542961843210359e-06, + "loss": 1.1034, + "num_input_tokens_seen": 149094400, + "step": 18200 + }, + { + "epoch": 2.353017185682905, + "grad_norm": 0.23909921944141388, + "learning_rate": 5.5217381725177624e-06, + "loss": 0.7438, + "num_input_tokens_seen": 149176320, + "step": 18210 + }, + { + "epoch": 2.354309342292286, + "grad_norm": 0.6988628506660461, + "learning_rate": 5.50055016645129e-06, + "loss": 0.6506, + "num_input_tokens_seen": 149258240, + "step": 18220 + }, + { + "epoch": 2.3556014989016667, + "grad_norm": 0.9495237469673157, + "learning_rate": 5.479397863806115e-06, + "loss": 0.7191, + "num_input_tokens_seen": 149340160, + "step": 18230 + }, + { + "epoch": 2.356893655511048, + "grad_norm": 0.9350152015686035, + "learning_rate": 5.458281303312016e-06, + "loss": 0.8379, + "num_input_tokens_seen": 149422080, + "step": 18240 + }, + { + "epoch": 2.358185812120429, + "grad_norm": 0.8469577431678772, + "learning_rate": 5.437200523633348e-06, + "loss": 0.388, + "num_input_tokens_seen": 149504000, + "step": 18250 + }, + { + "epoch": 2.35947796872981, + "grad_norm": 0.6918286085128784, + "learning_rate": 5.41615556336893e-06, + "loss": 1.0104, + "num_input_tokens_seen": 149585920, + "step": 18260 + }, + { + "epoch": 2.360770125339191, + "grad_norm": 0.8202119469642639, + "learning_rate": 5.39514646105202e-06, + "loss": 0.7642, + "num_input_tokens_seen": 149667840, + "step": 18270 + }, + { + "epoch": 2.362062281948572, + "grad_norm": 0.20039023458957672, + "learning_rate": 5.374173255150194e-06, + "loss": 0.6885, + "num_input_tokens_seen": 149749760, + "step": 18280 + }, + { + "epoch": 2.3633544385579532, + "grad_norm": 44.923805236816406, + "learning_rate": 5.353235984065321e-06, + "loss": 0.6808, + "num_input_tokens_seen": 149831680, + "step": 18290 + }, + { + "epoch": 2.364646595167334, + "grad_norm": 0.7638149261474609, + "learning_rate": 5.332334686133475e-06, + "loss": 0.7185, + "num_input_tokens_seen": 149913600, + "step": 18300 + }, + { + "epoch": 2.3659387517767154, + "grad_norm": 0.9998169541358948, + "learning_rate": 5.311469399624844e-06, + "loss": 0.8391, + "num_input_tokens_seen": 149995520, + "step": 18310 + }, + { + "epoch": 2.3672309083860963, + "grad_norm": 0.6908437609672546, + "learning_rate": 5.290640162743704e-06, + "loss": 0.6135, + "num_input_tokens_seen": 150077440, + "step": 18320 + }, + { + "epoch": 2.3685230649954776, + "grad_norm": 1.3081281185150146, + "learning_rate": 5.269847013628299e-06, + "loss": 0.9426, + "num_input_tokens_seen": 150159360, + "step": 18330 + }, + { + "epoch": 2.3698152216048585, + "grad_norm": 0.7654116749763489, + "learning_rate": 5.24908999035082e-06, + "loss": 0.9153, + "num_input_tokens_seen": 150241280, + "step": 18340 + }, + { + "epoch": 2.37110737821424, + "grad_norm": 1.00165855884552, + "learning_rate": 5.228369130917288e-06, + "loss": 0.4039, + "num_input_tokens_seen": 150323200, + "step": 18350 + }, + { + "epoch": 2.3723995348236206, + "grad_norm": 0.593526303768158, + "learning_rate": 5.207684473267527e-06, + "loss": 0.573, + "num_input_tokens_seen": 150405120, + "step": 18360 + }, + { + "epoch": 2.3736916914330015, + "grad_norm": 0.5847054719924927, + "learning_rate": 5.187036055275077e-06, + "loss": 1.009, + "num_input_tokens_seen": 150487040, + "step": 18370 + }, + { + "epoch": 2.374983848042383, + "grad_norm": 0.7985702157020569, + "learning_rate": 5.16642391474711e-06, + "loss": 0.6435, + "num_input_tokens_seen": 150568960, + "step": 18380 + }, + { + "epoch": 2.3762760046517637, + "grad_norm": 0.8118183612823486, + "learning_rate": 5.145848089424374e-06, + "loss": 0.884, + "num_input_tokens_seen": 150650880, + "step": 18390 + }, + { + "epoch": 2.377568161261145, + "grad_norm": 0.7361833453178406, + "learning_rate": 5.125308616981139e-06, + "loss": 0.7895, + "num_input_tokens_seen": 150732800, + "step": 18400 + }, + { + "epoch": 2.378860317870526, + "grad_norm": 0.6324991583824158, + "learning_rate": 5.1048055350251e-06, + "loss": 0.971, + "num_input_tokens_seen": 150814720, + "step": 18410 + }, + { + "epoch": 2.3801524744799067, + "grad_norm": 0.899131178855896, + "learning_rate": 5.0843388810973195e-06, + "loss": 0.5018, + "num_input_tokens_seen": 150896640, + "step": 18420 + }, + { + "epoch": 2.381444631089288, + "grad_norm": 0.6457487344741821, + "learning_rate": 5.06390869267217e-06, + "loss": 0.953, + "num_input_tokens_seen": 150978560, + "step": 18430 + }, + { + "epoch": 2.382736787698669, + "grad_norm": 0.8277502655982971, + "learning_rate": 5.043515007157263e-06, + "loss": 1.2336, + "num_input_tokens_seen": 151060480, + "step": 18440 + }, + { + "epoch": 2.3840289443080502, + "grad_norm": 0.7911684513092041, + "learning_rate": 5.02315786189334e-06, + "loss": 0.6531, + "num_input_tokens_seen": 151142400, + "step": 18450 + }, + { + "epoch": 2.385321100917431, + "grad_norm": 0.945289671421051, + "learning_rate": 5.002837294154283e-06, + "loss": 0.7825, + "num_input_tokens_seen": 151224320, + "step": 18460 + }, + { + "epoch": 2.3866132575268124, + "grad_norm": 0.6046878695487976, + "learning_rate": 4.982553341146956e-06, + "loss": 0.7779, + "num_input_tokens_seen": 151306240, + "step": 18470 + }, + { + "epoch": 2.3879054141361933, + "grad_norm": 0.8228399753570557, + "learning_rate": 4.962306040011222e-06, + "loss": 0.9538, + "num_input_tokens_seen": 151388160, + "step": 18480 + }, + { + "epoch": 2.3891975707455746, + "grad_norm": 0.7043998837471008, + "learning_rate": 4.942095427819796e-06, + "loss": 1.0866, + "num_input_tokens_seen": 151470080, + "step": 18490 + }, + { + "epoch": 2.3904897273549555, + "grad_norm": 0.5962804555892944, + "learning_rate": 4.921921541578248e-06, + "loss": 0.8085, + "num_input_tokens_seen": 151552000, + "step": 18500 + }, + { + "epoch": 2.3917818839643363, + "grad_norm": 0.6313290596008301, + "learning_rate": 4.901784418224892e-06, + "loss": 0.8541, + "num_input_tokens_seen": 151633920, + "step": 18510 + }, + { + "epoch": 2.3930740405737176, + "grad_norm": 0.6041284203529358, + "learning_rate": 4.881684094630712e-06, + "loss": 0.6467, + "num_input_tokens_seen": 151715840, + "step": 18520 + }, + { + "epoch": 2.3943661971830985, + "grad_norm": 0.7616141438484192, + "learning_rate": 4.861620607599346e-06, + "loss": 0.7586, + "num_input_tokens_seen": 151797760, + "step": 18530 + }, + { + "epoch": 2.39565835379248, + "grad_norm": 0.3103398084640503, + "learning_rate": 4.841593993866949e-06, + "loss": 0.853, + "num_input_tokens_seen": 151879680, + "step": 18540 + }, + { + "epoch": 2.3969505104018607, + "grad_norm": 0.7408877015113831, + "learning_rate": 4.821604290102191e-06, + "loss": 1.02, + "num_input_tokens_seen": 151961600, + "step": 18550 + }, + { + "epoch": 2.3982426670112416, + "grad_norm": 0.24368135631084442, + "learning_rate": 4.801651532906135e-06, + "loss": 0.8047, + "num_input_tokens_seen": 152043520, + "step": 18560 + }, + { + "epoch": 2.399534823620623, + "grad_norm": 0.7442583441734314, + "learning_rate": 4.781735758812217e-06, + "loss": 0.9561, + "num_input_tokens_seen": 152125440, + "step": 18570 + }, + { + "epoch": 2.4008269802300037, + "grad_norm": 0.6959922313690186, + "learning_rate": 4.761857004286141e-06, + "loss": 0.6804, + "num_input_tokens_seen": 152207360, + "step": 18580 + }, + { + "epoch": 2.402119136839385, + "grad_norm": 1.0393434762954712, + "learning_rate": 4.742015305725828e-06, + "loss": 0.9148, + "num_input_tokens_seen": 152289280, + "step": 18590 + }, + { + "epoch": 2.403411293448766, + "grad_norm": 0.9378628134727478, + "learning_rate": 4.7222106994613655e-06, + "loss": 0.6088, + "num_input_tokens_seen": 152371200, + "step": 18600 + }, + { + "epoch": 2.4047034500581472, + "grad_norm": 0.8462372422218323, + "learning_rate": 4.702443221754904e-06, + "loss": 0.7719, + "num_input_tokens_seen": 152453120, + "step": 18610 + }, + { + "epoch": 2.405995606667528, + "grad_norm": 0.6147511005401611, + "learning_rate": 4.6827129088006375e-06, + "loss": 0.8772, + "num_input_tokens_seen": 152535040, + "step": 18620 + }, + { + "epoch": 2.4072877632769094, + "grad_norm": 0.8971914052963257, + "learning_rate": 4.663019796724685e-06, + "loss": 0.7122, + "num_input_tokens_seen": 152616960, + "step": 18630 + }, + { + "epoch": 2.4085799198862903, + "grad_norm": 0.7405256628990173, + "learning_rate": 4.6433639215850696e-06, + "loss": 1.0226, + "num_input_tokens_seen": 152698880, + "step": 18640 + }, + { + "epoch": 2.409872076495671, + "grad_norm": 0.3762902021408081, + "learning_rate": 4.623745319371617e-06, + "loss": 0.6679, + "num_input_tokens_seen": 152780800, + "step": 18650 + }, + { + "epoch": 2.4111642331050525, + "grad_norm": 0.9347968697547913, + "learning_rate": 4.604164026005925e-06, + "loss": 1.0588, + "num_input_tokens_seen": 152862720, + "step": 18660 + }, + { + "epoch": 2.4124563897144333, + "grad_norm": 0.7165658473968506, + "learning_rate": 4.584620077341273e-06, + "loss": 0.8622, + "num_input_tokens_seen": 152944640, + "step": 18670 + }, + { + "epoch": 2.4137485463238146, + "grad_norm": 0.715330958366394, + "learning_rate": 4.565113509162547e-06, + "loss": 0.9585, + "num_input_tokens_seen": 153026560, + "step": 18680 + }, + { + "epoch": 2.4150407029331955, + "grad_norm": 0.5120941996574402, + "learning_rate": 4.5456443571862185e-06, + "loss": 0.5105, + "num_input_tokens_seen": 153108480, + "step": 18690 + }, + { + "epoch": 2.4163328595425764, + "grad_norm": 0.6818029284477234, + "learning_rate": 4.5262126570602135e-06, + "loss": 0.7058, + "num_input_tokens_seen": 153190400, + "step": 18700 + }, + { + "epoch": 2.4176250161519577, + "grad_norm": 0.3870543837547302, + "learning_rate": 4.506818444363925e-06, + "loss": 0.7815, + "num_input_tokens_seen": 153272320, + "step": 18710 + }, + { + "epoch": 2.4189171727613386, + "grad_norm": 0.23925046622753143, + "learning_rate": 4.487461754608066e-06, + "loss": 0.6579, + "num_input_tokens_seen": 153354240, + "step": 18720 + }, + { + "epoch": 2.42020932937072, + "grad_norm": 0.25111180543899536, + "learning_rate": 4.468142623234678e-06, + "loss": 0.5656, + "num_input_tokens_seen": 153436160, + "step": 18730 + }, + { + "epoch": 2.4215014859801007, + "grad_norm": 0.6497703790664673, + "learning_rate": 4.448861085617018e-06, + "loss": 0.9916, + "num_input_tokens_seen": 153518080, + "step": 18740 + }, + { + "epoch": 2.4227936425894816, + "grad_norm": 0.30985990166664124, + "learning_rate": 4.429617177059508e-06, + "loss": 0.3525, + "num_input_tokens_seen": 153600000, + "step": 18750 + }, + { + "epoch": 2.424085799198863, + "grad_norm": 0.848656177520752, + "learning_rate": 4.410410932797671e-06, + "loss": 0.912, + "num_input_tokens_seen": 153681920, + "step": 18760 + }, + { + "epoch": 2.425377955808244, + "grad_norm": 0.6000288724899292, + "learning_rate": 4.391242387998079e-06, + "loss": 0.8084, + "num_input_tokens_seen": 153763840, + "step": 18770 + }, + { + "epoch": 2.426670112417625, + "grad_norm": 0.6745223999023438, + "learning_rate": 4.372111577758261e-06, + "loss": 0.8788, + "num_input_tokens_seen": 153845760, + "step": 18780 + }, + { + "epoch": 2.427962269027006, + "grad_norm": 0.9669370651245117, + "learning_rate": 4.353018537106657e-06, + "loss": 0.8239, + "num_input_tokens_seen": 153927680, + "step": 18790 + }, + { + "epoch": 2.4292544256363873, + "grad_norm": 0.9055847525596619, + "learning_rate": 4.333963301002558e-06, + "loss": 0.6755, + "num_input_tokens_seen": 154009600, + "step": 18800 + }, + { + "epoch": 2.430546582245768, + "grad_norm": 0.5974034070968628, + "learning_rate": 4.314945904336037e-06, + "loss": 0.8883, + "num_input_tokens_seen": 154091520, + "step": 18810 + }, + { + "epoch": 2.4318387388551495, + "grad_norm": 0.6422154307365417, + "learning_rate": 4.295966381927871e-06, + "loss": 0.9761, + "num_input_tokens_seen": 154173440, + "step": 18820 + }, + { + "epoch": 2.4331308954645303, + "grad_norm": 1.6391959190368652, + "learning_rate": 4.2770247685295e-06, + "loss": 0.7079, + "num_input_tokens_seen": 154255360, + "step": 18830 + }, + { + "epoch": 2.434423052073911, + "grad_norm": 1.442500114440918, + "learning_rate": 4.258121098822945e-06, + "loss": 0.8145, + "num_input_tokens_seen": 154337280, + "step": 18840 + }, + { + "epoch": 2.4357152086832925, + "grad_norm": 0.6912276744842529, + "learning_rate": 4.239255407420764e-06, + "loss": 0.668, + "num_input_tokens_seen": 154419200, + "step": 18850 + }, + { + "epoch": 2.4370073652926734, + "grad_norm": 0.5514190196990967, + "learning_rate": 4.220427728865956e-06, + "loss": 0.635, + "num_input_tokens_seen": 154501120, + "step": 18860 + }, + { + "epoch": 2.4382995219020547, + "grad_norm": 0.8538394570350647, + "learning_rate": 4.201638097631938e-06, + "loss": 0.8883, + "num_input_tokens_seen": 154583040, + "step": 18870 + }, + { + "epoch": 2.4395916785114355, + "grad_norm": 1.320961356163025, + "learning_rate": 4.182886548122464e-06, + "loss": 0.6052, + "num_input_tokens_seen": 154664960, + "step": 18880 + }, + { + "epoch": 2.4408838351208164, + "grad_norm": 0.70893394947052, + "learning_rate": 4.164173114671538e-06, + "loss": 0.5808, + "num_input_tokens_seen": 154746880, + "step": 18890 + }, + { + "epoch": 2.4421759917301977, + "grad_norm": 0.386322021484375, + "learning_rate": 4.145497831543402e-06, + "loss": 1.0335, + "num_input_tokens_seen": 154828800, + "step": 18900 + }, + { + "epoch": 2.4434681483395786, + "grad_norm": 0.5800808668136597, + "learning_rate": 4.1268607329324195e-06, + "loss": 0.8685, + "num_input_tokens_seen": 154910720, + "step": 18910 + }, + { + "epoch": 2.44476030494896, + "grad_norm": 1.0976125001907349, + "learning_rate": 4.108261852963061e-06, + "loss": 0.9031, + "num_input_tokens_seen": 154992640, + "step": 18920 + }, + { + "epoch": 2.4460524615583408, + "grad_norm": 0.9657114148139954, + "learning_rate": 4.089701225689793e-06, + "loss": 0.6177, + "num_input_tokens_seen": 155074560, + "step": 18930 + }, + { + "epoch": 2.447344618167722, + "grad_norm": 0.718108594417572, + "learning_rate": 4.071178885097074e-06, + "loss": 0.6072, + "num_input_tokens_seen": 155156480, + "step": 18940 + }, + { + "epoch": 2.448636774777103, + "grad_norm": 0.8545438051223755, + "learning_rate": 4.052694865099232e-06, + "loss": 0.8248, + "num_input_tokens_seen": 155238400, + "step": 18950 + }, + { + "epoch": 2.4499289313864843, + "grad_norm": 0.739714503288269, + "learning_rate": 4.034249199540432e-06, + "loss": 0.8951, + "num_input_tokens_seen": 155320320, + "step": 18960 + }, + { + "epoch": 2.451221087995865, + "grad_norm": 0.3881072998046875, + "learning_rate": 4.015841922194638e-06, + "loss": 0.6641, + "num_input_tokens_seen": 155402240, + "step": 18970 + }, + { + "epoch": 2.452513244605246, + "grad_norm": 0.2270699441432953, + "learning_rate": 3.997473066765489e-06, + "loss": 0.3194, + "num_input_tokens_seen": 155484160, + "step": 18980 + }, + { + "epoch": 2.4538054012146273, + "grad_norm": 0.6020457744598389, + "learning_rate": 3.97914266688631e-06, + "loss": 0.8105, + "num_input_tokens_seen": 155566080, + "step": 18990 + }, + { + "epoch": 2.455097557824008, + "grad_norm": 1.502171277999878, + "learning_rate": 3.96085075611998e-06, + "loss": 0.7238, + "num_input_tokens_seen": 155648000, + "step": 19000 + }, + { + "epoch": 2.4563897144333895, + "grad_norm": 0.7338537573814392, + "learning_rate": 3.942597367958928e-06, + "loss": 0.9272, + "num_input_tokens_seen": 155729920, + "step": 19010 + }, + { + "epoch": 2.4576818710427704, + "grad_norm": 0.7727378010749817, + "learning_rate": 3.924382535825047e-06, + "loss": 0.5264, + "num_input_tokens_seen": 155811840, + "step": 19020 + }, + { + "epoch": 2.4589740276521512, + "grad_norm": 0.7787820100784302, + "learning_rate": 3.906206293069617e-06, + "loss": 0.8021, + "num_input_tokens_seen": 155893760, + "step": 19030 + }, + { + "epoch": 2.4602661842615325, + "grad_norm": 0.7348833084106445, + "learning_rate": 3.88806867297328e-06, + "loss": 0.9558, + "num_input_tokens_seen": 155975680, + "step": 19040 + }, + { + "epoch": 2.4615583408709134, + "grad_norm": 0.7787769436836243, + "learning_rate": 3.869969708745946e-06, + "loss": 1.0861, + "num_input_tokens_seen": 156057600, + "step": 19050 + }, + { + "epoch": 2.4628504974802947, + "grad_norm": 0.7689657807350159, + "learning_rate": 3.85190943352676e-06, + "loss": 0.9696, + "num_input_tokens_seen": 156139520, + "step": 19060 + }, + { + "epoch": 2.4641426540896756, + "grad_norm": 1.0891441106796265, + "learning_rate": 3.833887880384007e-06, + "loss": 0.7405, + "num_input_tokens_seen": 156221440, + "step": 19070 + }, + { + "epoch": 2.465434810699057, + "grad_norm": 0.4140090048313141, + "learning_rate": 3.815905082315102e-06, + "loss": 0.6941, + "num_input_tokens_seen": 156303360, + "step": 19080 + }, + { + "epoch": 2.4667269673084378, + "grad_norm": 0.9087291359901428, + "learning_rate": 3.7979610722464643e-06, + "loss": 0.472, + "num_input_tokens_seen": 156385280, + "step": 19090 + }, + { + "epoch": 2.468019123917819, + "grad_norm": 0.9746045470237732, + "learning_rate": 3.780055883033523e-06, + "loss": 1.1217, + "num_input_tokens_seen": 156467200, + "step": 19100 + }, + { + "epoch": 2.4693112805272, + "grad_norm": 0.43733227252960205, + "learning_rate": 3.762189547460615e-06, + "loss": 0.7157, + "num_input_tokens_seen": 156549120, + "step": 19110 + }, + { + "epoch": 2.470603437136581, + "grad_norm": 0.6600936055183411, + "learning_rate": 3.7443620982409305e-06, + "loss": 0.9332, + "num_input_tokens_seen": 156631040, + "step": 19120 + }, + { + "epoch": 2.471895593745962, + "grad_norm": 0.22970102727413177, + "learning_rate": 3.7265735680164615e-06, + "loss": 0.5993, + "num_input_tokens_seen": 156712960, + "step": 19130 + }, + { + "epoch": 2.473187750355343, + "grad_norm": 0.37218940258026123, + "learning_rate": 3.7088239893579456e-06, + "loss": 0.8641, + "num_input_tokens_seen": 156794880, + "step": 19140 + }, + { + "epoch": 2.4744799069647243, + "grad_norm": 0.8411920666694641, + "learning_rate": 3.6911133947648002e-06, + "loss": 0.5555, + "num_input_tokens_seen": 156876800, + "step": 19150 + }, + { + "epoch": 2.475772063574105, + "grad_norm": 0.7639785408973694, + "learning_rate": 3.6734418166650436e-06, + "loss": 0.7846, + "num_input_tokens_seen": 156958720, + "step": 19160 + }, + { + "epoch": 2.477064220183486, + "grad_norm": 0.5358415842056274, + "learning_rate": 3.655809287415285e-06, + "loss": 0.7447, + "num_input_tokens_seen": 157040640, + "step": 19170 + }, + { + "epoch": 2.4783563767928674, + "grad_norm": 0.6833941340446472, + "learning_rate": 3.638215839300624e-06, + "loss": 0.8456, + "num_input_tokens_seen": 157122560, + "step": 19180 + }, + { + "epoch": 2.4796485334022482, + "grad_norm": 0.5907963514328003, + "learning_rate": 3.6206615045345837e-06, + "loss": 0.9318, + "num_input_tokens_seen": 157204480, + "step": 19190 + }, + { + "epoch": 2.4809406900116295, + "grad_norm": 0.9480105638504028, + "learning_rate": 3.603146315259104e-06, + "loss": 1.074, + "num_input_tokens_seen": 157286400, + "step": 19200 + }, + { + "epoch": 2.4822328466210104, + "grad_norm": 0.9866535067558289, + "learning_rate": 3.5856703035444196e-06, + "loss": 0.6913, + "num_input_tokens_seen": 157368320, + "step": 19210 + }, + { + "epoch": 2.4835250032303917, + "grad_norm": 0.965872585773468, + "learning_rate": 3.568233501389054e-06, + "loss": 0.7036, + "num_input_tokens_seen": 157450240, + "step": 19220 + }, + { + "epoch": 2.4848171598397726, + "grad_norm": 0.752018928527832, + "learning_rate": 3.5508359407197157e-06, + "loss": 0.7991, + "num_input_tokens_seen": 157532160, + "step": 19230 + }, + { + "epoch": 2.4861093164491535, + "grad_norm": 0.6639631986618042, + "learning_rate": 3.5334776533912846e-06, + "loss": 0.6216, + "num_input_tokens_seen": 157614080, + "step": 19240 + }, + { + "epoch": 2.4874014730585348, + "grad_norm": 0.7294473052024841, + "learning_rate": 3.516158671186723e-06, + "loss": 0.9294, + "num_input_tokens_seen": 157696000, + "step": 19250 + }, + { + "epoch": 2.4886936296679156, + "grad_norm": 0.920391321182251, + "learning_rate": 3.4988790258170146e-06, + "loss": 0.8402, + "num_input_tokens_seen": 157777920, + "step": 19260 + }, + { + "epoch": 2.489985786277297, + "grad_norm": 1.0001167058944702, + "learning_rate": 3.481638748921137e-06, + "loss": 0.8772, + "num_input_tokens_seen": 157859840, + "step": 19270 + }, + { + "epoch": 2.491277942886678, + "grad_norm": 1.0365166664123535, + "learning_rate": 3.4644378720659648e-06, + "loss": 0.5995, + "num_input_tokens_seen": 157941760, + "step": 19280 + }, + { + "epoch": 2.492570099496059, + "grad_norm": 0.8598686456680298, + "learning_rate": 3.4472764267462486e-06, + "loss": 0.8775, + "num_input_tokens_seen": 158023680, + "step": 19290 + }, + { + "epoch": 2.49386225610544, + "grad_norm": 0.7252724170684814, + "learning_rate": 3.430154444384523e-06, + "loss": 0.827, + "num_input_tokens_seen": 158105600, + "step": 19300 + }, + { + "epoch": 2.495154412714821, + "grad_norm": 0.9044560194015503, + "learning_rate": 3.4130719563310877e-06, + "loss": 0.6568, + "num_input_tokens_seen": 158187520, + "step": 19310 + }, + { + "epoch": 2.496446569324202, + "grad_norm": 0.3147662281990051, + "learning_rate": 3.396028993863906e-06, + "loss": 0.8331, + "num_input_tokens_seen": 158269440, + "step": 19320 + }, + { + "epoch": 2.497738725933583, + "grad_norm": 0.6290098428726196, + "learning_rate": 3.379025588188578e-06, + "loss": 0.6592, + "num_input_tokens_seen": 158351360, + "step": 19330 + }, + { + "epoch": 2.4990308825429643, + "grad_norm": 0.49024534225463867, + "learning_rate": 3.362061770438285e-06, + "loss": 0.9447, + "num_input_tokens_seen": 158433280, + "step": 19340 + }, + { + "epoch": 2.500323039152345, + "grad_norm": 1.2774690389633179, + "learning_rate": 3.3451375716737067e-06, + "loss": 0.4547, + "num_input_tokens_seen": 158515200, + "step": 19350 + }, + { + "epoch": 2.501615195761726, + "grad_norm": 0.6173495650291443, + "learning_rate": 3.328253022883002e-06, + "loss": 0.8312, + "num_input_tokens_seen": 158597120, + "step": 19360 + }, + { + "epoch": 2.5029073523711074, + "grad_norm": 0.7313811779022217, + "learning_rate": 3.3114081549817018e-06, + "loss": 0.744, + "num_input_tokens_seen": 158679040, + "step": 19370 + }, + { + "epoch": 2.5041995089804887, + "grad_norm": 0.28148242831230164, + "learning_rate": 3.2946029988127068e-06, + "loss": 0.6551, + "num_input_tokens_seen": 158760960, + "step": 19380 + }, + { + "epoch": 2.5054916655898696, + "grad_norm": 0.8187404870986938, + "learning_rate": 3.2778375851462013e-06, + "loss": 1.1236, + "num_input_tokens_seen": 158842880, + "step": 19390 + }, + { + "epoch": 2.5067838221992504, + "grad_norm": 0.7850127220153809, + "learning_rate": 3.2611119446795844e-06, + "loss": 0.7408, + "num_input_tokens_seen": 158924800, + "step": 19400 + }, + { + "epoch": 2.5080759788086318, + "grad_norm": 0.9370409250259399, + "learning_rate": 3.2444261080374546e-06, + "loss": 0.8042, + "num_input_tokens_seen": 159006720, + "step": 19410 + }, + { + "epoch": 2.5093681354180126, + "grad_norm": 0.6807863116264343, + "learning_rate": 3.227780105771505e-06, + "loss": 0.7672, + "num_input_tokens_seen": 159088640, + "step": 19420 + }, + { + "epoch": 2.510660292027394, + "grad_norm": 0.713300347328186, + "learning_rate": 3.2111739683605204e-06, + "loss": 0.9146, + "num_input_tokens_seen": 159170560, + "step": 19430 + }, + { + "epoch": 2.511952448636775, + "grad_norm": 0.7816091179847717, + "learning_rate": 3.194607726210261e-06, + "loss": 0.8129, + "num_input_tokens_seen": 159252480, + "step": 19440 + }, + { + "epoch": 2.5132446052461557, + "grad_norm": 0.1920579969882965, + "learning_rate": 3.178081409653469e-06, + "loss": 0.6187, + "num_input_tokens_seen": 159334400, + "step": 19450 + }, + { + "epoch": 2.514536761855537, + "grad_norm": 0.6369209885597229, + "learning_rate": 3.1615950489497587e-06, + "loss": 0.7696, + "num_input_tokens_seen": 159416320, + "step": 19460 + }, + { + "epoch": 2.515828918464918, + "grad_norm": 0.4802658259868622, + "learning_rate": 3.1451486742856055e-06, + "loss": 0.8106, + "num_input_tokens_seen": 159498240, + "step": 19470 + }, + { + "epoch": 2.517121075074299, + "grad_norm": 0.8319321274757385, + "learning_rate": 3.128742315774255e-06, + "loss": 0.8289, + "num_input_tokens_seen": 159580160, + "step": 19480 + }, + { + "epoch": 2.51841323168368, + "grad_norm": 0.8333976864814758, + "learning_rate": 3.1123760034556943e-06, + "loss": 0.4341, + "num_input_tokens_seen": 159662080, + "step": 19490 + }, + { + "epoch": 2.519705388293061, + "grad_norm": 1.237228274345398, + "learning_rate": 3.0960497672965825e-06, + "loss": 0.7386, + "num_input_tokens_seen": 159744000, + "step": 19500 + }, + { + "epoch": 2.520997544902442, + "grad_norm": 0.8341740369796753, + "learning_rate": 3.0797636371901863e-06, + "loss": 0.6727, + "num_input_tokens_seen": 159825920, + "step": 19510 + }, + { + "epoch": 2.522289701511823, + "grad_norm": 0.6844837069511414, + "learning_rate": 3.063517642956365e-06, + "loss": 0.8113, + "num_input_tokens_seen": 159907840, + "step": 19520 + }, + { + "epoch": 2.5235818581212044, + "grad_norm": 0.8005863428115845, + "learning_rate": 3.0473118143414634e-06, + "loss": 0.793, + "num_input_tokens_seen": 159989760, + "step": 19530 + }, + { + "epoch": 2.5248740147305853, + "grad_norm": 0.6440595984458923, + "learning_rate": 3.031146181018299e-06, + "loss": 0.8666, + "num_input_tokens_seen": 160071680, + "step": 19540 + }, + { + "epoch": 2.526166171339966, + "grad_norm": 0.6987878680229187, + "learning_rate": 3.0150207725860912e-06, + "loss": 0.6603, + "num_input_tokens_seen": 160153600, + "step": 19550 + }, + { + "epoch": 2.5274583279493474, + "grad_norm": 0.9995496273040771, + "learning_rate": 2.9989356185703975e-06, + "loss": 0.6602, + "num_input_tokens_seen": 160235520, + "step": 19560 + }, + { + "epoch": 2.5287504845587288, + "grad_norm": 0.8544566035270691, + "learning_rate": 2.982890748423084e-06, + "loss": 0.4753, + "num_input_tokens_seen": 160317440, + "step": 19570 + }, + { + "epoch": 2.5300426411681096, + "grad_norm": 0.9914432764053345, + "learning_rate": 2.9668861915222364e-06, + "loss": 0.9147, + "num_input_tokens_seen": 160399360, + "step": 19580 + }, + { + "epoch": 2.5313347977774905, + "grad_norm": 0.5936645865440369, + "learning_rate": 2.950921977172155e-06, + "loss": 0.9882, + "num_input_tokens_seen": 160481280, + "step": 19590 + }, + { + "epoch": 2.532626954386872, + "grad_norm": 0.45081570744514465, + "learning_rate": 2.934998134603245e-06, + "loss": 0.5187, + "num_input_tokens_seen": 160563200, + "step": 19600 + }, + { + "epoch": 2.5339191109962527, + "grad_norm": 0.7718636393547058, + "learning_rate": 2.919114692972008e-06, + "loss": 0.7276, + "num_input_tokens_seen": 160645120, + "step": 19610 + }, + { + "epoch": 2.535211267605634, + "grad_norm": 0.371920108795166, + "learning_rate": 2.9032716813609723e-06, + "loss": 0.6929, + "num_input_tokens_seen": 160727040, + "step": 19620 + }, + { + "epoch": 2.536503424215015, + "grad_norm": 0.7801430225372314, + "learning_rate": 2.8874691287786275e-06, + "loss": 0.5275, + "num_input_tokens_seen": 160808960, + "step": 19630 + }, + { + "epoch": 2.5377955808243957, + "grad_norm": 0.8307329416275024, + "learning_rate": 2.8717070641593987e-06, + "loss": 0.6655, + "num_input_tokens_seen": 160890880, + "step": 19640 + }, + { + "epoch": 2.539087737433777, + "grad_norm": 0.44690385460853577, + "learning_rate": 2.8559855163635544e-06, + "loss": 0.6951, + "num_input_tokens_seen": 160972800, + "step": 19650 + }, + { + "epoch": 2.540379894043158, + "grad_norm": 1.1183629035949707, + "learning_rate": 2.8403045141772054e-06, + "loss": 0.6766, + "num_input_tokens_seen": 161054720, + "step": 19660 + }, + { + "epoch": 2.541672050652539, + "grad_norm": 0.7904671430587769, + "learning_rate": 2.824664086312204e-06, + "loss": 0.7144, + "num_input_tokens_seen": 161136640, + "step": 19670 + }, + { + "epoch": 2.54296420726192, + "grad_norm": 0.6050812602043152, + "learning_rate": 2.809064261406111e-06, + "loss": 0.8016, + "num_input_tokens_seen": 161218560, + "step": 19680 + }, + { + "epoch": 2.544256363871301, + "grad_norm": 0.7455185651779175, + "learning_rate": 2.7935050680221565e-06, + "loss": 0.808, + "num_input_tokens_seen": 161300480, + "step": 19690 + }, + { + "epoch": 2.5455485204806823, + "grad_norm": 0.6713374853134155, + "learning_rate": 2.7779865346491576e-06, + "loss": 0.6117, + "num_input_tokens_seen": 161382400, + "step": 19700 + }, + { + "epoch": 2.5468406770900636, + "grad_norm": 0.7479690909385681, + "learning_rate": 2.762508689701504e-06, + "loss": 0.8894, + "num_input_tokens_seen": 161464320, + "step": 19710 + }, + { + "epoch": 2.5481328336994444, + "grad_norm": 0.8708047270774841, + "learning_rate": 2.74707156151906e-06, + "loss": 0.9919, + "num_input_tokens_seen": 161546240, + "step": 19720 + }, + { + "epoch": 2.5494249903088253, + "grad_norm": 0.33944186568260193, + "learning_rate": 2.7316751783671655e-06, + "loss": 1.0815, + "num_input_tokens_seen": 161628160, + "step": 19730 + }, + { + "epoch": 2.5507171469182066, + "grad_norm": 0.6622774004936218, + "learning_rate": 2.716319568436529e-06, + "loss": 0.9753, + "num_input_tokens_seen": 161710080, + "step": 19740 + }, + { + "epoch": 2.5520093035275875, + "grad_norm": 1.0370920896530151, + "learning_rate": 2.7010047598432205e-06, + "loss": 1.0534, + "num_input_tokens_seen": 161792000, + "step": 19750 + }, + { + "epoch": 2.553301460136969, + "grad_norm": 0.35142797231674194, + "learning_rate": 2.6857307806286037e-06, + "loss": 0.5268, + "num_input_tokens_seen": 161873920, + "step": 19760 + }, + { + "epoch": 2.5545936167463497, + "grad_norm": 1.3889226913452148, + "learning_rate": 2.6704976587592688e-06, + "loss": 0.7309, + "num_input_tokens_seen": 161955840, + "step": 19770 + }, + { + "epoch": 2.5558857733557305, + "grad_norm": 0.6227067708969116, + "learning_rate": 2.655305422127016e-06, + "loss": 0.9389, + "num_input_tokens_seen": 162037760, + "step": 19780 + }, + { + "epoch": 2.557177929965112, + "grad_norm": 0.35299497842788696, + "learning_rate": 2.6401540985487667e-06, + "loss": 0.6891, + "num_input_tokens_seen": 162119680, + "step": 19790 + }, + { + "epoch": 2.5584700865744927, + "grad_norm": 1.1497657299041748, + "learning_rate": 2.6250437157665455e-06, + "loss": 0.6537, + "num_input_tokens_seen": 162201600, + "step": 19800 + }, + { + "epoch": 2.559762243183874, + "grad_norm": 0.7763955593109131, + "learning_rate": 2.6099743014474014e-06, + "loss": 0.7283, + "num_input_tokens_seen": 162283520, + "step": 19810 + }, + { + "epoch": 2.561054399793255, + "grad_norm": 0.6793556809425354, + "learning_rate": 2.594945883183386e-06, + "loss": 0.6683, + "num_input_tokens_seen": 162365440, + "step": 19820 + }, + { + "epoch": 2.5623465564026358, + "grad_norm": 0.6524314880371094, + "learning_rate": 2.5799584884914685e-06, + "loss": 0.855, + "num_input_tokens_seen": 162447360, + "step": 19830 + }, + { + "epoch": 2.563638713012017, + "grad_norm": 1.6065878868103027, + "learning_rate": 2.5650121448135222e-06, + "loss": 0.9368, + "num_input_tokens_seen": 162529280, + "step": 19840 + }, + { + "epoch": 2.5649308696213984, + "grad_norm": 0.7918826937675476, + "learning_rate": 2.550106879516237e-06, + "loss": 0.5496, + "num_input_tokens_seen": 162611200, + "step": 19850 + }, + { + "epoch": 2.5662230262307792, + "grad_norm": 0.7481831312179565, + "learning_rate": 2.535242719891112e-06, + "loss": 1.0861, + "num_input_tokens_seen": 162693120, + "step": 19860 + }, + { + "epoch": 2.56751518284016, + "grad_norm": 0.519053041934967, + "learning_rate": 2.5204196931543635e-06, + "loss": 0.4623, + "num_input_tokens_seen": 162775040, + "step": 19870 + }, + { + "epoch": 2.5688073394495414, + "grad_norm": 0.7055248618125916, + "learning_rate": 2.505637826446891e-06, + "loss": 1.0828, + "num_input_tokens_seen": 162856960, + "step": 19880 + }, + { + "epoch": 2.5700994960589223, + "grad_norm": 0.2592199444770813, + "learning_rate": 2.4908971468342535e-06, + "loss": 0.8837, + "num_input_tokens_seen": 162938880, + "step": 19890 + }, + { + "epoch": 2.5713916526683036, + "grad_norm": 0.6449967622756958, + "learning_rate": 2.4761976813065663e-06, + "loss": 0.986, + "num_input_tokens_seen": 163020800, + "step": 19900 + }, + { + "epoch": 2.5726838092776845, + "grad_norm": 0.601736307144165, + "learning_rate": 2.4615394567785055e-06, + "loss": 0.9548, + "num_input_tokens_seen": 163102720, + "step": 19910 + }, + { + "epoch": 2.5739759658870653, + "grad_norm": 0.7824547290802002, + "learning_rate": 2.44692250008923e-06, + "loss": 1.0377, + "num_input_tokens_seen": 163184640, + "step": 19920 + }, + { + "epoch": 2.5752681224964467, + "grad_norm": 0.4208737015724182, + "learning_rate": 2.432346838002325e-06, + "loss": 0.8889, + "num_input_tokens_seen": 163266560, + "step": 19930 + }, + { + "epoch": 2.5765602791058275, + "grad_norm": 0.7913038730621338, + "learning_rate": 2.417812497205782e-06, + "loss": 0.7366, + "num_input_tokens_seen": 163348480, + "step": 19940 + }, + { + "epoch": 2.577852435715209, + "grad_norm": 1.0949358940124512, + "learning_rate": 2.403319504311921e-06, + "loss": 0.9359, + "num_input_tokens_seen": 163430400, + "step": 19950 + }, + { + "epoch": 2.5791445923245897, + "grad_norm": 0.877521276473999, + "learning_rate": 2.3888678858573625e-06, + "loss": 1.1218, + "num_input_tokens_seen": 163512320, + "step": 19960 + }, + { + "epoch": 2.5804367489339706, + "grad_norm": 0.8615255951881409, + "learning_rate": 2.374457668302962e-06, + "loss": 0.8245, + "num_input_tokens_seen": 163594240, + "step": 19970 + }, + { + "epoch": 2.581728905543352, + "grad_norm": 0.5929073691368103, + "learning_rate": 2.360088878033778e-06, + "loss": 1.0923, + "num_input_tokens_seen": 163676160, + "step": 19980 + }, + { + "epoch": 2.5830210621527327, + "grad_norm": 0.7284789681434631, + "learning_rate": 2.3457615413590177e-06, + "loss": 1.161, + "num_input_tokens_seen": 163758080, + "step": 19990 + }, + { + "epoch": 2.584313218762114, + "grad_norm": 1.2667911052703857, + "learning_rate": 2.3314756845119746e-06, + "loss": 0.9189, + "num_input_tokens_seen": 163840000, + "step": 20000 + }, + { + "epoch": 2.585605375371495, + "grad_norm": 0.672854483127594, + "learning_rate": 2.317231333650005e-06, + "loss": 0.8559, + "num_input_tokens_seen": 163921920, + "step": 20010 + }, + { + "epoch": 2.5868975319808762, + "grad_norm": 0.6795851588249207, + "learning_rate": 2.3030285148544577e-06, + "loss": 0.9338, + "num_input_tokens_seen": 164003840, + "step": 20020 + }, + { + "epoch": 2.588189688590257, + "grad_norm": 0.6645368933677673, + "learning_rate": 2.2888672541306525e-06, + "loss": 0.7432, + "num_input_tokens_seen": 164085760, + "step": 20030 + }, + { + "epoch": 2.5894818451996384, + "grad_norm": 0.7248007655143738, + "learning_rate": 2.2747475774077986e-06, + "loss": 0.7769, + "num_input_tokens_seen": 164167680, + "step": 20040 + }, + { + "epoch": 2.5907740018090193, + "grad_norm": 0.7931993007659912, + "learning_rate": 2.2606695105389653e-06, + "loss": 0.8439, + "num_input_tokens_seen": 164249600, + "step": 20050 + }, + { + "epoch": 2.5920661584184, + "grad_norm": 0.8612631559371948, + "learning_rate": 2.2466330793010555e-06, + "loss": 0.772, + "num_input_tokens_seen": 164331520, + "step": 20060 + }, + { + "epoch": 2.5933583150277815, + "grad_norm": 0.7526144981384277, + "learning_rate": 2.2326383093947135e-06, + "loss": 0.7069, + "num_input_tokens_seen": 164413440, + "step": 20070 + }, + { + "epoch": 2.5946504716371623, + "grad_norm": 1.1278603076934814, + "learning_rate": 2.2186852264443196e-06, + "loss": 0.9164, + "num_input_tokens_seen": 164495360, + "step": 20080 + }, + { + "epoch": 2.5959426282465436, + "grad_norm": 0.7620711326599121, + "learning_rate": 2.2047738559979104e-06, + "loss": 0.8105, + "num_input_tokens_seen": 164577280, + "step": 20090 + }, + { + "epoch": 2.5972347848559245, + "grad_norm": 0.6490961909294128, + "learning_rate": 2.1909042235271597e-06, + "loss": 1.0952, + "num_input_tokens_seen": 164659200, + "step": 20100 + }, + { + "epoch": 2.5985269414653054, + "grad_norm": 1.2163642644882202, + "learning_rate": 2.1770763544273098e-06, + "loss": 0.9695, + "num_input_tokens_seen": 164741120, + "step": 20110 + }, + { + "epoch": 2.5998190980746867, + "grad_norm": 0.4267684519290924, + "learning_rate": 2.1632902740171378e-06, + "loss": 0.8496, + "num_input_tokens_seen": 164823040, + "step": 20120 + }, + { + "epoch": 2.6011112546840676, + "grad_norm": 0.7075666785240173, + "learning_rate": 2.1495460075389133e-06, + "loss": 0.7883, + "num_input_tokens_seen": 164904960, + "step": 20130 + }, + { + "epoch": 2.602403411293449, + "grad_norm": 0.6190396547317505, + "learning_rate": 2.1358435801583283e-06, + "loss": 0.6911, + "num_input_tokens_seen": 164986880, + "step": 20140 + }, + { + "epoch": 2.6036955679028297, + "grad_norm": 0.3564266264438629, + "learning_rate": 2.122183016964488e-06, + "loss": 0.5446, + "num_input_tokens_seen": 165068800, + "step": 20150 + }, + { + "epoch": 2.6049877245122106, + "grad_norm": 0.7317832112312317, + "learning_rate": 2.1085643429698236e-06, + "loss": 0.7104, + "num_input_tokens_seen": 165150720, + "step": 20160 + }, + { + "epoch": 2.606279881121592, + "grad_norm": 0.6708076000213623, + "learning_rate": 2.094987583110086e-06, + "loss": 0.9044, + "num_input_tokens_seen": 165232640, + "step": 20170 + }, + { + "epoch": 2.6075720377309732, + "grad_norm": 0.6246494650840759, + "learning_rate": 2.0814527622442626e-06, + "loss": 0.687, + "num_input_tokens_seen": 165314560, + "step": 20180 + }, + { + "epoch": 2.608864194340354, + "grad_norm": 0.7877633571624756, + "learning_rate": 2.067959905154568e-06, + "loss": 0.791, + "num_input_tokens_seen": 165396480, + "step": 20190 + }, + { + "epoch": 2.610156350949735, + "grad_norm": 0.33362698554992676, + "learning_rate": 2.0545090365463788e-06, + "loss": 0.508, + "num_input_tokens_seen": 165478400, + "step": 20200 + }, + { + "epoch": 2.6114485075591163, + "grad_norm": 0.5828604102134705, + "learning_rate": 2.041100181048178e-06, + "loss": 0.9302, + "num_input_tokens_seen": 165560320, + "step": 20210 + }, + { + "epoch": 2.612740664168497, + "grad_norm": 0.7072322964668274, + "learning_rate": 2.0277333632115288e-06, + "loss": 0.7966, + "num_input_tokens_seen": 165642240, + "step": 20220 + }, + { + "epoch": 2.6140328207778785, + "grad_norm": 0.760685920715332, + "learning_rate": 2.0144086075110367e-06, + "loss": 1.1115, + "num_input_tokens_seen": 165724160, + "step": 20230 + }, + { + "epoch": 2.6153249773872593, + "grad_norm": 1.190176248550415, + "learning_rate": 2.001125938344273e-06, + "loss": 0.6785, + "num_input_tokens_seen": 165806080, + "step": 20240 + }, + { + "epoch": 2.61661713399664, + "grad_norm": 0.2874329686164856, + "learning_rate": 1.9878853800317535e-06, + "loss": 0.8553, + "num_input_tokens_seen": 165888000, + "step": 20250 + }, + { + "epoch": 2.6179092906060215, + "grad_norm": 1.3199117183685303, + "learning_rate": 1.9746869568168985e-06, + "loss": 0.7277, + "num_input_tokens_seen": 165969920, + "step": 20260 + }, + { + "epoch": 2.6192014472154024, + "grad_norm": 0.7847501039505005, + "learning_rate": 1.9615306928659677e-06, + "loss": 0.823, + "num_input_tokens_seen": 166051840, + "step": 20270 + }, + { + "epoch": 2.6204936038247837, + "grad_norm": 0.6932337284088135, + "learning_rate": 1.948416612268034e-06, + "loss": 0.9189, + "num_input_tokens_seen": 166133760, + "step": 20280 + }, + { + "epoch": 2.6217857604341646, + "grad_norm": 0.6914857029914856, + "learning_rate": 1.935344739034936e-06, + "loss": 0.8662, + "num_input_tokens_seen": 166215680, + "step": 20290 + }, + { + "epoch": 2.6230779170435454, + "grad_norm": 0.6377202868461609, + "learning_rate": 1.922315097101218e-06, + "loss": 0.9959, + "num_input_tokens_seen": 166297600, + "step": 20300 + }, + { + "epoch": 2.6243700736529267, + "grad_norm": 0.9114018082618713, + "learning_rate": 1.909327710324116e-06, + "loss": 0.4612, + "num_input_tokens_seen": 166379520, + "step": 20310 + }, + { + "epoch": 2.625662230262308, + "grad_norm": 0.7677037715911865, + "learning_rate": 1.8963826024834734e-06, + "loss": 0.9159, + "num_input_tokens_seen": 166461440, + "step": 20320 + }, + { + "epoch": 2.626954386871689, + "grad_norm": 0.4625994563102722, + "learning_rate": 1.8834797972817508e-06, + "loss": 0.6083, + "num_input_tokens_seen": 166543360, + "step": 20330 + }, + { + "epoch": 2.62824654348107, + "grad_norm": 0.6611701250076294, + "learning_rate": 1.8706193183439247e-06, + "loss": 0.8399, + "num_input_tokens_seen": 166625280, + "step": 20340 + }, + { + "epoch": 2.629538700090451, + "grad_norm": 0.528550386428833, + "learning_rate": 1.8578011892174924e-06, + "loss": 0.4694, + "num_input_tokens_seen": 166707200, + "step": 20350 + }, + { + "epoch": 2.630830856699832, + "grad_norm": 0.7156257033348083, + "learning_rate": 1.845025433372402e-06, + "loss": 1.0965, + "num_input_tokens_seen": 166789120, + "step": 20360 + }, + { + "epoch": 2.6321230133092133, + "grad_norm": 0.6360397934913635, + "learning_rate": 1.8322920742010086e-06, + "loss": 0.9704, + "num_input_tokens_seen": 166871040, + "step": 20370 + }, + { + "epoch": 2.633415169918594, + "grad_norm": 1.0048847198486328, + "learning_rate": 1.8196011350180563e-06, + "loss": 1.1178, + "num_input_tokens_seen": 166952960, + "step": 20380 + }, + { + "epoch": 2.634707326527975, + "grad_norm": 0.7727967500686646, + "learning_rate": 1.8069526390605968e-06, + "loss": 0.6866, + "num_input_tokens_seen": 167034880, + "step": 20390 + }, + { + "epoch": 2.6359994831373563, + "grad_norm": 0.8432693481445312, + "learning_rate": 1.7943466094879902e-06, + "loss": 0.8549, + "num_input_tokens_seen": 167116800, + "step": 20400 + }, + { + "epoch": 2.637291639746737, + "grad_norm": 0.6722807884216309, + "learning_rate": 1.7817830693818288e-06, + "loss": 0.7212, + "num_input_tokens_seen": 167198720, + "step": 20410 + }, + { + "epoch": 2.6385837963561185, + "grad_norm": 0.6678622961044312, + "learning_rate": 1.7692620417459004e-06, + "loss": 0.3765, + "num_input_tokens_seen": 167280640, + "step": 20420 + }, + { + "epoch": 2.6398759529654994, + "grad_norm": 0.7946398854255676, + "learning_rate": 1.7567835495061718e-06, + "loss": 0.9786, + "num_input_tokens_seen": 167362560, + "step": 20430 + }, + { + "epoch": 2.6411681095748802, + "grad_norm": 0.26235970854759216, + "learning_rate": 1.7443476155107052e-06, + "loss": 0.8443, + "num_input_tokens_seen": 167444480, + "step": 20440 + }, + { + "epoch": 2.6424602661842616, + "grad_norm": 0.9268401861190796, + "learning_rate": 1.7319542625296613e-06, + "loss": 0.9905, + "num_input_tokens_seen": 167526400, + "step": 20450 + }, + { + "epoch": 2.643752422793643, + "grad_norm": 0.7280152440071106, + "learning_rate": 1.7196035132552135e-06, + "loss": 0.7714, + "num_input_tokens_seen": 167608320, + "step": 20460 + }, + { + "epoch": 2.6450445794030237, + "grad_norm": 0.6951790452003479, + "learning_rate": 1.7072953903015498e-06, + "loss": 0.9703, + "num_input_tokens_seen": 167690240, + "step": 20470 + }, + { + "epoch": 2.6463367360124046, + "grad_norm": 0.5963300466537476, + "learning_rate": 1.6950299162047878e-06, + "loss": 0.7194, + "num_input_tokens_seen": 167772160, + "step": 20480 + }, + { + "epoch": 2.647628892621786, + "grad_norm": 1.0526518821716309, + "learning_rate": 1.682807113422971e-06, + "loss": 0.7812, + "num_input_tokens_seen": 167854080, + "step": 20490 + }, + { + "epoch": 2.648921049231167, + "grad_norm": 0.33847522735595703, + "learning_rate": 1.6706270043360117e-06, + "loss": 0.6726, + "num_input_tokens_seen": 167936000, + "step": 20500 + }, + { + "epoch": 2.650213205840548, + "grad_norm": 0.7132168412208557, + "learning_rate": 1.6584896112456338e-06, + "loss": 0.921, + "num_input_tokens_seen": 168017920, + "step": 20510 + }, + { + "epoch": 2.651505362449929, + "grad_norm": 0.6660868525505066, + "learning_rate": 1.646394956375369e-06, + "loss": 0.8371, + "num_input_tokens_seen": 168099840, + "step": 20520 + }, + { + "epoch": 2.65279751905931, + "grad_norm": 0.7083340287208557, + "learning_rate": 1.6343430618704775e-06, + "loss": 0.6175, + "num_input_tokens_seen": 168181760, + "step": 20530 + }, + { + "epoch": 2.654089675668691, + "grad_norm": 0.35995742678642273, + "learning_rate": 1.622333949797944e-06, + "loss": 0.6474, + "num_input_tokens_seen": 168263680, + "step": 20540 + }, + { + "epoch": 2.655381832278072, + "grad_norm": 0.6416080594062805, + "learning_rate": 1.6103676421463986e-06, + "loss": 0.6756, + "num_input_tokens_seen": 168345600, + "step": 20550 + }, + { + "epoch": 2.6566739888874533, + "grad_norm": 0.6493408679962158, + "learning_rate": 1.5984441608261152e-06, + "loss": 0.8549, + "num_input_tokens_seen": 168427520, + "step": 20560 + }, + { + "epoch": 2.657966145496834, + "grad_norm": 1.0145015716552734, + "learning_rate": 1.5865635276689412e-06, + "loss": 0.6964, + "num_input_tokens_seen": 168509440, + "step": 20570 + }, + { + "epoch": 2.659258302106215, + "grad_norm": 0.8252952098846436, + "learning_rate": 1.5747257644282726e-06, + "loss": 0.8505, + "num_input_tokens_seen": 168591360, + "step": 20580 + }, + { + "epoch": 2.6605504587155964, + "grad_norm": 1.0364081859588623, + "learning_rate": 1.5629308927790077e-06, + "loss": 0.4486, + "num_input_tokens_seen": 168673280, + "step": 20590 + }, + { + "epoch": 2.6618426153249772, + "grad_norm": 0.7188685536384583, + "learning_rate": 1.551178934317521e-06, + "loss": 0.7925, + "num_input_tokens_seen": 168755200, + "step": 20600 + }, + { + "epoch": 2.6631347719343585, + "grad_norm": 0.6135556101799011, + "learning_rate": 1.5394699105616002e-06, + "loss": 0.9941, + "num_input_tokens_seen": 168837120, + "step": 20610 + }, + { + "epoch": 2.6644269285437394, + "grad_norm": 0.39184391498565674, + "learning_rate": 1.5278038429504177e-06, + "loss": 0.8127, + "num_input_tokens_seen": 168919040, + "step": 20620 + }, + { + "epoch": 2.6657190851531203, + "grad_norm": 0.3787175416946411, + "learning_rate": 1.516180752844515e-06, + "loss": 0.3516, + "num_input_tokens_seen": 169000960, + "step": 20630 + }, + { + "epoch": 2.6670112417625016, + "grad_norm": 0.7270440459251404, + "learning_rate": 1.504600661525718e-06, + "loss": 0.7978, + "num_input_tokens_seen": 169082880, + "step": 20640 + }, + { + "epoch": 2.668303398371883, + "grad_norm": 0.8899626731872559, + "learning_rate": 1.493063590197133e-06, + "loss": 0.6851, + "num_input_tokens_seen": 169164800, + "step": 20650 + }, + { + "epoch": 2.6695955549812638, + "grad_norm": 0.7514258027076721, + "learning_rate": 1.4815695599830981e-06, + "loss": 1.0456, + "num_input_tokens_seen": 169246720, + "step": 20660 + }, + { + "epoch": 2.6708877115906446, + "grad_norm": 1.2644627094268799, + "learning_rate": 1.4701185919291372e-06, + "loss": 0.7482, + "num_input_tokens_seen": 169328640, + "step": 20670 + }, + { + "epoch": 2.672179868200026, + "grad_norm": 0.5645007491111755, + "learning_rate": 1.4587107070019368e-06, + "loss": 0.8197, + "num_input_tokens_seen": 169410560, + "step": 20680 + }, + { + "epoch": 2.673472024809407, + "grad_norm": 0.5567523241043091, + "learning_rate": 1.447345926089283e-06, + "loss": 0.8722, + "num_input_tokens_seen": 169492480, + "step": 20690 + }, + { + "epoch": 2.674764181418788, + "grad_norm": 1.2603198289871216, + "learning_rate": 1.436024270000058e-06, + "loss": 0.9786, + "num_input_tokens_seen": 169574400, + "step": 20700 + }, + { + "epoch": 2.676056338028169, + "grad_norm": 1.105191946029663, + "learning_rate": 1.4247457594641662e-06, + "loss": 0.8513, + "num_input_tokens_seen": 169656320, + "step": 20710 + }, + { + "epoch": 2.67734849463755, + "grad_norm": 0.5803696513175964, + "learning_rate": 1.4135104151325184e-06, + "loss": 0.8101, + "num_input_tokens_seen": 169738240, + "step": 20720 + }, + { + "epoch": 2.678640651246931, + "grad_norm": 0.6467001438140869, + "learning_rate": 1.4023182575769956e-06, + "loss": 0.5225, + "num_input_tokens_seen": 169820160, + "step": 20730 + }, + { + "epoch": 2.679932807856312, + "grad_norm": 1.2854204177856445, + "learning_rate": 1.391169307290391e-06, + "loss": 0.7088, + "num_input_tokens_seen": 169902080, + "step": 20740 + }, + { + "epoch": 2.6812249644656934, + "grad_norm": 0.6026389002799988, + "learning_rate": 1.3800635846863973e-06, + "loss": 0.9884, + "num_input_tokens_seen": 169984000, + "step": 20750 + }, + { + "epoch": 2.6825171210750742, + "grad_norm": 0.8329545259475708, + "learning_rate": 1.3690011100995437e-06, + "loss": 1.097, + "num_input_tokens_seen": 170065920, + "step": 20760 + }, + { + "epoch": 2.683809277684455, + "grad_norm": 1.0132107734680176, + "learning_rate": 1.357981903785191e-06, + "loss": 0.7501, + "num_input_tokens_seen": 170147840, + "step": 20770 + }, + { + "epoch": 2.6851014342938364, + "grad_norm": 0.5987882614135742, + "learning_rate": 1.3470059859194583e-06, + "loss": 0.9277, + "num_input_tokens_seen": 170229760, + "step": 20780 + }, + { + "epoch": 2.6863935909032177, + "grad_norm": 1.0641794204711914, + "learning_rate": 1.3360733765992116e-06, + "loss": 0.6113, + "num_input_tokens_seen": 170311680, + "step": 20790 + }, + { + "epoch": 2.6876857475125986, + "grad_norm": 0.6288622617721558, + "learning_rate": 1.325184095842022e-06, + "loss": 0.7514, + "num_input_tokens_seen": 170393600, + "step": 20800 + }, + { + "epoch": 2.6889779041219795, + "grad_norm": 1.762484073638916, + "learning_rate": 1.3143381635861207e-06, + "loss": 0.7529, + "num_input_tokens_seen": 170475520, + "step": 20810 + }, + { + "epoch": 2.6902700607313608, + "grad_norm": 0.8595255017280579, + "learning_rate": 1.3035355996903697e-06, + "loss": 0.8549, + "num_input_tokens_seen": 170557440, + "step": 20820 + }, + { + "epoch": 2.6915622173407416, + "grad_norm": 0.663703441619873, + "learning_rate": 1.2927764239342221e-06, + "loss": 0.7501, + "num_input_tokens_seen": 170639360, + "step": 20830 + }, + { + "epoch": 2.692854373950123, + "grad_norm": 0.9071587920188904, + "learning_rate": 1.2820606560176945e-06, + "loss": 0.9612, + "num_input_tokens_seen": 170721280, + "step": 20840 + }, + { + "epoch": 2.694146530559504, + "grad_norm": 1.5256946086883545, + "learning_rate": 1.2713883155613144e-06, + "loss": 0.8317, + "num_input_tokens_seen": 170803200, + "step": 20850 + }, + { + "epoch": 2.6954386871688847, + "grad_norm": 1.048254370689392, + "learning_rate": 1.2607594221060975e-06, + "loss": 0.6523, + "num_input_tokens_seen": 170885120, + "step": 20860 + }, + { + "epoch": 2.696730843778266, + "grad_norm": 0.8906717896461487, + "learning_rate": 1.2501739951135155e-06, + "loss": 0.8824, + "num_input_tokens_seen": 170967040, + "step": 20870 + }, + { + "epoch": 2.698023000387647, + "grad_norm": 0.6903151869773865, + "learning_rate": 1.2396320539654366e-06, + "loss": 0.8567, + "num_input_tokens_seen": 171048960, + "step": 20880 + }, + { + "epoch": 2.699315156997028, + "grad_norm": 1.00175940990448, + "learning_rate": 1.229133617964126e-06, + "loss": 1.1474, + "num_input_tokens_seen": 171130880, + "step": 20890 + }, + { + "epoch": 2.700607313606409, + "grad_norm": 0.7980899810791016, + "learning_rate": 1.2186787063321743e-06, + "loss": 0.5778, + "num_input_tokens_seen": 171212800, + "step": 20900 + }, + { + "epoch": 2.70189947021579, + "grad_norm": 0.848283052444458, + "learning_rate": 1.208267338212493e-06, + "loss": 0.9957, + "num_input_tokens_seen": 171294720, + "step": 20910 + }, + { + "epoch": 2.7031916268251712, + "grad_norm": 0.7135230898857117, + "learning_rate": 1.1978995326682535e-06, + "loss": 0.8154, + "num_input_tokens_seen": 171376640, + "step": 20920 + }, + { + "epoch": 2.7044837834345525, + "grad_norm": 0.7431100010871887, + "learning_rate": 1.1875753086828727e-06, + "loss": 0.3983, + "num_input_tokens_seen": 171458560, + "step": 20930 + }, + { + "epoch": 2.7057759400439334, + "grad_norm": 0.3723915219306946, + "learning_rate": 1.177294685159963e-06, + "loss": 0.8132, + "num_input_tokens_seen": 171540480, + "step": 20940 + }, + { + "epoch": 2.7070680966533143, + "grad_norm": 0.5851907134056091, + "learning_rate": 1.167057680923317e-06, + "loss": 0.6962, + "num_input_tokens_seen": 171622400, + "step": 20950 + }, + { + "epoch": 2.7083602532626956, + "grad_norm": 0.5370262861251831, + "learning_rate": 1.1568643147168434e-06, + "loss": 0.5466, + "num_input_tokens_seen": 171704320, + "step": 20960 + }, + { + "epoch": 2.7096524098720765, + "grad_norm": 0.6362653374671936, + "learning_rate": 1.1467146052045603e-06, + "loss": 0.9384, + "num_input_tokens_seen": 171786240, + "step": 20970 + }, + { + "epoch": 2.7109445664814578, + "grad_norm": 1.7740426063537598, + "learning_rate": 1.1366085709705515e-06, + "loss": 0.3847, + "num_input_tokens_seen": 171868160, + "step": 20980 + }, + { + "epoch": 2.7122367230908386, + "grad_norm": 0.7024763226509094, + "learning_rate": 1.1265462305189268e-06, + "loss": 0.7502, + "num_input_tokens_seen": 171950080, + "step": 20990 + }, + { + "epoch": 2.7135288797002195, + "grad_norm": 0.7445507645606995, + "learning_rate": 1.1165276022737926e-06, + "loss": 0.8216, + "num_input_tokens_seen": 172032000, + "step": 21000 + }, + { + "epoch": 2.714821036309601, + "grad_norm": 0.6791986227035522, + "learning_rate": 1.1065527045792251e-06, + "loss": 0.6687, + "num_input_tokens_seen": 172113920, + "step": 21010 + }, + { + "epoch": 2.7161131929189817, + "grad_norm": 0.999146580696106, + "learning_rate": 1.0966215556992231e-06, + "loss": 1.0147, + "num_input_tokens_seen": 172195840, + "step": 21020 + }, + { + "epoch": 2.717405349528363, + "grad_norm": 1.1012611389160156, + "learning_rate": 1.0867341738176857e-06, + "loss": 0.9388, + "num_input_tokens_seen": 172277760, + "step": 21030 + }, + { + "epoch": 2.718697506137744, + "grad_norm": 0.8878210783004761, + "learning_rate": 1.076890577038367e-06, + "loss": 0.9003, + "num_input_tokens_seen": 172359680, + "step": 21040 + }, + { + "epoch": 2.7199896627471247, + "grad_norm": 0.6756826043128967, + "learning_rate": 1.0670907833848664e-06, + "loss": 0.6774, + "num_input_tokens_seen": 172441600, + "step": 21050 + }, + { + "epoch": 2.721281819356506, + "grad_norm": 0.6432533860206604, + "learning_rate": 1.0573348108005614e-06, + "loss": 0.9375, + "num_input_tokens_seen": 172523520, + "step": 21060 + }, + { + "epoch": 2.722573975965887, + "grad_norm": 0.5785542726516724, + "learning_rate": 1.0476226771486074e-06, + "loss": 0.5949, + "num_input_tokens_seen": 172605440, + "step": 21070 + }, + { + "epoch": 2.723866132575268, + "grad_norm": 0.6814249753952026, + "learning_rate": 1.0379544002118824e-06, + "loss": 0.5954, + "num_input_tokens_seen": 172687360, + "step": 21080 + }, + { + "epoch": 2.725158289184649, + "grad_norm": 0.6625814437866211, + "learning_rate": 1.0283299976929672e-06, + "loss": 0.4849, + "num_input_tokens_seen": 172769280, + "step": 21090 + }, + { + "epoch": 2.7264504457940304, + "grad_norm": 0.4581983983516693, + "learning_rate": 1.0187494872141102e-06, + "loss": 0.6476, + "num_input_tokens_seen": 172851200, + "step": 21100 + }, + { + "epoch": 2.7277426024034113, + "grad_norm": 0.617347240447998, + "learning_rate": 1.0092128863171846e-06, + "loss": 1.1527, + "num_input_tokens_seen": 172933120, + "step": 21110 + }, + { + "epoch": 2.7290347590127926, + "grad_norm": 0.6780828833580017, + "learning_rate": 9.997202124636785e-07, + "loss": 0.7512, + "num_input_tokens_seen": 173015040, + "step": 21120 + }, + { + "epoch": 2.7303269156221734, + "grad_norm": 1.0733816623687744, + "learning_rate": 9.902714830346437e-07, + "loss": 0.6027, + "num_input_tokens_seen": 173096960, + "step": 21130 + }, + { + "epoch": 2.7316190722315543, + "grad_norm": 0.6169953942298889, + "learning_rate": 9.808667153306612e-07, + "loss": 0.5431, + "num_input_tokens_seen": 173178880, + "step": 21140 + }, + { + "epoch": 2.7329112288409356, + "grad_norm": 0.3251383602619171, + "learning_rate": 9.715059265718335e-07, + "loss": 0.7888, + "num_input_tokens_seen": 173260800, + "step": 21150 + }, + { + "epoch": 2.7342033854503165, + "grad_norm": 0.7879831194877625, + "learning_rate": 9.62189133897723e-07, + "loss": 0.9271, + "num_input_tokens_seen": 173342720, + "step": 21160 + }, + { + "epoch": 2.735495542059698, + "grad_norm": 0.988198459148407, + "learning_rate": 9.52916354367353e-07, + "loss": 0.8073, + "num_input_tokens_seen": 173424640, + "step": 21170 + }, + { + "epoch": 2.7367876986690787, + "grad_norm": 0.5871654152870178, + "learning_rate": 9.436876049591398e-07, + "loss": 1.2963, + "num_input_tokens_seen": 173506560, + "step": 21180 + }, + { + "epoch": 2.7380798552784595, + "grad_norm": 0.9331020712852478, + "learning_rate": 9.345029025708995e-07, + "loss": 0.6766, + "num_input_tokens_seen": 173588480, + "step": 21190 + }, + { + "epoch": 2.739372011887841, + "grad_norm": 1.031977891921997, + "learning_rate": 9.253622640197773e-07, + "loss": 0.5872, + "num_input_tokens_seen": 173670400, + "step": 21200 + }, + { + "epoch": 2.7406641684972217, + "grad_norm": 0.28784510493278503, + "learning_rate": 9.162657060422574e-07, + "loss": 0.5351, + "num_input_tokens_seen": 173752320, + "step": 21210 + }, + { + "epoch": 2.741956325106603, + "grad_norm": 0.7928471565246582, + "learning_rate": 9.072132452941002e-07, + "loss": 0.6825, + "num_input_tokens_seen": 173834240, + "step": 21220 + }, + { + "epoch": 2.743248481715984, + "grad_norm": 0.6947720646858215, + "learning_rate": 8.982048983503271e-07, + "loss": 0.84, + "num_input_tokens_seen": 173916160, + "step": 21230 + }, + { + "epoch": 2.7445406383253648, + "grad_norm": 0.6338273882865906, + "learning_rate": 8.892406817051946e-07, + "loss": 0.4665, + "num_input_tokens_seen": 173998080, + "step": 21240 + }, + { + "epoch": 2.745832794934746, + "grad_norm": 0.6995043754577637, + "learning_rate": 8.803206117721424e-07, + "loss": 0.8926, + "num_input_tokens_seen": 174080000, + "step": 21250 + }, + { + "epoch": 2.7471249515441274, + "grad_norm": 0.9222922325134277, + "learning_rate": 8.714447048837948e-07, + "loss": 0.8874, + "num_input_tokens_seen": 174161920, + "step": 21260 + }, + { + "epoch": 2.7484171081535083, + "grad_norm": 1.1156824827194214, + "learning_rate": 8.626129772918962e-07, + "loss": 0.9445, + "num_input_tokens_seen": 174243840, + "step": 21270 + }, + { + "epoch": 2.749709264762889, + "grad_norm": 0.2606189250946045, + "learning_rate": 8.538254451673138e-07, + "loss": 0.7208, + "num_input_tokens_seen": 174325760, + "step": 21280 + }, + { + "epoch": 2.7510014213722704, + "grad_norm": 1.007570743560791, + "learning_rate": 8.450821245999829e-07, + "loss": 0.6216, + "num_input_tokens_seen": 174407680, + "step": 21290 + }, + { + "epoch": 2.7522935779816513, + "grad_norm": 1.0008625984191895, + "learning_rate": 8.363830315988947e-07, + "loss": 0.9531, + "num_input_tokens_seen": 174489600, + "step": 21300 + }, + { + "epoch": 2.7535857345910326, + "grad_norm": 0.3577563166618347, + "learning_rate": 8.277281820920523e-07, + "loss": 0.7485, + "num_input_tokens_seen": 174571520, + "step": 21310 + }, + { + "epoch": 2.7548778912004135, + "grad_norm": 0.6377021074295044, + "learning_rate": 8.191175919264604e-07, + "loss": 1.0028, + "num_input_tokens_seen": 174653440, + "step": 21320 + }, + { + "epoch": 2.7561700478097944, + "grad_norm": 1.360044002532959, + "learning_rate": 8.105512768680712e-07, + "loss": 0.7594, + "num_input_tokens_seen": 174735360, + "step": 21330 + }, + { + "epoch": 2.7574622044191757, + "grad_norm": 0.5991158485412598, + "learning_rate": 8.02029252601777e-07, + "loss": 0.7408, + "num_input_tokens_seen": 174817280, + "step": 21340 + }, + { + "epoch": 2.7587543610285565, + "grad_norm": 0.593151330947876, + "learning_rate": 7.935515347313793e-07, + "loss": 0.845, + "num_input_tokens_seen": 174899200, + "step": 21350 + }, + { + "epoch": 2.760046517637938, + "grad_norm": 0.6272279620170593, + "learning_rate": 7.851181387795392e-07, + "loss": 0.9223, + "num_input_tokens_seen": 174981120, + "step": 21360 + }, + { + "epoch": 2.7613386742473187, + "grad_norm": 0.7071083188056946, + "learning_rate": 7.767290801877796e-07, + "loss": 0.5397, + "num_input_tokens_seen": 175063040, + "step": 21370 + }, + { + "epoch": 2.7626308308566996, + "grad_norm": 0.23855863511562347, + "learning_rate": 7.683843743164359e-07, + "loss": 0.7537, + "num_input_tokens_seen": 175144960, + "step": 21380 + }, + { + "epoch": 2.763922987466081, + "grad_norm": 0.9484770894050598, + "learning_rate": 7.600840364446333e-07, + "loss": 0.7232, + "num_input_tokens_seen": 175226880, + "step": 21390 + }, + { + "epoch": 2.765215144075462, + "grad_norm": 0.7923175096511841, + "learning_rate": 7.518280817702616e-07, + "loss": 0.683, + "num_input_tokens_seen": 175308800, + "step": 21400 + }, + { + "epoch": 2.766507300684843, + "grad_norm": 1.019843578338623, + "learning_rate": 7.436165254099376e-07, + "loss": 0.9568, + "num_input_tokens_seen": 175390720, + "step": 21410 + }, + { + "epoch": 2.767799457294224, + "grad_norm": 0.6530530452728271, + "learning_rate": 7.354493823990006e-07, + "loss": 0.8146, + "num_input_tokens_seen": 175472640, + "step": 21420 + }, + { + "epoch": 2.7690916139036053, + "grad_norm": 0.8259232044219971, + "learning_rate": 7.273266676914498e-07, + "loss": 0.8714, + "num_input_tokens_seen": 175554560, + "step": 21430 + }, + { + "epoch": 2.770383770512986, + "grad_norm": 0.8644856214523315, + "learning_rate": 7.19248396159955e-07, + "loss": 0.621, + "num_input_tokens_seen": 175636480, + "step": 21440 + }, + { + "epoch": 2.7716759271223674, + "grad_norm": 0.7736506462097168, + "learning_rate": 7.112145825957927e-07, + "loss": 0.8045, + "num_input_tokens_seen": 175718400, + "step": 21450 + }, + { + "epoch": 2.7729680837317483, + "grad_norm": 0.886471152305603, + "learning_rate": 7.03225241708852e-07, + "loss": 1.0378, + "num_input_tokens_seen": 175800320, + "step": 21460 + }, + { + "epoch": 2.774260240341129, + "grad_norm": 0.7383142113685608, + "learning_rate": 6.952803881275894e-07, + "loss": 0.5995, + "num_input_tokens_seen": 175882240, + "step": 21470 + }, + { + "epoch": 2.7755523969505105, + "grad_norm": 1.1481050252914429, + "learning_rate": 6.873800363989935e-07, + "loss": 0.9361, + "num_input_tokens_seen": 175964160, + "step": 21480 + }, + { + "epoch": 2.7768445535598913, + "grad_norm": 1.066712737083435, + "learning_rate": 6.795242009885905e-07, + "loss": 0.7132, + "num_input_tokens_seen": 176046080, + "step": 21490 + }, + { + "epoch": 2.7781367101692727, + "grad_norm": 0.7255296111106873, + "learning_rate": 6.717128962803798e-07, + "loss": 0.9569, + "num_input_tokens_seen": 176128000, + "step": 21500 + }, + { + "epoch": 2.7794288667786535, + "grad_norm": 0.3221868574619293, + "learning_rate": 6.63946136576829e-07, + "loss": 0.5613, + "num_input_tokens_seen": 176209920, + "step": 21510 + }, + { + "epoch": 2.7807210233880344, + "grad_norm": 0.8177877068519592, + "learning_rate": 6.562239360988542e-07, + "loss": 0.8761, + "num_input_tokens_seen": 176291840, + "step": 21520 + }, + { + "epoch": 2.7820131799974157, + "grad_norm": 0.910959780216217, + "learning_rate": 6.485463089857674e-07, + "loss": 0.5046, + "num_input_tokens_seen": 176373760, + "step": 21530 + }, + { + "epoch": 2.783305336606797, + "grad_norm": 1.2782275676727295, + "learning_rate": 6.409132692952874e-07, + "loss": 0.8931, + "num_input_tokens_seen": 176455680, + "step": 21540 + }, + { + "epoch": 2.784597493216178, + "grad_norm": 0.6053476333618164, + "learning_rate": 6.333248310034706e-07, + "loss": 0.9367, + "num_input_tokens_seen": 176537600, + "step": 21550 + }, + { + "epoch": 2.7858896498255588, + "grad_norm": 0.9305321574211121, + "learning_rate": 6.257810080047249e-07, + "loss": 1.0246, + "num_input_tokens_seen": 176619520, + "step": 21560 + }, + { + "epoch": 2.78718180643494, + "grad_norm": 1.467142105102539, + "learning_rate": 6.182818141117625e-07, + "loss": 0.7575, + "num_input_tokens_seen": 176701440, + "step": 21570 + }, + { + "epoch": 2.788473963044321, + "grad_norm": 0.3563095033168793, + "learning_rate": 6.1082726305558e-07, + "loss": 0.8363, + "num_input_tokens_seen": 176783360, + "step": 21580 + }, + { + "epoch": 2.7897661196537022, + "grad_norm": 0.7321064472198486, + "learning_rate": 6.034173684854316e-07, + "loss": 0.8788, + "num_input_tokens_seen": 176865280, + "step": 21590 + }, + { + "epoch": 2.791058276263083, + "grad_norm": 0.5982012152671814, + "learning_rate": 5.960521439688088e-07, + "loss": 0.7845, + "num_input_tokens_seen": 176947200, + "step": 21600 + }, + { + "epoch": 2.792350432872464, + "grad_norm": 0.965398371219635, + "learning_rate": 5.88731602991413e-07, + "loss": 0.8485, + "num_input_tokens_seen": 177029120, + "step": 21610 + }, + { + "epoch": 2.7936425894818453, + "grad_norm": 0.33368125557899475, + "learning_rate": 5.814557589571223e-07, + "loss": 0.9114, + "num_input_tokens_seen": 177111040, + "step": 21620 + }, + { + "epoch": 2.794934746091226, + "grad_norm": 1.154457688331604, + "learning_rate": 5.742246251879829e-07, + "loss": 0.6277, + "num_input_tokens_seen": 177192960, + "step": 21630 + }, + { + "epoch": 2.7962269027006075, + "grad_norm": 0.728986382484436, + "learning_rate": 5.67038214924176e-07, + "loss": 0.8553, + "num_input_tokens_seen": 177274880, + "step": 21640 + }, + { + "epoch": 2.7975190593099883, + "grad_norm": 0.7881826758384705, + "learning_rate": 5.598965413239926e-07, + "loss": 0.7787, + "num_input_tokens_seen": 177356800, + "step": 21650 + }, + { + "epoch": 2.798811215919369, + "grad_norm": 0.42167970538139343, + "learning_rate": 5.527996174638061e-07, + "loss": 0.5017, + "num_input_tokens_seen": 177438720, + "step": 21660 + }, + { + "epoch": 2.8001033725287505, + "grad_norm": 0.8099974393844604, + "learning_rate": 5.457474563380638e-07, + "loss": 1.0671, + "num_input_tokens_seen": 177520640, + "step": 21670 + }, + { + "epoch": 2.8013955291381314, + "grad_norm": 0.619404673576355, + "learning_rate": 5.387400708592422e-07, + "loss": 1.0223, + "num_input_tokens_seen": 177602560, + "step": 21680 + }, + { + "epoch": 2.8026876857475127, + "grad_norm": 1.066259741783142, + "learning_rate": 5.317774738578446e-07, + "loss": 0.7952, + "num_input_tokens_seen": 177684480, + "step": 21690 + }, + { + "epoch": 2.8039798423568936, + "grad_norm": 0.8455848097801208, + "learning_rate": 5.248596780823567e-07, + "loss": 0.5789, + "num_input_tokens_seen": 177766400, + "step": 21700 + }, + { + "epoch": 2.8052719989662744, + "grad_norm": 0.40524500608444214, + "learning_rate": 5.179866961992353e-07, + "loss": 0.9732, + "num_input_tokens_seen": 177848320, + "step": 21710 + }, + { + "epoch": 2.8065641555756558, + "grad_norm": 0.7112755179405212, + "learning_rate": 5.111585407928887e-07, + "loss": 0.6634, + "num_input_tokens_seen": 177930240, + "step": 21720 + }, + { + "epoch": 2.807856312185037, + "grad_norm": 0.7933624386787415, + "learning_rate": 5.043752243656414e-07, + "loss": 0.6454, + "num_input_tokens_seen": 178012160, + "step": 21730 + }, + { + "epoch": 2.809148468794418, + "grad_norm": 0.6080639362335205, + "learning_rate": 4.976367593377218e-07, + "loss": 0.8702, + "num_input_tokens_seen": 178094080, + "step": 21740 + }, + { + "epoch": 2.810440625403799, + "grad_norm": 0.7494640946388245, + "learning_rate": 4.909431580472385e-07, + "loss": 1.0812, + "num_input_tokens_seen": 178176000, + "step": 21750 + }, + { + "epoch": 2.81173278201318, + "grad_norm": 0.49116799235343933, + "learning_rate": 4.842944327501458e-07, + "loss": 0.9232, + "num_input_tokens_seen": 178257920, + "step": 21760 + }, + { + "epoch": 2.813024938622561, + "grad_norm": 0.8744192719459534, + "learning_rate": 4.776905956202393e-07, + "loss": 0.7583, + "num_input_tokens_seen": 178339840, + "step": 21770 + }, + { + "epoch": 2.8143170952319423, + "grad_norm": 0.6495558619499207, + "learning_rate": 4.711316587491188e-07, + "loss": 0.626, + "num_input_tokens_seen": 178421760, + "step": 21780 + }, + { + "epoch": 2.815609251841323, + "grad_norm": 0.5544187426567078, + "learning_rate": 4.646176341461722e-07, + "loss": 0.7818, + "num_input_tokens_seen": 178503680, + "step": 21790 + }, + { + "epoch": 2.816901408450704, + "grad_norm": 0.26983362436294556, + "learning_rate": 4.581485337385588e-07, + "loss": 0.7148, + "num_input_tokens_seen": 178585600, + "step": 21800 + }, + { + "epoch": 2.8181935650600853, + "grad_norm": 0.6384357213973999, + "learning_rate": 4.5172436937117036e-07, + "loss": 0.8498, + "num_input_tokens_seen": 178667520, + "step": 21810 + }, + { + "epoch": 2.819485721669466, + "grad_norm": 1.1162294149398804, + "learning_rate": 4.4534515280663937e-07, + "loss": 1.0099, + "num_input_tokens_seen": 178749440, + "step": 21820 + }, + { + "epoch": 2.8207778782788475, + "grad_norm": 1.0189695358276367, + "learning_rate": 4.390108957252781e-07, + "loss": 0.878, + "num_input_tokens_seen": 178831360, + "step": 21830 + }, + { + "epoch": 2.8220700348882284, + "grad_norm": 0.7732197046279907, + "learning_rate": 4.3272160972509524e-07, + "loss": 0.7988, + "num_input_tokens_seen": 178913280, + "step": 21840 + }, + { + "epoch": 2.8233621914976093, + "grad_norm": 0.9585608839988708, + "learning_rate": 4.264773063217431e-07, + "loss": 1.0454, + "num_input_tokens_seen": 178995200, + "step": 21850 + }, + { + "epoch": 2.8246543481069906, + "grad_norm": 0.694817841053009, + "learning_rate": 4.20277996948526e-07, + "loss": 0.9275, + "num_input_tokens_seen": 179077120, + "step": 21860 + }, + { + "epoch": 2.825946504716372, + "grad_norm": 0.8328717350959778, + "learning_rate": 4.1412369295635023e-07, + "loss": 0.95, + "num_input_tokens_seen": 179159040, + "step": 21870 + }, + { + "epoch": 2.8272386613257527, + "grad_norm": 0.9796121716499329, + "learning_rate": 4.0801440561372694e-07, + "loss": 0.8007, + "num_input_tokens_seen": 179240960, + "step": 21880 + }, + { + "epoch": 2.8285308179351336, + "grad_norm": 1.0329798460006714, + "learning_rate": 4.0195014610674153e-07, + "loss": 0.8786, + "num_input_tokens_seen": 179322880, + "step": 21890 + }, + { + "epoch": 2.829822974544515, + "grad_norm": 0.4001299738883972, + "learning_rate": 3.9593092553902587e-07, + "loss": 0.8457, + "num_input_tokens_seen": 179404800, + "step": 21900 + }, + { + "epoch": 2.831115131153896, + "grad_norm": 0.5981205701828003, + "learning_rate": 3.899567549317529e-07, + "loss": 0.7406, + "num_input_tokens_seen": 179486720, + "step": 21910 + }, + { + "epoch": 2.832407287763277, + "grad_norm": 0.5832979679107666, + "learning_rate": 3.840276452236058e-07, + "loss": 0.7223, + "num_input_tokens_seen": 179568640, + "step": 21920 + }, + { + "epoch": 2.833699444372658, + "grad_norm": 0.6982936263084412, + "learning_rate": 3.7814360727076724e-07, + "loss": 0.8734, + "num_input_tokens_seen": 179650560, + "step": 21930 + }, + { + "epoch": 2.834991600982039, + "grad_norm": 0.81357342004776, + "learning_rate": 3.723046518468859e-07, + "loss": 0.5382, + "num_input_tokens_seen": 179732480, + "step": 21940 + }, + { + "epoch": 2.83628375759142, + "grad_norm": 0.6205422282218933, + "learning_rate": 3.6651078964306807e-07, + "loss": 1.0019, + "num_input_tokens_seen": 179814400, + "step": 21950 + }, + { + "epoch": 2.837575914200801, + "grad_norm": 0.670599102973938, + "learning_rate": 3.607620312678528e-07, + "loss": 0.7448, + "num_input_tokens_seen": 179896320, + "step": 21960 + }, + { + "epoch": 2.8388680708101823, + "grad_norm": 1.0795679092407227, + "learning_rate": 3.550583872471952e-07, + "loss": 0.553, + "num_input_tokens_seen": 179978240, + "step": 21970 + }, + { + "epoch": 2.840160227419563, + "grad_norm": 0.7728472352027893, + "learning_rate": 3.4939986802445256e-07, + "loss": 0.9766, + "num_input_tokens_seen": 180060160, + "step": 21980 + }, + { + "epoch": 2.841452384028944, + "grad_norm": 0.24065038561820984, + "learning_rate": 3.437864839603455e-07, + "loss": 0.3731, + "num_input_tokens_seen": 180142080, + "step": 21990 + }, + { + "epoch": 2.8427445406383254, + "grad_norm": 1.1644612550735474, + "learning_rate": 3.3821824533296633e-07, + "loss": 0.5934, + "num_input_tokens_seen": 180224000, + "step": 22000 + }, + { + "epoch": 2.8440366972477067, + "grad_norm": 0.8106747269630432, + "learning_rate": 3.3269516233773446e-07, + "loss": 0.7189, + "num_input_tokens_seen": 180305920, + "step": 22010 + }, + { + "epoch": 2.8453288538570876, + "grad_norm": 0.24490030109882355, + "learning_rate": 3.272172450873967e-07, + "loss": 0.6045, + "num_input_tokens_seen": 180387840, + "step": 22020 + }, + { + "epoch": 2.8466210104664684, + "grad_norm": 1.151792287826538, + "learning_rate": 3.217845036119993e-07, + "loss": 0.4563, + "num_input_tokens_seen": 180469760, + "step": 22030 + }, + { + "epoch": 2.8479131670758497, + "grad_norm": 0.4980817139148712, + "learning_rate": 3.163969478588713e-07, + "loss": 0.7337, + "num_input_tokens_seen": 180551680, + "step": 22040 + }, + { + "epoch": 2.8492053236852306, + "grad_norm": 0.7074184417724609, + "learning_rate": 3.11054587692608e-07, + "loss": 0.8102, + "num_input_tokens_seen": 180633600, + "step": 22050 + }, + { + "epoch": 2.850497480294612, + "grad_norm": 0.7132773995399475, + "learning_rate": 3.057574328950541e-07, + "loss": 0.841, + "num_input_tokens_seen": 180715520, + "step": 22060 + }, + { + "epoch": 2.851789636903993, + "grad_norm": 1.5704045295715332, + "learning_rate": 3.005054931652762e-07, + "loss": 0.5297, + "num_input_tokens_seen": 180797440, + "step": 22070 + }, + { + "epoch": 2.8530817935133737, + "grad_norm": 1.3629149198532104, + "learning_rate": 2.952987781195599e-07, + "loss": 0.7995, + "num_input_tokens_seen": 180879360, + "step": 22080 + }, + { + "epoch": 2.854373950122755, + "grad_norm": 0.6526079773902893, + "learning_rate": 2.901372972913791e-07, + "loss": 0.8583, + "num_input_tokens_seen": 180961280, + "step": 22090 + }, + { + "epoch": 2.855666106732136, + "grad_norm": 0.8587889075279236, + "learning_rate": 2.8502106013138516e-07, + "loss": 0.8178, + "num_input_tokens_seen": 181043200, + "step": 22100 + }, + { + "epoch": 2.856958263341517, + "grad_norm": 0.6336101293563843, + "learning_rate": 2.799500760073931e-07, + "loss": 0.7179, + "num_input_tokens_seen": 181125120, + "step": 22110 + }, + { + "epoch": 2.858250419950898, + "grad_norm": 0.9315862059593201, + "learning_rate": 2.749243542043561e-07, + "loss": 0.4239, + "num_input_tokens_seen": 181207040, + "step": 22120 + }, + { + "epoch": 2.859542576560279, + "grad_norm": 0.7689865827560425, + "learning_rate": 2.699439039243523e-07, + "loss": 0.88, + "num_input_tokens_seen": 181288960, + "step": 22130 + }, + { + "epoch": 2.86083473316966, + "grad_norm": 0.37839198112487793, + "learning_rate": 2.6500873428656483e-07, + "loss": 0.4962, + "num_input_tokens_seen": 181370880, + "step": 22140 + }, + { + "epoch": 2.862126889779041, + "grad_norm": 0.6896770596504211, + "learning_rate": 2.601188543272737e-07, + "loss": 0.7783, + "num_input_tokens_seen": 181452800, + "step": 22150 + }, + { + "epoch": 2.8634190463884224, + "grad_norm": 0.639472484588623, + "learning_rate": 2.552742729998309e-07, + "loss": 1.1341, + "num_input_tokens_seen": 181534720, + "step": 22160 + }, + { + "epoch": 2.8647112029978032, + "grad_norm": 0.6958195567131042, + "learning_rate": 2.5047499917464636e-07, + "loss": 0.6448, + "num_input_tokens_seen": 181616640, + "step": 22170 + }, + { + "epoch": 2.8660033596071846, + "grad_norm": 0.9285911321640015, + "learning_rate": 2.457210416391742e-07, + "loss": 0.7003, + "num_input_tokens_seen": 181698560, + "step": 22180 + }, + { + "epoch": 2.8672955162165654, + "grad_norm": 1.0297704935073853, + "learning_rate": 2.4101240909789325e-07, + "loss": 0.9887, + "num_input_tokens_seen": 181780480, + "step": 22190 + }, + { + "epoch": 2.8685876728259467, + "grad_norm": 0.7845831513404846, + "learning_rate": 2.3634911017229034e-07, + "loss": 0.6259, + "num_input_tokens_seen": 181862400, + "step": 22200 + }, + { + "epoch": 2.8698798294353276, + "grad_norm": 0.7428514361381531, + "learning_rate": 2.3173115340085204e-07, + "loss": 0.9638, + "num_input_tokens_seen": 181944320, + "step": 22210 + }, + { + "epoch": 2.8711719860447085, + "grad_norm": 0.7519323825836182, + "learning_rate": 2.2715854723903974e-07, + "loss": 0.9015, + "num_input_tokens_seen": 182026240, + "step": 22220 + }, + { + "epoch": 2.87246414265409, + "grad_norm": 1.0879383087158203, + "learning_rate": 2.2263130005927558e-07, + "loss": 0.7179, + "num_input_tokens_seen": 182108160, + "step": 22230 + }, + { + "epoch": 2.8737562992634706, + "grad_norm": 0.7938571572303772, + "learning_rate": 2.181494201509343e-07, + "loss": 0.936, + "num_input_tokens_seen": 182190080, + "step": 22240 + }, + { + "epoch": 2.875048455872852, + "grad_norm": 0.4322587549686432, + "learning_rate": 2.1371291572032382e-07, + "loss": 0.5952, + "num_input_tokens_seen": 182272000, + "step": 22250 + }, + { + "epoch": 2.876340612482233, + "grad_norm": 0.47111037373542786, + "learning_rate": 2.0932179489066006e-07, + "loss": 0.7432, + "num_input_tokens_seen": 182353920, + "step": 22260 + }, + { + "epoch": 2.8776327690916137, + "grad_norm": 0.7983747720718384, + "learning_rate": 2.0497606570207829e-07, + "loss": 0.8684, + "num_input_tokens_seen": 182435840, + "step": 22270 + }, + { + "epoch": 2.878924925700995, + "grad_norm": 0.3726508915424347, + "learning_rate": 2.0067573611158853e-07, + "loss": 0.6558, + "num_input_tokens_seen": 182517760, + "step": 22280 + }, + { + "epoch": 2.880217082310376, + "grad_norm": 1.232807993888855, + "learning_rate": 1.9642081399307844e-07, + "loss": 0.562, + "num_input_tokens_seen": 182599680, + "step": 22290 + }, + { + "epoch": 2.881509238919757, + "grad_norm": 1.0021343231201172, + "learning_rate": 1.9221130713729663e-07, + "loss": 0.7792, + "num_input_tokens_seen": 182681600, + "step": 22300 + }, + { + "epoch": 2.882801395529138, + "grad_norm": 0.8979313969612122, + "learning_rate": 1.8804722325183044e-07, + "loss": 0.7825, + "num_input_tokens_seen": 182763520, + "step": 22310 + }, + { + "epoch": 2.884093552138519, + "grad_norm": 0.4306122660636902, + "learning_rate": 1.8392856996110875e-07, + "loss": 0.6558, + "num_input_tokens_seen": 182845440, + "step": 22320 + }, + { + "epoch": 2.8853857087479002, + "grad_norm": 0.7078767418861389, + "learning_rate": 1.7985535480636584e-07, + "loss": 0.6038, + "num_input_tokens_seen": 182927360, + "step": 22330 + }, + { + "epoch": 2.8866778653572815, + "grad_norm": 0.7182630300521851, + "learning_rate": 1.7582758524564425e-07, + "loss": 0.787, + "num_input_tokens_seen": 183009280, + "step": 22340 + }, + { + "epoch": 2.8879700219666624, + "grad_norm": 0.6843809485435486, + "learning_rate": 1.7184526865377805e-07, + "loss": 0.56, + "num_input_tokens_seen": 183091200, + "step": 22350 + }, + { + "epoch": 2.8892621785760433, + "grad_norm": 0.5071055293083191, + "learning_rate": 1.6790841232237064e-07, + "loss": 0.494, + "num_input_tokens_seen": 183173120, + "step": 22360 + }, + { + "epoch": 2.8905543351854246, + "grad_norm": 0.43069300055503845, + "learning_rate": 1.6401702345979485e-07, + "loss": 1.0883, + "num_input_tokens_seen": 183255040, + "step": 22370 + }, + { + "epoch": 2.8918464917948055, + "grad_norm": 0.9334409832954407, + "learning_rate": 1.6017110919116786e-07, + "loss": 0.7701, + "num_input_tokens_seen": 183336960, + "step": 22380 + }, + { + "epoch": 2.8931386484041868, + "grad_norm": 0.9663013815879822, + "learning_rate": 1.5637067655834282e-07, + "loss": 0.7901, + "num_input_tokens_seen": 183418880, + "step": 22390 + }, + { + "epoch": 2.8944308050135676, + "grad_norm": 0.45232853293418884, + "learning_rate": 1.526157325199007e-07, + "loss": 0.8333, + "num_input_tokens_seen": 183500800, + "step": 22400 + }, + { + "epoch": 2.8957229616229485, + "grad_norm": 1.2626268863677979, + "learning_rate": 1.4890628395113072e-07, + "loss": 0.3667, + "num_input_tokens_seen": 183582720, + "step": 22410 + }, + { + "epoch": 2.89701511823233, + "grad_norm": 1.018869400024414, + "learning_rate": 1.452423376440193e-07, + "loss": 1.0644, + "num_input_tokens_seen": 183664640, + "step": 22420 + }, + { + "epoch": 2.8983072748417107, + "grad_norm": 0.7966929078102112, + "learning_rate": 1.4162390030723617e-07, + "loss": 0.6229, + "num_input_tokens_seen": 183746560, + "step": 22430 + }, + { + "epoch": 2.899599431451092, + "grad_norm": 1.1223193407058716, + "learning_rate": 1.380509785661288e-07, + "loss": 0.8879, + "num_input_tokens_seen": 183828480, + "step": 22440 + }, + { + "epoch": 2.900891588060473, + "grad_norm": 0.751388430595398, + "learning_rate": 1.3452357896270308e-07, + "loss": 0.7747, + "num_input_tokens_seen": 183910400, + "step": 22450 + }, + { + "epoch": 2.9021837446698537, + "grad_norm": 0.7575998902320862, + "learning_rate": 1.3104170795561477e-07, + "loss": 0.7773, + "num_input_tokens_seen": 183992320, + "step": 22460 + }, + { + "epoch": 2.903475901279235, + "grad_norm": 0.6274828314781189, + "learning_rate": 1.2760537192015866e-07, + "loss": 0.7762, + "num_input_tokens_seen": 184074240, + "step": 22470 + }, + { + "epoch": 2.9047680578886164, + "grad_norm": 0.9277952909469604, + "learning_rate": 1.242145771482489e-07, + "loss": 0.7858, + "num_input_tokens_seen": 184156160, + "step": 22480 + }, + { + "epoch": 2.9060602144979972, + "grad_norm": 1.2103768587112427, + "learning_rate": 1.2086932984842758e-07, + "loss": 0.9757, + "num_input_tokens_seen": 184238080, + "step": 22490 + }, + { + "epoch": 2.907352371107378, + "grad_norm": 0.7629841566085815, + "learning_rate": 1.1756963614582006e-07, + "loss": 0.8254, + "num_input_tokens_seen": 184320000, + "step": 22500 + }, + { + "epoch": 2.9086445277167594, + "grad_norm": 0.32851213216781616, + "learning_rate": 1.1431550208215736e-07, + "loss": 0.2809, + "num_input_tokens_seen": 184401920, + "step": 22510 + }, + { + "epoch": 2.9099366843261403, + "grad_norm": 0.37616801261901855, + "learning_rate": 1.1110693361574831e-07, + "loss": 0.779, + "num_input_tokens_seen": 184483840, + "step": 22520 + }, + { + "epoch": 2.9112288409355216, + "grad_norm": 1.0593737363815308, + "learning_rate": 1.0794393662147129e-07, + "loss": 0.9263, + "num_input_tokens_seen": 184565760, + "step": 22530 + }, + { + "epoch": 2.9125209975449025, + "grad_norm": 0.390419602394104, + "learning_rate": 1.0482651689075751e-07, + "loss": 0.6628, + "num_input_tokens_seen": 184647680, + "step": 22540 + }, + { + "epoch": 2.9138131541542833, + "grad_norm": 0.5607829689979553, + "learning_rate": 1.0175468013159384e-07, + "loss": 0.7321, + "num_input_tokens_seen": 184729600, + "step": 22550 + }, + { + "epoch": 2.9151053107636646, + "grad_norm": 0.9104856252670288, + "learning_rate": 9.872843196850057e-08, + "loss": 0.8234, + "num_input_tokens_seen": 184811520, + "step": 22560 + }, + { + "epoch": 2.9163974673730455, + "grad_norm": 1.4063791036605835, + "learning_rate": 9.574777794253143e-08, + "loss": 0.5345, + "num_input_tokens_seen": 184893440, + "step": 22570 + }, + { + "epoch": 2.917689623982427, + "grad_norm": 2.190293073654175, + "learning_rate": 9.281272351124859e-08, + "loss": 0.9455, + "num_input_tokens_seen": 184975360, + "step": 22580 + }, + { + "epoch": 2.9189817805918077, + "grad_norm": 0.6696395874023438, + "learning_rate": 8.992327404872825e-08, + "loss": 0.3409, + "num_input_tokens_seen": 185057280, + "step": 22590 + }, + { + "epoch": 2.9202739372011886, + "grad_norm": 1.176025629043579, + "learning_rate": 8.707943484553838e-08, + "loss": 0.5035, + "num_input_tokens_seen": 185139200, + "step": 22600 + }, + { + "epoch": 2.92156609381057, + "grad_norm": 0.6432631015777588, + "learning_rate": 8.428121110874154e-08, + "loss": 0.7041, + "num_input_tokens_seen": 185221120, + "step": 22610 + }, + { + "epoch": 2.9228582504199507, + "grad_norm": 0.8045111298561096, + "learning_rate": 8.152860796187545e-08, + "loss": 0.8394, + "num_input_tokens_seen": 185303040, + "step": 22620 + }, + { + "epoch": 2.924150407029332, + "grad_norm": 0.2948504686355591, + "learning_rate": 7.882163044494462e-08, + "loss": 0.3898, + "num_input_tokens_seen": 185384960, + "step": 22630 + }, + { + "epoch": 2.925442563638713, + "grad_norm": 1.1298701763153076, + "learning_rate": 7.616028351441484e-08, + "loss": 0.7981, + "num_input_tokens_seen": 185466880, + "step": 22640 + }, + { + "epoch": 2.9267347202480942, + "grad_norm": 0.6935545802116394, + "learning_rate": 7.354457204320486e-08, + "loss": 0.8605, + "num_input_tokens_seen": 185548800, + "step": 22650 + }, + { + "epoch": 2.928026876857475, + "grad_norm": 0.624178409576416, + "learning_rate": 7.097450082066969e-08, + "loss": 0.687, + "num_input_tokens_seen": 185630720, + "step": 22660 + }, + { + "epoch": 2.9293190334668564, + "grad_norm": 0.7468099594116211, + "learning_rate": 6.845007455260343e-08, + "loss": 0.6511, + "num_input_tokens_seen": 185712640, + "step": 22670 + }, + { + "epoch": 2.9306111900762373, + "grad_norm": 0.20860843360424042, + "learning_rate": 6.59712978612198e-08, + "loss": 0.4383, + "num_input_tokens_seen": 185794560, + "step": 22680 + }, + { + "epoch": 2.931903346685618, + "grad_norm": 0.3760750889778137, + "learning_rate": 6.353817528514938e-08, + "loss": 0.4383, + "num_input_tokens_seen": 185876480, + "step": 22690 + }, + { + "epoch": 2.9331955032949995, + "grad_norm": 0.42980486154556274, + "learning_rate": 6.11507112794285e-08, + "loss": 0.6782, + "num_input_tokens_seen": 185958400, + "step": 22700 + }, + { + "epoch": 2.9344876599043803, + "grad_norm": 1.127774715423584, + "learning_rate": 5.880891021549928e-08, + "loss": 0.676, + "num_input_tokens_seen": 186040320, + "step": 22710 + }, + { + "epoch": 2.9357798165137616, + "grad_norm": 0.8025173544883728, + "learning_rate": 5.6512776381192903e-08, + "loss": 0.7285, + "num_input_tokens_seen": 186122240, + "step": 22720 + }, + { + "epoch": 2.9370719731231425, + "grad_norm": 0.6310427784919739, + "learning_rate": 5.426231398071302e-08, + "loss": 0.6244, + "num_input_tokens_seen": 186204160, + "step": 22730 + }, + { + "epoch": 2.9383641297325234, + "grad_norm": 0.8183490037918091, + "learning_rate": 5.205752713465794e-08, + "loss": 0.8171, + "num_input_tokens_seen": 186286080, + "step": 22740 + }, + { + "epoch": 2.9396562863419047, + "grad_norm": 1.129209041595459, + "learning_rate": 4.989841987997901e-08, + "loss": 0.9641, + "num_input_tokens_seen": 186368000, + "step": 22750 + }, + { + "epoch": 2.9409484429512855, + "grad_norm": 0.7792519927024841, + "learning_rate": 4.778499616999166e-08, + "loss": 0.7594, + "num_input_tokens_seen": 186449920, + "step": 22760 + }, + { + "epoch": 2.942240599560667, + "grad_norm": 0.7968661189079285, + "learning_rate": 4.57172598743727e-08, + "loss": 0.9113, + "num_input_tokens_seen": 186531840, + "step": 22770 + }, + { + "epoch": 2.9435327561700477, + "grad_norm": 0.6901931166648865, + "learning_rate": 4.369521477913529e-08, + "loss": 0.8022, + "num_input_tokens_seen": 186613760, + "step": 22780 + }, + { + "epoch": 2.9448249127794286, + "grad_norm": 0.7363210916519165, + "learning_rate": 4.171886458664009e-08, + "loss": 0.7282, + "num_input_tokens_seen": 186695680, + "step": 22790 + }, + { + "epoch": 2.94611706938881, + "grad_norm": 0.35474592447280884, + "learning_rate": 3.9788212915573e-08, + "loss": 0.6295, + "num_input_tokens_seen": 186777600, + "step": 22800 + }, + { + "epoch": 2.947409225998191, + "grad_norm": 0.9408590197563171, + "learning_rate": 3.7903263300956285e-08, + "loss": 0.6325, + "num_input_tokens_seen": 186859520, + "step": 22810 + }, + { + "epoch": 2.948701382607572, + "grad_norm": 0.9621270895004272, + "learning_rate": 3.606401919411806e-08, + "loss": 0.8424, + "num_input_tokens_seen": 186941440, + "step": 22820 + }, + { + "epoch": 2.949993539216953, + "grad_norm": 0.752377450466156, + "learning_rate": 3.427048396271171e-08, + "loss": 1.1133, + "num_input_tokens_seen": 187023360, + "step": 22830 + }, + { + "epoch": 2.9512856958263343, + "grad_norm": 0.8131755590438843, + "learning_rate": 3.252266089069367e-08, + "loss": 0.6982, + "num_input_tokens_seen": 187105280, + "step": 22840 + }, + { + "epoch": 2.952577852435715, + "grad_norm": 0.7799323797225952, + "learning_rate": 3.0820553178320667e-08, + "loss": 0.9222, + "num_input_tokens_seen": 187187200, + "step": 22850 + }, + { + "epoch": 2.9538700090450964, + "grad_norm": 0.4065955877304077, + "learning_rate": 2.9164163942146937e-08, + "loss": 0.5809, + "num_input_tokens_seen": 187269120, + "step": 22860 + }, + { + "epoch": 2.9551621656544773, + "grad_norm": 0.6465580463409424, + "learning_rate": 2.7553496215015907e-08, + "loss": 0.7016, + "num_input_tokens_seen": 187351040, + "step": 22870 + }, + { + "epoch": 2.956454322263858, + "grad_norm": 0.608161985874176, + "learning_rate": 2.5988552946051848e-08, + "loss": 0.5702, + "num_input_tokens_seen": 187432960, + "step": 22880 + }, + { + "epoch": 2.9577464788732395, + "grad_norm": 0.7066859602928162, + "learning_rate": 2.44693370006599e-08, + "loss": 0.6657, + "num_input_tokens_seen": 187514880, + "step": 22890 + }, + { + "epoch": 2.9590386354826204, + "grad_norm": 0.31225040555000305, + "learning_rate": 2.2995851160520498e-08, + "loss": 0.6522, + "num_input_tokens_seen": 187596800, + "step": 22900 + }, + { + "epoch": 2.9603307920920017, + "grad_norm": 0.7561403512954712, + "learning_rate": 2.156809812358107e-08, + "loss": 0.8856, + "num_input_tokens_seen": 187678720, + "step": 22910 + }, + { + "epoch": 2.9616229487013825, + "grad_norm": 1.5660463571548462, + "learning_rate": 2.0186080504050466e-08, + "loss": 0.718, + "num_input_tokens_seen": 187760640, + "step": 22920 + }, + { + "epoch": 2.9629151053107634, + "grad_norm": 0.6079100370407104, + "learning_rate": 1.8849800832401733e-08, + "loss": 0.7686, + "num_input_tokens_seen": 187842560, + "step": 22930 + }, + { + "epoch": 2.9642072619201447, + "grad_norm": 0.6655999422073364, + "learning_rate": 1.75592615553527e-08, + "loss": 0.8922, + "num_input_tokens_seen": 187924480, + "step": 22940 + }, + { + "epoch": 2.965499418529526, + "grad_norm": 0.7710443139076233, + "learning_rate": 1.6314465035879855e-08, + "loss": 0.5413, + "num_input_tokens_seen": 188006400, + "step": 22950 + }, + { + "epoch": 2.966791575138907, + "grad_norm": 1.0660786628723145, + "learning_rate": 1.5115413553201674e-08, + "loss": 0.4049, + "num_input_tokens_seen": 188088320, + "step": 22960 + }, + { + "epoch": 2.9680837317482878, + "grad_norm": 0.6799715161323547, + "learning_rate": 1.3962109302773085e-08, + "loss": 1.021, + "num_input_tokens_seen": 188170240, + "step": 22970 + }, + { + "epoch": 2.969375888357669, + "grad_norm": 1.0657424926757812, + "learning_rate": 1.2854554396291018e-08, + "loss": 0.4626, + "num_input_tokens_seen": 188252160, + "step": 22980 + }, + { + "epoch": 2.97066804496705, + "grad_norm": 1.7538788318634033, + "learning_rate": 1.1792750861686074e-08, + "loss": 0.9079, + "num_input_tokens_seen": 188334080, + "step": 22990 + }, + { + "epoch": 2.9719602015764313, + "grad_norm": 0.40748029947280884, + "learning_rate": 1.0776700643116976e-08, + "loss": 0.8018, + "num_input_tokens_seen": 188416000, + "step": 23000 + }, + { + "epoch": 2.973252358185812, + "grad_norm": 0.6334804892539978, + "learning_rate": 9.806405600967794e-09, + "loss": 0.6943, + "num_input_tokens_seen": 188497920, + "step": 23010 + }, + { + "epoch": 2.974544514795193, + "grad_norm": 0.8970416188240051, + "learning_rate": 8.881867511845166e-09, + "loss": 1.0121, + "num_input_tokens_seen": 188579840, + "step": 23020 + }, + { + "epoch": 2.9758366714045743, + "grad_norm": 0.32017982006073, + "learning_rate": 8.00308806857275e-09, + "loss": 1.0772, + "num_input_tokens_seen": 188661760, + "step": 23030 + }, + { + "epoch": 2.977128828013955, + "grad_norm": 0.7373923659324646, + "learning_rate": 7.1700688801940034e-09, + "loss": 0.7207, + "num_input_tokens_seen": 188743680, + "step": 23040 + }, + { + "epoch": 2.9784209846233365, + "grad_norm": 0.6554174423217773, + "learning_rate": 6.382811471963846e-09, + "loss": 0.589, + "num_input_tokens_seen": 188825600, + "step": 23050 + }, + { + "epoch": 2.9797131412327174, + "grad_norm": 0.23197783529758453, + "learning_rate": 5.6413172853486685e-09, + "loss": 0.5539, + "num_input_tokens_seen": 188907520, + "step": 23060 + }, + { + "epoch": 2.9810052978420982, + "grad_norm": 0.8123525977134705, + "learning_rate": 4.94558767802078e-09, + "loss": 1.0715, + "num_input_tokens_seen": 188989440, + "step": 23070 + }, + { + "epoch": 2.9822974544514795, + "grad_norm": 0.8512336015701294, + "learning_rate": 4.295623923858405e-09, + "loss": 0.9075, + "num_input_tokens_seen": 189071360, + "step": 23080 + }, + { + "epoch": 2.983589611060861, + "grad_norm": 0.5837238430976868, + "learning_rate": 3.6914272129429106e-09, + "loss": 0.9787, + "num_input_tokens_seen": 189153280, + "step": 23090 + }, + { + "epoch": 2.9848817676702417, + "grad_norm": 0.7648143172264099, + "learning_rate": 3.1329986515560295e-09, + "loss": 0.397, + "num_input_tokens_seen": 189235200, + "step": 23100 + }, + { + "epoch": 2.9861739242796226, + "grad_norm": 0.7528153657913208, + "learning_rate": 2.6203392621798605e-09, + "loss": 0.8219, + "num_input_tokens_seen": 189317120, + "step": 23110 + }, + { + "epoch": 2.987466080889004, + "grad_norm": 0.9112277626991272, + "learning_rate": 2.153449983491318e-09, + "loss": 0.93, + "num_input_tokens_seen": 189399040, + "step": 23120 + }, + { + "epoch": 2.9887582374983848, + "grad_norm": 0.6468605995178223, + "learning_rate": 1.7323316703621305e-09, + "loss": 1.1683, + "num_input_tokens_seen": 189480960, + "step": 23130 + }, + { + "epoch": 2.990050394107766, + "grad_norm": 0.8173196315765381, + "learning_rate": 1.356985093856067e-09, + "loss": 0.5163, + "num_input_tokens_seen": 189562880, + "step": 23140 + }, + { + "epoch": 2.991342550717147, + "grad_norm": 0.9355853796005249, + "learning_rate": 1.0274109412372613e-09, + "loss": 1.0414, + "num_input_tokens_seen": 189644800, + "step": 23150 + }, + { + "epoch": 2.992634707326528, + "grad_norm": 0.6136227250099182, + "learning_rate": 7.436098159480099e-10, + "loss": 0.743, + "num_input_tokens_seen": 189726720, + "step": 23160 + }, + { + "epoch": 2.993926863935909, + "grad_norm": 0.31119829416275024, + "learning_rate": 5.055822376337505e-10, + "loss": 0.941, + "num_input_tokens_seen": 189808640, + "step": 23170 + }, + { + "epoch": 2.99521902054529, + "grad_norm": 0.6250201463699341, + "learning_rate": 3.1332864211808254e-10, + "loss": 0.6753, + "num_input_tokens_seen": 189890560, + "step": 23180 + }, + { + "epoch": 2.9965111771546713, + "grad_norm": 0.5410284996032715, + "learning_rate": 1.6684938141664498e-10, + "loss": 0.8739, + "num_input_tokens_seen": 189972480, + "step": 23190 + }, + { + "epoch": 2.997803333764052, + "grad_norm": 0.7503390908241272, + "learning_rate": 6.614472373434044e-11, + "loss": 0.851, + "num_input_tokens_seen": 190054400, + "step": 23200 + }, + { + "epoch": 2.999095490373433, + "grad_norm": 0.6340755224227905, + "learning_rate": 1.1214853459784457e-11, + "loss": 0.7338, + "num_input_tokens_seen": 190136320, + "step": 23210 + } + ], + "logging_steps": 10, + "max_steps": 23217, + "num_input_tokens_seen": 190193664, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.660062333952852e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}