{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 4390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002277904328018223, "grad_norm": 386.0, "learning_rate": 4.5558086560364467e-07, "loss": 47.0249, "step": 1 }, { "epoch": 0.011389521640091117, "grad_norm": 430.0, "learning_rate": 2.2779043280182233e-06, "loss": 48.5743, "step": 5 }, { "epoch": 0.022779043280182234, "grad_norm": 324.0, "learning_rate": 4.555808656036447e-06, "loss": 47.0262, "step": 10 }, { "epoch": 0.03416856492027335, "grad_norm": 178.0, "learning_rate": 6.83371298405467e-06, "loss": 40.673, "step": 15 }, { "epoch": 0.04555808656036447, "grad_norm": 125.5, "learning_rate": 9.111617312072893e-06, "loss": 35.8229, "step": 20 }, { "epoch": 0.05694760820045558, "grad_norm": 76.5, "learning_rate": 1.1389521640091117e-05, "loss": 29.5881, "step": 25 }, { "epoch": 0.0683371298405467, "grad_norm": 24.75, "learning_rate": 1.366742596810934e-05, "loss": 26.1961, "step": 30 }, { "epoch": 0.07972665148063782, "grad_norm": 20.25, "learning_rate": 1.5945330296127563e-05, "loss": 24.1324, "step": 35 }, { "epoch": 0.09111617312072894, "grad_norm": 14.75, "learning_rate": 1.8223234624145787e-05, "loss": 22.6848, "step": 40 }, { "epoch": 0.10250569476082004, "grad_norm": 9.375, "learning_rate": 2.050113895216401e-05, "loss": 21.496, "step": 45 }, { "epoch": 0.11389521640091116, "grad_norm": 6.09375, "learning_rate": 2.2779043280182233e-05, "loss": 20.4174, "step": 50 }, { "epoch": 0.1252847380410023, "grad_norm": 4.6875, "learning_rate": 2.505694760820046e-05, "loss": 19.6892, "step": 55 }, { "epoch": 0.1366742596810934, "grad_norm": 4.25, "learning_rate": 2.733485193621868e-05, "loss": 18.9022, "step": 60 }, { "epoch": 0.1480637813211845, "grad_norm": 4.71875, "learning_rate": 2.96127562642369e-05, "loss": 18.4344, "step": 65 }, { "epoch": 0.15945330296127563, "grad_norm": 6.53125, "learning_rate": 3.189066059225513e-05, "loss": 18.1733, "step": 70 }, { "epoch": 0.17084282460136674, "grad_norm": 7.34375, "learning_rate": 3.416856492027335e-05, "loss": 17.6679, "step": 75 }, { "epoch": 0.18223234624145787, "grad_norm": 9.875, "learning_rate": 3.6446469248291574e-05, "loss": 16.8998, "step": 80 }, { "epoch": 0.19362186788154898, "grad_norm": 14.125, "learning_rate": 3.87243735763098e-05, "loss": 15.9599, "step": 85 }, { "epoch": 0.20501138952164008, "grad_norm": 23.625, "learning_rate": 4.100227790432802e-05, "loss": 14.2835, "step": 90 }, { "epoch": 0.2164009111617312, "grad_norm": 29.875, "learning_rate": 4.3280182232346244e-05, "loss": 11.33, "step": 95 }, { "epoch": 0.22779043280182232, "grad_norm": 29.0, "learning_rate": 4.555808656036447e-05, "loss": 7.3568, "step": 100 }, { "epoch": 0.23917995444191345, "grad_norm": 15.375, "learning_rate": 4.783599088838269e-05, "loss": 3.988, "step": 105 }, { "epoch": 0.2505694760820046, "grad_norm": 3.171875, "learning_rate": 5.011389521640092e-05, "loss": 2.5546, "step": 110 }, { "epoch": 0.2619589977220957, "grad_norm": 2.265625, "learning_rate": 5.239179954441914e-05, "loss": 2.0898, "step": 115 }, { "epoch": 0.2733485193621868, "grad_norm": 1.5390625, "learning_rate": 5.466970387243736e-05, "loss": 1.8634, "step": 120 }, { "epoch": 0.2847380410022779, "grad_norm": 1.375, "learning_rate": 5.6947608200455584e-05, "loss": 1.7283, "step": 125 }, { "epoch": 0.296127562642369, "grad_norm": 1.0625, "learning_rate": 5.92255125284738e-05, "loss": 1.652, "step": 130 }, { "epoch": 0.30751708428246016, "grad_norm": 0.609375, "learning_rate": 6.150341685649203e-05, "loss": 1.5754, "step": 135 }, { "epoch": 0.31890660592255127, "grad_norm": 0.796875, "learning_rate": 6.378132118451025e-05, "loss": 1.5234, "step": 140 }, { "epoch": 0.33029612756264237, "grad_norm": 1.546875, "learning_rate": 6.605922551252848e-05, "loss": 1.461, "step": 145 }, { "epoch": 0.3416856492027335, "grad_norm": 0.96484375, "learning_rate": 6.83371298405467e-05, "loss": 1.4367, "step": 150 }, { "epoch": 0.3530751708428246, "grad_norm": 1.5078125, "learning_rate": 7.061503416856492e-05, "loss": 1.4151, "step": 155 }, { "epoch": 0.36446469248291574, "grad_norm": 2.4375, "learning_rate": 7.289293849658315e-05, "loss": 1.3879, "step": 160 }, { "epoch": 0.37585421412300685, "grad_norm": 1.1328125, "learning_rate": 7.517084282460137e-05, "loss": 1.3668, "step": 165 }, { "epoch": 0.38724373576309795, "grad_norm": 2.109375, "learning_rate": 7.74487471526196e-05, "loss": 1.3422, "step": 170 }, { "epoch": 0.39863325740318906, "grad_norm": 2.09375, "learning_rate": 7.972665148063782e-05, "loss": 1.3269, "step": 175 }, { "epoch": 0.41002277904328016, "grad_norm": 0.57421875, "learning_rate": 8.200455580865604e-05, "loss": 1.3143, "step": 180 }, { "epoch": 0.4214123006833713, "grad_norm": 3.1875, "learning_rate": 8.428246013667426e-05, "loss": 1.2994, "step": 185 }, { "epoch": 0.4328018223234624, "grad_norm": 0.91796875, "learning_rate": 8.656036446469249e-05, "loss": 1.2815, "step": 190 }, { "epoch": 0.44419134396355353, "grad_norm": 0.99609375, "learning_rate": 8.883826879271071e-05, "loss": 1.2721, "step": 195 }, { "epoch": 0.45558086560364464, "grad_norm": 0.67578125, "learning_rate": 9.111617312072893e-05, "loss": 1.2607, "step": 200 }, { "epoch": 0.46697038724373574, "grad_norm": 1.828125, "learning_rate": 9.339407744874716e-05, "loss": 1.2512, "step": 205 }, { "epoch": 0.4783599088838269, "grad_norm": 3.0, "learning_rate": 9.567198177676538e-05, "loss": 1.2626, "step": 210 }, { "epoch": 0.489749430523918, "grad_norm": 4.21875, "learning_rate": 9.79498861047836e-05, "loss": 1.2506, "step": 215 }, { "epoch": 0.5011389521640092, "grad_norm": 2.125, "learning_rate": 0.00010022779043280184, "loss": 1.2473, "step": 220 }, { "epoch": 0.5125284738041003, "grad_norm": 1.59375, "learning_rate": 0.00010250569476082006, "loss": 1.2319, "step": 225 }, { "epoch": 0.5239179954441914, "grad_norm": 2.609375, "learning_rate": 0.00010478359908883827, "loss": 1.2246, "step": 230 }, { "epoch": 0.5353075170842825, "grad_norm": 2.78125, "learning_rate": 0.0001070615034168565, "loss": 1.2092, "step": 235 }, { "epoch": 0.5466970387243736, "grad_norm": 1.1015625, "learning_rate": 0.00010933940774487472, "loss": 1.1988, "step": 240 }, { "epoch": 0.5580865603644647, "grad_norm": 2.875, "learning_rate": 0.00011161731207289294, "loss": 1.1988, "step": 245 }, { "epoch": 0.5694760820045558, "grad_norm": 0.86328125, "learning_rate": 0.00011389521640091117, "loss": 1.1985, "step": 250 }, { "epoch": 0.5808656036446469, "grad_norm": 2.078125, "learning_rate": 0.00011617312072892939, "loss": 1.197, "step": 255 }, { "epoch": 0.592255125284738, "grad_norm": 3.921875, "learning_rate": 0.0001184510250569476, "loss": 1.1889, "step": 260 }, { "epoch": 0.6036446469248291, "grad_norm": 1.578125, "learning_rate": 0.00012072892938496582, "loss": 1.1738, "step": 265 }, { "epoch": 0.6150341685649203, "grad_norm": 1.234375, "learning_rate": 0.00012300683371298406, "loss": 1.1693, "step": 270 }, { "epoch": 0.6264236902050114, "grad_norm": 2.578125, "learning_rate": 0.00012528473804100228, "loss": 1.1681, "step": 275 }, { "epoch": 0.6378132118451025, "grad_norm": 7.84375, "learning_rate": 0.0001275626423690205, "loss": 1.1611, "step": 280 }, { "epoch": 0.6492027334851936, "grad_norm": 0.9765625, "learning_rate": 0.00012984054669703873, "loss": 1.1557, "step": 285 }, { "epoch": 0.6605922551252847, "grad_norm": 4.84375, "learning_rate": 0.00013211845102505695, "loss": 1.151, "step": 290 }, { "epoch": 0.6719817767653758, "grad_norm": 2.171875, "learning_rate": 0.00013439635535307518, "loss": 1.1527, "step": 295 }, { "epoch": 0.683371298405467, "grad_norm": 2.6875, "learning_rate": 0.0001366742596810934, "loss": 1.143, "step": 300 }, { "epoch": 0.6947608200455581, "grad_norm": 2.421875, "learning_rate": 0.00013895216400911162, "loss": 1.1439, "step": 305 }, { "epoch": 0.7061503416856492, "grad_norm": 0.9375, "learning_rate": 0.00014123006833712985, "loss": 1.1345, "step": 310 }, { "epoch": 0.7175398633257403, "grad_norm": 7.6875, "learning_rate": 0.00014350797266514807, "loss": 1.125, "step": 315 }, { "epoch": 0.7289293849658315, "grad_norm": 8.1875, "learning_rate": 0.0001457858769931663, "loss": 1.1364, "step": 320 }, { "epoch": 0.7403189066059226, "grad_norm": 2.828125, "learning_rate": 0.00014806378132118452, "loss": 1.1339, "step": 325 }, { "epoch": 0.7517084282460137, "grad_norm": 21.25, "learning_rate": 0.00015034168564920274, "loss": 1.1417, "step": 330 }, { "epoch": 0.7630979498861048, "grad_norm": 4.375, "learning_rate": 0.00015261958997722096, "loss": 1.1475, "step": 335 }, { "epoch": 0.7744874715261959, "grad_norm": 0.9609375, "learning_rate": 0.0001548974943052392, "loss": 1.1267, "step": 340 }, { "epoch": 0.785876993166287, "grad_norm": 2.078125, "learning_rate": 0.0001571753986332574, "loss": 1.1184, "step": 345 }, { "epoch": 0.7972665148063781, "grad_norm": 5.6875, "learning_rate": 0.00015945330296127563, "loss": 1.123, "step": 350 }, { "epoch": 0.8086560364464692, "grad_norm": 0.7421875, "learning_rate": 0.00016173120728929386, "loss": 1.1052, "step": 355 }, { "epoch": 0.8200455580865603, "grad_norm": 1.8125, "learning_rate": 0.00016400911161731208, "loss": 1.1108, "step": 360 }, { "epoch": 0.8314350797266514, "grad_norm": 3.5625, "learning_rate": 0.0001662870159453303, "loss": 1.1204, "step": 365 }, { "epoch": 0.8428246013667426, "grad_norm": 1.734375, "learning_rate": 0.00016856492027334853, "loss": 1.1326, "step": 370 }, { "epoch": 0.8542141230068337, "grad_norm": 2.53125, "learning_rate": 0.00017084282460136675, "loss": 1.1137, "step": 375 }, { "epoch": 0.8656036446469249, "grad_norm": 3.75, "learning_rate": 0.00017312072892938497, "loss": 1.1272, "step": 380 }, { "epoch": 0.876993166287016, "grad_norm": 1.15625, "learning_rate": 0.0001753986332574032, "loss": 1.108, "step": 385 }, { "epoch": 0.8883826879271071, "grad_norm": 3.65625, "learning_rate": 0.00017767653758542142, "loss": 1.1221, "step": 390 }, { "epoch": 0.8997722095671982, "grad_norm": 1.09375, "learning_rate": 0.00017995444191343964, "loss": 1.0942, "step": 395 }, { "epoch": 0.9111617312072893, "grad_norm": 0.79296875, "learning_rate": 0.00018223234624145787, "loss": 1.0881, "step": 400 }, { "epoch": 0.9225512528473804, "grad_norm": 1.3671875, "learning_rate": 0.0001845102505694761, "loss": 1.0857, "step": 405 }, { "epoch": 0.9339407744874715, "grad_norm": 1.875, "learning_rate": 0.00018678815489749431, "loss": 1.0843, "step": 410 }, { "epoch": 0.9453302961275627, "grad_norm": 2.34375, "learning_rate": 0.00018906605922551254, "loss": 1.0782, "step": 415 }, { "epoch": 0.9567198177676538, "grad_norm": 2.296875, "learning_rate": 0.00019134396355353076, "loss": 1.0762, "step": 420 }, { "epoch": 0.9681093394077449, "grad_norm": 1.4296875, "learning_rate": 0.00019362186788154898, "loss": 1.0681, "step": 425 }, { "epoch": 0.979498861047836, "grad_norm": 1.46875, "learning_rate": 0.0001958997722095672, "loss": 1.0742, "step": 430 }, { "epoch": 0.9908883826879271, "grad_norm": 18.625, "learning_rate": 0.00019817767653758543, "loss": 1.0805, "step": 435 }, { "epoch": 1.0, "eval_loss": 2.521019220352173, "eval_runtime": 0.2677, "eval_samples_per_second": 37.357, "eval_steps_per_second": 3.736, "step": 439 }, { "epoch": 1.0022779043280183, "grad_norm": 6.6875, "learning_rate": 0.0001999999683877311, "loss": 1.1284, "step": 440 }, { "epoch": 1.0136674259681093, "grad_norm": 1.2578125, "learning_rate": 0.0001999988619604182, "loss": 1.0864, "step": 445 }, { "epoch": 1.0250569476082005, "grad_norm": 3.21875, "learning_rate": 0.00019999617493964692, "loss": 1.0719, "step": 450 }, { "epoch": 1.0364464692482915, "grad_norm": 2.921875, "learning_rate": 0.00019999190736788865, "loss": 1.1239, "step": 455 }, { "epoch": 1.0478359908883828, "grad_norm": 1.421875, "learning_rate": 0.0001999860593125971, "loss": 1.1026, "step": 460 }, { "epoch": 1.0592255125284737, "grad_norm": 1.75, "learning_rate": 0.00019997863086620727, "loss": 1.0776, "step": 465 }, { "epoch": 1.070615034168565, "grad_norm": 1.734375, "learning_rate": 0.0001999696221461341, "loss": 1.072, "step": 470 }, { "epoch": 1.082004555808656, "grad_norm": 2.921875, "learning_rate": 0.0001999590332947704, "loss": 1.073, "step": 475 }, { "epoch": 1.0933940774487472, "grad_norm": 4.46875, "learning_rate": 0.0001999468644794848, "loss": 1.0755, "step": 480 }, { "epoch": 1.1047835990888384, "grad_norm": 14.125, "learning_rate": 0.00019993311589261897, "loss": 1.0603, "step": 485 }, { "epoch": 1.1161731207289294, "grad_norm": 2.4375, "learning_rate": 0.00019991778775148465, "loss": 1.0802, "step": 490 }, { "epoch": 1.1275626423690206, "grad_norm": 3.25, "learning_rate": 0.00019990088029836017, "loss": 1.0647, "step": 495 }, { "epoch": 1.1389521640091116, "grad_norm": 3.9375, "learning_rate": 0.00019988239380048674, "loss": 1.062, "step": 500 }, { "epoch": 1.1503416856492028, "grad_norm": 1.65625, "learning_rate": 0.000199862328550064, "loss": 1.0499, "step": 505 }, { "epoch": 1.1617312072892938, "grad_norm": 1.875, "learning_rate": 0.00019984068486424557, "loss": 1.0475, "step": 510 }, { "epoch": 1.173120728929385, "grad_norm": 3.859375, "learning_rate": 0.0001998174630851341, "loss": 1.0381, "step": 515 }, { "epoch": 1.184510250569476, "grad_norm": 2.15625, "learning_rate": 0.00019979266357977564, "loss": 1.0617, "step": 520 }, { "epoch": 1.1958997722095672, "grad_norm": 1.1171875, "learning_rate": 0.000199766286740154, "loss": 1.051, "step": 525 }, { "epoch": 1.2072892938496582, "grad_norm": 3.09375, "learning_rate": 0.0001997383329831846, "loss": 1.0657, "step": 530 }, { "epoch": 1.2186788154897494, "grad_norm": 1.796875, "learning_rate": 0.00019970880275070762, "loss": 1.0574, "step": 535 }, { "epoch": 1.2300683371298406, "grad_norm": 1.4921875, "learning_rate": 0.00019967769650948135, "loss": 1.0469, "step": 540 }, { "epoch": 1.2414578587699316, "grad_norm": 0.85546875, "learning_rate": 0.00019964501475117462, "loss": 1.0483, "step": 545 }, { "epoch": 1.2528473804100229, "grad_norm": 8.3125, "learning_rate": 0.00019961075799235903, "loss": 1.0401, "step": 550 }, { "epoch": 1.2642369020501139, "grad_norm": 1.3984375, "learning_rate": 0.0001995749267745008, "loss": 1.0436, "step": 555 }, { "epoch": 1.275626423690205, "grad_norm": 1.140625, "learning_rate": 0.00019953752166395228, "loss": 1.0291, "step": 560 }, { "epoch": 1.287015945330296, "grad_norm": 2.625, "learning_rate": 0.00019949854325194294, "loss": 1.0726, "step": 565 }, { "epoch": 1.2984054669703873, "grad_norm": 1.640625, "learning_rate": 0.00019945799215456998, "loss": 1.0269, "step": 570 }, { "epoch": 1.3097949886104785, "grad_norm": 2.59375, "learning_rate": 0.00019941586901278875, "loss": 1.0222, "step": 575 }, { "epoch": 1.3211845102505695, "grad_norm": 1.6171875, "learning_rate": 0.0001993721744924024, "loss": 1.0212, "step": 580 }, { "epoch": 1.3325740318906605, "grad_norm": 1.9609375, "learning_rate": 0.00019932690928405153, "loss": 1.0125, "step": 585 }, { "epoch": 1.3439635535307517, "grad_norm": 1.3671875, "learning_rate": 0.00019928007410320323, "loss": 1.0043, "step": 590 }, { "epoch": 1.355353075170843, "grad_norm": 2.078125, "learning_rate": 0.0001992316696901397, "loss": 1.0229, "step": 595 }, { "epoch": 1.366742596810934, "grad_norm": 4.625, "learning_rate": 0.00019918169680994667, "loss": 1.0179, "step": 600 }, { "epoch": 1.3781321184510251, "grad_norm": 2.6875, "learning_rate": 0.00019913015625250114, "loss": 1.0123, "step": 605 }, { "epoch": 1.3895216400911161, "grad_norm": 0.8359375, "learning_rate": 0.00019907704883245916, "loss": 1.0104, "step": 610 }, { "epoch": 1.4009111617312073, "grad_norm": 1.3984375, "learning_rate": 0.00019902237538924256, "loss": 1.0195, "step": 615 }, { "epoch": 1.4123006833712983, "grad_norm": 2.078125, "learning_rate": 0.00019896613678702617, "loss": 1.0101, "step": 620 }, { "epoch": 1.4236902050113895, "grad_norm": 1.796875, "learning_rate": 0.0001989083339147237, "loss": 1.0192, "step": 625 }, { "epoch": 1.4350797266514808, "grad_norm": 0.90625, "learning_rate": 0.000198848967685974, "loss": 1.0109, "step": 630 }, { "epoch": 1.4464692482915718, "grad_norm": 1.6640625, "learning_rate": 0.0001987880390391264, "loss": 1.0048, "step": 635 }, { "epoch": 1.4578587699316627, "grad_norm": 2.125, "learning_rate": 0.00019872554893722618, "loss": 0.9957, "step": 640 }, { "epoch": 1.469248291571754, "grad_norm": 2.875, "learning_rate": 0.00019866149836799896, "loss": 1.0112, "step": 645 }, { "epoch": 1.4806378132118452, "grad_norm": 2.078125, "learning_rate": 0.0001985958883438354, "loss": 1.0237, "step": 650 }, { "epoch": 1.4920273348519362, "grad_norm": 1.1875, "learning_rate": 0.00019852871990177503, "loss": 1.0246, "step": 655 }, { "epoch": 1.5034168564920274, "grad_norm": 11.625, "learning_rate": 0.00019845999410349002, "loss": 1.0251, "step": 660 }, { "epoch": 1.5148063781321186, "grad_norm": 2.5, "learning_rate": 0.00019838971203526808, "loss": 1.0168, "step": 665 }, { "epoch": 1.5261958997722096, "grad_norm": 13.4375, "learning_rate": 0.00019831787480799568, "loss": 1.0081, "step": 670 }, { "epoch": 1.5375854214123006, "grad_norm": 10.25, "learning_rate": 0.0001982444835571403, "loss": 1.0271, "step": 675 }, { "epoch": 1.5489749430523918, "grad_norm": 2.515625, "learning_rate": 0.00019816953944273237, "loss": 1.0059, "step": 680 }, { "epoch": 1.560364464692483, "grad_norm": 4.34375, "learning_rate": 0.0001980930436493472, "loss": 1.0287, "step": 685 }, { "epoch": 1.571753986332574, "grad_norm": 4.875, "learning_rate": 0.00019801499738608604, "loss": 1.0337, "step": 690 }, { "epoch": 1.583143507972665, "grad_norm": 0.95703125, "learning_rate": 0.00019793540188655704, "loss": 1.0124, "step": 695 }, { "epoch": 1.5945330296127562, "grad_norm": 10.375, "learning_rate": 0.0001978542584088558, "loss": 1.0009, "step": 700 }, { "epoch": 1.6059225512528474, "grad_norm": 0.8984375, "learning_rate": 0.00019777156823554544, "loss": 0.9936, "step": 705 }, { "epoch": 1.6173120728929384, "grad_norm": 12.3125, "learning_rate": 0.00019768733267363624, "loss": 1.0044, "step": 710 }, { "epoch": 1.6287015945330297, "grad_norm": 1.578125, "learning_rate": 0.0001976015530545652, "loss": 0.9959, "step": 715 }, { "epoch": 1.6400911161731209, "grad_norm": 2.40625, "learning_rate": 0.00019751423073417475, "loss": 0.9893, "step": 720 }, { "epoch": 1.6514806378132119, "grad_norm": 8.3125, "learning_rate": 0.0001974253670926915, "loss": 0.9896, "step": 725 }, { "epoch": 1.6628701594533029, "grad_norm": 2.25, "learning_rate": 0.00019733496353470433, "loss": 0.9965, "step": 730 }, { "epoch": 1.674259681093394, "grad_norm": 4.4375, "learning_rate": 0.00019724302148914222, "loss": 0.9817, "step": 735 }, { "epoch": 1.6856492027334853, "grad_norm": 3.859375, "learning_rate": 0.00019714954240925172, "loss": 0.9811, "step": 740 }, { "epoch": 1.6970387243735763, "grad_norm": 2.65625, "learning_rate": 0.00019705452777257377, "loss": 0.9798, "step": 745 }, { "epoch": 1.7084282460136673, "grad_norm": 2.125, "learning_rate": 0.0001969579790809207, "loss": 0.994, "step": 750 }, { "epoch": 1.7198177676537585, "grad_norm": 2.625, "learning_rate": 0.00019685989786035211, "loss": 0.9838, "step": 755 }, { "epoch": 1.7312072892938497, "grad_norm": 2.4375, "learning_rate": 0.00019676028566115102, "loss": 0.9868, "step": 760 }, { "epoch": 1.7425968109339407, "grad_norm": 0.890625, "learning_rate": 0.00019665914405779923, "loss": 0.9933, "step": 765 }, { "epoch": 1.753986332574032, "grad_norm": 2.0, "learning_rate": 0.00019655647464895254, "loss": 0.9949, "step": 770 }, { "epoch": 1.7653758542141231, "grad_norm": 1.5, "learning_rate": 0.00019645227905741534, "loss": 0.9871, "step": 775 }, { "epoch": 1.7767653758542141, "grad_norm": 3.03125, "learning_rate": 0.00019634655893011513, "loss": 0.9855, "step": 780 }, { "epoch": 1.7881548974943051, "grad_norm": 1.4453125, "learning_rate": 0.0001962393159380763, "loss": 1.0098, "step": 785 }, { "epoch": 1.7995444191343963, "grad_norm": 2.359375, "learning_rate": 0.00019613055177639384, "loss": 0.9833, "step": 790 }, { "epoch": 1.8109339407744875, "grad_norm": 1.140625, "learning_rate": 0.0001960202681642066, "loss": 0.9712, "step": 795 }, { "epoch": 1.8223234624145785, "grad_norm": 8.9375, "learning_rate": 0.00019590846684466992, "loss": 0.9792, "step": 800 }, { "epoch": 1.8337129840546698, "grad_norm": 12.25, "learning_rate": 0.00019579514958492826, "loss": 0.978, "step": 805 }, { "epoch": 1.845102505694761, "grad_norm": 18.125, "learning_rate": 0.00019568031817608725, "loss": 0.9891, "step": 810 }, { "epoch": 1.856492027334852, "grad_norm": 5.34375, "learning_rate": 0.00019556397443318523, "loss": 0.9708, "step": 815 }, { "epoch": 1.867881548974943, "grad_norm": 5.46875, "learning_rate": 0.00019544612019516472, "loss": 0.9601, "step": 820 }, { "epoch": 1.8792710706150342, "grad_norm": 2.28125, "learning_rate": 0.00019532675732484333, "loss": 1.0184, "step": 825 }, { "epoch": 1.8906605922551254, "grad_norm": 1.203125, "learning_rate": 0.00019520588770888424, "loss": 0.9742, "step": 830 }, { "epoch": 1.9020501138952164, "grad_norm": 9.75, "learning_rate": 0.00019508351325776642, "loss": 0.9652, "step": 835 }, { "epoch": 1.9134396355353074, "grad_norm": 5.8125, "learning_rate": 0.00019495963590575443, "loss": 1.003, "step": 840 }, { "epoch": 1.9248291571753986, "grad_norm": 8.6875, "learning_rate": 0.00019483425761086793, "loss": 0.9781, "step": 845 }, { "epoch": 1.9362186788154898, "grad_norm": 7.34375, "learning_rate": 0.00019470738035485058, "loss": 0.9943, "step": 850 }, { "epoch": 1.9476082004555808, "grad_norm": 1.4140625, "learning_rate": 0.0001945790061431388, "loss": 0.9864, "step": 855 }, { "epoch": 1.958997722095672, "grad_norm": 1.9609375, "learning_rate": 0.00019444913700483008, "loss": 0.9491, "step": 860 }, { "epoch": 1.9703872437357632, "grad_norm": 0.96484375, "learning_rate": 0.00019431777499265087, "loss": 0.9748, "step": 865 }, { "epoch": 1.9817767653758542, "grad_norm": 8.0, "learning_rate": 0.0001941849221829242, "loss": 0.939, "step": 870 }, { "epoch": 1.9931662870159452, "grad_norm": 1.1484375, "learning_rate": 0.00019405058067553676, "loss": 0.9397, "step": 875 }, { "epoch": 2.0, "eval_loss": 2.4361355304718018, "eval_runtime": 0.2364, "eval_samples_per_second": 42.306, "eval_steps_per_second": 4.231, "step": 878 }, { "epoch": 2.0045558086560367, "grad_norm": 12.625, "learning_rate": 0.00019391475259390584, "loss": 0.9262, "step": 880 }, { "epoch": 2.0159453302961277, "grad_norm": 1.234375, "learning_rate": 0.00019377744008494555, "loss": 0.922, "step": 885 }, { "epoch": 2.0273348519362187, "grad_norm": 1.3125, "learning_rate": 0.00019363864531903323, "loss": 0.9265, "step": 890 }, { "epoch": 2.0387243735763096, "grad_norm": 1.0859375, "learning_rate": 0.00019349837048997478, "loss": 0.9572, "step": 895 }, { "epoch": 2.050113895216401, "grad_norm": 1.484375, "learning_rate": 0.00019335661781497024, "loss": 0.9381, "step": 900 }, { "epoch": 2.061503416856492, "grad_norm": 3.359375, "learning_rate": 0.00019321338953457858, "loss": 0.9678, "step": 905 }, { "epoch": 2.072892938496583, "grad_norm": 1.2734375, "learning_rate": 0.0001930686879126824, "loss": 0.9557, "step": 910 }, { "epoch": 2.084282460136674, "grad_norm": 8.0625, "learning_rate": 0.00019292251523645208, "loss": 0.9384, "step": 915 }, { "epoch": 2.0956719817767655, "grad_norm": 1.8984375, "learning_rate": 0.00019277487381630975, "loss": 0.9409, "step": 920 }, { "epoch": 2.1070615034168565, "grad_norm": 1.265625, "learning_rate": 0.0001926257659858925, "loss": 0.9319, "step": 925 }, { "epoch": 2.1184510250569475, "grad_norm": 0.80859375, "learning_rate": 0.00019247519410201585, "loss": 0.9264, "step": 930 }, { "epoch": 2.129840546697039, "grad_norm": 2.40625, "learning_rate": 0.00019232316054463617, "loss": 0.9069, "step": 935 }, { "epoch": 2.14123006833713, "grad_norm": 0.8203125, "learning_rate": 0.0001921696677168133, "loss": 0.9139, "step": 940 }, { "epoch": 2.152619589977221, "grad_norm": 0.8515625, "learning_rate": 0.00019201471804467245, "loss": 0.9152, "step": 945 }, { "epoch": 2.164009111617312, "grad_norm": 0.62109375, "learning_rate": 0.00019185831397736583, "loss": 0.9038, "step": 950 }, { "epoch": 2.1753986332574033, "grad_norm": 0.796875, "learning_rate": 0.00019170045798703406, "loss": 0.9073, "step": 955 }, { "epoch": 2.1867881548974943, "grad_norm": 0.5625, "learning_rate": 0.00019154115256876702, "loss": 0.9026, "step": 960 }, { "epoch": 2.1981776765375853, "grad_norm": 1.3203125, "learning_rate": 0.00019138040024056435, "loss": 0.8967, "step": 965 }, { "epoch": 2.2095671981776768, "grad_norm": 0.8046875, "learning_rate": 0.00019121820354329577, "loss": 0.8996, "step": 970 }, { "epoch": 2.2209567198177678, "grad_norm": 0.61328125, "learning_rate": 0.00019105456504066082, "loss": 0.9049, "step": 975 }, { "epoch": 2.2323462414578588, "grad_norm": 2.0, "learning_rate": 0.0001908894873191484, "loss": 0.913, "step": 980 }, { "epoch": 2.2437357630979498, "grad_norm": 0.5703125, "learning_rate": 0.00019072297298799589, "loss": 0.9099, "step": 985 }, { "epoch": 2.255125284738041, "grad_norm": 0.5234375, "learning_rate": 0.00019055502467914788, "loss": 0.9016, "step": 990 }, { "epoch": 2.266514806378132, "grad_norm": 4.5625, "learning_rate": 0.00019038564504721454, "loss": 0.89, "step": 995 }, { "epoch": 2.277904328018223, "grad_norm": 0.53125, "learning_rate": 0.00019021483676942973, "loss": 0.9021, "step": 1000 }, { "epoch": 2.289293849658314, "grad_norm": 0.73828125, "learning_rate": 0.00019004260254560867, "loss": 0.9142, "step": 1005 }, { "epoch": 2.3006833712984056, "grad_norm": 0.7421875, "learning_rate": 0.00018986894509810513, "loss": 0.9014, "step": 1010 }, { "epoch": 2.3120728929384966, "grad_norm": 1.5234375, "learning_rate": 0.0001896938671717687, "loss": 0.8982, "step": 1015 }, { "epoch": 2.3234624145785876, "grad_norm": 0.546875, "learning_rate": 0.00018951737153390105, "loss": 0.9056, "step": 1020 }, { "epoch": 2.334851936218679, "grad_norm": 1.0859375, "learning_rate": 0.00018933946097421248, "loss": 0.8869, "step": 1025 }, { "epoch": 2.34624145785877, "grad_norm": 0.6015625, "learning_rate": 0.00018916013830477766, "loss": 0.8821, "step": 1030 }, { "epoch": 2.357630979498861, "grad_norm": 0.51171875, "learning_rate": 0.00018897940635999118, "loss": 0.8865, "step": 1035 }, { "epoch": 2.369020501138952, "grad_norm": 0.9765625, "learning_rate": 0.0001887972679965229, "loss": 0.8943, "step": 1040 }, { "epoch": 2.3804100227790435, "grad_norm": 1.5859375, "learning_rate": 0.00018861372609327263, "loss": 0.8912, "step": 1045 }, { "epoch": 2.3917995444191344, "grad_norm": 0.73828125, "learning_rate": 0.00018842878355132471, "loss": 0.8847, "step": 1050 }, { "epoch": 2.4031890660592254, "grad_norm": 1.8203125, "learning_rate": 0.0001882424432939021, "loss": 0.8833, "step": 1055 }, { "epoch": 2.4145785876993164, "grad_norm": 1.1015625, "learning_rate": 0.00018805470826632024, "loss": 0.8905, "step": 1060 }, { "epoch": 2.425968109339408, "grad_norm": 0.515625, "learning_rate": 0.00018786558143594047, "loss": 0.8828, "step": 1065 }, { "epoch": 2.437357630979499, "grad_norm": 0.7109375, "learning_rate": 0.00018767506579212313, "loss": 0.882, "step": 1070 }, { "epoch": 2.44874715261959, "grad_norm": 0.482421875, "learning_rate": 0.0001874831643461803, "loss": 0.8863, "step": 1075 }, { "epoch": 2.4601366742596813, "grad_norm": 0.4921875, "learning_rate": 0.00018728988013132819, "loss": 0.8787, "step": 1080 }, { "epoch": 2.4715261958997723, "grad_norm": 0.54296875, "learning_rate": 0.0001870952162026392, "loss": 0.8755, "step": 1085 }, { "epoch": 2.4829157175398633, "grad_norm": 0.71484375, "learning_rate": 0.0001868991756369937, "loss": 0.8839, "step": 1090 }, { "epoch": 2.4943052391799543, "grad_norm": 1.0078125, "learning_rate": 0.00018670176153303127, "loss": 0.8818, "step": 1095 }, { "epoch": 2.5056947608200457, "grad_norm": 0.4375, "learning_rate": 0.0001865029770111019, "loss": 0.8875, "step": 1100 }, { "epoch": 2.5170842824601367, "grad_norm": 0.65234375, "learning_rate": 0.00018630282521321645, "loss": 0.8769, "step": 1105 }, { "epoch": 2.5284738041002277, "grad_norm": 0.7421875, "learning_rate": 0.00018610130930299715, "loss": 0.8812, "step": 1110 }, { "epoch": 2.5398633257403187, "grad_norm": 1.046875, "learning_rate": 0.00018589843246562756, "loss": 0.8837, "step": 1115 }, { "epoch": 2.55125284738041, "grad_norm": 0.80078125, "learning_rate": 0.00018569419790780218, "loss": 0.8769, "step": 1120 }, { "epoch": 2.562642369020501, "grad_norm": 0.55078125, "learning_rate": 0.00018548860885767582, "loss": 0.8782, "step": 1125 }, { "epoch": 2.574031890660592, "grad_norm": 7.65625, "learning_rate": 0.00018528166856481254, "loss": 0.8822, "step": 1130 }, { "epoch": 2.5854214123006836, "grad_norm": 0.75390625, "learning_rate": 0.00018507338030013427, "loss": 0.8677, "step": 1135 }, { "epoch": 2.5968109339407746, "grad_norm": 0.6640625, "learning_rate": 0.0001848637473558692, "loss": 0.8775, "step": 1140 }, { "epoch": 2.6082004555808656, "grad_norm": 1.578125, "learning_rate": 0.00018465277304549962, "loss": 0.8699, "step": 1145 }, { "epoch": 2.619589977220957, "grad_norm": 0.578125, "learning_rate": 0.00018444046070370963, "loss": 0.8709, "step": 1150 }, { "epoch": 2.630979498861048, "grad_norm": 0.53125, "learning_rate": 0.00018422681368633238, "loss": 0.8821, "step": 1155 }, { "epoch": 2.642369020501139, "grad_norm": 1.3203125, "learning_rate": 0.00018401183537029714, "loss": 0.8807, "step": 1160 }, { "epoch": 2.65375854214123, "grad_norm": 0.4296875, "learning_rate": 0.00018379552915357575, "loss": 0.8786, "step": 1165 }, { "epoch": 2.665148063781321, "grad_norm": 0.404296875, "learning_rate": 0.00018357789845512901, "loss": 0.8744, "step": 1170 }, { "epoch": 2.6765375854214124, "grad_norm": 0.72265625, "learning_rate": 0.0001833589467148527, "loss": 0.8649, "step": 1175 }, { "epoch": 2.6879271070615034, "grad_norm": 0.76953125, "learning_rate": 0.00018313867739352304, "loss": 0.8716, "step": 1180 }, { "epoch": 2.6993166287015944, "grad_norm": 0.59375, "learning_rate": 0.00018291709397274218, "loss": 0.8658, "step": 1185 }, { "epoch": 2.710706150341686, "grad_norm": 0.55078125, "learning_rate": 0.00018269419995488298, "loss": 0.874, "step": 1190 }, { "epoch": 2.722095671981777, "grad_norm": 1.0703125, "learning_rate": 0.00018246999886303383, "loss": 0.8752, "step": 1195 }, { "epoch": 2.733485193621868, "grad_norm": 0.5625, "learning_rate": 0.00018224449424094288, "loss": 0.8665, "step": 1200 }, { "epoch": 2.7448747152619593, "grad_norm": 1.78125, "learning_rate": 0.00018201768965296194, "loss": 0.866, "step": 1205 }, { "epoch": 2.7562642369020502, "grad_norm": 0.7578125, "learning_rate": 0.00018178958868399033, "loss": 0.8602, "step": 1210 }, { "epoch": 2.7676537585421412, "grad_norm": 0.423828125, "learning_rate": 0.00018156019493941803, "loss": 0.8618, "step": 1215 }, { "epoch": 2.7790432801822322, "grad_norm": 0.458984375, "learning_rate": 0.00018132951204506887, "loss": 0.8658, "step": 1220 }, { "epoch": 2.7904328018223232, "grad_norm": 0.59765625, "learning_rate": 0.00018109754364714305, "loss": 0.8646, "step": 1225 }, { "epoch": 2.8018223234624147, "grad_norm": 0.376953125, "learning_rate": 0.0001808642934121597, "loss": 0.8627, "step": 1230 }, { "epoch": 2.8132118451025057, "grad_norm": 0.66015625, "learning_rate": 0.00018062976502689862, "loss": 0.8639, "step": 1235 }, { "epoch": 2.8246013667425967, "grad_norm": 0.52734375, "learning_rate": 0.00018039396219834237, "loss": 0.8592, "step": 1240 }, { "epoch": 2.835990888382688, "grad_norm": 0.671875, "learning_rate": 0.0001801568886536174, "loss": 0.8575, "step": 1245 }, { "epoch": 2.847380410022779, "grad_norm": 0.455078125, "learning_rate": 0.0001799185481399354, "loss": 0.8581, "step": 1250 }, { "epoch": 2.85876993166287, "grad_norm": 2.40625, "learning_rate": 0.0001796789444245337, "loss": 0.8682, "step": 1255 }, { "epoch": 2.8701594533029615, "grad_norm": 0.8125, "learning_rate": 0.0001794380812946161, "loss": 0.8758, "step": 1260 }, { "epoch": 2.8815489749430525, "grad_norm": 10.5625, "learning_rate": 0.00017919596255729285, "loss": 0.8691, "step": 1265 }, { "epoch": 2.8929384965831435, "grad_norm": 1.3671875, "learning_rate": 0.00017895259203952032, "loss": 0.8629, "step": 1270 }, { "epoch": 2.9043280182232345, "grad_norm": 0.65234375, "learning_rate": 0.00017870797358804084, "loss": 0.8665, "step": 1275 }, { "epoch": 2.9157175398633255, "grad_norm": 0.67578125, "learning_rate": 0.00017846211106932165, "loss": 0.8631, "step": 1280 }, { "epoch": 2.927107061503417, "grad_norm": 0.61328125, "learning_rate": 0.00017821500836949386, "loss": 0.8555, "step": 1285 }, { "epoch": 2.938496583143508, "grad_norm": 0.7109375, "learning_rate": 0.000177966669394291, "loss": 0.8555, "step": 1290 }, { "epoch": 2.949886104783599, "grad_norm": 0.59375, "learning_rate": 0.00017771709806898732, "loss": 0.8577, "step": 1295 }, { "epoch": 2.9612756264236904, "grad_norm": 0.6328125, "learning_rate": 0.00017746629833833585, "loss": 0.8506, "step": 1300 }, { "epoch": 2.9726651480637813, "grad_norm": 0.53125, "learning_rate": 0.00017721427416650577, "loss": 0.8503, "step": 1305 }, { "epoch": 2.9840546697038723, "grad_norm": 1.3671875, "learning_rate": 0.00017696102953702, "loss": 0.8513, "step": 1310 }, { "epoch": 2.995444191343964, "grad_norm": 0.4609375, "learning_rate": 0.00017670656845269214, "loss": 0.8628, "step": 1315 }, { "epoch": 3.0, "eval_loss": 2.405621290206909, "eval_runtime": 0.2428, "eval_samples_per_second": 41.192, "eval_steps_per_second": 4.119, "step": 1317 }, { "epoch": 3.0068337129840548, "grad_norm": 1.203125, "learning_rate": 0.00017645089493556322, "loss": 0.8368, "step": 1320 }, { "epoch": 3.0182232346241458, "grad_norm": 2.765625, "learning_rate": 0.0001761940130268381, "loss": 0.8331, "step": 1325 }, { "epoch": 3.0296127562642368, "grad_norm": 0.61328125, "learning_rate": 0.00017593592678682166, "loss": 0.8446, "step": 1330 }, { "epoch": 3.041002277904328, "grad_norm": 0.72265625, "learning_rate": 0.0001756766402948545, "loss": 0.8303, "step": 1335 }, { "epoch": 3.052391799544419, "grad_norm": 0.9140625, "learning_rate": 0.00017541615764924868, "loss": 0.8381, "step": 1340 }, { "epoch": 3.06378132118451, "grad_norm": 0.7109375, "learning_rate": 0.00017515448296722262, "loss": 0.8353, "step": 1345 }, { "epoch": 3.075170842824601, "grad_norm": 0.703125, "learning_rate": 0.00017489162038483637, "loss": 0.836, "step": 1350 }, { "epoch": 3.0865603644646926, "grad_norm": 0.71484375, "learning_rate": 0.00017462757405692597, "loss": 0.8187, "step": 1355 }, { "epoch": 3.0979498861047836, "grad_norm": 0.41796875, "learning_rate": 0.00017436234815703788, "loss": 0.8301, "step": 1360 }, { "epoch": 3.1093394077448746, "grad_norm": 0.625, "learning_rate": 0.000174095946877363, "loss": 0.8356, "step": 1365 }, { "epoch": 3.120728929384966, "grad_norm": 0.427734375, "learning_rate": 0.00017382837442867055, "loss": 0.824, "step": 1370 }, { "epoch": 3.132118451025057, "grad_norm": 1.3984375, "learning_rate": 0.00017355963504024123, "loss": 0.8378, "step": 1375 }, { "epoch": 3.143507972665148, "grad_norm": 2.6875, "learning_rate": 0.00017328973295980052, "loss": 0.8334, "step": 1380 }, { "epoch": 3.154897494305239, "grad_norm": 0.439453125, "learning_rate": 0.00017301867245345172, "loss": 0.8412, "step": 1385 }, { "epoch": 3.1662870159453305, "grad_norm": 0.54296875, "learning_rate": 0.0001727464578056081, "loss": 0.8302, "step": 1390 }, { "epoch": 3.1776765375854215, "grad_norm": 1.3046875, "learning_rate": 0.0001724730933189256, "loss": 0.8294, "step": 1395 }, { "epoch": 3.1890660592255125, "grad_norm": 0.69140625, "learning_rate": 0.0001721985833142346, "loss": 0.8354, "step": 1400 }, { "epoch": 3.2004555808656034, "grad_norm": 0.9140625, "learning_rate": 0.0001719229321304716, "loss": 0.8379, "step": 1405 }, { "epoch": 3.211845102505695, "grad_norm": 0.5234375, "learning_rate": 0.00017164614412461084, "loss": 0.8245, "step": 1410 }, { "epoch": 3.223234624145786, "grad_norm": 0.4921875, "learning_rate": 0.00017136822367159516, "loss": 0.8202, "step": 1415 }, { "epoch": 3.234624145785877, "grad_norm": 0.458984375, "learning_rate": 0.00017108917516426704, "loss": 0.822, "step": 1420 }, { "epoch": 3.2460136674259683, "grad_norm": 0.486328125, "learning_rate": 0.0001708090030132992, "loss": 0.8299, "step": 1425 }, { "epoch": 3.2574031890660593, "grad_norm": 0.66015625, "learning_rate": 0.00017052771164712465, "loss": 0.8365, "step": 1430 }, { "epoch": 3.2687927107061503, "grad_norm": 0.75390625, "learning_rate": 0.00017024530551186702, "loss": 0.8254, "step": 1435 }, { "epoch": 3.2801822323462413, "grad_norm": 1.1328125, "learning_rate": 0.0001699617890712699, "loss": 0.8257, "step": 1440 }, { "epoch": 3.2915717539863327, "grad_norm": 0.458984375, "learning_rate": 0.00016967716680662667, "loss": 0.8315, "step": 1445 }, { "epoch": 3.3029612756264237, "grad_norm": 0.484375, "learning_rate": 0.0001693914432167094, "loss": 0.8265, "step": 1450 }, { "epoch": 3.3143507972665147, "grad_norm": 0.447265625, "learning_rate": 0.00016910462281769783, "loss": 0.8228, "step": 1455 }, { "epoch": 3.3257403189066057, "grad_norm": 0.453125, "learning_rate": 0.0001688167101431081, "loss": 0.8262, "step": 1460 }, { "epoch": 3.337129840546697, "grad_norm": 1.9375, "learning_rate": 0.0001685277097437208, "loss": 0.8284, "step": 1465 }, { "epoch": 3.348519362186788, "grad_norm": 0.7578125, "learning_rate": 0.00016823762618750938, "loss": 0.8327, "step": 1470 }, { "epoch": 3.359908883826879, "grad_norm": 0.6015625, "learning_rate": 0.00016794646405956774, "loss": 0.8211, "step": 1475 }, { "epoch": 3.3712984054669706, "grad_norm": 0.5625, "learning_rate": 0.0001676542279620378, "loss": 0.8266, "step": 1480 }, { "epoch": 3.3826879271070616, "grad_norm": 0.546875, "learning_rate": 0.00016736092251403673, "loss": 0.8247, "step": 1485 }, { "epoch": 3.3940774487471526, "grad_norm": 0.40234375, "learning_rate": 0.00016706655235158407, "loss": 0.8243, "step": 1490 }, { "epoch": 3.4054669703872436, "grad_norm": 1.015625, "learning_rate": 0.00016677112212752824, "loss": 0.8186, "step": 1495 }, { "epoch": 3.416856492027335, "grad_norm": 0.490234375, "learning_rate": 0.0001664746365114732, "loss": 0.8274, "step": 1500 }, { "epoch": 3.428246013667426, "grad_norm": 0.88671875, "learning_rate": 0.00016617710018970453, "loss": 0.8175, "step": 1505 }, { "epoch": 3.439635535307517, "grad_norm": 0.36328125, "learning_rate": 0.00016587851786511543, "loss": 0.8212, "step": 1510 }, { "epoch": 3.451025056947608, "grad_norm": 0.5234375, "learning_rate": 0.00016557889425713226, "loss": 0.8185, "step": 1515 }, { "epoch": 3.4624145785876994, "grad_norm": 0.515625, "learning_rate": 0.0001652782341016401, "loss": 0.8175, "step": 1520 }, { "epoch": 3.4738041002277904, "grad_norm": 0.46875, "learning_rate": 0.00016497654215090772, "loss": 0.8192, "step": 1525 }, { "epoch": 3.4851936218678814, "grad_norm": 0.40625, "learning_rate": 0.00016467382317351267, "loss": 0.8139, "step": 1530 }, { "epoch": 3.496583143507973, "grad_norm": 0.453125, "learning_rate": 0.00016437008195426578, "loss": 0.8217, "step": 1535 }, { "epoch": 3.507972665148064, "grad_norm": 0.6640625, "learning_rate": 0.00016406532329413546, "loss": 0.8182, "step": 1540 }, { "epoch": 3.519362186788155, "grad_norm": 0.7265625, "learning_rate": 0.000163759552010172, "loss": 0.8187, "step": 1545 }, { "epoch": 3.5307517084282463, "grad_norm": 0.97265625, "learning_rate": 0.00016345277293543136, "loss": 0.8114, "step": 1550 }, { "epoch": 3.5421412300683373, "grad_norm": 0.4921875, "learning_rate": 0.0001631449909188987, "loss": 0.8195, "step": 1555 }, { "epoch": 3.5535307517084282, "grad_norm": 0.51953125, "learning_rate": 0.00016283621082541173, "loss": 0.8122, "step": 1560 }, { "epoch": 3.5649202733485192, "grad_norm": 0.4375, "learning_rate": 0.000162526437535584, "loss": 0.8166, "step": 1565 }, { "epoch": 3.5763097949886102, "grad_norm": 0.494140625, "learning_rate": 0.00016221567594572762, "loss": 0.8194, "step": 1570 }, { "epoch": 3.5876993166287017, "grad_norm": 0.53125, "learning_rate": 0.0001619039309677758, "loss": 0.82, "step": 1575 }, { "epoch": 3.5990888382687927, "grad_norm": 0.4921875, "learning_rate": 0.0001615912075292054, "loss": 0.8184, "step": 1580 }, { "epoch": 3.6104783599088837, "grad_norm": 0.7578125, "learning_rate": 0.0001612775105729588, "loss": 0.8138, "step": 1585 }, { "epoch": 3.621867881548975, "grad_norm": 0.380859375, "learning_rate": 0.0001609628450573661, "loss": 0.8121, "step": 1590 }, { "epoch": 3.633257403189066, "grad_norm": 0.5859375, "learning_rate": 0.00016064721595606635, "loss": 0.8157, "step": 1595 }, { "epoch": 3.644646924829157, "grad_norm": 0.66015625, "learning_rate": 0.00016033062825792935, "loss": 0.8063, "step": 1600 }, { "epoch": 3.6560364464692485, "grad_norm": 0.388671875, "learning_rate": 0.00016001308696697643, "loss": 0.8207, "step": 1605 }, { "epoch": 3.6674259681093395, "grad_norm": 0.48046875, "learning_rate": 0.00015969459710230162, "loss": 0.8067, "step": 1610 }, { "epoch": 3.6788154897494305, "grad_norm": 0.5234375, "learning_rate": 0.00015937516369799216, "loss": 0.8105, "step": 1615 }, { "epoch": 3.6902050113895215, "grad_norm": 0.58984375, "learning_rate": 0.00015905479180304896, "loss": 0.8212, "step": 1620 }, { "epoch": 3.7015945330296125, "grad_norm": 0.71875, "learning_rate": 0.00015873348648130694, "loss": 0.8089, "step": 1625 }, { "epoch": 3.712984054669704, "grad_norm": 0.55078125, "learning_rate": 0.00015841125281135473, "loss": 0.8173, "step": 1630 }, { "epoch": 3.724373576309795, "grad_norm": 1.46875, "learning_rate": 0.00015808809588645467, "loss": 0.8156, "step": 1635 }, { "epoch": 3.735763097949886, "grad_norm": 1.203125, "learning_rate": 0.00015776402081446204, "loss": 0.8083, "step": 1640 }, { "epoch": 3.7471526195899774, "grad_norm": 0.392578125, "learning_rate": 0.00015743903271774455, "loss": 0.8165, "step": 1645 }, { "epoch": 3.7585421412300684, "grad_norm": 0.66015625, "learning_rate": 0.00015711313673310125, "loss": 0.8237, "step": 1650 }, { "epoch": 3.7699316628701594, "grad_norm": 0.390625, "learning_rate": 0.00015678633801168137, "loss": 0.8177, "step": 1655 }, { "epoch": 3.781321184510251, "grad_norm": 0.482421875, "learning_rate": 0.00015645864171890295, "loss": 0.8086, "step": 1660 }, { "epoch": 3.792710706150342, "grad_norm": 0.51953125, "learning_rate": 0.00015613005303437104, "loss": 0.8118, "step": 1665 }, { "epoch": 3.8041002277904328, "grad_norm": 0.390625, "learning_rate": 0.00015580057715179605, "loss": 0.8208, "step": 1670 }, { "epoch": 3.8154897494305238, "grad_norm": 0.48046875, "learning_rate": 0.00015547021927891144, "loss": 0.8076, "step": 1675 }, { "epoch": 3.8268792710706148, "grad_norm": 0.546875, "learning_rate": 0.0001551389846373916, "loss": 0.8137, "step": 1680 }, { "epoch": 3.838268792710706, "grad_norm": 0.62890625, "learning_rate": 0.00015480687846276917, "loss": 0.8117, "step": 1685 }, { "epoch": 3.849658314350797, "grad_norm": 0.44140625, "learning_rate": 0.00015447390600435238, "loss": 0.8035, "step": 1690 }, { "epoch": 3.861047835990888, "grad_norm": 0.75390625, "learning_rate": 0.00015414007252514202, "loss": 0.8105, "step": 1695 }, { "epoch": 3.8724373576309796, "grad_norm": 0.4453125, "learning_rate": 0.00015380538330174827, "loss": 0.8043, "step": 1700 }, { "epoch": 3.8838268792710706, "grad_norm": 0.5546875, "learning_rate": 0.0001534698436243073, "loss": 0.8124, "step": 1705 }, { "epoch": 3.8952164009111616, "grad_norm": 0.53125, "learning_rate": 0.00015313345879639764, "loss": 0.8198, "step": 1710 }, { "epoch": 3.906605922551253, "grad_norm": 0.625, "learning_rate": 0.00015279623413495642, "loss": 0.8057, "step": 1715 }, { "epoch": 3.917995444191344, "grad_norm": 0.462890625, "learning_rate": 0.00015245817497019524, "loss": 0.806, "step": 1720 }, { "epoch": 3.929384965831435, "grad_norm": 0.55078125, "learning_rate": 0.00015211928664551593, "loss": 0.8033, "step": 1725 }, { "epoch": 3.940774487471526, "grad_norm": 0.62890625, "learning_rate": 0.00015177957451742612, "loss": 0.8137, "step": 1730 }, { "epoch": 3.9521640091116175, "grad_norm": 0.376953125, "learning_rate": 0.00015143904395545466, "loss": 0.8075, "step": 1735 }, { "epoch": 3.9635535307517085, "grad_norm": 0.62109375, "learning_rate": 0.0001510977003420665, "loss": 0.8065, "step": 1740 }, { "epoch": 3.9749430523917995, "grad_norm": 0.37109375, "learning_rate": 0.00015075554907257796, "loss": 0.8129, "step": 1745 }, { "epoch": 3.9863325740318905, "grad_norm": 0.7578125, "learning_rate": 0.00015041259555507108, "loss": 0.8064, "step": 1750 }, { "epoch": 3.997722095671982, "grad_norm": 0.46484375, "learning_rate": 0.00015006884521030848, "loss": 0.8131, "step": 1755 }, { "epoch": 4.0, "eval_loss": 2.417691946029663, "eval_runtime": 0.2354, "eval_samples_per_second": 42.48, "eval_steps_per_second": 4.248, "step": 1756 }, { "epoch": 4.009111617312073, "grad_norm": 0.7734375, "learning_rate": 0.00014972430347164742, "loss": 0.7909, "step": 1760 }, { "epoch": 4.020501138952164, "grad_norm": 0.7265625, "learning_rate": 0.0001493789757849541, "loss": 0.7848, "step": 1765 }, { "epoch": 4.031890660592255, "grad_norm": 0.5859375, "learning_rate": 0.00014903286760851737, "loss": 0.7893, "step": 1770 }, { "epoch": 4.043280182232346, "grad_norm": 0.43359375, "learning_rate": 0.0001486859844129628, "loss": 0.7902, "step": 1775 }, { "epoch": 4.054669703872437, "grad_norm": 0.384765625, "learning_rate": 0.00014833833168116582, "loss": 0.7859, "step": 1780 }, { "epoch": 4.066059225512529, "grad_norm": 0.48046875, "learning_rate": 0.00014798991490816532, "loss": 0.782, "step": 1785 }, { "epoch": 4.077448747152619, "grad_norm": 0.4921875, "learning_rate": 0.00014764073960107666, "loss": 0.793, "step": 1790 }, { "epoch": 4.088838268792711, "grad_norm": 0.490234375, "learning_rate": 0.00014729081127900476, "loss": 0.783, "step": 1795 }, { "epoch": 4.100227790432802, "grad_norm": 0.4765625, "learning_rate": 0.00014694013547295672, "loss": 0.7908, "step": 1800 }, { "epoch": 4.111617312072893, "grad_norm": 0.89453125, "learning_rate": 0.0001465887177257545, "loss": 0.7923, "step": 1805 }, { "epoch": 4.123006833712984, "grad_norm": 1.125, "learning_rate": 0.00014623656359194712, "loss": 0.7904, "step": 1810 }, { "epoch": 4.134396355353076, "grad_norm": 1.1953125, "learning_rate": 0.00014588367863772325, "loss": 0.7881, "step": 1815 }, { "epoch": 4.145785876993166, "grad_norm": 0.71875, "learning_rate": 0.00014553006844082283, "loss": 0.7831, "step": 1820 }, { "epoch": 4.157175398633258, "grad_norm": 0.8125, "learning_rate": 0.00014517573859044907, "loss": 0.788, "step": 1825 }, { "epoch": 4.168564920273348, "grad_norm": 1.0078125, "learning_rate": 0.00014482069468718022, "loss": 0.7853, "step": 1830 }, { "epoch": 4.17995444191344, "grad_norm": 0.470703125, "learning_rate": 0.00014446494234288083, "loss": 0.7931, "step": 1835 }, { "epoch": 4.191343963553531, "grad_norm": 0.46484375, "learning_rate": 0.00014410848718061312, "loss": 0.7942, "step": 1840 }, { "epoch": 4.2027334851936216, "grad_norm": 0.453125, "learning_rate": 0.0001437513348345482, "loss": 0.7834, "step": 1845 }, { "epoch": 4.214123006833713, "grad_norm": 0.625, "learning_rate": 0.00014339349094987699, "loss": 0.7797, "step": 1850 }, { "epoch": 4.225512528473804, "grad_norm": 0.6796875, "learning_rate": 0.00014303496118272084, "loss": 0.7876, "step": 1855 }, { "epoch": 4.236902050113895, "grad_norm": 0.43359375, "learning_rate": 0.00014267575120004231, "loss": 0.7943, "step": 1860 }, { "epoch": 4.248291571753986, "grad_norm": 0.486328125, "learning_rate": 0.00014231586667955552, "loss": 0.7929, "step": 1865 }, { "epoch": 4.259681093394078, "grad_norm": 0.451171875, "learning_rate": 0.00014195531330963635, "loss": 0.7842, "step": 1870 }, { "epoch": 4.271070615034168, "grad_norm": 0.466796875, "learning_rate": 0.00014159409678923265, "loss": 0.7878, "step": 1875 }, { "epoch": 4.28246013667426, "grad_norm": 0.53125, "learning_rate": 0.0001412322228277741, "loss": 0.7843, "step": 1880 }, { "epoch": 4.29384965831435, "grad_norm": 0.63671875, "learning_rate": 0.00014086969714508196, "loss": 0.7829, "step": 1885 }, { "epoch": 4.305239179954442, "grad_norm": 0.52734375, "learning_rate": 0.00014050652547127864, "loss": 0.784, "step": 1890 }, { "epoch": 4.316628701594533, "grad_norm": 0.8046875, "learning_rate": 0.00014014271354669718, "loss": 0.7815, "step": 1895 }, { "epoch": 4.328018223234624, "grad_norm": 0.54296875, "learning_rate": 0.00013977826712179058, "loss": 0.7865, "step": 1900 }, { "epoch": 4.339407744874715, "grad_norm": 0.6015625, "learning_rate": 0.0001394131919570407, "loss": 0.7784, "step": 1905 }, { "epoch": 4.350797266514807, "grad_norm": 0.416015625, "learning_rate": 0.00013904749382286734, "loss": 0.7846, "step": 1910 }, { "epoch": 4.362186788154897, "grad_norm": 0.546875, "learning_rate": 0.0001386811784995371, "loss": 0.7905, "step": 1915 }, { "epoch": 4.373576309794989, "grad_norm": 0.6953125, "learning_rate": 0.00013831425177707193, "loss": 0.7936, "step": 1920 }, { "epoch": 4.38496583143508, "grad_norm": 0.58203125, "learning_rate": 0.00013794671945515757, "loss": 0.7828, "step": 1925 }, { "epoch": 4.396355353075171, "grad_norm": 0.90625, "learning_rate": 0.00013757858734305203, "loss": 0.7888, "step": 1930 }, { "epoch": 4.407744874715262, "grad_norm": 0.392578125, "learning_rate": 0.00013720986125949353, "loss": 0.7852, "step": 1935 }, { "epoch": 4.4191343963553535, "grad_norm": 0.72265625, "learning_rate": 0.00013684054703260882, "loss": 0.7921, "step": 1940 }, { "epoch": 4.430523917995444, "grad_norm": 0.474609375, "learning_rate": 0.00013647065049982078, "loss": 0.7872, "step": 1945 }, { "epoch": 4.4419134396355355, "grad_norm": 2.25, "learning_rate": 0.00013610017750775643, "loss": 0.7883, "step": 1950 }, { "epoch": 4.453302961275626, "grad_norm": 0.73828125, "learning_rate": 0.0001357291339121542, "loss": 0.7848, "step": 1955 }, { "epoch": 4.4646924829157175, "grad_norm": 0.396484375, "learning_rate": 0.0001353575255777717, "loss": 0.7894, "step": 1960 }, { "epoch": 4.476082004555809, "grad_norm": 0.97265625, "learning_rate": 0.00013498535837829276, "loss": 0.7911, "step": 1965 }, { "epoch": 4.4874715261958995, "grad_norm": 0.68359375, "learning_rate": 0.00013461263819623476, "loss": 0.7897, "step": 1970 }, { "epoch": 4.498861047835991, "grad_norm": 0.43359375, "learning_rate": 0.00013423937092285555, "loss": 0.7833, "step": 1975 }, { "epoch": 4.510250569476082, "grad_norm": 0.408203125, "learning_rate": 0.00013386556245806034, "loss": 0.8, "step": 1980 }, { "epoch": 4.521640091116173, "grad_norm": 0.482421875, "learning_rate": 0.00013349121871030856, "loss": 0.7984, "step": 1985 }, { "epoch": 4.533029612756264, "grad_norm": 0.42578125, "learning_rate": 0.00013311634559652036, "loss": 0.7938, "step": 1990 }, { "epoch": 4.544419134396355, "grad_norm": 0.484375, "learning_rate": 0.000132740949041983, "loss": 0.7826, "step": 1995 }, { "epoch": 4.555808656036446, "grad_norm": 0.765625, "learning_rate": 0.00013236503498025747, "loss": 0.7922, "step": 2000 }, { "epoch": 4.567198177676538, "grad_norm": 0.98046875, "learning_rate": 0.00013198860935308444, "loss": 0.7796, "step": 2005 }, { "epoch": 4.578587699316628, "grad_norm": 0.44140625, "learning_rate": 0.0001316116781102904, "loss": 0.7926, "step": 2010 }, { "epoch": 4.58997722095672, "grad_norm": 0.412109375, "learning_rate": 0.0001312342472096938, "loss": 0.7877, "step": 2015 }, { "epoch": 4.601366742596811, "grad_norm": 0.447265625, "learning_rate": 0.00013085632261701063, "loss": 0.7903, "step": 2020 }, { "epoch": 4.612756264236902, "grad_norm": 0.72265625, "learning_rate": 0.00013047791030576023, "loss": 0.7826, "step": 2025 }, { "epoch": 4.624145785876993, "grad_norm": 0.52734375, "learning_rate": 0.00013009901625717093, "loss": 0.7823, "step": 2030 }, { "epoch": 4.635535307517085, "grad_norm": 0.396484375, "learning_rate": 0.00012971964646008542, "loss": 0.7884, "step": 2035 }, { "epoch": 4.646924829157175, "grad_norm": 0.392578125, "learning_rate": 0.0001293398069108662, "loss": 0.7846, "step": 2040 }, { "epoch": 4.658314350797267, "grad_norm": 0.64453125, "learning_rate": 0.00012895950361330058, "loss": 0.7822, "step": 2045 }, { "epoch": 4.669703872437358, "grad_norm": 0.66015625, "learning_rate": 0.00012857874257850605, "loss": 0.7899, "step": 2050 }, { "epoch": 4.681093394077449, "grad_norm": 0.40625, "learning_rate": 0.00012819752982483508, "loss": 0.7914, "step": 2055 }, { "epoch": 4.69248291571754, "grad_norm": 0.41796875, "learning_rate": 0.00012781587137778013, "loss": 0.7859, "step": 2060 }, { "epoch": 4.703872437357631, "grad_norm": 0.36328125, "learning_rate": 0.00012743377326987826, "loss": 0.7849, "step": 2065 }, { "epoch": 4.715261958997722, "grad_norm": 0.38671875, "learning_rate": 0.00012705124154061597, "loss": 0.7852, "step": 2070 }, { "epoch": 4.7266514806378135, "grad_norm": 0.515625, "learning_rate": 0.00012666828223633348, "loss": 0.7802, "step": 2075 }, { "epoch": 4.738041002277904, "grad_norm": 0.55078125, "learning_rate": 0.00012628490141012937, "loss": 0.792, "step": 2080 }, { "epoch": 4.7494305239179955, "grad_norm": 0.423828125, "learning_rate": 0.00012590110512176498, "loss": 0.7915, "step": 2085 }, { "epoch": 4.760820045558087, "grad_norm": 0.39453125, "learning_rate": 0.0001255168994375683, "loss": 0.7859, "step": 2090 }, { "epoch": 4.7722095671981775, "grad_norm": 0.421875, "learning_rate": 0.0001251322904303383, "loss": 0.7901, "step": 2095 }, { "epoch": 4.783599088838269, "grad_norm": 0.396484375, "learning_rate": 0.0001247472841792491, "loss": 0.7866, "step": 2100 }, { "epoch": 4.7949886104783594, "grad_norm": 0.53515625, "learning_rate": 0.00012436188676975346, "loss": 0.7846, "step": 2105 }, { "epoch": 4.806378132118451, "grad_norm": 0.58984375, "learning_rate": 0.000123976104293487, "loss": 0.789, "step": 2110 }, { "epoch": 4.817767653758542, "grad_norm": 0.48046875, "learning_rate": 0.00012358994284817167, "loss": 0.7765, "step": 2115 }, { "epoch": 4.829157175398633, "grad_norm": 0.38671875, "learning_rate": 0.00012320340853751952, "loss": 0.7877, "step": 2120 }, { "epoch": 4.840546697038724, "grad_norm": 0.37890625, "learning_rate": 0.00012281650747113612, "loss": 0.7862, "step": 2125 }, { "epoch": 4.851936218678816, "grad_norm": 0.390625, "learning_rate": 0.00012242924576442388, "loss": 0.7897, "step": 2130 }, { "epoch": 4.863325740318906, "grad_norm": 0.412109375, "learning_rate": 0.00012204162953848581, "loss": 0.7782, "step": 2135 }, { "epoch": 4.874715261958998, "grad_norm": 0.431640625, "learning_rate": 0.00012165366492002832, "loss": 0.7796, "step": 2140 }, { "epoch": 4.886104783599089, "grad_norm": 0.392578125, "learning_rate": 0.00012126535804126451, "loss": 0.791, "step": 2145 }, { "epoch": 4.89749430523918, "grad_norm": 0.361328125, "learning_rate": 0.00012087671503981741, "loss": 0.7875, "step": 2150 }, { "epoch": 4.908883826879271, "grad_norm": 0.41796875, "learning_rate": 0.00012048774205862279, "loss": 0.7783, "step": 2155 }, { "epoch": 4.920273348519363, "grad_norm": 0.462890625, "learning_rate": 0.00012009844524583203, "loss": 0.7865, "step": 2160 }, { "epoch": 4.931662870159453, "grad_norm": 0.37890625, "learning_rate": 0.00011970883075471522, "loss": 0.7899, "step": 2165 }, { "epoch": 4.943052391799545, "grad_norm": 0.484375, "learning_rate": 0.00011931890474356358, "loss": 0.7838, "step": 2170 }, { "epoch": 4.954441913439636, "grad_norm": 0.5078125, "learning_rate": 0.00011892867337559221, "loss": 0.7859, "step": 2175 }, { "epoch": 4.965831435079727, "grad_norm": 0.36328125, "learning_rate": 0.00011853814281884283, "loss": 0.7794, "step": 2180 }, { "epoch": 4.977220956719818, "grad_norm": 0.69921875, "learning_rate": 0.00011814731924608616, "loss": 0.7793, "step": 2185 }, { "epoch": 4.988610478359909, "grad_norm": 0.546875, "learning_rate": 0.00011775620883472424, "loss": 0.7818, "step": 2190 }, { "epoch": 5.0, "grad_norm": 0.37890625, "learning_rate": 0.00011736481776669306, "loss": 0.7788, "step": 2195 }, { "epoch": 5.0, "eval_loss": 2.416613817214966, "eval_runtime": 0.2352, "eval_samples_per_second": 42.522, "eval_steps_per_second": 4.252, "step": 2195 }, { "epoch": 5.011389521640091, "grad_norm": 0.43359375, "learning_rate": 0.00011697315222836458, "loss": 0.7695, "step": 2200 }, { "epoch": 5.022779043280182, "grad_norm": 0.37109375, "learning_rate": 0.00011658121841044922, "loss": 0.7684, "step": 2205 }, { "epoch": 5.034168564920273, "grad_norm": 0.380859375, "learning_rate": 0.0001161890225078977, "loss": 0.7578, "step": 2210 }, { "epoch": 5.045558086560365, "grad_norm": 0.392578125, "learning_rate": 0.0001157965707198034, "loss": 0.7611, "step": 2215 }, { "epoch": 5.056947608200455, "grad_norm": 0.40234375, "learning_rate": 0.00011540386924930413, "loss": 0.7611, "step": 2220 }, { "epoch": 5.068337129840547, "grad_norm": 0.5546875, "learning_rate": 0.00011501092430348435, "loss": 0.7644, "step": 2225 }, { "epoch": 5.079726651480637, "grad_norm": 0.4140625, "learning_rate": 0.0001146177420932768, "loss": 0.7635, "step": 2230 }, { "epoch": 5.091116173120729, "grad_norm": 0.3828125, "learning_rate": 0.00011422432883336456, "loss": 0.7639, "step": 2235 }, { "epoch": 5.10250569476082, "grad_norm": 0.474609375, "learning_rate": 0.00011383069074208259, "loss": 0.77, "step": 2240 }, { "epoch": 5.113895216400911, "grad_norm": 0.404296875, "learning_rate": 0.00011343683404131964, "loss": 0.7643, "step": 2245 }, { "epoch": 5.125284738041002, "grad_norm": 0.384765625, "learning_rate": 0.00011304276495641981, "loss": 0.7696, "step": 2250 }, { "epoch": 5.136674259681094, "grad_norm": 0.375, "learning_rate": 0.0001126484897160842, "loss": 0.7619, "step": 2255 }, { "epoch": 5.148063781321184, "grad_norm": 0.412109375, "learning_rate": 0.0001122540145522723, "loss": 0.7612, "step": 2260 }, { "epoch": 5.159453302961276, "grad_norm": 0.38671875, "learning_rate": 0.00011185934570010374, "loss": 0.7596, "step": 2265 }, { "epoch": 5.170842824601367, "grad_norm": 0.373046875, "learning_rate": 0.00011146448939775962, "loss": 0.7652, "step": 2270 }, { "epoch": 5.182232346241458, "grad_norm": 0.390625, "learning_rate": 0.00011106945188638378, "loss": 0.763, "step": 2275 }, { "epoch": 5.193621867881549, "grad_norm": 0.4609375, "learning_rate": 0.00011067423940998438, "loss": 0.7633, "step": 2280 }, { "epoch": 5.20501138952164, "grad_norm": 0.396484375, "learning_rate": 0.00011027885821533508, "loss": 0.7679, "step": 2285 }, { "epoch": 5.216400911161731, "grad_norm": 0.390625, "learning_rate": 0.00010988331455187628, "loss": 0.7661, "step": 2290 }, { "epoch": 5.2277904328018225, "grad_norm": 0.37890625, "learning_rate": 0.00010948761467161637, "loss": 0.7699, "step": 2295 }, { "epoch": 5.239179954441913, "grad_norm": 0.3671875, "learning_rate": 0.00010909176482903295, "loss": 0.7734, "step": 2300 }, { "epoch": 5.2505694760820045, "grad_norm": 0.40625, "learning_rate": 0.00010869577128097404, "loss": 0.7675, "step": 2305 }, { "epoch": 5.261958997722096, "grad_norm": 0.41015625, "learning_rate": 0.00010829964028655885, "loss": 0.7645, "step": 2310 }, { "epoch": 5.2733485193621865, "grad_norm": 0.4609375, "learning_rate": 0.00010790337810707931, "loss": 0.767, "step": 2315 }, { "epoch": 5.284738041002278, "grad_norm": 0.5703125, "learning_rate": 0.00010750699100590076, "loss": 0.7662, "step": 2320 }, { "epoch": 5.296127562642369, "grad_norm": 0.4765625, "learning_rate": 0.00010711048524836311, "loss": 0.7673, "step": 2325 }, { "epoch": 5.30751708428246, "grad_norm": 0.453125, "learning_rate": 0.0001067138671016817, "loss": 0.7666, "step": 2330 }, { "epoch": 5.318906605922551, "grad_norm": 0.37109375, "learning_rate": 0.00010631714283484842, "loss": 0.7687, "step": 2335 }, { "epoch": 5.330296127562642, "grad_norm": 0.37890625, "learning_rate": 0.00010592031871853239, "loss": 0.771, "step": 2340 }, { "epoch": 5.341685649202733, "grad_norm": 0.384765625, "learning_rate": 0.00010552340102498104, "loss": 0.7624, "step": 2345 }, { "epoch": 5.353075170842825, "grad_norm": 0.46875, "learning_rate": 0.00010512639602792088, "loss": 0.7654, "step": 2350 }, { "epoch": 5.364464692482915, "grad_norm": 0.5, "learning_rate": 0.0001047293100024583, "loss": 0.7585, "step": 2355 }, { "epoch": 5.375854214123007, "grad_norm": 0.43359375, "learning_rate": 0.00010433214922498047, "loss": 0.7622, "step": 2360 }, { "epoch": 5.387243735763098, "grad_norm": 0.41796875, "learning_rate": 0.00010393491997305613, "loss": 0.7711, "step": 2365 }, { "epoch": 5.398633257403189, "grad_norm": 0.421875, "learning_rate": 0.0001035376285253363, "loss": 0.7672, "step": 2370 }, { "epoch": 5.41002277904328, "grad_norm": 0.427734375, "learning_rate": 0.00010314028116145509, "loss": 0.7748, "step": 2375 }, { "epoch": 5.421412300683372, "grad_norm": 0.408203125, "learning_rate": 0.00010274288416193034, "loss": 0.7648, "step": 2380 }, { "epoch": 5.432801822323462, "grad_norm": 0.380859375, "learning_rate": 0.00010234544380806461, "loss": 0.7623, "step": 2385 }, { "epoch": 5.444191343963554, "grad_norm": 0.390625, "learning_rate": 0.00010194796638184558, "loss": 0.7707, "step": 2390 }, { "epoch": 5.455580865603645, "grad_norm": 0.439453125, "learning_rate": 0.00010155045816584691, "loss": 0.7629, "step": 2395 }, { "epoch": 5.466970387243736, "grad_norm": 0.92578125, "learning_rate": 0.00010115292544312904, "loss": 0.7728, "step": 2400 }, { "epoch": 5.478359908883827, "grad_norm": 0.421875, "learning_rate": 0.00010075537449713963, "loss": 0.7704, "step": 2405 }, { "epoch": 5.489749430523918, "grad_norm": 0.376953125, "learning_rate": 0.00010035781161161446, "loss": 0.7731, "step": 2410 }, { "epoch": 5.501138952164009, "grad_norm": 0.5, "learning_rate": 9.996024307047798e-05, "loss": 0.7712, "step": 2415 }, { "epoch": 5.5125284738041005, "grad_norm": 0.451171875, "learning_rate": 9.956267515774412e-05, "loss": 0.7647, "step": 2420 }, { "epoch": 5.523917995444191, "grad_norm": 0.408203125, "learning_rate": 9.916511415741676e-05, "loss": 0.7712, "step": 2425 }, { "epoch": 5.5353075170842825, "grad_norm": 0.423828125, "learning_rate": 9.876756635339058e-05, "loss": 0.77, "step": 2430 }, { "epoch": 5.546697038724374, "grad_norm": 0.66015625, "learning_rate": 9.83700380293517e-05, "loss": 0.7637, "step": 2435 }, { "epoch": 5.5580865603644645, "grad_norm": 0.6484375, "learning_rate": 9.797253546867831e-05, "loss": 0.7676, "step": 2440 }, { "epoch": 5.569476082004556, "grad_norm": 0.37890625, "learning_rate": 9.757506495434133e-05, "loss": 0.7601, "step": 2445 }, { "epoch": 5.5808656036446465, "grad_norm": 0.466796875, "learning_rate": 9.71776327688053e-05, "loss": 0.7657, "step": 2450 }, { "epoch": 5.592255125284738, "grad_norm": 0.369140625, "learning_rate": 9.678024519392871e-05, "loss": 0.7737, "step": 2455 }, { "epoch": 5.603644646924829, "grad_norm": 0.455078125, "learning_rate": 9.638290851086518e-05, "loss": 0.7676, "step": 2460 }, { "epoch": 5.61503416856492, "grad_norm": 0.59765625, "learning_rate": 9.598562899996375e-05, "loss": 0.7704, "step": 2465 }, { "epoch": 5.626423690205011, "grad_norm": 0.474609375, "learning_rate": 9.558841294066985e-05, "loss": 0.7631, "step": 2470 }, { "epoch": 5.637813211845103, "grad_norm": 0.451171875, "learning_rate": 9.519126661142597e-05, "loss": 0.7657, "step": 2475 }, { "epoch": 5.649202733485193, "grad_norm": 0.380859375, "learning_rate": 9.479419628957246e-05, "loss": 0.7668, "step": 2480 }, { "epoch": 5.660592255125285, "grad_norm": 0.37109375, "learning_rate": 9.439720825124827e-05, "loss": 0.7604, "step": 2485 }, { "epoch": 5.671981776765376, "grad_norm": 0.380859375, "learning_rate": 9.400030877129176e-05, "loss": 0.7685, "step": 2490 }, { "epoch": 5.683371298405467, "grad_norm": 0.392578125, "learning_rate": 9.360350412314157e-05, "loss": 0.7715, "step": 2495 }, { "epoch": 5.694760820045558, "grad_norm": 0.349609375, "learning_rate": 9.320680057873735e-05, "loss": 0.7628, "step": 2500 }, { "epoch": 5.70615034168565, "grad_norm": 0.369140625, "learning_rate": 9.281020440842079e-05, "loss": 0.7629, "step": 2505 }, { "epoch": 5.71753986332574, "grad_norm": 0.38671875, "learning_rate": 9.241372188083631e-05, "loss": 0.7585, "step": 2510 }, { "epoch": 5.728929384965832, "grad_norm": 0.36328125, "learning_rate": 9.201735926283213e-05, "loss": 0.768, "step": 2515 }, { "epoch": 5.740318906605923, "grad_norm": 0.373046875, "learning_rate": 9.162112281936118e-05, "loss": 0.7658, "step": 2520 }, { "epoch": 5.751708428246014, "grad_norm": 0.388671875, "learning_rate": 9.122501881338199e-05, "loss": 0.7681, "step": 2525 }, { "epoch": 5.763097949886105, "grad_norm": 0.400390625, "learning_rate": 9.082905350575986e-05, "loss": 0.7653, "step": 2530 }, { "epoch": 5.774487471526196, "grad_norm": 0.375, "learning_rate": 9.043323315516775e-05, "loss": 0.7711, "step": 2535 }, { "epoch": 5.785876993166287, "grad_norm": 0.388671875, "learning_rate": 9.003756401798744e-05, "loss": 0.7596, "step": 2540 }, { "epoch": 5.7972665148063784, "grad_norm": 0.384765625, "learning_rate": 8.96420523482106e-05, "loss": 0.7649, "step": 2545 }, { "epoch": 5.808656036446469, "grad_norm": 0.375, "learning_rate": 8.924670439733997e-05, "loss": 0.7686, "step": 2550 }, { "epoch": 5.82004555808656, "grad_norm": 0.3828125, "learning_rate": 8.885152641429049e-05, "loss": 0.771, "step": 2555 }, { "epoch": 5.831435079726651, "grad_norm": 0.44140625, "learning_rate": 8.845652464529057e-05, "loss": 0.7638, "step": 2560 }, { "epoch": 5.842824601366742, "grad_norm": 0.404296875, "learning_rate": 8.806170533378345e-05, "loss": 0.7705, "step": 2565 }, { "epoch": 5.854214123006834, "grad_norm": 0.5234375, "learning_rate": 8.766707472032831e-05, "loss": 0.768, "step": 2570 }, { "epoch": 5.865603644646924, "grad_norm": 0.3828125, "learning_rate": 8.727263904250178e-05, "loss": 0.7626, "step": 2575 }, { "epoch": 5.876993166287016, "grad_norm": 0.63671875, "learning_rate": 8.687840453479938e-05, "loss": 0.7728, "step": 2580 }, { "epoch": 5.888382687927107, "grad_norm": 0.5859375, "learning_rate": 8.648437742853685e-05, "loss": 0.7665, "step": 2585 }, { "epoch": 5.899772209567198, "grad_norm": 0.37109375, "learning_rate": 8.609056395175175e-05, "loss": 0.7613, "step": 2590 }, { "epoch": 5.911161731207289, "grad_norm": 0.40625, "learning_rate": 8.569697032910492e-05, "loss": 0.7712, "step": 2595 }, { "epoch": 5.922551252847381, "grad_norm": 0.443359375, "learning_rate": 8.530360278178227e-05, "loss": 0.7704, "step": 2600 }, { "epoch": 5.933940774487471, "grad_norm": 0.388671875, "learning_rate": 8.491046752739624e-05, "loss": 0.7672, "step": 2605 }, { "epoch": 5.945330296127563, "grad_norm": 0.36328125, "learning_rate": 8.451757077988767e-05, "loss": 0.7701, "step": 2610 }, { "epoch": 5.956719817767654, "grad_norm": 0.40625, "learning_rate": 8.41249187494275e-05, "loss": 0.7666, "step": 2615 }, { "epoch": 5.968109339407745, "grad_norm": 0.58203125, "learning_rate": 8.373251764231872e-05, "loss": 0.7562, "step": 2620 }, { "epoch": 5.979498861047836, "grad_norm": 0.373046875, "learning_rate": 8.334037366089813e-05, "loss": 0.765, "step": 2625 }, { "epoch": 5.990888382687928, "grad_norm": 0.44921875, "learning_rate": 8.294849300343836e-05, "loss": 0.771, "step": 2630 }, { "epoch": 6.0, "eval_loss": 2.432857036590576, "eval_runtime": 0.2435, "eval_samples_per_second": 41.072, "eval_steps_per_second": 4.107, "step": 2634 }, { "epoch": 6.002277904328018, "grad_norm": 0.6328125, "learning_rate": 8.255688186404996e-05, "loss": 0.77, "step": 2635 }, { "epoch": 6.0136674259681095, "grad_norm": 0.56640625, "learning_rate": 8.216554643258342e-05, "loss": 0.748, "step": 2640 }, { "epoch": 6.0250569476082, "grad_norm": 0.58203125, "learning_rate": 8.177449289453134e-05, "loss": 0.7503, "step": 2645 }, { "epoch": 6.0364464692482915, "grad_norm": 0.5, "learning_rate": 8.138372743093076e-05, "loss": 0.7419, "step": 2650 }, { "epoch": 6.047835990888383, "grad_norm": 0.45703125, "learning_rate": 8.099325621826526e-05, "loss": 0.7518, "step": 2655 }, { "epoch": 6.0592255125284735, "grad_norm": 0.37890625, "learning_rate": 8.060308542836755e-05, "loss": 0.76, "step": 2660 }, { "epoch": 6.070615034168565, "grad_norm": 0.5546875, "learning_rate": 8.021322122832178e-05, "loss": 0.7567, "step": 2665 }, { "epoch": 6.082004555808656, "grad_norm": 0.5625, "learning_rate": 7.982366978036618e-05, "loss": 0.7548, "step": 2670 }, { "epoch": 6.093394077448747, "grad_norm": 0.3671875, "learning_rate": 7.943443724179548e-05, "loss": 0.7557, "step": 2675 }, { "epoch": 6.104783599088838, "grad_norm": 0.41796875, "learning_rate": 7.904552976486372e-05, "loss": 0.7571, "step": 2680 }, { "epoch": 6.116173120728929, "grad_norm": 0.376953125, "learning_rate": 7.865695349668703e-05, "loss": 0.7572, "step": 2685 }, { "epoch": 6.12756264236902, "grad_norm": 0.388671875, "learning_rate": 7.826871457914639e-05, "loss": 0.7415, "step": 2690 }, { "epoch": 6.138952164009112, "grad_norm": 0.494140625, "learning_rate": 7.788081914879051e-05, "loss": 0.7547, "step": 2695 }, { "epoch": 6.150341685649202, "grad_norm": 0.408203125, "learning_rate": 7.7493273336739e-05, "loss": 0.7431, "step": 2700 }, { "epoch": 6.161731207289294, "grad_norm": 0.50390625, "learning_rate": 7.710608326858535e-05, "loss": 0.7555, "step": 2705 }, { "epoch": 6.173120728929385, "grad_norm": 0.421875, "learning_rate": 7.67192550643001e-05, "loss": 0.7472, "step": 2710 }, { "epoch": 6.184510250569476, "grad_norm": 0.396484375, "learning_rate": 7.633279483813405e-05, "loss": 0.7569, "step": 2715 }, { "epoch": 6.195899772209567, "grad_norm": 0.3828125, "learning_rate": 7.594670869852185e-05, "loss": 0.7494, "step": 2720 }, { "epoch": 6.207289293849659, "grad_norm": 0.384765625, "learning_rate": 7.556100274798519e-05, "loss": 0.7532, "step": 2725 }, { "epoch": 6.218678815489749, "grad_norm": 0.392578125, "learning_rate": 7.517568308303643e-05, "loss": 0.7581, "step": 2730 }, { "epoch": 6.230068337129841, "grad_norm": 0.390625, "learning_rate": 7.47907557940824e-05, "loss": 0.7569, "step": 2735 }, { "epoch": 6.241457858769932, "grad_norm": 0.38671875, "learning_rate": 7.440622696532775e-05, "loss": 0.7549, "step": 2740 }, { "epoch": 6.252847380410023, "grad_norm": 0.42578125, "learning_rate": 7.402210267467928e-05, "loss": 0.7478, "step": 2745 }, { "epoch": 6.264236902050114, "grad_norm": 0.4609375, "learning_rate": 7.363838899364944e-05, "loss": 0.7515, "step": 2750 }, { "epoch": 6.275626423690205, "grad_norm": 0.392578125, "learning_rate": 7.325509198726064e-05, "loss": 0.7471, "step": 2755 }, { "epoch": 6.287015945330296, "grad_norm": 0.408203125, "learning_rate": 7.287221771394917e-05, "loss": 0.7565, "step": 2760 }, { "epoch": 6.2984054669703875, "grad_norm": 0.462890625, "learning_rate": 7.248977222546968e-05, "loss": 0.7572, "step": 2765 }, { "epoch": 6.309794988610478, "grad_norm": 0.486328125, "learning_rate": 7.210776156679931e-05, "loss": 0.7442, "step": 2770 }, { "epoch": 6.3211845102505695, "grad_norm": 0.45703125, "learning_rate": 7.172619177604223e-05, "loss": 0.7615, "step": 2775 }, { "epoch": 6.332574031890661, "grad_norm": 0.380859375, "learning_rate": 7.134506888433426e-05, "loss": 0.7538, "step": 2780 }, { "epoch": 6.3439635535307515, "grad_norm": 0.466796875, "learning_rate": 7.096439891574745e-05, "loss": 0.7445, "step": 2785 }, { "epoch": 6.355353075170843, "grad_norm": 0.3828125, "learning_rate": 7.058418788719491e-05, "loss": 0.7593, "step": 2790 }, { "epoch": 6.366742596810934, "grad_norm": 0.392578125, "learning_rate": 7.020444180833564e-05, "loss": 0.7603, "step": 2795 }, { "epoch": 6.378132118451025, "grad_norm": 0.412109375, "learning_rate": 6.982516668147967e-05, "loss": 0.7544, "step": 2800 }, { "epoch": 6.389521640091116, "grad_norm": 0.3984375, "learning_rate": 6.944636850149306e-05, "loss": 0.7508, "step": 2805 }, { "epoch": 6.400911161731207, "grad_norm": 0.380859375, "learning_rate": 6.906805325570316e-05, "loss": 0.7587, "step": 2810 }, { "epoch": 6.412300683371298, "grad_norm": 0.361328125, "learning_rate": 6.869022692380411e-05, "loss": 0.7456, "step": 2815 }, { "epoch": 6.42369020501139, "grad_norm": 0.37109375, "learning_rate": 6.831289547776207e-05, "loss": 0.7541, "step": 2820 }, { "epoch": 6.43507972665148, "grad_norm": 0.4765625, "learning_rate": 6.793606488172118e-05, "loss": 0.7477, "step": 2825 }, { "epoch": 6.446469248291572, "grad_norm": 0.400390625, "learning_rate": 6.75597410919089e-05, "loss": 0.7508, "step": 2830 }, { "epoch": 6.457858769931663, "grad_norm": 0.44140625, "learning_rate": 6.718393005654215e-05, "loss": 0.7583, "step": 2835 }, { "epoch": 6.469248291571754, "grad_norm": 0.3984375, "learning_rate": 6.680863771573318e-05, "loss": 0.7533, "step": 2840 }, { "epoch": 6.480637813211845, "grad_norm": 0.376953125, "learning_rate": 6.643387000139565e-05, "loss": 0.7506, "step": 2845 }, { "epoch": 6.492027334851937, "grad_norm": 0.43359375, "learning_rate": 6.6059632837151e-05, "loss": 0.7556, "step": 2850 }, { "epoch": 6.503416856492027, "grad_norm": 0.373046875, "learning_rate": 6.568593213823465e-05, "loss": 0.7505, "step": 2855 }, { "epoch": 6.514806378132119, "grad_norm": 0.3828125, "learning_rate": 6.53127738114026e-05, "loss": 0.7499, "step": 2860 }, { "epoch": 6.52619589977221, "grad_norm": 0.419921875, "learning_rate": 6.494016375483811e-05, "loss": 0.7614, "step": 2865 }, { "epoch": 6.537585421412301, "grad_norm": 0.37109375, "learning_rate": 6.456810785805842e-05, "loss": 0.7412, "step": 2870 }, { "epoch": 6.548974943052392, "grad_norm": 0.38671875, "learning_rate": 6.419661200182158e-05, "loss": 0.7612, "step": 2875 }, { "epoch": 6.560364464692483, "grad_norm": 0.3828125, "learning_rate": 6.38256820580336e-05, "loss": 0.7529, "step": 2880 }, { "epoch": 6.571753986332574, "grad_norm": 0.5234375, "learning_rate": 6.345532388965565e-05, "loss": 0.7536, "step": 2885 }, { "epoch": 6.5831435079726655, "grad_norm": 0.427734375, "learning_rate": 6.308554335061135e-05, "loss": 0.7533, "step": 2890 }, { "epoch": 6.594533029612756, "grad_norm": 0.412109375, "learning_rate": 6.271634628569418e-05, "loss": 0.7502, "step": 2895 }, { "epoch": 6.605922551252847, "grad_norm": 0.40625, "learning_rate": 6.234773853047526e-05, "loss": 0.7513, "step": 2900 }, { "epoch": 6.617312072892939, "grad_norm": 0.373046875, "learning_rate": 6.19797259112109e-05, "loss": 0.7542, "step": 2905 }, { "epoch": 6.628701594533029, "grad_norm": 0.44921875, "learning_rate": 6.161231424475075e-05, "loss": 0.7499, "step": 2910 }, { "epoch": 6.640091116173121, "grad_norm": 0.51171875, "learning_rate": 6.124550933844562e-05, "loss": 0.7522, "step": 2915 }, { "epoch": 6.651480637813211, "grad_norm": 0.455078125, "learning_rate": 6.087931699005588e-05, "loss": 0.7573, "step": 2920 }, { "epoch": 6.662870159453303, "grad_norm": 0.56640625, "learning_rate": 6.0513742987659686e-05, "loss": 0.7557, "step": 2925 }, { "epoch": 6.674259681093394, "grad_norm": 0.451171875, "learning_rate": 6.014879310956154e-05, "loss": 0.7554, "step": 2930 }, { "epoch": 6.685649202733485, "grad_norm": 0.404296875, "learning_rate": 5.978447312420103e-05, "loss": 0.7524, "step": 2935 }, { "epoch": 6.697038724373576, "grad_norm": 0.515625, "learning_rate": 5.9420788790061544e-05, "loss": 0.7571, "step": 2940 }, { "epoch": 6.708428246013668, "grad_norm": 0.3984375, "learning_rate": 5.905774585557922e-05, "loss": 0.7512, "step": 2945 }, { "epoch": 6.719817767653758, "grad_norm": 0.375, "learning_rate": 5.869535005905232e-05, "loss": 0.747, "step": 2950 }, { "epoch": 6.73120728929385, "grad_norm": 0.439453125, "learning_rate": 5.833360712855029e-05, "loss": 0.7562, "step": 2955 }, { "epoch": 6.742596810933941, "grad_norm": 0.5, "learning_rate": 5.7972522781823256e-05, "loss": 0.752, "step": 2960 }, { "epoch": 6.753986332574032, "grad_norm": 0.37109375, "learning_rate": 5.761210272621175e-05, "loss": 0.7494, "step": 2965 }, { "epoch": 6.765375854214123, "grad_norm": 0.375, "learning_rate": 5.7252352658556376e-05, "loss": 0.7533, "step": 2970 }, { "epoch": 6.776765375854215, "grad_norm": 0.3984375, "learning_rate": 5.689327826510796e-05, "loss": 0.7486, "step": 2975 }, { "epoch": 6.788154897494305, "grad_norm": 0.3671875, "learning_rate": 5.653488522143744e-05, "loss": 0.7489, "step": 2980 }, { "epoch": 6.7995444191343966, "grad_norm": 0.4140625, "learning_rate": 5.617717919234624e-05, "loss": 0.7518, "step": 2985 }, { "epoch": 6.810933940774487, "grad_norm": 0.453125, "learning_rate": 5.582016583177687e-05, "loss": 0.7607, "step": 2990 }, { "epoch": 6.8223234624145785, "grad_norm": 0.48046875, "learning_rate": 5.5463850782723346e-05, "loss": 0.7518, "step": 2995 }, { "epoch": 6.83371298405467, "grad_norm": 0.42578125, "learning_rate": 5.5108239677142115e-05, "loss": 0.7445, "step": 3000 }, { "epoch": 6.8451025056947605, "grad_norm": 0.3828125, "learning_rate": 5.475333813586297e-05, "loss": 0.7496, "step": 3005 }, { "epoch": 6.856492027334852, "grad_norm": 0.392578125, "learning_rate": 5.439915176850037e-05, "loss": 0.7585, "step": 3010 }, { "epoch": 6.867881548974943, "grad_norm": 0.3671875, "learning_rate": 5.404568617336456e-05, "loss": 0.7485, "step": 3015 }, { "epoch": 6.879271070615034, "grad_norm": 0.39453125, "learning_rate": 5.369294693737319e-05, "loss": 0.755, "step": 3020 }, { "epoch": 6.890660592255125, "grad_norm": 0.50390625, "learning_rate": 5.334093963596294e-05, "loss": 0.7556, "step": 3025 }, { "epoch": 6.902050113895216, "grad_norm": 0.52734375, "learning_rate": 5.298966983300161e-05, "loss": 0.7474, "step": 3030 }, { "epoch": 6.913439635535307, "grad_norm": 0.396484375, "learning_rate": 5.263914308069986e-05, "loss": 0.7531, "step": 3035 }, { "epoch": 6.924829157175399, "grad_norm": 0.390625, "learning_rate": 5.228936491952363e-05, "loss": 0.7501, "step": 3040 }, { "epoch": 6.936218678815489, "grad_norm": 0.3671875, "learning_rate": 5.194034087810665e-05, "loss": 0.7469, "step": 3045 }, { "epoch": 6.947608200455581, "grad_norm": 0.392578125, "learning_rate": 5.159207647316282e-05, "loss": 0.7559, "step": 3050 }, { "epoch": 6.958997722095672, "grad_norm": 0.37890625, "learning_rate": 5.12445772093992e-05, "loss": 0.748, "step": 3055 }, { "epoch": 6.970387243735763, "grad_norm": 0.3671875, "learning_rate": 5.089784857942892e-05, "loss": 0.7525, "step": 3060 }, { "epoch": 6.981776765375854, "grad_norm": 0.353515625, "learning_rate": 5.055189606368436e-05, "loss": 0.7544, "step": 3065 }, { "epoch": 6.993166287015946, "grad_norm": 0.423828125, "learning_rate": 5.020672513033066e-05, "loss": 0.7459, "step": 3070 }, { "epoch": 7.0, "eval_loss": 2.445798397064209, "eval_runtime": 0.2348, "eval_samples_per_second": 42.591, "eval_steps_per_second": 4.259, "step": 3073 }, { "epoch": 7.004555808656036, "grad_norm": 0.4296875, "learning_rate": 4.9862341235179014e-05, "loss": 0.7506, "step": 3075 }, { "epoch": 7.015945330296128, "grad_norm": 0.466796875, "learning_rate": 4.951874982160079e-05, "loss": 0.7452, "step": 3080 }, { "epoch": 7.027334851936219, "grad_norm": 0.4296875, "learning_rate": 4.917595632044113e-05, "loss": 0.7421, "step": 3085 }, { "epoch": 7.03872437357631, "grad_norm": 0.373046875, "learning_rate": 4.8833966149933364e-05, "loss": 0.744, "step": 3090 }, { "epoch": 7.050113895216401, "grad_norm": 0.376953125, "learning_rate": 4.849278471561328e-05, "loss": 0.7418, "step": 3095 }, { "epoch": 7.061503416856492, "grad_norm": 0.373046875, "learning_rate": 4.815241741023367e-05, "loss": 0.7488, "step": 3100 }, { "epoch": 7.072892938496583, "grad_norm": 0.396484375, "learning_rate": 4.7812869613679103e-05, "loss": 0.7373, "step": 3105 }, { "epoch": 7.0842824601366745, "grad_norm": 0.37890625, "learning_rate": 4.747414669288094e-05, "loss": 0.7441, "step": 3110 }, { "epoch": 7.095671981776765, "grad_norm": 0.388671875, "learning_rate": 4.713625400173247e-05, "loss": 0.7439, "step": 3115 }, { "epoch": 7.1070615034168565, "grad_norm": 0.388671875, "learning_rate": 4.679919688100423e-05, "loss": 0.7471, "step": 3120 }, { "epoch": 7.118451025056948, "grad_norm": 0.427734375, "learning_rate": 4.6462980658259625e-05, "loss": 0.7476, "step": 3125 }, { "epoch": 7.1298405466970385, "grad_norm": 0.384765625, "learning_rate": 4.6127610647770767e-05, "loss": 0.7365, "step": 3130 }, { "epoch": 7.14123006833713, "grad_norm": 0.427734375, "learning_rate": 4.5793092150434405e-05, "loss": 0.7422, "step": 3135 }, { "epoch": 7.152619589977221, "grad_norm": 0.4375, "learning_rate": 4.545943045368826e-05, "loss": 0.7483, "step": 3140 }, { "epoch": 7.164009111617312, "grad_norm": 0.390625, "learning_rate": 4.5126630831427264e-05, "loss": 0.743, "step": 3145 }, { "epoch": 7.175398633257403, "grad_norm": 0.400390625, "learning_rate": 4.479469854392031e-05, "loss": 0.7414, "step": 3150 }, { "epoch": 7.186788154897494, "grad_norm": 0.443359375, "learning_rate": 4.4463638837727196e-05, "loss": 0.7382, "step": 3155 }, { "epoch": 7.198177676537585, "grad_norm": 0.416015625, "learning_rate": 4.413345694561549e-05, "loss": 0.7365, "step": 3160 }, { "epoch": 7.209567198177677, "grad_norm": 0.390625, "learning_rate": 4.3804158086477986e-05, "loss": 0.7412, "step": 3165 }, { "epoch": 7.220956719817767, "grad_norm": 0.375, "learning_rate": 4.34757474652501e-05, "loss": 0.7491, "step": 3170 }, { "epoch": 7.232346241457859, "grad_norm": 0.37890625, "learning_rate": 4.3148230272827784e-05, "loss": 0.7452, "step": 3175 }, { "epoch": 7.24373576309795, "grad_norm": 0.369140625, "learning_rate": 4.282161168598523e-05, "loss": 0.7496, "step": 3180 }, { "epoch": 7.255125284738041, "grad_norm": 0.375, "learning_rate": 4.249589686729319e-05, "loss": 0.7409, "step": 3185 }, { "epoch": 7.266514806378132, "grad_norm": 0.38671875, "learning_rate": 4.217109096503736e-05, "loss": 0.742, "step": 3190 }, { "epoch": 7.277904328018224, "grad_norm": 0.36328125, "learning_rate": 4.184719911313707e-05, "loss": 0.7367, "step": 3195 }, { "epoch": 7.289293849658314, "grad_norm": 0.369140625, "learning_rate": 4.152422643106396e-05, "loss": 0.7467, "step": 3200 }, { "epoch": 7.300683371298406, "grad_norm": 0.38671875, "learning_rate": 4.1202178023761195e-05, "loss": 0.7416, "step": 3205 }, { "epoch": 7.312072892938497, "grad_norm": 0.40234375, "learning_rate": 4.088105898156282e-05, "loss": 0.7483, "step": 3210 }, { "epoch": 7.323462414578588, "grad_norm": 0.388671875, "learning_rate": 4.0560874380113146e-05, "loss": 0.7444, "step": 3215 }, { "epoch": 7.334851936218679, "grad_norm": 0.373046875, "learning_rate": 4.024162928028663e-05, "loss": 0.7417, "step": 3220 }, { "epoch": 7.34624145785877, "grad_norm": 0.37109375, "learning_rate": 3.9923328728107856e-05, "loss": 0.743, "step": 3225 }, { "epoch": 7.357630979498861, "grad_norm": 0.447265625, "learning_rate": 3.960597775467177e-05, "loss": 0.7482, "step": 3230 }, { "epoch": 7.3690205011389525, "grad_norm": 0.404296875, "learning_rate": 3.928958137606421e-05, "loss": 0.7473, "step": 3235 }, { "epoch": 7.380410022779043, "grad_norm": 0.375, "learning_rate": 3.8974144593282534e-05, "loss": 0.7429, "step": 3240 }, { "epoch": 7.3917995444191344, "grad_norm": 0.376953125, "learning_rate": 3.865967239215667e-05, "loss": 0.7481, "step": 3245 }, { "epoch": 7.403189066059226, "grad_norm": 0.37109375, "learning_rate": 3.834616974327021e-05, "loss": 0.7445, "step": 3250 }, { "epoch": 7.414578587699316, "grad_norm": 0.3828125, "learning_rate": 3.80336416018819e-05, "loss": 0.7431, "step": 3255 }, { "epoch": 7.425968109339408, "grad_norm": 0.431640625, "learning_rate": 3.7722092907847305e-05, "loss": 0.7394, "step": 3260 }, { "epoch": 7.437357630979498, "grad_norm": 0.3828125, "learning_rate": 3.741152858554077e-05, "loss": 0.744, "step": 3265 }, { "epoch": 7.44874715261959, "grad_norm": 0.375, "learning_rate": 3.710195354377747e-05, "loss": 0.7408, "step": 3270 }, { "epoch": 7.460136674259681, "grad_norm": 0.3984375, "learning_rate": 3.679337267573597e-05, "loss": 0.7361, "step": 3275 }, { "epoch": 7.471526195899772, "grad_norm": 0.390625, "learning_rate": 3.648579085888085e-05, "loss": 0.7467, "step": 3280 }, { "epoch": 7.482915717539863, "grad_norm": 0.37109375, "learning_rate": 3.6179212954885477e-05, "loss": 0.738, "step": 3285 }, { "epoch": 7.494305239179955, "grad_norm": 0.375, "learning_rate": 3.587364380955529e-05, "loss": 0.7475, "step": 3290 }, { "epoch": 7.505694760820045, "grad_norm": 0.416015625, "learning_rate": 3.556908825275117e-05, "loss": 0.7434, "step": 3295 }, { "epoch": 7.517084282460137, "grad_norm": 0.373046875, "learning_rate": 3.526555109831311e-05, "loss": 0.7477, "step": 3300 }, { "epoch": 7.528473804100228, "grad_norm": 0.38671875, "learning_rate": 3.4963037143984087e-05, "loss": 0.7413, "step": 3305 }, { "epoch": 7.539863325740319, "grad_norm": 0.44921875, "learning_rate": 3.466155117133433e-05, "loss": 0.748, "step": 3310 }, { "epoch": 7.55125284738041, "grad_norm": 0.390625, "learning_rate": 3.436109794568565e-05, "loss": 0.7444, "step": 3315 }, { "epoch": 7.562642369020502, "grad_norm": 0.392578125, "learning_rate": 3.406168221603611e-05, "loss": 0.7387, "step": 3320 }, { "epoch": 7.574031890660592, "grad_norm": 0.376953125, "learning_rate": 3.3763308714984974e-05, "loss": 0.7436, "step": 3325 }, { "epoch": 7.585421412300684, "grad_norm": 0.43359375, "learning_rate": 3.3465982158657984e-05, "loss": 0.7413, "step": 3330 }, { "epoch": 7.596810933940774, "grad_norm": 0.376953125, "learning_rate": 3.3169707246632705e-05, "loss": 0.7423, "step": 3335 }, { "epoch": 7.6082004555808656, "grad_norm": 0.369140625, "learning_rate": 3.287448866186428e-05, "loss": 0.7392, "step": 3340 }, { "epoch": 7.619589977220957, "grad_norm": 0.37890625, "learning_rate": 3.258033107061153e-05, "loss": 0.7461, "step": 3345 }, { "epoch": 7.6309794988610475, "grad_norm": 0.404296875, "learning_rate": 3.228723912236291e-05, "loss": 0.7365, "step": 3350 }, { "epoch": 7.642369020501139, "grad_norm": 0.390625, "learning_rate": 3.199521744976342e-05, "loss": 0.7414, "step": 3355 }, { "epoch": 7.65375854214123, "grad_norm": 0.400390625, "learning_rate": 3.170427066854096e-05, "loss": 0.7548, "step": 3360 }, { "epoch": 7.665148063781321, "grad_norm": 0.365234375, "learning_rate": 3.141440337743369e-05, "loss": 0.739, "step": 3365 }, { "epoch": 7.676537585421412, "grad_norm": 0.384765625, "learning_rate": 3.1125620158117186e-05, "loss": 0.7505, "step": 3370 }, { "epoch": 7.687927107061503, "grad_norm": 0.375, "learning_rate": 3.0837925575132024e-05, "loss": 0.7487, "step": 3375 }, { "epoch": 7.699316628701594, "grad_norm": 0.447265625, "learning_rate": 3.055132417581179e-05, "loss": 0.7427, "step": 3380 }, { "epoch": 7.710706150341686, "grad_norm": 0.3984375, "learning_rate": 3.0265820490210973e-05, "loss": 0.7384, "step": 3385 }, { "epoch": 7.722095671981776, "grad_norm": 0.400390625, "learning_rate": 2.9981419031033498e-05, "loss": 0.7402, "step": 3390 }, { "epoch": 7.733485193621868, "grad_norm": 0.453125, "learning_rate": 2.9698124293561357e-05, "loss": 0.7485, "step": 3395 }, { "epoch": 7.744874715261959, "grad_norm": 0.421875, "learning_rate": 2.941594075558366e-05, "loss": 0.7505, "step": 3400 }, { "epoch": 7.75626423690205, "grad_norm": 0.376953125, "learning_rate": 2.913487287732565e-05, "loss": 0.7446, "step": 3405 }, { "epoch": 7.767653758542141, "grad_norm": 0.41015625, "learning_rate": 2.8854925101378438e-05, "loss": 0.7461, "step": 3410 }, { "epoch": 7.779043280182233, "grad_norm": 0.5078125, "learning_rate": 2.857610185262859e-05, "loss": 0.75, "step": 3415 }, { "epoch": 7.790432801822323, "grad_norm": 0.37890625, "learning_rate": 2.8298407538188288e-05, "loss": 0.7469, "step": 3420 }, { "epoch": 7.801822323462415, "grad_norm": 0.37890625, "learning_rate": 2.8021846547325635e-05, "loss": 0.7437, "step": 3425 }, { "epoch": 7.813211845102506, "grad_norm": 0.416015625, "learning_rate": 2.774642325139535e-05, "loss": 0.7408, "step": 3430 }, { "epoch": 7.824601366742597, "grad_norm": 0.380859375, "learning_rate": 2.7472142003769495e-05, "loss": 0.7431, "step": 3435 }, { "epoch": 7.835990888382688, "grad_norm": 0.375, "learning_rate": 2.7199007139768928e-05, "loss": 0.7475, "step": 3440 }, { "epoch": 7.8473804100227795, "grad_norm": 0.375, "learning_rate": 2.6927022976594607e-05, "loss": 0.7371, "step": 3445 }, { "epoch": 7.85876993166287, "grad_norm": 0.390625, "learning_rate": 2.665619381325929e-05, "loss": 0.7477, "step": 3450 }, { "epoch": 7.8701594533029615, "grad_norm": 0.46484375, "learning_rate": 2.638652393051976e-05, "loss": 0.7433, "step": 3455 }, { "epoch": 7.881548974943052, "grad_norm": 0.4140625, "learning_rate": 2.6118017590809017e-05, "loss": 0.7401, "step": 3460 }, { "epoch": 7.8929384965831435, "grad_norm": 0.3828125, "learning_rate": 2.5850679038169045e-05, "loss": 0.7415, "step": 3465 }, { "epoch": 7.904328018223235, "grad_norm": 0.388671875, "learning_rate": 2.5584512498183544e-05, "loss": 0.7309, "step": 3470 }, { "epoch": 7.9157175398633255, "grad_norm": 0.39453125, "learning_rate": 2.531952217791136e-05, "loss": 0.7422, "step": 3475 }, { "epoch": 7.927107061503417, "grad_norm": 0.380859375, "learning_rate": 2.505571226581984e-05, "loss": 0.7434, "step": 3480 }, { "epoch": 7.9384965831435075, "grad_norm": 0.3671875, "learning_rate": 2.4793086931718634e-05, "loss": 0.7451, "step": 3485 }, { "epoch": 7.949886104783599, "grad_norm": 0.37890625, "learning_rate": 2.4531650326693822e-05, "loss": 0.7455, "step": 3490 }, { "epoch": 7.96127562642369, "grad_norm": 0.4140625, "learning_rate": 2.4271406583042335e-05, "loss": 0.7393, "step": 3495 }, { "epoch": 7.972665148063781, "grad_norm": 0.373046875, "learning_rate": 2.401235981420653e-05, "loss": 0.7443, "step": 3500 }, { "epoch": 7.984054669703872, "grad_norm": 0.3828125, "learning_rate": 2.3754514114709304e-05, "loss": 0.7429, "step": 3505 }, { "epoch": 7.995444191343964, "grad_norm": 0.400390625, "learning_rate": 2.3497873560089322e-05, "loss": 0.745, "step": 3510 }, { "epoch": 8.0, "eval_loss": 2.460949420928955, "eval_runtime": 0.2436, "eval_samples_per_second": 41.058, "eval_steps_per_second": 4.106, "step": 3512 }, { "epoch": 8.006833712984054, "grad_norm": 0.37109375, "learning_rate": 2.3242442206836523e-05, "loss": 0.749, "step": 3515 }, { "epoch": 8.018223234624147, "grad_norm": 0.361328125, "learning_rate": 2.298822409232817e-05, "loss": 0.7467, "step": 3520 }, { "epoch": 8.029612756264237, "grad_norm": 0.41015625, "learning_rate": 2.2735223234764846e-05, "loss": 0.735, "step": 3525 }, { "epoch": 8.041002277904328, "grad_norm": 0.3984375, "learning_rate": 2.2483443633107058e-05, "loss": 0.7392, "step": 3530 }, { "epoch": 8.052391799544418, "grad_norm": 0.453125, "learning_rate": 2.2232889267012038e-05, "loss": 0.7377, "step": 3535 }, { "epoch": 8.06378132118451, "grad_norm": 0.384765625, "learning_rate": 2.1983564096770725e-05, "loss": 0.748, "step": 3540 }, { "epoch": 8.075170842824601, "grad_norm": 0.38671875, "learning_rate": 2.1735472063245354e-05, "loss": 0.7325, "step": 3545 }, { "epoch": 8.086560364464692, "grad_norm": 0.384765625, "learning_rate": 2.1488617087806982e-05, "loss": 0.7372, "step": 3550 }, { "epoch": 8.097949886104784, "grad_norm": 0.388671875, "learning_rate": 2.1243003072273582e-05, "loss": 0.7391, "step": 3555 }, { "epoch": 8.109339407744875, "grad_norm": 0.376953125, "learning_rate": 2.0998633898848442e-05, "loss": 0.7467, "step": 3560 }, { "epoch": 8.120728929384965, "grad_norm": 0.375, "learning_rate": 2.0755513430058672e-05, "loss": 0.7418, "step": 3565 }, { "epoch": 8.132118451025057, "grad_norm": 0.376953125, "learning_rate": 2.0513645508694225e-05, "loss": 0.7383, "step": 3570 }, { "epoch": 8.143507972665148, "grad_norm": 0.376953125, "learning_rate": 2.0273033957747134e-05, "loss": 0.7417, "step": 3575 }, { "epoch": 8.154897494305239, "grad_norm": 0.400390625, "learning_rate": 2.0033682580351144e-05, "loss": 0.7386, "step": 3580 }, { "epoch": 8.166287015945331, "grad_norm": 0.384765625, "learning_rate": 1.9795595159721524e-05, "loss": 0.7449, "step": 3585 }, { "epoch": 8.177676537585421, "grad_norm": 0.3984375, "learning_rate": 1.955877545909528e-05, "loss": 0.7429, "step": 3590 }, { "epoch": 8.189066059225512, "grad_norm": 0.3984375, "learning_rate": 1.932322722167168e-05, "loss": 0.7391, "step": 3595 }, { "epoch": 8.200455580865604, "grad_norm": 0.376953125, "learning_rate": 1.9088954170553198e-05, "loss": 0.7389, "step": 3600 }, { "epoch": 8.211845102505695, "grad_norm": 0.357421875, "learning_rate": 1.8855960008686446e-05, "loss": 0.7406, "step": 3605 }, { "epoch": 8.223234624145785, "grad_norm": 0.373046875, "learning_rate": 1.86242484188038e-05, "loss": 0.736, "step": 3610 }, { "epoch": 8.234624145785878, "grad_norm": 0.369140625, "learning_rate": 1.8393823063365223e-05, "loss": 0.7449, "step": 3615 }, { "epoch": 8.246013667425968, "grad_norm": 0.373046875, "learning_rate": 1.816468758450024e-05, "loss": 0.7346, "step": 3620 }, { "epoch": 8.257403189066059, "grad_norm": 0.380859375, "learning_rate": 1.7936845603950447e-05, "loss": 0.743, "step": 3625 }, { "epoch": 8.268792710706151, "grad_norm": 0.388671875, "learning_rate": 1.7710300723012262e-05, "loss": 0.7421, "step": 3630 }, { "epoch": 8.280182232346242, "grad_norm": 0.380859375, "learning_rate": 1.7485056522480004e-05, "loss": 0.7365, "step": 3635 }, { "epoch": 8.291571753986332, "grad_norm": 0.384765625, "learning_rate": 1.726111656258932e-05, "loss": 0.741, "step": 3640 }, { "epoch": 8.302961275626423, "grad_norm": 0.369140625, "learning_rate": 1.7038484382960796e-05, "loss": 0.736, "step": 3645 }, { "epoch": 8.314350797266515, "grad_norm": 0.3671875, "learning_rate": 1.6817163502544208e-05, "loss": 0.7342, "step": 3650 }, { "epoch": 8.325740318906606, "grad_norm": 0.38671875, "learning_rate": 1.6597157419562703e-05, "loss": 0.7331, "step": 3655 }, { "epoch": 8.337129840546696, "grad_norm": 0.376953125, "learning_rate": 1.6378469611457592e-05, "loss": 0.7375, "step": 3660 }, { "epoch": 8.348519362186789, "grad_norm": 0.384765625, "learning_rate": 1.6161103534833423e-05, "loss": 0.7431, "step": 3665 }, { "epoch": 8.35990888382688, "grad_norm": 0.37890625, "learning_rate": 1.594506262540324e-05, "loss": 0.7431, "step": 3670 }, { "epoch": 8.37129840546697, "grad_norm": 0.390625, "learning_rate": 1.5730350297934448e-05, "loss": 0.7392, "step": 3675 }, { "epoch": 8.382687927107062, "grad_norm": 0.39453125, "learning_rate": 1.5516969946194626e-05, "loss": 0.7355, "step": 3680 }, { "epoch": 8.394077448747153, "grad_norm": 0.37890625, "learning_rate": 1.5304924942898068e-05, "loss": 0.7388, "step": 3685 }, { "epoch": 8.405466970387243, "grad_norm": 0.373046875, "learning_rate": 1.509421863965237e-05, "loss": 0.7441, "step": 3690 }, { "epoch": 8.416856492027335, "grad_norm": 0.37890625, "learning_rate": 1.4884854366905455e-05, "loss": 0.7324, "step": 3695 }, { "epoch": 8.428246013667426, "grad_norm": 0.369140625, "learning_rate": 1.4676835433892989e-05, "loss": 0.7403, "step": 3700 }, { "epoch": 8.439635535307517, "grad_norm": 0.384765625, "learning_rate": 1.4470165128586022e-05, "loss": 0.7422, "step": 3705 }, { "epoch": 8.451025056947609, "grad_norm": 0.388671875, "learning_rate": 1.4264846717639102e-05, "loss": 0.7403, "step": 3710 }, { "epoch": 8.4624145785877, "grad_norm": 0.3828125, "learning_rate": 1.4060883446338502e-05, "loss": 0.7358, "step": 3715 }, { "epoch": 8.47380410022779, "grad_norm": 0.40234375, "learning_rate": 1.3858278538551018e-05, "loss": 0.7384, "step": 3720 }, { "epoch": 8.485193621867882, "grad_norm": 0.3828125, "learning_rate": 1.3657035196673052e-05, "loss": 0.7365, "step": 3725 }, { "epoch": 8.496583143507973, "grad_norm": 0.376953125, "learning_rate": 1.345715660157989e-05, "loss": 0.7389, "step": 3730 }, { "epoch": 8.507972665148063, "grad_norm": 0.369140625, "learning_rate": 1.3258645912575484e-05, "loss": 0.7459, "step": 3735 }, { "epoch": 8.519362186788156, "grad_norm": 0.37890625, "learning_rate": 1.3061506267342472e-05, "loss": 0.7482, "step": 3740 }, { "epoch": 8.530751708428246, "grad_norm": 0.3828125, "learning_rate": 1.2865740781892699e-05, "loss": 0.7441, "step": 3745 }, { "epoch": 8.542141230068337, "grad_norm": 0.373046875, "learning_rate": 1.2671352550517823e-05, "loss": 0.7379, "step": 3750 }, { "epoch": 8.55353075170843, "grad_norm": 0.375, "learning_rate": 1.2478344645740469e-05, "loss": 0.7386, "step": 3755 }, { "epoch": 8.56492027334852, "grad_norm": 0.376953125, "learning_rate": 1.2286720118265659e-05, "loss": 0.7435, "step": 3760 }, { "epoch": 8.57630979498861, "grad_norm": 0.39453125, "learning_rate": 1.209648199693264e-05, "loss": 0.7412, "step": 3765 }, { "epoch": 8.5876993166287, "grad_norm": 0.380859375, "learning_rate": 1.190763328866693e-05, "loss": 0.7414, "step": 3770 }, { "epoch": 8.599088838268793, "grad_norm": 0.380859375, "learning_rate": 1.1720176978432795e-05, "loss": 0.7393, "step": 3775 }, { "epoch": 8.610478359908884, "grad_norm": 0.4140625, "learning_rate": 1.1534116029186181e-05, "loss": 0.7333, "step": 3780 }, { "epoch": 8.621867881548974, "grad_norm": 0.388671875, "learning_rate": 1.1349453381827713e-05, "loss": 0.7345, "step": 3785 }, { "epoch": 8.633257403189067, "grad_norm": 0.373046875, "learning_rate": 1.1166191955156346e-05, "loss": 0.7531, "step": 3790 }, { "epoch": 8.644646924829157, "grad_norm": 0.37109375, "learning_rate": 1.0984334645823158e-05, "loss": 0.7359, "step": 3795 }, { "epoch": 8.656036446469248, "grad_norm": 0.380859375, "learning_rate": 1.0803884328285586e-05, "loss": 0.7441, "step": 3800 }, { "epoch": 8.66742596810934, "grad_norm": 0.376953125, "learning_rate": 1.0624843854762034e-05, "loss": 0.7353, "step": 3805 }, { "epoch": 8.67881548974943, "grad_norm": 0.37109375, "learning_rate": 1.0447216055186681e-05, "loss": 0.7407, "step": 3810 }, { "epoch": 8.690205011389521, "grad_norm": 0.39453125, "learning_rate": 1.0271003737164909e-05, "loss": 0.7372, "step": 3815 }, { "epoch": 8.701594533029613, "grad_norm": 0.369140625, "learning_rate": 1.009620968592876e-05, "loss": 0.7445, "step": 3820 }, { "epoch": 8.712984054669704, "grad_norm": 0.373046875, "learning_rate": 9.922836664293022e-06, "loss": 0.7362, "step": 3825 }, { "epoch": 8.724373576309794, "grad_norm": 0.380859375, "learning_rate": 9.750887412611508e-06, "loss": 0.7408, "step": 3830 }, { "epoch": 8.735763097949887, "grad_norm": 0.376953125, "learning_rate": 9.580364648733775e-06, "loss": 0.7347, "step": 3835 }, { "epoch": 8.747152619589977, "grad_norm": 0.3828125, "learning_rate": 9.411271067962124e-06, "loss": 0.738, "step": 3840 }, { "epoch": 8.758542141230068, "grad_norm": 0.359375, "learning_rate": 9.243609343009086e-06, "loss": 0.7391, "step": 3845 }, { "epoch": 8.76993166287016, "grad_norm": 0.3828125, "learning_rate": 9.07738212395508e-06, "loss": 0.7405, "step": 3850 }, { "epoch": 8.78132118451025, "grad_norm": 0.455078125, "learning_rate": 8.912592038206546e-06, "loss": 0.7391, "step": 3855 }, { "epoch": 8.792710706150341, "grad_norm": 0.37890625, "learning_rate": 8.749241690454424e-06, "loss": 0.7367, "step": 3860 }, { "epoch": 8.804100227790432, "grad_norm": 0.37109375, "learning_rate": 8.587333662633035e-06, "loss": 0.7411, "step": 3865 }, { "epoch": 8.815489749430524, "grad_norm": 0.375, "learning_rate": 8.426870513879182e-06, "loss": 0.7365, "step": 3870 }, { "epoch": 8.826879271070615, "grad_norm": 0.38671875, "learning_rate": 8.267854780491747e-06, "loss": 0.7458, "step": 3875 }, { "epoch": 8.838268792710707, "grad_norm": 0.369140625, "learning_rate": 8.110288975891634e-06, "loss": 0.7326, "step": 3880 }, { "epoch": 8.849658314350798, "grad_norm": 0.3671875, "learning_rate": 7.954175590581992e-06, "loss": 0.7374, "step": 3885 }, { "epoch": 8.861047835990888, "grad_norm": 0.37890625, "learning_rate": 7.799517092108855e-06, "loss": 0.7345, "step": 3890 }, { "epoch": 8.872437357630979, "grad_norm": 0.388671875, "learning_rate": 7.646315925022152e-06, "loss": 0.7384, "step": 3895 }, { "epoch": 8.883826879271071, "grad_norm": 0.37109375, "learning_rate": 7.49457451083706e-06, "loss": 0.7319, "step": 3900 }, { "epoch": 8.895216400911162, "grad_norm": 0.3828125, "learning_rate": 7.344295247995725e-06, "loss": 0.7432, "step": 3905 }, { "epoch": 8.906605922551252, "grad_norm": 0.392578125, "learning_rate": 7.195480511829411e-06, "loss": 0.7392, "step": 3910 }, { "epoch": 8.917995444191344, "grad_norm": 0.376953125, "learning_rate": 7.048132654520856e-06, "loss": 0.7465, "step": 3915 }, { "epoch": 8.929384965831435, "grad_norm": 0.376953125, "learning_rate": 6.902254005067166e-06, "loss": 0.7387, "step": 3920 }, { "epoch": 8.940774487471526, "grad_norm": 0.37890625, "learning_rate": 6.7578468692429345e-06, "loss": 0.74, "step": 3925 }, { "epoch": 8.952164009111618, "grad_norm": 0.41796875, "learning_rate": 6.614913529563927e-06, "loss": 0.7346, "step": 3930 }, { "epoch": 8.963553530751708, "grad_norm": 0.369140625, "learning_rate": 6.4734562452508525e-06, "loss": 0.739, "step": 3935 }, { "epoch": 8.974943052391799, "grad_norm": 0.369140625, "learning_rate": 6.333477252193731e-06, "loss": 0.7418, "step": 3940 }, { "epoch": 8.986332574031891, "grad_norm": 0.373046875, "learning_rate": 6.19497876291657e-06, "loss": 0.7394, "step": 3945 }, { "epoch": 8.997722095671982, "grad_norm": 0.388671875, "learning_rate": 6.057962966542319e-06, "loss": 0.7433, "step": 3950 }, { "epoch": 9.0, "eval_loss": 2.4639732837677, "eval_runtime": 0.2359, "eval_samples_per_second": 42.387, "eval_steps_per_second": 4.239, "step": 3951 }, { "epoch": 9.009111617312072, "grad_norm": 0.37890625, "learning_rate": 5.922432028758362e-06, "loss": 0.7355, "step": 3955 }, { "epoch": 9.020501138952165, "grad_norm": 0.388671875, "learning_rate": 5.788388091782204e-06, "loss": 0.743, "step": 3960 }, { "epoch": 9.031890660592255, "grad_norm": 0.37109375, "learning_rate": 5.655833274327638e-06, "loss": 0.7396, "step": 3965 }, { "epoch": 9.043280182232346, "grad_norm": 0.357421875, "learning_rate": 5.524769671571317e-06, "loss": 0.734, "step": 3970 }, { "epoch": 9.054669703872438, "grad_norm": 0.3828125, "learning_rate": 5.395199355119518e-06, "loss": 0.7406, "step": 3975 }, { "epoch": 9.066059225512529, "grad_norm": 0.38671875, "learning_rate": 5.267124372975518e-06, "loss": 0.7398, "step": 3980 }, { "epoch": 9.07744874715262, "grad_norm": 0.376953125, "learning_rate": 5.140546749507136e-06, "loss": 0.7425, "step": 3985 }, { "epoch": 9.08883826879271, "grad_norm": 0.390625, "learning_rate": 5.0154684854147645e-06, "loss": 0.7335, "step": 3990 }, { "epoch": 9.100227790432802, "grad_norm": 0.392578125, "learning_rate": 4.891891557699779e-06, "loss": 0.7502, "step": 3995 }, { "epoch": 9.111617312072893, "grad_norm": 0.419921875, "learning_rate": 4.769817919633235e-06, "loss": 0.737, "step": 4000 }, { "epoch": 9.123006833712983, "grad_norm": 0.37109375, "learning_rate": 4.649249500725017e-06, "loss": 0.7357, "step": 4005 }, { "epoch": 9.134396355353076, "grad_norm": 0.384765625, "learning_rate": 4.530188206693375e-06, "loss": 0.7419, "step": 4010 }, { "epoch": 9.145785876993166, "grad_norm": 0.376953125, "learning_rate": 4.412635919434749e-06, "loss": 0.7442, "step": 4015 }, { "epoch": 9.157175398633257, "grad_norm": 0.373046875, "learning_rate": 4.296594496994055e-06, "loss": 0.7385, "step": 4020 }, { "epoch": 9.168564920273349, "grad_norm": 0.376953125, "learning_rate": 4.182065773535271e-06, "loss": 0.7394, "step": 4025 }, { "epoch": 9.17995444191344, "grad_norm": 0.369140625, "learning_rate": 4.069051559312531e-06, "loss": 0.7384, "step": 4030 }, { "epoch": 9.19134396355353, "grad_norm": 0.376953125, "learning_rate": 3.957553640641442e-06, "loss": 0.7342, "step": 4035 }, { "epoch": 9.202733485193622, "grad_norm": 0.369140625, "learning_rate": 3.847573779870839e-06, "loss": 0.734, "step": 4040 }, { "epoch": 9.214123006833713, "grad_norm": 0.373046875, "learning_rate": 3.7391137153550137e-06, "loss": 0.742, "step": 4045 }, { "epoch": 9.225512528473804, "grad_norm": 0.37890625, "learning_rate": 3.6321751614261767e-06, "loss": 0.7449, "step": 4050 }, { "epoch": 9.236902050113896, "grad_norm": 0.3828125, "learning_rate": 3.5267598083673304e-06, "loss": 0.7389, "step": 4055 }, { "epoch": 9.248291571753986, "grad_norm": 0.375, "learning_rate": 3.4228693223856136e-06, "loss": 0.7377, "step": 4060 }, { "epoch": 9.259681093394077, "grad_norm": 0.37109375, "learning_rate": 3.320505345585945e-06, "loss": 0.7356, "step": 4065 }, { "epoch": 9.27107061503417, "grad_norm": 0.380859375, "learning_rate": 3.219669495945055e-06, "loss": 0.7484, "step": 4070 }, { "epoch": 9.28246013667426, "grad_norm": 0.369140625, "learning_rate": 3.120363367285917e-06, "loss": 0.7398, "step": 4075 }, { "epoch": 9.29384965831435, "grad_norm": 0.369140625, "learning_rate": 3.022588529252579e-06, "loss": 0.736, "step": 4080 }, { "epoch": 9.305239179954443, "grad_norm": 0.361328125, "learning_rate": 2.9263465272853173e-06, "loss": 0.7336, "step": 4085 }, { "epoch": 9.316628701594533, "grad_norm": 0.40625, "learning_rate": 2.8316388825962324e-06, "loss": 0.7402, "step": 4090 }, { "epoch": 9.328018223234624, "grad_norm": 0.373046875, "learning_rate": 2.738467092145214e-06, "loss": 0.7427, "step": 4095 }, { "epoch": 9.339407744874716, "grad_norm": 0.369140625, "learning_rate": 2.646832628616214e-06, "loss": 0.7395, "step": 4100 }, { "epoch": 9.350797266514807, "grad_norm": 0.37890625, "learning_rate": 2.5567369403940776e-06, "loss": 0.7432, "step": 4105 }, { "epoch": 9.362186788154897, "grad_norm": 0.39453125, "learning_rate": 2.4681814515415404e-06, "loss": 0.7375, "step": 4110 }, { "epoch": 9.373576309794988, "grad_norm": 0.369140625, "learning_rate": 2.3811675617768204e-06, "loss": 0.7441, "step": 4115 }, { "epoch": 9.38496583143508, "grad_norm": 0.369140625, "learning_rate": 2.2956966464514175e-06, "loss": 0.7369, "step": 4120 }, { "epoch": 9.39635535307517, "grad_norm": 0.396484375, "learning_rate": 2.2117700565283838e-06, "loss": 0.734, "step": 4125 }, { "epoch": 9.407744874715261, "grad_norm": 0.37890625, "learning_rate": 2.1293891185610204e-06, "loss": 0.7374, "step": 4130 }, { "epoch": 9.419134396355354, "grad_norm": 0.37109375, "learning_rate": 2.04855513467187e-06, "loss": 0.7387, "step": 4135 }, { "epoch": 9.430523917995444, "grad_norm": 0.404296875, "learning_rate": 1.969269382532113e-06, "loss": 0.7385, "step": 4140 }, { "epoch": 9.441913439635535, "grad_norm": 0.369140625, "learning_rate": 1.8915331153414262e-06, "loss": 0.7313, "step": 4145 }, { "epoch": 9.453302961275627, "grad_norm": 0.384765625, "learning_rate": 1.8153475618081673e-06, "loss": 0.7394, "step": 4150 }, { "epoch": 9.464692482915718, "grad_norm": 0.3984375, "learning_rate": 1.7407139261299e-06, "loss": 0.736, "step": 4155 }, { "epoch": 9.476082004555808, "grad_norm": 0.376953125, "learning_rate": 1.667633387974421e-06, "loss": 0.7403, "step": 4160 }, { "epoch": 9.4874715261959, "grad_norm": 0.373046875, "learning_rate": 1.5961071024610752e-06, "loss": 0.7478, "step": 4165 }, { "epoch": 9.498861047835991, "grad_norm": 0.412109375, "learning_rate": 1.5261362001425138e-06, "loss": 0.7342, "step": 4170 }, { "epoch": 9.510250569476081, "grad_norm": 0.3671875, "learning_rate": 1.457721786986821e-06, "loss": 0.7485, "step": 4175 }, { "epoch": 9.521640091116174, "grad_norm": 0.380859375, "learning_rate": 1.3908649443600707e-06, "loss": 0.7392, "step": 4180 }, { "epoch": 9.533029612756264, "grad_norm": 0.35546875, "learning_rate": 1.3255667290091644e-06, "loss": 0.7362, "step": 4185 }, { "epoch": 9.544419134396355, "grad_norm": 0.359375, "learning_rate": 1.2618281730451432e-06, "loss": 0.7383, "step": 4190 }, { "epoch": 9.555808656036447, "grad_norm": 0.388671875, "learning_rate": 1.1996502839269453e-06, "loss": 0.7361, "step": 4195 }, { "epoch": 9.567198177676538, "grad_norm": 0.37109375, "learning_rate": 1.139034044445375e-06, "loss": 0.7354, "step": 4200 }, { "epoch": 9.578587699316628, "grad_norm": 0.37109375, "learning_rate": 1.0799804127076707e-06, "loss": 0.7354, "step": 4205 }, { "epoch": 9.589977220956719, "grad_norm": 0.37109375, "learning_rate": 1.0224903221222938e-06, "loss": 0.7408, "step": 4210 }, { "epoch": 9.601366742596811, "grad_norm": 0.416015625, "learning_rate": 9.665646813842077e-07, "loss": 0.7329, "step": 4215 }, { "epoch": 9.612756264236902, "grad_norm": 0.373046875, "learning_rate": 9.12204374460468e-07, "loss": 0.729, "step": 4220 }, { "epoch": 9.624145785876994, "grad_norm": 0.380859375, "learning_rate": 8.59410260576321e-07, "loss": 0.734, "step": 4225 }, { "epoch": 9.635535307517085, "grad_norm": 0.388671875, "learning_rate": 8.081831742015822e-07, "loss": 0.7391, "step": 4230 }, { "epoch": 9.646924829157175, "grad_norm": 0.369140625, "learning_rate": 7.585239250374243e-07, "loss": 0.7468, "step": 4235 }, { "epoch": 9.658314350797266, "grad_norm": 0.40234375, "learning_rate": 7.104332980036211e-07, "loss": 0.7347, "step": 4240 }, { "epoch": 9.669703872437358, "grad_norm": 0.369140625, "learning_rate": 6.639120532261456e-07, "loss": 0.7391, "step": 4245 }, { "epoch": 9.681093394077449, "grad_norm": 0.3671875, "learning_rate": 6.189609260251139e-07, "loss": 0.742, "step": 4250 }, { "epoch": 9.69248291571754, "grad_norm": 0.390625, "learning_rate": 5.755806269031827e-07, "loss": 0.7403, "step": 4255 }, { "epoch": 9.703872437357631, "grad_norm": 0.375, "learning_rate": 5.337718415343362e-07, "loss": 0.7401, "step": 4260 }, { "epoch": 9.715261958997722, "grad_norm": 0.36328125, "learning_rate": 4.935352307530062e-07, "loss": 0.7366, "step": 4265 }, { "epoch": 9.726651480637813, "grad_norm": 0.373046875, "learning_rate": 4.548714305436685e-07, "loss": 0.7305, "step": 4270 }, { "epoch": 9.738041002277905, "grad_norm": 0.384765625, "learning_rate": 4.1778105203078565e-07, "loss": 0.7419, "step": 4275 }, { "epoch": 9.749430523917995, "grad_norm": 0.37890625, "learning_rate": 3.822646814691244e-07, "loss": 0.735, "step": 4280 }, { "epoch": 9.760820045558086, "grad_norm": 0.373046875, "learning_rate": 3.483228802344973e-07, "loss": 0.7427, "step": 4285 }, { "epoch": 9.772209567198178, "grad_norm": 0.3828125, "learning_rate": 3.159561848149029e-07, "loss": 0.738, "step": 4290 }, { "epoch": 9.783599088838269, "grad_norm": 0.376953125, "learning_rate": 2.8516510680203224e-07, "loss": 0.7443, "step": 4295 }, { "epoch": 9.79498861047836, "grad_norm": 0.38671875, "learning_rate": 2.5595013288318703e-07, "loss": 0.7361, "step": 4300 }, { "epoch": 9.806378132118452, "grad_norm": 0.375, "learning_rate": 2.2831172483359643e-07, "loss": 0.7446, "step": 4305 }, { "epoch": 9.817767653758542, "grad_norm": 0.400390625, "learning_rate": 2.0225031950910078e-07, "loss": 0.7427, "step": 4310 }, { "epoch": 9.829157175398633, "grad_norm": 0.380859375, "learning_rate": 1.7776632883924615e-07, "loss": 0.7394, "step": 4315 }, { "epoch": 9.840546697038725, "grad_norm": 0.375, "learning_rate": 1.548601398208116e-07, "loss": 0.7364, "step": 4320 }, { "epoch": 9.851936218678816, "grad_norm": 0.37109375, "learning_rate": 1.3353211451161417e-07, "loss": 0.741, "step": 4325 }, { "epoch": 9.863325740318906, "grad_norm": 0.392578125, "learning_rate": 1.1378259002488013e-07, "loss": 0.733, "step": 4330 }, { "epoch": 9.874715261958997, "grad_norm": 0.37109375, "learning_rate": 9.561187852386022e-08, "loss": 0.742, "step": 4335 }, { "epoch": 9.88610478359909, "grad_norm": 0.369140625, "learning_rate": 7.902026721687828e-08, "loss": 0.7417, "step": 4340 }, { "epoch": 9.89749430523918, "grad_norm": 0.376953125, "learning_rate": 6.400801835286796e-08, "loss": 0.7369, "step": 4345 }, { "epoch": 9.90888382687927, "grad_norm": 0.373046875, "learning_rate": 5.05753692171318e-08, "loss": 0.7385, "step": 4350 }, { "epoch": 9.920273348519363, "grad_norm": 0.4140625, "learning_rate": 3.8722532127677404e-08, "loss": 0.737, "step": 4355 }, { "epoch": 9.931662870159453, "grad_norm": 0.3671875, "learning_rate": 2.844969443178691e-08, "loss": 0.7355, "step": 4360 }, { "epoch": 9.943052391799544, "grad_norm": 0.373046875, "learning_rate": 1.9757018503119285e-08, "loss": 0.7433, "step": 4365 }, { "epoch": 9.954441913439636, "grad_norm": 0.369140625, "learning_rate": 1.2644641739101292e-08, "loss": 0.7328, "step": 4370 }, { "epoch": 9.965831435079727, "grad_norm": 0.3671875, "learning_rate": 7.112676558784781e-09, "loss": 0.743, "step": 4375 }, { "epoch": 9.977220956719817, "grad_norm": 0.361328125, "learning_rate": 3.1612104010370068e-09, "loss": 0.7305, "step": 4380 }, { "epoch": 9.98861047835991, "grad_norm": 0.37109375, "learning_rate": 7.903057231750666e-10, "loss": 0.7394, "step": 4385 }, { "epoch": 10.0, "grad_norm": 0.373046875, "learning_rate": 0.0, "loss": 0.7376, "step": 4390 }, { "epoch": 10.0, "eval_loss": 2.4680683612823486, "eval_runtime": 0.2341, "eval_samples_per_second": 42.714, "eval_steps_per_second": 4.271, "step": 4390 }, { "epoch": 10.0, "step": 4390, "total_flos": 1.3402520949471838e+19, "train_loss": 1.3662878394941533, "train_runtime": 10620.987, "train_samples_per_second": 26.444, "train_steps_per_second": 0.413 } ], "logging_steps": 5, "max_steps": 4390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.3402520949471838e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }