{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17380271653645946, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017380271653645947, "grad_norm": 1.1632381677627563, "learning_rate": 4.9978491913828615e-05, "loss": 3.6439, "step": 200 }, { "epoch": 0.0034760543307291894, "grad_norm": 0.6136592626571655, "learning_rate": 4.995676657426156e-05, "loss": 2.3619, "step": 400 }, { "epoch": 0.005214081496093784, "grad_norm": 0.6270021796226501, "learning_rate": 4.99350412346945e-05, "loss": 2.0395, "step": 600 }, { "epoch": 0.006952108661458379, "grad_norm": 0.9146378636360168, "learning_rate": 4.991331589512744e-05, "loss": 1.8894, "step": 800 }, { "epoch": 0.008690135826822973, "grad_norm": 0.7162560224533081, "learning_rate": 4.989159055556039e-05, "loss": 1.8242, "step": 1000 }, { "epoch": 0.010428162992187568, "grad_norm": 0.31322506070137024, "learning_rate": 4.9869865215993326e-05, "loss": 1.8681, "step": 1200 }, { "epoch": 0.012166190157552163, "grad_norm": 0.5570130348205566, "learning_rate": 4.984813987642627e-05, "loss": 1.8099, "step": 1400 }, { "epoch": 0.013904217322916758, "grad_norm": 0.6080171465873718, "learning_rate": 4.982641453685921e-05, "loss": 1.7641, "step": 1600 }, { "epoch": 0.015642244488281352, "grad_norm": 0.553460955619812, "learning_rate": 4.980468919729215e-05, "loss": 1.7712, "step": 1800 }, { "epoch": 0.017380271653645946, "grad_norm": 0.625199019908905, "learning_rate": 4.97829638577251e-05, "loss": 1.7565, "step": 2000 }, { "epoch": 0.019118298819010542, "grad_norm": 0.579010546207428, "learning_rate": 4.9761238518158044e-05, "loss": 1.7322, "step": 2200 }, { "epoch": 0.020856325984375135, "grad_norm": 0.7429983615875244, "learning_rate": 4.9739513178590984e-05, "loss": 1.7407, "step": 2400 }, { "epoch": 0.022594353149739732, "grad_norm": 0.5801926255226135, "learning_rate": 4.971778783902393e-05, "loss": 1.6487, "step": 2600 }, { "epoch": 0.024332380315104325, "grad_norm": 0.7074835300445557, "learning_rate": 4.969606249945687e-05, "loss": 1.6959, "step": 2800 }, { "epoch": 0.02607040748046892, "grad_norm": 0.6824275255203247, "learning_rate": 4.967433715988981e-05, "loss": 1.6958, "step": 3000 }, { "epoch": 0.027808434645833515, "grad_norm": 0.43216443061828613, "learning_rate": 4.9652611820322756e-05, "loss": 1.6824, "step": 3200 }, { "epoch": 0.029546461811198108, "grad_norm": 0.7867545485496521, "learning_rate": 4.9630886480755695e-05, "loss": 1.6671, "step": 3400 }, { "epoch": 0.031284488976562705, "grad_norm": 0.77516108751297, "learning_rate": 4.9609161141188635e-05, "loss": 1.6389, "step": 3600 }, { "epoch": 0.0330225161419273, "grad_norm": 0.5014050602912903, "learning_rate": 4.958743580162158e-05, "loss": 1.629, "step": 3800 }, { "epoch": 0.03476054330729189, "grad_norm": 0.6006432771682739, "learning_rate": 4.956571046205453e-05, "loss": 1.5977, "step": 4000 }, { "epoch": 0.036498570472656484, "grad_norm": 0.6153438091278076, "learning_rate": 4.954398512248747e-05, "loss": 1.5879, "step": 4200 }, { "epoch": 0.038236597638021085, "grad_norm": 0.8877372145652771, "learning_rate": 4.952225978292041e-05, "loss": 1.599, "step": 4400 }, { "epoch": 0.03997462480338568, "grad_norm": 0.7173994183540344, "learning_rate": 4.950053444335335e-05, "loss": 1.6205, "step": 4600 }, { "epoch": 0.04171265196875027, "grad_norm": 0.8379663228988647, "learning_rate": 4.947880910378629e-05, "loss": 1.5794, "step": 4800 }, { "epoch": 0.043450679134114864, "grad_norm": 0.6160171031951904, "learning_rate": 4.945708376421924e-05, "loss": 1.5656, "step": 5000 }, { "epoch": 0.045188706299479464, "grad_norm": 0.8642494082450867, "learning_rate": 4.943535842465218e-05, "loss": 1.5665, "step": 5200 }, { "epoch": 0.04692673346484406, "grad_norm": 0.6872414350509644, "learning_rate": 4.941363308508512e-05, "loss": 1.5552, "step": 5400 }, { "epoch": 0.04866476063020865, "grad_norm": 0.9998211860656738, "learning_rate": 4.9391907745518064e-05, "loss": 1.5458, "step": 5600 }, { "epoch": 0.050402787795573244, "grad_norm": 1.2175588607788086, "learning_rate": 4.937018240595101e-05, "loss": 1.5295, "step": 5800 }, { "epoch": 0.05214081496093784, "grad_norm": 1.0134257078170776, "learning_rate": 4.934845706638395e-05, "loss": 1.516, "step": 6000 }, { "epoch": 0.05387884212630244, "grad_norm": 0.8104642033576965, "learning_rate": 4.9326731726816896e-05, "loss": 1.5285, "step": 6200 }, { "epoch": 0.05561686929166703, "grad_norm": 0.9005429148674011, "learning_rate": 4.9305006387249836e-05, "loss": 1.5069, "step": 6400 }, { "epoch": 0.05735489645703162, "grad_norm": 0.8855582475662231, "learning_rate": 4.9283281047682775e-05, "loss": 1.5046, "step": 6600 }, { "epoch": 0.059092923622396216, "grad_norm": 0.7807704210281372, "learning_rate": 4.926155570811572e-05, "loss": 1.4663, "step": 6800 }, { "epoch": 0.06083095078776081, "grad_norm": 1.2552438974380493, "learning_rate": 4.923983036854866e-05, "loss": 1.486, "step": 7000 }, { "epoch": 0.06256897795312541, "grad_norm": 1.0079654455184937, "learning_rate": 4.92181050289816e-05, "loss": 1.4569, "step": 7200 }, { "epoch": 0.06430700511849, "grad_norm": 1.0267302989959717, "learning_rate": 4.919637968941455e-05, "loss": 1.4746, "step": 7400 }, { "epoch": 0.0660450322838546, "grad_norm": 1.1427829265594482, "learning_rate": 4.9174654349847494e-05, "loss": 1.4867, "step": 7600 }, { "epoch": 0.0677830594492192, "grad_norm": 0.9080005884170532, "learning_rate": 4.915292901028043e-05, "loss": 1.4789, "step": 7800 }, { "epoch": 0.06952108661458378, "grad_norm": 0.78159499168396, "learning_rate": 4.913120367071338e-05, "loss": 1.4435, "step": 8000 }, { "epoch": 0.07125911377994838, "grad_norm": 0.9199485778808594, "learning_rate": 4.910947833114632e-05, "loss": 1.4698, "step": 8200 }, { "epoch": 0.07299714094531297, "grad_norm": 1.1556053161621094, "learning_rate": 4.908775299157926e-05, "loss": 1.4233, "step": 8400 }, { "epoch": 0.07473516811067757, "grad_norm": 0.6093395948410034, "learning_rate": 4.9066027652012205e-05, "loss": 1.4607, "step": 8600 }, { "epoch": 0.07647319527604217, "grad_norm": 0.7765551209449768, "learning_rate": 4.9044302312445144e-05, "loss": 1.4067, "step": 8800 }, { "epoch": 0.07821122244140676, "grad_norm": 0.9261316061019897, "learning_rate": 4.9022576972878084e-05, "loss": 1.4437, "step": 9000 }, { "epoch": 0.07994924960677136, "grad_norm": 0.737016499042511, "learning_rate": 4.900085163331103e-05, "loss": 1.4394, "step": 9200 }, { "epoch": 0.08168727677213594, "grad_norm": 1.0518062114715576, "learning_rate": 4.897912629374397e-05, "loss": 1.442, "step": 9400 }, { "epoch": 0.08342530393750054, "grad_norm": 0.9163209795951843, "learning_rate": 4.8957400954176916e-05, "loss": 1.4126, "step": 9600 }, { "epoch": 0.08516333110286514, "grad_norm": 1.1651362180709839, "learning_rate": 4.893567561460986e-05, "loss": 1.4397, "step": 9800 }, { "epoch": 0.08690135826822973, "grad_norm": 1.2389508485794067, "learning_rate": 4.89139502750428e-05, "loss": 1.4226, "step": 10000 }, { "epoch": 0.08863938543359433, "grad_norm": 1.009730339050293, "learning_rate": 4.889222493547574e-05, "loss": 1.4643, "step": 10200 }, { "epoch": 0.09037741259895893, "grad_norm": 1.3371009826660156, "learning_rate": 4.887049959590869e-05, "loss": 1.4221, "step": 10400 }, { "epoch": 0.09211543976432351, "grad_norm": 1.0338963270187378, "learning_rate": 4.884877425634163e-05, "loss": 1.4122, "step": 10600 }, { "epoch": 0.09385346692968811, "grad_norm": 1.0023767948150635, "learning_rate": 4.8827048916774574e-05, "loss": 1.4034, "step": 10800 }, { "epoch": 0.0955914940950527, "grad_norm": 1.4514521360397339, "learning_rate": 4.880532357720751e-05, "loss": 1.4356, "step": 11000 }, { "epoch": 0.0973295212604173, "grad_norm": 1.0462247133255005, "learning_rate": 4.878359823764045e-05, "loss": 1.4038, "step": 11200 }, { "epoch": 0.0990675484257819, "grad_norm": 1.0881024599075317, "learning_rate": 4.87618728980734e-05, "loss": 1.3521, "step": 11400 }, { "epoch": 0.10080557559114649, "grad_norm": 1.1503826379776, "learning_rate": 4.8740147558506345e-05, "loss": 1.3455, "step": 11600 }, { "epoch": 0.10254360275651109, "grad_norm": 1.1788356304168701, "learning_rate": 4.8718422218939285e-05, "loss": 1.4246, "step": 11800 }, { "epoch": 0.10428162992187567, "grad_norm": 0.9009695649147034, "learning_rate": 4.8696696879372225e-05, "loss": 1.3701, "step": 12000 }, { "epoch": 0.10601965708724027, "grad_norm": 0.7886667251586914, "learning_rate": 4.867497153980517e-05, "loss": 1.3843, "step": 12200 }, { "epoch": 0.10775768425260487, "grad_norm": 1.0017770528793335, "learning_rate": 4.865335482693595e-05, "loss": 1.3785, "step": 12400 }, { "epoch": 0.10949571141796946, "grad_norm": 0.901871383190155, "learning_rate": 4.863162948736889e-05, "loss": 1.3627, "step": 12600 }, { "epoch": 0.11123373858333406, "grad_norm": 0.9240642189979553, "learning_rate": 4.860990414780183e-05, "loss": 1.3397, "step": 12800 }, { "epoch": 0.11297176574869865, "grad_norm": 1.2550582885742188, "learning_rate": 4.8588178808234776e-05, "loss": 1.368, "step": 13000 }, { "epoch": 0.11470979291406325, "grad_norm": 0.9313985705375671, "learning_rate": 4.8566453468667715e-05, "loss": 1.344, "step": 13200 }, { "epoch": 0.11644782007942785, "grad_norm": 0.8634843826293945, "learning_rate": 4.854472812910066e-05, "loss": 1.3308, "step": 13400 }, { "epoch": 0.11818584724479243, "grad_norm": 1.2060052156448364, "learning_rate": 4.85230027895336e-05, "loss": 1.355, "step": 13600 }, { "epoch": 0.11992387441015703, "grad_norm": 1.0419443845748901, "learning_rate": 4.850127744996655e-05, "loss": 1.3469, "step": 13800 }, { "epoch": 0.12166190157552162, "grad_norm": 1.2425956726074219, "learning_rate": 4.847955211039949e-05, "loss": 1.3368, "step": 14000 }, { "epoch": 0.12339992874088622, "grad_norm": 1.0397825241088867, "learning_rate": 4.8457826770832433e-05, "loss": 1.3211, "step": 14200 }, { "epoch": 0.12513795590625082, "grad_norm": 0.8406294584274292, "learning_rate": 4.843621005796321e-05, "loss": 1.3375, "step": 14400 }, { "epoch": 0.1268759830716154, "grad_norm": 0.816184401512146, "learning_rate": 4.841448471839615e-05, "loss": 1.3351, "step": 14600 }, { "epoch": 0.12861401023698, "grad_norm": 1.1904360055923462, "learning_rate": 4.839275937882909e-05, "loss": 1.3174, "step": 14800 }, { "epoch": 0.1303520374023446, "grad_norm": 1.2890825271606445, "learning_rate": 4.837103403926204e-05, "loss": 1.3294, "step": 15000 }, { "epoch": 0.1320900645677092, "grad_norm": 0.9586935639381409, "learning_rate": 4.834930869969498e-05, "loss": 1.2934, "step": 15200 }, { "epoch": 0.13382809173307378, "grad_norm": 0.9654845595359802, "learning_rate": 4.832758336012792e-05, "loss": 1.3386, "step": 15400 }, { "epoch": 0.1355661188984384, "grad_norm": 1.1789395809173584, "learning_rate": 4.8305858020560864e-05, "loss": 1.3499, "step": 15600 }, { "epoch": 0.13730414606380298, "grad_norm": 1.2728456258773804, "learning_rate": 4.82841326809938e-05, "loss": 1.3396, "step": 15800 }, { "epoch": 0.13904217322916756, "grad_norm": 1.0807838439941406, "learning_rate": 4.826240734142675e-05, "loss": 1.3369, "step": 16000 }, { "epoch": 0.14078020039453218, "grad_norm": 1.11849045753479, "learning_rate": 4.8240682001859696e-05, "loss": 1.3664, "step": 16200 }, { "epoch": 0.14251822755989677, "grad_norm": 1.5169202089309692, "learning_rate": 4.821906528899047e-05, "loss": 1.3352, "step": 16400 }, { "epoch": 0.14425625472526135, "grad_norm": 0.8817140460014343, "learning_rate": 4.819733994942341e-05, "loss": 1.2924, "step": 16600 }, { "epoch": 0.14599428189062594, "grad_norm": 1.1285990476608276, "learning_rate": 4.8175614609856355e-05, "loss": 1.3497, "step": 16800 }, { "epoch": 0.14773230905599055, "grad_norm": 1.1072745323181152, "learning_rate": 4.81538892702893e-05, "loss": 1.3129, "step": 17000 }, { "epoch": 0.14947033622135514, "grad_norm": 1.1911921501159668, "learning_rate": 4.813216393072224e-05, "loss": 1.312, "step": 17200 }, { "epoch": 0.15120836338671972, "grad_norm": 0.7891075611114502, "learning_rate": 4.811043859115518e-05, "loss": 1.281, "step": 17400 }, { "epoch": 0.15294639055208434, "grad_norm": 0.9016463756561279, "learning_rate": 4.8088713251588126e-05, "loss": 1.3118, "step": 17600 }, { "epoch": 0.15468441771744892, "grad_norm": 1.1260063648223877, "learning_rate": 4.8066987912021066e-05, "loss": 1.2743, "step": 17800 }, { "epoch": 0.1564224448828135, "grad_norm": 1.0370497703552246, "learning_rate": 4.8045262572454005e-05, "loss": 1.3013, "step": 18000 }, { "epoch": 0.15816047204817812, "grad_norm": 1.4182652235031128, "learning_rate": 4.802353723288695e-05, "loss": 1.2994, "step": 18200 }, { "epoch": 0.1598984992135427, "grad_norm": 1.1322426795959473, "learning_rate": 4.800192052001773e-05, "loss": 1.3339, "step": 18400 }, { "epoch": 0.1616365263789073, "grad_norm": 1.4774497747421265, "learning_rate": 4.798019518045067e-05, "loss": 1.3381, "step": 18600 }, { "epoch": 0.16337455354427188, "grad_norm": 1.3371450901031494, "learning_rate": 4.795846984088361e-05, "loss": 1.304, "step": 18800 }, { "epoch": 0.1651125807096365, "grad_norm": 0.8607128858566284, "learning_rate": 4.793674450131656e-05, "loss": 1.2686, "step": 19000 }, { "epoch": 0.16685060787500108, "grad_norm": 1.1792031526565552, "learning_rate": 4.79150191617495e-05, "loss": 1.3099, "step": 19200 }, { "epoch": 0.16858863504036567, "grad_norm": 1.274556040763855, "learning_rate": 4.789329382218244e-05, "loss": 1.2745, "step": 19400 }, { "epoch": 0.17032666220573028, "grad_norm": 0.7774292230606079, "learning_rate": 4.787156848261539e-05, "loss": 1.2905, "step": 19600 }, { "epoch": 0.17206468937109487, "grad_norm": 1.204541802406311, "learning_rate": 4.784984314304833e-05, "loss": 1.3014, "step": 19800 }, { "epoch": 0.17380271653645946, "grad_norm": 0.9959656000137329, "learning_rate": 4.782811780348127e-05, "loss": 1.2798, "step": 20000 } ], "logging_steps": 200, "max_steps": 460292, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.178779779072e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }