{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.17380271653645946,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017380271653645947,
"grad_norm": 1.1632381677627563,
"learning_rate": 4.9978491913828615e-05,
"loss": 3.6439,
"step": 200
},
{
"epoch": 0.0034760543307291894,
"grad_norm": 0.6136592626571655,
"learning_rate": 4.995676657426156e-05,
"loss": 2.3619,
"step": 400
},
{
"epoch": 0.005214081496093784,
"grad_norm": 0.6270021796226501,
"learning_rate": 4.99350412346945e-05,
"loss": 2.0395,
"step": 600
},
{
"epoch": 0.006952108661458379,
"grad_norm": 0.9146378636360168,
"learning_rate": 4.991331589512744e-05,
"loss": 1.8894,
"step": 800
},
{
"epoch": 0.008690135826822973,
"grad_norm": 0.7162560224533081,
"learning_rate": 4.989159055556039e-05,
"loss": 1.8242,
"step": 1000
},
{
"epoch": 0.010428162992187568,
"grad_norm": 0.31322506070137024,
"learning_rate": 4.9869865215993326e-05,
"loss": 1.8681,
"step": 1200
},
{
"epoch": 0.012166190157552163,
"grad_norm": 0.5570130348205566,
"learning_rate": 4.984813987642627e-05,
"loss": 1.8099,
"step": 1400
},
{
"epoch": 0.013904217322916758,
"grad_norm": 0.6080171465873718,
"learning_rate": 4.982641453685921e-05,
"loss": 1.7641,
"step": 1600
},
{
"epoch": 0.015642244488281352,
"grad_norm": 0.553460955619812,
"learning_rate": 4.980468919729215e-05,
"loss": 1.7712,
"step": 1800
},
{
"epoch": 0.017380271653645946,
"grad_norm": 0.625199019908905,
"learning_rate": 4.97829638577251e-05,
"loss": 1.7565,
"step": 2000
},
{
"epoch": 0.019118298819010542,
"grad_norm": 0.579010546207428,
"learning_rate": 4.9761238518158044e-05,
"loss": 1.7322,
"step": 2200
},
{
"epoch": 0.020856325984375135,
"grad_norm": 0.7429983615875244,
"learning_rate": 4.9739513178590984e-05,
"loss": 1.7407,
"step": 2400
},
{
"epoch": 0.022594353149739732,
"grad_norm": 0.5801926255226135,
"learning_rate": 4.971778783902393e-05,
"loss": 1.6487,
"step": 2600
},
{
"epoch": 0.024332380315104325,
"grad_norm": 0.7074835300445557,
"learning_rate": 4.969606249945687e-05,
"loss": 1.6959,
"step": 2800
},
{
"epoch": 0.02607040748046892,
"grad_norm": 0.6824275255203247,
"learning_rate": 4.967433715988981e-05,
"loss": 1.6958,
"step": 3000
},
{
"epoch": 0.027808434645833515,
"grad_norm": 0.43216443061828613,
"learning_rate": 4.9652611820322756e-05,
"loss": 1.6824,
"step": 3200
},
{
"epoch": 0.029546461811198108,
"grad_norm": 0.7867545485496521,
"learning_rate": 4.9630886480755695e-05,
"loss": 1.6671,
"step": 3400
},
{
"epoch": 0.031284488976562705,
"grad_norm": 0.77516108751297,
"learning_rate": 4.9609161141188635e-05,
"loss": 1.6389,
"step": 3600
},
{
"epoch": 0.0330225161419273,
"grad_norm": 0.5014050602912903,
"learning_rate": 4.958743580162158e-05,
"loss": 1.629,
"step": 3800
},
{
"epoch": 0.03476054330729189,
"grad_norm": 0.6006432771682739,
"learning_rate": 4.956571046205453e-05,
"loss": 1.5977,
"step": 4000
},
{
"epoch": 0.036498570472656484,
"grad_norm": 0.6153438091278076,
"learning_rate": 4.954398512248747e-05,
"loss": 1.5879,
"step": 4200
},
{
"epoch": 0.038236597638021085,
"grad_norm": 0.8877372145652771,
"learning_rate": 4.952225978292041e-05,
"loss": 1.599,
"step": 4400
},
{
"epoch": 0.03997462480338568,
"grad_norm": 0.7173994183540344,
"learning_rate": 4.950053444335335e-05,
"loss": 1.6205,
"step": 4600
},
{
"epoch": 0.04171265196875027,
"grad_norm": 0.8379663228988647,
"learning_rate": 4.947880910378629e-05,
"loss": 1.5794,
"step": 4800
},
{
"epoch": 0.043450679134114864,
"grad_norm": 0.6160171031951904,
"learning_rate": 4.945708376421924e-05,
"loss": 1.5656,
"step": 5000
},
{
"epoch": 0.045188706299479464,
"grad_norm": 0.8642494082450867,
"learning_rate": 4.943535842465218e-05,
"loss": 1.5665,
"step": 5200
},
{
"epoch": 0.04692673346484406,
"grad_norm": 0.6872414350509644,
"learning_rate": 4.941363308508512e-05,
"loss": 1.5552,
"step": 5400
},
{
"epoch": 0.04866476063020865,
"grad_norm": 0.9998211860656738,
"learning_rate": 4.9391907745518064e-05,
"loss": 1.5458,
"step": 5600
},
{
"epoch": 0.050402787795573244,
"grad_norm": 1.2175588607788086,
"learning_rate": 4.937018240595101e-05,
"loss": 1.5295,
"step": 5800
},
{
"epoch": 0.05214081496093784,
"grad_norm": 1.0134257078170776,
"learning_rate": 4.934845706638395e-05,
"loss": 1.516,
"step": 6000
},
{
"epoch": 0.05387884212630244,
"grad_norm": 0.8104642033576965,
"learning_rate": 4.9326731726816896e-05,
"loss": 1.5285,
"step": 6200
},
{
"epoch": 0.05561686929166703,
"grad_norm": 0.9005429148674011,
"learning_rate": 4.9305006387249836e-05,
"loss": 1.5069,
"step": 6400
},
{
"epoch": 0.05735489645703162,
"grad_norm": 0.8855582475662231,
"learning_rate": 4.9283281047682775e-05,
"loss": 1.5046,
"step": 6600
},
{
"epoch": 0.059092923622396216,
"grad_norm": 0.7807704210281372,
"learning_rate": 4.926155570811572e-05,
"loss": 1.4663,
"step": 6800
},
{
"epoch": 0.06083095078776081,
"grad_norm": 1.2552438974380493,
"learning_rate": 4.923983036854866e-05,
"loss": 1.486,
"step": 7000
},
{
"epoch": 0.06256897795312541,
"grad_norm": 1.0079654455184937,
"learning_rate": 4.92181050289816e-05,
"loss": 1.4569,
"step": 7200
},
{
"epoch": 0.06430700511849,
"grad_norm": 1.0267302989959717,
"learning_rate": 4.919637968941455e-05,
"loss": 1.4746,
"step": 7400
},
{
"epoch": 0.0660450322838546,
"grad_norm": 1.1427829265594482,
"learning_rate": 4.9174654349847494e-05,
"loss": 1.4867,
"step": 7600
},
{
"epoch": 0.0677830594492192,
"grad_norm": 0.9080005884170532,
"learning_rate": 4.915292901028043e-05,
"loss": 1.4789,
"step": 7800
},
{
"epoch": 0.06952108661458378,
"grad_norm": 0.78159499168396,
"learning_rate": 4.913120367071338e-05,
"loss": 1.4435,
"step": 8000
},
{
"epoch": 0.07125911377994838,
"grad_norm": 0.9199485778808594,
"learning_rate": 4.910947833114632e-05,
"loss": 1.4698,
"step": 8200
},
{
"epoch": 0.07299714094531297,
"grad_norm": 1.1556053161621094,
"learning_rate": 4.908775299157926e-05,
"loss": 1.4233,
"step": 8400
},
{
"epoch": 0.07473516811067757,
"grad_norm": 0.6093395948410034,
"learning_rate": 4.9066027652012205e-05,
"loss": 1.4607,
"step": 8600
},
{
"epoch": 0.07647319527604217,
"grad_norm": 0.7765551209449768,
"learning_rate": 4.9044302312445144e-05,
"loss": 1.4067,
"step": 8800
},
{
"epoch": 0.07821122244140676,
"grad_norm": 0.9261316061019897,
"learning_rate": 4.9022576972878084e-05,
"loss": 1.4437,
"step": 9000
},
{
"epoch": 0.07994924960677136,
"grad_norm": 0.737016499042511,
"learning_rate": 4.900085163331103e-05,
"loss": 1.4394,
"step": 9200
},
{
"epoch": 0.08168727677213594,
"grad_norm": 1.0518062114715576,
"learning_rate": 4.897912629374397e-05,
"loss": 1.442,
"step": 9400
},
{
"epoch": 0.08342530393750054,
"grad_norm": 0.9163209795951843,
"learning_rate": 4.8957400954176916e-05,
"loss": 1.4126,
"step": 9600
},
{
"epoch": 0.08516333110286514,
"grad_norm": 1.1651362180709839,
"learning_rate": 4.893567561460986e-05,
"loss": 1.4397,
"step": 9800
},
{
"epoch": 0.08690135826822973,
"grad_norm": 1.2389508485794067,
"learning_rate": 4.89139502750428e-05,
"loss": 1.4226,
"step": 10000
},
{
"epoch": 0.08863938543359433,
"grad_norm": 1.009730339050293,
"learning_rate": 4.889222493547574e-05,
"loss": 1.4643,
"step": 10200
},
{
"epoch": 0.09037741259895893,
"grad_norm": 1.3371009826660156,
"learning_rate": 4.887049959590869e-05,
"loss": 1.4221,
"step": 10400
},
{
"epoch": 0.09211543976432351,
"grad_norm": 1.0338963270187378,
"learning_rate": 4.884877425634163e-05,
"loss": 1.4122,
"step": 10600
},
{
"epoch": 0.09385346692968811,
"grad_norm": 1.0023767948150635,
"learning_rate": 4.8827048916774574e-05,
"loss": 1.4034,
"step": 10800
},
{
"epoch": 0.0955914940950527,
"grad_norm": 1.4514521360397339,
"learning_rate": 4.880532357720751e-05,
"loss": 1.4356,
"step": 11000
},
{
"epoch": 0.0973295212604173,
"grad_norm": 1.0462247133255005,
"learning_rate": 4.878359823764045e-05,
"loss": 1.4038,
"step": 11200
},
{
"epoch": 0.0990675484257819,
"grad_norm": 1.0881024599075317,
"learning_rate": 4.87618728980734e-05,
"loss": 1.3521,
"step": 11400
},
{
"epoch": 0.10080557559114649,
"grad_norm": 1.1503826379776,
"learning_rate": 4.8740147558506345e-05,
"loss": 1.3455,
"step": 11600
},
{
"epoch": 0.10254360275651109,
"grad_norm": 1.1788356304168701,
"learning_rate": 4.8718422218939285e-05,
"loss": 1.4246,
"step": 11800
},
{
"epoch": 0.10428162992187567,
"grad_norm": 0.9009695649147034,
"learning_rate": 4.8696696879372225e-05,
"loss": 1.3701,
"step": 12000
},
{
"epoch": 0.10601965708724027,
"grad_norm": 0.7886667251586914,
"learning_rate": 4.867497153980517e-05,
"loss": 1.3843,
"step": 12200
},
{
"epoch": 0.10775768425260487,
"grad_norm": 1.0017770528793335,
"learning_rate": 4.865335482693595e-05,
"loss": 1.3785,
"step": 12400
},
{
"epoch": 0.10949571141796946,
"grad_norm": 0.901871383190155,
"learning_rate": 4.863162948736889e-05,
"loss": 1.3627,
"step": 12600
},
{
"epoch": 0.11123373858333406,
"grad_norm": 0.9240642189979553,
"learning_rate": 4.860990414780183e-05,
"loss": 1.3397,
"step": 12800
},
{
"epoch": 0.11297176574869865,
"grad_norm": 1.2550582885742188,
"learning_rate": 4.8588178808234776e-05,
"loss": 1.368,
"step": 13000
},
{
"epoch": 0.11470979291406325,
"grad_norm": 0.9313985705375671,
"learning_rate": 4.8566453468667715e-05,
"loss": 1.344,
"step": 13200
},
{
"epoch": 0.11644782007942785,
"grad_norm": 0.8634843826293945,
"learning_rate": 4.854472812910066e-05,
"loss": 1.3308,
"step": 13400
},
{
"epoch": 0.11818584724479243,
"grad_norm": 1.2060052156448364,
"learning_rate": 4.85230027895336e-05,
"loss": 1.355,
"step": 13600
},
{
"epoch": 0.11992387441015703,
"grad_norm": 1.0419443845748901,
"learning_rate": 4.850127744996655e-05,
"loss": 1.3469,
"step": 13800
},
{
"epoch": 0.12166190157552162,
"grad_norm": 1.2425956726074219,
"learning_rate": 4.847955211039949e-05,
"loss": 1.3368,
"step": 14000
},
{
"epoch": 0.12339992874088622,
"grad_norm": 1.0397825241088867,
"learning_rate": 4.8457826770832433e-05,
"loss": 1.3211,
"step": 14200
},
{
"epoch": 0.12513795590625082,
"grad_norm": 0.8406294584274292,
"learning_rate": 4.843621005796321e-05,
"loss": 1.3375,
"step": 14400
},
{
"epoch": 0.1268759830716154,
"grad_norm": 0.816184401512146,
"learning_rate": 4.841448471839615e-05,
"loss": 1.3351,
"step": 14600
},
{
"epoch": 0.12861401023698,
"grad_norm": 1.1904360055923462,
"learning_rate": 4.839275937882909e-05,
"loss": 1.3174,
"step": 14800
},
{
"epoch": 0.1303520374023446,
"grad_norm": 1.2890825271606445,
"learning_rate": 4.837103403926204e-05,
"loss": 1.3294,
"step": 15000
},
{
"epoch": 0.1320900645677092,
"grad_norm": 0.9586935639381409,
"learning_rate": 4.834930869969498e-05,
"loss": 1.2934,
"step": 15200
},
{
"epoch": 0.13382809173307378,
"grad_norm": 0.9654845595359802,
"learning_rate": 4.832758336012792e-05,
"loss": 1.3386,
"step": 15400
},
{
"epoch": 0.1355661188984384,
"grad_norm": 1.1789395809173584,
"learning_rate": 4.8305858020560864e-05,
"loss": 1.3499,
"step": 15600
},
{
"epoch": 0.13730414606380298,
"grad_norm": 1.2728456258773804,
"learning_rate": 4.82841326809938e-05,
"loss": 1.3396,
"step": 15800
},
{
"epoch": 0.13904217322916756,
"grad_norm": 1.0807838439941406,
"learning_rate": 4.826240734142675e-05,
"loss": 1.3369,
"step": 16000
},
{
"epoch": 0.14078020039453218,
"grad_norm": 1.11849045753479,
"learning_rate": 4.8240682001859696e-05,
"loss": 1.3664,
"step": 16200
},
{
"epoch": 0.14251822755989677,
"grad_norm": 1.5169202089309692,
"learning_rate": 4.821906528899047e-05,
"loss": 1.3352,
"step": 16400
},
{
"epoch": 0.14425625472526135,
"grad_norm": 0.8817140460014343,
"learning_rate": 4.819733994942341e-05,
"loss": 1.2924,
"step": 16600
},
{
"epoch": 0.14599428189062594,
"grad_norm": 1.1285990476608276,
"learning_rate": 4.8175614609856355e-05,
"loss": 1.3497,
"step": 16800
},
{
"epoch": 0.14773230905599055,
"grad_norm": 1.1072745323181152,
"learning_rate": 4.81538892702893e-05,
"loss": 1.3129,
"step": 17000
},
{
"epoch": 0.14947033622135514,
"grad_norm": 1.1911921501159668,
"learning_rate": 4.813216393072224e-05,
"loss": 1.312,
"step": 17200
},
{
"epoch": 0.15120836338671972,
"grad_norm": 0.7891075611114502,
"learning_rate": 4.811043859115518e-05,
"loss": 1.281,
"step": 17400
},
{
"epoch": 0.15294639055208434,
"grad_norm": 0.9016463756561279,
"learning_rate": 4.8088713251588126e-05,
"loss": 1.3118,
"step": 17600
},
{
"epoch": 0.15468441771744892,
"grad_norm": 1.1260063648223877,
"learning_rate": 4.8066987912021066e-05,
"loss": 1.2743,
"step": 17800
},
{
"epoch": 0.1564224448828135,
"grad_norm": 1.0370497703552246,
"learning_rate": 4.8045262572454005e-05,
"loss": 1.3013,
"step": 18000
},
{
"epoch": 0.15816047204817812,
"grad_norm": 1.4182652235031128,
"learning_rate": 4.802353723288695e-05,
"loss": 1.2994,
"step": 18200
},
{
"epoch": 0.1598984992135427,
"grad_norm": 1.1322426795959473,
"learning_rate": 4.800192052001773e-05,
"loss": 1.3339,
"step": 18400
},
{
"epoch": 0.1616365263789073,
"grad_norm": 1.4774497747421265,
"learning_rate": 4.798019518045067e-05,
"loss": 1.3381,
"step": 18600
},
{
"epoch": 0.16337455354427188,
"grad_norm": 1.3371450901031494,
"learning_rate": 4.795846984088361e-05,
"loss": 1.304,
"step": 18800
},
{
"epoch": 0.1651125807096365,
"grad_norm": 0.8607128858566284,
"learning_rate": 4.793674450131656e-05,
"loss": 1.2686,
"step": 19000
},
{
"epoch": 0.16685060787500108,
"grad_norm": 1.1792031526565552,
"learning_rate": 4.79150191617495e-05,
"loss": 1.3099,
"step": 19200
},
{
"epoch": 0.16858863504036567,
"grad_norm": 1.274556040763855,
"learning_rate": 4.789329382218244e-05,
"loss": 1.2745,
"step": 19400
},
{
"epoch": 0.17032666220573028,
"grad_norm": 0.7774292230606079,
"learning_rate": 4.787156848261539e-05,
"loss": 1.2905,
"step": 19600
},
{
"epoch": 0.17206468937109487,
"grad_norm": 1.204541802406311,
"learning_rate": 4.784984314304833e-05,
"loss": 1.3014,
"step": 19800
},
{
"epoch": 0.17380271653645946,
"grad_norm": 0.9959656000137329,
"learning_rate": 4.782811780348127e-05,
"loss": 1.2798,
"step": 20000
}
],
"logging_steps": 200,
"max_steps": 460292,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.178779779072e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}