|
{ |
|
"best_metric": 1.1020798683166504, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.06390593047034765, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0001278118609406953, |
|
"eval_loss": 1.55583918094635, |
|
"eval_runtime": 164.9281, |
|
"eval_samples_per_second": 9.992, |
|
"eval_steps_per_second": 2.498, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001278118609406953, |
|
"grad_norm": 0.8316797614097595, |
|
"learning_rate": 4.12e-05, |
|
"loss": 2.8146, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.002556237218813906, |
|
"grad_norm": 1.5763788223266602, |
|
"learning_rate": 8.24e-05, |
|
"loss": 2.7539, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.003834355828220859, |
|
"grad_norm": 1.363692283630371, |
|
"learning_rate": 0.0001236, |
|
"loss": 2.5238, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.005112474437627812, |
|
"grad_norm": 1.2749073505401611, |
|
"learning_rate": 0.0001648, |
|
"loss": 2.4211, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.006390593047034765, |
|
"grad_norm": 1.517897129058838, |
|
"learning_rate": 0.000206, |
|
"loss": 2.464, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.006390593047034765, |
|
"eval_loss": 1.184857964515686, |
|
"eval_runtime": 164.9457, |
|
"eval_samples_per_second": 9.991, |
|
"eval_steps_per_second": 2.498, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007668711656441718, |
|
"grad_norm": 1.6017893552780151, |
|
"learning_rate": 0.0002057490971767619, |
|
"loss": 2.5335, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00894683026584867, |
|
"grad_norm": 1.576207160949707, |
|
"learning_rate": 0.00020499761108038175, |
|
"loss": 2.3625, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.010224948875255624, |
|
"grad_norm": 1.646509051322937, |
|
"learning_rate": 0.00020374920287558198, |
|
"loss": 2.2446, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.011503067484662576, |
|
"grad_norm": 1.8106887340545654, |
|
"learning_rate": 0.00020200995468164684, |
|
"loss": 2.0361, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01278118609406953, |
|
"grad_norm": 5.966978073120117, |
|
"learning_rate": 0.00019978833994094855, |
|
"loss": 1.7612, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01278118609406953, |
|
"eval_loss": 1.323485255241394, |
|
"eval_runtime": 164.4974, |
|
"eval_samples_per_second": 10.018, |
|
"eval_steps_per_second": 2.505, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.014059304703476482, |
|
"grad_norm": 0.9883098006248474, |
|
"learning_rate": 0.00019709518213718787, |
|
"loss": 2.5625, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.015337423312883436, |
|
"grad_norm": 0.9952566027641296, |
|
"learning_rate": 0.00019394360206446948, |
|
"loss": 2.5723, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.016615541922290387, |
|
"grad_norm": 1.1392284631729126, |
|
"learning_rate": 0.00019034895390411186, |
|
"loss": 2.487, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.01789366053169734, |
|
"grad_norm": 1.1261663436889648, |
|
"learning_rate": 0.0001863287504206196, |
|
"loss": 2.4156, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.019171779141104295, |
|
"grad_norm": 1.3464722633361816, |
|
"learning_rate": 0.00018190257764125471, |
|
"loss": 2.4932, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.019171779141104295, |
|
"eval_loss": 1.177261471748352, |
|
"eval_runtime": 164.3678, |
|
"eval_samples_per_second": 10.026, |
|
"eval_steps_per_second": 2.507, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02044989775051125, |
|
"grad_norm": 1.7977056503295898, |
|
"learning_rate": 0.00017709199943488106, |
|
"loss": 2.3259, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0217280163599182, |
|
"grad_norm": 1.4776368141174316, |
|
"learning_rate": 0.00017192045245496238, |
|
"loss": 2.3461, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.023006134969325152, |
|
"grad_norm": 1.747175931930542, |
|
"learning_rate": 0.00016641313195854277, |
|
"loss": 2.3122, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.024284253578732106, |
|
"grad_norm": 1.9792368412017822, |
|
"learning_rate": 0.0001605968690574869, |
|
"loss": 2.0044, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02556237218813906, |
|
"grad_norm": 1.2479664087295532, |
|
"learning_rate": 0.0001545, |
|
"loss": 1.6114, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02556237218813906, |
|
"eval_loss": 1.4369239807128906, |
|
"eval_runtime": 164.2247, |
|
"eval_samples_per_second": 10.035, |
|
"eval_steps_per_second": 2.509, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.026840490797546013, |
|
"grad_norm": 0.9227780103683472, |
|
"learning_rate": 0.00014815222811927496, |
|
"loss": 2.6329, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.028118609406952964, |
|
"grad_norm": 1.103001356124878, |
|
"learning_rate": 0.00014158447912183896, |
|
"loss": 2.5218, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.029396728016359917, |
|
"grad_norm": 1.1868301630020142, |
|
"learning_rate": 0.00013482875042061958, |
|
"loss": 2.3236, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03067484662576687, |
|
"grad_norm": 1.4458407163619995, |
|
"learning_rate": 0.00012791795524676576, |
|
"loss": 2.4963, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.031952965235173825, |
|
"grad_norm": 1.193825602531433, |
|
"learning_rate": 0.00012088576229969385, |
|
"loss": 2.2952, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.031952965235173825, |
|
"eval_loss": 1.1527986526489258, |
|
"eval_runtime": 164.1739, |
|
"eval_samples_per_second": 10.038, |
|
"eval_steps_per_second": 2.51, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.033231083844580775, |
|
"grad_norm": 1.4545817375183105, |
|
"learning_rate": 0.0001137664317165683, |
|
"loss": 2.2958, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03450920245398773, |
|
"grad_norm": 1.3638569116592407, |
|
"learning_rate": 0.00010659464816035761, |
|
"loss": 2.2994, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03578732106339468, |
|
"grad_norm": 1.6373779773712158, |
|
"learning_rate": 9.940535183964242e-05, |
|
"loss": 2.0054, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03706543967280164, |
|
"grad_norm": 1.795109510421753, |
|
"learning_rate": 9.22335682834317e-05, |
|
"loss": 1.9559, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03834355828220859, |
|
"grad_norm": 3.544909954071045, |
|
"learning_rate": 8.511423770030617e-05, |
|
"loss": 1.7323, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03834355828220859, |
|
"eval_loss": 1.2193244695663452, |
|
"eval_runtime": 164.1641, |
|
"eval_samples_per_second": 10.039, |
|
"eval_steps_per_second": 2.51, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03962167689161554, |
|
"grad_norm": 0.8290108442306519, |
|
"learning_rate": 7.808204475323423e-05, |
|
"loss": 2.3842, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0408997955010225, |
|
"grad_norm": 0.8988869786262512, |
|
"learning_rate": 7.117124957938042e-05, |
|
"loss": 2.4288, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.04217791411042945, |
|
"grad_norm": 1.1712989807128906, |
|
"learning_rate": 6.441552087816105e-05, |
|
"loss": 2.3214, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0434560327198364, |
|
"grad_norm": 1.1696773767471313, |
|
"learning_rate": 5.784777188072502e-05, |
|
"loss": 2.4251, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.044734151329243355, |
|
"grad_norm": 1.3218914270401, |
|
"learning_rate": 5.150000000000002e-05, |
|
"loss": 2.2685, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.044734151329243355, |
|
"eval_loss": 1.1176284551620483, |
|
"eval_runtime": 164.1946, |
|
"eval_samples_per_second": 10.037, |
|
"eval_steps_per_second": 2.509, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.046012269938650305, |
|
"grad_norm": 1.2911388874053955, |
|
"learning_rate": 4.540313094251309e-05, |
|
"loss": 2.3459, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.04729038854805726, |
|
"grad_norm": 1.5387221574783325, |
|
"learning_rate": 3.958686804145719e-05, |
|
"loss": 2.2075, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04856850715746421, |
|
"grad_norm": 1.4516918659210205, |
|
"learning_rate": 3.4079547545037634e-05, |
|
"loss": 2.157, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04984662576687116, |
|
"grad_norm": 1.8935630321502686, |
|
"learning_rate": 2.8908000565118947e-05, |
|
"loss": 1.9017, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.05112474437627812, |
|
"grad_norm": 4.446241855621338, |
|
"learning_rate": 2.4097422358745275e-05, |
|
"loss": 1.6619, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05112474437627812, |
|
"eval_loss": 1.13141667842865, |
|
"eval_runtime": 164.2729, |
|
"eval_samples_per_second": 10.032, |
|
"eval_steps_per_second": 2.508, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05240286298568507, |
|
"grad_norm": 1.162409782409668, |
|
"learning_rate": 1.9671249579380422e-05, |
|
"loss": 2.313, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.05368098159509203, |
|
"grad_norm": 1.0322766304016113, |
|
"learning_rate": 1.5651046095888127e-05, |
|
"loss": 2.564, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.05495910020449898, |
|
"grad_norm": 0.9610581994056702, |
|
"learning_rate": 1.205639793553052e-05, |
|
"loss": 2.3624, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05623721881390593, |
|
"grad_norm": 1.0807439088821411, |
|
"learning_rate": 8.904817862812098e-06, |
|
"loss": 2.409, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.057515337423312884, |
|
"grad_norm": 1.0476551055908203, |
|
"learning_rate": 6.211660059051443e-06, |
|
"loss": 2.236, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.057515337423312884, |
|
"eval_loss": 1.1031837463378906, |
|
"eval_runtime": 164.1828, |
|
"eval_samples_per_second": 10.038, |
|
"eval_steps_per_second": 2.509, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.058793456032719835, |
|
"grad_norm": 1.250786542892456, |
|
"learning_rate": 3.990045318353154e-06, |
|
"loss": 2.3246, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.06007157464212679, |
|
"grad_norm": 1.4708008766174316, |
|
"learning_rate": 2.250797124418014e-06, |
|
"loss": 2.15, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.06134969325153374, |
|
"grad_norm": 1.5800373554229736, |
|
"learning_rate": 1.0023889196182526e-06, |
|
"loss": 2.0596, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.0626278118609407, |
|
"grad_norm": 1.864892601966858, |
|
"learning_rate": 2.5090282323810766e-07, |
|
"loss": 1.793, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06390593047034765, |
|
"grad_norm": 3.8768982887268066, |
|
"learning_rate": 0.0, |
|
"loss": 1.6742, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06390593047034765, |
|
"eval_loss": 1.1020798683166504, |
|
"eval_runtime": 164.1474, |
|
"eval_samples_per_second": 10.04, |
|
"eval_steps_per_second": 2.51, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.074298032914432e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|