|
{ |
|
"best_metric": 1.5615293979644775, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.009793072380597964, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.958614476119593e-05, |
|
"eval_loss": 2.541055679321289, |
|
"eval_runtime": 808.5156, |
|
"eval_samples_per_second": 26.589, |
|
"eval_steps_per_second": 6.648, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0001958614476119593, |
|
"grad_norm": 0.42259156703948975, |
|
"learning_rate": 4.12e-05, |
|
"loss": 2.4447, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0003917228952239186, |
|
"grad_norm": 0.4474017918109894, |
|
"learning_rate": 8.24e-05, |
|
"loss": 2.3596, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0005875843428358779, |
|
"grad_norm": 0.5387372970581055, |
|
"learning_rate": 0.0001236, |
|
"loss": 2.2655, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0007834457904478372, |
|
"grad_norm": 0.6558287739753723, |
|
"learning_rate": 0.0001648, |
|
"loss": 2.1072, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0009793072380597965, |
|
"grad_norm": 0.790547788143158, |
|
"learning_rate": 0.000206, |
|
"loss": 2.031, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0009793072380597965, |
|
"eval_loss": 2.0144009590148926, |
|
"eval_runtime": 811.9056, |
|
"eval_samples_per_second": 26.478, |
|
"eval_steps_per_second": 6.62, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0011751686856717558, |
|
"grad_norm": 0.474514901638031, |
|
"learning_rate": 0.0002057490971767619, |
|
"loss": 1.9995, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.001371030133283715, |
|
"grad_norm": 0.48066890239715576, |
|
"learning_rate": 0.00020499761108038175, |
|
"loss": 1.9568, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0015668915808956744, |
|
"grad_norm": 0.5162322521209717, |
|
"learning_rate": 0.00020374920287558198, |
|
"loss": 1.8879, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0017627530285076337, |
|
"grad_norm": 0.5203802585601807, |
|
"learning_rate": 0.00020200995468164684, |
|
"loss": 1.8786, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.001958614476119593, |
|
"grad_norm": 0.6834505796432495, |
|
"learning_rate": 0.00019978833994094855, |
|
"loss": 1.8963, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.001958614476119593, |
|
"eval_loss": 1.8420443534851074, |
|
"eval_runtime": 802.6412, |
|
"eval_samples_per_second": 26.784, |
|
"eval_steps_per_second": 6.697, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0021544759237315525, |
|
"grad_norm": 0.4236699044704437, |
|
"learning_rate": 0.00019709518213718787, |
|
"loss": 1.8609, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0023503373713435116, |
|
"grad_norm": 0.47379371523857117, |
|
"learning_rate": 0.00019394360206446948, |
|
"loss": 1.8112, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.002546198818955471, |
|
"grad_norm": 0.48835086822509766, |
|
"learning_rate": 0.00019034895390411186, |
|
"loss": 1.7648, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00274206026656743, |
|
"grad_norm": 0.5578092336654663, |
|
"learning_rate": 0.0001863287504206196, |
|
"loss": 1.8491, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0029379217141793897, |
|
"grad_norm": 0.7225767374038696, |
|
"learning_rate": 0.00018190257764125471, |
|
"loss": 1.7379, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0029379217141793897, |
|
"eval_loss": 1.766533374786377, |
|
"eval_runtime": 808.4925, |
|
"eval_samples_per_second": 26.59, |
|
"eval_steps_per_second": 6.648, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0031337831617913488, |
|
"grad_norm": 0.4122462868690491, |
|
"learning_rate": 0.00017709199943488106, |
|
"loss": 1.8591, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0033296446094033083, |
|
"grad_norm": 0.45456036925315857, |
|
"learning_rate": 0.00017192045245496238, |
|
"loss": 1.7107, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0035255060570152674, |
|
"grad_norm": 0.4639202356338501, |
|
"learning_rate": 0.00016641313195854277, |
|
"loss": 1.6736, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.003721367504627227, |
|
"grad_norm": 0.551369309425354, |
|
"learning_rate": 0.0001605968690574869, |
|
"loss": 1.7199, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.003917228952239186, |
|
"grad_norm": 0.6994920969009399, |
|
"learning_rate": 0.0001545, |
|
"loss": 1.6386, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.003917228952239186, |
|
"eval_loss": 1.7132164239883423, |
|
"eval_runtime": 815.4936, |
|
"eval_samples_per_second": 26.362, |
|
"eval_steps_per_second": 6.591, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.004113090399851145, |
|
"grad_norm": 0.43010979890823364, |
|
"learning_rate": 0.00014815222811927496, |
|
"loss": 1.7888, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.004308951847463105, |
|
"grad_norm": 0.4258266091346741, |
|
"learning_rate": 0.00014158447912183896, |
|
"loss": 1.6298, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.004504813295075064, |
|
"grad_norm": 0.47796645760536194, |
|
"learning_rate": 0.00013482875042061958, |
|
"loss": 1.6301, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.004700674742687023, |
|
"grad_norm": 0.5105708241462708, |
|
"learning_rate": 0.00012791795524676576, |
|
"loss": 1.6413, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.004896536190298982, |
|
"grad_norm": 0.6835204362869263, |
|
"learning_rate": 0.00012088576229969385, |
|
"loss": 1.6857, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.004896536190298982, |
|
"eval_loss": 1.6581625938415527, |
|
"eval_runtime": 808.8687, |
|
"eval_samples_per_second": 26.578, |
|
"eval_steps_per_second": 6.645, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.005092397637910942, |
|
"grad_norm": 0.4199804961681366, |
|
"learning_rate": 0.0001137664317165683, |
|
"loss": 1.786, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.005288259085522901, |
|
"grad_norm": 0.46537598967552185, |
|
"learning_rate": 0.00010659464816035761, |
|
"loss": 1.6461, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.00548412053313486, |
|
"grad_norm": 0.5051584243774414, |
|
"learning_rate": 9.940535183964242e-05, |
|
"loss": 1.5999, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0056799819807468194, |
|
"grad_norm": 0.5738089680671692, |
|
"learning_rate": 9.22335682834317e-05, |
|
"loss": 1.6069, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.005875843428358779, |
|
"grad_norm": 0.6677771210670471, |
|
"learning_rate": 8.511423770030617e-05, |
|
"loss": 1.6102, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.005875843428358779, |
|
"eval_loss": 1.621539831161499, |
|
"eval_runtime": 849.3992, |
|
"eval_samples_per_second": 25.31, |
|
"eval_steps_per_second": 6.328, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0060717048759707385, |
|
"grad_norm": 0.42615145444869995, |
|
"learning_rate": 7.808204475323423e-05, |
|
"loss": 1.682, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0062675663235826976, |
|
"grad_norm": 0.4762561321258545, |
|
"learning_rate": 7.117124957938042e-05, |
|
"loss": 1.6215, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.006463427771194657, |
|
"grad_norm": 0.5104791522026062, |
|
"learning_rate": 6.441552087816105e-05, |
|
"loss": 1.5619, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.006659289218806617, |
|
"grad_norm": 0.575524091720581, |
|
"learning_rate": 5.784777188072502e-05, |
|
"loss": 1.6038, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.006855150666418576, |
|
"grad_norm": 0.6795148849487305, |
|
"learning_rate": 5.150000000000002e-05, |
|
"loss": 1.6331, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.006855150666418576, |
|
"eval_loss": 1.5960947275161743, |
|
"eval_runtime": 807.8208, |
|
"eval_samples_per_second": 26.612, |
|
"eval_steps_per_second": 6.654, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.007051012114030535, |
|
"grad_norm": 0.4098166823387146, |
|
"learning_rate": 4.540313094251309e-05, |
|
"loss": 1.6492, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.007246873561642494, |
|
"grad_norm": 0.47631141543388367, |
|
"learning_rate": 3.958686804145719e-05, |
|
"loss": 1.5605, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.007442735009254454, |
|
"grad_norm": 0.5455722808837891, |
|
"learning_rate": 3.4079547545037634e-05, |
|
"loss": 1.5975, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.007638596456866413, |
|
"grad_norm": 0.5428374409675598, |
|
"learning_rate": 2.8908000565118947e-05, |
|
"loss": 1.5256, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.007834457904478372, |
|
"grad_norm": 0.7749019265174866, |
|
"learning_rate": 2.4097422358745275e-05, |
|
"loss": 1.5731, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.007834457904478372, |
|
"eval_loss": 1.5720521211624146, |
|
"eval_runtime": 805.3716, |
|
"eval_samples_per_second": 26.693, |
|
"eval_steps_per_second": 6.674, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.008030319352090332, |
|
"grad_norm": 0.4348071813583374, |
|
"learning_rate": 1.9671249579380422e-05, |
|
"loss": 1.6855, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00822618079970229, |
|
"grad_norm": 0.5242032408714294, |
|
"learning_rate": 1.5651046095888127e-05, |
|
"loss": 1.6195, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.00842204224731425, |
|
"grad_norm": 0.5005368590354919, |
|
"learning_rate": 1.205639793553052e-05, |
|
"loss": 1.569, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.00861790369492621, |
|
"grad_norm": 0.5892754793167114, |
|
"learning_rate": 8.904817862812098e-06, |
|
"loss": 1.5471, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.008813765142538168, |
|
"grad_norm": 0.715459406375885, |
|
"learning_rate": 6.211660059051443e-06, |
|
"loss": 1.5927, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.008813765142538168, |
|
"eval_loss": 1.5629593133926392, |
|
"eval_runtime": 809.2741, |
|
"eval_samples_per_second": 26.565, |
|
"eval_steps_per_second": 6.642, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.009009626590150128, |
|
"grad_norm": 0.4325626492500305, |
|
"learning_rate": 3.990045318353154e-06, |
|
"loss": 1.6155, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.009205488037762086, |
|
"grad_norm": 0.4722021222114563, |
|
"learning_rate": 2.250797124418014e-06, |
|
"loss": 1.5226, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.009401349485374046, |
|
"grad_norm": 0.49364137649536133, |
|
"learning_rate": 1.0023889196182526e-06, |
|
"loss": 1.5093, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.009597210932986006, |
|
"grad_norm": 0.5476292967796326, |
|
"learning_rate": 2.5090282323810766e-07, |
|
"loss": 1.5315, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.009793072380597964, |
|
"grad_norm": 0.8033114075660706, |
|
"learning_rate": 0.0, |
|
"loss": 1.574, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.009793072380597964, |
|
"eval_loss": 1.5615293979644775, |
|
"eval_runtime": 809.085, |
|
"eval_samples_per_second": 26.571, |
|
"eval_steps_per_second": 6.643, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.2862904541773824e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|