{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9161704076958315,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01832340815391663,
      "grad_norm": 0.06011037901043892,
      "learning_rate": 4e-05,
      "loss": 1.296,
      "step": 10
    },
    {
      "epoch": 0.03664681630783326,
      "grad_norm": 0.05856110155582428,
      "learning_rate": 8e-05,
      "loss": 1.3316,
      "step": 20
    },
    {
      "epoch": 0.054970224461749886,
      "grad_norm": 0.0607464499771595,
      "learning_rate": 0.00012,
      "loss": 1.2794,
      "step": 30
    },
    {
      "epoch": 0.07329363261566652,
      "grad_norm": 0.06632011383771896,
      "learning_rate": 0.00016,
      "loss": 1.3129,
      "step": 40
    },
    {
      "epoch": 0.09161704076958314,
      "grad_norm": 0.06631691753864288,
      "learning_rate": 0.0002,
      "loss": 1.2741,
      "step": 50
    },
    {
      "epoch": 0.10994044892349977,
      "grad_norm": 0.056466877460479736,
      "learning_rate": 0.00019998035748930052,
      "loss": 1.2717,
      "step": 60
    },
    {
      "epoch": 0.1282638570774164,
      "grad_norm": 0.05860245227813721,
      "learning_rate": 0.00019992143767376668,
      "loss": 1.2091,
      "step": 70
    },
    {
      "epoch": 0.14658726523133303,
      "grad_norm": 0.06553175300359726,
      "learning_rate": 0.00019982326370006058,
      "loss": 1.1926,
      "step": 80
    },
    {
      "epoch": 0.16491067338524965,
      "grad_norm": 0.07061401754617691,
      "learning_rate": 0.00019968587413584876,
      "loss": 1.1767,
      "step": 90
    },
    {
      "epoch": 0.1832340815391663,
      "grad_norm": 0.07183243334293365,
      "learning_rate": 0.000199509322954651,
      "loss": 1.1183,
      "step": 100
    },
    {
      "epoch": 0.2015574896930829,
      "grad_norm": 0.06944898515939713,
      "learning_rate": 0.00019929367951463655,
      "loss": 1.0868,
      "step": 110
    },
    {
      "epoch": 0.21988089784699955,
      "grad_norm": 0.06642703711986542,
      "learning_rate": 0.00019903902853137703,
      "loss": 1.048,
      "step": 120
    },
    {
      "epoch": 0.23820430600091616,
      "grad_norm": 0.06603793054819107,
      "learning_rate": 0.00019874547004456562,
      "loss": 1.0195,
      "step": 130
    },
    {
      "epoch": 0.2565277141548328,
      "grad_norm": 0.06488285213708878,
      "learning_rate": 0.00019841311937871675,
      "loss": 1.0014,
      "step": 140
    },
    {
      "epoch": 0.2748511223087494,
      "grad_norm": 0.05940372124314308,
      "learning_rate": 0.0001980421070978606,
      "loss": 0.9943,
      "step": 150
    },
    {
      "epoch": 0.29317453046266606,
      "grad_norm": 0.059967171400785446,
      "learning_rate": 0.00019763257895425113,
      "loss": 0.9349,
      "step": 160
    },
    {
      "epoch": 0.3114979386165827,
      "grad_norm": 0.0554397851228714,
      "learning_rate": 0.0001971846958311071,
      "loss": 0.9045,
      "step": 170
    },
    {
      "epoch": 0.3298213467704993,
      "grad_norm": 0.055131904780864716,
      "learning_rate": 0.00019669863367940935,
      "loss": 0.8799,
      "step": 180
    },
    {
      "epoch": 0.34814475492441593,
      "grad_norm": 0.04358826205134392,
      "learning_rate": 0.00019617458344877816,
      "loss": 0.8504,
      "step": 190
    },
    {
      "epoch": 0.3664681630783326,
      "grad_norm": 0.04535752162337303,
      "learning_rate": 0.00019561275101245883,
      "loss": 0.828,
      "step": 200
    },
    {
      "epoch": 0.3847915712322492,
      "grad_norm": 0.04672062397003174,
      "learning_rate": 0.00019501335708644414,
      "loss": 0.8114,
      "step": 210
    },
    {
      "epoch": 0.4031149793861658,
      "grad_norm": 0.04161343351006508,
      "learning_rate": 0.00019437663714276618,
      "loss": 0.846,
      "step": 220
    },
    {
      "epoch": 0.42143838754008245,
      "grad_norm": 0.03887801244854927,
      "learning_rate": 0.0001937028413169911,
      "loss": 0.7911,
      "step": 230
    },
    {
      "epoch": 0.4397617956939991,
      "grad_norm": 0.03659196197986603,
      "learning_rate": 0.00019299223430995323,
      "loss": 0.7669,
      "step": 240
    },
    {
      "epoch": 0.45808520384791573,
      "grad_norm": 0.03447382524609566,
      "learning_rate": 0.00019224509528376738,
      "loss": 0.782,
      "step": 250
    },
    {
      "epoch": 0.4764086120018323,
      "grad_norm": 0.028725607320666313,
      "learning_rate": 0.00019146171775215982,
      "loss": 0.7183,
      "step": 260
    },
    {
      "epoch": 0.49473202015574896,
      "grad_norm": 0.027673941105604172,
      "learning_rate": 0.0001906424094651615,
      "loss": 0.7018,
      "step": 270
    },
    {
      "epoch": 0.5130554283096656,
      "grad_norm": 0.10227353870868683,
      "learning_rate": 0.00018978749228820826,
      "loss": 0.72,
      "step": 280
    },
    {
      "epoch": 0.5313788364635822,
      "grad_norm": 0.022650673985481262,
      "learning_rate": 0.00018889730207569607,
      "loss": 0.6936,
      "step": 290
    },
    {
      "epoch": 0.5497022446174988,
      "grad_norm": 0.023469725623726845,
      "learning_rate": 0.00018797218853904037,
      "loss": 0.6765,
      "step": 300
    },
    {
      "epoch": 0.5680256527714155,
      "grad_norm": 0.018101360648870468,
      "learning_rate": 0.000187012515109292,
      "loss": 0.6799,
      "step": 310
    },
    {
      "epoch": 0.5863490609253321,
      "grad_norm": 0.016794538125395775,
      "learning_rate": 0.00018601865879436317,
      "loss": 0.6732,
      "step": 320
    },
    {
      "epoch": 0.6046724690792488,
      "grad_norm": 0.017263714224100113,
      "learning_rate": 0.00018499101003091993,
      "loss": 0.6695,
      "step": 330
    },
    {
      "epoch": 0.6229958772331654,
      "grad_norm": 0.016381224617362022,
      "learning_rate": 0.0001839299725309989,
      "loss": 0.6928,
      "step": 340
    },
    {
      "epoch": 0.641319285387082,
      "grad_norm": 0.015325487591326237,
      "learning_rate": 0.00018283596312340891,
      "loss": 0.6622,
      "step": 350
    },
    {
      "epoch": 0.6596426935409986,
      "grad_norm": 0.014056784100830555,
      "learning_rate": 0.0001817094115899799,
      "loss": 0.7612,
      "step": 360
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.015031951479613781,
      "learning_rate": 0.00018055076049672283,
      "loss": 0.6596,
      "step": 370
    },
    {
      "epoch": 0.6962895098488319,
      "grad_norm": 0.01640532910823822,
      "learning_rate": 0.00017936046501996762,
      "loss": 0.6837,
      "step": 380
    },
    {
      "epoch": 0.7146129180027485,
      "grad_norm": 0.01830482669174671,
      "learning_rate": 0.000178138992767547,
      "loss": 0.6812,
      "step": 390
    },
    {
      "epoch": 0.7329363261566652,
      "grad_norm": 0.0472831092774868,
      "learning_rate": 0.00017688682359509678,
      "loss": 0.674,
      "step": 400
    },
    {
      "epoch": 0.7512597343105818,
      "grad_norm": 0.012456170283257961,
      "learning_rate": 0.00017560444941754427,
      "loss": 0.6518,
      "step": 410
    },
    {
      "epoch": 0.7695831424644984,
      "grad_norm": 0.01401186641305685,
      "learning_rate": 0.0001742923740158595,
      "loss": 0.6418,
      "step": 420
    },
    {
      "epoch": 0.7879065506184151,
      "grad_norm": 0.015530922450125217,
      "learning_rate": 0.00017295111283914487,
      "loss": 0.6465,
      "step": 430
    },
    {
      "epoch": 0.8062299587723316,
      "grad_norm": 0.01402275450527668,
      "learning_rate": 0.0001715811928021406,
      "loss": 0.6642,
      "step": 440
    },
    {
      "epoch": 0.8245533669262483,
      "grad_norm": 0.01176263578236103,
      "learning_rate": 0.0001701831520782264,
      "loss": 0.6336,
      "step": 450
    },
    {
      "epoch": 0.8428767750801649,
      "grad_norm": 0.013003438711166382,
      "learning_rate": 0.00016875753988799982,
      "loss": 0.6469,
      "step": 460
    },
    {
      "epoch": 0.8612001832340815,
      "grad_norm": 0.011523702181875706,
      "learning_rate": 0.00016730491628351487,
      "loss": 0.6434,
      "step": 470
    },
    {
      "epoch": 0.8795235913879982,
      "grad_norm": 0.011919384822249413,
      "learning_rate": 0.00016582585192826543,
      "loss": 0.6588,
      "step": 480
    },
    {
      "epoch": 0.8978469995419148,
      "grad_norm": 0.013994649983942509,
      "learning_rate": 0.00016432092787299992,
      "loss": 0.6315,
      "step": 490
    },
    {
      "epoch": 0.9161704076958315,
      "grad_norm": 0.013580686412751675,
      "learning_rate": 0.00016279073532745553,
      "loss": 0.6782,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 1635,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.6890178748416e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}