{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 828,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.036231884057971016,
"grad_norm": 10.94329153798689,
"learning_rate": 5e-06,
"loss": 1.0543,
"step": 10
},
{
"epoch": 0.07246376811594203,
"grad_norm": 6.69746195176836,
"learning_rate": 5e-06,
"loss": 0.9358,
"step": 20
},
{
"epoch": 0.10869565217391304,
"grad_norm": 1.4693988083101803,
"learning_rate": 5e-06,
"loss": 0.8947,
"step": 30
},
{
"epoch": 0.14492753623188406,
"grad_norm": 1.1956268436230149,
"learning_rate": 5e-06,
"loss": 0.8568,
"step": 40
},
{
"epoch": 0.18115942028985507,
"grad_norm": 1.161881545591458,
"learning_rate": 5e-06,
"loss": 0.8308,
"step": 50
},
{
"epoch": 0.21739130434782608,
"grad_norm": 1.0465718565340598,
"learning_rate": 5e-06,
"loss": 0.8168,
"step": 60
},
{
"epoch": 0.2536231884057971,
"grad_norm": 0.8500018086045097,
"learning_rate": 5e-06,
"loss": 0.8027,
"step": 70
},
{
"epoch": 0.2898550724637681,
"grad_norm": 0.7729963066093143,
"learning_rate": 5e-06,
"loss": 0.7872,
"step": 80
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.9101222071301408,
"learning_rate": 5e-06,
"loss": 0.7819,
"step": 90
},
{
"epoch": 0.36231884057971014,
"grad_norm": 1.0288861721148082,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 100
},
{
"epoch": 0.39855072463768115,
"grad_norm": 0.8778862523158388,
"learning_rate": 5e-06,
"loss": 0.7745,
"step": 110
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.0273017034313925,
"learning_rate": 5e-06,
"loss": 0.7723,
"step": 120
},
{
"epoch": 0.47101449275362317,
"grad_norm": 1.1202477015356398,
"learning_rate": 5e-06,
"loss": 0.7689,
"step": 130
},
{
"epoch": 0.5072463768115942,
"grad_norm": 1.1947838008092264,
"learning_rate": 5e-06,
"loss": 0.7588,
"step": 140
},
{
"epoch": 0.5434782608695652,
"grad_norm": 0.9801347289517877,
"learning_rate": 5e-06,
"loss": 0.7641,
"step": 150
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.6049825201268222,
"learning_rate": 5e-06,
"loss": 0.7579,
"step": 160
},
{
"epoch": 0.6159420289855072,
"grad_norm": 0.7687560634971528,
"learning_rate": 5e-06,
"loss": 0.7595,
"step": 170
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.7372634515268928,
"learning_rate": 5e-06,
"loss": 0.7536,
"step": 180
},
{
"epoch": 0.6884057971014492,
"grad_norm": 0.7700572406910111,
"learning_rate": 5e-06,
"loss": 0.7531,
"step": 190
},
{
"epoch": 0.7246376811594203,
"grad_norm": 0.6355090382849566,
"learning_rate": 5e-06,
"loss": 0.7521,
"step": 200
},
{
"epoch": 0.7608695652173914,
"grad_norm": 0.6549502688413438,
"learning_rate": 5e-06,
"loss": 0.7503,
"step": 210
},
{
"epoch": 0.7971014492753623,
"grad_norm": 0.582664220792735,
"learning_rate": 5e-06,
"loss": 0.7551,
"step": 220
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.6407186366739495,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 230
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.6331891386534093,
"learning_rate": 5e-06,
"loss": 0.744,
"step": 240
},
{
"epoch": 0.9057971014492754,
"grad_norm": 0.7299118855941761,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 250
},
{
"epoch": 0.9420289855072463,
"grad_norm": 0.567880243301988,
"learning_rate": 5e-06,
"loss": 0.7458,
"step": 260
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.6907081447217029,
"learning_rate": 5e-06,
"loss": 0.7358,
"step": 270
},
{
"epoch": 1.0,
"eval_loss": 0.7438245415687561,
"eval_runtime": 26.677,
"eval_samples_per_second": 278.03,
"eval_steps_per_second": 1.087,
"step": 276
},
{
"epoch": 1.0144927536231885,
"grad_norm": 0.8940503757064469,
"learning_rate": 5e-06,
"loss": 0.7253,
"step": 280
},
{
"epoch": 1.0507246376811594,
"grad_norm": 0.7439488715557626,
"learning_rate": 5e-06,
"loss": 0.6908,
"step": 290
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.6352130108582714,
"learning_rate": 5e-06,
"loss": 0.6918,
"step": 300
},
{
"epoch": 1.1231884057971016,
"grad_norm": 0.6093227815341429,
"learning_rate": 5e-06,
"loss": 0.6951,
"step": 310
},
{
"epoch": 1.1594202898550725,
"grad_norm": 0.648343758224349,
"learning_rate": 5e-06,
"loss": 0.6853,
"step": 320
},
{
"epoch": 1.1956521739130435,
"grad_norm": 0.7170626934482996,
"learning_rate": 5e-06,
"loss": 0.6948,
"step": 330
},
{
"epoch": 1.2318840579710144,
"grad_norm": 0.669568534520701,
"learning_rate": 5e-06,
"loss": 0.6845,
"step": 340
},
{
"epoch": 1.2681159420289856,
"grad_norm": 0.7720612077716418,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 350
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.6321115893300935,
"learning_rate": 5e-06,
"loss": 0.6889,
"step": 360
},
{
"epoch": 1.3405797101449275,
"grad_norm": 0.6385246244453152,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 370
},
{
"epoch": 1.3768115942028984,
"grad_norm": 0.6322551870212395,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 380
},
{
"epoch": 1.4130434782608696,
"grad_norm": 0.682130131174512,
"learning_rate": 5e-06,
"loss": 0.6881,
"step": 390
},
{
"epoch": 1.4492753623188406,
"grad_norm": 0.6287023165159331,
"learning_rate": 5e-06,
"loss": 0.6874,
"step": 400
},
{
"epoch": 1.4855072463768115,
"grad_norm": 0.6249521988426261,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 410
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.6110574768057114,
"learning_rate": 5e-06,
"loss": 0.6875,
"step": 420
},
{
"epoch": 1.5579710144927537,
"grad_norm": 0.771458385172311,
"learning_rate": 5e-06,
"loss": 0.6905,
"step": 430
},
{
"epoch": 1.5942028985507246,
"grad_norm": 0.8776023781615215,
"learning_rate": 5e-06,
"loss": 0.6934,
"step": 440
},
{
"epoch": 1.6304347826086958,
"grad_norm": 0.7459525790966258,
"learning_rate": 5e-06,
"loss": 0.6927,
"step": 450
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.579140354002018,
"learning_rate": 5e-06,
"loss": 0.6857,
"step": 460
},
{
"epoch": 1.7028985507246377,
"grad_norm": 0.7077586289894291,
"learning_rate": 5e-06,
"loss": 0.6914,
"step": 470
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.5827410978723677,
"learning_rate": 5e-06,
"loss": 0.6861,
"step": 480
},
{
"epoch": 1.7753623188405796,
"grad_norm": 0.697371946305589,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 490
},
{
"epoch": 1.8115942028985508,
"grad_norm": 0.6086228815522553,
"learning_rate": 5e-06,
"loss": 0.6905,
"step": 500
},
{
"epoch": 1.8478260869565217,
"grad_norm": 0.5925797270404988,
"learning_rate": 5e-06,
"loss": 0.6875,
"step": 510
},
{
"epoch": 1.8840579710144927,
"grad_norm": 0.5774122986770631,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 520
},
{
"epoch": 1.9202898550724639,
"grad_norm": 0.7752937398691316,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 530
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.9039716311151348,
"learning_rate": 5e-06,
"loss": 0.6851,
"step": 540
},
{
"epoch": 1.9927536231884058,
"grad_norm": 0.685112724319237,
"learning_rate": 5e-06,
"loss": 0.685,
"step": 550
},
{
"epoch": 2.0,
"eval_loss": 0.7335925102233887,
"eval_runtime": 26.8139,
"eval_samples_per_second": 276.611,
"eval_steps_per_second": 1.082,
"step": 552
},
{
"epoch": 2.028985507246377,
"grad_norm": 0.7665524028493372,
"learning_rate": 5e-06,
"loss": 0.6385,
"step": 560
},
{
"epoch": 2.0652173913043477,
"grad_norm": 0.6043892824700962,
"learning_rate": 5e-06,
"loss": 0.6304,
"step": 570
},
{
"epoch": 2.101449275362319,
"grad_norm": 0.7259867108096663,
"learning_rate": 5e-06,
"loss": 0.6298,
"step": 580
},
{
"epoch": 2.13768115942029,
"grad_norm": 0.7510541142479693,
"learning_rate": 5e-06,
"loss": 0.635,
"step": 590
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.7187894788401495,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 600
},
{
"epoch": 2.210144927536232,
"grad_norm": 0.6750049579183571,
"learning_rate": 5e-06,
"loss": 0.6381,
"step": 610
},
{
"epoch": 2.246376811594203,
"grad_norm": 0.8490350516471592,
"learning_rate": 5e-06,
"loss": 0.6324,
"step": 620
},
{
"epoch": 2.282608695652174,
"grad_norm": 0.7651747850436946,
"learning_rate": 5e-06,
"loss": 0.6359,
"step": 630
},
{
"epoch": 2.318840579710145,
"grad_norm": 0.8525868267751531,
"learning_rate": 5e-06,
"loss": 0.6327,
"step": 640
},
{
"epoch": 2.355072463768116,
"grad_norm": 0.9041979799840293,
"learning_rate": 5e-06,
"loss": 0.6335,
"step": 650
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.6631449903138956,
"learning_rate": 5e-06,
"loss": 0.6333,
"step": 660
},
{
"epoch": 2.427536231884058,
"grad_norm": 0.6077513953951755,
"learning_rate": 5e-06,
"loss": 0.6354,
"step": 670
},
{
"epoch": 2.463768115942029,
"grad_norm": 0.7123161501999767,
"learning_rate": 5e-06,
"loss": 0.635,
"step": 680
},
{
"epoch": 2.5,
"grad_norm": 0.7517439764623557,
"learning_rate": 5e-06,
"loss": 0.6339,
"step": 690
},
{
"epoch": 2.536231884057971,
"grad_norm": 0.782865335447992,
"learning_rate": 5e-06,
"loss": 0.6409,
"step": 700
},
{
"epoch": 2.572463768115942,
"grad_norm": 0.8659272969336265,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 710
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.6169065680456968,
"learning_rate": 5e-06,
"loss": 0.6322,
"step": 720
},
{
"epoch": 2.644927536231884,
"grad_norm": 0.637344472194744,
"learning_rate": 5e-06,
"loss": 0.6394,
"step": 730
},
{
"epoch": 2.681159420289855,
"grad_norm": 0.723181305159317,
"learning_rate": 5e-06,
"loss": 0.6423,
"step": 740
},
{
"epoch": 2.717391304347826,
"grad_norm": 0.6636365428175426,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 750
},
{
"epoch": 2.753623188405797,
"grad_norm": 0.6833062497178735,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 760
},
{
"epoch": 2.789855072463768,
"grad_norm": 0.6330454188928252,
"learning_rate": 5e-06,
"loss": 0.6373,
"step": 770
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.6790151240239423,
"learning_rate": 5e-06,
"loss": 0.6375,
"step": 780
},
{
"epoch": 2.86231884057971,
"grad_norm": 0.6977511814313268,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 790
},
{
"epoch": 2.898550724637681,
"grad_norm": 0.5840071260671889,
"learning_rate": 5e-06,
"loss": 0.6407,
"step": 800
},
{
"epoch": 2.9347826086956523,
"grad_norm": 0.6248935845380484,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 810
},
{
"epoch": 2.971014492753623,
"grad_norm": 0.6181053130332258,
"learning_rate": 5e-06,
"loss": 0.6412,
"step": 820
},
{
"epoch": 3.0,
"eval_loss": 0.7416301369667053,
"eval_runtime": 26.232,
"eval_samples_per_second": 282.746,
"eval_steps_per_second": 1.106,
"step": 828
},
{
"epoch": 3.0,
"step": 828,
"total_flos": 1386930839224320.0,
"train_loss": 0.7053444857758601,
"train_runtime": 5350.6579,
"train_samples_per_second": 79.003,
"train_steps_per_second": 0.155
}
],
"logging_steps": 10,
"max_steps": 828,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1386930839224320.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}