{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9945750452079567,
"eval_steps": 500,
"global_step": 1242,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024110910186859555,
"grad_norm": 6.533456606849755,
"learning_rate": 5e-06,
"loss": 0.8842,
"step": 10
},
{
"epoch": 0.04822182037371911,
"grad_norm": 1.7431134609583179,
"learning_rate": 5e-06,
"loss": 0.7716,
"step": 20
},
{
"epoch": 0.07233273056057866,
"grad_norm": 1.3259257434449123,
"learning_rate": 5e-06,
"loss": 0.7528,
"step": 30
},
{
"epoch": 0.09644364074743822,
"grad_norm": 0.7933729448787181,
"learning_rate": 5e-06,
"loss": 0.7272,
"step": 40
},
{
"epoch": 0.12055455093429777,
"grad_norm": 0.9489596846030721,
"learning_rate": 5e-06,
"loss": 0.7249,
"step": 50
},
{
"epoch": 0.14466546112115733,
"grad_norm": 0.9021467855462361,
"learning_rate": 5e-06,
"loss": 0.7048,
"step": 60
},
{
"epoch": 0.16877637130801687,
"grad_norm": 0.9161730770097936,
"learning_rate": 5e-06,
"loss": 0.7005,
"step": 70
},
{
"epoch": 0.19288728149487644,
"grad_norm": 0.7938637800361754,
"learning_rate": 5e-06,
"loss": 0.6932,
"step": 80
},
{
"epoch": 0.21699819168173598,
"grad_norm": 0.5753057373815283,
"learning_rate": 5e-06,
"loss": 0.6974,
"step": 90
},
{
"epoch": 0.24110910186859555,
"grad_norm": 0.9667031835337068,
"learning_rate": 5e-06,
"loss": 0.6757,
"step": 100
},
{
"epoch": 0.2652200120554551,
"grad_norm": 0.5028322572120897,
"learning_rate": 5e-06,
"loss": 0.6834,
"step": 110
},
{
"epoch": 0.28933092224231466,
"grad_norm": 0.6797693061745307,
"learning_rate": 5e-06,
"loss": 0.674,
"step": 120
},
{
"epoch": 0.3134418324291742,
"grad_norm": 1.1680360190006298,
"learning_rate": 5e-06,
"loss": 0.6722,
"step": 130
},
{
"epoch": 0.33755274261603374,
"grad_norm": 0.544648561957048,
"learning_rate": 5e-06,
"loss": 0.677,
"step": 140
},
{
"epoch": 0.3616636528028933,
"grad_norm": 0.7257586557706087,
"learning_rate": 5e-06,
"loss": 0.6747,
"step": 150
},
{
"epoch": 0.3857745629897529,
"grad_norm": 0.4119617826643094,
"learning_rate": 5e-06,
"loss": 0.6693,
"step": 160
},
{
"epoch": 0.4098854731766124,
"grad_norm": 0.5605900141967505,
"learning_rate": 5e-06,
"loss": 0.6688,
"step": 170
},
{
"epoch": 0.43399638336347196,
"grad_norm": 0.6385688604944322,
"learning_rate": 5e-06,
"loss": 0.6643,
"step": 180
},
{
"epoch": 0.45810729355033153,
"grad_norm": 1.0125290764918353,
"learning_rate": 5e-06,
"loss": 0.6641,
"step": 190
},
{
"epoch": 0.4822182037371911,
"grad_norm": 0.6186627046406172,
"learning_rate": 5e-06,
"loss": 0.658,
"step": 200
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.8414739576765752,
"learning_rate": 5e-06,
"loss": 0.6671,
"step": 210
},
{
"epoch": 0.5304400241109102,
"grad_norm": 0.6025383396507406,
"learning_rate": 5e-06,
"loss": 0.6651,
"step": 220
},
{
"epoch": 0.5545509342977697,
"grad_norm": 0.5410823595468066,
"learning_rate": 5e-06,
"loss": 0.6554,
"step": 230
},
{
"epoch": 0.5786618444846293,
"grad_norm": 0.6181513401688427,
"learning_rate": 5e-06,
"loss": 0.6641,
"step": 240
},
{
"epoch": 0.6027727546714888,
"grad_norm": 0.4896208841371711,
"learning_rate": 5e-06,
"loss": 0.657,
"step": 250
},
{
"epoch": 0.6268836648583485,
"grad_norm": 0.544546111477725,
"learning_rate": 5e-06,
"loss": 0.6638,
"step": 260
},
{
"epoch": 0.650994575045208,
"grad_norm": 0.5356265326461168,
"learning_rate": 5e-06,
"loss": 0.6602,
"step": 270
},
{
"epoch": 0.6751054852320675,
"grad_norm": 0.5475932069244179,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 280
},
{
"epoch": 0.6992163954189271,
"grad_norm": 0.8345098978281534,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 290
},
{
"epoch": 0.7233273056057866,
"grad_norm": 0.8160477568039888,
"learning_rate": 5e-06,
"loss": 0.6604,
"step": 300
},
{
"epoch": 0.7474382157926461,
"grad_norm": 0.5563594159462366,
"learning_rate": 5e-06,
"loss": 0.6581,
"step": 310
},
{
"epoch": 0.7715491259795058,
"grad_norm": 0.6104670026137493,
"learning_rate": 5e-06,
"loss": 0.6543,
"step": 320
},
{
"epoch": 0.7956600361663653,
"grad_norm": 0.5818143425968119,
"learning_rate": 5e-06,
"loss": 0.6508,
"step": 330
},
{
"epoch": 0.8197709463532248,
"grad_norm": 0.39317653113678785,
"learning_rate": 5e-06,
"loss": 0.6459,
"step": 340
},
{
"epoch": 0.8438818565400844,
"grad_norm": 0.4869964807571895,
"learning_rate": 5e-06,
"loss": 0.6476,
"step": 350
},
{
"epoch": 0.8679927667269439,
"grad_norm": 0.9839633535279524,
"learning_rate": 5e-06,
"loss": 0.6517,
"step": 360
},
{
"epoch": 0.8921036769138035,
"grad_norm": 0.4947132075136725,
"learning_rate": 5e-06,
"loss": 0.6554,
"step": 370
},
{
"epoch": 0.9162145871006631,
"grad_norm": 0.42196728270115014,
"learning_rate": 5e-06,
"loss": 0.647,
"step": 380
},
{
"epoch": 0.9403254972875226,
"grad_norm": 0.7036293961206416,
"learning_rate": 5e-06,
"loss": 0.6437,
"step": 390
},
{
"epoch": 0.9644364074743822,
"grad_norm": 0.4303638291795801,
"learning_rate": 5e-06,
"loss": 0.647,
"step": 400
},
{
"epoch": 0.9885473176612417,
"grad_norm": 0.4993150805880552,
"learning_rate": 5e-06,
"loss": 0.6408,
"step": 410
},
{
"epoch": 0.9981916817359855,
"eval_loss": 0.6481794714927673,
"eval_runtime": 221.8793,
"eval_samples_per_second": 50.37,
"eval_steps_per_second": 0.397,
"step": 414
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.5861902646980864,
"learning_rate": 5e-06,
"loss": 0.6219,
"step": 420
},
{
"epoch": 1.0367691380349608,
"grad_norm": 0.4608104690581376,
"learning_rate": 5e-06,
"loss": 0.6109,
"step": 430
},
{
"epoch": 1.0608800482218204,
"grad_norm": 0.7019806195266277,
"learning_rate": 5e-06,
"loss": 0.5999,
"step": 440
},
{
"epoch": 1.0849909584086799,
"grad_norm": 0.4666118598227287,
"learning_rate": 5e-06,
"loss": 0.6071,
"step": 450
},
{
"epoch": 1.1091018685955394,
"grad_norm": 0.49273088471001014,
"learning_rate": 5e-06,
"loss": 0.6079,
"step": 460
},
{
"epoch": 1.1332127787823991,
"grad_norm": 0.5608412041594104,
"learning_rate": 5e-06,
"loss": 0.6093,
"step": 470
},
{
"epoch": 1.1573236889692586,
"grad_norm": 0.5133766270512516,
"learning_rate": 5e-06,
"loss": 0.6023,
"step": 480
},
{
"epoch": 1.1814345991561181,
"grad_norm": 0.4639503656965253,
"learning_rate": 5e-06,
"loss": 0.6067,
"step": 490
},
{
"epoch": 1.2055455093429777,
"grad_norm": 0.4941484591532595,
"learning_rate": 5e-06,
"loss": 0.6034,
"step": 500
},
{
"epoch": 1.2296564195298372,
"grad_norm": 0.532046568060987,
"learning_rate": 5e-06,
"loss": 0.6021,
"step": 510
},
{
"epoch": 1.253767329716697,
"grad_norm": 0.6313451414543506,
"learning_rate": 5e-06,
"loss": 0.6072,
"step": 520
},
{
"epoch": 1.2778782399035564,
"grad_norm": 0.48840150221258893,
"learning_rate": 5e-06,
"loss": 0.6008,
"step": 530
},
{
"epoch": 1.301989150090416,
"grad_norm": 0.4346073819877919,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 540
},
{
"epoch": 1.3261000602772754,
"grad_norm": 0.5696969325867375,
"learning_rate": 5e-06,
"loss": 0.6073,
"step": 550
},
{
"epoch": 1.350210970464135,
"grad_norm": 0.6029521082479712,
"learning_rate": 5e-06,
"loss": 0.6045,
"step": 560
},
{
"epoch": 1.3743218806509945,
"grad_norm": 0.5359000000861764,
"learning_rate": 5e-06,
"loss": 0.601,
"step": 570
},
{
"epoch": 1.3984327908378542,
"grad_norm": 0.4280776424781654,
"learning_rate": 5e-06,
"loss": 0.6068,
"step": 580
},
{
"epoch": 1.4225437010247137,
"grad_norm": 0.42975173635641,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 590
},
{
"epoch": 1.4466546112115732,
"grad_norm": 0.4148935722421534,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 600
},
{
"epoch": 1.4707655213984328,
"grad_norm": 0.4346895040838288,
"learning_rate": 5e-06,
"loss": 0.6083,
"step": 610
},
{
"epoch": 1.4948764315852923,
"grad_norm": 0.456872099031643,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 620
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.518636393965265,
"learning_rate": 5e-06,
"loss": 0.6107,
"step": 630
},
{
"epoch": 1.5430982519590115,
"grad_norm": 0.4976317739138397,
"learning_rate": 5e-06,
"loss": 0.5995,
"step": 640
},
{
"epoch": 1.567209162145871,
"grad_norm": 0.5121056663367101,
"learning_rate": 5e-06,
"loss": 0.6147,
"step": 650
},
{
"epoch": 1.5913200723327305,
"grad_norm": 0.49181051844188867,
"learning_rate": 5e-06,
"loss": 0.6046,
"step": 660
},
{
"epoch": 1.61543098251959,
"grad_norm": 0.4913489094366748,
"learning_rate": 5e-06,
"loss": 0.605,
"step": 670
},
{
"epoch": 1.6395418927064496,
"grad_norm": 0.4360413141924259,
"learning_rate": 5e-06,
"loss": 0.603,
"step": 680
},
{
"epoch": 1.663652802893309,
"grad_norm": 0.5553873036504335,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 690
},
{
"epoch": 1.6877637130801688,
"grad_norm": 0.439159626571011,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 700
},
{
"epoch": 1.7118746232670283,
"grad_norm": 0.5009323338564864,
"learning_rate": 5e-06,
"loss": 0.6046,
"step": 710
},
{
"epoch": 1.7359855334538878,
"grad_norm": 0.49820787215486934,
"learning_rate": 5e-06,
"loss": 0.6057,
"step": 720
},
{
"epoch": 1.7600964436407476,
"grad_norm": 0.553637472752945,
"learning_rate": 5e-06,
"loss": 0.6066,
"step": 730
},
{
"epoch": 1.784207353827607,
"grad_norm": 0.44541140483577896,
"learning_rate": 5e-06,
"loss": 0.6,
"step": 740
},
{
"epoch": 1.8083182640144666,
"grad_norm": 0.5310706794248644,
"learning_rate": 5e-06,
"loss": 0.6098,
"step": 750
},
{
"epoch": 1.8324291742013261,
"grad_norm": 0.6630764624549126,
"learning_rate": 5e-06,
"loss": 0.6054,
"step": 760
},
{
"epoch": 1.8565400843881856,
"grad_norm": 0.553711920694149,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 770
},
{
"epoch": 1.8806509945750451,
"grad_norm": 0.566305473833487,
"learning_rate": 5e-06,
"loss": 0.6019,
"step": 780
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.582333160680419,
"learning_rate": 5e-06,
"loss": 0.6051,
"step": 790
},
{
"epoch": 1.9288728149487642,
"grad_norm": 0.509141986707748,
"learning_rate": 5e-06,
"loss": 0.6052,
"step": 800
},
{
"epoch": 1.952983725135624,
"grad_norm": 0.4543923308424651,
"learning_rate": 5e-06,
"loss": 0.598,
"step": 810
},
{
"epoch": 1.9770946353224834,
"grad_norm": 0.45958164108182104,
"learning_rate": 5e-06,
"loss": 0.6026,
"step": 820
},
{
"epoch": 1.998794454490657,
"eval_loss": 0.6378007531166077,
"eval_runtime": 222.8588,
"eval_samples_per_second": 50.148,
"eval_steps_per_second": 0.395,
"step": 829
},
{
"epoch": 2.001205545509343,
"grad_norm": 0.7867156441363433,
"learning_rate": 5e-06,
"loss": 0.6043,
"step": 830
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.5284619907031279,
"learning_rate": 5e-06,
"loss": 0.5707,
"step": 840
},
{
"epoch": 2.049427365883062,
"grad_norm": 0.5795012320295118,
"learning_rate": 5e-06,
"loss": 0.5606,
"step": 850
},
{
"epoch": 2.0735382760699217,
"grad_norm": 0.5627294692682645,
"learning_rate": 5e-06,
"loss": 0.563,
"step": 860
},
{
"epoch": 2.097649186256781,
"grad_norm": 0.5052179539566712,
"learning_rate": 5e-06,
"loss": 0.5631,
"step": 870
},
{
"epoch": 2.1217600964436407,
"grad_norm": 0.4428407773542258,
"learning_rate": 5e-06,
"loss": 0.5595,
"step": 880
},
{
"epoch": 2.1458710066305002,
"grad_norm": 0.5267499633401915,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 890
},
{
"epoch": 2.1699819168173597,
"grad_norm": 0.4655374512529405,
"learning_rate": 5e-06,
"loss": 0.5607,
"step": 900
},
{
"epoch": 2.1940928270042193,
"grad_norm": 0.48398838396056276,
"learning_rate": 5e-06,
"loss": 0.5595,
"step": 910
},
{
"epoch": 2.2182037371910788,
"grad_norm": 0.48096941817619093,
"learning_rate": 5e-06,
"loss": 0.5677,
"step": 920
},
{
"epoch": 2.2423146473779383,
"grad_norm": 0.5154141010470734,
"learning_rate": 5e-06,
"loss": 0.5594,
"step": 930
},
{
"epoch": 2.2664255575647982,
"grad_norm": 0.4799488446079912,
"learning_rate": 5e-06,
"loss": 0.5621,
"step": 940
},
{
"epoch": 2.2905364677516578,
"grad_norm": 0.5540016498502853,
"learning_rate": 5e-06,
"loss": 0.5638,
"step": 950
},
{
"epoch": 2.3146473779385173,
"grad_norm": 0.6082357481189948,
"learning_rate": 5e-06,
"loss": 0.564,
"step": 960
},
{
"epoch": 2.338758288125377,
"grad_norm": 0.5420853183530063,
"learning_rate": 5e-06,
"loss": 0.5643,
"step": 970
},
{
"epoch": 2.3628691983122363,
"grad_norm": 0.42570128293415416,
"learning_rate": 5e-06,
"loss": 0.5642,
"step": 980
},
{
"epoch": 2.386980108499096,
"grad_norm": 0.5255517048498499,
"learning_rate": 5e-06,
"loss": 0.5672,
"step": 990
},
{
"epoch": 2.4110910186859553,
"grad_norm": 0.5353694927594205,
"learning_rate": 5e-06,
"loss": 0.5565,
"step": 1000
},
{
"epoch": 2.435201928872815,
"grad_norm": 0.4617633168683323,
"learning_rate": 5e-06,
"loss": 0.5657,
"step": 1010
},
{
"epoch": 2.4593128390596743,
"grad_norm": 0.449869806649973,
"learning_rate": 5e-06,
"loss": 0.5586,
"step": 1020
},
{
"epoch": 2.483423749246534,
"grad_norm": 0.5115337318725849,
"learning_rate": 5e-06,
"loss": 0.5576,
"step": 1030
},
{
"epoch": 2.507534659433394,
"grad_norm": 0.6907411145245406,
"learning_rate": 5e-06,
"loss": 0.5681,
"step": 1040
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.5238948140915647,
"learning_rate": 5e-06,
"loss": 0.5659,
"step": 1050
},
{
"epoch": 2.555756479807113,
"grad_norm": 0.6589003211840228,
"learning_rate": 5e-06,
"loss": 0.5667,
"step": 1060
},
{
"epoch": 2.5798673899939724,
"grad_norm": 0.4764556136945032,
"learning_rate": 5e-06,
"loss": 0.5629,
"step": 1070
},
{
"epoch": 2.603978300180832,
"grad_norm": 0.44468254080490577,
"learning_rate": 5e-06,
"loss": 0.5687,
"step": 1080
},
{
"epoch": 2.6280892103676914,
"grad_norm": 0.5124860341949249,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 1090
},
{
"epoch": 2.652200120554551,
"grad_norm": 0.5228826110878407,
"learning_rate": 5e-06,
"loss": 0.5667,
"step": 1100
},
{
"epoch": 2.6763110307414104,
"grad_norm": 0.5458373344595544,
"learning_rate": 5e-06,
"loss": 0.5617,
"step": 1110
},
{
"epoch": 2.70042194092827,
"grad_norm": 0.43248189186264496,
"learning_rate": 5e-06,
"loss": 0.5589,
"step": 1120
},
{
"epoch": 2.7245328511151294,
"grad_norm": 0.44951413853647815,
"learning_rate": 5e-06,
"loss": 0.5696,
"step": 1130
},
{
"epoch": 2.748643761301989,
"grad_norm": 0.5059427152996532,
"learning_rate": 5e-06,
"loss": 0.5645,
"step": 1140
},
{
"epoch": 2.7727546714888485,
"grad_norm": 0.4713166756254001,
"learning_rate": 5e-06,
"loss": 0.5659,
"step": 1150
},
{
"epoch": 2.7968655816757084,
"grad_norm": 0.4662277376061737,
"learning_rate": 5e-06,
"loss": 0.5638,
"step": 1160
},
{
"epoch": 2.820976491862568,
"grad_norm": 0.5055943494520574,
"learning_rate": 5e-06,
"loss": 0.5651,
"step": 1170
},
{
"epoch": 2.8450874020494274,
"grad_norm": 0.49826856850045714,
"learning_rate": 5e-06,
"loss": 0.5664,
"step": 1180
},
{
"epoch": 2.869198312236287,
"grad_norm": 0.46906591997365343,
"learning_rate": 5e-06,
"loss": 0.5708,
"step": 1190
},
{
"epoch": 2.8933092224231465,
"grad_norm": 0.5743140790459712,
"learning_rate": 5e-06,
"loss": 0.5713,
"step": 1200
},
{
"epoch": 2.917420132610006,
"grad_norm": 0.5413293244789124,
"learning_rate": 5e-06,
"loss": 0.566,
"step": 1210
},
{
"epoch": 2.9415310427968655,
"grad_norm": 0.4769984493754597,
"learning_rate": 5e-06,
"loss": 0.5653,
"step": 1220
},
{
"epoch": 2.965641952983725,
"grad_norm": 0.4784113431133355,
"learning_rate": 5e-06,
"loss": 0.5663,
"step": 1230
},
{
"epoch": 2.9897528631705845,
"grad_norm": 0.46857130335535624,
"learning_rate": 5e-06,
"loss": 0.569,
"step": 1240
},
{
"epoch": 2.9945750452079567,
"eval_loss": 0.6389562487602234,
"eval_runtime": 223.5725,
"eval_samples_per_second": 49.988,
"eval_steps_per_second": 0.394,
"step": 1242
},
{
"epoch": 2.9945750452079567,
"step": 1242,
"total_flos": 2079977499525120.0,
"train_loss": 0.615725677754376,
"train_runtime": 37262.9121,
"train_samples_per_second": 17.094,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 1242,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2079977499525120.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}