Llama3-Energy / trainer_state.json
Zihao-Li's picture
First commit
e32f7f9
raw
history blame
28.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 1000,
"global_step": 1590,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018867924528301886,
"grad_norm": 2.9446594971470272,
"learning_rate": 6.289308176100629e-06,
"loss": 2.3241,
"step": 10
},
{
"epoch": 0.03773584905660377,
"grad_norm": 3.2052443029455557,
"learning_rate": 1.2578616352201259e-05,
"loss": 2.2496,
"step": 20
},
{
"epoch": 0.05660377358490566,
"grad_norm": 2.593311686421377,
"learning_rate": 1.8867924528301888e-05,
"loss": 2.2836,
"step": 30
},
{
"epoch": 0.07547169811320754,
"grad_norm": 2.8033026461409043,
"learning_rate": 2.5157232704402517e-05,
"loss": 2.2567,
"step": 40
},
{
"epoch": 0.09433962264150944,
"grad_norm": 3.0647958985618127,
"learning_rate": 3.144654088050314e-05,
"loss": 2.2749,
"step": 50
},
{
"epoch": 0.11320754716981132,
"grad_norm": 2.7224942436893023,
"learning_rate": 3.7735849056603776e-05,
"loss": 2.3334,
"step": 60
},
{
"epoch": 0.1320754716981132,
"grad_norm": 2.507797193629058,
"learning_rate": 4.402515723270441e-05,
"loss": 2.3378,
"step": 70
},
{
"epoch": 0.1509433962264151,
"grad_norm": 2.791876220429587,
"learning_rate": 5.0314465408805034e-05,
"loss": 2.3462,
"step": 80
},
{
"epoch": 0.16981132075471697,
"grad_norm": 2.8934882752144877,
"learning_rate": 5.660377358490566e-05,
"loss": 2.3832,
"step": 90
},
{
"epoch": 0.18867924528301888,
"grad_norm": 2.74130339104755,
"learning_rate": 6.289308176100629e-05,
"loss": 2.3658,
"step": 100
},
{
"epoch": 0.20754716981132076,
"grad_norm": 2.464301918797891,
"learning_rate": 6.918238993710691e-05,
"loss": 2.3894,
"step": 110
},
{
"epoch": 0.22641509433962265,
"grad_norm": 3.5787748721128176,
"learning_rate": 7.547169811320755e-05,
"loss": 2.4635,
"step": 120
},
{
"epoch": 0.24528301886792453,
"grad_norm": 3.8614586522023586,
"learning_rate": 8.176100628930818e-05,
"loss": 2.4099,
"step": 130
},
{
"epoch": 0.2641509433962264,
"grad_norm": 2.624798812422503,
"learning_rate": 8.805031446540882e-05,
"loss": 2.4141,
"step": 140
},
{
"epoch": 0.2830188679245283,
"grad_norm": 3.4083833226002174,
"learning_rate": 9.433962264150944e-05,
"loss": 2.4505,
"step": 150
},
{
"epoch": 0.3018867924528302,
"grad_norm": 2.4164498878680254,
"learning_rate": 9.999987950741765e-05,
"loss": 2.4853,
"step": 160
},
{
"epoch": 0.32075471698113206,
"grad_norm": 4.2037868049637,
"learning_rate": 9.9985421100216e-05,
"loss": 2.529,
"step": 170
},
{
"epoch": 0.33962264150943394,
"grad_norm": 10.425711730519438,
"learning_rate": 9.99468721610658e-05,
"loss": 2.5123,
"step": 180
},
{
"epoch": 0.3584905660377358,
"grad_norm": 6.075608387062913,
"learning_rate": 9.988425126867315e-05,
"loss": 2.5137,
"step": 190
},
{
"epoch": 0.37735849056603776,
"grad_norm": 5.979582059920921,
"learning_rate": 9.979758860325019e-05,
"loss": 2.4818,
"step": 200
},
{
"epoch": 0.39622641509433965,
"grad_norm": 300.5526680134449,
"learning_rate": 9.968692593196944e-05,
"loss": 2.5084,
"step": 210
},
{
"epoch": 0.41509433962264153,
"grad_norm": 2.3641585810185437,
"learning_rate": 9.955231658883432e-05,
"loss": 2.4667,
"step": 220
},
{
"epoch": 0.4339622641509434,
"grad_norm": 2.399558237267707,
"learning_rate": 9.93938254489746e-05,
"loss": 2.4815,
"step": 230
},
{
"epoch": 0.4528301886792453,
"grad_norm": 2.291187744959764,
"learning_rate": 9.921152889737984e-05,
"loss": 2.465,
"step": 240
},
{
"epoch": 0.4716981132075472,
"grad_norm": 2.2425372020480685,
"learning_rate": 9.900551479208552e-05,
"loss": 2.4827,
"step": 250
},
{
"epoch": 0.49056603773584906,
"grad_norm": 2.106996905280666,
"learning_rate": 9.877588242182975e-05,
"loss": 2.5077,
"step": 260
},
{
"epoch": 0.5094339622641509,
"grad_norm": 2.56597906125238,
"learning_rate": 9.852274245820096e-05,
"loss": 2.5812,
"step": 270
},
{
"epoch": 0.5283018867924528,
"grad_norm": 2.1161401839810323,
"learning_rate": 9.824621690229965e-05,
"loss": 2.5047,
"step": 280
},
{
"epoch": 0.5471698113207547,
"grad_norm": 2.9746454428316467,
"learning_rate": 9.79464390259397e-05,
"loss": 2.4985,
"step": 290
},
{
"epoch": 0.5660377358490566,
"grad_norm": 2.1237673830934156,
"learning_rate": 9.762355330741796e-05,
"loss": 2.4943,
"step": 300
},
{
"epoch": 0.5849056603773585,
"grad_norm": 1.8440846284987655,
"learning_rate": 9.727771536188275e-05,
"loss": 2.4536,
"step": 310
},
{
"epoch": 0.6037735849056604,
"grad_norm": 3.3815527986620526,
"learning_rate": 9.690909186633492e-05,
"loss": 2.4837,
"step": 320
},
{
"epoch": 0.6226415094339622,
"grad_norm": 2.7797010587604953,
"learning_rate": 9.651786047929773e-05,
"loss": 2.5074,
"step": 330
},
{
"epoch": 0.6415094339622641,
"grad_norm": 2.0947283835947794,
"learning_rate": 9.610420975519408e-05,
"loss": 2.441,
"step": 340
},
{
"epoch": 0.660377358490566,
"grad_norm": 1.9288902952601223,
"learning_rate": 9.566833905347245e-05,
"loss": 2.4885,
"step": 350
},
{
"epoch": 0.6792452830188679,
"grad_norm": 2.004635564736395,
"learning_rate": 9.521045844252552e-05,
"loss": 2.4342,
"step": 360
},
{
"epoch": 0.6981132075471698,
"grad_norm": 1.6511867070394874,
"learning_rate": 9.473078859844728e-05,
"loss": 2.4425,
"step": 370
},
{
"epoch": 0.7169811320754716,
"grad_norm": 1.4598720970043289,
"learning_rate": 9.422956069867807e-05,
"loss": 2.4567,
"step": 380
},
{
"epoch": 0.7358490566037735,
"grad_norm": 1.5295808219144331,
"learning_rate": 9.370701631058829e-05,
"loss": 2.4636,
"step": 390
},
{
"epoch": 0.7547169811320755,
"grad_norm": 1.606602994374719,
"learning_rate": 9.316340727505468e-05,
"loss": 2.4707,
"step": 400
},
{
"epoch": 0.7735849056603774,
"grad_norm": 1.5773231811089237,
"learning_rate": 9.259899558508543e-05,
"loss": 2.4242,
"step": 410
},
{
"epoch": 0.7924528301886793,
"grad_norm": 1.5694593702673683,
"learning_rate": 9.201405325955221e-05,
"loss": 2.4754,
"step": 420
},
{
"epoch": 0.8113207547169812,
"grad_norm": 1.4513304920200845,
"learning_rate": 9.14088622120905e-05,
"loss": 2.4735,
"step": 430
},
{
"epoch": 0.8301886792452831,
"grad_norm": 2.2119679560211436,
"learning_rate": 9.078371411523084e-05,
"loss": 2.4511,
"step": 440
},
{
"epoch": 0.8490566037735849,
"grad_norm": 1.4837853314532448,
"learning_rate": 9.013891025982704e-05,
"loss": 2.4627,
"step": 450
},
{
"epoch": 0.8679245283018868,
"grad_norm": 1.548323059472257,
"learning_rate": 8.947476140984856e-05,
"loss": 2.4804,
"step": 460
},
{
"epoch": 0.8867924528301887,
"grad_norm": 1.7369189464037587,
"learning_rate": 8.879158765260767e-05,
"loss": 2.4872,
"step": 470
},
{
"epoch": 0.9056603773584906,
"grad_norm": 1.4222000085980089,
"learning_rate": 8.808971824449275e-05,
"loss": 2.4847,
"step": 480
},
{
"epoch": 0.9245283018867925,
"grad_norm": 1.39169720237414,
"learning_rate": 8.736949145228295e-05,
"loss": 2.4873,
"step": 490
},
{
"epoch": 0.9433962264150944,
"grad_norm": 1.5495461414725966,
"learning_rate": 8.66312543901201e-05,
"loss": 2.4738,
"step": 500
},
{
"epoch": 0.9622641509433962,
"grad_norm": 1.5689856394055257,
"learning_rate": 8.587536285221656e-05,
"loss": 2.4211,
"step": 510
},
{
"epoch": 0.9811320754716981,
"grad_norm": 1.559462761559426,
"learning_rate": 8.510218114137992e-05,
"loss": 2.4183,
"step": 520
},
{
"epoch": 1.0,
"grad_norm": 1.38445361325361,
"learning_rate": 8.43120818934367e-05,
"loss": 2.459,
"step": 530
},
{
"epoch": 1.0188679245283019,
"grad_norm": 1.8042327175721304,
"learning_rate": 8.350544589764016e-05,
"loss": 1.8838,
"step": 540
},
{
"epoch": 1.0377358490566038,
"grad_norm": 1.8176496290402602,
"learning_rate": 8.268266191314848e-05,
"loss": 1.8624,
"step": 550
},
{
"epoch": 1.0566037735849056,
"grad_norm": 1.8868344352432986,
"learning_rate": 8.184412648166183e-05,
"loss": 1.8182,
"step": 560
},
{
"epoch": 1.0754716981132075,
"grad_norm": 1.7299260995769612,
"learning_rate": 8.099024373630854e-05,
"loss": 1.8391,
"step": 570
},
{
"epoch": 1.0943396226415094,
"grad_norm": 1.9113984544679725,
"learning_rate": 8.01214252068728e-05,
"loss": 1.8545,
"step": 580
},
{
"epoch": 1.1132075471698113,
"grad_norm": 1.794174287705714,
"learning_rate": 7.923808962145734e-05,
"loss": 1.8367,
"step": 590
},
{
"epoch": 1.1320754716981132,
"grad_norm": 1.5751797225379325,
"learning_rate": 7.83406627046769e-05,
"loss": 1.8149,
"step": 600
},
{
"epoch": 1.150943396226415,
"grad_norm": 1.9105350922209694,
"learning_rate": 7.742957697247984e-05,
"loss": 1.8061,
"step": 610
},
{
"epoch": 1.169811320754717,
"grad_norm": 1.7630498555967447,
"learning_rate": 7.650527152369647e-05,
"loss": 1.8411,
"step": 620
},
{
"epoch": 1.1886792452830188,
"grad_norm": 1.5261816105997068,
"learning_rate": 7.556819182841497e-05,
"loss": 1.8264,
"step": 630
},
{
"epoch": 1.2075471698113207,
"grad_norm": 1.9369411893196908,
"learning_rate": 7.461878951328653e-05,
"loss": 1.8954,
"step": 640
},
{
"epoch": 1.2264150943396226,
"grad_norm": 1.7688000917923798,
"learning_rate": 7.365752214386321e-05,
"loss": 1.8346,
"step": 650
},
{
"epoch": 1.2452830188679245,
"grad_norm": 1.6569058541238642,
"learning_rate": 7.268485300407393e-05,
"loss": 1.8805,
"step": 660
},
{
"epoch": 1.2641509433962264,
"grad_norm": 1.6708545601020437,
"learning_rate": 7.17012508729441e-05,
"loss": 1.7728,
"step": 670
},
{
"epoch": 1.2830188679245282,
"grad_norm": 1.652310201967167,
"learning_rate": 7.070718979866702e-05,
"loss": 1.8718,
"step": 680
},
{
"epoch": 1.3018867924528301,
"grad_norm": 1.9899020380799617,
"learning_rate": 6.970314887013584e-05,
"loss": 1.8535,
"step": 690
},
{
"epoch": 1.320754716981132,
"grad_norm": 1.643783798160392,
"learning_rate": 6.868961198604611e-05,
"loss": 1.8344,
"step": 700
},
{
"epoch": 1.3396226415094339,
"grad_norm": 1.8435538882684133,
"learning_rate": 6.766706762168022e-05,
"loss": 1.8759,
"step": 710
},
{
"epoch": 1.3584905660377358,
"grad_norm": 1.6989197917459231,
"learning_rate": 6.663600859348616e-05,
"loss": 1.7973,
"step": 720
},
{
"epoch": 1.3773584905660377,
"grad_norm": 1.6640164364452317,
"learning_rate": 6.55969318215641e-05,
"loss": 1.8101,
"step": 730
},
{
"epoch": 1.3962264150943398,
"grad_norm": 1.663705205393152,
"learning_rate": 6.455033809017512e-05,
"loss": 1.8574,
"step": 740
},
{
"epoch": 1.4150943396226414,
"grad_norm": 1.524574911562225,
"learning_rate": 6.34967318063877e-05,
"loss": 1.8194,
"step": 750
},
{
"epoch": 1.4339622641509435,
"grad_norm": 1.638744038935454,
"learning_rate": 6.24366207569781e-05,
"loss": 1.8557,
"step": 760
},
{
"epoch": 1.4528301886792452,
"grad_norm": 1.5905792259719815,
"learning_rate": 6.137051586370194e-05,
"loss": 1.8403,
"step": 770
},
{
"epoch": 1.4716981132075473,
"grad_norm": 1.4115389229640394,
"learning_rate": 6.029893093705492e-05,
"loss": 1.86,
"step": 780
},
{
"epoch": 1.490566037735849,
"grad_norm": 1.5664716217022607,
"learning_rate": 5.9222382428641174e-05,
"loss": 1.8223,
"step": 790
},
{
"epoch": 1.509433962264151,
"grad_norm": 1.3426007079954652,
"learning_rate": 5.814138918226887e-05,
"loss": 1.7957,
"step": 800
},
{
"epoch": 1.5283018867924527,
"grad_norm": 1.4496928054044773,
"learning_rate": 5.7056472183892806e-05,
"loss": 1.8542,
"step": 810
},
{
"epoch": 1.5471698113207548,
"grad_norm": 1.7249530177698127,
"learning_rate": 5.5968154310524614e-05,
"loss": 1.8043,
"step": 820
},
{
"epoch": 1.5660377358490565,
"grad_norm": 1.4451712049547103,
"learning_rate": 5.487696007823161e-05,
"loss": 1.7981,
"step": 830
},
{
"epoch": 1.5849056603773586,
"grad_norm": 1.5035729769726907,
"learning_rate": 5.378341538934566e-05,
"loss": 1.8313,
"step": 840
},
{
"epoch": 1.6037735849056602,
"grad_norm": 1.3823097737594126,
"learning_rate": 5.268804727900391e-05,
"loss": 1.8476,
"step": 850
},
{
"epoch": 1.6226415094339623,
"grad_norm": 1.41439773210909,
"learning_rate": 5.159138366114358e-05,
"loss": 1.7863,
"step": 860
},
{
"epoch": 1.641509433962264,
"grad_norm": 1.513162165314957,
"learning_rate": 5.049395307407329e-05,
"loss": 1.8363,
"step": 870
},
{
"epoch": 1.6603773584905661,
"grad_norm": 1.5375457880909025,
"learning_rate": 4.9396284425743326e-05,
"loss": 1.8004,
"step": 880
},
{
"epoch": 1.6792452830188678,
"grad_norm": 1.5695919072614308,
"learning_rate": 4.829890673883792e-05,
"loss": 1.818,
"step": 890
},
{
"epoch": 1.6981132075471699,
"grad_norm": 1.3666688643802247,
"learning_rate": 4.7202348895812035e-05,
"loss": 1.7885,
"step": 900
},
{
"epoch": 1.7169811320754715,
"grad_norm": 1.6027481528500458,
"learning_rate": 4.610713938399601e-05,
"loss": 1.7906,
"step": 910
},
{
"epoch": 1.7358490566037736,
"grad_norm": 1.3930291385793376,
"learning_rate": 4.5013806040890294e-05,
"loss": 1.7858,
"step": 920
},
{
"epoch": 1.7547169811320755,
"grad_norm": 1.4293209085375194,
"learning_rate": 4.392287579977374e-05,
"loss": 1.7796,
"step": 930
},
{
"epoch": 1.7735849056603774,
"grad_norm": 1.5151788900532224,
"learning_rate": 4.2834874435747305e-05,
"loss": 1.7666,
"step": 940
},
{
"epoch": 1.7924528301886793,
"grad_norm": 1.5253274784864974,
"learning_rate": 4.1750326312336254e-05,
"loss": 1.7516,
"step": 950
},
{
"epoch": 1.8113207547169812,
"grad_norm": 1.3957421524480444,
"learning_rate": 4.066975412877255e-05,
"loss": 1.7904,
"step": 960
},
{
"epoch": 1.830188679245283,
"grad_norm": 1.399046653332325,
"learning_rate": 3.959367866807926e-05,
"loss": 1.7605,
"step": 970
},
{
"epoch": 1.849056603773585,
"grad_norm": 1.48580398039922,
"learning_rate": 3.852261854607866e-05,
"loss": 1.8169,
"step": 980
},
{
"epoch": 1.8679245283018868,
"grad_norm": 1.4703556780094864,
"learning_rate": 3.7457089961444636e-05,
"loss": 1.7652,
"step": 990
},
{
"epoch": 1.8867924528301887,
"grad_norm": 1.4196287584590106,
"learning_rate": 3.6397606446920294e-05,
"loss": 1.75,
"step": 1000
},
{
"epoch": 1.8867924528301887,
"eval_loss": 2.2884254455566406,
"eval_runtime": 165.0682,
"eval_samples_per_second": 11.413,
"eval_steps_per_second": 2.853,
"step": 1000
},
{
"epoch": 1.9056603773584906,
"grad_norm": 1.442346199206303,
"learning_rate": 3.534467862182008e-05,
"loss": 1.7847,
"step": 1010
},
{
"epoch": 1.9245283018867925,
"grad_norm": 1.3835916856247392,
"learning_rate": 3.4298813945936295e-05,
"loss": 1.7737,
"step": 1020
},
{
"epoch": 1.9433962264150944,
"grad_norm": 1.3821884730018883,
"learning_rate": 3.3260516474968285e-05,
"loss": 1.7281,
"step": 1030
},
{
"epoch": 1.9622641509433962,
"grad_norm": 1.3924722724907153,
"learning_rate": 3.223028661759211e-05,
"loss": 1.7924,
"step": 1040
},
{
"epoch": 1.9811320754716981,
"grad_norm": 1.3388702147690976,
"learning_rate": 3.12086208942881e-05,
"loss": 1.7397,
"step": 1050
},
{
"epoch": 2.0,
"grad_norm": 1.4015243388990968,
"learning_rate": 3.019601169804216e-05,
"loss": 1.6932,
"step": 1060
},
{
"epoch": 2.018867924528302,
"grad_norm": 1.7480746986263314,
"learning_rate": 2.919294705703647e-05,
"loss": 0.6881,
"step": 1070
},
{
"epoch": 2.0377358490566038,
"grad_norm": 1.7026666847000977,
"learning_rate": 2.819991039944363e-05,
"loss": 0.6078,
"step": 1080
},
{
"epoch": 2.056603773584906,
"grad_norm": 1.7917514233908862,
"learning_rate": 2.7217380320437978e-05,
"loss": 0.6092,
"step": 1090
},
{
"epoch": 2.0754716981132075,
"grad_norm": 1.6723597171494868,
"learning_rate": 2.624583035153609e-05,
"loss": 0.585,
"step": 1100
},
{
"epoch": 2.0943396226415096,
"grad_norm": 1.63904815463906,
"learning_rate": 2.5285728732377613e-05,
"loss": 0.577,
"step": 1110
},
{
"epoch": 2.1132075471698113,
"grad_norm": 1.6791437732786112,
"learning_rate": 2.4337538185056762e-05,
"loss": 0.551,
"step": 1120
},
{
"epoch": 2.1320754716981134,
"grad_norm": 1.6076545037137666,
"learning_rate": 2.3401715691112746e-05,
"loss": 0.556,
"step": 1130
},
{
"epoch": 2.150943396226415,
"grad_norm": 1.726665027733004,
"learning_rate": 2.247871227128709e-05,
"loss": 0.5711,
"step": 1140
},
{
"epoch": 2.169811320754717,
"grad_norm": 1.6490156416373818,
"learning_rate": 2.1568972768153556e-05,
"loss": 0.5601,
"step": 1150
},
{
"epoch": 2.188679245283019,
"grad_norm": 1.7210537816210676,
"learning_rate": 2.067293563172581e-05,
"loss": 0.5609,
"step": 1160
},
{
"epoch": 2.207547169811321,
"grad_norm": 1.6521402147978896,
"learning_rate": 1.9791032708145963e-05,
"loss": 0.5417,
"step": 1170
},
{
"epoch": 2.2264150943396226,
"grad_norm": 1.7020323862071838,
"learning_rate": 1.8923689031555697e-05,
"loss": 0.5635,
"step": 1180
},
{
"epoch": 2.2452830188679247,
"grad_norm": 1.5791599921066155,
"learning_rate": 1.807132261925073e-05,
"loss": 0.5371,
"step": 1190
},
{
"epoch": 2.2641509433962264,
"grad_norm": 1.6370275383685373,
"learning_rate": 1.7234344270216713e-05,
"loss": 0.5459,
"step": 1200
},
{
"epoch": 2.2830188679245285,
"grad_norm": 1.649807184686461,
"learning_rate": 1.6413157367144354e-05,
"loss": 0.5608,
"step": 1210
},
{
"epoch": 2.30188679245283,
"grad_norm": 1.7662002841569535,
"learning_rate": 1.5608157682018505e-05,
"loss": 0.5613,
"step": 1220
},
{
"epoch": 2.3207547169811322,
"grad_norm": 1.641520954901167,
"learning_rate": 1.4819733185375534e-05,
"loss": 0.537,
"step": 1230
},
{
"epoch": 2.339622641509434,
"grad_norm": 1.6680780951150302,
"learning_rate": 1.4048263859320344e-05,
"loss": 0.5425,
"step": 1240
},
{
"epoch": 2.358490566037736,
"grad_norm": 1.5858289559337815,
"learning_rate": 1.3294121514393637e-05,
"loss": 0.5289,
"step": 1250
},
{
"epoch": 2.3773584905660377,
"grad_norm": 1.609281814988441,
"learning_rate": 1.2557669610377399e-05,
"loss": 0.5155,
"step": 1260
},
{
"epoch": 2.3962264150943398,
"grad_norm": 1.6108061713809745,
"learning_rate": 1.1839263081124946e-05,
"loss": 0.5214,
"step": 1270
},
{
"epoch": 2.4150943396226414,
"grad_norm": 1.5364583247125485,
"learning_rate": 1.113924816350026e-05,
"loss": 0.5326,
"step": 1280
},
{
"epoch": 2.4339622641509435,
"grad_norm": 1.523827370861251,
"learning_rate": 1.04579622305086e-05,
"loss": 0.5218,
"step": 1290
},
{
"epoch": 2.452830188679245,
"grad_norm": 1.6969638639614046,
"learning_rate": 9.795733628699333e-06,
"loss": 0.5341,
"step": 1300
},
{
"epoch": 2.4716981132075473,
"grad_norm": 1.502222163556516,
"learning_rate": 9.152881519918787e-06,
"loss": 0.5102,
"step": 1310
},
{
"epoch": 2.490566037735849,
"grad_norm": 1.6251186914379474,
"learning_rate": 8.529715727489912e-06,
"loss": 0.5113,
"step": 1320
},
{
"epoch": 2.509433962264151,
"grad_norm": 1.641634385361185,
"learning_rate": 7.926536586892591e-06,
"loss": 0.51,
"step": 1330
},
{
"epoch": 2.5283018867924527,
"grad_norm": 1.564996479749529,
"learning_rate": 7.3436348010165025e-06,
"loss": 0.5075,
"step": 1340
},
{
"epoch": 2.547169811320755,
"grad_norm": 1.5204914266086813,
"learning_rate": 6.781291300056647e-06,
"loss": 0.5111,
"step": 1350
},
{
"epoch": 2.5660377358490565,
"grad_norm": 1.5204438359613908,
"learning_rate": 6.239777106118605e-06,
"loss": 0.501,
"step": 1360
},
{
"epoch": 2.5849056603773586,
"grad_norm": 1.6153170323469739,
"learning_rate": 5.719353202599209e-06,
"loss": 0.5065,
"step": 1370
},
{
"epoch": 2.6037735849056602,
"grad_norm": 1.532440501266883,
"learning_rate": 5.220270408405198e-06,
"loss": 0.5268,
"step": 1380
},
{
"epoch": 2.6226415094339623,
"grad_norm": 1.5295028060682831,
"learning_rate": 4.7427692570708445e-06,
"loss": 0.5225,
"step": 1390
},
{
"epoch": 2.641509433962264,
"grad_norm": 1.5576876729006885,
"learning_rate": 4.287079880832478e-06,
"loss": 0.5094,
"step": 1400
},
{
"epoch": 2.660377358490566,
"grad_norm": 1.535240889295645,
"learning_rate": 3.853421899715992e-06,
"loss": 0.4991,
"step": 1410
},
{
"epoch": 2.6792452830188678,
"grad_norm": 1.5668838039374533,
"learning_rate": 3.44200431569075e-06,
"loss": 0.5011,
"step": 1420
},
{
"epoch": 2.69811320754717,
"grad_norm": 1.6597779325377704,
"learning_rate": 3.053025411940802e-06,
"loss": 0.4954,
"step": 1430
},
{
"epoch": 2.7169811320754715,
"grad_norm": 1.5562079580978392,
"learning_rate": 2.6866726573021026e-06,
"loss": 0.5054,
"step": 1440
},
{
"epoch": 2.7358490566037736,
"grad_norm": 1.5996686204830912,
"learning_rate": 2.3431226159116637e-06,
"loss": 0.5154,
"step": 1450
},
{
"epoch": 2.7547169811320753,
"grad_norm": 1.6603987931741782,
"learning_rate": 2.022540862112282e-06,
"loss": 0.5029,
"step": 1460
},
{
"epoch": 2.7735849056603774,
"grad_norm": 1.4442160081367916,
"learning_rate": 1.725081900653791e-06,
"loss": 0.5147,
"step": 1470
},
{
"epoch": 2.7924528301886795,
"grad_norm": 1.5601472307077258,
"learning_rate": 1.4508890922293018e-06,
"loss": 0.4882,
"step": 1480
},
{
"epoch": 2.811320754716981,
"grad_norm": 1.6882814081660615,
"learning_rate": 1.2000945843823551e-06,
"loss": 0.4909,
"step": 1490
},
{
"epoch": 2.830188679245283,
"grad_norm": 1.5897926116142052,
"learning_rate": 9.728192478182574e-07,
"loss": 0.485,
"step": 1500
},
{
"epoch": 2.849056603773585,
"grad_norm": 1.480162495765326,
"learning_rate": 7.691726181503267e-07,
"loss": 0.4985,
"step": 1510
},
{
"epoch": 2.867924528301887,
"grad_norm": 1.5161543246256077,
"learning_rate": 5.892528431090393e-07,
"loss": 0.4816,
"step": 1520
},
{
"epoch": 2.8867924528301887,
"grad_norm": 1.5434464499844907,
"learning_rate": 4.331466352396396e-07,
"loss": 0.4955,
"step": 1530
},
{
"epoch": 2.9056603773584904,
"grad_norm": 1.5292680330833108,
"learning_rate": 3.009292301109412e-07,
"loss": 0.5018,
"step": 1540
},
{
"epoch": 2.9245283018867925,
"grad_norm": 1.501995031518757,
"learning_rate": 1.9266435005540483e-07,
"loss": 0.5011,
"step": 1550
},
{
"epoch": 2.9433962264150946,
"grad_norm": 1.5344813758662075,
"learning_rate": 1.0840417345814313e-07,
"loss": 0.5141,
"step": 1560
},
{
"epoch": 2.9622641509433962,
"grad_norm": 1.5204098865333115,
"learning_rate": 4.818930960945878e-08,
"loss": 0.4904,
"step": 1570
},
{
"epoch": 2.981132075471698,
"grad_norm": 1.5256874098586901,
"learning_rate": 1.2048779133150279e-08,
"loss": 0.4746,
"step": 1580
},
{
"epoch": 3.0,
"grad_norm": 1.4804382321073322,
"learning_rate": 0.0,
"loss": 0.5039,
"step": 1590
},
{
"epoch": 3.0,
"step": 1590,
"total_flos": 83202240675840.0,
"train_loss": 1.594443890733539,
"train_runtime": 15602.0514,
"train_samples_per_second": 3.26,
"train_steps_per_second": 0.102
}
],
"logging_steps": 10,
"max_steps": 1590,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 83202240675840.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}