{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.996415770609319,
"eval_steps": 500,
"global_step": 1254,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023894862604540025,
"grad_norm": 4.236361518666415,
"learning_rate": 5e-06,
"loss": 0.881,
"step": 10
},
{
"epoch": 0.04778972520908005,
"grad_norm": 5.741199257309677,
"learning_rate": 5e-06,
"loss": 0.7803,
"step": 20
},
{
"epoch": 0.07168458781362007,
"grad_norm": 19.255957544263854,
"learning_rate": 5e-06,
"loss": 0.7636,
"step": 30
},
{
"epoch": 0.0955794504181601,
"grad_norm": 1.4389740219584857,
"learning_rate": 5e-06,
"loss": 0.7601,
"step": 40
},
{
"epoch": 0.11947431302270012,
"grad_norm": 2.052033228969374,
"learning_rate": 5e-06,
"loss": 0.7307,
"step": 50
},
{
"epoch": 0.14336917562724014,
"grad_norm": 1.4883942502737415,
"learning_rate": 5e-06,
"loss": 0.7194,
"step": 60
},
{
"epoch": 0.16726403823178015,
"grad_norm": 0.7721229562083435,
"learning_rate": 5e-06,
"loss": 0.7186,
"step": 70
},
{
"epoch": 0.1911589008363202,
"grad_norm": 0.5683129469939435,
"learning_rate": 5e-06,
"loss": 0.6965,
"step": 80
},
{
"epoch": 0.21505376344086022,
"grad_norm": 0.527733611127623,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 90
},
{
"epoch": 0.23894862604540024,
"grad_norm": 0.5540046789983225,
"learning_rate": 5e-06,
"loss": 0.693,
"step": 100
},
{
"epoch": 0.2628434886499403,
"grad_norm": 0.5451390307514128,
"learning_rate": 5e-06,
"loss": 0.6802,
"step": 110
},
{
"epoch": 0.2867383512544803,
"grad_norm": 0.5143838898116624,
"learning_rate": 5e-06,
"loss": 0.688,
"step": 120
},
{
"epoch": 0.3106332138590203,
"grad_norm": 0.6502984472755421,
"learning_rate": 5e-06,
"loss": 0.6838,
"step": 130
},
{
"epoch": 0.3345280764635603,
"grad_norm": 0.5635569077666838,
"learning_rate": 5e-06,
"loss": 0.6733,
"step": 140
},
{
"epoch": 0.35842293906810035,
"grad_norm": 0.6029469287016763,
"learning_rate": 5e-06,
"loss": 0.6776,
"step": 150
},
{
"epoch": 0.3823178016726404,
"grad_norm": 0.486292600711864,
"learning_rate": 5e-06,
"loss": 0.6661,
"step": 160
},
{
"epoch": 0.4062126642771804,
"grad_norm": 0.6615883711779132,
"learning_rate": 5e-06,
"loss": 0.6652,
"step": 170
},
{
"epoch": 0.43010752688172044,
"grad_norm": 0.4717863479299739,
"learning_rate": 5e-06,
"loss": 0.6655,
"step": 180
},
{
"epoch": 0.4540023894862604,
"grad_norm": 0.4888275284899482,
"learning_rate": 5e-06,
"loss": 0.662,
"step": 190
},
{
"epoch": 0.4778972520908005,
"grad_norm": 0.5394213188181476,
"learning_rate": 5e-06,
"loss": 0.6687,
"step": 200
},
{
"epoch": 0.5017921146953405,
"grad_norm": 0.45576158948311507,
"learning_rate": 5e-06,
"loss": 0.6661,
"step": 210
},
{
"epoch": 0.5256869772998806,
"grad_norm": 0.45151984287636476,
"learning_rate": 5e-06,
"loss": 0.6597,
"step": 220
},
{
"epoch": 0.5495818399044206,
"grad_norm": 0.4799647706900106,
"learning_rate": 5e-06,
"loss": 0.6602,
"step": 230
},
{
"epoch": 0.5734767025089605,
"grad_norm": 0.47076133511342133,
"learning_rate": 5e-06,
"loss": 0.6571,
"step": 240
},
{
"epoch": 0.5973715651135006,
"grad_norm": 0.5919116297131423,
"learning_rate": 5e-06,
"loss": 0.6615,
"step": 250
},
{
"epoch": 0.6212664277180406,
"grad_norm": 0.5500231129527917,
"learning_rate": 5e-06,
"loss": 0.6585,
"step": 260
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.5242998976170237,
"learning_rate": 5e-06,
"loss": 0.6633,
"step": 270
},
{
"epoch": 0.6690561529271206,
"grad_norm": 0.44132900428051,
"learning_rate": 5e-06,
"loss": 0.6588,
"step": 280
},
{
"epoch": 0.6929510155316607,
"grad_norm": 0.6925054556015406,
"learning_rate": 5e-06,
"loss": 0.6553,
"step": 290
},
{
"epoch": 0.7168458781362007,
"grad_norm": 0.4625241785333385,
"learning_rate": 5e-06,
"loss": 0.6574,
"step": 300
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.4229402308269957,
"learning_rate": 5e-06,
"loss": 0.6527,
"step": 310
},
{
"epoch": 0.7646356033452808,
"grad_norm": 0.5130609463277542,
"learning_rate": 5e-06,
"loss": 0.6561,
"step": 320
},
{
"epoch": 0.7885304659498208,
"grad_norm": 0.6838274381409521,
"learning_rate": 5e-06,
"loss": 0.6555,
"step": 330
},
{
"epoch": 0.8124253285543608,
"grad_norm": 0.4426103821343896,
"learning_rate": 5e-06,
"loss": 0.6528,
"step": 340
},
{
"epoch": 0.8363201911589009,
"grad_norm": 0.4768048776745041,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 350
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.47979657505843953,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 360
},
{
"epoch": 0.8841099163679809,
"grad_norm": 0.43210991398577236,
"learning_rate": 5e-06,
"loss": 0.6545,
"step": 370
},
{
"epoch": 0.9080047789725209,
"grad_norm": 0.4219482631866451,
"learning_rate": 5e-06,
"loss": 0.6561,
"step": 380
},
{
"epoch": 0.931899641577061,
"grad_norm": 0.4889263682317913,
"learning_rate": 5e-06,
"loss": 0.6415,
"step": 390
},
{
"epoch": 0.955794504181601,
"grad_norm": 0.4994356501839893,
"learning_rate": 5e-06,
"loss": 0.6434,
"step": 400
},
{
"epoch": 0.9796893667861409,
"grad_norm": 0.5756138907013993,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 410
},
{
"epoch": 0.998805256869773,
"eval_loss": 0.6466529965400696,
"eval_runtime": 225.9354,
"eval_samples_per_second": 49.895,
"eval_steps_per_second": 0.394,
"step": 418
},
{
"epoch": 1.003584229390681,
"grad_norm": 0.7256416169536216,
"learning_rate": 5e-06,
"loss": 0.6438,
"step": 420
},
{
"epoch": 1.027479091995221,
"grad_norm": 0.6564158902335233,
"learning_rate": 5e-06,
"loss": 0.6108,
"step": 430
},
{
"epoch": 1.0513739545997611,
"grad_norm": 0.4999679801637927,
"learning_rate": 5e-06,
"loss": 0.6106,
"step": 440
},
{
"epoch": 1.075268817204301,
"grad_norm": 0.5241048691611577,
"learning_rate": 5e-06,
"loss": 0.6124,
"step": 450
},
{
"epoch": 1.099163679808841,
"grad_norm": 0.5456228664692746,
"learning_rate": 5e-06,
"loss": 0.6042,
"step": 460
},
{
"epoch": 1.1230585424133812,
"grad_norm": 0.5456744152195628,
"learning_rate": 5e-06,
"loss": 0.6028,
"step": 470
},
{
"epoch": 1.146953405017921,
"grad_norm": 0.4664933079979728,
"learning_rate": 5e-06,
"loss": 0.6095,
"step": 480
},
{
"epoch": 1.1708482676224612,
"grad_norm": 0.4894583019401931,
"learning_rate": 5e-06,
"loss": 0.6019,
"step": 490
},
{
"epoch": 1.194743130227001,
"grad_norm": 0.4942642519947347,
"learning_rate": 5e-06,
"loss": 0.6114,
"step": 500
},
{
"epoch": 1.2186379928315412,
"grad_norm": 0.46554339302452813,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 510
},
{
"epoch": 1.2425328554360813,
"grad_norm": 0.5215764597896382,
"learning_rate": 5e-06,
"loss": 0.6073,
"step": 520
},
{
"epoch": 1.2664277180406214,
"grad_norm": 0.5142341654295087,
"learning_rate": 5e-06,
"loss": 0.6105,
"step": 530
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.4429903840954624,
"learning_rate": 5e-06,
"loss": 0.61,
"step": 540
},
{
"epoch": 1.3142174432497014,
"grad_norm": 0.4244756990330428,
"learning_rate": 5e-06,
"loss": 0.6113,
"step": 550
},
{
"epoch": 1.3381123058542412,
"grad_norm": 0.4664930270424248,
"learning_rate": 5e-06,
"loss": 0.6057,
"step": 560
},
{
"epoch": 1.3620071684587813,
"grad_norm": 0.6747787167132405,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 570
},
{
"epoch": 1.3859020310633214,
"grad_norm": 0.8515989236641928,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 580
},
{
"epoch": 1.4097968936678615,
"grad_norm": 0.634857639704424,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 590
},
{
"epoch": 1.4336917562724014,
"grad_norm": 0.5282115500074044,
"learning_rate": 5e-06,
"loss": 0.6096,
"step": 600
},
{
"epoch": 1.4575866188769415,
"grad_norm": 0.5576953727126037,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 610
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.45965397939992636,
"learning_rate": 5e-06,
"loss": 0.6082,
"step": 620
},
{
"epoch": 1.5053763440860215,
"grad_norm": 0.5729607655893968,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 630
},
{
"epoch": 1.5292712066905616,
"grad_norm": 0.4420855639504453,
"learning_rate": 5e-06,
"loss": 0.608,
"step": 640
},
{
"epoch": 1.5531660692951017,
"grad_norm": 0.4815965030552482,
"learning_rate": 5e-06,
"loss": 0.6053,
"step": 650
},
{
"epoch": 1.5770609318996416,
"grad_norm": 0.5446732967871324,
"learning_rate": 5e-06,
"loss": 0.6076,
"step": 660
},
{
"epoch": 1.6009557945041815,
"grad_norm": 0.5773921107864519,
"learning_rate": 5e-06,
"loss": 0.6058,
"step": 670
},
{
"epoch": 1.6248506571087216,
"grad_norm": 0.44904612161350127,
"learning_rate": 5e-06,
"loss": 0.5988,
"step": 680
},
{
"epoch": 1.6487455197132617,
"grad_norm": 0.4659803956684399,
"learning_rate": 5e-06,
"loss": 0.598,
"step": 690
},
{
"epoch": 1.6726403823178018,
"grad_norm": 0.4361474003132107,
"learning_rate": 5e-06,
"loss": 0.6081,
"step": 700
},
{
"epoch": 1.6965352449223416,
"grad_norm": 0.4702827100539838,
"learning_rate": 5e-06,
"loss": 0.5997,
"step": 710
},
{
"epoch": 1.7204301075268817,
"grad_norm": 0.46962735672309736,
"learning_rate": 5e-06,
"loss": 0.6135,
"step": 720
},
{
"epoch": 1.7443249701314216,
"grad_norm": 0.5064462322593579,
"learning_rate": 5e-06,
"loss": 0.6034,
"step": 730
},
{
"epoch": 1.7682198327359617,
"grad_norm": 0.6442892941899157,
"learning_rate": 5e-06,
"loss": 0.6044,
"step": 740
},
{
"epoch": 1.7921146953405018,
"grad_norm": 0.449859458258856,
"learning_rate": 5e-06,
"loss": 0.6062,
"step": 750
},
{
"epoch": 1.816009557945042,
"grad_norm": 0.47467567108778363,
"learning_rate": 5e-06,
"loss": 0.6035,
"step": 760
},
{
"epoch": 1.8399044205495818,
"grad_norm": 0.43550415026449085,
"learning_rate": 5e-06,
"loss": 0.5987,
"step": 770
},
{
"epoch": 1.863799283154122,
"grad_norm": 0.48913780227876247,
"learning_rate": 5e-06,
"loss": 0.6031,
"step": 780
},
{
"epoch": 1.8876941457586618,
"grad_norm": 0.5594004132295759,
"learning_rate": 5e-06,
"loss": 0.5995,
"step": 790
},
{
"epoch": 1.911589008363202,
"grad_norm": 0.4971730954697683,
"learning_rate": 5e-06,
"loss": 0.6056,
"step": 800
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.46186692571258725,
"learning_rate": 5e-06,
"loss": 0.6004,
"step": 810
},
{
"epoch": 1.959378733572282,
"grad_norm": 0.4508830943663248,
"learning_rate": 5e-06,
"loss": 0.604,
"step": 820
},
{
"epoch": 1.983273596176822,
"grad_norm": 0.501207898912081,
"learning_rate": 5e-06,
"loss": 0.608,
"step": 830
},
{
"epoch": 2.0,
"eval_loss": 0.6365451812744141,
"eval_runtime": 226.5918,
"eval_samples_per_second": 49.75,
"eval_steps_per_second": 0.393,
"step": 837
},
{
"epoch": 2.007168458781362,
"grad_norm": 0.85054207213812,
"learning_rate": 5e-06,
"loss": 0.592,
"step": 840
},
{
"epoch": 2.031063321385902,
"grad_norm": 0.5201507086108782,
"learning_rate": 5e-06,
"loss": 0.5608,
"step": 850
},
{
"epoch": 2.054958183990442,
"grad_norm": 0.7188535226812537,
"learning_rate": 5e-06,
"loss": 0.5647,
"step": 860
},
{
"epoch": 2.078853046594982,
"grad_norm": 0.4763195641282365,
"learning_rate": 5e-06,
"loss": 0.5644,
"step": 870
},
{
"epoch": 2.1027479091995223,
"grad_norm": 0.5081415859208832,
"learning_rate": 5e-06,
"loss": 0.5622,
"step": 880
},
{
"epoch": 2.126642771804062,
"grad_norm": 0.5931792000293172,
"learning_rate": 5e-06,
"loss": 0.5563,
"step": 890
},
{
"epoch": 2.150537634408602,
"grad_norm": 0.49851033855755,
"learning_rate": 5e-06,
"loss": 0.5611,
"step": 900
},
{
"epoch": 2.174432497013142,
"grad_norm": 0.5379278365329638,
"learning_rate": 5e-06,
"loss": 0.5573,
"step": 910
},
{
"epoch": 2.198327359617682,
"grad_norm": 0.5350268044233742,
"learning_rate": 5e-06,
"loss": 0.5678,
"step": 920
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.5689123372030673,
"learning_rate": 5e-06,
"loss": 0.5686,
"step": 930
},
{
"epoch": 2.2461170848267624,
"grad_norm": 0.6671996296787344,
"learning_rate": 5e-06,
"loss": 0.5622,
"step": 940
},
{
"epoch": 2.270011947431302,
"grad_norm": 0.4506810352733908,
"learning_rate": 5e-06,
"loss": 0.5642,
"step": 950
},
{
"epoch": 2.293906810035842,
"grad_norm": 0.5358151280205125,
"learning_rate": 5e-06,
"loss": 0.5628,
"step": 960
},
{
"epoch": 2.3178016726403823,
"grad_norm": 0.5566771627404731,
"learning_rate": 5e-06,
"loss": 0.5634,
"step": 970
},
{
"epoch": 2.3416965352449224,
"grad_norm": 0.49963936030628325,
"learning_rate": 5e-06,
"loss": 0.5632,
"step": 980
},
{
"epoch": 2.3655913978494625,
"grad_norm": 0.48679480824629434,
"learning_rate": 5e-06,
"loss": 0.5583,
"step": 990
},
{
"epoch": 2.389486260454002,
"grad_norm": 0.5074816823498985,
"learning_rate": 5e-06,
"loss": 0.5636,
"step": 1000
},
{
"epoch": 2.4133811230585422,
"grad_norm": 0.5739148795335686,
"learning_rate": 5e-06,
"loss": 0.5671,
"step": 1010
},
{
"epoch": 2.4372759856630823,
"grad_norm": 0.6501742104516552,
"learning_rate": 5e-06,
"loss": 0.5666,
"step": 1020
},
{
"epoch": 2.4611708482676224,
"grad_norm": 0.43406800220014613,
"learning_rate": 5e-06,
"loss": 0.5645,
"step": 1030
},
{
"epoch": 2.4850657108721625,
"grad_norm": 0.47946981158627366,
"learning_rate": 5e-06,
"loss": 0.5612,
"step": 1040
},
{
"epoch": 2.5089605734767026,
"grad_norm": 0.5508677225984592,
"learning_rate": 5e-06,
"loss": 0.5658,
"step": 1050
},
{
"epoch": 2.5328554360812428,
"grad_norm": 0.6172108213167418,
"learning_rate": 5e-06,
"loss": 0.5656,
"step": 1060
},
{
"epoch": 2.5567502986857824,
"grad_norm": 0.6149816712572169,
"learning_rate": 5e-06,
"loss": 0.5637,
"step": 1070
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.5494076230620691,
"learning_rate": 5e-06,
"loss": 0.5748,
"step": 1080
},
{
"epoch": 2.6045400238948626,
"grad_norm": 0.5098015036653776,
"learning_rate": 5e-06,
"loss": 0.5665,
"step": 1090
},
{
"epoch": 2.6284348864994027,
"grad_norm": 0.4763003977246298,
"learning_rate": 5e-06,
"loss": 0.5659,
"step": 1100
},
{
"epoch": 2.652329749103943,
"grad_norm": 0.45015059391064355,
"learning_rate": 5e-06,
"loss": 0.5613,
"step": 1110
},
{
"epoch": 2.6762246117084825,
"grad_norm": 0.5195016081388676,
"learning_rate": 5e-06,
"loss": 0.5661,
"step": 1120
},
{
"epoch": 2.7001194743130226,
"grad_norm": 0.461979850463992,
"learning_rate": 5e-06,
"loss": 0.5703,
"step": 1130
},
{
"epoch": 2.7240143369175627,
"grad_norm": 0.4611698536891998,
"learning_rate": 5e-06,
"loss": 0.5628,
"step": 1140
},
{
"epoch": 2.7479091995221028,
"grad_norm": 0.5474996121575114,
"learning_rate": 5e-06,
"loss": 0.5692,
"step": 1150
},
{
"epoch": 2.771804062126643,
"grad_norm": 0.477411852958178,
"learning_rate": 5e-06,
"loss": 0.5745,
"step": 1160
},
{
"epoch": 2.795698924731183,
"grad_norm": 0.48004817339516165,
"learning_rate": 5e-06,
"loss": 0.5625,
"step": 1170
},
{
"epoch": 2.819593787335723,
"grad_norm": 0.5043226922994581,
"learning_rate": 5e-06,
"loss": 0.5664,
"step": 1180
},
{
"epoch": 2.8434886499402627,
"grad_norm": 0.4988305698181874,
"learning_rate": 5e-06,
"loss": 0.5649,
"step": 1190
},
{
"epoch": 2.867383512544803,
"grad_norm": 0.4569103859069353,
"learning_rate": 5e-06,
"loss": 0.5647,
"step": 1200
},
{
"epoch": 2.891278375149343,
"grad_norm": 0.46286445346886024,
"learning_rate": 5e-06,
"loss": 0.5621,
"step": 1210
},
{
"epoch": 2.915173237753883,
"grad_norm": 0.5296890930558641,
"learning_rate": 5e-06,
"loss": 0.5668,
"step": 1220
},
{
"epoch": 2.9390681003584227,
"grad_norm": 0.5546209266748766,
"learning_rate": 5e-06,
"loss": 0.5688,
"step": 1230
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.5910470543653078,
"learning_rate": 5e-06,
"loss": 0.5668,
"step": 1240
},
{
"epoch": 2.986857825567503,
"grad_norm": 0.5524701632367459,
"learning_rate": 5e-06,
"loss": 0.5714,
"step": 1250
},
{
"epoch": 2.996415770609319,
"eval_loss": 0.6373269557952881,
"eval_runtime": 227.0283,
"eval_samples_per_second": 49.655,
"eval_steps_per_second": 0.392,
"step": 1254
},
{
"epoch": 2.996415770609319,
"step": 1254,
"total_flos": 2100077946470400.0,
"train_loss": 0.6173035094612523,
"train_runtime": 37751.1113,
"train_samples_per_second": 17.021,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 1254,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2100077946470400.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}