{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.262571103526735,
"eval_steps": 5000,
"global_step": 55048,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11376564277588168,
"grad_norm": 4.343299865722656,
"learning_rate": 7.960000000000001e-05,
"loss": 4.3757,
"step": 1000
},
{
"epoch": 0.22753128555176336,
"grad_norm": 3.4980385303497314,
"learning_rate": 0.0001596,
"loss": 1.7997,
"step": 2000
},
{
"epoch": 0.3412969283276451,
"grad_norm": 3.350770950317383,
"learning_rate": 0.0002396,
"loss": 1.4879,
"step": 3000
},
{
"epoch": 0.4550625711035267,
"grad_norm": 3.362269163131714,
"learning_rate": 0.0003196,
"loss": 1.3834,
"step": 4000
},
{
"epoch": 0.5688282138794084,
"grad_norm": 2.424733877182007,
"learning_rate": 0.0003996,
"loss": 1.3351,
"step": 5000
},
{
"epoch": 0.5688282138794084,
"eval_accuracy": 0.668216,
"eval_loss": 1.3343349695205688,
"eval_runtime": 15.5029,
"eval_samples_per_second": 16125.98,
"eval_steps_per_second": 31.542,
"step": 5000
},
{
"epoch": 0.6825938566552902,
"grad_norm": 2.601680040359497,
"learning_rate": 0.00047960000000000006,
"loss": 1.3097,
"step": 6000
},
{
"epoch": 0.7963594994311718,
"grad_norm": 1.9989418983459473,
"learning_rate": 0.0005596,
"loss": 1.2866,
"step": 7000
},
{
"epoch": 0.9101251422070534,
"grad_norm": 1.8081185817718506,
"learning_rate": 0.0006396,
"loss": 1.2689,
"step": 8000
},
{
"epoch": 1.023890784982935,
"grad_norm": 1.6405588388442993,
"learning_rate": 0.00071952,
"loss": 1.2487,
"step": 9000
},
{
"epoch": 1.1376564277588168,
"grad_norm": 1.22613525390625,
"learning_rate": 0.00079952,
"loss": 1.2165,
"step": 10000
},
{
"epoch": 1.1376564277588168,
"eval_accuracy": 0.686196,
"eval_loss": 1.2481120824813843,
"eval_runtime": 15.8446,
"eval_samples_per_second": 15778.257,
"eval_steps_per_second": 30.862,
"step": 10000
},
{
"epoch": 1.2514220705346986,
"grad_norm": 1.2695167064666748,
"learning_rate": 0.0007996786565611985,
"loss": 1.2046,
"step": 11000
},
{
"epoch": 1.36518771331058,
"grad_norm": 1.2521305084228516,
"learning_rate": 0.0007987086748436788,
"loss": 1.1849,
"step": 12000
},
{
"epoch": 1.4789533560864618,
"grad_norm": 1.19619619846344,
"learning_rate": 0.0007970896788508052,
"loss": 1.1534,
"step": 13000
},
{
"epoch": 1.5927189988623436,
"grad_norm": 1.0483107566833496,
"learning_rate": 0.0007948275336376884,
"loss": 1.1312,
"step": 14000
},
{
"epoch": 1.7064846416382253,
"grad_norm": 1.3618515729904175,
"learning_rate": 0.0007919213896323948,
"loss": 1.112,
"step": 15000
},
{
"epoch": 1.7064846416382253,
"eval_accuracy": 0.716556,
"eval_loss": 1.1176625490188599,
"eval_runtime": 15.2386,
"eval_samples_per_second": 16405.688,
"eval_steps_per_second": 32.09,
"step": 15000
},
{
"epoch": 1.820250284414107,
"grad_norm": 0.9307771325111389,
"learning_rate": 0.0007883817747762077,
"loss": 1.0986,
"step": 16000
},
{
"epoch": 1.9340159271899886,
"grad_norm": 0.9189246296882629,
"learning_rate": 0.0007842073597303121,
"loss": 1.0847,
"step": 17000
},
{
"epoch": 2.04778156996587,
"grad_norm": 0.8016377687454224,
"learning_rate": 0.0007794081581686037,
"loss": 1.0506,
"step": 18000
},
{
"epoch": 2.161547212741752,
"grad_norm": 0.9774219989776611,
"learning_rate": 0.0007739919744091065,
"loss": 1.0158,
"step": 19000
},
{
"epoch": 2.2753128555176336,
"grad_norm": 0.8449124693870544,
"learning_rate": 0.0007679676160878387,
"loss": 1.0138,
"step": 20000
},
{
"epoch": 2.2753128555176336,
"eval_accuracy": 0.732636,
"eval_loss": 1.052935242652893,
"eval_runtime": 15.2399,
"eval_samples_per_second": 16404.275,
"eval_steps_per_second": 32.087,
"step": 20000
},
{
"epoch": 2.3890784982935154,
"grad_norm": 0.9211858510971069,
"learning_rate": 0.0007613448798360993,
"loss": 1.0113,
"step": 21000
},
{
"epoch": 2.502844141069397,
"grad_norm": 0.7870326042175293,
"learning_rate": 0.0007541345353494786,
"loss": 1.0024,
"step": 22000
},
{
"epoch": 2.616609783845279,
"grad_norm": 0.8683303594589233,
"learning_rate": 0.0007463483078745015,
"loss": 1.0032,
"step": 23000
},
{
"epoch": 2.73037542662116,
"grad_norm": 1.031267523765564,
"learning_rate": 0.000738007485475254,
"loss": 0.9961,
"step": 24000
},
{
"epoch": 2.8441410693970424,
"grad_norm": 0.8440726399421692,
"learning_rate": 0.0007291089356699791,
"loss": 0.9909,
"step": 25000
},
{
"epoch": 2.8441410693970424,
"eval_accuracy": 0.741524,
"eval_loss": 1.0143921375274658,
"eval_runtime": 14.9299,
"eval_samples_per_second": 16744.96,
"eval_steps_per_second": 32.753,
"step": 25000
},
{
"epoch": 2.9579067121729237,
"grad_norm": 0.708281934261322,
"learning_rate": 0.0007196848947861554,
"loss": 0.9832,
"step": 26000
},
{
"epoch": 3.0716723549488054,
"grad_norm": 0.692997395992279,
"learning_rate": 0.000709742030952583,
"loss": 0.9383,
"step": 27000
},
{
"epoch": 3.185437997724687,
"grad_norm": 0.8190609216690063,
"learning_rate": 0.0006992857783851634,
"loss": 0.9193,
"step": 28000
},
{
"epoch": 3.299203640500569,
"grad_norm": 0.7791016697883606,
"learning_rate": 0.0006883428362373026,
"loss": 0.9197,
"step": 29000
},
{
"epoch": 3.4129692832764507,
"grad_norm": 0.6834008693695068,
"learning_rate": 0.0006769309995941914,
"loss": 0.9236,
"step": 30000
},
{
"epoch": 3.4129692832764507,
"eval_accuracy": 0.748324,
"eval_loss": 0.9886829257011414,
"eval_runtime": 14.8544,
"eval_samples_per_second": 16830.07,
"eval_steps_per_second": 32.92,
"step": 30000
},
{
"epoch": 3.526734926052332,
"grad_norm": 0.7945353388786316,
"learning_rate": 0.0006650809067991791,
"loss": 0.9259,
"step": 31000
},
{
"epoch": 3.640500568828214,
"grad_norm": 0.7771942019462585,
"learning_rate": 0.000652788107427868,
"loss": 0.924,
"step": 32000
},
{
"epoch": 3.7542662116040955,
"grad_norm": 0.7232080101966858,
"learning_rate": 0.0006400842315977677,
"loss": 0.9149,
"step": 33000
},
{
"epoch": 3.868031854379977,
"grad_norm": 0.6129056215286255,
"learning_rate": 0.0006270032202430253,
"loss": 0.9142,
"step": 34000
},
{
"epoch": 3.981797497155859,
"grad_norm": 0.7053471803665161,
"learning_rate": 0.0006135401606551002,
"loss": 0.914,
"step": 35000
},
{
"epoch": 3.981797497155859,
"eval_accuracy": 0.754936,
"eval_loss": 0.9585933089256287,
"eval_runtime": 15.1971,
"eval_samples_per_second": 16450.463,
"eval_steps_per_second": 32.177,
"step": 35000
},
{
"epoch": 4.09556313993174,
"grad_norm": 0.7185536623001099,
"learning_rate": 0.0005997438247807972,
"loss": 0.8508,
"step": 36000
},
{
"epoch": 4.2093287827076225,
"grad_norm": 0.7729761600494385,
"learning_rate": 0.0005856090312640852,
"loss": 0.8434,
"step": 37000
},
{
"epoch": 4.323094425483504,
"grad_norm": 0.8405170440673828,
"learning_rate": 0.0005711724058927512,
"loss": 0.8442,
"step": 38000
},
{
"epoch": 4.436860068259386,
"grad_norm": 0.6555745005607605,
"learning_rate": 0.0005564574250751392,
"loss": 0.852,
"step": 39000
},
{
"epoch": 4.550625711035267,
"grad_norm": 0.6500961184501648,
"learning_rate": 0.0005415031062964693,
"loss": 0.849,
"step": 40000
},
{
"epoch": 4.550625711035267,
"eval_accuracy": 0.758092,
"eval_loss": 0.9504426121711731,
"eval_runtime": 14.8523,
"eval_samples_per_second": 16832.413,
"eval_steps_per_second": 32.924,
"step": 40000
},
{
"epoch": 4.664391353811149,
"grad_norm": 0.6345399618148804,
"learning_rate": 0.0005263038333083039,
"loss": 0.8493,
"step": 41000
},
{
"epoch": 4.778156996587031,
"grad_norm": 0.7285176515579224,
"learning_rate": 0.0005108991688044689,
"loss": 0.8505,
"step": 42000
},
{
"epoch": 4.891922639362912,
"grad_norm": 0.7443712949752808,
"learning_rate": 0.000495314163389589,
"loss": 0.8515,
"step": 43000
},
{
"epoch": 5.005688282138794,
"grad_norm": 0.6659076809883118,
"learning_rate": 0.0004795899698565036,
"loss": 0.8403,
"step": 44000
},
{
"epoch": 5.1194539249146755,
"grad_norm": 0.7172214984893799,
"learning_rate": 0.00046372068282238195,
"loss": 0.7614,
"step": 45000
},
{
"epoch": 5.1194539249146755,
"eval_accuracy": 0.761088,
"eval_loss": 0.9495302438735962,
"eval_runtime": 15.3396,
"eval_samples_per_second": 16297.69,
"eval_steps_per_second": 31.878,
"step": 45000
},
{
"epoch": 5.233219567690558,
"grad_norm": 0.8739346861839294,
"learning_rate": 0.0004477477751198958,
"loss": 0.767,
"step": 46000
},
{
"epoch": 5.346985210466439,
"grad_norm": 0.8742613196372986,
"learning_rate": 0.0004316972214137623,
"loss": 0.7723,
"step": 47000
},
{
"epoch": 5.460750853242321,
"grad_norm": 0.7512331604957581,
"learning_rate": 0.00041559512263430705,
"loss": 0.7738,
"step": 48000
},
{
"epoch": 5.5745164960182025,
"grad_norm": 0.7592815160751343,
"learning_rate": 0.00039948379493191056,
"loss": 0.7735,
"step": 49000
},
{
"epoch": 5.688282138794084,
"grad_norm": 0.6751989722251892,
"learning_rate": 0.00038335718753151784,
"loss": 0.7726,
"step": 50000
},
{
"epoch": 5.688282138794084,
"eval_accuracy": 0.763736,
"eval_loss": 0.9361330270767212,
"eval_runtime": 17.5361,
"eval_samples_per_second": 14256.306,
"eval_steps_per_second": 27.885,
"step": 50000
},
{
"epoch": 5.802047781569966,
"grad_norm": 0.7502247095108032,
"learning_rate": 0.0003672737214802269,
"loss": 0.7735,
"step": 51000
},
{
"epoch": 5.915813424345847,
"grad_norm": 0.7880488038063049,
"learning_rate": 0.000351227356466713,
"loss": 0.7736,
"step": 52000
},
{
"epoch": 6.0295790671217295,
"grad_norm": 0.7424056529998779,
"learning_rate": 0.0003352762228480271,
"loss": 0.7455,
"step": 53000
},
{
"epoch": 6.143344709897611,
"grad_norm": 0.8073525428771973,
"learning_rate": 0.0003194143300116524,
"loss": 0.6797,
"step": 54000
},
{
"epoch": 6.257110352673493,
"grad_norm": 0.7268177270889282,
"learning_rate": 0.00030369914003658996,
"loss": 0.6867,
"step": 55000
},
{
"epoch": 6.257110352673493,
"eval_accuracy": 0.763196,
"eval_loss": 0.9648858308792114,
"eval_runtime": 15.5355,
"eval_samples_per_second": 16092.147,
"eval_steps_per_second": 31.476,
"step": 55000
}
],
"logging_steps": 1000,
"max_steps": 87900,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5000,
"total_flos": 2.354556845700649e+18,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}