Bleking's picture
push llava-v1.6-vicuna-7b
dab50fc
{
"best_metric": 0.6768932938575745,
"best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-250",
"epoch": 10.0,
"eval_steps": 1.0,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03125,
"grad_norm": 1.0817695604199613,
"learning_rate": 0.0,
"loss": 1.3872,
"step": 1
},
{
"epoch": 0.03125,
"eval_loss": 1.4023343324661255,
"eval_runtime": 35.2562,
"eval_samples_per_second": 5.673,
"eval_steps_per_second": 0.369,
"step": 1
},
{
"epoch": 0.0625,
"grad_norm": 0.8573794343563677,
"learning_rate": 8.613531161467863e-06,
"loss": 1.3352,
"step": 2
},
{
"epoch": 0.0625,
"eval_loss": 1.4023343324661255,
"eval_runtime": 27.8829,
"eval_samples_per_second": 7.173,
"eval_steps_per_second": 0.466,
"step": 2
},
{
"epoch": 0.09375,
"grad_norm": 0.8545279010393898,
"learning_rate": 1.3652123889719709e-05,
"loss": 1.3838,
"step": 3
},
{
"epoch": 0.09375,
"eval_loss": 1.3825562000274658,
"eval_runtime": 27.9018,
"eval_samples_per_second": 7.168,
"eval_steps_per_second": 0.466,
"step": 3
},
{
"epoch": 0.125,
"grad_norm": 0.7747695318679186,
"learning_rate": 1.7227062322935725e-05,
"loss": 1.3442,
"step": 4
},
{
"epoch": 0.125,
"eval_loss": 1.3529690504074097,
"eval_runtime": 27.9234,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.466,
"step": 4
},
{
"epoch": 0.15625,
"grad_norm": 0.9223438945487747,
"learning_rate": 2e-05,
"loss": 1.3265,
"step": 5
},
{
"epoch": 0.15625,
"eval_loss": 1.3111159801483154,
"eval_runtime": 27.8183,
"eval_samples_per_second": 7.19,
"eval_steps_per_second": 0.467,
"step": 5
},
{
"epoch": 0.1875,
"grad_norm": 0.8553066709777654,
"learning_rate": 2e-05,
"loss": 1.2969,
"step": 6
},
{
"epoch": 0.1875,
"eval_loss": 1.267953634262085,
"eval_runtime": 28.5087,
"eval_samples_per_second": 7.015,
"eval_steps_per_second": 0.456,
"step": 6
},
{
"epoch": 0.21875,
"grad_norm": 0.7513319744508511,
"learning_rate": 2e-05,
"loss": 1.2643,
"step": 7
},
{
"epoch": 0.21875,
"eval_loss": 1.2324440479278564,
"eval_runtime": 28.7026,
"eval_samples_per_second": 6.968,
"eval_steps_per_second": 0.453,
"step": 7
},
{
"epoch": 0.25,
"grad_norm": 0.5926161530676572,
"learning_rate": 2e-05,
"loss": 1.2343,
"step": 8
},
{
"epoch": 0.25,
"eval_loss": 1.2082672119140625,
"eval_runtime": 28.709,
"eval_samples_per_second": 6.966,
"eval_steps_per_second": 0.453,
"step": 8
},
{
"epoch": 0.28125,
"grad_norm": 0.45585108261607465,
"learning_rate": 2e-05,
"loss": 1.2556,
"step": 9
},
{
"epoch": 0.28125,
"eval_loss": 1.1897780895233154,
"eval_runtime": 28.5026,
"eval_samples_per_second": 7.017,
"eval_steps_per_second": 0.456,
"step": 9
},
{
"epoch": 0.3125,
"grad_norm": 0.45306175711380503,
"learning_rate": 2e-05,
"loss": 1.1941,
"step": 10
},
{
"epoch": 0.3125,
"eval_loss": 1.1719207763671875,
"eval_runtime": 28.4252,
"eval_samples_per_second": 7.036,
"eval_steps_per_second": 0.457,
"step": 10
},
{
"epoch": 0.34375,
"grad_norm": 0.40702053502599356,
"learning_rate": 2e-05,
"loss": 1.2414,
"step": 11
},
{
"epoch": 0.34375,
"eval_loss": 1.1534627676010132,
"eval_runtime": 31.953,
"eval_samples_per_second": 6.259,
"eval_steps_per_second": 0.407,
"step": 11
},
{
"epoch": 0.375,
"grad_norm": 0.45771435281195333,
"learning_rate": 2e-05,
"loss": 1.202,
"step": 12
},
{
"epoch": 0.375,
"eval_loss": 1.1343497037887573,
"eval_runtime": 31.7064,
"eval_samples_per_second": 6.308,
"eval_steps_per_second": 0.41,
"step": 12
},
{
"epoch": 0.40625,
"grad_norm": 0.49237132802399297,
"learning_rate": 2e-05,
"loss": 1.2167,
"step": 13
},
{
"epoch": 0.40625,
"eval_loss": 1.1149284839630127,
"eval_runtime": 31.7514,
"eval_samples_per_second": 6.299,
"eval_steps_per_second": 0.409,
"step": 13
},
{
"epoch": 0.4375,
"grad_norm": 0.4707558788321445,
"learning_rate": 2e-05,
"loss": 1.0463,
"step": 14
},
{
"epoch": 0.4375,
"eval_loss": 1.0956928730010986,
"eval_runtime": 30.7821,
"eval_samples_per_second": 6.497,
"eval_steps_per_second": 0.422,
"step": 14
},
{
"epoch": 0.46875,
"grad_norm": 0.44161060970171445,
"learning_rate": 2e-05,
"loss": 1.1615,
"step": 15
},
{
"epoch": 0.46875,
"eval_loss": 1.0776234865188599,
"eval_runtime": 30.5336,
"eval_samples_per_second": 6.55,
"eval_steps_per_second": 0.426,
"step": 15
},
{
"epoch": 0.5,
"grad_norm": 0.43310242386256154,
"learning_rate": 2e-05,
"loss": 1.0941,
"step": 16
},
{
"epoch": 0.5,
"eval_loss": 1.061128854751587,
"eval_runtime": 33.8247,
"eval_samples_per_second": 5.913,
"eval_steps_per_second": 0.384,
"step": 16
},
{
"epoch": 0.53125,
"grad_norm": 0.3719623439057395,
"learning_rate": 2e-05,
"loss": 1.0992,
"step": 17
},
{
"epoch": 0.53125,
"eval_loss": 1.0465847253799438,
"eval_runtime": 32.7443,
"eval_samples_per_second": 6.108,
"eval_steps_per_second": 0.397,
"step": 17
},
{
"epoch": 0.5625,
"grad_norm": 0.42266460981580545,
"learning_rate": 2e-05,
"loss": 1.0904,
"step": 18
},
{
"epoch": 0.5625,
"eval_loss": 1.0327677726745605,
"eval_runtime": 32.5697,
"eval_samples_per_second": 6.141,
"eval_steps_per_second": 0.399,
"step": 18
},
{
"epoch": 0.59375,
"grad_norm": 0.35416098431161336,
"learning_rate": 2e-05,
"loss": 1.0055,
"step": 19
},
{
"epoch": 0.59375,
"eval_loss": 1.019870638847351,
"eval_runtime": 32.6927,
"eval_samples_per_second": 6.118,
"eval_steps_per_second": 0.398,
"step": 19
},
{
"epoch": 0.625,
"grad_norm": 0.3454390449296124,
"learning_rate": 2e-05,
"loss": 1.1291,
"step": 20
},
{
"epoch": 0.625,
"eval_loss": 1.008323311805725,
"eval_runtime": 32.5051,
"eval_samples_per_second": 6.153,
"eval_steps_per_second": 0.4,
"step": 20
},
{
"epoch": 0.65625,
"grad_norm": 0.291766075949861,
"learning_rate": 2e-05,
"loss": 1.0363,
"step": 21
},
{
"epoch": 0.65625,
"eval_loss": 0.9983346462249756,
"eval_runtime": 36.1543,
"eval_samples_per_second": 5.532,
"eval_steps_per_second": 0.36,
"step": 21
},
{
"epoch": 0.6875,
"grad_norm": 0.3071914269593122,
"learning_rate": 2e-05,
"loss": 1.0869,
"step": 22
},
{
"epoch": 0.6875,
"eval_loss": 0.989651083946228,
"eval_runtime": 35.9583,
"eval_samples_per_second": 5.562,
"eval_steps_per_second": 0.362,
"step": 22
},
{
"epoch": 0.71875,
"grad_norm": 0.2642686659789585,
"learning_rate": 2e-05,
"loss": 1.0706,
"step": 23
},
{
"epoch": 0.71875,
"eval_loss": 0.981977641582489,
"eval_runtime": 35.7624,
"eval_samples_per_second": 5.592,
"eval_steps_per_second": 0.364,
"step": 23
},
{
"epoch": 0.75,
"grad_norm": 0.23789134722319716,
"learning_rate": 2e-05,
"loss": 1.0669,
"step": 24
},
{
"epoch": 0.75,
"eval_loss": 0.9751532077789307,
"eval_runtime": 35.6905,
"eval_samples_per_second": 5.604,
"eval_steps_per_second": 0.364,
"step": 24
},
{
"epoch": 0.78125,
"grad_norm": 0.26302325685095884,
"learning_rate": 2e-05,
"loss": 1.0141,
"step": 25
},
{
"epoch": 0.78125,
"eval_loss": 0.9684178233146667,
"eval_runtime": 35.4693,
"eval_samples_per_second": 5.639,
"eval_steps_per_second": 0.367,
"step": 25
},
{
"epoch": 0.8125,
"grad_norm": 0.2406662725995088,
"learning_rate": 2e-05,
"loss": 1.0381,
"step": 26
},
{
"epoch": 0.8125,
"eval_loss": 0.9618947505950928,
"eval_runtime": 37.5325,
"eval_samples_per_second": 5.329,
"eval_steps_per_second": 0.346,
"step": 26
},
{
"epoch": 0.84375,
"grad_norm": 0.27899113172875245,
"learning_rate": 2e-05,
"loss": 0.9693,
"step": 27
},
{
"epoch": 0.84375,
"eval_loss": 0.9552007913589478,
"eval_runtime": 37.4006,
"eval_samples_per_second": 5.348,
"eval_steps_per_second": 0.348,
"step": 27
},
{
"epoch": 0.875,
"grad_norm": 0.29303174930955905,
"learning_rate": 2e-05,
"loss": 0.9841,
"step": 28
},
{
"epoch": 0.875,
"eval_loss": 0.9481881856918335,
"eval_runtime": 37.7821,
"eval_samples_per_second": 5.294,
"eval_steps_per_second": 0.344,
"step": 28
},
{
"epoch": 0.90625,
"grad_norm": 0.22138226087715307,
"learning_rate": 2e-05,
"loss": 0.9959,
"step": 29
},
{
"epoch": 0.90625,
"eval_loss": 0.9415397644042969,
"eval_runtime": 37.9058,
"eval_samples_per_second": 5.276,
"eval_steps_per_second": 0.343,
"step": 29
},
{
"epoch": 0.9375,
"grad_norm": 0.23456101188675513,
"learning_rate": 2e-05,
"loss": 1.0351,
"step": 30
},
{
"epoch": 0.9375,
"eval_loss": 0.9354143738746643,
"eval_runtime": 37.9727,
"eval_samples_per_second": 5.267,
"eval_steps_per_second": 0.342,
"step": 30
},
{
"epoch": 0.96875,
"grad_norm": 0.2594838155429295,
"learning_rate": 2e-05,
"loss": 0.8741,
"step": 31
},
{
"epoch": 0.96875,
"eval_loss": 0.9291737079620361,
"eval_runtime": 37.081,
"eval_samples_per_second": 5.394,
"eval_steps_per_second": 0.351,
"step": 31
},
{
"epoch": 1.0,
"grad_norm": 0.2404582058613114,
"learning_rate": 2e-05,
"loss": 0.9814,
"step": 32
},
{
"epoch": 1.0,
"eval_loss": 0.9231625199317932,
"eval_runtime": 37.0946,
"eval_samples_per_second": 5.392,
"eval_steps_per_second": 0.35,
"step": 32
},
{
"epoch": 1.03125,
"grad_norm": 0.26862391186560797,
"learning_rate": 2e-05,
"loss": 1.0241,
"step": 33
},
{
"epoch": 1.03125,
"eval_loss": 0.917277991771698,
"eval_runtime": 37.1872,
"eval_samples_per_second": 5.378,
"eval_steps_per_second": 0.35,
"step": 33
},
{
"epoch": 1.0625,
"grad_norm": 0.24997341491489666,
"learning_rate": 2e-05,
"loss": 1.0296,
"step": 34
},
{
"epoch": 1.0625,
"eval_loss": 0.9116549491882324,
"eval_runtime": 30.7053,
"eval_samples_per_second": 6.514,
"eval_steps_per_second": 0.423,
"step": 34
},
{
"epoch": 1.09375,
"grad_norm": 0.22755062908849677,
"learning_rate": 2e-05,
"loss": 1.047,
"step": 35
},
{
"epoch": 1.09375,
"eval_loss": 0.9061525464057922,
"eval_runtime": 30.5238,
"eval_samples_per_second": 6.552,
"eval_steps_per_second": 0.426,
"step": 35
},
{
"epoch": 1.125,
"grad_norm": 0.2478793998097894,
"learning_rate": 2e-05,
"loss": 1.0071,
"step": 36
},
{
"epoch": 1.125,
"eval_loss": 0.9007319808006287,
"eval_runtime": 30.4573,
"eval_samples_per_second": 6.567,
"eval_steps_per_second": 0.427,
"step": 36
},
{
"epoch": 1.15625,
"grad_norm": 0.2319702521014333,
"learning_rate": 2e-05,
"loss": 0.9517,
"step": 37
},
{
"epoch": 1.15625,
"eval_loss": 0.8955077528953552,
"eval_runtime": 30.6396,
"eval_samples_per_second": 6.528,
"eval_steps_per_second": 0.424,
"step": 37
},
{
"epoch": 1.1875,
"grad_norm": 0.26929965642782505,
"learning_rate": 2e-05,
"loss": 0.9638,
"step": 38
},
{
"epoch": 1.1875,
"eval_loss": 0.8906582593917847,
"eval_runtime": 30.5706,
"eval_samples_per_second": 6.542,
"eval_steps_per_second": 0.425,
"step": 38
},
{
"epoch": 1.21875,
"grad_norm": 0.25494286133089294,
"learning_rate": 2e-05,
"loss": 0.9922,
"step": 39
},
{
"epoch": 1.21875,
"eval_loss": 0.8858879804611206,
"eval_runtime": 30.2267,
"eval_samples_per_second": 6.617,
"eval_steps_per_second": 0.43,
"step": 39
},
{
"epoch": 1.25,
"grad_norm": 0.2468866713698415,
"learning_rate": 2e-05,
"loss": 0.9873,
"step": 40
},
{
"epoch": 1.25,
"eval_loss": 0.8811590671539307,
"eval_runtime": 30.1065,
"eval_samples_per_second": 6.643,
"eval_steps_per_second": 0.432,
"step": 40
},
{
"epoch": 1.28125,
"grad_norm": 0.2460619663724958,
"learning_rate": 2e-05,
"loss": 0.9608,
"step": 41
},
{
"epoch": 1.28125,
"eval_loss": 0.876426637172699,
"eval_runtime": 30.2618,
"eval_samples_per_second": 6.609,
"eval_steps_per_second": 0.43,
"step": 41
},
{
"epoch": 1.3125,
"grad_norm": 0.244111044045335,
"learning_rate": 2e-05,
"loss": 0.9496,
"step": 42
},
{
"epoch": 1.3125,
"eval_loss": 0.8720347881317139,
"eval_runtime": 30.2637,
"eval_samples_per_second": 6.609,
"eval_steps_per_second": 0.43,
"step": 42
},
{
"epoch": 1.34375,
"grad_norm": 0.24263485999072093,
"learning_rate": 2e-05,
"loss": 0.9076,
"step": 43
},
{
"epoch": 1.34375,
"eval_loss": 0.8677232265472412,
"eval_runtime": 30.0588,
"eval_samples_per_second": 6.654,
"eval_steps_per_second": 0.432,
"step": 43
},
{
"epoch": 1.375,
"grad_norm": 0.2549786588443146,
"learning_rate": 2e-05,
"loss": 0.9291,
"step": 44
},
{
"epoch": 1.375,
"eval_loss": 0.864047110080719,
"eval_runtime": 30.3833,
"eval_samples_per_second": 6.583,
"eval_steps_per_second": 0.428,
"step": 44
},
{
"epoch": 1.40625,
"grad_norm": 0.27020952324959413,
"learning_rate": 2e-05,
"loss": 0.9111,
"step": 45
},
{
"epoch": 1.40625,
"eval_loss": 0.8608524799346924,
"eval_runtime": 30.284,
"eval_samples_per_second": 6.604,
"eval_steps_per_second": 0.429,
"step": 45
},
{
"epoch": 1.4375,
"grad_norm": 0.24108750741309573,
"learning_rate": 2e-05,
"loss": 0.8363,
"step": 46
},
{
"epoch": 1.4375,
"eval_loss": 0.8525222539901733,
"eval_runtime": 51.3231,
"eval_samples_per_second": 3.897,
"eval_steps_per_second": 0.487,
"step": 46
},
{
"epoch": 1.46875,
"grad_norm": 0.23963570627035977,
"learning_rate": 2e-05,
"loss": 0.9776,
"step": 47
},
{
"epoch": 1.46875,
"eval_loss": 0.8498736619949341,
"eval_runtime": 43.9039,
"eval_samples_per_second": 4.555,
"eval_steps_per_second": 0.569,
"step": 47
},
{
"epoch": 1.5,
"grad_norm": 0.2738559790360609,
"learning_rate": 2e-05,
"loss": 0.9075,
"step": 48
},
{
"epoch": 1.5,
"eval_loss": 0.846975564956665,
"eval_runtime": 43.6943,
"eval_samples_per_second": 4.577,
"eval_steps_per_second": 0.572,
"step": 48
},
{
"epoch": 1.53125,
"grad_norm": 0.2516715524185528,
"learning_rate": 2e-05,
"loss": 0.9256,
"step": 49
},
{
"epoch": 1.53125,
"eval_loss": 0.8441421985626221,
"eval_runtime": 44.0977,
"eval_samples_per_second": 4.535,
"eval_steps_per_second": 0.567,
"step": 49
},
{
"epoch": 1.5625,
"grad_norm": 0.25797542568004944,
"learning_rate": 2e-05,
"loss": 0.9168,
"step": 50
},
{
"epoch": 1.5625,
"eval_loss": 0.8408769369125366,
"eval_runtime": 45.4442,
"eval_samples_per_second": 4.401,
"eval_steps_per_second": 0.55,
"step": 50
},
{
"epoch": 1.59375,
"grad_norm": 0.24530872900913284,
"learning_rate": 2e-05,
"loss": 0.8547,
"step": 51
},
{
"epoch": 1.59375,
"eval_loss": 0.8373726010322571,
"eval_runtime": 44.6363,
"eval_samples_per_second": 4.481,
"eval_steps_per_second": 0.56,
"step": 51
},
{
"epoch": 1.625,
"grad_norm": 0.2549609506617865,
"learning_rate": 2e-05,
"loss": 0.979,
"step": 52
},
{
"epoch": 1.625,
"eval_loss": 0.8340890407562256,
"eval_runtime": 45.991,
"eval_samples_per_second": 4.349,
"eval_steps_per_second": 0.544,
"step": 52
},
{
"epoch": 1.65625,
"grad_norm": 0.24114496664848603,
"learning_rate": 2e-05,
"loss": 0.9196,
"step": 53
},
{
"epoch": 1.65625,
"eval_loss": 0.8311529755592346,
"eval_runtime": 46.0654,
"eval_samples_per_second": 4.342,
"eval_steps_per_second": 0.543,
"step": 53
},
{
"epoch": 1.6875,
"grad_norm": 0.29287872202759435,
"learning_rate": 2e-05,
"loss": 0.967,
"step": 54
},
{
"epoch": 1.6875,
"eval_loss": 0.8281388282775879,
"eval_runtime": 46.0396,
"eval_samples_per_second": 4.344,
"eval_steps_per_second": 0.543,
"step": 54
},
{
"epoch": 1.71875,
"grad_norm": 0.2620663114325604,
"learning_rate": 2e-05,
"loss": 0.9576,
"step": 55
},
{
"epoch": 1.71875,
"eval_loss": 0.8252360820770264,
"eval_runtime": 44.8935,
"eval_samples_per_second": 4.455,
"eval_steps_per_second": 0.557,
"step": 55
},
{
"epoch": 1.75,
"grad_norm": 0.24813796796229484,
"learning_rate": 2e-05,
"loss": 0.9652,
"step": 56
},
{
"epoch": 1.75,
"eval_loss": 0.8228487968444824,
"eval_runtime": 45.9424,
"eval_samples_per_second": 4.353,
"eval_steps_per_second": 0.544,
"step": 56
},
{
"epoch": 1.78125,
"grad_norm": 0.25644243214043555,
"learning_rate": 2e-05,
"loss": 0.8938,
"step": 57
},
{
"epoch": 1.78125,
"eval_loss": 0.8202834129333496,
"eval_runtime": 45.4583,
"eval_samples_per_second": 4.4,
"eval_steps_per_second": 0.55,
"step": 57
},
{
"epoch": 1.8125,
"grad_norm": 0.24429328723074778,
"learning_rate": 2e-05,
"loss": 0.9373,
"step": 58
},
{
"epoch": 1.8125,
"eval_loss": 0.8179032802581787,
"eval_runtime": 45.7499,
"eval_samples_per_second": 4.372,
"eval_steps_per_second": 0.546,
"step": 58
},
{
"epoch": 1.84375,
"grad_norm": 0.26226013327841075,
"learning_rate": 2e-05,
"loss": 0.8474,
"step": 59
},
{
"epoch": 1.84375,
"eval_loss": 0.8154602646827698,
"eval_runtime": 46.1391,
"eval_samples_per_second": 4.335,
"eval_steps_per_second": 0.542,
"step": 59
},
{
"epoch": 1.875,
"grad_norm": 0.2581666046262149,
"learning_rate": 2e-05,
"loss": 0.8517,
"step": 60
},
{
"epoch": 1.875,
"eval_loss": 0.812771737575531,
"eval_runtime": 45.5621,
"eval_samples_per_second": 4.39,
"eval_steps_per_second": 0.549,
"step": 60
},
{
"epoch": 1.90625,
"grad_norm": 0.2593197258112398,
"learning_rate": 2e-05,
"loss": 0.9011,
"step": 61
},
{
"epoch": 1.90625,
"eval_loss": 0.810187816619873,
"eval_runtime": 46.0597,
"eval_samples_per_second": 4.342,
"eval_steps_per_second": 0.543,
"step": 61
},
{
"epoch": 1.9375,
"grad_norm": 0.2899895571193183,
"learning_rate": 2e-05,
"loss": 0.9277,
"step": 62
},
{
"epoch": 1.9375,
"eval_loss": 0.8083757758140564,
"eval_runtime": 45.8079,
"eval_samples_per_second": 4.366,
"eval_steps_per_second": 0.546,
"step": 62
},
{
"epoch": 1.96875,
"grad_norm": 0.2759215195414453,
"learning_rate": 2e-05,
"loss": 0.772,
"step": 63
},
{
"epoch": 1.96875,
"eval_loss": 0.8061204552650452,
"eval_runtime": 47.3286,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 0.528,
"step": 63
},
{
"epoch": 2.0,
"grad_norm": 0.27248680511516205,
"learning_rate": 2e-05,
"loss": 0.874,
"step": 64
},
{
"epoch": 2.0,
"eval_loss": 0.8037504553794861,
"eval_runtime": 46.1177,
"eval_samples_per_second": 4.337,
"eval_steps_per_second": 0.542,
"step": 64
},
{
"epoch": 2.03125,
"grad_norm": 0.3116755816558186,
"learning_rate": 2e-05,
"loss": 0.8647,
"step": 65
},
{
"epoch": 2.03125,
"eval_loss": 0.8007115125656128,
"eval_runtime": 46.1583,
"eval_samples_per_second": 4.333,
"eval_steps_per_second": 0.542,
"step": 65
},
{
"epoch": 2.0625,
"grad_norm": 0.273032515206887,
"learning_rate": 2e-05,
"loss": 0.8862,
"step": 66
},
{
"epoch": 2.0625,
"eval_loss": 0.7983976006507874,
"eval_runtime": 47.3469,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 0.528,
"step": 66
},
{
"epoch": 2.09375,
"grad_norm": 0.2925240383907651,
"learning_rate": 2e-05,
"loss": 0.8617,
"step": 67
},
{
"epoch": 2.09375,
"eval_loss": 0.7959001064300537,
"eval_runtime": 47.9208,
"eval_samples_per_second": 4.174,
"eval_steps_per_second": 0.522,
"step": 67
},
{
"epoch": 2.125,
"grad_norm": 0.25775933439981163,
"learning_rate": 2e-05,
"loss": 0.9269,
"step": 68
},
{
"epoch": 2.125,
"eval_loss": 0.7938115000724792,
"eval_runtime": 47.8909,
"eval_samples_per_second": 4.176,
"eval_steps_per_second": 0.522,
"step": 68
},
{
"epoch": 2.15625,
"grad_norm": 0.2669684013704678,
"learning_rate": 2e-05,
"loss": 0.8607,
"step": 69
},
{
"epoch": 2.15625,
"eval_loss": 0.7918573617935181,
"eval_runtime": 47.39,
"eval_samples_per_second": 4.22,
"eval_steps_per_second": 0.528,
"step": 69
},
{
"epoch": 2.1875,
"grad_norm": 0.312578346444957,
"learning_rate": 2e-05,
"loss": 0.8086,
"step": 70
},
{
"epoch": 2.1875,
"eval_loss": 0.7894810438156128,
"eval_runtime": 46.2927,
"eval_samples_per_second": 4.32,
"eval_steps_per_second": 0.54,
"step": 70
},
{
"epoch": 2.21875,
"grad_norm": 0.25622754870894693,
"learning_rate": 2e-05,
"loss": 0.8945,
"step": 71
},
{
"epoch": 2.21875,
"eval_loss": 0.7875316739082336,
"eval_runtime": 45.7617,
"eval_samples_per_second": 4.37,
"eval_steps_per_second": 0.546,
"step": 71
},
{
"epoch": 2.25,
"grad_norm": 0.27025767580736354,
"learning_rate": 2e-05,
"loss": 0.815,
"step": 72
},
{
"epoch": 2.25,
"eval_loss": 0.7858334183692932,
"eval_runtime": 46.2427,
"eval_samples_per_second": 4.325,
"eval_steps_per_second": 0.541,
"step": 72
},
{
"epoch": 2.28125,
"grad_norm": 0.3110479115695806,
"learning_rate": 2e-05,
"loss": 0.8621,
"step": 73
},
{
"epoch": 2.28125,
"eval_loss": 0.7841551303863525,
"eval_runtime": 46.5372,
"eval_samples_per_second": 4.298,
"eval_steps_per_second": 0.537,
"step": 73
},
{
"epoch": 2.3125,
"grad_norm": 0.26061305588172545,
"learning_rate": 2e-05,
"loss": 0.8622,
"step": 74
},
{
"epoch": 2.3125,
"eval_loss": 0.7826495170593262,
"eval_runtime": 46.1361,
"eval_samples_per_second": 4.335,
"eval_steps_per_second": 0.542,
"step": 74
},
{
"epoch": 2.34375,
"grad_norm": 0.27448719719872205,
"learning_rate": 2e-05,
"loss": 0.9118,
"step": 75
},
{
"epoch": 2.34375,
"eval_loss": 0.7811364531517029,
"eval_runtime": 47.6194,
"eval_samples_per_second": 4.2,
"eval_steps_per_second": 0.525,
"step": 75
},
{
"epoch": 2.375,
"grad_norm": 0.27078145092639194,
"learning_rate": 2e-05,
"loss": 0.8256,
"step": 76
},
{
"epoch": 2.375,
"eval_loss": 0.779961109161377,
"eval_runtime": 46.0097,
"eval_samples_per_second": 4.347,
"eval_steps_per_second": 0.543,
"step": 76
},
{
"epoch": 2.40625,
"grad_norm": 0.2634646272324293,
"learning_rate": 2e-05,
"loss": 0.8774,
"step": 77
},
{
"epoch": 2.40625,
"eval_loss": 0.7788712978363037,
"eval_runtime": 46.2712,
"eval_samples_per_second": 4.322,
"eval_steps_per_second": 0.54,
"step": 77
},
{
"epoch": 2.4375,
"grad_norm": 0.3101668401682978,
"learning_rate": 2e-05,
"loss": 0.8769,
"step": 78
},
{
"epoch": 2.4375,
"eval_loss": 0.7776928544044495,
"eval_runtime": 46.3791,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 0.539,
"step": 78
},
{
"epoch": 2.46875,
"grad_norm": 0.28798302574187284,
"learning_rate": 2e-05,
"loss": 0.8765,
"step": 79
},
{
"epoch": 2.46875,
"eval_loss": 0.7773044109344482,
"eval_runtime": 43.9352,
"eval_samples_per_second": 4.552,
"eval_steps_per_second": 0.569,
"step": 79
},
{
"epoch": 2.5,
"grad_norm": 0.3349887736240022,
"learning_rate": 2e-05,
"loss": 0.9202,
"step": 80
},
{
"epoch": 2.5,
"eval_loss": 0.7766420245170593,
"eval_runtime": 44.0118,
"eval_samples_per_second": 4.544,
"eval_steps_per_second": 0.568,
"step": 80
},
{
"epoch": 2.53125,
"grad_norm": 0.3272989979927921,
"learning_rate": 2e-05,
"loss": 0.8496,
"step": 81
},
{
"epoch": 2.53125,
"eval_loss": 0.7754170894622803,
"eval_runtime": 44.5079,
"eval_samples_per_second": 4.494,
"eval_steps_per_second": 0.562,
"step": 81
},
{
"epoch": 2.5625,
"grad_norm": 0.2937867633662159,
"learning_rate": 2e-05,
"loss": 0.9088,
"step": 82
},
{
"epoch": 2.5625,
"eval_loss": 0.7740327715873718,
"eval_runtime": 43.7759,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 0.571,
"step": 82
},
{
"epoch": 2.59375,
"grad_norm": 0.3001827875228488,
"learning_rate": 2e-05,
"loss": 0.8514,
"step": 83
},
{
"epoch": 2.59375,
"eval_loss": 0.7725099921226501,
"eval_runtime": 43.9246,
"eval_samples_per_second": 4.553,
"eval_steps_per_second": 0.569,
"step": 83
},
{
"epoch": 2.625,
"grad_norm": 0.3153202233063334,
"learning_rate": 2e-05,
"loss": 0.8232,
"step": 84
},
{
"epoch": 2.625,
"eval_loss": 0.7707765698432922,
"eval_runtime": 45.7981,
"eval_samples_per_second": 4.367,
"eval_steps_per_second": 0.546,
"step": 84
},
{
"epoch": 2.65625,
"grad_norm": 0.3084122812305825,
"learning_rate": 2e-05,
"loss": 0.7899,
"step": 85
},
{
"epoch": 2.65625,
"eval_loss": 0.7689283490180969,
"eval_runtime": 43.8712,
"eval_samples_per_second": 4.559,
"eval_steps_per_second": 0.57,
"step": 85
},
{
"epoch": 2.6875,
"grad_norm": 0.34994590801092706,
"learning_rate": 2e-05,
"loss": 0.8186,
"step": 86
},
{
"epoch": 2.6875,
"eval_loss": 0.7668275237083435,
"eval_runtime": 44.0477,
"eval_samples_per_second": 4.541,
"eval_steps_per_second": 0.568,
"step": 86
},
{
"epoch": 2.71875,
"grad_norm": 0.33626535961990944,
"learning_rate": 2e-05,
"loss": 0.8439,
"step": 87
},
{
"epoch": 2.71875,
"eval_loss": 0.7653672695159912,
"eval_runtime": 43.9923,
"eval_samples_per_second": 4.546,
"eval_steps_per_second": 0.568,
"step": 87
},
{
"epoch": 2.75,
"grad_norm": 0.33991458856080364,
"learning_rate": 2e-05,
"loss": 0.9309,
"step": 88
},
{
"epoch": 2.75,
"eval_loss": 0.7641142010688782,
"eval_runtime": 44.018,
"eval_samples_per_second": 4.544,
"eval_steps_per_second": 0.568,
"step": 88
},
{
"epoch": 2.78125,
"grad_norm": 0.3212547051979476,
"learning_rate": 2e-05,
"loss": 0.8262,
"step": 89
},
{
"epoch": 2.78125,
"eval_loss": 0.763224720954895,
"eval_runtime": 43.7722,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 0.571,
"step": 89
},
{
"epoch": 2.8125,
"grad_norm": 0.335120027091876,
"learning_rate": 2e-05,
"loss": 0.8795,
"step": 90
},
{
"epoch": 2.8125,
"eval_loss": 0.7624655365943909,
"eval_runtime": 44.1972,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 0.566,
"step": 90
},
{
"epoch": 2.84375,
"grad_norm": 0.33822766071160937,
"learning_rate": 2e-05,
"loss": 0.7798,
"step": 91
},
{
"epoch": 2.84375,
"eval_loss": 0.761708676815033,
"eval_runtime": 43.8244,
"eval_samples_per_second": 4.564,
"eval_steps_per_second": 0.57,
"step": 91
},
{
"epoch": 2.875,
"grad_norm": 0.33505853726890483,
"learning_rate": 2e-05,
"loss": 0.8715,
"step": 92
},
{
"epoch": 2.875,
"eval_loss": 0.7611495852470398,
"eval_runtime": 43.7833,
"eval_samples_per_second": 4.568,
"eval_steps_per_second": 0.571,
"step": 92
},
{
"epoch": 2.90625,
"grad_norm": 0.3126942865091584,
"learning_rate": 2e-05,
"loss": 0.8102,
"step": 93
},
{
"epoch": 2.90625,
"eval_loss": 0.7608107924461365,
"eval_runtime": 44.0119,
"eval_samples_per_second": 4.544,
"eval_steps_per_second": 0.568,
"step": 93
},
{
"epoch": 2.9375,
"grad_norm": 0.3594152593867412,
"learning_rate": 2e-05,
"loss": 0.8871,
"step": 94
},
{
"epoch": 2.9375,
"eval_loss": 0.7598913311958313,
"eval_runtime": 43.8956,
"eval_samples_per_second": 4.556,
"eval_steps_per_second": 0.57,
"step": 94
},
{
"epoch": 2.96875,
"grad_norm": 0.3161380007473764,
"learning_rate": 2e-05,
"loss": 0.8278,
"step": 95
},
{
"epoch": 2.96875,
"eval_loss": 0.7596660852432251,
"eval_runtime": 44.0687,
"eval_samples_per_second": 4.538,
"eval_steps_per_second": 0.567,
"step": 95
},
{
"epoch": 3.0,
"grad_norm": 0.3922097294803287,
"learning_rate": 2e-05,
"loss": 0.7988,
"step": 96
},
{
"epoch": 3.0,
"eval_loss": 0.7576884627342224,
"eval_runtime": 44.1881,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 0.566,
"step": 96
},
{
"epoch": 3.03125,
"grad_norm": 0.372234038126675,
"learning_rate": 2e-05,
"loss": 0.7558,
"step": 97
},
{
"epoch": 3.03125,
"eval_loss": 0.7546435594558716,
"eval_runtime": 43.8881,
"eval_samples_per_second": 4.557,
"eval_steps_per_second": 0.57,
"step": 97
},
{
"epoch": 3.0625,
"grad_norm": 0.3249396043376576,
"learning_rate": 2e-05,
"loss": 0.8422,
"step": 98
},
{
"epoch": 3.0625,
"eval_loss": 0.7515354752540588,
"eval_runtime": 44.5887,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 0.561,
"step": 98
},
{
"epoch": 3.09375,
"grad_norm": 0.3194387311297811,
"learning_rate": 2e-05,
"loss": 0.8059,
"step": 99
},
{
"epoch": 3.09375,
"eval_loss": 0.7486842274665833,
"eval_runtime": 44.0967,
"eval_samples_per_second": 4.535,
"eval_steps_per_second": 0.567,
"step": 99
},
{
"epoch": 3.125,
"grad_norm": 0.3434194037136213,
"learning_rate": 2e-05,
"loss": 0.8341,
"step": 100
},
{
"epoch": 3.125,
"eval_loss": 0.7464652061462402,
"eval_runtime": 44.0666,
"eval_samples_per_second": 4.539,
"eval_steps_per_second": 0.567,
"step": 100
},
{
"epoch": 3.15625,
"grad_norm": 0.33666008484696835,
"learning_rate": 2e-05,
"loss": 0.7731,
"step": 101
},
{
"epoch": 3.15625,
"eval_loss": 0.7450191378593445,
"eval_runtime": 44.0337,
"eval_samples_per_second": 4.542,
"eval_steps_per_second": 0.568,
"step": 101
},
{
"epoch": 3.1875,
"grad_norm": 0.3596265575837954,
"learning_rate": 2e-05,
"loss": 0.8354,
"step": 102
},
{
"epoch": 3.1875,
"eval_loss": 0.7442840337753296,
"eval_runtime": 44.0804,
"eval_samples_per_second": 4.537,
"eval_steps_per_second": 0.567,
"step": 102
},
{
"epoch": 3.21875,
"grad_norm": 0.37228869739935877,
"learning_rate": 2e-05,
"loss": 0.8476,
"step": 103
},
{
"epoch": 3.21875,
"eval_loss": 0.74405837059021,
"eval_runtime": 43.9201,
"eval_samples_per_second": 4.554,
"eval_steps_per_second": 0.569,
"step": 103
},
{
"epoch": 3.25,
"grad_norm": 0.372126737706513,
"learning_rate": 2e-05,
"loss": 0.7568,
"step": 104
},
{
"epoch": 3.25,
"eval_loss": 0.7435027360916138,
"eval_runtime": 44.0105,
"eval_samples_per_second": 4.544,
"eval_steps_per_second": 0.568,
"step": 104
},
{
"epoch": 3.28125,
"grad_norm": 0.3362686942090606,
"learning_rate": 2e-05,
"loss": 0.8035,
"step": 105
},
{
"epoch": 3.28125,
"eval_loss": 0.7431904673576355,
"eval_runtime": 43.9113,
"eval_samples_per_second": 4.555,
"eval_steps_per_second": 0.569,
"step": 105
},
{
"epoch": 3.3125,
"grad_norm": 0.36392229188159225,
"learning_rate": 2e-05,
"loss": 0.8353,
"step": 106
},
{
"epoch": 3.3125,
"eval_loss": 0.7430496215820312,
"eval_runtime": 44.6371,
"eval_samples_per_second": 4.481,
"eval_steps_per_second": 0.56,
"step": 106
},
{
"epoch": 3.34375,
"grad_norm": 0.4471327905090859,
"learning_rate": 2e-05,
"loss": 0.7363,
"step": 107
},
{
"epoch": 3.34375,
"eval_loss": 0.7411425709724426,
"eval_runtime": 44.7094,
"eval_samples_per_second": 4.473,
"eval_steps_per_second": 0.559,
"step": 107
},
{
"epoch": 3.375,
"grad_norm": 0.3716356236311949,
"learning_rate": 2e-05,
"loss": 0.7774,
"step": 108
},
{
"epoch": 3.375,
"eval_loss": 0.7391970753669739,
"eval_runtime": 44.6877,
"eval_samples_per_second": 4.476,
"eval_steps_per_second": 0.559,
"step": 108
},
{
"epoch": 3.40625,
"grad_norm": 0.39848151618324823,
"learning_rate": 2e-05,
"loss": 0.766,
"step": 109
},
{
"epoch": 3.40625,
"eval_loss": 0.7370663285255432,
"eval_runtime": 44.7716,
"eval_samples_per_second": 4.467,
"eval_steps_per_second": 0.558,
"step": 109
},
{
"epoch": 3.4375,
"grad_norm": 0.3979613694284285,
"learning_rate": 2e-05,
"loss": 0.7647,
"step": 110
},
{
"epoch": 3.4375,
"eval_loss": 0.7347142100334167,
"eval_runtime": 46.1551,
"eval_samples_per_second": 4.333,
"eval_steps_per_second": 0.542,
"step": 110
},
{
"epoch": 3.46875,
"grad_norm": 0.4005021474949748,
"learning_rate": 2e-05,
"loss": 0.8363,
"step": 111
},
{
"epoch": 3.46875,
"eval_loss": 0.7330761551856995,
"eval_runtime": 45.4921,
"eval_samples_per_second": 4.396,
"eval_steps_per_second": 0.55,
"step": 111
},
{
"epoch": 3.5,
"grad_norm": 0.3814831442952738,
"learning_rate": 2e-05,
"loss": 0.8172,
"step": 112
},
{
"epoch": 3.5,
"eval_loss": 0.7321842908859253,
"eval_runtime": 46.3117,
"eval_samples_per_second": 4.319,
"eval_steps_per_second": 0.54,
"step": 112
},
{
"epoch": 3.53125,
"grad_norm": 0.37084330088188894,
"learning_rate": 2e-05,
"loss": 0.8984,
"step": 113
},
{
"epoch": 3.53125,
"eval_loss": 0.7323736548423767,
"eval_runtime": 45.7394,
"eval_samples_per_second": 4.373,
"eval_steps_per_second": 0.547,
"step": 113
},
{
"epoch": 3.5625,
"grad_norm": 0.4074607742772961,
"learning_rate": 2e-05,
"loss": 0.7623,
"step": 114
},
{
"epoch": 3.5625,
"eval_loss": 0.7331156134605408,
"eval_runtime": 47.2117,
"eval_samples_per_second": 4.236,
"eval_steps_per_second": 0.53,
"step": 114
},
{
"epoch": 3.59375,
"grad_norm": 0.3478981526620727,
"learning_rate": 2e-05,
"loss": 0.8294,
"step": 115
},
{
"epoch": 3.59375,
"eval_loss": 0.7339057326316833,
"eval_runtime": 45.3783,
"eval_samples_per_second": 4.407,
"eval_steps_per_second": 0.551,
"step": 115
},
{
"epoch": 3.625,
"grad_norm": 0.4015868947675386,
"learning_rate": 2e-05,
"loss": 0.8,
"step": 116
},
{
"epoch": 3.625,
"eval_loss": 0.7341201305389404,
"eval_runtime": 45.9888,
"eval_samples_per_second": 4.349,
"eval_steps_per_second": 0.544,
"step": 116
},
{
"epoch": 3.65625,
"grad_norm": 0.3908261734781783,
"learning_rate": 2e-05,
"loss": 0.7903,
"step": 117
},
{
"epoch": 3.65625,
"eval_loss": 0.7336520552635193,
"eval_runtime": 45.9012,
"eval_samples_per_second": 4.357,
"eval_steps_per_second": 0.545,
"step": 117
},
{
"epoch": 3.6875,
"grad_norm": 0.39497646856232355,
"learning_rate": 2e-05,
"loss": 0.8072,
"step": 118
},
{
"epoch": 3.6875,
"eval_loss": 0.7335306406021118,
"eval_runtime": 46.2389,
"eval_samples_per_second": 4.325,
"eval_steps_per_second": 0.541,
"step": 118
},
{
"epoch": 3.71875,
"grad_norm": 0.3773137872461335,
"learning_rate": 2e-05,
"loss": 0.8647,
"step": 119
},
{
"epoch": 3.71875,
"eval_loss": 0.7331534028053284,
"eval_runtime": 46.662,
"eval_samples_per_second": 4.286,
"eval_steps_per_second": 0.536,
"step": 119
},
{
"epoch": 3.75,
"grad_norm": 0.353841599712999,
"learning_rate": 2e-05,
"loss": 0.8076,
"step": 120
},
{
"epoch": 3.75,
"eval_loss": 0.732619047164917,
"eval_runtime": 47.5847,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.525,
"step": 120
},
{
"epoch": 3.78125,
"grad_norm": 0.38703604888096965,
"learning_rate": 2e-05,
"loss": 0.783,
"step": 121
},
{
"epoch": 3.78125,
"eval_loss": 0.7308679223060608,
"eval_runtime": 47.3672,
"eval_samples_per_second": 4.222,
"eval_steps_per_second": 0.528,
"step": 121
},
{
"epoch": 3.8125,
"grad_norm": 0.406784109988961,
"learning_rate": 2e-05,
"loss": 0.8592,
"step": 122
},
{
"epoch": 3.8125,
"eval_loss": 0.7294270396232605,
"eval_runtime": 46.3156,
"eval_samples_per_second": 4.318,
"eval_steps_per_second": 0.54,
"step": 122
},
{
"epoch": 3.84375,
"grad_norm": 0.3867362432665531,
"learning_rate": 2e-05,
"loss": 0.7773,
"step": 123
},
{
"epoch": 3.84375,
"eval_loss": 0.7278974056243896,
"eval_runtime": 46.0714,
"eval_samples_per_second": 4.341,
"eval_steps_per_second": 0.543,
"step": 123
},
{
"epoch": 3.875,
"grad_norm": 0.37454905814944983,
"learning_rate": 2e-05,
"loss": 0.8054,
"step": 124
},
{
"epoch": 3.875,
"eval_loss": 0.7264491319656372,
"eval_runtime": 46.0579,
"eval_samples_per_second": 4.342,
"eval_steps_per_second": 0.543,
"step": 124
},
{
"epoch": 3.90625,
"grad_norm": 0.444384159363942,
"learning_rate": 2e-05,
"loss": 0.8434,
"step": 125
},
{
"epoch": 3.90625,
"eval_loss": 0.7248883843421936,
"eval_runtime": 46.2593,
"eval_samples_per_second": 4.323,
"eval_steps_per_second": 0.54,
"step": 125
},
{
"epoch": 3.9375,
"grad_norm": 0.4296603454332508,
"learning_rate": 2e-05,
"loss": 0.8154,
"step": 126
},
{
"epoch": 3.9375,
"eval_loss": 0.7236350774765015,
"eval_runtime": 47.8167,
"eval_samples_per_second": 4.183,
"eval_steps_per_second": 0.523,
"step": 126
},
{
"epoch": 3.96875,
"grad_norm": 0.4369101294390371,
"learning_rate": 2e-05,
"loss": 0.7759,
"step": 127
},
{
"epoch": 3.96875,
"eval_loss": 0.7224241495132446,
"eval_runtime": 45.8583,
"eval_samples_per_second": 4.361,
"eval_steps_per_second": 0.545,
"step": 127
},
{
"epoch": 4.0,
"grad_norm": 0.4294598409798285,
"learning_rate": 2e-05,
"loss": 0.706,
"step": 128
},
{
"epoch": 4.0,
"eval_loss": 0.7210729718208313,
"eval_runtime": 45.9047,
"eval_samples_per_second": 4.357,
"eval_steps_per_second": 0.545,
"step": 128
},
{
"epoch": 4.03125,
"grad_norm": 0.355178274167416,
"learning_rate": 2e-05,
"loss": 0.7969,
"step": 129
},
{
"epoch": 4.03125,
"eval_loss": 0.7206510901451111,
"eval_runtime": 46.1016,
"eval_samples_per_second": 4.338,
"eval_steps_per_second": 0.542,
"step": 129
},
{
"epoch": 4.0625,
"grad_norm": 0.39855476598487416,
"learning_rate": 2e-05,
"loss": 0.8124,
"step": 130
},
{
"epoch": 4.0625,
"eval_loss": 0.7203733921051025,
"eval_runtime": 46.5052,
"eval_samples_per_second": 4.301,
"eval_steps_per_second": 0.538,
"step": 130
},
{
"epoch": 4.09375,
"grad_norm": 0.38252767359910733,
"learning_rate": 2e-05,
"loss": 0.8126,
"step": 131
},
{
"epoch": 4.09375,
"eval_loss": 0.7201277017593384,
"eval_runtime": 47.5144,
"eval_samples_per_second": 4.209,
"eval_steps_per_second": 0.526,
"step": 131
},
{
"epoch": 4.125,
"grad_norm": 0.44006887742113143,
"learning_rate": 2e-05,
"loss": 0.7706,
"step": 132
},
{
"epoch": 4.125,
"eval_loss": 0.7195135354995728,
"eval_runtime": 45.8417,
"eval_samples_per_second": 4.363,
"eval_steps_per_second": 0.545,
"step": 132
},
{
"epoch": 4.15625,
"grad_norm": 0.426129225179819,
"learning_rate": 2e-05,
"loss": 0.8699,
"step": 133
},
{
"epoch": 4.15625,
"eval_loss": 0.7189508080482483,
"eval_runtime": 46.2247,
"eval_samples_per_second": 4.327,
"eval_steps_per_second": 0.541,
"step": 133
},
{
"epoch": 4.1875,
"grad_norm": 0.4995092725647276,
"learning_rate": 2e-05,
"loss": 0.7811,
"step": 134
},
{
"epoch": 4.1875,
"eval_loss": 0.7180965542793274,
"eval_runtime": 46.4605,
"eval_samples_per_second": 4.305,
"eval_steps_per_second": 0.538,
"step": 134
},
{
"epoch": 4.21875,
"grad_norm": 0.42664484060733815,
"learning_rate": 2e-05,
"loss": 0.7795,
"step": 135
},
{
"epoch": 4.21875,
"eval_loss": 0.7173775434494019,
"eval_runtime": 46.1896,
"eval_samples_per_second": 4.33,
"eval_steps_per_second": 0.541,
"step": 135
},
{
"epoch": 4.25,
"grad_norm": 0.43970733071879864,
"learning_rate": 2e-05,
"loss": 0.772,
"step": 136
},
{
"epoch": 4.25,
"eval_loss": 0.716987133026123,
"eval_runtime": 45.88,
"eval_samples_per_second": 4.359,
"eval_steps_per_second": 0.545,
"step": 136
},
{
"epoch": 4.28125,
"grad_norm": 0.4585774179958974,
"learning_rate": 2e-05,
"loss": 0.7594,
"step": 137
},
{
"epoch": 4.28125,
"eval_loss": 0.7162837386131287,
"eval_runtime": 45.9687,
"eval_samples_per_second": 4.351,
"eval_steps_per_second": 0.544,
"step": 137
},
{
"epoch": 4.3125,
"grad_norm": 0.4482018280143517,
"learning_rate": 2e-05,
"loss": 0.7702,
"step": 138
},
{
"epoch": 4.3125,
"eval_loss": 0.7155399918556213,
"eval_runtime": 46.1566,
"eval_samples_per_second": 4.333,
"eval_steps_per_second": 0.542,
"step": 138
},
{
"epoch": 4.34375,
"grad_norm": 0.44262087649988896,
"learning_rate": 2e-05,
"loss": 0.7323,
"step": 139
},
{
"epoch": 4.34375,
"eval_loss": 0.7145451307296753,
"eval_runtime": 46.2257,
"eval_samples_per_second": 4.327,
"eval_steps_per_second": 0.541,
"step": 139
},
{
"epoch": 4.375,
"grad_norm": 0.4418100350036369,
"learning_rate": 2e-05,
"loss": 0.7669,
"step": 140
},
{
"epoch": 4.375,
"eval_loss": 0.7139186263084412,
"eval_runtime": 46.1994,
"eval_samples_per_second": 4.329,
"eval_steps_per_second": 0.541,
"step": 140
},
{
"epoch": 4.40625,
"grad_norm": 0.4068223149751762,
"learning_rate": 2e-05,
"loss": 0.7806,
"step": 141
},
{
"epoch": 4.40625,
"eval_loss": 0.7134376764297485,
"eval_runtime": 48.1068,
"eval_samples_per_second": 4.157,
"eval_steps_per_second": 0.52,
"step": 141
},
{
"epoch": 4.4375,
"grad_norm": 0.4339025102618351,
"learning_rate": 2e-05,
"loss": 0.7312,
"step": 142
},
{
"epoch": 4.4375,
"eval_loss": 0.7134268879890442,
"eval_runtime": 46.8951,
"eval_samples_per_second": 4.265,
"eval_steps_per_second": 0.533,
"step": 142
},
{
"epoch": 4.46875,
"grad_norm": 0.45474838622605346,
"learning_rate": 2e-05,
"loss": 0.7358,
"step": 143
},
{
"epoch": 4.46875,
"eval_loss": 0.7131960391998291,
"eval_runtime": 46.8155,
"eval_samples_per_second": 4.272,
"eval_steps_per_second": 0.534,
"step": 143
},
{
"epoch": 4.5,
"grad_norm": 0.4284980958119551,
"learning_rate": 2e-05,
"loss": 0.7146,
"step": 144
},
{
"epoch": 4.5,
"eval_loss": 0.7122372388839722,
"eval_runtime": 46.7899,
"eval_samples_per_second": 4.274,
"eval_steps_per_second": 0.534,
"step": 144
},
{
"epoch": 4.53125,
"grad_norm": 0.4679473362578349,
"learning_rate": 2e-05,
"loss": 0.8018,
"step": 145
},
{
"epoch": 4.53125,
"eval_loss": 0.7106640338897705,
"eval_runtime": 46.845,
"eval_samples_per_second": 4.269,
"eval_steps_per_second": 0.534,
"step": 145
},
{
"epoch": 4.5625,
"grad_norm": 0.4900067169351881,
"learning_rate": 2e-05,
"loss": 0.6884,
"step": 146
},
{
"epoch": 4.5625,
"eval_loss": 0.7087500095367432,
"eval_runtime": 47.5958,
"eval_samples_per_second": 4.202,
"eval_steps_per_second": 0.525,
"step": 146
},
{
"epoch": 4.59375,
"grad_norm": 0.4734076525152252,
"learning_rate": 2e-05,
"loss": 0.7491,
"step": 147
},
{
"epoch": 4.59375,
"eval_loss": 0.7072947025299072,
"eval_runtime": 48.7251,
"eval_samples_per_second": 4.105,
"eval_steps_per_second": 0.513,
"step": 147
},
{
"epoch": 4.625,
"grad_norm": 0.44251158400098356,
"learning_rate": 2e-05,
"loss": 0.7052,
"step": 148
},
{
"epoch": 4.625,
"eval_loss": 0.7068507671356201,
"eval_runtime": 47.7025,
"eval_samples_per_second": 4.193,
"eval_steps_per_second": 0.524,
"step": 148
},
{
"epoch": 4.65625,
"grad_norm": 0.4304625716692019,
"learning_rate": 2e-05,
"loss": 0.8176,
"step": 149
},
{
"epoch": 4.65625,
"eval_loss": 0.7074388265609741,
"eval_runtime": 48.6321,
"eval_samples_per_second": 4.113,
"eval_steps_per_second": 0.514,
"step": 149
},
{
"epoch": 4.6875,
"grad_norm": 0.5157530943388945,
"learning_rate": 2e-05,
"loss": 0.7429,
"step": 150
},
{
"epoch": 4.6875,
"eval_loss": 0.7071186900138855,
"eval_runtime": 47.9557,
"eval_samples_per_second": 4.171,
"eval_steps_per_second": 0.521,
"step": 150
},
{
"epoch": 4.71875,
"grad_norm": 0.5469994539610319,
"learning_rate": 2e-05,
"loss": 0.7643,
"step": 151
},
{
"epoch": 4.71875,
"eval_loss": 0.7050415277481079,
"eval_runtime": 47.5207,
"eval_samples_per_second": 4.209,
"eval_steps_per_second": 0.526,
"step": 151
},
{
"epoch": 4.75,
"grad_norm": 0.4821891223190419,
"learning_rate": 2e-05,
"loss": 0.7795,
"step": 152
},
{
"epoch": 4.75,
"eval_loss": 0.7032743692398071,
"eval_runtime": 47.2902,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.529,
"step": 152
},
{
"epoch": 4.78125,
"grad_norm": 0.4785594997922253,
"learning_rate": 2e-05,
"loss": 0.7323,
"step": 153
},
{
"epoch": 4.78125,
"eval_loss": 0.7028358578681946,
"eval_runtime": 47.7841,
"eval_samples_per_second": 4.185,
"eval_steps_per_second": 0.523,
"step": 153
},
{
"epoch": 4.8125,
"grad_norm": 0.47200733754346447,
"learning_rate": 2e-05,
"loss": 0.7555,
"step": 154
},
{
"epoch": 4.8125,
"eval_loss": 0.7034148573875427,
"eval_runtime": 47.4952,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 0.526,
"step": 154
},
{
"epoch": 4.84375,
"grad_norm": 0.49226670914533455,
"learning_rate": 2e-05,
"loss": 0.6884,
"step": 155
},
{
"epoch": 4.84375,
"eval_loss": 0.7038142681121826,
"eval_runtime": 47.6873,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 0.524,
"step": 155
},
{
"epoch": 4.875,
"grad_norm": 0.4894781168701622,
"learning_rate": 2e-05,
"loss": 0.8079,
"step": 156
},
{
"epoch": 4.875,
"eval_loss": 0.7031099200248718,
"eval_runtime": 47.0438,
"eval_samples_per_second": 4.251,
"eval_steps_per_second": 0.531,
"step": 156
},
{
"epoch": 4.90625,
"grad_norm": 0.44465660848434874,
"learning_rate": 2e-05,
"loss": 0.7868,
"step": 157
},
{
"epoch": 4.90625,
"eval_loss": 0.7025811672210693,
"eval_runtime": 47.2897,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.529,
"step": 157
},
{
"epoch": 4.9375,
"grad_norm": 0.4671993515654777,
"learning_rate": 2e-05,
"loss": 0.7949,
"step": 158
},
{
"epoch": 4.9375,
"eval_loss": 0.7016230225563049,
"eval_runtime": 48.7147,
"eval_samples_per_second": 4.106,
"eval_steps_per_second": 0.513,
"step": 158
},
{
"epoch": 4.96875,
"grad_norm": 0.46593892888464733,
"learning_rate": 2e-05,
"loss": 0.7445,
"step": 159
},
{
"epoch": 4.96875,
"eval_loss": 0.7006258964538574,
"eval_runtime": 48.5723,
"eval_samples_per_second": 4.118,
"eval_steps_per_second": 0.515,
"step": 159
},
{
"epoch": 5.0,
"grad_norm": 0.47383657575274585,
"learning_rate": 2e-05,
"loss": 0.7233,
"step": 160
},
{
"epoch": 5.0,
"eval_loss": 0.7000269889831543,
"eval_runtime": 48.7517,
"eval_samples_per_second": 4.102,
"eval_steps_per_second": 0.513,
"step": 160
},
{
"epoch": 5.03125,
"grad_norm": 0.42723336337060835,
"learning_rate": 2e-05,
"loss": 0.7061,
"step": 161
},
{
"epoch": 5.03125,
"eval_loss": 0.7001045942306519,
"eval_runtime": 51.0355,
"eval_samples_per_second": 3.919,
"eval_steps_per_second": 0.49,
"step": 161
},
{
"epoch": 5.0625,
"grad_norm": 0.452950592019195,
"learning_rate": 2e-05,
"loss": 0.8489,
"step": 162
},
{
"epoch": 5.0625,
"eval_loss": 0.7011143565177917,
"eval_runtime": 44.0195,
"eval_samples_per_second": 4.543,
"eval_steps_per_second": 0.568,
"step": 162
},
{
"epoch": 5.09375,
"grad_norm": 0.49095068041556844,
"learning_rate": 2e-05,
"loss": 0.6523,
"step": 163
},
{
"epoch": 5.09375,
"eval_loss": 0.7020147442817688,
"eval_runtime": 43.9994,
"eval_samples_per_second": 4.546,
"eval_steps_per_second": 0.568,
"step": 163
},
{
"epoch": 5.125,
"grad_norm": 0.49702685752637826,
"learning_rate": 2e-05,
"loss": 0.7931,
"step": 164
},
{
"epoch": 5.125,
"eval_loss": 0.7026366591453552,
"eval_runtime": 43.7736,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 0.571,
"step": 164
},
{
"epoch": 5.15625,
"grad_norm": 0.5894972181165574,
"learning_rate": 2e-05,
"loss": 0.6297,
"step": 165
},
{
"epoch": 5.15625,
"eval_loss": 0.7018793225288391,
"eval_runtime": 43.8277,
"eval_samples_per_second": 4.563,
"eval_steps_per_second": 0.57,
"step": 165
},
{
"epoch": 5.1875,
"grad_norm": 0.5431599726243479,
"learning_rate": 2e-05,
"loss": 0.7394,
"step": 166
},
{
"epoch": 5.1875,
"eval_loss": 0.701405942440033,
"eval_runtime": 46.007,
"eval_samples_per_second": 4.347,
"eval_steps_per_second": 0.543,
"step": 166
},
{
"epoch": 5.21875,
"grad_norm": 0.46081080554385206,
"learning_rate": 2e-05,
"loss": 0.7587,
"step": 167
},
{
"epoch": 5.21875,
"eval_loss": 0.7011873126029968,
"eval_runtime": 45.6739,
"eval_samples_per_second": 4.379,
"eval_steps_per_second": 0.547,
"step": 167
},
{
"epoch": 5.25,
"grad_norm": 0.5186784959253576,
"learning_rate": 2e-05,
"loss": 0.7944,
"step": 168
},
{
"epoch": 5.25,
"eval_loss": 0.7006779313087463,
"eval_runtime": 46.6382,
"eval_samples_per_second": 4.288,
"eval_steps_per_second": 0.536,
"step": 168
},
{
"epoch": 5.28125,
"grad_norm": 0.484045023962852,
"learning_rate": 2e-05,
"loss": 0.7149,
"step": 169
},
{
"epoch": 5.28125,
"eval_loss": 0.7005323171615601,
"eval_runtime": 45.7584,
"eval_samples_per_second": 4.371,
"eval_steps_per_second": 0.546,
"step": 169
},
{
"epoch": 5.3125,
"grad_norm": 0.5719751134907255,
"learning_rate": 2e-05,
"loss": 0.6939,
"step": 170
},
{
"epoch": 5.3125,
"eval_loss": 0.7002266645431519,
"eval_runtime": 45.9679,
"eval_samples_per_second": 4.351,
"eval_steps_per_second": 0.544,
"step": 170
},
{
"epoch": 5.34375,
"grad_norm": 0.6060894153712378,
"learning_rate": 2e-05,
"loss": 0.7048,
"step": 171
},
{
"epoch": 5.34375,
"eval_loss": 0.6983186602592468,
"eval_runtime": 47.2598,
"eval_samples_per_second": 4.232,
"eval_steps_per_second": 0.529,
"step": 171
},
{
"epoch": 5.375,
"grad_norm": 0.5548499769346423,
"learning_rate": 2e-05,
"loss": 0.7881,
"step": 172
},
{
"epoch": 5.375,
"eval_loss": 0.6966648697853088,
"eval_runtime": 47.0803,
"eval_samples_per_second": 4.248,
"eval_steps_per_second": 0.531,
"step": 172
},
{
"epoch": 5.40625,
"grad_norm": 0.5102316819603098,
"learning_rate": 2e-05,
"loss": 0.7542,
"step": 173
},
{
"epoch": 5.40625,
"eval_loss": 0.6953878998756409,
"eval_runtime": 48.3238,
"eval_samples_per_second": 4.139,
"eval_steps_per_second": 0.517,
"step": 173
},
{
"epoch": 5.4375,
"grad_norm": 0.5399890621278476,
"learning_rate": 2e-05,
"loss": 0.7937,
"step": 174
},
{
"epoch": 5.4375,
"eval_loss": 0.69431471824646,
"eval_runtime": 49.2122,
"eval_samples_per_second": 4.064,
"eval_steps_per_second": 0.508,
"step": 174
},
{
"epoch": 5.46875,
"grad_norm": 0.5252423839534397,
"learning_rate": 2e-05,
"loss": 0.7767,
"step": 175
},
{
"epoch": 5.46875,
"eval_loss": 0.6944937109947205,
"eval_runtime": 49.0039,
"eval_samples_per_second": 4.081,
"eval_steps_per_second": 0.51,
"step": 175
},
{
"epoch": 5.5,
"grad_norm": 0.5422683424689886,
"learning_rate": 2e-05,
"loss": 0.7171,
"step": 176
},
{
"epoch": 5.5,
"eval_loss": 0.6943515539169312,
"eval_runtime": 48.7295,
"eval_samples_per_second": 4.104,
"eval_steps_per_second": 0.513,
"step": 176
},
{
"epoch": 5.53125,
"grad_norm": 0.551339022612633,
"learning_rate": 2e-05,
"loss": 0.7529,
"step": 177
},
{
"epoch": 5.53125,
"eval_loss": 0.6935855150222778,
"eval_runtime": 50.259,
"eval_samples_per_second": 3.979,
"eval_steps_per_second": 0.497,
"step": 177
},
{
"epoch": 5.5625,
"grad_norm": 0.5040662348893271,
"learning_rate": 2e-05,
"loss": 0.7816,
"step": 178
},
{
"epoch": 5.5625,
"eval_loss": 0.6929727792739868,
"eval_runtime": 49.9267,
"eval_samples_per_second": 4.006,
"eval_steps_per_second": 0.501,
"step": 178
},
{
"epoch": 5.59375,
"grad_norm": 0.538094993002792,
"learning_rate": 2e-05,
"loss": 0.6785,
"step": 179
},
{
"epoch": 5.59375,
"eval_loss": 0.6930323839187622,
"eval_runtime": 48.28,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 0.518,
"step": 179
},
{
"epoch": 5.625,
"grad_norm": 0.5367726605699668,
"learning_rate": 2e-05,
"loss": 0.6868,
"step": 180
},
{
"epoch": 5.625,
"eval_loss": 0.6928802728652954,
"eval_runtime": 49.8478,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.502,
"step": 180
},
{
"epoch": 5.65625,
"grad_norm": 0.5978542074838507,
"learning_rate": 2e-05,
"loss": 0.698,
"step": 181
},
{
"epoch": 5.65625,
"eval_loss": 0.6921787858009338,
"eval_runtime": 50.778,
"eval_samples_per_second": 3.939,
"eval_steps_per_second": 0.492,
"step": 181
},
{
"epoch": 5.6875,
"grad_norm": 0.5779173967988954,
"learning_rate": 2e-05,
"loss": 0.664,
"step": 182
},
{
"epoch": 5.6875,
"eval_loss": 0.6921034455299377,
"eval_runtime": 49.7171,
"eval_samples_per_second": 4.023,
"eval_steps_per_second": 0.503,
"step": 182
},
{
"epoch": 5.71875,
"grad_norm": 0.6377165996743129,
"learning_rate": 2e-05,
"loss": 0.7051,
"step": 183
},
{
"epoch": 5.71875,
"eval_loss": 0.6914942264556885,
"eval_runtime": 51.9608,
"eval_samples_per_second": 3.849,
"eval_steps_per_second": 0.481,
"step": 183
},
{
"epoch": 5.75,
"grad_norm": 0.6093388082076064,
"learning_rate": 2e-05,
"loss": 0.6903,
"step": 184
},
{
"epoch": 5.75,
"eval_loss": 0.6904594302177429,
"eval_runtime": 49.6144,
"eval_samples_per_second": 4.031,
"eval_steps_per_second": 0.504,
"step": 184
},
{
"epoch": 5.78125,
"grad_norm": 0.5987747297973711,
"learning_rate": 2e-05,
"loss": 0.7368,
"step": 185
},
{
"epoch": 5.78125,
"eval_loss": 0.6894869804382324,
"eval_runtime": 49.7122,
"eval_samples_per_second": 4.023,
"eval_steps_per_second": 0.503,
"step": 185
},
{
"epoch": 5.8125,
"grad_norm": 0.5914952733954625,
"learning_rate": 2e-05,
"loss": 0.7003,
"step": 186
},
{
"epoch": 5.8125,
"eval_loss": 0.6885225772857666,
"eval_runtime": 49.8474,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.502,
"step": 186
},
{
"epoch": 5.84375,
"grad_norm": 0.5641237505681922,
"learning_rate": 2e-05,
"loss": 0.7571,
"step": 187
},
{
"epoch": 5.84375,
"eval_loss": 0.6889610290527344,
"eval_runtime": 51.5925,
"eval_samples_per_second": 3.877,
"eval_steps_per_second": 0.485,
"step": 187
},
{
"epoch": 5.875,
"grad_norm": 0.5566285784572296,
"learning_rate": 2e-05,
"loss": 0.6882,
"step": 188
},
{
"epoch": 5.875,
"eval_loss": 0.6903389692306519,
"eval_runtime": 49.713,
"eval_samples_per_second": 4.023,
"eval_steps_per_second": 0.503,
"step": 188
},
{
"epoch": 5.90625,
"grad_norm": 0.5594562993560854,
"learning_rate": 2e-05,
"loss": 0.7028,
"step": 189
},
{
"epoch": 5.90625,
"eval_loss": 0.6911373734474182,
"eval_runtime": 49.929,
"eval_samples_per_second": 4.006,
"eval_steps_per_second": 0.501,
"step": 189
},
{
"epoch": 5.9375,
"grad_norm": 0.6114177699067616,
"learning_rate": 2e-05,
"loss": 0.7181,
"step": 190
},
{
"epoch": 5.9375,
"eval_loss": 0.6901592016220093,
"eval_runtime": 49.9032,
"eval_samples_per_second": 4.008,
"eval_steps_per_second": 0.501,
"step": 190
},
{
"epoch": 5.96875,
"grad_norm": 0.5564307101453613,
"learning_rate": 2e-05,
"loss": 0.7116,
"step": 191
},
{
"epoch": 5.96875,
"eval_loss": 0.6883879899978638,
"eval_runtime": 49.9457,
"eval_samples_per_second": 4.004,
"eval_steps_per_second": 0.501,
"step": 191
},
{
"epoch": 6.0,
"grad_norm": 0.5242139835965315,
"learning_rate": 2e-05,
"loss": 0.6956,
"step": 192
},
{
"epoch": 6.0,
"eval_loss": 0.686991274356842,
"eval_runtime": 51.3206,
"eval_samples_per_second": 3.897,
"eval_steps_per_second": 0.487,
"step": 192
},
{
"epoch": 6.03125,
"grad_norm": 0.5661038874224659,
"learning_rate": 2e-05,
"loss": 0.7667,
"step": 193
},
{
"epoch": 6.03125,
"eval_loss": 0.6863989233970642,
"eval_runtime": 50.3486,
"eval_samples_per_second": 3.972,
"eval_steps_per_second": 0.497,
"step": 193
},
{
"epoch": 6.0625,
"grad_norm": 0.5015705892320539,
"learning_rate": 2e-05,
"loss": 0.7289,
"step": 194
},
{
"epoch": 6.0625,
"eval_loss": 0.6869972348213196,
"eval_runtime": 51.6966,
"eval_samples_per_second": 3.869,
"eval_steps_per_second": 0.484,
"step": 194
},
{
"epoch": 6.09375,
"grad_norm": 0.5679476318211268,
"learning_rate": 2e-05,
"loss": 0.6595,
"step": 195
},
{
"epoch": 6.09375,
"eval_loss": 0.6878303289413452,
"eval_runtime": 44.1921,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 0.566,
"step": 195
},
{
"epoch": 6.125,
"grad_norm": 0.5496769650020654,
"learning_rate": 2e-05,
"loss": 0.6934,
"step": 196
},
{
"epoch": 6.125,
"eval_loss": 0.689085841178894,
"eval_runtime": 44.0432,
"eval_samples_per_second": 4.541,
"eval_steps_per_second": 0.568,
"step": 196
},
{
"epoch": 6.15625,
"grad_norm": 0.5761731163916711,
"learning_rate": 2e-05,
"loss": 0.7212,
"step": 197
},
{
"epoch": 6.15625,
"eval_loss": 0.6919547915458679,
"eval_runtime": 45.3631,
"eval_samples_per_second": 4.409,
"eval_steps_per_second": 0.551,
"step": 197
},
{
"epoch": 6.1875,
"grad_norm": 0.6093485410765964,
"learning_rate": 2e-05,
"loss": 0.8013,
"step": 198
},
{
"epoch": 6.1875,
"eval_loss": 0.6936098337173462,
"eval_runtime": 44.1956,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 0.566,
"step": 198
},
{
"epoch": 6.21875,
"grad_norm": 0.6670365325797192,
"learning_rate": 2e-05,
"loss": 0.666,
"step": 199
},
{
"epoch": 6.21875,
"eval_loss": 0.693129301071167,
"eval_runtime": 44.0131,
"eval_samples_per_second": 4.544,
"eval_steps_per_second": 0.568,
"step": 199
},
{
"epoch": 6.25,
"grad_norm": 0.6464592274733308,
"learning_rate": 2e-05,
"loss": 0.7134,
"step": 200
},
{
"epoch": 6.25,
"eval_loss": 0.6912326216697693,
"eval_runtime": 44.0,
"eval_samples_per_second": 4.545,
"eval_steps_per_second": 0.568,
"step": 200
},
{
"epoch": 6.28125,
"grad_norm": 0.6088225232188101,
"learning_rate": 2e-05,
"loss": 0.7405,
"step": 201
},
{
"epoch": 6.28125,
"eval_loss": 0.6896650195121765,
"eval_runtime": 44.3194,
"eval_samples_per_second": 4.513,
"eval_steps_per_second": 0.564,
"step": 201
},
{
"epoch": 6.3125,
"grad_norm": 0.6638309972807995,
"learning_rate": 2e-05,
"loss": 0.6542,
"step": 202
},
{
"epoch": 6.3125,
"eval_loss": 0.6878445148468018,
"eval_runtime": 44.2101,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 0.565,
"step": 202
},
{
"epoch": 6.34375,
"grad_norm": 0.5632348029553863,
"learning_rate": 2e-05,
"loss": 0.7953,
"step": 203
},
{
"epoch": 6.34375,
"eval_loss": 0.6869116425514221,
"eval_runtime": 44.0039,
"eval_samples_per_second": 4.545,
"eval_steps_per_second": 0.568,
"step": 203
},
{
"epoch": 6.375,
"grad_norm": 0.6753158068984167,
"learning_rate": 2e-05,
"loss": 0.6369,
"step": 204
},
{
"epoch": 6.375,
"eval_loss": 0.6856124997138977,
"eval_runtime": 44.2493,
"eval_samples_per_second": 4.52,
"eval_steps_per_second": 0.565,
"step": 204
},
{
"epoch": 6.40625,
"grad_norm": 0.5601655147962107,
"learning_rate": 2e-05,
"loss": 0.6291,
"step": 205
},
{
"epoch": 6.40625,
"eval_loss": 0.685504138469696,
"eval_runtime": 43.9463,
"eval_samples_per_second": 4.551,
"eval_steps_per_second": 0.569,
"step": 205
},
{
"epoch": 6.4375,
"grad_norm": 0.6578412065562369,
"learning_rate": 2e-05,
"loss": 0.6887,
"step": 206
},
{
"epoch": 6.4375,
"eval_loss": 0.6858142018318176,
"eval_runtime": 45.1556,
"eval_samples_per_second": 4.429,
"eval_steps_per_second": 0.554,
"step": 206
},
{
"epoch": 6.46875,
"grad_norm": 0.6149787250576099,
"learning_rate": 2e-05,
"loss": 0.7375,
"step": 207
},
{
"epoch": 6.46875,
"eval_loss": 0.6860241889953613,
"eval_runtime": 44.9447,
"eval_samples_per_second": 4.45,
"eval_steps_per_second": 0.556,
"step": 207
},
{
"epoch": 6.5,
"grad_norm": 0.6674521606961297,
"learning_rate": 2e-05,
"loss": 0.6856,
"step": 208
},
{
"epoch": 6.5,
"eval_loss": 0.6866363286972046,
"eval_runtime": 44.714,
"eval_samples_per_second": 4.473,
"eval_steps_per_second": 0.559,
"step": 208
},
{
"epoch": 6.53125,
"grad_norm": 0.700420859386899,
"learning_rate": 2e-05,
"loss": 0.6556,
"step": 209
},
{
"epoch": 6.53125,
"eval_loss": 0.6870286464691162,
"eval_runtime": 44.8923,
"eval_samples_per_second": 4.455,
"eval_steps_per_second": 0.557,
"step": 209
},
{
"epoch": 6.5625,
"grad_norm": 0.6530651968630973,
"learning_rate": 2e-05,
"loss": 0.6334,
"step": 210
},
{
"epoch": 6.5625,
"eval_loss": 0.6872709393501282,
"eval_runtime": 44.7944,
"eval_samples_per_second": 4.465,
"eval_steps_per_second": 0.558,
"step": 210
},
{
"epoch": 6.59375,
"grad_norm": 0.695757498482456,
"learning_rate": 2e-05,
"loss": 0.6784,
"step": 211
},
{
"epoch": 6.59375,
"eval_loss": 0.6869171857833862,
"eval_runtime": 45.755,
"eval_samples_per_second": 4.371,
"eval_steps_per_second": 0.546,
"step": 211
},
{
"epoch": 6.625,
"grad_norm": 0.642060810781652,
"learning_rate": 2e-05,
"loss": 0.6489,
"step": 212
},
{
"epoch": 6.625,
"eval_loss": 0.685666024684906,
"eval_runtime": 46.4458,
"eval_samples_per_second": 4.306,
"eval_steps_per_second": 0.538,
"step": 212
},
{
"epoch": 6.65625,
"grad_norm": 0.6088750940603561,
"learning_rate": 2e-05,
"loss": 0.7216,
"step": 213
},
{
"epoch": 6.65625,
"eval_loss": 0.6843697428703308,
"eval_runtime": 46.1389,
"eval_samples_per_second": 4.335,
"eval_steps_per_second": 0.542,
"step": 213
},
{
"epoch": 6.6875,
"grad_norm": 0.6043945628080053,
"learning_rate": 2e-05,
"loss": 0.692,
"step": 214
},
{
"epoch": 6.6875,
"eval_loss": 0.6836680769920349,
"eval_runtime": 47.7324,
"eval_samples_per_second": 4.19,
"eval_steps_per_second": 0.524,
"step": 214
},
{
"epoch": 6.71875,
"grad_norm": 0.6506615838970475,
"learning_rate": 2e-05,
"loss": 0.691,
"step": 215
},
{
"epoch": 6.71875,
"eval_loss": 0.6824812293052673,
"eval_runtime": 45.8056,
"eval_samples_per_second": 4.366,
"eval_steps_per_second": 0.546,
"step": 215
},
{
"epoch": 6.75,
"grad_norm": 0.6878268158673746,
"learning_rate": 2e-05,
"loss": 0.6894,
"step": 216
},
{
"epoch": 6.75,
"eval_loss": 0.6817054748535156,
"eval_runtime": 46.47,
"eval_samples_per_second": 4.304,
"eval_steps_per_second": 0.538,
"step": 216
},
{
"epoch": 6.78125,
"grad_norm": 0.6793999118325932,
"learning_rate": 2e-05,
"loss": 0.6394,
"step": 217
},
{
"epoch": 6.78125,
"eval_loss": 0.6831635236740112,
"eval_runtime": 47.8532,
"eval_samples_per_second": 4.179,
"eval_steps_per_second": 0.522,
"step": 217
},
{
"epoch": 6.8125,
"grad_norm": 0.6935365262523343,
"learning_rate": 2e-05,
"loss": 0.6341,
"step": 218
},
{
"epoch": 6.8125,
"eval_loss": 0.6843095421791077,
"eval_runtime": 46.3828,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 0.539,
"step": 218
},
{
"epoch": 6.84375,
"grad_norm": 0.8071019513751874,
"learning_rate": 2e-05,
"loss": 0.7211,
"step": 219
},
{
"epoch": 6.84375,
"eval_loss": 0.6839814782142639,
"eval_runtime": 46.5771,
"eval_samples_per_second": 4.294,
"eval_steps_per_second": 0.537,
"step": 219
},
{
"epoch": 6.875,
"grad_norm": 0.7202535741704769,
"learning_rate": 2e-05,
"loss": 0.7305,
"step": 220
},
{
"epoch": 6.875,
"eval_loss": 0.6822354197502136,
"eval_runtime": 46.6149,
"eval_samples_per_second": 4.29,
"eval_steps_per_second": 0.536,
"step": 220
},
{
"epoch": 6.90625,
"grad_norm": 0.6829442890004696,
"learning_rate": 2e-05,
"loss": 0.6965,
"step": 221
},
{
"epoch": 6.90625,
"eval_loss": 0.6804749369621277,
"eval_runtime": 47.9027,
"eval_samples_per_second": 4.175,
"eval_steps_per_second": 0.522,
"step": 221
},
{
"epoch": 6.9375,
"grad_norm": 0.7007337811403486,
"learning_rate": 2e-05,
"loss": 0.6948,
"step": 222
},
{
"epoch": 6.9375,
"eval_loss": 0.6785742044448853,
"eval_runtime": 48.3484,
"eval_samples_per_second": 4.137,
"eval_steps_per_second": 0.517,
"step": 222
},
{
"epoch": 6.96875,
"grad_norm": 0.6672225040660534,
"learning_rate": 2e-05,
"loss": 0.7075,
"step": 223
},
{
"epoch": 6.96875,
"eval_loss": 0.6771878004074097,
"eval_runtime": 46.3836,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 0.539,
"step": 223
},
{
"epoch": 7.0,
"grad_norm": 0.6893374424350143,
"learning_rate": 2e-05,
"loss": 0.7652,
"step": 224
},
{
"epoch": 7.0,
"eval_loss": 0.6772673726081848,
"eval_runtime": 47.0913,
"eval_samples_per_second": 4.247,
"eval_steps_per_second": 0.531,
"step": 224
},
{
"epoch": 7.03125,
"grad_norm": 0.5866908507437849,
"learning_rate": 2e-05,
"loss": 0.6784,
"step": 225
},
{
"epoch": 7.03125,
"eval_loss": 0.6778077483177185,
"eval_runtime": 46.7766,
"eval_samples_per_second": 4.276,
"eval_steps_per_second": 0.534,
"step": 225
},
{
"epoch": 7.0625,
"grad_norm": 0.6620785641323407,
"learning_rate": 2e-05,
"loss": 0.6107,
"step": 226
},
{
"epoch": 7.0625,
"eval_loss": 0.6797336339950562,
"eval_runtime": 47.0779,
"eval_samples_per_second": 4.248,
"eval_steps_per_second": 0.531,
"step": 226
},
{
"epoch": 7.09375,
"grad_norm": 0.6646660025868149,
"learning_rate": 2e-05,
"loss": 0.6824,
"step": 227
},
{
"epoch": 7.09375,
"eval_loss": 0.6831703186035156,
"eval_runtime": 46.4223,
"eval_samples_per_second": 4.308,
"eval_steps_per_second": 0.539,
"step": 227
},
{
"epoch": 7.125,
"grad_norm": 0.7653429329219695,
"learning_rate": 2e-05,
"loss": 0.6289,
"step": 228
},
{
"epoch": 7.125,
"eval_loss": 0.6889806985855103,
"eval_runtime": 48.2668,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 0.518,
"step": 228
},
{
"epoch": 7.15625,
"grad_norm": 0.888507299589656,
"learning_rate": 2e-05,
"loss": 0.6405,
"step": 229
},
{
"epoch": 7.15625,
"eval_loss": 0.6938297748565674,
"eval_runtime": 48.2833,
"eval_samples_per_second": 4.142,
"eval_steps_per_second": 0.518,
"step": 229
},
{
"epoch": 7.1875,
"grad_norm": 0.8483995966585272,
"learning_rate": 2e-05,
"loss": 0.6256,
"step": 230
},
{
"epoch": 7.1875,
"eval_loss": 0.6941313147544861,
"eval_runtime": 46.6028,
"eval_samples_per_second": 4.292,
"eval_steps_per_second": 0.536,
"step": 230
},
{
"epoch": 7.21875,
"grad_norm": 0.8529011065789557,
"learning_rate": 2e-05,
"loss": 0.719,
"step": 231
},
{
"epoch": 7.21875,
"eval_loss": 0.6908813714981079,
"eval_runtime": 47.7668,
"eval_samples_per_second": 4.187,
"eval_steps_per_second": 0.523,
"step": 231
},
{
"epoch": 7.25,
"grad_norm": 0.7891947191711363,
"learning_rate": 2e-05,
"loss": 0.7122,
"step": 232
},
{
"epoch": 7.25,
"eval_loss": 0.6873031854629517,
"eval_runtime": 46.9441,
"eval_samples_per_second": 4.26,
"eval_steps_per_second": 0.533,
"step": 232
},
{
"epoch": 7.28125,
"grad_norm": 0.8410831266636205,
"learning_rate": 2e-05,
"loss": 0.6655,
"step": 233
},
{
"epoch": 7.28125,
"eval_loss": 0.6842228174209595,
"eval_runtime": 48.184,
"eval_samples_per_second": 4.151,
"eval_steps_per_second": 0.519,
"step": 233
},
{
"epoch": 7.3125,
"grad_norm": 0.7543966645145809,
"learning_rate": 2e-05,
"loss": 0.702,
"step": 234
},
{
"epoch": 7.3125,
"eval_loss": 0.6826092600822449,
"eval_runtime": 48.7587,
"eval_samples_per_second": 4.102,
"eval_steps_per_second": 0.513,
"step": 234
},
{
"epoch": 7.34375,
"grad_norm": 0.69863349246919,
"learning_rate": 2e-05,
"loss": 0.6676,
"step": 235
},
{
"epoch": 7.34375,
"eval_loss": 0.6820936799049377,
"eval_runtime": 46.5095,
"eval_samples_per_second": 4.3,
"eval_steps_per_second": 0.538,
"step": 235
},
{
"epoch": 7.375,
"grad_norm": 0.7718198795174328,
"learning_rate": 2e-05,
"loss": 0.6322,
"step": 236
},
{
"epoch": 7.375,
"eval_loss": 0.681590735912323,
"eval_runtime": 47.6491,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 0.525,
"step": 236
},
{
"epoch": 7.40625,
"grad_norm": 0.8032644336352275,
"learning_rate": 2e-05,
"loss": 0.6835,
"step": 237
},
{
"epoch": 7.40625,
"eval_loss": 0.6806458234786987,
"eval_runtime": 47.1412,
"eval_samples_per_second": 4.243,
"eval_steps_per_second": 0.53,
"step": 237
},
{
"epoch": 7.4375,
"grad_norm": 0.8165151350063435,
"learning_rate": 2e-05,
"loss": 0.6744,
"step": 238
},
{
"epoch": 7.4375,
"eval_loss": 0.6802331805229187,
"eval_runtime": 48.2476,
"eval_samples_per_second": 4.145,
"eval_steps_per_second": 0.518,
"step": 238
},
{
"epoch": 7.46875,
"grad_norm": 0.7665175082054141,
"learning_rate": 2e-05,
"loss": 0.6955,
"step": 239
},
{
"epoch": 7.46875,
"eval_loss": 0.6806652545928955,
"eval_runtime": 46.6541,
"eval_samples_per_second": 4.287,
"eval_steps_per_second": 0.536,
"step": 239
},
{
"epoch": 7.5,
"grad_norm": 0.7584547487112137,
"learning_rate": 2e-05,
"loss": 0.6374,
"step": 240
},
{
"epoch": 7.5,
"eval_loss": 0.6825945973396301,
"eval_runtime": 46.3848,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 0.539,
"step": 240
},
{
"epoch": 7.53125,
"grad_norm": 0.660822695597991,
"learning_rate": 2e-05,
"loss": 0.6825,
"step": 241
},
{
"epoch": 7.53125,
"eval_loss": 0.6861986517906189,
"eval_runtime": 46.2732,
"eval_samples_per_second": 4.322,
"eval_steps_per_second": 0.54,
"step": 241
},
{
"epoch": 7.5625,
"grad_norm": 0.7793836425815985,
"learning_rate": 2e-05,
"loss": 0.6824,
"step": 242
},
{
"epoch": 7.5625,
"eval_loss": 0.6895106434822083,
"eval_runtime": 46.6462,
"eval_samples_per_second": 4.288,
"eval_steps_per_second": 0.536,
"step": 242
},
{
"epoch": 7.59375,
"grad_norm": 0.8237113294656135,
"learning_rate": 2e-05,
"loss": 0.6604,
"step": 243
},
{
"epoch": 7.59375,
"eval_loss": 0.6898853778839111,
"eval_runtime": 46.7904,
"eval_samples_per_second": 4.274,
"eval_steps_per_second": 0.534,
"step": 243
},
{
"epoch": 7.625,
"grad_norm": 0.9966126829271594,
"learning_rate": 2e-05,
"loss": 0.7297,
"step": 244
},
{
"epoch": 7.625,
"eval_loss": 0.6854925751686096,
"eval_runtime": 46.5541,
"eval_samples_per_second": 4.296,
"eval_steps_per_second": 0.537,
"step": 244
},
{
"epoch": 7.65625,
"grad_norm": 0.7581680879353856,
"learning_rate": 2e-05,
"loss": 0.6319,
"step": 245
},
{
"epoch": 7.65625,
"eval_loss": 0.6836807131767273,
"eval_runtime": 48.3404,
"eval_samples_per_second": 4.137,
"eval_steps_per_second": 0.517,
"step": 245
},
{
"epoch": 7.6875,
"grad_norm": 0.799947909805063,
"learning_rate": 2e-05,
"loss": 0.672,
"step": 246
},
{
"epoch": 7.6875,
"eval_loss": 0.681761622428894,
"eval_runtime": 50.0597,
"eval_samples_per_second": 3.995,
"eval_steps_per_second": 0.499,
"step": 246
},
{
"epoch": 7.71875,
"grad_norm": 0.8377626405796506,
"learning_rate": 2e-05,
"loss": 0.6727,
"step": 247
},
{
"epoch": 7.71875,
"eval_loss": 0.6791908144950867,
"eval_runtime": 49.25,
"eval_samples_per_second": 4.061,
"eval_steps_per_second": 0.508,
"step": 247
},
{
"epoch": 7.75,
"grad_norm": 0.7237789197029182,
"learning_rate": 2e-05,
"loss": 0.6576,
"step": 248
},
{
"epoch": 7.75,
"eval_loss": 0.6767004132270813,
"eval_runtime": 48.5162,
"eval_samples_per_second": 4.122,
"eval_steps_per_second": 0.515,
"step": 248
},
{
"epoch": 7.78125,
"grad_norm": 0.7946831722044173,
"learning_rate": 2e-05,
"loss": 0.7029,
"step": 249
},
{
"epoch": 7.78125,
"eval_loss": 0.675483763217926,
"eval_runtime": 49.9932,
"eval_samples_per_second": 4.001,
"eval_steps_per_second": 0.5,
"step": 249
},
{
"epoch": 7.8125,
"grad_norm": 0.7259305030593936,
"learning_rate": 2e-05,
"loss": 0.7109,
"step": 250
},
{
"epoch": 7.8125,
"eval_loss": 0.6768932938575745,
"eval_runtime": 49.852,
"eval_samples_per_second": 4.012,
"eval_steps_per_second": 0.501,
"step": 250
},
{
"epoch": 7.84375,
"grad_norm": 0.7340863248905795,
"learning_rate": 2e-05,
"loss": 0.6231,
"step": 251
},
{
"epoch": 7.84375,
"eval_loss": 0.6790910363197327,
"eval_runtime": 51.2892,
"eval_samples_per_second": 3.899,
"eval_steps_per_second": 0.487,
"step": 251
},
{
"epoch": 7.875,
"grad_norm": 0.8413325044551803,
"learning_rate": 2e-05,
"loss": 0.6325,
"step": 252
},
{
"epoch": 7.875,
"eval_loss": 0.6796602010726929,
"eval_runtime": 51.5508,
"eval_samples_per_second": 3.88,
"eval_steps_per_second": 0.485,
"step": 252
},
{
"epoch": 7.90625,
"grad_norm": 0.7927416396360353,
"learning_rate": 2e-05,
"loss": 0.7207,
"step": 253
},
{
"epoch": 7.90625,
"eval_loss": 0.6797543168067932,
"eval_runtime": 51.7355,
"eval_samples_per_second": 3.866,
"eval_steps_per_second": 0.483,
"step": 253
},
{
"epoch": 7.9375,
"grad_norm": 0.7510046984656369,
"learning_rate": 2e-05,
"loss": 0.6728,
"step": 254
},
{
"epoch": 7.9375,
"eval_loss": 0.6813901662826538,
"eval_runtime": 50.2001,
"eval_samples_per_second": 3.984,
"eval_steps_per_second": 0.498,
"step": 254
},
{
"epoch": 7.96875,
"grad_norm": 0.8061013994114622,
"learning_rate": 2e-05,
"loss": 0.6006,
"step": 255
},
{
"epoch": 7.96875,
"eval_loss": 0.681613028049469,
"eval_runtime": 49.7101,
"eval_samples_per_second": 4.023,
"eval_steps_per_second": 0.503,
"step": 255
},
{
"epoch": 8.0,
"grad_norm": 0.7889275388211946,
"learning_rate": 2e-05,
"loss": 0.662,
"step": 256
},
{
"epoch": 8.0,
"eval_loss": 0.6804400086402893,
"eval_runtime": 51.28,
"eval_samples_per_second": 3.9,
"eval_steps_per_second": 0.488,
"step": 256
},
{
"epoch": 8.03125,
"grad_norm": 0.7870763956359581,
"learning_rate": 2e-05,
"loss": 0.6302,
"step": 257
},
{
"epoch": 8.03125,
"eval_loss": 0.6809322834014893,
"eval_runtime": 52.7641,
"eval_samples_per_second": 3.79,
"eval_steps_per_second": 0.474,
"step": 257
},
{
"epoch": 8.0625,
"grad_norm": 0.7603743206060642,
"learning_rate": 2e-05,
"loss": 0.6426,
"step": 258
},
{
"epoch": 8.0625,
"eval_loss": 0.683021068572998,
"eval_runtime": 43.8381,
"eval_samples_per_second": 4.562,
"eval_steps_per_second": 0.57,
"step": 258
},
{
"epoch": 8.09375,
"grad_norm": 0.7751516747488628,
"learning_rate": 2e-05,
"loss": 0.6734,
"step": 259
},
{
"epoch": 8.09375,
"eval_loss": 0.685730516910553,
"eval_runtime": 43.9143,
"eval_samples_per_second": 4.554,
"eval_steps_per_second": 0.569,
"step": 259
},
{
"epoch": 8.125,
"grad_norm": 0.8783715889493854,
"learning_rate": 2e-05,
"loss": 0.685,
"step": 260
},
{
"epoch": 8.125,
"eval_loss": 0.6876766085624695,
"eval_runtime": 43.8107,
"eval_samples_per_second": 4.565,
"eval_steps_per_second": 0.571,
"step": 260
},
{
"epoch": 8.15625,
"grad_norm": 0.8683763894470441,
"learning_rate": 2e-05,
"loss": 0.6111,
"step": 261
},
{
"epoch": 8.15625,
"eval_loss": 0.6892675757408142,
"eval_runtime": 45.4312,
"eval_samples_per_second": 4.402,
"eval_steps_per_second": 0.55,
"step": 261
},
{
"epoch": 8.1875,
"grad_norm": 0.83301264234889,
"learning_rate": 2e-05,
"loss": 0.7238,
"step": 262
},
{
"epoch": 8.1875,
"eval_loss": 0.6900019645690918,
"eval_runtime": 43.7899,
"eval_samples_per_second": 4.567,
"eval_steps_per_second": 0.571,
"step": 262
},
{
"epoch": 8.21875,
"grad_norm": 0.9311076945185538,
"learning_rate": 2e-05,
"loss": 0.5936,
"step": 263
},
{
"epoch": 8.21875,
"eval_loss": 0.6899961233139038,
"eval_runtime": 45.0746,
"eval_samples_per_second": 4.437,
"eval_steps_per_second": 0.555,
"step": 263
},
{
"epoch": 8.25,
"grad_norm": 0.8715436312553682,
"learning_rate": 2e-05,
"loss": 0.6483,
"step": 264
},
{
"epoch": 8.25,
"eval_loss": 0.690051257610321,
"eval_runtime": 43.9844,
"eval_samples_per_second": 4.547,
"eval_steps_per_second": 0.568,
"step": 264
},
{
"epoch": 8.28125,
"grad_norm": 0.9923902289464986,
"learning_rate": 2e-05,
"loss": 0.6718,
"step": 265
},
{
"epoch": 8.28125,
"eval_loss": 0.688658595085144,
"eval_runtime": 43.8005,
"eval_samples_per_second": 4.566,
"eval_steps_per_second": 0.571,
"step": 265
},
{
"epoch": 8.3125,
"grad_norm": 0.8485704756867186,
"learning_rate": 2e-05,
"loss": 0.663,
"step": 266
},
{
"epoch": 8.3125,
"eval_loss": 0.6868423223495483,
"eval_runtime": 46.8136,
"eval_samples_per_second": 4.272,
"eval_steps_per_second": 0.534,
"step": 266
},
{
"epoch": 8.34375,
"grad_norm": 0.8355813738463048,
"learning_rate": 2e-05,
"loss": 0.5884,
"step": 267
},
{
"epoch": 8.34375,
"eval_loss": 0.6864896416664124,
"eval_runtime": 46.0477,
"eval_samples_per_second": 4.343,
"eval_steps_per_second": 0.543,
"step": 267
},
{
"epoch": 8.375,
"grad_norm": 0.8932260711586627,
"learning_rate": 2e-05,
"loss": 0.6466,
"step": 268
},
{
"epoch": 8.375,
"eval_loss": 0.6860455274581909,
"eval_runtime": 46.3159,
"eval_samples_per_second": 4.318,
"eval_steps_per_second": 0.54,
"step": 268
},
{
"epoch": 8.40625,
"grad_norm": 0.8536230233577757,
"learning_rate": 2e-05,
"loss": 0.6364,
"step": 269
},
{
"epoch": 8.40625,
"eval_loss": 0.6861154437065125,
"eval_runtime": 45.4048,
"eval_samples_per_second": 4.405,
"eval_steps_per_second": 0.551,
"step": 269
},
{
"epoch": 8.4375,
"grad_norm": 0.83328335532683,
"learning_rate": 2e-05,
"loss": 0.6419,
"step": 270
},
{
"epoch": 8.4375,
"eval_loss": 0.6856899261474609,
"eval_runtime": 46.609,
"eval_samples_per_second": 4.291,
"eval_steps_per_second": 0.536,
"step": 270
},
{
"epoch": 8.46875,
"grad_norm": 0.8841406022945117,
"learning_rate": 2e-05,
"loss": 0.5383,
"step": 271
},
{
"epoch": 8.46875,
"eval_loss": 0.6865776181221008,
"eval_runtime": 47.0757,
"eval_samples_per_second": 4.248,
"eval_steps_per_second": 0.531,
"step": 271
},
{
"epoch": 8.5,
"grad_norm": 0.8194392324450703,
"learning_rate": 2e-05,
"loss": 0.6376,
"step": 272
},
{
"epoch": 8.5,
"eval_loss": 0.6892414689064026,
"eval_runtime": 46.8669,
"eval_samples_per_second": 4.267,
"eval_steps_per_second": 0.533,
"step": 272
},
{
"epoch": 8.53125,
"grad_norm": 0.937948691760343,
"learning_rate": 2e-05,
"loss": 0.6485,
"step": 273
},
{
"epoch": 8.53125,
"eval_loss": 0.6890290975570679,
"eval_runtime": 46.649,
"eval_samples_per_second": 4.287,
"eval_steps_per_second": 0.536,
"step": 273
},
{
"epoch": 8.5625,
"grad_norm": 0.9240471094453983,
"learning_rate": 2e-05,
"loss": 0.6387,
"step": 274
},
{
"epoch": 8.5625,
"eval_loss": 0.6875545382499695,
"eval_runtime": 48.2193,
"eval_samples_per_second": 4.148,
"eval_steps_per_second": 0.518,
"step": 274
},
{
"epoch": 8.59375,
"grad_norm": 0.9186571178066892,
"learning_rate": 2e-05,
"loss": 0.6503,
"step": 275
},
{
"epoch": 8.59375,
"eval_loss": 0.6848871111869812,
"eval_runtime": 46.9651,
"eval_samples_per_second": 4.258,
"eval_steps_per_second": 0.532,
"step": 275
},
{
"epoch": 8.625,
"grad_norm": 0.9603067514462874,
"learning_rate": 2e-05,
"loss": 0.6429,
"step": 276
},
{
"epoch": 8.625,
"eval_loss": 0.68189537525177,
"eval_runtime": 47.959,
"eval_samples_per_second": 4.17,
"eval_steps_per_second": 0.521,
"step": 276
},
{
"epoch": 8.65625,
"grad_norm": 0.8632677172122276,
"learning_rate": 2e-05,
"loss": 0.5888,
"step": 277
},
{
"epoch": 8.65625,
"eval_loss": 0.6817250847816467,
"eval_runtime": 47.5519,
"eval_samples_per_second": 4.206,
"eval_steps_per_second": 0.526,
"step": 277
},
{
"epoch": 8.6875,
"grad_norm": 0.9096699999767647,
"learning_rate": 2e-05,
"loss": 0.6434,
"step": 278
},
{
"epoch": 8.6875,
"eval_loss": 0.6826667785644531,
"eval_runtime": 48.058,
"eval_samples_per_second": 4.162,
"eval_steps_per_second": 0.52,
"step": 278
},
{
"epoch": 8.71875,
"grad_norm": 0.8315455850502919,
"learning_rate": 2e-05,
"loss": 0.6012,
"step": 279
},
{
"epoch": 8.71875,
"eval_loss": 0.6839814782142639,
"eval_runtime": 48.1576,
"eval_samples_per_second": 4.153,
"eval_steps_per_second": 0.519,
"step": 279
},
{
"epoch": 8.75,
"grad_norm": 0.9058679893646637,
"learning_rate": 2e-05,
"loss": 0.676,
"step": 280
},
{
"epoch": 8.75,
"eval_loss": 0.6849075555801392,
"eval_runtime": 47.9952,
"eval_samples_per_second": 4.167,
"eval_steps_per_second": 0.521,
"step": 280
},
{
"epoch": 8.78125,
"grad_norm": 0.8626848465032242,
"learning_rate": 2e-05,
"loss": 0.6137,
"step": 281
},
{
"epoch": 8.78125,
"eval_loss": 0.6846147775650024,
"eval_runtime": 50.2338,
"eval_samples_per_second": 3.981,
"eval_steps_per_second": 0.498,
"step": 281
},
{
"epoch": 8.8125,
"grad_norm": 0.8473178170336938,
"learning_rate": 2e-05,
"loss": 0.6017,
"step": 282
},
{
"epoch": 8.8125,
"eval_loss": 0.6846247911453247,
"eval_runtime": 49.6161,
"eval_samples_per_second": 4.031,
"eval_steps_per_second": 0.504,
"step": 282
},
{
"epoch": 8.84375,
"grad_norm": 0.8161205540198673,
"learning_rate": 2e-05,
"loss": 0.5811,
"step": 283
},
{
"epoch": 8.84375,
"eval_loss": 0.6851673126220703,
"eval_runtime": 48.2057,
"eval_samples_per_second": 4.149,
"eval_steps_per_second": 0.519,
"step": 283
},
{
"epoch": 8.875,
"grad_norm": 0.8854404259280148,
"learning_rate": 2e-05,
"loss": 0.5459,
"step": 284
},
{
"epoch": 8.875,
"eval_loss": 0.685972273349762,
"eval_runtime": 49.0992,
"eval_samples_per_second": 4.073,
"eval_steps_per_second": 0.509,
"step": 284
},
{
"epoch": 8.90625,
"grad_norm": 0.9439945965022273,
"learning_rate": 2e-05,
"loss": 0.5908,
"step": 285
},
{
"epoch": 8.90625,
"eval_loss": 0.6852046847343445,
"eval_runtime": 48.1612,
"eval_samples_per_second": 4.153,
"eval_steps_per_second": 0.519,
"step": 285
},
{
"epoch": 8.9375,
"grad_norm": 1.0054677849137328,
"learning_rate": 2e-05,
"loss": 0.7215,
"step": 286
},
{
"epoch": 8.9375,
"eval_loss": 0.6840152144432068,
"eval_runtime": 48.2329,
"eval_samples_per_second": 4.147,
"eval_steps_per_second": 0.518,
"step": 286
},
{
"epoch": 8.96875,
"grad_norm": 0.8657465123021779,
"learning_rate": 2e-05,
"loss": 0.6479,
"step": 287
},
{
"epoch": 8.96875,
"eval_loss": 0.6845163106918335,
"eval_runtime": 47.9574,
"eval_samples_per_second": 4.17,
"eval_steps_per_second": 0.521,
"step": 287
},
{
"epoch": 9.0,
"grad_norm": 0.9781677785178013,
"learning_rate": 2e-05,
"loss": 0.598,
"step": 288
},
{
"epoch": 9.0,
"eval_loss": 0.6835929751396179,
"eval_runtime": 48.3854,
"eval_samples_per_second": 4.133,
"eval_steps_per_second": 0.517,
"step": 288
},
{
"epoch": 9.03125,
"grad_norm": 0.8913448503162013,
"learning_rate": 2e-05,
"loss": 0.608,
"step": 289
},
{
"epoch": 9.03125,
"eval_loss": 0.682920515537262,
"eval_runtime": 48.0787,
"eval_samples_per_second": 4.16,
"eval_steps_per_second": 0.52,
"step": 289
},
{
"epoch": 9.0625,
"grad_norm": 0.8910028425785708,
"learning_rate": 2e-05,
"loss": 0.6249,
"step": 290
},
{
"epoch": 9.0625,
"eval_loss": 0.6842910647392273,
"eval_runtime": 45.3447,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 0.551,
"step": 290
},
{
"epoch": 9.09375,
"grad_norm": 0.8766964747132081,
"learning_rate": 2e-05,
"loss": 0.6198,
"step": 291
},
{
"epoch": 9.09375,
"eval_loss": 0.6897236704826355,
"eval_runtime": 44.1159,
"eval_samples_per_second": 4.534,
"eval_steps_per_second": 0.567,
"step": 291
},
{
"epoch": 9.125,
"grad_norm": 1.0295884589810356,
"learning_rate": 2e-05,
"loss": 0.5993,
"step": 292
},
{
"epoch": 9.125,
"eval_loss": 0.6943468451499939,
"eval_runtime": 43.8108,
"eval_samples_per_second": 4.565,
"eval_steps_per_second": 0.571,
"step": 292
},
{
"epoch": 9.15625,
"grad_norm": 0.9773325211255739,
"learning_rate": 2e-05,
"loss": 0.6508,
"step": 293
},
{
"epoch": 9.15625,
"eval_loss": 0.6970213055610657,
"eval_runtime": 45.2879,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 0.552,
"step": 293
},
{
"epoch": 9.1875,
"grad_norm": 0.8891126608483751,
"learning_rate": 2e-05,
"loss": 0.5919,
"step": 294
},
{
"epoch": 9.1875,
"eval_loss": 0.6991220116615295,
"eval_runtime": 45.4682,
"eval_samples_per_second": 4.399,
"eval_steps_per_second": 0.55,
"step": 294
},
{
"epoch": 9.21875,
"grad_norm": 1.0482454581695644,
"learning_rate": 2e-05,
"loss": 0.5355,
"step": 295
},
{
"epoch": 9.21875,
"eval_loss": 0.704166054725647,
"eval_runtime": 45.109,
"eval_samples_per_second": 4.434,
"eval_steps_per_second": 0.554,
"step": 295
},
{
"epoch": 9.25,
"grad_norm": 0.9935665009180418,
"learning_rate": 2e-05,
"loss": 0.5624,
"step": 296
},
{
"epoch": 9.25,
"eval_loss": 0.7078476548194885,
"eval_runtime": 43.6811,
"eval_samples_per_second": 4.579,
"eval_steps_per_second": 0.572,
"step": 296
},
{
"epoch": 9.28125,
"grad_norm": 1.1040486086703822,
"learning_rate": 2e-05,
"loss": 0.66,
"step": 297
},
{
"epoch": 9.28125,
"eval_loss": 0.7050178647041321,
"eval_runtime": 43.9806,
"eval_samples_per_second": 4.547,
"eval_steps_per_second": 0.568,
"step": 297
},
{
"epoch": 9.3125,
"grad_norm": 1.2781656869693958,
"learning_rate": 2e-05,
"loss": 0.5966,
"step": 298
},
{
"epoch": 9.3125,
"eval_loss": 0.6992971897125244,
"eval_runtime": 45.6581,
"eval_samples_per_second": 4.38,
"eval_steps_per_second": 0.548,
"step": 298
},
{
"epoch": 9.34375,
"grad_norm": 1.0619252838389437,
"learning_rate": 2e-05,
"loss": 0.5724,
"step": 299
},
{
"epoch": 9.34375,
"eval_loss": 0.6947219967842102,
"eval_runtime": 45.5657,
"eval_samples_per_second": 4.389,
"eval_steps_per_second": 0.549,
"step": 299
},
{
"epoch": 9.375,
"grad_norm": 0.9267592917491817,
"learning_rate": 2e-05,
"loss": 0.5834,
"step": 300
},
{
"epoch": 9.375,
"eval_loss": 0.6934340000152588,
"eval_runtime": 43.7418,
"eval_samples_per_second": 4.572,
"eval_steps_per_second": 0.572,
"step": 300
},
{
"epoch": 9.40625,
"grad_norm": 0.9597103067245094,
"learning_rate": 2e-05,
"loss": 0.5645,
"step": 301
},
{
"epoch": 9.40625,
"eval_loss": 0.6928582787513733,
"eval_runtime": 45.6592,
"eval_samples_per_second": 4.38,
"eval_steps_per_second": 0.548,
"step": 301
},
{
"epoch": 9.4375,
"grad_norm": 1.0528189035992561,
"learning_rate": 2e-05,
"loss": 0.6196,
"step": 302
},
{
"epoch": 9.4375,
"eval_loss": 0.6888896822929382,
"eval_runtime": 44.9727,
"eval_samples_per_second": 4.447,
"eval_steps_per_second": 0.556,
"step": 302
},
{
"epoch": 9.46875,
"grad_norm": 1.0053722794735602,
"learning_rate": 2e-05,
"loss": 0.6154,
"step": 303
},
{
"epoch": 9.46875,
"eval_loss": 0.6855815052986145,
"eval_runtime": 44.7585,
"eval_samples_per_second": 4.468,
"eval_steps_per_second": 0.559,
"step": 303
},
{
"epoch": 9.5,
"grad_norm": 0.8783611726661886,
"learning_rate": 2e-05,
"loss": 0.6542,
"step": 304
},
{
"epoch": 9.5,
"eval_loss": 0.685936689376831,
"eval_runtime": 44.7918,
"eval_samples_per_second": 4.465,
"eval_steps_per_second": 0.558,
"step": 304
},
{
"epoch": 9.53125,
"grad_norm": 0.9143611061568578,
"learning_rate": 2e-05,
"loss": 0.6178,
"step": 305
},
{
"epoch": 9.53125,
"eval_loss": 0.6888444423675537,
"eval_runtime": 46.8021,
"eval_samples_per_second": 4.273,
"eval_steps_per_second": 0.534,
"step": 305
},
{
"epoch": 9.5625,
"grad_norm": 1.0642585786595127,
"learning_rate": 2e-05,
"loss": 0.6078,
"step": 306
},
{
"epoch": 9.5625,
"eval_loss": 0.6898679137229919,
"eval_runtime": 47.6538,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 0.525,
"step": 306
},
{
"epoch": 9.59375,
"grad_norm": 1.1048937808634194,
"learning_rate": 2e-05,
"loss": 0.6019,
"step": 307
},
{
"epoch": 9.59375,
"eval_loss": 0.6891123056411743,
"eval_runtime": 45.7695,
"eval_samples_per_second": 4.37,
"eval_steps_per_second": 0.546,
"step": 307
},
{
"epoch": 9.625,
"grad_norm": 1.0058213310083948,
"learning_rate": 2e-05,
"loss": 0.6406,
"step": 308
},
{
"epoch": 9.625,
"eval_loss": 0.6902400851249695,
"eval_runtime": 45.7897,
"eval_samples_per_second": 4.368,
"eval_steps_per_second": 0.546,
"step": 308
},
{
"epoch": 9.65625,
"grad_norm": 0.9344450130195062,
"learning_rate": 2e-05,
"loss": 0.607,
"step": 309
},
{
"epoch": 9.65625,
"eval_loss": 0.6951236128807068,
"eval_runtime": 46.8406,
"eval_samples_per_second": 4.27,
"eval_steps_per_second": 0.534,
"step": 309
},
{
"epoch": 9.6875,
"grad_norm": 1.1997135893441022,
"learning_rate": 2e-05,
"loss": 0.5994,
"step": 310
},
{
"epoch": 9.6875,
"eval_loss": 0.6978768706321716,
"eval_runtime": 47.5626,
"eval_samples_per_second": 4.205,
"eval_steps_per_second": 0.526,
"step": 310
},
{
"epoch": 9.71875,
"grad_norm": 1.0755945446749937,
"learning_rate": 2e-05,
"loss": 0.5265,
"step": 311
},
{
"epoch": 9.71875,
"eval_loss": 0.70021653175354,
"eval_runtime": 46.1678,
"eval_samples_per_second": 4.332,
"eval_steps_per_second": 0.542,
"step": 311
},
{
"epoch": 9.75,
"grad_norm": 1.069679239983948,
"learning_rate": 2e-05,
"loss": 0.6212,
"step": 312
},
{
"epoch": 9.75,
"eval_loss": 0.7008029222488403,
"eval_runtime": 47.797,
"eval_samples_per_second": 4.184,
"eval_steps_per_second": 0.523,
"step": 312
},
{
"epoch": 9.78125,
"grad_norm": 0.9717104499586322,
"learning_rate": 2e-05,
"loss": 0.6063,
"step": 313
},
{
"epoch": 9.78125,
"eval_loss": 0.7000299096107483,
"eval_runtime": 46.9892,
"eval_samples_per_second": 4.256,
"eval_steps_per_second": 0.532,
"step": 313
},
{
"epoch": 9.8125,
"grad_norm": 1.117536796971012,
"learning_rate": 2e-05,
"loss": 0.5875,
"step": 314
},
{
"epoch": 9.8125,
"eval_loss": 0.6982808709144592,
"eval_runtime": 48.0867,
"eval_samples_per_second": 4.159,
"eval_steps_per_second": 0.52,
"step": 314
},
{
"epoch": 9.84375,
"grad_norm": 0.987633836102932,
"learning_rate": 2e-05,
"loss": 0.6072,
"step": 315
},
{
"epoch": 9.84375,
"eval_loss": 0.6959852576255798,
"eval_runtime": 46.1188,
"eval_samples_per_second": 4.337,
"eval_steps_per_second": 0.542,
"step": 315
},
{
"epoch": 9.875,
"grad_norm": 0.972220541559008,
"learning_rate": 2e-05,
"loss": 0.5984,
"step": 316
},
{
"epoch": 9.875,
"eval_loss": 0.6931790113449097,
"eval_runtime": 46.363,
"eval_samples_per_second": 4.314,
"eval_steps_per_second": 0.539,
"step": 316
},
{
"epoch": 9.90625,
"grad_norm": 1.073192480739423,
"learning_rate": 2e-05,
"loss": 0.5686,
"step": 317
},
{
"epoch": 9.90625,
"eval_loss": 0.6896910071372986,
"eval_runtime": 46.2139,
"eval_samples_per_second": 4.328,
"eval_steps_per_second": 0.541,
"step": 317
},
{
"epoch": 9.9375,
"grad_norm": 1.0275060141171612,
"learning_rate": 2e-05,
"loss": 0.5825,
"step": 318
},
{
"epoch": 9.9375,
"eval_loss": 0.6866476535797119,
"eval_runtime": 47.6084,
"eval_samples_per_second": 4.201,
"eval_steps_per_second": 0.525,
"step": 318
},
{
"epoch": 9.96875,
"grad_norm": 1.1137122139905515,
"learning_rate": 2e-05,
"loss": 0.614,
"step": 319
},
{
"epoch": 9.96875,
"eval_loss": 0.6832907199859619,
"eval_runtime": 48.0271,
"eval_samples_per_second": 4.164,
"eval_steps_per_second": 0.521,
"step": 319
},
{
"epoch": 10.0,
"grad_norm": 1.0329542238815055,
"learning_rate": 2e-05,
"loss": 0.569,
"step": 320
},
{
"epoch": 10.0,
"eval_loss": 0.6833243370056152,
"eval_runtime": 46.9821,
"eval_samples_per_second": 4.257,
"eval_steps_per_second": 0.532,
"step": 320
}
],
"logging_steps": 1.0,
"max_steps": 320,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 414702785134592.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}