Text Generation
Transformers
Safetensors
mistral
axolotl
Generated from Trainer
Mistral
instruct
finetune
chatml
gpt4
synthetic data
science
physics
chemistry
biology
math
conversational
Eval Results
text-generation-inference
Inference Endpoints
Einstein-v5-v0.2-7B / checkpoint-754 /trainer_state.json
Weyaxi's picture
Upload folder using huggingface_hub
d62148f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6678476527900797,
"eval_steps": 500,
"global_step": 754,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 15.385598322874909,
"learning_rate": 5.000000000000001e-07,
"loss": 0.7627,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 15.42812332406859,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.794,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 13.76934599903778,
"learning_rate": 1.5e-06,
"loss": 0.7894,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 7.9055471186770685,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7346,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 8.624179170790118,
"learning_rate": 2.5e-06,
"loss": 0.7458,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 37.14544394485457,
"learning_rate": 3e-06,
"loss": 0.8249,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 13.413192499879626,
"learning_rate": 3.5e-06,
"loss": 0.7692,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 11.194156755277431,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7724,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 8.569279640169995,
"learning_rate": 4.5e-06,
"loss": 0.7851,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 18.113903622060178,
"learning_rate": 5e-06,
"loss": 0.7874,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 7.486914001687124,
"learning_rate": 4.999997558722919e-06,
"loss": 0.7553,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 7.280219682440894,
"learning_rate": 4.999990234896445e-06,
"loss": 0.7095,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 4.3413734180304155,
"learning_rate": 4.99997802853488e-06,
"loss": 0.6916,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 5.756315245615391,
"learning_rate": 4.999960939662063e-06,
"loss": 0.7407,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 5.090553047874293,
"learning_rate": 4.999938968311371e-06,
"loss": 0.7387,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 5.8370558847287075,
"learning_rate": 4.9999121145257126e-06,
"loss": 0.7051,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 3.986658012877664,
"learning_rate": 4.999880378357535e-06,
"loss": 0.6871,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 4.141716122521651,
"learning_rate": 4.9998437598688195e-06,
"loss": 0.6694,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 4.729722439630604,
"learning_rate": 4.9998022591310815e-06,
"loss": 0.716,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 2.9486336901615497,
"learning_rate": 4.999755876225375e-06,
"loss": 0.6387,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 2.8336874650575745,
"learning_rate": 4.999704611242285e-06,
"loss": 0.6542,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 3.6724374918638905,
"learning_rate": 4.999648464281934e-06,
"loss": 0.6617,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 2.941494127880678,
"learning_rate": 4.999587435453979e-06,
"loss": 0.6687,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 2.6261822206464744,
"learning_rate": 4.999521524877608e-06,
"loss": 0.6634,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 2.8059947014946305,
"learning_rate": 4.999450732681549e-06,
"loss": 0.6901,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 3.131537494217822,
"learning_rate": 4.999375059004058e-06,
"loss": 0.6407,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 2.7893212245465837,
"learning_rate": 4.99929450399293e-06,
"loss": 0.6638,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 2.4411586751746,
"learning_rate": 4.999209067805487e-06,
"loss": 0.6196,
"step": 28
},
{
"epoch": 0.03,
"grad_norm": 2.8807261299944082,
"learning_rate": 4.999118750608591e-06,
"loss": 0.6839,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 2.879993804839069,
"learning_rate": 4.9990235525786326e-06,
"loss": 0.6484,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 2.604360711268946,
"learning_rate": 4.998923473901535e-06,
"loss": 0.6313,
"step": 31
},
{
"epoch": 0.03,
"grad_norm": 2.403225544767816,
"learning_rate": 4.9988185147727544e-06,
"loss": 0.6209,
"step": 32
},
{
"epoch": 0.03,
"grad_norm": 2.669567772543462,
"learning_rate": 4.998708675397278e-06,
"loss": 0.6068,
"step": 33
},
{
"epoch": 0.03,
"grad_norm": 2.443946495915797,
"learning_rate": 4.998593955989626e-06,
"loss": 0.6731,
"step": 34
},
{
"epoch": 0.03,
"grad_norm": 2.2104680876118317,
"learning_rate": 4.998474356773845e-06,
"loss": 0.6243,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 2.3602199264043957,
"learning_rate": 4.9983498779835175e-06,
"loss": 0.6649,
"step": 36
},
{
"epoch": 0.03,
"grad_norm": 2.4676911263240844,
"learning_rate": 4.998220519861752e-06,
"loss": 0.6174,
"step": 37
},
{
"epoch": 0.03,
"grad_norm": 2.3419026099030282,
"learning_rate": 4.998086282661188e-06,
"loss": 0.6123,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 2.14900736954254,
"learning_rate": 4.997947166643993e-06,
"loss": 0.63,
"step": 39
},
{
"epoch": 0.04,
"grad_norm": 2.570907426799795,
"learning_rate": 4.997803172081864e-06,
"loss": 0.6249,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 2.516952735669967,
"learning_rate": 4.997654299256026e-06,
"loss": 0.6727,
"step": 41
},
{
"epoch": 0.04,
"grad_norm": 2.1600457198543874,
"learning_rate": 4.997500548457231e-06,
"loss": 0.6719,
"step": 42
},
{
"epoch": 0.04,
"grad_norm": 2.2177572033934743,
"learning_rate": 4.997341919985756e-06,
"loss": 0.6148,
"step": 43
},
{
"epoch": 0.04,
"grad_norm": 2.397105205209689,
"learning_rate": 4.997178414151409e-06,
"loss": 0.6167,
"step": 44
},
{
"epoch": 0.04,
"grad_norm": 2.1254940534972167,
"learning_rate": 4.997010031273517e-06,
"loss": 0.6446,
"step": 45
},
{
"epoch": 0.04,
"grad_norm": 2.2113023791837194,
"learning_rate": 4.996836771680937e-06,
"loss": 0.6304,
"step": 46
},
{
"epoch": 0.04,
"grad_norm": 2.386446316275664,
"learning_rate": 4.99665863571205e-06,
"loss": 0.6621,
"step": 47
},
{
"epoch": 0.04,
"grad_norm": 2.1838934384314483,
"learning_rate": 4.996475623714756e-06,
"loss": 0.6214,
"step": 48
},
{
"epoch": 0.04,
"grad_norm": 2.2047933657923586,
"learning_rate": 4.996287736046485e-06,
"loss": 0.6478,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 2.208809457983808,
"learning_rate": 4.996094973074183e-06,
"loss": 0.6097,
"step": 50
},
{
"epoch": 0.05,
"grad_norm": 2.1318377198138267,
"learning_rate": 4.995897335174322e-06,
"loss": 0.622,
"step": 51
},
{
"epoch": 0.05,
"grad_norm": 2.0673034122993537,
"learning_rate": 4.995694822732893e-06,
"loss": 0.6036,
"step": 52
},
{
"epoch": 0.05,
"grad_norm": 2.195105312645423,
"learning_rate": 4.9954874361454055e-06,
"loss": 0.6052,
"step": 53
},
{
"epoch": 0.05,
"grad_norm": 2.157855029176061,
"learning_rate": 4.995275175816892e-06,
"loss": 0.6455,
"step": 54
},
{
"epoch": 0.05,
"grad_norm": 2.0500405783991043,
"learning_rate": 4.9950580421619e-06,
"loss": 0.6353,
"step": 55
},
{
"epoch": 0.05,
"grad_norm": 2.199629904296075,
"learning_rate": 4.9948360356044965e-06,
"loss": 0.6122,
"step": 56
},
{
"epoch": 0.05,
"grad_norm": 2.186847580161491,
"learning_rate": 4.994609156578267e-06,
"loss": 0.6073,
"step": 57
},
{
"epoch": 0.05,
"grad_norm": 2.0207512037097835,
"learning_rate": 4.994377405526308e-06,
"loss": 0.61,
"step": 58
},
{
"epoch": 0.05,
"grad_norm": 2.3170193964114976,
"learning_rate": 4.994140782901237e-06,
"loss": 0.6322,
"step": 59
},
{
"epoch": 0.05,
"grad_norm": 2.014785890436746,
"learning_rate": 4.9938992891651825e-06,
"loss": 0.6205,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 1.9538385063221935,
"learning_rate": 4.9936529247897854e-06,
"loss": 0.5992,
"step": 61
},
{
"epoch": 0.05,
"grad_norm": 2.084943826856202,
"learning_rate": 4.993401690256203e-06,
"loss": 0.6148,
"step": 62
},
{
"epoch": 0.06,
"grad_norm": 2.135158856581583,
"learning_rate": 4.9931455860551e-06,
"loss": 0.5937,
"step": 63
},
{
"epoch": 0.06,
"grad_norm": 1.982621418518698,
"learning_rate": 4.992884612686655e-06,
"loss": 0.6091,
"step": 64
},
{
"epoch": 0.06,
"grad_norm": 2.1030931953494956,
"learning_rate": 4.992618770660553e-06,
"loss": 0.6034,
"step": 65
},
{
"epoch": 0.06,
"grad_norm": 2.1994634556563994,
"learning_rate": 4.992348060495989e-06,
"loss": 0.5846,
"step": 66
},
{
"epoch": 0.06,
"grad_norm": 2.410691403277427,
"learning_rate": 4.992072482721669e-06,
"loss": 0.6294,
"step": 67
},
{
"epoch": 0.06,
"grad_norm": 1.9720494401999067,
"learning_rate": 4.991792037875799e-06,
"loss": 0.591,
"step": 68
},
{
"epoch": 0.06,
"grad_norm": 2.147504025949435,
"learning_rate": 4.991506726506094e-06,
"loss": 0.5689,
"step": 69
},
{
"epoch": 0.06,
"grad_norm": 2.1837702519904223,
"learning_rate": 4.991216549169776e-06,
"loss": 0.6422,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 2.0883865330274958,
"learning_rate": 4.9909215064335655e-06,
"loss": 0.6076,
"step": 71
},
{
"epoch": 0.06,
"grad_norm": 2.20727863923846,
"learning_rate": 4.990621598873687e-06,
"loss": 0.5974,
"step": 72
},
{
"epoch": 0.06,
"grad_norm": 2.0735330806418464,
"learning_rate": 4.990316827075868e-06,
"loss": 0.6809,
"step": 73
},
{
"epoch": 0.07,
"grad_norm": 2.0203203347538774,
"learning_rate": 4.990007191635334e-06,
"loss": 0.6107,
"step": 74
},
{
"epoch": 0.07,
"grad_norm": 2.234889365362174,
"learning_rate": 4.989692693156809e-06,
"loss": 0.6218,
"step": 75
},
{
"epoch": 0.07,
"grad_norm": 1.9902503343433904,
"learning_rate": 4.989373332254516e-06,
"loss": 0.6257,
"step": 76
},
{
"epoch": 0.07,
"grad_norm": 2.1041971507252466,
"learning_rate": 4.989049109552173e-06,
"loss": 0.5888,
"step": 77
},
{
"epoch": 0.07,
"grad_norm": 2.1151685783302123,
"learning_rate": 4.988720025682995e-06,
"loss": 0.6333,
"step": 78
},
{
"epoch": 0.07,
"grad_norm": 1.9223819269893592,
"learning_rate": 4.988386081289689e-06,
"loss": 0.6442,
"step": 79
},
{
"epoch": 0.07,
"grad_norm": 2.139676463756265,
"learning_rate": 4.988047277024456e-06,
"loss": 0.5966,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 2.1665820212993068,
"learning_rate": 4.987703613548988e-06,
"loss": 0.603,
"step": 81
},
{
"epoch": 0.07,
"grad_norm": 1.931456975470041,
"learning_rate": 4.987355091534467e-06,
"loss": 0.6122,
"step": 82
},
{
"epoch": 0.07,
"grad_norm": 2.134995092135601,
"learning_rate": 4.987001711661566e-06,
"loss": 0.6213,
"step": 83
},
{
"epoch": 0.07,
"grad_norm": 2.0173352657570818,
"learning_rate": 4.98664347462044e-06,
"loss": 0.5966,
"step": 84
},
{
"epoch": 0.08,
"grad_norm": 2.0816939924571183,
"learning_rate": 4.986280381110737e-06,
"loss": 0.5575,
"step": 85
},
{
"epoch": 0.08,
"grad_norm": 2.0072477771163357,
"learning_rate": 4.985912431841584e-06,
"loss": 0.6225,
"step": 86
},
{
"epoch": 0.08,
"grad_norm": 2.1895945454214507,
"learning_rate": 4.985539627531596e-06,
"loss": 0.6169,
"step": 87
},
{
"epoch": 0.08,
"grad_norm": 2.84518214074801,
"learning_rate": 4.985161968908866e-06,
"loss": 0.6317,
"step": 88
},
{
"epoch": 0.08,
"grad_norm": 2.194209857089938,
"learning_rate": 4.984779456710971e-06,
"loss": 0.6205,
"step": 89
},
{
"epoch": 0.08,
"grad_norm": 2.1604595364123083,
"learning_rate": 4.9843920916849645e-06,
"loss": 0.6176,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 2.039087518829079,
"learning_rate": 4.9839998745873795e-06,
"loss": 0.5842,
"step": 91
},
{
"epoch": 0.08,
"grad_norm": 2.0148570016863334,
"learning_rate": 4.983602806184225e-06,
"loss": 0.5936,
"step": 92
},
{
"epoch": 0.08,
"grad_norm": 2.073137159272384,
"learning_rate": 4.983200887250982e-06,
"loss": 0.6317,
"step": 93
},
{
"epoch": 0.08,
"grad_norm": 2.045469602089007,
"learning_rate": 4.9827941185726095e-06,
"loss": 0.5338,
"step": 94
},
{
"epoch": 0.08,
"grad_norm": 2.1201743116757417,
"learning_rate": 4.982382500943533e-06,
"loss": 0.6133,
"step": 95
},
{
"epoch": 0.09,
"grad_norm": 2.0637214917996363,
"learning_rate": 4.981966035167654e-06,
"loss": 0.6483,
"step": 96
},
{
"epoch": 0.09,
"grad_norm": 2.155574452675582,
"learning_rate": 4.981544722058336e-06,
"loss": 0.6001,
"step": 97
},
{
"epoch": 0.09,
"grad_norm": 1.9347601392775928,
"learning_rate": 4.981118562438414e-06,
"loss": 0.5954,
"step": 98
},
{
"epoch": 0.09,
"grad_norm": 2.3054537863874756,
"learning_rate": 4.980687557140187e-06,
"loss": 0.6338,
"step": 99
},
{
"epoch": 0.09,
"grad_norm": 2.0421104909837338,
"learning_rate": 4.980251707005417e-06,
"loss": 0.6166,
"step": 100
},
{
"epoch": 0.09,
"grad_norm": 2.023167301994367,
"learning_rate": 4.979811012885329e-06,
"loss": 0.5682,
"step": 101
},
{
"epoch": 0.09,
"grad_norm": 2.0583654213007967,
"learning_rate": 4.979365475640609e-06,
"loss": 0.5759,
"step": 102
},
{
"epoch": 0.09,
"grad_norm": 2.008917223929121,
"learning_rate": 4.9789150961414e-06,
"loss": 0.6324,
"step": 103
},
{
"epoch": 0.09,
"grad_norm": 2.1111479338304306,
"learning_rate": 4.978459875267303e-06,
"loss": 0.5821,
"step": 104
},
{
"epoch": 0.09,
"grad_norm": 2.400366962461983,
"learning_rate": 4.977999813907375e-06,
"loss": 0.5699,
"step": 105
},
{
"epoch": 0.09,
"grad_norm": 2.090668061316384,
"learning_rate": 4.977534912960124e-06,
"loss": 0.5754,
"step": 106
},
{
"epoch": 0.09,
"grad_norm": 2.2103419288491466,
"learning_rate": 4.977065173333515e-06,
"loss": 0.6005,
"step": 107
},
{
"epoch": 0.1,
"grad_norm": 2.1332380447628294,
"learning_rate": 4.9765905959449565e-06,
"loss": 0.6178,
"step": 108
},
{
"epoch": 0.1,
"grad_norm": 2.1372224949542464,
"learning_rate": 4.976111181721309e-06,
"loss": 0.6021,
"step": 109
},
{
"epoch": 0.1,
"grad_norm": 2.636052326949506,
"learning_rate": 4.97562693159888e-06,
"loss": 0.6418,
"step": 110
},
{
"epoch": 0.1,
"grad_norm": 2.1234423477493443,
"learning_rate": 4.975137846523419e-06,
"loss": 0.6231,
"step": 111
},
{
"epoch": 0.1,
"grad_norm": 2.2817790529425315,
"learning_rate": 4.974643927450121e-06,
"loss": 0.5681,
"step": 112
},
{
"epoch": 0.1,
"grad_norm": 2.2605060344304713,
"learning_rate": 4.9741451753436205e-06,
"loss": 0.5803,
"step": 113
},
{
"epoch": 0.1,
"grad_norm": 2.0355236974665876,
"learning_rate": 4.973641591177991e-06,
"loss": 0.6003,
"step": 114
},
{
"epoch": 0.1,
"grad_norm": 2.4343221170301415,
"learning_rate": 4.973133175936743e-06,
"loss": 0.5882,
"step": 115
},
{
"epoch": 0.1,
"grad_norm": 2.2135760843199734,
"learning_rate": 4.972619930612822e-06,
"loss": 0.5886,
"step": 116
},
{
"epoch": 0.1,
"grad_norm": 2.161909448676307,
"learning_rate": 4.972101856208609e-06,
"loss": 0.5792,
"step": 117
},
{
"epoch": 0.1,
"grad_norm": 2.0871148781401927,
"learning_rate": 4.9715789537359126e-06,
"loss": 0.6383,
"step": 118
},
{
"epoch": 0.11,
"grad_norm": 2.1159018206478626,
"learning_rate": 4.971051224215973e-06,
"loss": 0.5865,
"step": 119
},
{
"epoch": 0.11,
"grad_norm": 2.2036428070670375,
"learning_rate": 4.970518668679459e-06,
"loss": 0.5905,
"step": 120
},
{
"epoch": 0.11,
"grad_norm": 2.22262007661876,
"learning_rate": 4.969981288166461e-06,
"loss": 0.5951,
"step": 121
},
{
"epoch": 0.11,
"grad_norm": 2.0713458839382786,
"learning_rate": 4.969439083726496e-06,
"loss": 0.6011,
"step": 122
},
{
"epoch": 0.11,
"grad_norm": 2.0686060725186897,
"learning_rate": 4.9688920564185e-06,
"loss": 0.6038,
"step": 123
},
{
"epoch": 0.11,
"grad_norm": 2.1825376161159964,
"learning_rate": 4.968340207310832e-06,
"loss": 0.6098,
"step": 124
},
{
"epoch": 0.11,
"grad_norm": 2.142436541976576,
"learning_rate": 4.967783537481262e-06,
"loss": 0.6119,
"step": 125
},
{
"epoch": 0.11,
"grad_norm": 2.330044622755397,
"learning_rate": 4.967222048016979e-06,
"loss": 0.6057,
"step": 126
},
{
"epoch": 0.11,
"grad_norm": 2.109116942854107,
"learning_rate": 4.966655740014585e-06,
"loss": 0.5958,
"step": 127
},
{
"epoch": 0.11,
"grad_norm": 2.174219068914296,
"learning_rate": 4.9660846145800914e-06,
"loss": 0.6276,
"step": 128
},
{
"epoch": 0.11,
"grad_norm": 2.135736248304593,
"learning_rate": 4.965508672828918e-06,
"loss": 0.6309,
"step": 129
},
{
"epoch": 0.12,
"grad_norm": 2.2339234058672885,
"learning_rate": 4.964927915885893e-06,
"loss": 0.5879,
"step": 130
},
{
"epoch": 0.12,
"grad_norm": 2.0960660335616224,
"learning_rate": 4.9643423448852455e-06,
"loss": 0.6218,
"step": 131
},
{
"epoch": 0.12,
"grad_norm": 1.9468729925472703,
"learning_rate": 4.963751960970609e-06,
"loss": 0.5998,
"step": 132
},
{
"epoch": 0.12,
"grad_norm": 2.1623168252289915,
"learning_rate": 4.9631567652950164e-06,
"loss": 0.6885,
"step": 133
},
{
"epoch": 0.12,
"grad_norm": 2.084420579583794,
"learning_rate": 4.962556759020898e-06,
"loss": 0.5758,
"step": 134
},
{
"epoch": 0.12,
"grad_norm": 2.1082890389844713,
"learning_rate": 4.961951943320078e-06,
"loss": 0.6116,
"step": 135
},
{
"epoch": 0.12,
"grad_norm": 2.006123424806457,
"learning_rate": 4.9613423193737754e-06,
"loss": 0.5708,
"step": 136
},
{
"epoch": 0.12,
"grad_norm": 2.309431970929405,
"learning_rate": 4.960727888372599e-06,
"loss": 0.621,
"step": 137
},
{
"epoch": 0.12,
"grad_norm": 2.226488524758773,
"learning_rate": 4.9601086515165456e-06,
"loss": 0.5896,
"step": 138
},
{
"epoch": 0.12,
"grad_norm": 2.1242070778655253,
"learning_rate": 4.959484610014997e-06,
"loss": 0.624,
"step": 139
},
{
"epoch": 0.12,
"grad_norm": 2.2147491445730516,
"learning_rate": 4.958855765086722e-06,
"loss": 0.6064,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 2.1818004600393,
"learning_rate": 4.958222117959868e-06,
"loss": 0.6252,
"step": 141
},
{
"epoch": 0.13,
"grad_norm": 2.1094535889409696,
"learning_rate": 4.95758366987196e-06,
"loss": 0.5779,
"step": 142
},
{
"epoch": 0.13,
"grad_norm": 2.2043056809252577,
"learning_rate": 4.9569404220699025e-06,
"loss": 0.6156,
"step": 143
},
{
"epoch": 0.13,
"grad_norm": 2.158056342799238,
"learning_rate": 4.956292375809971e-06,
"loss": 0.5662,
"step": 144
},
{
"epoch": 0.13,
"grad_norm": 1.987581635345228,
"learning_rate": 4.955639532357815e-06,
"loss": 0.6148,
"step": 145
},
{
"epoch": 0.13,
"grad_norm": 2.266145451051948,
"learning_rate": 4.954981892988451e-06,
"loss": 0.5867,
"step": 146
},
{
"epoch": 0.13,
"grad_norm": 2.071082600205798,
"learning_rate": 4.954319458986264e-06,
"loss": 0.5976,
"step": 147
},
{
"epoch": 0.13,
"grad_norm": 2.1615342548575374,
"learning_rate": 4.953652231645002e-06,
"loss": 0.5643,
"step": 148
},
{
"epoch": 0.13,
"grad_norm": 2.145126231371731,
"learning_rate": 4.952980212267773e-06,
"loss": 0.5592,
"step": 149
},
{
"epoch": 0.13,
"grad_norm": 1.9161750244434461,
"learning_rate": 4.952303402167047e-06,
"loss": 0.5547,
"step": 150
},
{
"epoch": 0.13,
"grad_norm": 2.234370958372018,
"learning_rate": 4.9516218026646475e-06,
"loss": 0.578,
"step": 151
},
{
"epoch": 0.13,
"grad_norm": 2.149553338429868,
"learning_rate": 4.950935415091753e-06,
"loss": 0.5952,
"step": 152
},
{
"epoch": 0.14,
"grad_norm": 2.1021801657048016,
"learning_rate": 4.950244240788895e-06,
"loss": 0.573,
"step": 153
},
{
"epoch": 0.14,
"grad_norm": 2.488711367210497,
"learning_rate": 4.949548281105951e-06,
"loss": 0.5776,
"step": 154
},
{
"epoch": 0.14,
"grad_norm": 2.0302393290147167,
"learning_rate": 4.948847537402145e-06,
"loss": 0.5685,
"step": 155
},
{
"epoch": 0.14,
"grad_norm": 2.1563261797248043,
"learning_rate": 4.948142011046044e-06,
"loss": 0.6185,
"step": 156
},
{
"epoch": 0.14,
"grad_norm": 2.1308303224609997,
"learning_rate": 4.947431703415558e-06,
"loss": 0.6229,
"step": 157
},
{
"epoch": 0.14,
"grad_norm": 2.0988414912992273,
"learning_rate": 4.946716615897932e-06,
"loss": 0.6167,
"step": 158
},
{
"epoch": 0.14,
"grad_norm": 2.3558302474583095,
"learning_rate": 4.9459967498897485e-06,
"loss": 0.5903,
"step": 159
},
{
"epoch": 0.14,
"grad_norm": 2.1505555405055223,
"learning_rate": 4.945272106796919e-06,
"loss": 0.5709,
"step": 160
},
{
"epoch": 0.14,
"grad_norm": 2.0604140956574635,
"learning_rate": 4.94454268803469e-06,
"loss": 0.635,
"step": 161
},
{
"epoch": 0.14,
"grad_norm": 2.3699836246614696,
"learning_rate": 4.943808495027631e-06,
"loss": 0.581,
"step": 162
},
{
"epoch": 0.14,
"grad_norm": 1.9809907136859368,
"learning_rate": 4.9430695292096365e-06,
"loss": 0.5703,
"step": 163
},
{
"epoch": 0.15,
"grad_norm": 2.213101907296851,
"learning_rate": 4.942325792023922e-06,
"loss": 0.5915,
"step": 164
},
{
"epoch": 0.15,
"grad_norm": 2.3778783149383944,
"learning_rate": 4.941577284923025e-06,
"loss": 0.537,
"step": 165
},
{
"epoch": 0.15,
"grad_norm": 1.9283694807512721,
"learning_rate": 4.9408240093687934e-06,
"loss": 0.579,
"step": 166
},
{
"epoch": 0.15,
"grad_norm": 2.083087334039033,
"learning_rate": 4.940065966832392e-06,
"loss": 0.5612,
"step": 167
},
{
"epoch": 0.15,
"grad_norm": 2.314684793845775,
"learning_rate": 4.939303158794294e-06,
"loss": 0.6001,
"step": 168
},
{
"epoch": 0.15,
"grad_norm": 2.131977461745334,
"learning_rate": 4.93853558674428e-06,
"loss": 0.5809,
"step": 169
},
{
"epoch": 0.15,
"grad_norm": 2.1291924932946755,
"learning_rate": 4.937763252181434e-06,
"loss": 0.6216,
"step": 170
},
{
"epoch": 0.15,
"grad_norm": 1.9366549866764742,
"learning_rate": 4.936986156614144e-06,
"loss": 0.5888,
"step": 171
},
{
"epoch": 0.15,
"grad_norm": 2.231889540095555,
"learning_rate": 4.9362043015600934e-06,
"loss": 0.6437,
"step": 172
},
{
"epoch": 0.15,
"grad_norm": 2.0696023557568233,
"learning_rate": 4.9354176885462626e-06,
"loss": 0.5951,
"step": 173
},
{
"epoch": 0.15,
"grad_norm": 2.10974806039572,
"learning_rate": 4.934626319108923e-06,
"loss": 0.5817,
"step": 174
},
{
"epoch": 0.16,
"grad_norm": 2.0633698321381946,
"learning_rate": 4.933830194793636e-06,
"loss": 0.5692,
"step": 175
},
{
"epoch": 0.16,
"grad_norm": 2.0163693967733423,
"learning_rate": 4.933029317155251e-06,
"loss": 0.5322,
"step": 176
},
{
"epoch": 0.16,
"grad_norm": 2.1118176135699813,
"learning_rate": 4.932223687757899e-06,
"loss": 0.5809,
"step": 177
},
{
"epoch": 0.16,
"grad_norm": 2.181431947183138,
"learning_rate": 4.9314133081749906e-06,
"loss": 0.5444,
"step": 178
},
{
"epoch": 0.16,
"grad_norm": 2.2055197469621386,
"learning_rate": 4.930598179989215e-06,
"loss": 0.6063,
"step": 179
},
{
"epoch": 0.16,
"grad_norm": 2.1103699877035638,
"learning_rate": 4.929778304792537e-06,
"loss": 0.5908,
"step": 180
},
{
"epoch": 0.16,
"grad_norm": 2.01692648335164,
"learning_rate": 4.928953684186189e-06,
"loss": 0.5729,
"step": 181
},
{
"epoch": 0.16,
"grad_norm": 1.990744003423107,
"learning_rate": 4.928124319780673e-06,
"loss": 0.5935,
"step": 182
},
{
"epoch": 0.16,
"grad_norm": 1.9898687560952446,
"learning_rate": 4.9272902131957555e-06,
"loss": 0.6008,
"step": 183
},
{
"epoch": 0.16,
"grad_norm": 1.9499116832570582,
"learning_rate": 4.926451366060465e-06,
"loss": 0.5731,
"step": 184
},
{
"epoch": 0.16,
"grad_norm": 1.8933258467243923,
"learning_rate": 4.925607780013088e-06,
"loss": 0.5822,
"step": 185
},
{
"epoch": 0.16,
"grad_norm": 1.9711936623837691,
"learning_rate": 4.924759456701167e-06,
"loss": 0.5433,
"step": 186
},
{
"epoch": 0.17,
"grad_norm": 1.9981254191144715,
"learning_rate": 4.923906397781495e-06,
"loss": 0.5603,
"step": 187
},
{
"epoch": 0.17,
"grad_norm": 1.9489584101682442,
"learning_rate": 4.923048604920115e-06,
"loss": 0.592,
"step": 188
},
{
"epoch": 0.17,
"grad_norm": 2.14587896098926,
"learning_rate": 4.922186079792315e-06,
"loss": 0.5861,
"step": 189
},
{
"epoch": 0.17,
"grad_norm": 2.093505234897306,
"learning_rate": 4.921318824082625e-06,
"loss": 0.5756,
"step": 190
},
{
"epoch": 0.17,
"grad_norm": 1.9726924068956073,
"learning_rate": 4.920446839484814e-06,
"loss": 0.5954,
"step": 191
},
{
"epoch": 0.17,
"grad_norm": 2.0009011296035886,
"learning_rate": 4.919570127701888e-06,
"loss": 0.5185,
"step": 192
},
{
"epoch": 0.17,
"grad_norm": 2.0801246171281993,
"learning_rate": 4.9186886904460826e-06,
"loss": 0.5788,
"step": 193
},
{
"epoch": 0.17,
"grad_norm": 2.7712602468155096,
"learning_rate": 4.917802529438865e-06,
"loss": 0.6637,
"step": 194
},
{
"epoch": 0.17,
"grad_norm": 1.9721040372060654,
"learning_rate": 4.916911646410926e-06,
"loss": 0.5926,
"step": 195
},
{
"epoch": 0.17,
"grad_norm": 2.1199089061376855,
"learning_rate": 4.91601604310218e-06,
"loss": 0.5854,
"step": 196
},
{
"epoch": 0.17,
"grad_norm": 1.9518281461372036,
"learning_rate": 4.915115721261759e-06,
"loss": 0.5456,
"step": 197
},
{
"epoch": 0.18,
"grad_norm": 2.1537515435847734,
"learning_rate": 4.9142106826480114e-06,
"loss": 0.6152,
"step": 198
},
{
"epoch": 0.18,
"grad_norm": 2.3461320565666344,
"learning_rate": 4.913300929028498e-06,
"loss": 0.617,
"step": 199
},
{
"epoch": 0.18,
"grad_norm": 1.9789785575462193,
"learning_rate": 4.912386462179987e-06,
"loss": 0.5845,
"step": 200
},
{
"epoch": 0.18,
"grad_norm": 2.0705337307209253,
"learning_rate": 4.9114672838884515e-06,
"loss": 0.6062,
"step": 201
},
{
"epoch": 0.18,
"grad_norm": 1.9972918925367322,
"learning_rate": 4.910543395949066e-06,
"loss": 0.6318,
"step": 202
},
{
"epoch": 0.18,
"grad_norm": 2.03173534028091,
"learning_rate": 4.9096148001662055e-06,
"loss": 0.64,
"step": 203
},
{
"epoch": 0.18,
"grad_norm": 2.0861416304602356,
"learning_rate": 4.908681498353436e-06,
"loss": 0.5859,
"step": 204
},
{
"epoch": 0.18,
"grad_norm": 1.932510611788884,
"learning_rate": 4.907743492333517e-06,
"loss": 0.5483,
"step": 205
},
{
"epoch": 0.18,
"grad_norm": 1.9618471764126828,
"learning_rate": 4.906800783938395e-06,
"loss": 0.5767,
"step": 206
},
{
"epoch": 0.18,
"grad_norm": 2.3557796360921786,
"learning_rate": 4.905853375009198e-06,
"loss": 0.5934,
"step": 207
},
{
"epoch": 0.18,
"grad_norm": 2.0993364379712784,
"learning_rate": 4.9049012673962385e-06,
"loss": 0.5879,
"step": 208
},
{
"epoch": 0.19,
"grad_norm": 2.2015612636555155,
"learning_rate": 4.903944462959001e-06,
"loss": 0.5598,
"step": 209
},
{
"epoch": 0.19,
"grad_norm": 2.0374544745406062,
"learning_rate": 4.902982963566147e-06,
"loss": 0.577,
"step": 210
},
{
"epoch": 0.19,
"grad_norm": 2.194866218807,
"learning_rate": 4.902016771095506e-06,
"loss": 0.5848,
"step": 211
},
{
"epoch": 0.19,
"grad_norm": 2.2545375351308614,
"learning_rate": 4.901045887434072e-06,
"loss": 0.5846,
"step": 212
},
{
"epoch": 0.19,
"grad_norm": 2.017012770131601,
"learning_rate": 4.900070314478001e-06,
"loss": 0.5651,
"step": 213
},
{
"epoch": 0.19,
"grad_norm": 2.150900326654639,
"learning_rate": 4.899090054132609e-06,
"loss": 0.568,
"step": 214
},
{
"epoch": 0.19,
"grad_norm": 2.0404886979870454,
"learning_rate": 4.898105108312366e-06,
"loss": 0.5277,
"step": 215
},
{
"epoch": 0.19,
"grad_norm": 2.036614254190257,
"learning_rate": 4.897115478940892e-06,
"loss": 0.5754,
"step": 216
},
{
"epoch": 0.19,
"grad_norm": 2.041133008809928,
"learning_rate": 4.896121167950954e-06,
"loss": 0.6294,
"step": 217
},
{
"epoch": 0.19,
"grad_norm": 2.0029503409054885,
"learning_rate": 4.895122177284465e-06,
"loss": 0.5531,
"step": 218
},
{
"epoch": 0.19,
"grad_norm": 2.0303439698174754,
"learning_rate": 4.894118508892474e-06,
"loss": 0.6008,
"step": 219
},
{
"epoch": 0.19,
"grad_norm": 1.899982778272908,
"learning_rate": 4.893110164735167e-06,
"loss": 0.6076,
"step": 220
},
{
"epoch": 0.2,
"grad_norm": 2.170640326694132,
"learning_rate": 4.892097146781862e-06,
"loss": 0.5806,
"step": 221
},
{
"epoch": 0.2,
"grad_norm": 1.961802557992624,
"learning_rate": 4.8910794570110055e-06,
"loss": 0.5456,
"step": 222
},
{
"epoch": 0.2,
"grad_norm": 2.1149182672715807,
"learning_rate": 4.890057097410167e-06,
"loss": 0.5683,
"step": 223
},
{
"epoch": 0.2,
"grad_norm": 1.9988574008443096,
"learning_rate": 4.889030069976038e-06,
"loss": 0.5603,
"step": 224
},
{
"epoch": 0.2,
"grad_norm": 2.137840782586502,
"learning_rate": 4.887998376714424e-06,
"loss": 0.5713,
"step": 225
},
{
"epoch": 0.2,
"grad_norm": 2.2956357234771634,
"learning_rate": 4.886962019640244e-06,
"loss": 0.5635,
"step": 226
},
{
"epoch": 0.2,
"grad_norm": 2.2175517801056346,
"learning_rate": 4.885921000777528e-06,
"loss": 0.631,
"step": 227
},
{
"epoch": 0.2,
"grad_norm": 2.0861966792656546,
"learning_rate": 4.884875322159407e-06,
"loss": 0.5521,
"step": 228
},
{
"epoch": 0.2,
"grad_norm": 2.170862650134145,
"learning_rate": 4.883824985828114e-06,
"loss": 0.5953,
"step": 229
},
{
"epoch": 0.2,
"grad_norm": 2.016871028914906,
"learning_rate": 4.882769993834978e-06,
"loss": 0.5745,
"step": 230
},
{
"epoch": 0.2,
"grad_norm": 2.4069309610367107,
"learning_rate": 4.8817103482404236e-06,
"loss": 0.5752,
"step": 231
},
{
"epoch": 0.21,
"grad_norm": 1.9834780557891722,
"learning_rate": 4.880646051113959e-06,
"loss": 0.5619,
"step": 232
},
{
"epoch": 0.21,
"grad_norm": 2.1221686040256005,
"learning_rate": 4.87957710453418e-06,
"loss": 0.561,
"step": 233
},
{
"epoch": 0.21,
"grad_norm": 2.1497751964139002,
"learning_rate": 4.878503510588764e-06,
"loss": 0.5754,
"step": 234
},
{
"epoch": 0.21,
"grad_norm": 1.8535318318419167,
"learning_rate": 4.877425271374462e-06,
"loss": 0.5551,
"step": 235
},
{
"epoch": 0.21,
"grad_norm": 2.1537345489224404,
"learning_rate": 4.876342388997099e-06,
"loss": 0.544,
"step": 236
},
{
"epoch": 0.21,
"grad_norm": 1.9695512744073471,
"learning_rate": 4.875254865571567e-06,
"loss": 0.6003,
"step": 237
},
{
"epoch": 0.21,
"grad_norm": 2.2550853928957193,
"learning_rate": 4.874162703221823e-06,
"loss": 0.5968,
"step": 238
},
{
"epoch": 0.21,
"grad_norm": 2.0658630166795917,
"learning_rate": 4.873065904080884e-06,
"loss": 0.5658,
"step": 239
},
{
"epoch": 0.21,
"grad_norm": 2.0821280326495524,
"learning_rate": 4.871964470290823e-06,
"loss": 0.5711,
"step": 240
},
{
"epoch": 0.21,
"grad_norm": 1.9833074137024158,
"learning_rate": 4.8708584040027636e-06,
"loss": 0.5899,
"step": 241
},
{
"epoch": 0.21,
"grad_norm": 2.0288963441502195,
"learning_rate": 4.869747707376877e-06,
"loss": 0.5601,
"step": 242
},
{
"epoch": 0.22,
"grad_norm": 2.0970435875726463,
"learning_rate": 4.868632382582378e-06,
"loss": 0.6381,
"step": 243
},
{
"epoch": 0.22,
"grad_norm": 2.1303280408644194,
"learning_rate": 4.86751243179752e-06,
"loss": 0.5495,
"step": 244
},
{
"epoch": 0.22,
"grad_norm": 2.0851781018580584,
"learning_rate": 4.866387857209591e-06,
"loss": 0.5901,
"step": 245
},
{
"epoch": 0.22,
"grad_norm": 1.8310760160854438,
"learning_rate": 4.86525866101491e-06,
"loss": 0.5513,
"step": 246
},
{
"epoch": 0.22,
"grad_norm": 2.199726167537497,
"learning_rate": 4.8641248454188205e-06,
"loss": 0.5873,
"step": 247
},
{
"epoch": 0.22,
"grad_norm": 1.9776691221978735,
"learning_rate": 4.862986412635691e-06,
"loss": 0.6143,
"step": 248
},
{
"epoch": 0.22,
"grad_norm": 2.0663231641830873,
"learning_rate": 4.8618433648889034e-06,
"loss": 0.5937,
"step": 249
},
{
"epoch": 0.22,
"grad_norm": 2.170520506577784,
"learning_rate": 4.860695704410856e-06,
"loss": 0.5374,
"step": 250
},
{
"epoch": 0.22,
"grad_norm": 1.9685756224067419,
"learning_rate": 4.8595434334429535e-06,
"loss": 0.5139,
"step": 251
},
{
"epoch": 0.22,
"grad_norm": 1.9668205539999677,
"learning_rate": 4.8583865542356065e-06,
"loss": 0.5459,
"step": 252
},
{
"epoch": 0.22,
"grad_norm": 2.0793578279258704,
"learning_rate": 4.857225069048226e-06,
"loss": 0.593,
"step": 253
},
{
"epoch": 0.22,
"grad_norm": 1.9265474492849337,
"learning_rate": 4.8560589801492165e-06,
"loss": 0.5559,
"step": 254
},
{
"epoch": 0.23,
"grad_norm": 2.8555278122830696,
"learning_rate": 4.854888289815976e-06,
"loss": 0.5949,
"step": 255
},
{
"epoch": 0.23,
"grad_norm": 2.063838630196542,
"learning_rate": 4.853713000334887e-06,
"loss": 0.5712,
"step": 256
},
{
"epoch": 0.23,
"grad_norm": 2.168668910730517,
"learning_rate": 4.852533114001316e-06,
"loss": 0.5475,
"step": 257
},
{
"epoch": 0.23,
"grad_norm": 2.064042820960706,
"learning_rate": 4.8513486331196055e-06,
"loss": 0.5616,
"step": 258
},
{
"epoch": 0.23,
"grad_norm": 2.026751060346143,
"learning_rate": 4.850159560003074e-06,
"loss": 0.5997,
"step": 259
},
{
"epoch": 0.23,
"grad_norm": 2.1228129299875254,
"learning_rate": 4.848965896974006e-06,
"loss": 0.5622,
"step": 260
},
{
"epoch": 0.23,
"grad_norm": 1.9418510365881214,
"learning_rate": 4.847767646363652e-06,
"loss": 0.5741,
"step": 261
},
{
"epoch": 0.23,
"grad_norm": 2.070611833895483,
"learning_rate": 4.846564810512221e-06,
"loss": 0.5729,
"step": 262
},
{
"epoch": 0.23,
"grad_norm": 1.8833621440375596,
"learning_rate": 4.845357391768877e-06,
"loss": 0.5503,
"step": 263
},
{
"epoch": 0.23,
"grad_norm": 2.1022924907055387,
"learning_rate": 4.844145392491735e-06,
"loss": 0.6204,
"step": 264
},
{
"epoch": 0.23,
"grad_norm": 2.024625007813473,
"learning_rate": 4.842928815047856e-06,
"loss": 0.5776,
"step": 265
},
{
"epoch": 0.24,
"grad_norm": 1.9123739071371275,
"learning_rate": 4.8417076618132434e-06,
"loss": 0.5417,
"step": 266
},
{
"epoch": 0.24,
"grad_norm": 2.062879186086598,
"learning_rate": 4.8404819351728336e-06,
"loss": 0.5387,
"step": 267
},
{
"epoch": 0.24,
"grad_norm": 1.9944627549250884,
"learning_rate": 4.8392516375204986e-06,
"loss": 0.5731,
"step": 268
},
{
"epoch": 0.24,
"grad_norm": 1.9859912626846585,
"learning_rate": 4.838016771259037e-06,
"loss": 0.5969,
"step": 269
},
{
"epoch": 0.24,
"grad_norm": 2.043069520519082,
"learning_rate": 4.836777338800168e-06,
"loss": 0.6217,
"step": 270
},
{
"epoch": 0.24,
"grad_norm": 1.913212451622778,
"learning_rate": 4.835533342564531e-06,
"loss": 0.5527,
"step": 271
},
{
"epoch": 0.24,
"grad_norm": 1.978858281238778,
"learning_rate": 4.834284784981678e-06,
"loss": 0.5997,
"step": 272
},
{
"epoch": 0.24,
"grad_norm": 2.004628826916504,
"learning_rate": 4.833031668490067e-06,
"loss": 0.551,
"step": 273
},
{
"epoch": 0.24,
"grad_norm": 2.164370107566024,
"learning_rate": 4.8317739955370645e-06,
"loss": 0.5537,
"step": 274
},
{
"epoch": 0.24,
"grad_norm": 1.891772326146366,
"learning_rate": 4.83051176857893e-06,
"loss": 0.6075,
"step": 275
},
{
"epoch": 0.24,
"grad_norm": 2.0553128913886645,
"learning_rate": 4.8292449900808216e-06,
"loss": 0.5854,
"step": 276
},
{
"epoch": 0.25,
"grad_norm": 2.009000622167072,
"learning_rate": 4.827973662516786e-06,
"loss": 0.5503,
"step": 277
},
{
"epoch": 0.25,
"grad_norm": 1.9385043396652537,
"learning_rate": 4.826697788369752e-06,
"loss": 0.5704,
"step": 278
},
{
"epoch": 0.25,
"grad_norm": 2.3263786060073826,
"learning_rate": 4.8254173701315295e-06,
"loss": 0.5604,
"step": 279
},
{
"epoch": 0.25,
"grad_norm": 1.9251504140774536,
"learning_rate": 4.8241324103028055e-06,
"loss": 0.5647,
"step": 280
},
{
"epoch": 0.25,
"grad_norm": 1.9714117964729747,
"learning_rate": 4.822842911393131e-06,
"loss": 0.604,
"step": 281
},
{
"epoch": 0.25,
"grad_norm": 2.034372279161665,
"learning_rate": 4.821548875920927e-06,
"loss": 0.5803,
"step": 282
},
{
"epoch": 0.25,
"grad_norm": 1.9849114644945505,
"learning_rate": 4.8202503064134725e-06,
"loss": 0.5854,
"step": 283
},
{
"epoch": 0.25,
"grad_norm": 2.3435998455971343,
"learning_rate": 4.818947205406902e-06,
"loss": 0.4988,
"step": 284
},
{
"epoch": 0.25,
"grad_norm": 2.0672779732760924,
"learning_rate": 4.8176395754462e-06,
"loss": 0.5734,
"step": 285
},
{
"epoch": 0.25,
"grad_norm": 2.1206384205127544,
"learning_rate": 4.816327419085197e-06,
"loss": 0.563,
"step": 286
},
{
"epoch": 0.25,
"grad_norm": 2.1105254841893095,
"learning_rate": 4.815010738886561e-06,
"loss": 0.5765,
"step": 287
},
{
"epoch": 0.26,
"grad_norm": 2.072546090747287,
"learning_rate": 4.813689537421798e-06,
"loss": 0.6003,
"step": 288
},
{
"epoch": 0.26,
"grad_norm": 2.1131138426394442,
"learning_rate": 4.812363817271243e-06,
"loss": 0.6097,
"step": 289
},
{
"epoch": 0.26,
"grad_norm": 1.9218545344238502,
"learning_rate": 4.811033581024056e-06,
"loss": 0.6272,
"step": 290
},
{
"epoch": 0.26,
"grad_norm": 2.235420687671868,
"learning_rate": 4.809698831278217e-06,
"loss": 0.5519,
"step": 291
},
{
"epoch": 0.26,
"grad_norm": 1.8915062282224397,
"learning_rate": 4.808359570640522e-06,
"loss": 0.5832,
"step": 292
},
{
"epoch": 0.26,
"grad_norm": 1.9185231023206675,
"learning_rate": 4.8070158017265755e-06,
"loss": 0.5854,
"step": 293
},
{
"epoch": 0.26,
"grad_norm": 2.086526046887808,
"learning_rate": 4.805667527160788e-06,
"loss": 0.5314,
"step": 294
},
{
"epoch": 0.26,
"grad_norm": 1.9995370937944454,
"learning_rate": 4.804314749576368e-06,
"loss": 0.5749,
"step": 295
},
{
"epoch": 0.26,
"grad_norm": 2.099313489806141,
"learning_rate": 4.802957471615319e-06,
"loss": 0.5173,
"step": 296
},
{
"epoch": 0.26,
"grad_norm": 2.067736275086448,
"learning_rate": 4.8015956959284346e-06,
"loss": 0.5434,
"step": 297
},
{
"epoch": 0.26,
"grad_norm": 2.005525416579935,
"learning_rate": 4.800229425175294e-06,
"loss": 0.5589,
"step": 298
},
{
"epoch": 0.26,
"grad_norm": 2.172708847484724,
"learning_rate": 4.7988586620242515e-06,
"loss": 0.5919,
"step": 299
},
{
"epoch": 0.27,
"grad_norm": 2.0010542748493823,
"learning_rate": 4.797483409152438e-06,
"loss": 0.5803,
"step": 300
},
{
"epoch": 0.27,
"grad_norm": 2.1169505971764506,
"learning_rate": 4.7961036692457516e-06,
"loss": 0.5763,
"step": 301
},
{
"epoch": 0.27,
"grad_norm": 2.202849419501746,
"learning_rate": 4.794719444998856e-06,
"loss": 0.5691,
"step": 302
},
{
"epoch": 0.27,
"grad_norm": 1.9765013761990564,
"learning_rate": 4.793330739115169e-06,
"loss": 0.5657,
"step": 303
},
{
"epoch": 0.27,
"grad_norm": 2.0404392238791136,
"learning_rate": 4.791937554306863e-06,
"loss": 0.5648,
"step": 304
},
{
"epoch": 0.27,
"grad_norm": 2.0298920886210516,
"learning_rate": 4.790539893294861e-06,
"loss": 0.5353,
"step": 305
},
{
"epoch": 0.27,
"grad_norm": 2.03157486915788,
"learning_rate": 4.789137758808823e-06,
"loss": 0.5716,
"step": 306
},
{
"epoch": 0.27,
"grad_norm": 2.060346338513047,
"learning_rate": 4.787731153587149e-06,
"loss": 0.5502,
"step": 307
},
{
"epoch": 0.27,
"grad_norm": 1.9286831590091769,
"learning_rate": 4.786320080376968e-06,
"loss": 0.5646,
"step": 308
},
{
"epoch": 0.27,
"grad_norm": 2.042346254905274,
"learning_rate": 4.7849045419341376e-06,
"loss": 0.6085,
"step": 309
},
{
"epoch": 0.27,
"grad_norm": 2.0758243469708293,
"learning_rate": 4.7834845410232356e-06,
"loss": 0.5452,
"step": 310
},
{
"epoch": 0.28,
"grad_norm": 2.0454965773706553,
"learning_rate": 4.782060080417553e-06,
"loss": 0.514,
"step": 311
},
{
"epoch": 0.28,
"grad_norm": 2.073931876222572,
"learning_rate": 4.780631162899094e-06,
"loss": 0.5884,
"step": 312
},
{
"epoch": 0.28,
"grad_norm": 1.9699688248650635,
"learning_rate": 4.7791977912585645e-06,
"loss": 0.529,
"step": 313
},
{
"epoch": 0.28,
"grad_norm": 1.9886162974888701,
"learning_rate": 4.7777599682953696e-06,
"loss": 0.5796,
"step": 314
},
{
"epoch": 0.28,
"grad_norm": 1.9579685029739566,
"learning_rate": 4.7763176968176106e-06,
"loss": 0.5553,
"step": 315
},
{
"epoch": 0.28,
"grad_norm": 2.2181861411036086,
"learning_rate": 4.7748709796420735e-06,
"loss": 0.5806,
"step": 316
},
{
"epoch": 0.28,
"grad_norm": 2.0345738930041777,
"learning_rate": 4.773419819594228e-06,
"loss": 0.6059,
"step": 317
},
{
"epoch": 0.28,
"grad_norm": 2.0710385535524902,
"learning_rate": 4.7719642195082224e-06,
"loss": 0.5539,
"step": 318
},
{
"epoch": 0.28,
"grad_norm": 2.1239710444371442,
"learning_rate": 4.770504182226875e-06,
"loss": 0.5655,
"step": 319
},
{
"epoch": 0.28,
"grad_norm": 1.9564631444382952,
"learning_rate": 4.769039710601669e-06,
"loss": 0.5914,
"step": 320
},
{
"epoch": 0.28,
"grad_norm": 1.9969926160116234,
"learning_rate": 4.767570807492752e-06,
"loss": 0.55,
"step": 321
},
{
"epoch": 0.29,
"grad_norm": 1.9650736880864492,
"learning_rate": 4.766097475768919e-06,
"loss": 0.5804,
"step": 322
},
{
"epoch": 0.29,
"grad_norm": 2.1946368157969194,
"learning_rate": 4.7646197183076236e-06,
"loss": 0.5631,
"step": 323
},
{
"epoch": 0.29,
"grad_norm": 1.9834181085585831,
"learning_rate": 4.763137537994955e-06,
"loss": 0.5779,
"step": 324
},
{
"epoch": 0.29,
"grad_norm": 2.1081651164417057,
"learning_rate": 4.7616509377256445e-06,
"loss": 0.5375,
"step": 325
},
{
"epoch": 0.29,
"grad_norm": 1.9972027344990544,
"learning_rate": 4.760159920403055e-06,
"loss": 0.5608,
"step": 326
},
{
"epoch": 0.29,
"grad_norm": 1.9554967826543683,
"learning_rate": 4.758664488939174e-06,
"loss": 0.5613,
"step": 327
},
{
"epoch": 0.29,
"grad_norm": 2.211716512822424,
"learning_rate": 4.757164646254614e-06,
"loss": 0.5863,
"step": 328
},
{
"epoch": 0.29,
"grad_norm": 1.9203184200502181,
"learning_rate": 4.755660395278598e-06,
"loss": 0.5275,
"step": 329
},
{
"epoch": 0.29,
"grad_norm": 2.0355308159742505,
"learning_rate": 4.7541517389489626e-06,
"loss": 0.5304,
"step": 330
},
{
"epoch": 0.29,
"grad_norm": 2.005680103405306,
"learning_rate": 4.752638680212145e-06,
"loss": 0.5782,
"step": 331
},
{
"epoch": 0.29,
"grad_norm": 1.9930094995522492,
"learning_rate": 4.751121222023183e-06,
"loss": 0.5197,
"step": 332
},
{
"epoch": 0.29,
"grad_norm": 2.130907347619711,
"learning_rate": 4.749599367345703e-06,
"loss": 0.5453,
"step": 333
},
{
"epoch": 0.3,
"grad_norm": 2.0380649677356715,
"learning_rate": 4.748073119151923e-06,
"loss": 0.5394,
"step": 334
},
{
"epoch": 0.3,
"grad_norm": 2.02655053696048,
"learning_rate": 4.7465424804226366e-06,
"loss": 0.5359,
"step": 335
},
{
"epoch": 0.3,
"grad_norm": 2.108255877778432,
"learning_rate": 4.745007454147215e-06,
"loss": 0.5262,
"step": 336
},
{
"epoch": 0.3,
"grad_norm": 1.8422966312136684,
"learning_rate": 4.7434680433235986e-06,
"loss": 0.529,
"step": 337
},
{
"epoch": 0.3,
"grad_norm": 2.1387816386921004,
"learning_rate": 4.741924250958289e-06,
"loss": 0.5599,
"step": 338
},
{
"epoch": 0.3,
"grad_norm": 2.2063774820548794,
"learning_rate": 4.740376080066346e-06,
"loss": 0.6014,
"step": 339
},
{
"epoch": 0.3,
"grad_norm": 1.917696303327652,
"learning_rate": 4.738823533671383e-06,
"loss": 0.615,
"step": 340
},
{
"epoch": 0.3,
"grad_norm": 2.0283765999277916,
"learning_rate": 4.737266614805554e-06,
"loss": 0.5802,
"step": 341
},
{
"epoch": 0.3,
"grad_norm": 2.0340264609590437,
"learning_rate": 4.7357053265095575e-06,
"loss": 0.5331,
"step": 342
},
{
"epoch": 0.3,
"grad_norm": 2.102037194450825,
"learning_rate": 4.734139671832622e-06,
"loss": 0.5534,
"step": 343
},
{
"epoch": 0.3,
"grad_norm": 2.4389875670618113,
"learning_rate": 4.732569653832505e-06,
"loss": 0.5637,
"step": 344
},
{
"epoch": 0.31,
"grad_norm": 2.1143521053252012,
"learning_rate": 4.730995275575486e-06,
"loss": 0.6539,
"step": 345
},
{
"epoch": 0.31,
"grad_norm": 2.6240136232872064,
"learning_rate": 4.7294165401363616e-06,
"loss": 0.5515,
"step": 346
},
{
"epoch": 0.31,
"grad_norm": 2.037602072097695,
"learning_rate": 4.727833450598433e-06,
"loss": 0.5609,
"step": 347
},
{
"epoch": 0.31,
"grad_norm": 2.10711733636797,
"learning_rate": 4.72624601005351e-06,
"loss": 0.5719,
"step": 348
},
{
"epoch": 0.31,
"grad_norm": 2.277613433738313,
"learning_rate": 4.724654221601899e-06,
"loss": 0.5815,
"step": 349
},
{
"epoch": 0.31,
"grad_norm": 2.0082624113337824,
"learning_rate": 4.7230580883523955e-06,
"loss": 0.5524,
"step": 350
},
{
"epoch": 0.31,
"grad_norm": 1.8922591374161477,
"learning_rate": 4.721457613422285e-06,
"loss": 0.5981,
"step": 351
},
{
"epoch": 0.31,
"grad_norm": 2.108229047424278,
"learning_rate": 4.7198527999373266e-06,
"loss": 0.57,
"step": 352
},
{
"epoch": 0.31,
"grad_norm": 2.152965480400126,
"learning_rate": 4.718243651031759e-06,
"loss": 0.5996,
"step": 353
},
{
"epoch": 0.31,
"grad_norm": 1.8885994019827148,
"learning_rate": 4.716630169848282e-06,
"loss": 0.5543,
"step": 354
},
{
"epoch": 0.31,
"grad_norm": 2.221396082747074,
"learning_rate": 4.715012359538062e-06,
"loss": 0.5423,
"step": 355
},
{
"epoch": 0.32,
"grad_norm": 2.247525651087526,
"learning_rate": 4.7133902232607145e-06,
"loss": 0.6049,
"step": 356
},
{
"epoch": 0.32,
"grad_norm": 1.905837742487114,
"learning_rate": 4.711763764184309e-06,
"loss": 0.5523,
"step": 357
},
{
"epoch": 0.32,
"grad_norm": 2.117965067814315,
"learning_rate": 4.710132985485355e-06,
"loss": 0.5682,
"step": 358
},
{
"epoch": 0.32,
"grad_norm": 2.1530948606389373,
"learning_rate": 4.7084978903487985e-06,
"loss": 0.5506,
"step": 359
},
{
"epoch": 0.32,
"grad_norm": 1.8738866858316863,
"learning_rate": 4.706858481968017e-06,
"loss": 0.5426,
"step": 360
},
{
"epoch": 0.32,
"grad_norm": 1.9967053512246618,
"learning_rate": 4.705214763544806e-06,
"loss": 0.5555,
"step": 361
},
{
"epoch": 0.32,
"grad_norm": 2.352080896364055,
"learning_rate": 4.703566738289389e-06,
"loss": 0.587,
"step": 362
},
{
"epoch": 0.32,
"grad_norm": 2.031696719881503,
"learning_rate": 4.701914409420392e-06,
"loss": 0.6088,
"step": 363
},
{
"epoch": 0.32,
"grad_norm": 2.140107830595095,
"learning_rate": 4.700257780164849e-06,
"loss": 0.5596,
"step": 364
},
{
"epoch": 0.32,
"grad_norm": 2.125236417141067,
"learning_rate": 4.698596853758194e-06,
"loss": 0.5513,
"step": 365
},
{
"epoch": 0.32,
"grad_norm": 1.8878623518397697,
"learning_rate": 4.696931633444251e-06,
"loss": 0.5557,
"step": 366
},
{
"epoch": 0.33,
"grad_norm": 1.9523463678463824,
"learning_rate": 4.695262122475232e-06,
"loss": 0.5317,
"step": 367
},
{
"epoch": 0.33,
"grad_norm": 2.3748547328434455,
"learning_rate": 4.6935883241117286e-06,
"loss": 0.5733,
"step": 368
},
{
"epoch": 0.33,
"grad_norm": 1.9248854873148575,
"learning_rate": 4.691910241622704e-06,
"loss": 0.5523,
"step": 369
},
{
"epoch": 0.33,
"grad_norm": 2.1731794693383923,
"learning_rate": 4.69022787828549e-06,
"loss": 0.6489,
"step": 370
},
{
"epoch": 0.33,
"grad_norm": 1.996570702327501,
"learning_rate": 4.688541237385781e-06,
"loss": 0.584,
"step": 371
},
{
"epoch": 0.33,
"grad_norm": 2.0272036390008097,
"learning_rate": 4.68685032221762e-06,
"loss": 0.554,
"step": 372
},
{
"epoch": 0.33,
"grad_norm": 1.9986403184037858,
"learning_rate": 4.685155136083401e-06,
"loss": 0.5798,
"step": 373
},
{
"epoch": 0.33,
"grad_norm": 2.24642442330448,
"learning_rate": 4.683455682293863e-06,
"loss": 0.5486,
"step": 374
},
{
"epoch": 0.33,
"grad_norm": 2.916261956844043,
"learning_rate": 4.681751964168071e-06,
"loss": 0.5678,
"step": 375
},
{
"epoch": 0.33,
"grad_norm": 2.1597492287443396,
"learning_rate": 4.680043985033427e-06,
"loss": 0.5801,
"step": 376
},
{
"epoch": 0.33,
"grad_norm": 1.9634034606261326,
"learning_rate": 4.6783317482256506e-06,
"loss": 0.5412,
"step": 377
},
{
"epoch": 0.33,
"grad_norm": 2.0128604293697263,
"learning_rate": 4.676615257088777e-06,
"loss": 0.5538,
"step": 378
},
{
"epoch": 0.34,
"grad_norm": 2.2205659530523976,
"learning_rate": 4.674894514975149e-06,
"loss": 0.494,
"step": 379
},
{
"epoch": 0.34,
"grad_norm": 2.000557085172021,
"learning_rate": 4.673169525245416e-06,
"loss": 0.5459,
"step": 380
},
{
"epoch": 0.34,
"grad_norm": 2.0089256125274826,
"learning_rate": 4.671440291268518e-06,
"loss": 0.5729,
"step": 381
},
{
"epoch": 0.34,
"grad_norm": 2.076112293053539,
"learning_rate": 4.66970681642169e-06,
"loss": 0.5277,
"step": 382
},
{
"epoch": 0.34,
"grad_norm": 1.996445627957894,
"learning_rate": 4.667969104090441e-06,
"loss": 0.5879,
"step": 383
},
{
"epoch": 0.34,
"grad_norm": 2.379165029211644,
"learning_rate": 4.666227157668564e-06,
"loss": 0.5924,
"step": 384
},
{
"epoch": 0.34,
"grad_norm": 2.101190475222136,
"learning_rate": 4.664480980558118e-06,
"loss": 0.6466,
"step": 385
},
{
"epoch": 0.34,
"grad_norm": 2.035159570620747,
"learning_rate": 4.662730576169423e-06,
"loss": 0.5979,
"step": 386
},
{
"epoch": 0.34,
"grad_norm": 2.1034174780447814,
"learning_rate": 4.660975947921058e-06,
"loss": 0.5635,
"step": 387
},
{
"epoch": 0.34,
"grad_norm": 2.131573174129039,
"learning_rate": 4.65921709923985e-06,
"loss": 0.5602,
"step": 388
},
{
"epoch": 0.34,
"grad_norm": 1.9282515780121203,
"learning_rate": 4.657454033560868e-06,
"loss": 0.5292,
"step": 389
},
{
"epoch": 0.35,
"grad_norm": 1.922997066030009,
"learning_rate": 4.655686754327419e-06,
"loss": 0.5475,
"step": 390
},
{
"epoch": 0.35,
"grad_norm": 1.9692624098665525,
"learning_rate": 4.653915264991035e-06,
"loss": 0.5529,
"step": 391
},
{
"epoch": 0.35,
"grad_norm": 1.976011234185068,
"learning_rate": 4.652139569011475e-06,
"loss": 0.5439,
"step": 392
},
{
"epoch": 0.35,
"grad_norm": 1.909657950321316,
"learning_rate": 4.650359669856711e-06,
"loss": 0.5558,
"step": 393
},
{
"epoch": 0.35,
"grad_norm": 1.9134183734362904,
"learning_rate": 4.648575571002926e-06,
"loss": 0.5428,
"step": 394
},
{
"epoch": 0.35,
"grad_norm": 2.067168876792994,
"learning_rate": 4.646787275934501e-06,
"loss": 0.6261,
"step": 395
},
{
"epoch": 0.35,
"grad_norm": 1.9358304010171785,
"learning_rate": 4.644994788144017e-06,
"loss": 0.5698,
"step": 396
},
{
"epoch": 0.35,
"grad_norm": 1.9671634072657547,
"learning_rate": 4.643198111132241e-06,
"loss": 0.5345,
"step": 397
},
{
"epoch": 0.35,
"grad_norm": 2.0176052011599133,
"learning_rate": 4.641397248408122e-06,
"loss": 0.5028,
"step": 398
},
{
"epoch": 0.35,
"grad_norm": 1.9960700483606102,
"learning_rate": 4.639592203488784e-06,
"loss": 0.5253,
"step": 399
},
{
"epoch": 0.35,
"grad_norm": 1.9329472749401087,
"learning_rate": 4.63778297989952e-06,
"loss": 0.615,
"step": 400
},
{
"epoch": 0.36,
"grad_norm": 1.9689526846990402,
"learning_rate": 4.6359695811737805e-06,
"loss": 0.5558,
"step": 401
},
{
"epoch": 0.36,
"grad_norm": 2.043494453339269,
"learning_rate": 4.634152010853175e-06,
"loss": 0.5955,
"step": 402
},
{
"epoch": 0.36,
"grad_norm": 1.9251519214200417,
"learning_rate": 4.632330272487455e-06,
"loss": 0.5587,
"step": 403
},
{
"epoch": 0.36,
"grad_norm": 2.2049650629169495,
"learning_rate": 4.6305043696345175e-06,
"loss": 0.5633,
"step": 404
},
{
"epoch": 0.36,
"grad_norm": 1.8971004366601951,
"learning_rate": 4.628674305860389e-06,
"loss": 0.5147,
"step": 405
},
{
"epoch": 0.36,
"grad_norm": 1.958131978242853,
"learning_rate": 4.626840084739224e-06,
"loss": 0.558,
"step": 406
},
{
"epoch": 0.36,
"grad_norm": 1.8809187299789303,
"learning_rate": 4.625001709853296e-06,
"loss": 0.6029,
"step": 407
},
{
"epoch": 0.36,
"grad_norm": 2.07376704403877,
"learning_rate": 4.623159184792992e-06,
"loss": 0.5985,
"step": 408
},
{
"epoch": 0.36,
"grad_norm": 1.9773215118384355,
"learning_rate": 4.621312513156801e-06,
"loss": 0.5592,
"step": 409
},
{
"epoch": 0.36,
"grad_norm": 2.2454931529711373,
"learning_rate": 4.6194616985513144e-06,
"loss": 0.5265,
"step": 410
},
{
"epoch": 0.36,
"grad_norm": 1.917266484743525,
"learning_rate": 4.617606744591214e-06,
"loss": 0.5579,
"step": 411
},
{
"epoch": 0.36,
"grad_norm": 1.9196448264725143,
"learning_rate": 4.615747654899263e-06,
"loss": 0.5345,
"step": 412
},
{
"epoch": 0.37,
"grad_norm": 1.9733157447209138,
"learning_rate": 4.613884433106306e-06,
"loss": 0.528,
"step": 413
},
{
"epoch": 0.37,
"grad_norm": 1.994664364309963,
"learning_rate": 4.612017082851253e-06,
"loss": 0.5489,
"step": 414
},
{
"epoch": 0.37,
"grad_norm": 1.8266904473141898,
"learning_rate": 4.610145607781081e-06,
"loss": 0.5411,
"step": 415
},
{
"epoch": 0.37,
"grad_norm": 2.0294108873934364,
"learning_rate": 4.608270011550823e-06,
"loss": 0.5963,
"step": 416
},
{
"epoch": 0.37,
"grad_norm": 1.9735002273071562,
"learning_rate": 4.606390297823555e-06,
"loss": 0.5858,
"step": 417
},
{
"epoch": 0.37,
"grad_norm": 1.8987568737188125,
"learning_rate": 4.604506470270403e-06,
"loss": 0.493,
"step": 418
},
{
"epoch": 0.37,
"grad_norm": 1.9371998611194052,
"learning_rate": 4.6026185325705195e-06,
"loss": 0.521,
"step": 419
},
{
"epoch": 0.37,
"grad_norm": 1.8926221916061328,
"learning_rate": 4.60072648841109e-06,
"loss": 0.4922,
"step": 420
},
{
"epoch": 0.37,
"grad_norm": 1.8759546163633927,
"learning_rate": 4.598830341487317e-06,
"loss": 0.5487,
"step": 421
},
{
"epoch": 0.37,
"grad_norm": 1.9425705301229708,
"learning_rate": 4.596930095502416e-06,
"loss": 0.5155,
"step": 422
},
{
"epoch": 0.37,
"grad_norm": 1.8718904454318124,
"learning_rate": 4.59502575416761e-06,
"loss": 0.5372,
"step": 423
},
{
"epoch": 0.38,
"grad_norm": 1.8361742824749525,
"learning_rate": 4.593117321202117e-06,
"loss": 0.556,
"step": 424
},
{
"epoch": 0.38,
"grad_norm": 1.8520540031413573,
"learning_rate": 4.59120480033315e-06,
"loss": 0.6213,
"step": 425
},
{
"epoch": 0.38,
"grad_norm": 1.9670746741442957,
"learning_rate": 4.5892881952959015e-06,
"loss": 0.5685,
"step": 426
},
{
"epoch": 0.38,
"grad_norm": 1.969557039139786,
"learning_rate": 4.587367509833543e-06,
"loss": 0.5472,
"step": 427
},
{
"epoch": 0.38,
"grad_norm": 1.9873217018861624,
"learning_rate": 4.585442747697218e-06,
"loss": 0.5419,
"step": 428
},
{
"epoch": 0.38,
"grad_norm": 1.9508580236237527,
"learning_rate": 4.5835139126460234e-06,
"loss": 0.566,
"step": 429
},
{
"epoch": 0.38,
"grad_norm": 1.8929503262145966,
"learning_rate": 4.58158100844702e-06,
"loss": 0.5526,
"step": 430
},
{
"epoch": 0.38,
"grad_norm": 1.9394545018501204,
"learning_rate": 4.57964403887521e-06,
"loss": 0.5469,
"step": 431
},
{
"epoch": 0.38,
"grad_norm": 2.1045619298179927,
"learning_rate": 4.577703007713538e-06,
"loss": 0.5397,
"step": 432
},
{
"epoch": 0.38,
"grad_norm": 1.8886665443222683,
"learning_rate": 4.575757918752879e-06,
"loss": 0.5174,
"step": 433
},
{
"epoch": 0.38,
"grad_norm": 1.849256286655662,
"learning_rate": 4.573808775792033e-06,
"loss": 0.558,
"step": 434
},
{
"epoch": 0.39,
"grad_norm": 1.89537230772545,
"learning_rate": 4.5718555826377195e-06,
"loss": 0.6155,
"step": 435
},
{
"epoch": 0.39,
"grad_norm": 2.028600611269796,
"learning_rate": 4.569898343104568e-06,
"loss": 0.5639,
"step": 436
},
{
"epoch": 0.39,
"grad_norm": 2.1153787641168273,
"learning_rate": 4.567937061015107e-06,
"loss": 0.5883,
"step": 437
},
{
"epoch": 0.39,
"grad_norm": 2.0217937777574075,
"learning_rate": 4.5659717401997655e-06,
"loss": 0.5936,
"step": 438
},
{
"epoch": 0.39,
"grad_norm": 2.248716610859176,
"learning_rate": 4.564002384496856e-06,
"loss": 0.5539,
"step": 439
},
{
"epoch": 0.39,
"grad_norm": 1.9689879082294663,
"learning_rate": 4.562028997752574e-06,
"loss": 0.5636,
"step": 440
},
{
"epoch": 0.39,
"grad_norm": 1.763292547062648,
"learning_rate": 4.560051583820987e-06,
"loss": 0.5402,
"step": 441
},
{
"epoch": 0.39,
"grad_norm": 2.129235681815295,
"learning_rate": 4.558070146564025e-06,
"loss": 0.5279,
"step": 442
},
{
"epoch": 0.39,
"grad_norm": 1.987329959970642,
"learning_rate": 4.55608468985148e-06,
"loss": 0.5597,
"step": 443
},
{
"epoch": 0.39,
"grad_norm": 1.8223595251951752,
"learning_rate": 4.554095217560991e-06,
"loss": 0.5523,
"step": 444
},
{
"epoch": 0.39,
"grad_norm": 1.8945373677348296,
"learning_rate": 4.55210173357804e-06,
"loss": 0.5611,
"step": 445
},
{
"epoch": 0.4,
"grad_norm": 1.8010628987468362,
"learning_rate": 4.550104241795946e-06,
"loss": 0.5406,
"step": 446
},
{
"epoch": 0.4,
"grad_norm": 1.7680591979019162,
"learning_rate": 4.548102746115852e-06,
"loss": 0.5392,
"step": 447
},
{
"epoch": 0.4,
"grad_norm": 1.9894409183828397,
"learning_rate": 4.546097250446724e-06,
"loss": 0.568,
"step": 448
},
{
"epoch": 0.4,
"grad_norm": 1.9527217933389673,
"learning_rate": 4.544087758705338e-06,
"loss": 0.5616,
"step": 449
},
{
"epoch": 0.4,
"grad_norm": 1.8813970745759399,
"learning_rate": 4.5420742748162735e-06,
"loss": 0.5857,
"step": 450
},
{
"epoch": 0.4,
"grad_norm": 1.9697471415378363,
"learning_rate": 4.540056802711911e-06,
"loss": 0.5563,
"step": 451
},
{
"epoch": 0.4,
"grad_norm": 1.8610261764458738,
"learning_rate": 4.5380353463324135e-06,
"loss": 0.5414,
"step": 452
},
{
"epoch": 0.4,
"grad_norm": 2.0760585222699075,
"learning_rate": 4.536009909625733e-06,
"loss": 0.6113,
"step": 453
},
{
"epoch": 0.4,
"grad_norm": 1.9376608369819073,
"learning_rate": 4.533980496547588e-06,
"loss": 0.5567,
"step": 454
},
{
"epoch": 0.4,
"grad_norm": 1.9360208325717025,
"learning_rate": 4.5319471110614676e-06,
"loss": 0.5637,
"step": 455
},
{
"epoch": 0.4,
"grad_norm": 1.9103146510774847,
"learning_rate": 4.529909757138619e-06,
"loss": 0.5049,
"step": 456
},
{
"epoch": 0.4,
"grad_norm": 1.9645365532954322,
"learning_rate": 4.5278684387580356e-06,
"loss": 0.5424,
"step": 457
},
{
"epoch": 0.41,
"grad_norm": 2.0430691701895065,
"learning_rate": 4.52582315990646e-06,
"loss": 0.547,
"step": 458
},
{
"epoch": 0.41,
"grad_norm": 1.995685349345533,
"learning_rate": 4.523773924578362e-06,
"loss": 0.6005,
"step": 459
},
{
"epoch": 0.41,
"grad_norm": 1.9830544751269077,
"learning_rate": 4.521720736775947e-06,
"loss": 0.5563,
"step": 460
},
{
"epoch": 0.41,
"grad_norm": 1.8473463212841006,
"learning_rate": 4.519663600509131e-06,
"loss": 0.5913,
"step": 461
},
{
"epoch": 0.41,
"grad_norm": 1.8993140839815026,
"learning_rate": 4.5176025197955495e-06,
"loss": 0.5653,
"step": 462
},
{
"epoch": 0.41,
"grad_norm": 1.8179551662772986,
"learning_rate": 4.515537498660535e-06,
"loss": 0.5485,
"step": 463
},
{
"epoch": 0.41,
"grad_norm": 1.9275228062086758,
"learning_rate": 4.51346854113712e-06,
"loss": 0.5248,
"step": 464
},
{
"epoch": 0.41,
"grad_norm": 1.9668428438048349,
"learning_rate": 4.511395651266023e-06,
"loss": 0.5939,
"step": 465
},
{
"epoch": 0.41,
"grad_norm": 1.9602042152930792,
"learning_rate": 4.509318833095642e-06,
"loss": 0.5452,
"step": 466
},
{
"epoch": 0.41,
"grad_norm": 1.8348566721600683,
"learning_rate": 4.507238090682049e-06,
"loss": 0.5514,
"step": 467
},
{
"epoch": 0.41,
"grad_norm": 1.938525142403929,
"learning_rate": 4.505153428088979e-06,
"loss": 0.5822,
"step": 468
},
{
"epoch": 0.42,
"grad_norm": 2.008973560332548,
"learning_rate": 4.503064849387822e-06,
"loss": 0.5765,
"step": 469
},
{
"epoch": 0.42,
"grad_norm": 1.8911779425902009,
"learning_rate": 4.500972358657618e-06,
"loss": 0.5465,
"step": 470
},
{
"epoch": 0.42,
"grad_norm": 1.9224818772820709,
"learning_rate": 4.4988759599850485e-06,
"loss": 0.5897,
"step": 471
},
{
"epoch": 0.42,
"grad_norm": 1.990817812633161,
"learning_rate": 4.496775657464423e-06,
"loss": 0.5505,
"step": 472
},
{
"epoch": 0.42,
"grad_norm": 1.9167562026803746,
"learning_rate": 4.4946714551976795e-06,
"loss": 0.5779,
"step": 473
},
{
"epoch": 0.42,
"grad_norm": 1.9388400892712594,
"learning_rate": 4.492563357294369e-06,
"loss": 0.574,
"step": 474
},
{
"epoch": 0.42,
"grad_norm": 2.0140312788131762,
"learning_rate": 4.490451367871655e-06,
"loss": 0.4928,
"step": 475
},
{
"epoch": 0.42,
"grad_norm": 2.074902721101316,
"learning_rate": 4.488335491054296e-06,
"loss": 0.5366,
"step": 476
},
{
"epoch": 0.42,
"grad_norm": 1.8245504149698855,
"learning_rate": 4.486215730974646e-06,
"loss": 0.581,
"step": 477
},
{
"epoch": 0.42,
"grad_norm": 2.1100306515160656,
"learning_rate": 4.4840920917726425e-06,
"loss": 0.5677,
"step": 478
},
{
"epoch": 0.42,
"grad_norm": 1.9560380000004616,
"learning_rate": 4.4819645775958e-06,
"loss": 0.5426,
"step": 479
},
{
"epoch": 0.43,
"grad_norm": 1.721267171163405,
"learning_rate": 4.479833192599198e-06,
"loss": 0.5868,
"step": 480
},
{
"epoch": 0.43,
"grad_norm": 2.0001169229847124,
"learning_rate": 4.477697940945478e-06,
"loss": 0.5667,
"step": 481
},
{
"epoch": 0.43,
"grad_norm": 2.0111322894409134,
"learning_rate": 4.475558826804833e-06,
"loss": 0.5707,
"step": 482
},
{
"epoch": 0.43,
"grad_norm": 1.8179588699061133,
"learning_rate": 4.473415854355e-06,
"loss": 0.5484,
"step": 483
},
{
"epoch": 0.43,
"grad_norm": 2.0491236128150345,
"learning_rate": 4.47126902778125e-06,
"loss": 0.5575,
"step": 484
},
{
"epoch": 0.43,
"grad_norm": 2.049676347036571,
"learning_rate": 4.469118351276381e-06,
"loss": 0.5807,
"step": 485
},
{
"epoch": 0.43,
"grad_norm": 1.8999028972772445,
"learning_rate": 4.4669638290407115e-06,
"loss": 0.5447,
"step": 486
},
{
"epoch": 0.43,
"grad_norm": 2.0754807768031687,
"learning_rate": 4.464805465282071e-06,
"loss": 0.503,
"step": 487
},
{
"epoch": 0.43,
"grad_norm": 1.9532719169013661,
"learning_rate": 4.462643264215789e-06,
"loss": 0.5304,
"step": 488
},
{
"epoch": 0.43,
"grad_norm": 2.038547881198709,
"learning_rate": 4.460477230064693e-06,
"loss": 0.6116,
"step": 489
},
{
"epoch": 0.43,
"grad_norm": 2.1342568039197136,
"learning_rate": 4.458307367059092e-06,
"loss": 0.5632,
"step": 490
},
{
"epoch": 0.43,
"grad_norm": 1.9267024509918977,
"learning_rate": 4.456133679436778e-06,
"loss": 0.5574,
"step": 491
},
{
"epoch": 0.44,
"grad_norm": 1.795213135692931,
"learning_rate": 4.453956171443008e-06,
"loss": 0.5737,
"step": 492
},
{
"epoch": 0.44,
"grad_norm": 1.9428252328171443,
"learning_rate": 4.451774847330505e-06,
"loss": 0.5685,
"step": 493
},
{
"epoch": 0.44,
"grad_norm": 1.7903749800219122,
"learning_rate": 4.449589711359439e-06,
"loss": 0.5214,
"step": 494
},
{
"epoch": 0.44,
"grad_norm": 2.111615491479605,
"learning_rate": 4.447400767797429e-06,
"loss": 0.5329,
"step": 495
},
{
"epoch": 0.44,
"grad_norm": 1.936578332165912,
"learning_rate": 4.445208020919531e-06,
"loss": 0.543,
"step": 496
},
{
"epoch": 0.44,
"grad_norm": 2.0005145681262473,
"learning_rate": 4.4430114750082246e-06,
"loss": 0.5593,
"step": 497
},
{
"epoch": 0.44,
"grad_norm": 1.9720912009242426,
"learning_rate": 4.4408111343534125e-06,
"loss": 0.5812,
"step": 498
},
{
"epoch": 0.44,
"grad_norm": 2.0486055586452787,
"learning_rate": 4.4386070032524085e-06,
"loss": 0.5563,
"step": 499
},
{
"epoch": 0.44,
"grad_norm": 1.8043262288689983,
"learning_rate": 4.436399086009928e-06,
"loss": 0.4905,
"step": 500
},
{
"epoch": 0.44,
"grad_norm": 1.9608580808640215,
"learning_rate": 4.43418738693808e-06,
"loss": 0.5548,
"step": 501
},
{
"epoch": 0.44,
"grad_norm": 2.008548225584814,
"learning_rate": 4.431971910356363e-06,
"loss": 0.5955,
"step": 502
},
{
"epoch": 0.45,
"grad_norm": 1.8974274240345173,
"learning_rate": 4.429752660591648e-06,
"loss": 0.5742,
"step": 503
},
{
"epoch": 0.45,
"grad_norm": 1.8257689605722616,
"learning_rate": 4.427529641978181e-06,
"loss": 0.6177,
"step": 504
},
{
"epoch": 0.45,
"grad_norm": 2.0327301577551764,
"learning_rate": 4.425302858857563e-06,
"loss": 0.5872,
"step": 505
},
{
"epoch": 0.45,
"grad_norm": 1.9539661576324254,
"learning_rate": 4.42307231557875e-06,
"loss": 0.5728,
"step": 506
},
{
"epoch": 0.45,
"grad_norm": 1.9346302819034207,
"learning_rate": 4.420838016498043e-06,
"loss": 0.6019,
"step": 507
},
{
"epoch": 0.45,
"grad_norm": 2.1255667417446054,
"learning_rate": 4.418599965979074e-06,
"loss": 0.5981,
"step": 508
},
{
"epoch": 0.45,
"grad_norm": 1.8293805714793054,
"learning_rate": 4.416358168392806e-06,
"loss": 0.5497,
"step": 509
},
{
"epoch": 0.45,
"grad_norm": 1.929762647152706,
"learning_rate": 4.414112628117518e-06,
"loss": 0.5655,
"step": 510
},
{
"epoch": 0.45,
"grad_norm": 1.9808758258773635,
"learning_rate": 4.411863349538798e-06,
"loss": 0.5465,
"step": 511
},
{
"epoch": 0.45,
"grad_norm": 2.0413084054198647,
"learning_rate": 4.409610337049537e-06,
"loss": 0.5264,
"step": 512
},
{
"epoch": 0.45,
"grad_norm": 1.9506473664088613,
"learning_rate": 4.4073535950499155e-06,
"loss": 0.5284,
"step": 513
},
{
"epoch": 0.46,
"grad_norm": 1.7875399190820846,
"learning_rate": 4.405093127947402e-06,
"loss": 0.5406,
"step": 514
},
{
"epoch": 0.46,
"grad_norm": 1.9594159192262046,
"learning_rate": 4.402828940156735e-06,
"loss": 0.573,
"step": 515
},
{
"epoch": 0.46,
"grad_norm": 2.025943836966642,
"learning_rate": 4.400561036099924e-06,
"loss": 0.5227,
"step": 516
},
{
"epoch": 0.46,
"grad_norm": 1.9439140060564322,
"learning_rate": 4.398289420206235e-06,
"loss": 0.5802,
"step": 517
},
{
"epoch": 0.46,
"grad_norm": 1.891060025336787,
"learning_rate": 4.396014096912182e-06,
"loss": 0.55,
"step": 518
},
{
"epoch": 0.46,
"grad_norm": 1.9575594944193413,
"learning_rate": 4.393735070661521e-06,
"loss": 0.5213,
"step": 519
},
{
"epoch": 0.46,
"grad_norm": 2.024463679893138,
"learning_rate": 4.391452345905239e-06,
"loss": 0.5354,
"step": 520
},
{
"epoch": 0.46,
"grad_norm": 1.825359223217947,
"learning_rate": 4.389165927101549e-06,
"loss": 0.5506,
"step": 521
},
{
"epoch": 0.46,
"grad_norm": 2.0284690208197484,
"learning_rate": 4.386875818715875e-06,
"loss": 0.5763,
"step": 522
},
{
"epoch": 0.46,
"grad_norm": 1.9021830177238082,
"learning_rate": 4.3845820252208476e-06,
"loss": 0.5596,
"step": 523
},
{
"epoch": 0.46,
"grad_norm": 2.0000504821060203,
"learning_rate": 4.3822845510962966e-06,
"loss": 0.5701,
"step": 524
},
{
"epoch": 0.47,
"grad_norm": 1.7341340075311633,
"learning_rate": 4.379983400829237e-06,
"loss": 0.5315,
"step": 525
},
{
"epoch": 0.47,
"grad_norm": 1.9297447671947465,
"learning_rate": 4.377678578913868e-06,
"loss": 0.5798,
"step": 526
},
{
"epoch": 0.47,
"grad_norm": 1.9233069620366818,
"learning_rate": 4.375370089851554e-06,
"loss": 0.5391,
"step": 527
},
{
"epoch": 0.47,
"grad_norm": 1.976671700063146,
"learning_rate": 4.3730579381508254e-06,
"loss": 0.5674,
"step": 528
},
{
"epoch": 0.47,
"grad_norm": 1.914097057045113,
"learning_rate": 4.3707421283273645e-06,
"loss": 0.5367,
"step": 529
},
{
"epoch": 0.47,
"grad_norm": 1.8477362806445459,
"learning_rate": 4.368422664903997e-06,
"loss": 0.5349,
"step": 530
},
{
"epoch": 0.47,
"grad_norm": 1.9704477099484594,
"learning_rate": 4.366099552410686e-06,
"loss": 0.501,
"step": 531
},
{
"epoch": 0.47,
"grad_norm": 1.9297086500071385,
"learning_rate": 4.363772795384522e-06,
"loss": 0.5352,
"step": 532
},
{
"epoch": 0.47,
"grad_norm": 1.9090996748848685,
"learning_rate": 4.36144239836971e-06,
"loss": 0.5457,
"step": 533
},
{
"epoch": 0.47,
"grad_norm": 1.905870882711107,
"learning_rate": 4.3591083659175655e-06,
"loss": 0.5685,
"step": 534
},
{
"epoch": 0.47,
"grad_norm": 1.968618442539214,
"learning_rate": 4.356770702586506e-06,
"loss": 0.5476,
"step": 535
},
{
"epoch": 0.47,
"grad_norm": 1.9431218136805426,
"learning_rate": 4.354429412942038e-06,
"loss": 0.5719,
"step": 536
},
{
"epoch": 0.48,
"grad_norm": 2.0756451350956215,
"learning_rate": 4.3520845015567495e-06,
"loss": 0.5502,
"step": 537
},
{
"epoch": 0.48,
"grad_norm": 1.8350117686217275,
"learning_rate": 4.349735973010306e-06,
"loss": 0.5417,
"step": 538
},
{
"epoch": 0.48,
"grad_norm": 2.03495920394236,
"learning_rate": 4.3473838318894324e-06,
"loss": 0.545,
"step": 539
},
{
"epoch": 0.48,
"grad_norm": 1.7864245375307775,
"learning_rate": 4.3450280827879125e-06,
"loss": 0.5242,
"step": 540
},
{
"epoch": 0.48,
"grad_norm": 1.9018530036883652,
"learning_rate": 4.342668730306575e-06,
"loss": 0.554,
"step": 541
},
{
"epoch": 0.48,
"grad_norm": 1.8575071370513128,
"learning_rate": 4.340305779053286e-06,
"loss": 0.5287,
"step": 542
},
{
"epoch": 0.48,
"grad_norm": 1.8480049595126469,
"learning_rate": 4.33793923364294e-06,
"loss": 0.5554,
"step": 543
},
{
"epoch": 0.48,
"grad_norm": 2.103039565778625,
"learning_rate": 4.335569098697454e-06,
"loss": 0.5526,
"step": 544
},
{
"epoch": 0.48,
"grad_norm": 1.8712145108160219,
"learning_rate": 4.33319537884575e-06,
"loss": 0.5472,
"step": 545
},
{
"epoch": 0.48,
"grad_norm": 1.9271972466285336,
"learning_rate": 4.330818078723756e-06,
"loss": 0.5827,
"step": 546
},
{
"epoch": 0.48,
"grad_norm": 1.954438973741856,
"learning_rate": 4.328437202974389e-06,
"loss": 0.5433,
"step": 547
},
{
"epoch": 0.49,
"grad_norm": 2.0467264178153726,
"learning_rate": 4.326052756247553e-06,
"loss": 0.5981,
"step": 548
},
{
"epoch": 0.49,
"grad_norm": 1.9418055408636266,
"learning_rate": 4.323664743200123e-06,
"loss": 0.5832,
"step": 549
},
{
"epoch": 0.49,
"grad_norm": 2.444044603553196,
"learning_rate": 4.32127316849594e-06,
"loss": 0.5638,
"step": 550
},
{
"epoch": 0.49,
"grad_norm": 1.8791947879326414,
"learning_rate": 4.318878036805802e-06,
"loss": 0.5864,
"step": 551
},
{
"epoch": 0.49,
"grad_norm": 1.872356245946924,
"learning_rate": 4.3164793528074525e-06,
"loss": 0.5337,
"step": 552
},
{
"epoch": 0.49,
"grad_norm": 2.025493213646544,
"learning_rate": 4.3140771211855725e-06,
"loss": 0.5401,
"step": 553
},
{
"epoch": 0.49,
"grad_norm": 1.9845857759145742,
"learning_rate": 4.3116713466317745e-06,
"loss": 0.5712,
"step": 554
},
{
"epoch": 0.49,
"grad_norm": 1.9091874317608197,
"learning_rate": 4.309262033844587e-06,
"loss": 0.5337,
"step": 555
},
{
"epoch": 0.49,
"grad_norm": 1.926646558220673,
"learning_rate": 4.30684918752945e-06,
"loss": 0.5787,
"step": 556
},
{
"epoch": 0.49,
"grad_norm": 2.0450560123448165,
"learning_rate": 4.304432812398704e-06,
"loss": 0.5704,
"step": 557
},
{
"epoch": 0.49,
"grad_norm": 1.915800332391142,
"learning_rate": 4.302012913171584e-06,
"loss": 0.5194,
"step": 558
},
{
"epoch": 0.5,
"grad_norm": 1.9050588229807015,
"learning_rate": 4.299589494574204e-06,
"loss": 0.5104,
"step": 559
},
{
"epoch": 0.5,
"grad_norm": 1.9241714112001687,
"learning_rate": 4.297162561339554e-06,
"loss": 0.5388,
"step": 560
},
{
"epoch": 0.5,
"grad_norm": 1.8520273210081386,
"learning_rate": 4.294732118207486e-06,
"loss": 0.5363,
"step": 561
},
{
"epoch": 0.5,
"grad_norm": 2.0240180827444205,
"learning_rate": 4.292298169924709e-06,
"loss": 0.5632,
"step": 562
},
{
"epoch": 0.5,
"grad_norm": 1.8385436745856445,
"learning_rate": 4.289860721244776e-06,
"loss": 0.542,
"step": 563
},
{
"epoch": 0.5,
"grad_norm": 1.9260618068482396,
"learning_rate": 4.287419776928078e-06,
"loss": 0.5555,
"step": 564
},
{
"epoch": 0.5,
"grad_norm": 3.155290692386073,
"learning_rate": 4.284975341741833e-06,
"loss": 0.5336,
"step": 565
},
{
"epoch": 0.5,
"grad_norm": 2.461077264148098,
"learning_rate": 4.282527420460073e-06,
"loss": 0.5794,
"step": 566
},
{
"epoch": 0.5,
"grad_norm": 1.8539810703173831,
"learning_rate": 4.280076017863643e-06,
"loss": 0.5298,
"step": 567
},
{
"epoch": 0.5,
"grad_norm": 1.981150552962984,
"learning_rate": 4.277621138740185e-06,
"loss": 0.5862,
"step": 568
},
{
"epoch": 0.5,
"grad_norm": 1.8768796036679432,
"learning_rate": 4.275162787884132e-06,
"loss": 0.5255,
"step": 569
},
{
"epoch": 0.5,
"grad_norm": 2.022795676637582,
"learning_rate": 4.272700970096696e-06,
"loss": 0.5984,
"step": 570
},
{
"epoch": 0.51,
"grad_norm": 1.835618231704385,
"learning_rate": 4.27023569018586e-06,
"loss": 0.5297,
"step": 571
},
{
"epoch": 0.51,
"grad_norm": 1.853495005213679,
"learning_rate": 4.267766952966369e-06,
"loss": 0.5188,
"step": 572
},
{
"epoch": 0.51,
"grad_norm": 1.8841750183665413,
"learning_rate": 4.265294763259721e-06,
"loss": 0.5678,
"step": 573
},
{
"epoch": 0.51,
"grad_norm": 1.8013177249236558,
"learning_rate": 4.262819125894156e-06,
"loss": 0.5286,
"step": 574
},
{
"epoch": 0.51,
"grad_norm": 1.8320928495052518,
"learning_rate": 4.2603400457046476e-06,
"loss": 0.5341,
"step": 575
},
{
"epoch": 0.51,
"grad_norm": 1.8323864124122828,
"learning_rate": 4.257857527532891e-06,
"loss": 0.5283,
"step": 576
},
{
"epoch": 0.51,
"grad_norm": 1.9487038959665601,
"learning_rate": 4.255371576227301e-06,
"loss": 0.5418,
"step": 577
},
{
"epoch": 0.51,
"grad_norm": 1.7875154296015772,
"learning_rate": 4.252882196642993e-06,
"loss": 0.5065,
"step": 578
},
{
"epoch": 0.51,
"grad_norm": 2.089827238376911,
"learning_rate": 4.250389393641778e-06,
"loss": 0.5919,
"step": 579
},
{
"epoch": 0.51,
"grad_norm": 1.9078348658003164,
"learning_rate": 4.247893172092157e-06,
"loss": 0.5212,
"step": 580
},
{
"epoch": 0.51,
"grad_norm": 1.9952457072102052,
"learning_rate": 4.245393536869303e-06,
"loss": 0.5284,
"step": 581
},
{
"epoch": 0.52,
"grad_norm": 2.0728561008210384,
"learning_rate": 4.242890492855056e-06,
"loss": 0.5214,
"step": 582
},
{
"epoch": 0.52,
"grad_norm": 1.97825451090628,
"learning_rate": 4.240384044937919e-06,
"loss": 0.5586,
"step": 583
},
{
"epoch": 0.52,
"grad_norm": 1.85380003580073,
"learning_rate": 4.237874198013037e-06,
"loss": 0.6078,
"step": 584
},
{
"epoch": 0.52,
"grad_norm": 1.8198051628607304,
"learning_rate": 4.235360956982196e-06,
"loss": 0.5677,
"step": 585
},
{
"epoch": 0.52,
"grad_norm": 2.1343351043013183,
"learning_rate": 4.23284432675381e-06,
"loss": 0.5706,
"step": 586
},
{
"epoch": 0.52,
"grad_norm": 2.0294462862804896,
"learning_rate": 4.230324312242911e-06,
"loss": 0.5399,
"step": 587
},
{
"epoch": 0.52,
"grad_norm": 1.9618881336969853,
"learning_rate": 4.227800918371145e-06,
"loss": 0.5292,
"step": 588
},
{
"epoch": 0.52,
"grad_norm": 1.9665398714083597,
"learning_rate": 4.225274150066752e-06,
"loss": 0.5414,
"step": 589
},
{
"epoch": 0.52,
"grad_norm": 2.0976099857689268,
"learning_rate": 4.222744012264567e-06,
"loss": 0.5204,
"step": 590
},
{
"epoch": 0.52,
"grad_norm": 1.968032018982793,
"learning_rate": 4.220210509906002e-06,
"loss": 0.5622,
"step": 591
},
{
"epoch": 0.52,
"grad_norm": 2.0055542027073523,
"learning_rate": 4.217673647939044e-06,
"loss": 0.5723,
"step": 592
},
{
"epoch": 0.53,
"grad_norm": 2.031612125247833,
"learning_rate": 4.215133431318239e-06,
"loss": 0.5727,
"step": 593
},
{
"epoch": 0.53,
"grad_norm": 2.04253552367063,
"learning_rate": 4.212589865004684e-06,
"loss": 0.5676,
"step": 594
},
{
"epoch": 0.53,
"grad_norm": 1.9143447724555291,
"learning_rate": 4.2100429539660205e-06,
"loss": 0.5452,
"step": 595
},
{
"epoch": 0.53,
"grad_norm": 2.1284999811605334,
"learning_rate": 4.20749270317642e-06,
"loss": 0.5679,
"step": 596
},
{
"epoch": 0.53,
"grad_norm": 1.9726237378545723,
"learning_rate": 4.204939117616578e-06,
"loss": 0.5514,
"step": 597
},
{
"epoch": 0.53,
"grad_norm": 2.0537722291479583,
"learning_rate": 4.202382202273702e-06,
"loss": 0.5979,
"step": 598
},
{
"epoch": 0.53,
"grad_norm": 1.9695944675405062,
"learning_rate": 4.1998219621415035e-06,
"loss": 0.5519,
"step": 599
},
{
"epoch": 0.53,
"grad_norm": 2.1175148159531196,
"learning_rate": 4.197258402220187e-06,
"loss": 0.5437,
"step": 600
},
{
"epoch": 0.53,
"grad_norm": 1.9698920488340708,
"learning_rate": 4.19469152751644e-06,
"loss": 0.5765,
"step": 601
},
{
"epoch": 0.53,
"grad_norm": 1.879379971551763,
"learning_rate": 4.192121343043424e-06,
"loss": 0.5219,
"step": 602
},
{
"epoch": 0.53,
"grad_norm": 1.9668215341266202,
"learning_rate": 4.189547853820767e-06,
"loss": 0.4967,
"step": 603
},
{
"epoch": 0.53,
"grad_norm": 2.0264415648360723,
"learning_rate": 4.186971064874547e-06,
"loss": 0.5591,
"step": 604
},
{
"epoch": 0.54,
"grad_norm": 1.9996711001240413,
"learning_rate": 4.18439098123729e-06,
"loss": 0.5909,
"step": 605
},
{
"epoch": 0.54,
"grad_norm": 1.9209919754307736,
"learning_rate": 4.181807607947954e-06,
"loss": 0.5516,
"step": 606
},
{
"epoch": 0.54,
"grad_norm": 1.8120062816345244,
"learning_rate": 4.1792209500519245e-06,
"loss": 0.5112,
"step": 607
},
{
"epoch": 0.54,
"grad_norm": 1.9265993932694714,
"learning_rate": 4.176631012601e-06,
"loss": 0.5716,
"step": 608
},
{
"epoch": 0.54,
"grad_norm": 1.7951063568824173,
"learning_rate": 4.1740378006533835e-06,
"loss": 0.5546,
"step": 609
},
{
"epoch": 0.54,
"grad_norm": 1.9478736935670538,
"learning_rate": 4.1714413192736756e-06,
"loss": 0.5137,
"step": 610
},
{
"epoch": 0.54,
"grad_norm": 1.9166713700159672,
"learning_rate": 4.168841573532859e-06,
"loss": 0.5285,
"step": 611
},
{
"epoch": 0.54,
"grad_norm": 1.903061790874867,
"learning_rate": 4.166238568508294e-06,
"loss": 0.5643,
"step": 612
},
{
"epoch": 0.54,
"grad_norm": 1.8709574261812854,
"learning_rate": 4.1636323092837065e-06,
"loss": 0.5531,
"step": 613
},
{
"epoch": 0.54,
"grad_norm": 1.891374469060374,
"learning_rate": 4.161022800949177e-06,
"loss": 0.5386,
"step": 614
},
{
"epoch": 0.54,
"grad_norm": 1.8621023435008923,
"learning_rate": 4.1584100486011315e-06,
"loss": 0.5472,
"step": 615
},
{
"epoch": 0.55,
"grad_norm": 1.8927480615848256,
"learning_rate": 4.155794057342333e-06,
"loss": 0.567,
"step": 616
},
{
"epoch": 0.55,
"grad_norm": 1.9157957155248084,
"learning_rate": 4.153174832281867e-06,
"loss": 0.5295,
"step": 617
},
{
"epoch": 0.55,
"grad_norm": 1.7900976303440275,
"learning_rate": 4.150552378535138e-06,
"loss": 0.5374,
"step": 618
},
{
"epoch": 0.55,
"grad_norm": 1.9233860209522704,
"learning_rate": 4.1479267012238555e-06,
"loss": 0.5673,
"step": 619
},
{
"epoch": 0.55,
"grad_norm": 1.904244620695313,
"learning_rate": 4.145297805476023e-06,
"loss": 0.5674,
"step": 620
},
{
"epoch": 0.55,
"grad_norm": 1.8633100020518014,
"learning_rate": 4.142665696425932e-06,
"loss": 0.5717,
"step": 621
},
{
"epoch": 0.55,
"grad_norm": 2.0449274851229764,
"learning_rate": 4.140030379214147e-06,
"loss": 0.5382,
"step": 622
},
{
"epoch": 0.55,
"grad_norm": 1.8437126524936716,
"learning_rate": 4.137391858987502e-06,
"loss": 0.5635,
"step": 623
},
{
"epoch": 0.55,
"grad_norm": 1.9476300616110815,
"learning_rate": 4.134750140899082e-06,
"loss": 0.5354,
"step": 624
},
{
"epoch": 0.55,
"grad_norm": 1.8187836169409277,
"learning_rate": 4.132105230108221e-06,
"loss": 0.5678,
"step": 625
},
{
"epoch": 0.55,
"grad_norm": 1.8325255303792565,
"learning_rate": 4.1294571317804854e-06,
"loss": 0.5497,
"step": 626
},
{
"epoch": 0.56,
"grad_norm": 1.947073088948294,
"learning_rate": 4.12680585108767e-06,
"loss": 0.6005,
"step": 627
},
{
"epoch": 0.56,
"grad_norm": 1.9094602677105208,
"learning_rate": 4.1241513932077835e-06,
"loss": 0.5442,
"step": 628
},
{
"epoch": 0.56,
"grad_norm": 1.9308069577521967,
"learning_rate": 4.121493763325039e-06,
"loss": 0.4952,
"step": 629
},
{
"epoch": 0.56,
"grad_norm": 1.955225453108231,
"learning_rate": 4.118832966629847e-06,
"loss": 0.5161,
"step": 630
},
{
"epoch": 0.56,
"grad_norm": 1.8884686835300686,
"learning_rate": 4.116169008318798e-06,
"loss": 0.5834,
"step": 631
},
{
"epoch": 0.56,
"grad_norm": 1.851971220446282,
"learning_rate": 4.113501893594662e-06,
"loss": 0.5762,
"step": 632
},
{
"epoch": 0.56,
"grad_norm": 1.982231343732386,
"learning_rate": 4.110831627666372e-06,
"loss": 0.5043,
"step": 633
},
{
"epoch": 0.56,
"grad_norm": 1.8783480932058496,
"learning_rate": 4.108158215749014e-06,
"loss": 0.5202,
"step": 634
},
{
"epoch": 0.56,
"grad_norm": 1.7472053862830499,
"learning_rate": 4.105481663063821e-06,
"loss": 0.5064,
"step": 635
},
{
"epoch": 0.56,
"grad_norm": 4.71435326799849,
"learning_rate": 4.102801974838158e-06,
"loss": 0.5808,
"step": 636
},
{
"epoch": 0.56,
"grad_norm": 1.9383972995582568,
"learning_rate": 4.100119156305514e-06,
"loss": 0.5268,
"step": 637
},
{
"epoch": 0.57,
"grad_norm": 1.7165619283230378,
"learning_rate": 4.097433212705492e-06,
"loss": 0.5376,
"step": 638
},
{
"epoch": 0.57,
"grad_norm": 1.8524888535442023,
"learning_rate": 4.094744149283796e-06,
"loss": 0.5388,
"step": 639
},
{
"epoch": 0.57,
"grad_norm": 1.958121956311822,
"learning_rate": 4.092051971292228e-06,
"loss": 0.5273,
"step": 640
},
{
"epoch": 0.57,
"grad_norm": 1.8752806971174674,
"learning_rate": 4.089356683988668e-06,
"loss": 0.5283,
"step": 641
},
{
"epoch": 0.57,
"grad_norm": 2.4399117721583465,
"learning_rate": 4.086658292637072e-06,
"loss": 0.5643,
"step": 642
},
{
"epoch": 0.57,
"grad_norm": 1.897865148445396,
"learning_rate": 4.083956802507456e-06,
"loss": 0.5432,
"step": 643
},
{
"epoch": 0.57,
"grad_norm": 2.0947253224544826,
"learning_rate": 4.0812522188758874e-06,
"loss": 0.6738,
"step": 644
},
{
"epoch": 0.57,
"grad_norm": 1.8801252766945993,
"learning_rate": 4.078544547024479e-06,
"loss": 0.5516,
"step": 645
},
{
"epoch": 0.57,
"grad_norm": 1.884681207915535,
"learning_rate": 4.075833792241371e-06,
"loss": 0.5521,
"step": 646
},
{
"epoch": 0.57,
"grad_norm": 1.911314829964074,
"learning_rate": 4.073119959820728e-06,
"loss": 0.5279,
"step": 647
},
{
"epoch": 0.57,
"grad_norm": 1.860637117587055,
"learning_rate": 4.070403055062721e-06,
"loss": 0.5543,
"step": 648
},
{
"epoch": 0.57,
"grad_norm": 2.0453601596603157,
"learning_rate": 4.0676830832735245e-06,
"loss": 0.5757,
"step": 649
},
{
"epoch": 0.58,
"grad_norm": 1.8114060321351384,
"learning_rate": 4.064960049765304e-06,
"loss": 0.5049,
"step": 650
},
{
"epoch": 0.58,
"grad_norm": 1.959305167631277,
"learning_rate": 4.062233959856202e-06,
"loss": 0.5378,
"step": 651
},
{
"epoch": 0.58,
"grad_norm": 1.8509512649844786,
"learning_rate": 4.059504818870332e-06,
"loss": 0.5695,
"step": 652
},
{
"epoch": 0.58,
"grad_norm": 2.0120311393374677,
"learning_rate": 4.056772632137762e-06,
"loss": 0.5548,
"step": 653
},
{
"epoch": 0.58,
"grad_norm": 2.185006431209757,
"learning_rate": 4.054037404994516e-06,
"loss": 0.5796,
"step": 654
},
{
"epoch": 0.58,
"grad_norm": 1.8639659087725635,
"learning_rate": 4.05129914278255e-06,
"loss": 0.503,
"step": 655
},
{
"epoch": 0.58,
"grad_norm": 2.0128366658538726,
"learning_rate": 4.048557850849749e-06,
"loss": 0.5543,
"step": 656
},
{
"epoch": 0.58,
"grad_norm": 2.0493127075126467,
"learning_rate": 4.045813534549917e-06,
"loss": 0.5971,
"step": 657
},
{
"epoch": 0.58,
"grad_norm": 1.8943877873256292,
"learning_rate": 4.043066199242762e-06,
"loss": 0.5512,
"step": 658
},
{
"epoch": 0.58,
"grad_norm": 1.8607643797927613,
"learning_rate": 4.04031585029389e-06,
"loss": 0.5755,
"step": 659
},
{
"epoch": 0.58,
"grad_norm": 1.933467010931308,
"learning_rate": 4.037562493074792e-06,
"loss": 0.546,
"step": 660
},
{
"epoch": 0.59,
"grad_norm": 1.870898209604796,
"learning_rate": 4.034806132962834e-06,
"loss": 0.5101,
"step": 661
},
{
"epoch": 0.59,
"grad_norm": 1.7765005525064146,
"learning_rate": 4.032046775341247e-06,
"loss": 0.535,
"step": 662
},
{
"epoch": 0.59,
"grad_norm": 1.808388020113739,
"learning_rate": 4.029284425599116e-06,
"loss": 0.5532,
"step": 663
},
{
"epoch": 0.59,
"grad_norm": 1.9444426383785842,
"learning_rate": 4.026519089131371e-06,
"loss": 0.5804,
"step": 664
},
{
"epoch": 0.59,
"grad_norm": 1.8810929458792174,
"learning_rate": 4.023750771338774e-06,
"loss": 0.5023,
"step": 665
},
{
"epoch": 0.59,
"grad_norm": 1.7587173598023012,
"learning_rate": 4.020979477627907e-06,
"loss": 0.588,
"step": 666
},
{
"epoch": 0.59,
"grad_norm": 1.8616544736960938,
"learning_rate": 4.018205213411169e-06,
"loss": 0.5604,
"step": 667
},
{
"epoch": 0.59,
"grad_norm": 1.8517363531329913,
"learning_rate": 4.015427984106759e-06,
"loss": 0.5503,
"step": 668
},
{
"epoch": 0.59,
"grad_norm": 1.7164279131663547,
"learning_rate": 4.012647795138664e-06,
"loss": 0.5353,
"step": 669
},
{
"epoch": 0.59,
"grad_norm": 1.8490922932257532,
"learning_rate": 4.009864651936653e-06,
"loss": 0.5527,
"step": 670
},
{
"epoch": 0.59,
"grad_norm": 1.9222471762582807,
"learning_rate": 4.007078559936268e-06,
"loss": 0.5449,
"step": 671
},
{
"epoch": 0.6,
"grad_norm": 1.7126406752680576,
"learning_rate": 4.0042895245788035e-06,
"loss": 0.5102,
"step": 672
},
{
"epoch": 0.6,
"grad_norm": 1.7999692875631594,
"learning_rate": 4.001497551311308e-06,
"loss": 0.514,
"step": 673
},
{
"epoch": 0.6,
"grad_norm": 1.8482521644616647,
"learning_rate": 3.998702645586565e-06,
"loss": 0.546,
"step": 674
},
{
"epoch": 0.6,
"grad_norm": 1.8124842120343776,
"learning_rate": 3.995904812863086e-06,
"loss": 0.5432,
"step": 675
},
{
"epoch": 0.6,
"grad_norm": 1.9053654350943952,
"learning_rate": 3.993104058605099e-06,
"loss": 0.6222,
"step": 676
},
{
"epoch": 0.6,
"grad_norm": 1.851530834120678,
"learning_rate": 3.9903003882825396e-06,
"loss": 0.5069,
"step": 677
},
{
"epoch": 0.6,
"grad_norm": 1.824612938648448,
"learning_rate": 3.987493807371033e-06,
"loss": 0.5279,
"step": 678
},
{
"epoch": 0.6,
"grad_norm": 1.8322983038942529,
"learning_rate": 3.984684321351895e-06,
"loss": 0.504,
"step": 679
},
{
"epoch": 0.6,
"grad_norm": 2.1601679247075105,
"learning_rate": 3.981871935712112e-06,
"loss": 0.5448,
"step": 680
},
{
"epoch": 0.6,
"grad_norm": 1.9324323412240167,
"learning_rate": 3.979056655944335e-06,
"loss": 0.5696,
"step": 681
},
{
"epoch": 0.6,
"grad_norm": 1.8887222870071794,
"learning_rate": 3.9762384875468645e-06,
"loss": 0.5147,
"step": 682
},
{
"epoch": 0.6,
"grad_norm": 1.9025483031058836,
"learning_rate": 3.973417436023646e-06,
"loss": 0.5322,
"step": 683
},
{
"epoch": 0.61,
"grad_norm": 1.944754689874286,
"learning_rate": 3.970593506884254e-06,
"loss": 0.564,
"step": 684
},
{
"epoch": 0.61,
"grad_norm": 1.8782062559948918,
"learning_rate": 3.9677667056438824e-06,
"loss": 0.5179,
"step": 685
},
{
"epoch": 0.61,
"grad_norm": 1.7615090001622373,
"learning_rate": 3.964937037823337e-06,
"loss": 0.52,
"step": 686
},
{
"epoch": 0.61,
"grad_norm": 1.877979446527034,
"learning_rate": 3.962104508949018e-06,
"loss": 0.5611,
"step": 687
},
{
"epoch": 0.61,
"grad_norm": 1.8668900126580097,
"learning_rate": 3.9592691245529174e-06,
"loss": 0.5398,
"step": 688
},
{
"epoch": 0.61,
"grad_norm": 2.0467424748632395,
"learning_rate": 3.9564308901726016e-06,
"loss": 0.5429,
"step": 689
},
{
"epoch": 0.61,
"grad_norm": 1.7523480652481473,
"learning_rate": 3.9535898113512046e-06,
"loss": 0.5456,
"step": 690
},
{
"epoch": 0.61,
"grad_norm": 1.9384307177445268,
"learning_rate": 3.950745893637414e-06,
"loss": 0.5298,
"step": 691
},
{
"epoch": 0.61,
"grad_norm": 2.0200307543606266,
"learning_rate": 3.947899142585464e-06,
"loss": 0.5813,
"step": 692
},
{
"epoch": 0.61,
"grad_norm": 1.8825594318661294,
"learning_rate": 3.945049563755119e-06,
"loss": 0.5843,
"step": 693
},
{
"epoch": 0.61,
"grad_norm": 1.801304483173922,
"learning_rate": 3.94219716271167e-06,
"loss": 0.5332,
"step": 694
},
{
"epoch": 0.62,
"grad_norm": 1.789336412692842,
"learning_rate": 3.939341945025918e-06,
"loss": 0.5712,
"step": 695
},
{
"epoch": 0.62,
"grad_norm": 1.6764596672056864,
"learning_rate": 3.936483916274163e-06,
"loss": 0.5471,
"step": 696
},
{
"epoch": 0.62,
"grad_norm": 1.8160991340297739,
"learning_rate": 3.933623082038199e-06,
"loss": 0.5172,
"step": 697
},
{
"epoch": 0.62,
"grad_norm": 1.9958719154660882,
"learning_rate": 3.930759447905298e-06,
"loss": 0.5243,
"step": 698
},
{
"epoch": 0.62,
"grad_norm": 1.7844190098902166,
"learning_rate": 3.927893019468196e-06,
"loss": 0.5679,
"step": 699
},
{
"epoch": 0.62,
"grad_norm": 1.8231700761644845,
"learning_rate": 3.925023802325094e-06,
"loss": 0.5415,
"step": 700
},
{
"epoch": 0.62,
"grad_norm": 1.8577751348591511,
"learning_rate": 3.922151802079633e-06,
"loss": 0.5451,
"step": 701
},
{
"epoch": 0.62,
"grad_norm": 1.872268020286279,
"learning_rate": 3.919277024340891e-06,
"loss": 0.5805,
"step": 702
},
{
"epoch": 0.62,
"grad_norm": 1.956916033214976,
"learning_rate": 3.916399474723373e-06,
"loss": 0.5142,
"step": 703
},
{
"epoch": 0.62,
"grad_norm": 1.8690696320721123,
"learning_rate": 3.913519158846994e-06,
"loss": 0.5377,
"step": 704
},
{
"epoch": 0.62,
"grad_norm": 1.8932224298053513,
"learning_rate": 3.910636082337076e-06,
"loss": 0.5174,
"step": 705
},
{
"epoch": 0.63,
"grad_norm": 1.7671002724508906,
"learning_rate": 3.907750250824327e-06,
"loss": 0.5227,
"step": 706
},
{
"epoch": 0.63,
"grad_norm": 1.8537234882936333,
"learning_rate": 3.904861669944839e-06,
"loss": 0.5672,
"step": 707
},
{
"epoch": 0.63,
"grad_norm": 1.8993796687475375,
"learning_rate": 3.901970345340075e-06,
"loss": 0.5131,
"step": 708
},
{
"epoch": 0.63,
"grad_norm": 1.8118617206389966,
"learning_rate": 3.899076282656853e-06,
"loss": 0.5243,
"step": 709
},
{
"epoch": 0.63,
"grad_norm": 1.8195324114535576,
"learning_rate": 3.89617948754734e-06,
"loss": 0.5255,
"step": 710
},
{
"epoch": 0.63,
"grad_norm": 1.777076552111516,
"learning_rate": 3.89327996566904e-06,
"loss": 0.5482,
"step": 711
},
{
"epoch": 0.63,
"grad_norm": 1.7960584295638569,
"learning_rate": 3.890377722684782e-06,
"loss": 0.5232,
"step": 712
},
{
"epoch": 0.63,
"grad_norm": 2.0180517293259777,
"learning_rate": 3.887472764262709e-06,
"loss": 0.4988,
"step": 713
},
{
"epoch": 0.63,
"grad_norm": 1.7698597985590767,
"learning_rate": 3.884565096076269e-06,
"loss": 0.4934,
"step": 714
},
{
"epoch": 0.63,
"grad_norm": 1.9593013419554524,
"learning_rate": 3.8816547238042e-06,
"loss": 0.554,
"step": 715
},
{
"epoch": 0.63,
"grad_norm": 1.803176799671639,
"learning_rate": 3.878741653130521e-06,
"loss": 0.5058,
"step": 716
},
{
"epoch": 0.64,
"grad_norm": 1.8739139669777212,
"learning_rate": 3.875825889744525e-06,
"loss": 0.5291,
"step": 717
},
{
"epoch": 0.64,
"grad_norm": 1.7425957572489872,
"learning_rate": 3.872907439340758e-06,
"loss": 0.5132,
"step": 718
},
{
"epoch": 0.64,
"grad_norm": 1.7880023308134785,
"learning_rate": 3.86998630761902e-06,
"loss": 0.5388,
"step": 719
},
{
"epoch": 0.64,
"grad_norm": 2.035324802689225,
"learning_rate": 3.867062500284342e-06,
"loss": 0.5225,
"step": 720
},
{
"epoch": 0.64,
"grad_norm": 1.7720228048563502,
"learning_rate": 3.864136023046984e-06,
"loss": 0.5535,
"step": 721
},
{
"epoch": 0.64,
"grad_norm": 1.893636721431615,
"learning_rate": 3.861206881622419e-06,
"loss": 0.5445,
"step": 722
},
{
"epoch": 0.64,
"grad_norm": 1.9975882991420841,
"learning_rate": 3.8582750817313245e-06,
"loss": 0.498,
"step": 723
},
{
"epoch": 0.64,
"grad_norm": 1.8894358056153195,
"learning_rate": 3.855340629099568e-06,
"loss": 0.5262,
"step": 724
},
{
"epoch": 0.64,
"grad_norm": 1.8226831631189866,
"learning_rate": 3.852403529458199e-06,
"loss": 0.5289,
"step": 725
},
{
"epoch": 0.64,
"grad_norm": 1.9219589460322386,
"learning_rate": 3.84946378854344e-06,
"loss": 0.5828,
"step": 726
},
{
"epoch": 0.64,
"grad_norm": 1.9524000874112546,
"learning_rate": 3.846521412096665e-06,
"loss": 0.5755,
"step": 727
},
{
"epoch": 0.64,
"grad_norm": 1.7855988589662195,
"learning_rate": 3.8435764058643994e-06,
"loss": 0.508,
"step": 728
},
{
"epoch": 0.65,
"grad_norm": 1.7556968697529176,
"learning_rate": 3.840628775598306e-06,
"loss": 0.5038,
"step": 729
},
{
"epoch": 0.65,
"grad_norm": 1.8615629845007688,
"learning_rate": 3.837678527055168e-06,
"loss": 0.5658,
"step": 730
},
{
"epoch": 0.65,
"grad_norm": 3.355106616980178,
"learning_rate": 3.834725665996889e-06,
"loss": 0.6255,
"step": 731
},
{
"epoch": 0.65,
"grad_norm": 2.057901705133853,
"learning_rate": 3.8317701981904655e-06,
"loss": 0.5009,
"step": 732
},
{
"epoch": 0.65,
"grad_norm": 1.8144866213511652,
"learning_rate": 3.828812129407994e-06,
"loss": 0.5378,
"step": 733
},
{
"epoch": 0.65,
"grad_norm": 1.895740992214761,
"learning_rate": 3.825851465426643e-06,
"loss": 0.5414,
"step": 734
},
{
"epoch": 0.65,
"grad_norm": 1.7690202691648218,
"learning_rate": 3.822888212028658e-06,
"loss": 0.5782,
"step": 735
},
{
"epoch": 0.65,
"grad_norm": 1.9910212850942313,
"learning_rate": 3.819922375001334e-06,
"loss": 0.538,
"step": 736
},
{
"epoch": 0.65,
"grad_norm": 2.022977401775343,
"learning_rate": 3.816953960137017e-06,
"loss": 0.5265,
"step": 737
},
{
"epoch": 0.65,
"grad_norm": 2.18942238369997,
"learning_rate": 3.8139829732330833e-06,
"loss": 0.5419,
"step": 738
},
{
"epoch": 0.65,
"grad_norm": 2.0143145051916487,
"learning_rate": 3.8110094200919356e-06,
"loss": 0.5396,
"step": 739
},
{
"epoch": 0.66,
"grad_norm": 1.8684895296380082,
"learning_rate": 3.8080333065209885e-06,
"loss": 0.5285,
"step": 740
},
{
"epoch": 0.66,
"grad_norm": 1.899758991227905,
"learning_rate": 3.8050546383326546e-06,
"loss": 0.5392,
"step": 741
},
{
"epoch": 0.66,
"grad_norm": 1.7830347822365242,
"learning_rate": 3.8020734213443392e-06,
"loss": 0.5395,
"step": 742
},
{
"epoch": 0.66,
"grad_norm": 1.9688219937316351,
"learning_rate": 3.799089661378423e-06,
"loss": 0.5832,
"step": 743
},
{
"epoch": 0.66,
"grad_norm": 1.8380061964557934,
"learning_rate": 3.7961033642622536e-06,
"loss": 0.5182,
"step": 744
},
{
"epoch": 0.66,
"grad_norm": 1.9752769027783192,
"learning_rate": 3.793114535828134e-06,
"loss": 0.5189,
"step": 745
},
{
"epoch": 0.66,
"grad_norm": 1.9908258845677271,
"learning_rate": 3.7901231819133104e-06,
"loss": 0.5863,
"step": 746
},
{
"epoch": 0.66,
"grad_norm": 1.8419144313470388,
"learning_rate": 3.787129308359963e-06,
"loss": 0.5596,
"step": 747
},
{
"epoch": 0.66,
"grad_norm": 1.8578409208981632,
"learning_rate": 3.7841329210151905e-06,
"loss": 0.5757,
"step": 748
},
{
"epoch": 0.66,
"grad_norm": 1.8125362585272666,
"learning_rate": 3.7811340257310036e-06,
"loss": 0.5625,
"step": 749
},
{
"epoch": 0.66,
"grad_norm": 1.8266843142853604,
"learning_rate": 3.778132628364309e-06,
"loss": 0.5121,
"step": 750
},
{
"epoch": 0.67,
"grad_norm": 1.9286747700189457,
"learning_rate": 3.7751287347769006e-06,
"loss": 0.5856,
"step": 751
},
{
"epoch": 0.67,
"grad_norm": 1.8358169963837994,
"learning_rate": 3.772122350835447e-06,
"loss": 0.5363,
"step": 752
},
{
"epoch": 0.67,
"grad_norm": 1.8751145280860322,
"learning_rate": 3.769113482411483e-06,
"loss": 0.5435,
"step": 753
},
{
"epoch": 0.67,
"grad_norm": 1.7372022137266947,
"learning_rate": 3.766102135381393e-06,
"loss": 0.5114,
"step": 754
}
],
"logging_steps": 1,
"max_steps": 2258,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 377,
"total_flos": 355094809804800.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}