kimnamssya's picture
Upload folder using huggingface_hub
4163d10 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9789156626506024,
"eval_steps": 83,
"global_step": 664,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030120481927710845,
"grad_norm": 0.06898010522127151,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.5032,
"step": 1
},
{
"epoch": 0.0030120481927710845,
"eval_loss": 0.5438461303710938,
"eval_runtime": 48.0664,
"eval_samples_per_second": 7.656,
"eval_steps_per_second": 0.957,
"step": 1
},
{
"epoch": 0.006024096385542169,
"grad_norm": 0.0645361989736557,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.465,
"step": 2
},
{
"epoch": 0.009036144578313253,
"grad_norm": 0.09009082615375519,
"learning_rate": 3e-06,
"loss": 0.5036,
"step": 3
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.10703085362911224,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5503,
"step": 4
},
{
"epoch": 0.015060240963855422,
"grad_norm": 0.09233544766902924,
"learning_rate": 5e-06,
"loss": 0.5636,
"step": 5
},
{
"epoch": 0.018072289156626505,
"grad_norm": 0.07796537131071091,
"learning_rate": 6e-06,
"loss": 0.4586,
"step": 6
},
{
"epoch": 0.02108433734939759,
"grad_norm": 0.09727340191602707,
"learning_rate": 7e-06,
"loss": 0.5768,
"step": 7
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.0706663504242897,
"learning_rate": 8.000000000000001e-06,
"loss": 0.3726,
"step": 8
},
{
"epoch": 0.02710843373493976,
"grad_norm": 0.07420889288187027,
"learning_rate": 9e-06,
"loss": 0.449,
"step": 9
},
{
"epoch": 0.030120481927710843,
"grad_norm": 0.07959474623203278,
"learning_rate": 1e-05,
"loss": 0.3981,
"step": 10
},
{
"epoch": 0.03313253012048193,
"grad_norm": 0.08843814581632614,
"learning_rate": 9.999942312273667e-06,
"loss": 0.558,
"step": 11
},
{
"epoch": 0.03614457831325301,
"grad_norm": 0.10731809586286545,
"learning_rate": 9.999769250425817e-06,
"loss": 0.7307,
"step": 12
},
{
"epoch": 0.0391566265060241,
"grad_norm": 0.10979519039392471,
"learning_rate": 9.999480818449868e-06,
"loss": 0.4859,
"step": 13
},
{
"epoch": 0.04216867469879518,
"grad_norm": 0.09655233472585678,
"learning_rate": 9.999077023001411e-06,
"loss": 0.5583,
"step": 14
},
{
"epoch": 0.045180722891566265,
"grad_norm": 0.07842206954956055,
"learning_rate": 9.998557873398066e-06,
"loss": 0.4338,
"step": 15
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.07478363066911697,
"learning_rate": 9.997923381619257e-06,
"loss": 0.4124,
"step": 16
},
{
"epoch": 0.05120481927710843,
"grad_norm": 0.0887669026851654,
"learning_rate": 9.997173562305937e-06,
"loss": 0.4183,
"step": 17
},
{
"epoch": 0.05421686746987952,
"grad_norm": 0.10326416045427322,
"learning_rate": 9.996308432760257e-06,
"loss": 0.6075,
"step": 18
},
{
"epoch": 0.0572289156626506,
"grad_norm": 0.06556811183691025,
"learning_rate": 9.995328012945158e-06,
"loss": 0.364,
"step": 19
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.13618294894695282,
"learning_rate": 9.994232325483917e-06,
"loss": 0.6527,
"step": 20
},
{
"epoch": 0.06325301204819277,
"grad_norm": 0.07842332124710083,
"learning_rate": 9.99302139565962e-06,
"loss": 0.4065,
"step": 21
},
{
"epoch": 0.06626506024096386,
"grad_norm": 0.10683953016996384,
"learning_rate": 9.991695251414584e-06,
"loss": 0.614,
"step": 22
},
{
"epoch": 0.06927710843373494,
"grad_norm": 0.09143894165754318,
"learning_rate": 9.990253923349706e-06,
"loss": 0.4691,
"step": 23
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.08288835734128952,
"learning_rate": 9.988697444723763e-06,
"loss": 0.3832,
"step": 24
},
{
"epoch": 0.07530120481927711,
"grad_norm": 0.11008255183696747,
"learning_rate": 9.98702585145264e-06,
"loss": 0.6614,
"step": 25
},
{
"epoch": 0.0783132530120482,
"grad_norm": 0.10311611741781235,
"learning_rate": 9.9852391821085e-06,
"loss": 0.6724,
"step": 26
},
{
"epoch": 0.08132530120481928,
"grad_norm": 0.0974637120962143,
"learning_rate": 9.983337477918904e-06,
"loss": 0.4878,
"step": 27
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.09819214791059494,
"learning_rate": 9.981320782765847e-06,
"loss": 0.4209,
"step": 28
},
{
"epoch": 0.08734939759036145,
"grad_norm": 0.08742973208427429,
"learning_rate": 9.97918914318475e-06,
"loss": 0.4899,
"step": 29
},
{
"epoch": 0.09036144578313253,
"grad_norm": 0.06961015611886978,
"learning_rate": 9.976942608363394e-06,
"loss": 0.3108,
"step": 30
},
{
"epoch": 0.09337349397590361,
"grad_norm": 0.0967026874423027,
"learning_rate": 9.97458123014077e-06,
"loss": 0.4222,
"step": 31
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.08042623102664948,
"learning_rate": 9.972105063005895e-06,
"loss": 0.3722,
"step": 32
},
{
"epoch": 0.09939759036144578,
"grad_norm": 0.10365046560764313,
"learning_rate": 9.969514164096548e-06,
"loss": 0.4997,
"step": 33
},
{
"epoch": 0.10240963855421686,
"grad_norm": 0.1059841588139534,
"learning_rate": 9.966808593197959e-06,
"loss": 0.5668,
"step": 34
},
{
"epoch": 0.10542168674698796,
"grad_norm": 0.11545271426439285,
"learning_rate": 9.96398841274142e-06,
"loss": 0.5842,
"step": 35
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.07890575379133224,
"learning_rate": 9.96105368780285e-06,
"loss": 0.4241,
"step": 36
},
{
"epoch": 0.11144578313253012,
"grad_norm": 0.09370716661214828,
"learning_rate": 9.958004486101293e-06,
"loss": 0.5054,
"step": 37
},
{
"epoch": 0.1144578313253012,
"grad_norm": 0.08521121740341187,
"learning_rate": 9.954840877997356e-06,
"loss": 0.3894,
"step": 38
},
{
"epoch": 0.11746987951807229,
"grad_norm": 0.09335027635097504,
"learning_rate": 9.95156293649158e-06,
"loss": 0.4202,
"step": 39
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.0888521671295166,
"learning_rate": 9.948170737222763e-06,
"loss": 0.4229,
"step": 40
},
{
"epoch": 0.12349397590361445,
"grad_norm": 0.08825533837080002,
"learning_rate": 9.94466435846621e-06,
"loss": 0.525,
"step": 41
},
{
"epoch": 0.12650602409638553,
"grad_norm": 0.08048392087221146,
"learning_rate": 9.941043881131928e-06,
"loss": 0.4733,
"step": 42
},
{
"epoch": 0.12951807228915663,
"grad_norm": 0.08944156765937805,
"learning_rate": 9.93730938876276e-06,
"loss": 0.4739,
"step": 43
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.08353747427463531,
"learning_rate": 9.933460967532454e-06,
"loss": 0.4571,
"step": 44
},
{
"epoch": 0.1355421686746988,
"grad_norm": 0.07594406604766846,
"learning_rate": 9.929498706243681e-06,
"loss": 0.4573,
"step": 45
},
{
"epoch": 0.13855421686746988,
"grad_norm": 0.07401393353939056,
"learning_rate": 9.925422696325976e-06,
"loss": 0.3839,
"step": 46
},
{
"epoch": 0.14156626506024098,
"grad_norm": 0.08534356951713562,
"learning_rate": 9.921233031833639e-06,
"loss": 0.4649,
"step": 47
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.08551020175218582,
"learning_rate": 9.916929809443555e-06,
"loss": 0.4472,
"step": 48
},
{
"epoch": 0.14759036144578314,
"grad_norm": 0.09304791688919067,
"learning_rate": 9.912513128452974e-06,
"loss": 0.4685,
"step": 49
},
{
"epoch": 0.15060240963855423,
"grad_norm": 0.0794030949473381,
"learning_rate": 9.907983090777206e-06,
"loss": 0.4555,
"step": 50
},
{
"epoch": 0.1536144578313253,
"grad_norm": 0.07851307094097137,
"learning_rate": 9.903339800947284e-06,
"loss": 0.3596,
"step": 51
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.09233438223600388,
"learning_rate": 9.898583366107539e-06,
"loss": 0.5195,
"step": 52
},
{
"epoch": 0.15963855421686746,
"grad_norm": 0.06730569899082184,
"learning_rate": 9.893713896013134e-06,
"loss": 0.3624,
"step": 53
},
{
"epoch": 0.16265060240963855,
"grad_norm": 0.07352066785097122,
"learning_rate": 9.888731503027535e-06,
"loss": 0.3612,
"step": 54
},
{
"epoch": 0.16566265060240964,
"grad_norm": 0.08405814319849014,
"learning_rate": 9.883636302119911e-06,
"loss": 0.3556,
"step": 55
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.08859913796186447,
"learning_rate": 9.878428410862484e-06,
"loss": 0.5018,
"step": 56
},
{
"epoch": 0.1716867469879518,
"grad_norm": 0.06710238754749298,
"learning_rate": 9.873107949427815e-06,
"loss": 0.4708,
"step": 57
},
{
"epoch": 0.1746987951807229,
"grad_norm": 0.09144071489572525,
"learning_rate": 9.867675040586035e-06,
"loss": 0.6071,
"step": 58
},
{
"epoch": 0.17771084337349397,
"grad_norm": 0.08027662336826324,
"learning_rate": 9.862129809702006e-06,
"loss": 0.3936,
"step": 59
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.08805692195892334,
"learning_rate": 9.856472384732432e-06,
"loss": 0.5415,
"step": 60
},
{
"epoch": 0.18373493975903615,
"grad_norm": 0.08526882529258728,
"learning_rate": 9.850702896222908e-06,
"loss": 0.4938,
"step": 61
},
{
"epoch": 0.18674698795180722,
"grad_norm": 0.08791948109865189,
"learning_rate": 9.844821477304904e-06,
"loss": 0.5543,
"step": 62
},
{
"epoch": 0.1897590361445783,
"grad_norm": 0.08400929719209671,
"learning_rate": 9.838828263692693e-06,
"loss": 0.457,
"step": 63
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.0857481062412262,
"learning_rate": 9.832723393680222e-06,
"loss": 0.4335,
"step": 64
},
{
"epoch": 0.19578313253012047,
"grad_norm": 0.08857893198728561,
"learning_rate": 9.826507008137919e-06,
"loss": 0.4833,
"step": 65
},
{
"epoch": 0.19879518072289157,
"grad_norm": 0.11243699491024017,
"learning_rate": 9.820179250509442e-06,
"loss": 0.4892,
"step": 66
},
{
"epoch": 0.20180722891566266,
"grad_norm": 0.07048942148685455,
"learning_rate": 9.813740266808375e-06,
"loss": 0.3947,
"step": 67
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.09442531317472458,
"learning_rate": 9.807190205614847e-06,
"loss": 0.5209,
"step": 68
},
{
"epoch": 0.20783132530120482,
"grad_norm": 0.07926708459854126,
"learning_rate": 9.800529218072112e-06,
"loss": 0.4134,
"step": 69
},
{
"epoch": 0.21084337349397592,
"grad_norm": 0.08361215889453888,
"learning_rate": 9.793757457883062e-06,
"loss": 0.5141,
"step": 70
},
{
"epoch": 0.21385542168674698,
"grad_norm": 0.0780983418226242,
"learning_rate": 9.786875081306677e-06,
"loss": 0.3984,
"step": 71
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.0805901288986206,
"learning_rate": 9.779882247154419e-06,
"loss": 0.4594,
"step": 72
},
{
"epoch": 0.21987951807228914,
"grad_norm": 0.08035779744386673,
"learning_rate": 9.772779116786568e-06,
"loss": 0.4807,
"step": 73
},
{
"epoch": 0.22289156626506024,
"grad_norm": 0.09436261653900146,
"learning_rate": 9.765565854108503e-06,
"loss": 0.5547,
"step": 74
},
{
"epoch": 0.22590361445783133,
"grad_norm": 0.08481397479772568,
"learning_rate": 9.758242625566912e-06,
"loss": 0.4781,
"step": 75
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.07799296826124191,
"learning_rate": 9.750809600145955e-06,
"loss": 0.5435,
"step": 76
},
{
"epoch": 0.2319277108433735,
"grad_norm": 0.1005324274301529,
"learning_rate": 9.743266949363368e-06,
"loss": 0.4334,
"step": 77
},
{
"epoch": 0.23493975903614459,
"grad_norm": 0.06958253681659698,
"learning_rate": 9.735614847266502e-06,
"loss": 0.4439,
"step": 78
},
{
"epoch": 0.23795180722891565,
"grad_norm": 0.06568962335586548,
"learning_rate": 9.727853470428301e-06,
"loss": 0.2761,
"step": 79
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.08228959888219833,
"learning_rate": 9.719982997943245e-06,
"loss": 0.4604,
"step": 80
},
{
"epoch": 0.24397590361445784,
"grad_norm": 0.07239473611116409,
"learning_rate": 9.712003611423194e-06,
"loss": 0.4126,
"step": 81
},
{
"epoch": 0.2469879518072289,
"grad_norm": 0.06457886099815369,
"learning_rate": 9.703915494993215e-06,
"loss": 0.2551,
"step": 82
},
{
"epoch": 0.25,
"grad_norm": 0.07501790672540665,
"learning_rate": 9.695718835287328e-06,
"loss": 0.5372,
"step": 83
},
{
"epoch": 0.25,
"eval_loss": 0.4660322368144989,
"eval_runtime": 48.1631,
"eval_samples_per_second": 7.641,
"eval_steps_per_second": 0.955,
"step": 83
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.09980539232492447,
"learning_rate": 9.6874138214442e-06,
"loss": 0.4397,
"step": 84
},
{
"epoch": 0.2560240963855422,
"grad_norm": 0.09662894904613495,
"learning_rate": 9.679000645102771e-06,
"loss": 0.4799,
"step": 85
},
{
"epoch": 0.25903614457831325,
"grad_norm": 0.0800192579627037,
"learning_rate": 9.670479500397854e-06,
"loss": 0.3729,
"step": 86
},
{
"epoch": 0.2620481927710843,
"grad_norm": 0.09520886838436127,
"learning_rate": 9.66185058395563e-06,
"loss": 0.5556,
"step": 87
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.06621432304382324,
"learning_rate": 9.653114094889128e-06,
"loss": 0.2844,
"step": 88
},
{
"epoch": 0.2680722891566265,
"grad_norm": 0.07093362510204315,
"learning_rate": 9.644270234793625e-06,
"loss": 0.3263,
"step": 89
},
{
"epoch": 0.2710843373493976,
"grad_norm": 0.1954663097858429,
"learning_rate": 9.63531920774199e-06,
"loss": 0.5507,
"step": 90
},
{
"epoch": 0.2740963855421687,
"grad_norm": 0.0809064731001854,
"learning_rate": 9.62626122027999e-06,
"loss": 0.2653,
"step": 91
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.08081785589456558,
"learning_rate": 9.617096481421498e-06,
"loss": 0.393,
"step": 92
},
{
"epoch": 0.28012048192771083,
"grad_norm": 0.060243554413318634,
"learning_rate": 9.607825202643696e-06,
"loss": 0.2177,
"step": 93
},
{
"epoch": 0.28313253012048195,
"grad_norm": 0.07686587423086166,
"learning_rate": 9.598447597882181e-06,
"loss": 0.2648,
"step": 94
},
{
"epoch": 0.286144578313253,
"grad_norm": 0.10320460796356201,
"learning_rate": 9.588963883526033e-06,
"loss": 0.5048,
"step": 95
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.07446248829364777,
"learning_rate": 9.579374278412819e-06,
"loss": 0.4118,
"step": 96
},
{
"epoch": 0.2921686746987952,
"grad_norm": 0.08173944056034088,
"learning_rate": 9.569679003823542e-06,
"loss": 0.399,
"step": 97
},
{
"epoch": 0.29518072289156627,
"grad_norm": 0.07631703466176987,
"learning_rate": 9.559878283477546e-06,
"loss": 0.4263,
"step": 98
},
{
"epoch": 0.29819277108433734,
"grad_norm": 0.08103561401367188,
"learning_rate": 9.549972343527336e-06,
"loss": 0.3837,
"step": 99
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.08016617596149445,
"learning_rate": 9.539961412553375e-06,
"loss": 0.2906,
"step": 100
},
{
"epoch": 0.3042168674698795,
"grad_norm": 0.08368028700351715,
"learning_rate": 9.529845721558802e-06,
"loss": 0.329,
"step": 101
},
{
"epoch": 0.3072289156626506,
"grad_norm": 0.07581895589828491,
"learning_rate": 9.5196255039641e-06,
"loss": 0.3386,
"step": 102
},
{
"epoch": 0.3102409638554217,
"grad_norm": 0.07931499183177948,
"learning_rate": 9.50930099560172e-06,
"loss": 0.3775,
"step": 103
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.07715684920549393,
"learning_rate": 9.498872434710624e-06,
"loss": 0.2493,
"step": 104
},
{
"epoch": 0.31626506024096385,
"grad_norm": 0.15848775207996368,
"learning_rate": 9.488340061930797e-06,
"loss": 0.6623,
"step": 105
},
{
"epoch": 0.3192771084337349,
"grad_norm": 0.064506396651268,
"learning_rate": 9.477704120297698e-06,
"loss": 0.2924,
"step": 106
},
{
"epoch": 0.32228915662650603,
"grad_norm": 0.08634113520383835,
"learning_rate": 9.46696485523664e-06,
"loss": 0.4908,
"step": 107
},
{
"epoch": 0.3253012048192771,
"grad_norm": 0.09926196932792664,
"learning_rate": 9.45612251455714e-06,
"loss": 0.5368,
"step": 108
},
{
"epoch": 0.32831325301204817,
"grad_norm": 0.09239736199378967,
"learning_rate": 9.445177348447187e-06,
"loss": 0.4221,
"step": 109
},
{
"epoch": 0.3313253012048193,
"grad_norm": 0.09763186424970627,
"learning_rate": 9.434129609467484e-06,
"loss": 0.3754,
"step": 110
},
{
"epoch": 0.33433734939759036,
"grad_norm": 0.08639013767242432,
"learning_rate": 9.422979552545604e-06,
"loss": 0.4451,
"step": 111
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.08338082581758499,
"learning_rate": 9.411727434970121e-06,
"loss": 0.3597,
"step": 112
},
{
"epoch": 0.34036144578313254,
"grad_norm": 0.08375409245491028,
"learning_rate": 9.400373516384671e-06,
"loss": 0.3512,
"step": 113
},
{
"epoch": 0.3433734939759036,
"grad_norm": 0.08984484523534775,
"learning_rate": 9.388918058781947e-06,
"loss": 0.4637,
"step": 114
},
{
"epoch": 0.3463855421686747,
"grad_norm": 0.10151129961013794,
"learning_rate": 9.377361326497673e-06,
"loss": 0.4863,
"step": 115
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.0721992552280426,
"learning_rate": 9.365703586204495e-06,
"loss": 0.3994,
"step": 116
},
{
"epoch": 0.35240963855421686,
"grad_norm": 0.06449025869369507,
"learning_rate": 9.353945106905822e-06,
"loss": 0.3332,
"step": 117
},
{
"epoch": 0.35542168674698793,
"grad_norm": 0.11437945812940598,
"learning_rate": 9.342086159929629e-06,
"loss": 0.4353,
"step": 118
},
{
"epoch": 0.35843373493975905,
"grad_norm": 0.09777141362428665,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3581,
"step": 119
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.09215305745601654,
"learning_rate": 9.318067959841776e-06,
"loss": 0.4457,
"step": 120
},
{
"epoch": 0.3644578313253012,
"grad_norm": 0.10612743347883224,
"learning_rate": 9.305909260952255e-06,
"loss": 0.5848,
"step": 121
},
{
"epoch": 0.3674698795180723,
"grad_norm": 0.0931277871131897,
"learning_rate": 9.29365120281671e-06,
"loss": 0.3868,
"step": 122
},
{
"epoch": 0.3704819277108434,
"grad_norm": 0.08951377868652344,
"learning_rate": 9.28129406829094e-06,
"loss": 0.4827,
"step": 123
},
{
"epoch": 0.37349397590361444,
"grad_norm": 0.08375540375709534,
"learning_rate": 9.268838142516943e-06,
"loss": 0.4715,
"step": 124
},
{
"epoch": 0.37650602409638556,
"grad_norm": 0.0971704050898552,
"learning_rate": 9.256283712916337e-06,
"loss": 0.3541,
"step": 125
},
{
"epoch": 0.3795180722891566,
"grad_norm": 0.06618267297744751,
"learning_rate": 9.24363106918372e-06,
"loss": 0.2523,
"step": 126
},
{
"epoch": 0.3825301204819277,
"grad_norm": 0.09525061398744583,
"learning_rate": 9.230880503279991e-06,
"loss": 0.4596,
"step": 127
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.11402564495801926,
"learning_rate": 9.218032309425613e-06,
"loss": 0.4864,
"step": 128
},
{
"epoch": 0.3885542168674699,
"grad_norm": 0.0729977935552597,
"learning_rate": 9.205086784093823e-06,
"loss": 0.3729,
"step": 129
},
{
"epoch": 0.39156626506024095,
"grad_norm": 0.11046875268220901,
"learning_rate": 9.19204422600379e-06,
"loss": 0.4042,
"step": 130
},
{
"epoch": 0.39457831325301207,
"grad_norm": 0.07796311378479004,
"learning_rate": 9.178904936113719e-06,
"loss": 0.3511,
"step": 131
},
{
"epoch": 0.39759036144578314,
"grad_norm": 0.07394801825284958,
"learning_rate": 9.165669217613919e-06,
"loss": 0.3633,
"step": 132
},
{
"epoch": 0.4006024096385542,
"grad_norm": 0.07786203175783157,
"learning_rate": 9.152337375919792e-06,
"loss": 0.2815,
"step": 133
},
{
"epoch": 0.4036144578313253,
"grad_norm": 0.0837029293179512,
"learning_rate": 9.138909718664788e-06,
"loss": 0.3982,
"step": 134
},
{
"epoch": 0.4066265060240964,
"grad_norm": 0.08676737546920776,
"learning_rate": 9.125386555693316e-06,
"loss": 0.2748,
"step": 135
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.07859157770872116,
"learning_rate": 9.111768199053588e-06,
"loss": 0.3965,
"step": 136
},
{
"epoch": 0.4126506024096386,
"grad_norm": 0.09692444652318954,
"learning_rate": 9.098054962990415e-06,
"loss": 0.4141,
"step": 137
},
{
"epoch": 0.41566265060240964,
"grad_norm": 0.07770588248968124,
"learning_rate": 9.084247163937959e-06,
"loss": 0.2752,
"step": 138
},
{
"epoch": 0.4186746987951807,
"grad_norm": 0.09689480066299438,
"learning_rate": 9.070345120512436e-06,
"loss": 0.3829,
"step": 139
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.0891459733247757,
"learning_rate": 9.056349153504753e-06,
"loss": 0.3809,
"step": 140
},
{
"epoch": 0.4246987951807229,
"grad_norm": 0.08335975557565689,
"learning_rate": 9.042259585873119e-06,
"loss": 0.3436,
"step": 141
},
{
"epoch": 0.42771084337349397,
"grad_norm": 0.1099216416478157,
"learning_rate": 9.028076742735583e-06,
"loss": 0.327,
"step": 142
},
{
"epoch": 0.4307228915662651,
"grad_norm": 0.1145530566573143,
"learning_rate": 9.013800951362532e-06,
"loss": 0.5582,
"step": 143
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.09491799771785736,
"learning_rate": 8.999432541169145e-06,
"loss": 0.4046,
"step": 144
},
{
"epoch": 0.4367469879518072,
"grad_norm": 0.09914596378803253,
"learning_rate": 8.984971843707787e-06,
"loss": 0.414,
"step": 145
},
{
"epoch": 0.4397590361445783,
"grad_norm": 0.09991983324289322,
"learning_rate": 8.970419192660366e-06,
"loss": 0.4991,
"step": 146
},
{
"epoch": 0.4427710843373494,
"grad_norm": 0.1129307746887207,
"learning_rate": 8.955774923830618e-06,
"loss": 0.4938,
"step": 147
},
{
"epoch": 0.4457831325301205,
"grad_norm": 0.11100436747074127,
"learning_rate": 8.94103937513637e-06,
"loss": 0.5161,
"step": 148
},
{
"epoch": 0.44879518072289154,
"grad_norm": 0.10246367007493973,
"learning_rate": 8.92621288660175e-06,
"loss": 0.3699,
"step": 149
},
{
"epoch": 0.45180722891566266,
"grad_norm": 0.08281731605529785,
"learning_rate": 8.911295800349316e-06,
"loss": 0.3351,
"step": 150
},
{
"epoch": 0.45481927710843373,
"grad_norm": 0.09455256164073944,
"learning_rate": 8.896288460592187e-06,
"loss": 0.4403,
"step": 151
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.10226281732320786,
"learning_rate": 8.881191213626084e-06,
"loss": 0.4367,
"step": 152
},
{
"epoch": 0.4608433734939759,
"grad_norm": 0.0877789556980133,
"learning_rate": 8.86600440782135e-06,
"loss": 0.3356,
"step": 153
},
{
"epoch": 0.463855421686747,
"grad_norm": 0.12456001341342926,
"learning_rate": 8.850728393614903e-06,
"loss": 0.4589,
"step": 154
},
{
"epoch": 0.46686746987951805,
"grad_norm": 0.09067565202713013,
"learning_rate": 8.835363523502154e-06,
"loss": 0.42,
"step": 155
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.08902376890182495,
"learning_rate": 8.819910152028872e-06,
"loss": 0.3528,
"step": 156
},
{
"epoch": 0.47289156626506024,
"grad_norm": 0.11373434960842133,
"learning_rate": 8.804368635783002e-06,
"loss": 0.3973,
"step": 157
},
{
"epoch": 0.4759036144578313,
"grad_norm": 0.08251000195741653,
"learning_rate": 8.788739333386443e-06,
"loss": 0.3475,
"step": 158
},
{
"epoch": 0.4789156626506024,
"grad_norm": 0.09951098263263702,
"learning_rate": 8.773022605486755e-06,
"loss": 0.307,
"step": 159
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.14891928434371948,
"learning_rate": 8.75721881474886e-06,
"loss": 0.5982,
"step": 160
},
{
"epoch": 0.48493975903614456,
"grad_norm": 0.09601758420467377,
"learning_rate": 8.741328325846663e-06,
"loss": 0.3029,
"step": 161
},
{
"epoch": 0.4879518072289157,
"grad_norm": 0.10120223462581635,
"learning_rate": 8.725351505454631e-06,
"loss": 0.4484,
"step": 162
},
{
"epoch": 0.49096385542168675,
"grad_norm": 0.09667719900608063,
"learning_rate": 8.709288722239345e-06,
"loss": 0.3746,
"step": 163
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.09432260692119598,
"learning_rate": 8.693140346850975e-06,
"loss": 0.3193,
"step": 164
},
{
"epoch": 0.49698795180722893,
"grad_norm": 0.08489949256181717,
"learning_rate": 8.67690675191475e-06,
"loss": 0.3274,
"step": 165
},
{
"epoch": 0.5,
"grad_norm": 0.11130422353744507,
"learning_rate": 8.660588312022345e-06,
"loss": 0.4527,
"step": 166
},
{
"epoch": 0.5,
"eval_loss": 0.42669200897216797,
"eval_runtime": 48.0943,
"eval_samples_per_second": 7.652,
"eval_steps_per_second": 0.956,
"step": 166
},
{
"epoch": 0.5030120481927711,
"grad_norm": 0.12064981460571289,
"learning_rate": 8.644185403723231e-06,
"loss": 0.4115,
"step": 167
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.06795509904623032,
"learning_rate": 8.627698405516007e-06,
"loss": 0.1872,
"step": 168
},
{
"epoch": 0.5090361445783133,
"grad_norm": 0.08577800542116165,
"learning_rate": 8.611127697839649e-06,
"loss": 0.3129,
"step": 169
},
{
"epoch": 0.5120481927710844,
"grad_norm": 0.10260018706321716,
"learning_rate": 8.594473663064735e-06,
"loss": 0.4587,
"step": 170
},
{
"epoch": 0.5150602409638554,
"grad_norm": 0.07991928607225418,
"learning_rate": 8.577736685484626e-06,
"loss": 0.2316,
"step": 171
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.12778519093990326,
"learning_rate": 8.560917151306594e-06,
"loss": 0.5422,
"step": 172
},
{
"epoch": 0.5210843373493976,
"grad_norm": 0.11797113716602325,
"learning_rate": 8.544015448642916e-06,
"loss": 0.5094,
"step": 173
},
{
"epoch": 0.5240963855421686,
"grad_norm": 0.09844504296779633,
"learning_rate": 8.527031967501906e-06,
"loss": 0.3694,
"step": 174
},
{
"epoch": 0.5271084337349398,
"grad_norm": 0.09397323429584503,
"learning_rate": 8.509967099778934e-06,
"loss": 0.3101,
"step": 175
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.10693401098251343,
"learning_rate": 8.492821239247365e-06,
"loss": 0.3144,
"step": 176
},
{
"epoch": 0.5331325301204819,
"grad_norm": 0.12896175682544708,
"learning_rate": 8.475594781549483e-06,
"loss": 0.2638,
"step": 177
},
{
"epoch": 0.536144578313253,
"grad_norm": 0.08528363704681396,
"learning_rate": 8.45828812418736e-06,
"loss": 0.3587,
"step": 178
},
{
"epoch": 0.5391566265060241,
"grad_norm": 0.12003973871469498,
"learning_rate": 8.44090166651368e-06,
"loss": 0.4367,
"step": 179
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.09943649917840958,
"learning_rate": 8.42343580972253e-06,
"loss": 0.4068,
"step": 180
},
{
"epoch": 0.5451807228915663,
"grad_norm": 0.09416316449642181,
"learning_rate": 8.405890956840136e-06,
"loss": 0.3894,
"step": 181
},
{
"epoch": 0.5481927710843374,
"grad_norm": 0.11709215492010117,
"learning_rate": 8.388267512715565e-06,
"loss": 0.3775,
"step": 182
},
{
"epoch": 0.5512048192771084,
"grad_norm": 0.1207321509718895,
"learning_rate": 8.370565884011389e-06,
"loss": 0.3226,
"step": 183
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.07701164484024048,
"learning_rate": 8.352786479194288e-06,
"loss": 0.2533,
"step": 184
},
{
"epoch": 0.5572289156626506,
"grad_norm": 0.10769670456647873,
"learning_rate": 8.33492970852564e-06,
"loss": 0.3303,
"step": 185
},
{
"epoch": 0.5602409638554217,
"grad_norm": 0.11196043342351913,
"learning_rate": 8.316995984052048e-06,
"loss": 0.3671,
"step": 186
},
{
"epoch": 0.5632530120481928,
"grad_norm": 0.10518307983875275,
"learning_rate": 8.298985719595824e-06,
"loss": 0.2897,
"step": 187
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.09793493896722794,
"learning_rate": 8.280899330745452e-06,
"loss": 0.3379,
"step": 188
},
{
"epoch": 0.5692771084337349,
"grad_norm": 0.10353343933820724,
"learning_rate": 8.262737234845993e-06,
"loss": 0.3022,
"step": 189
},
{
"epoch": 0.572289156626506,
"grad_norm": 0.13275578618049622,
"learning_rate": 8.244499850989453e-06,
"loss": 0.4683,
"step": 190
},
{
"epoch": 0.5753012048192772,
"grad_norm": 0.12100367993116379,
"learning_rate": 8.226187600005116e-06,
"loss": 0.3862,
"step": 191
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.10842237621545792,
"learning_rate": 8.207800904449829e-06,
"loss": 0.4358,
"step": 192
},
{
"epoch": 0.5813253012048193,
"grad_norm": 0.08900940418243408,
"learning_rate": 8.189340188598263e-06,
"loss": 0.2957,
"step": 193
},
{
"epoch": 0.5843373493975904,
"grad_norm": 0.1361563801765442,
"learning_rate": 8.1708058784331e-06,
"loss": 0.5122,
"step": 194
},
{
"epoch": 0.5873493975903614,
"grad_norm": 0.09149997681379318,
"learning_rate": 8.15219840163523e-06,
"loss": 0.2789,
"step": 195
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.12989595532417297,
"learning_rate": 8.133518187573864e-06,
"loss": 0.358,
"step": 196
},
{
"epoch": 0.5933734939759037,
"grad_norm": 0.09390953183174133,
"learning_rate": 8.114765667296628e-06,
"loss": 0.2678,
"step": 197
},
{
"epoch": 0.5963855421686747,
"grad_norm": 0.10648205876350403,
"learning_rate": 8.095941273519634e-06,
"loss": 0.4228,
"step": 198
},
{
"epoch": 0.5993975903614458,
"grad_norm": 0.10480794310569763,
"learning_rate": 8.077045440617465e-06,
"loss": 0.2776,
"step": 199
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.1166502833366394,
"learning_rate": 8.058078604613178e-06,
"loss": 0.4289,
"step": 200
},
{
"epoch": 0.6054216867469879,
"grad_norm": 0.13281071186065674,
"learning_rate": 8.039041203168233e-06,
"loss": 0.3458,
"step": 201
},
{
"epoch": 0.608433734939759,
"grad_norm": 0.12321053445339203,
"learning_rate": 8.019933675572389e-06,
"loss": 0.3823,
"step": 202
},
{
"epoch": 0.6114457831325302,
"grad_norm": 0.10648687928915024,
"learning_rate": 8.000756462733577e-06,
"loss": 0.2839,
"step": 203
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.09772239625453949,
"learning_rate": 7.981510007167719e-06,
"loss": 0.3649,
"step": 204
},
{
"epoch": 0.6174698795180723,
"grad_norm": 0.09530483931303024,
"learning_rate": 7.962194752988519e-06,
"loss": 0.4029,
"step": 205
},
{
"epoch": 0.6204819277108434,
"grad_norm": 0.14029258489608765,
"learning_rate": 7.942811145897215e-06,
"loss": 0.4089,
"step": 206
},
{
"epoch": 0.6234939759036144,
"grad_norm": 0.10488390922546387,
"learning_rate": 7.923359633172299e-06,
"loss": 0.3588,
"step": 207
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.11913249641656876,
"learning_rate": 7.903840663659186e-06,
"loss": 0.3846,
"step": 208
},
{
"epoch": 0.6295180722891566,
"grad_norm": 0.09327327460050583,
"learning_rate": 7.884254687759863e-06,
"loss": 0.2904,
"step": 209
},
{
"epoch": 0.6325301204819277,
"grad_norm": 0.1013420820236206,
"learning_rate": 7.864602157422501e-06,
"loss": 0.3217,
"step": 210
},
{
"epoch": 0.6355421686746988,
"grad_norm": 0.12041863054037094,
"learning_rate": 7.844883526131014e-06,
"loss": 0.4139,
"step": 211
},
{
"epoch": 0.6385542168674698,
"grad_norm": 0.1831393837928772,
"learning_rate": 7.8250992488946e-06,
"loss": 0.4607,
"step": 212
},
{
"epoch": 0.641566265060241,
"grad_norm": 0.12515975534915924,
"learning_rate": 7.805249782237256e-06,
"loss": 0.473,
"step": 213
},
{
"epoch": 0.6445783132530121,
"grad_norm": 0.10324941575527191,
"learning_rate": 7.78533558418722e-06,
"loss": 0.3054,
"step": 214
},
{
"epoch": 0.6475903614457831,
"grad_norm": 0.1663861721754074,
"learning_rate": 7.765357114266409e-06,
"loss": 0.3492,
"step": 215
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.09673499315977097,
"learning_rate": 7.745314833479834e-06,
"loss": 0.2971,
"step": 216
},
{
"epoch": 0.6536144578313253,
"grad_norm": 0.11750275641679764,
"learning_rate": 7.72520920430493e-06,
"loss": 0.3796,
"step": 217
},
{
"epoch": 0.6566265060240963,
"grad_norm": 0.11695467680692673,
"learning_rate": 7.705040690680915e-06,
"loss": 0.4032,
"step": 218
},
{
"epoch": 0.6596385542168675,
"grad_norm": 0.18078581988811493,
"learning_rate": 7.684809757998066e-06,
"loss": 0.4728,
"step": 219
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.16273823380470276,
"learning_rate": 7.664516873086987e-06,
"loss": 0.3374,
"step": 220
},
{
"epoch": 0.6656626506024096,
"grad_norm": 0.09355267137289047,
"learning_rate": 7.644162504207834e-06,
"loss": 0.3066,
"step": 221
},
{
"epoch": 0.6686746987951807,
"grad_norm": 0.1773396134376526,
"learning_rate": 7.623747121039512e-06,
"loss": 0.4394,
"step": 222
},
{
"epoch": 0.6716867469879518,
"grad_norm": 0.11587468534708023,
"learning_rate": 7.603271194668835e-06,
"loss": 0.3592,
"step": 223
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.13333013653755188,
"learning_rate": 7.582735197579657e-06,
"loss": 0.4295,
"step": 224
},
{
"epoch": 0.677710843373494,
"grad_norm": 0.1107691079378128,
"learning_rate": 7.562139603641971e-06,
"loss": 0.3213,
"step": 225
},
{
"epoch": 0.6807228915662651,
"grad_norm": 0.1352744698524475,
"learning_rate": 7.541484888100974e-06,
"loss": 0.448,
"step": 226
},
{
"epoch": 0.6837349397590361,
"grad_norm": 0.11878973990678787,
"learning_rate": 7.520771527566093e-06,
"loss": 0.3696,
"step": 227
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.35187989473342896,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4887,
"step": 228
},
{
"epoch": 0.6897590361445783,
"grad_norm": 0.1278284788131714,
"learning_rate": 7.479170784707574e-06,
"loss": 0.4926,
"step": 229
},
{
"epoch": 0.6927710843373494,
"grad_norm": 0.15527869760990143,
"learning_rate": 7.458284362324844e-06,
"loss": 0.436,
"step": 230
},
{
"epoch": 0.6957831325301205,
"grad_norm": 0.11231625825166702,
"learning_rate": 7.437341214807895e-06,
"loss": 0.3645,
"step": 231
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.1510346382856369,
"learning_rate": 7.416341825421755e-06,
"loss": 0.3642,
"step": 232
},
{
"epoch": 0.7018072289156626,
"grad_norm": 0.11185728013515472,
"learning_rate": 7.395286678729232e-06,
"loss": 0.4493,
"step": 233
},
{
"epoch": 0.7048192771084337,
"grad_norm": 0.1314469277858734,
"learning_rate": 7.374176260579746e-06,
"loss": 0.3682,
"step": 234
},
{
"epoch": 0.7078313253012049,
"grad_norm": 0.11538910865783691,
"learning_rate": 7.353011058098104e-06,
"loss": 0.376,
"step": 235
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.11839091032743454,
"learning_rate": 7.33179155967327e-06,
"loss": 0.3509,
"step": 236
},
{
"epoch": 0.713855421686747,
"grad_norm": 0.1360180377960205,
"learning_rate": 7.310518254947092e-06,
"loss": 0.3303,
"step": 237
},
{
"epoch": 0.7168674698795181,
"grad_norm": 0.1287102997303009,
"learning_rate": 7.289191634803002e-06,
"loss": 0.3966,
"step": 238
},
{
"epoch": 0.7198795180722891,
"grad_norm": 0.11669423431158066,
"learning_rate": 7.267812191354691e-06,
"loss": 0.3606,
"step": 239
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.12020015716552734,
"learning_rate": 7.246380417934752e-06,
"loss": 0.325,
"step": 240
},
{
"epoch": 0.7259036144578314,
"grad_norm": 0.10611787438392639,
"learning_rate": 7.224896809083297e-06,
"loss": 0.3202,
"step": 241
},
{
"epoch": 0.7289156626506024,
"grad_norm": 0.13962885737419128,
"learning_rate": 7.203361860536544e-06,
"loss": 0.5206,
"step": 242
},
{
"epoch": 0.7319277108433735,
"grad_norm": 0.12559789419174194,
"learning_rate": 7.181776069215382e-06,
"loss": 0.346,
"step": 243
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.1341724544763565,
"learning_rate": 7.160139933213899e-06,
"loss": 0.4553,
"step": 244
},
{
"epoch": 0.7379518072289156,
"grad_norm": 0.1436809003353119,
"learning_rate": 7.138453951787894e-06,
"loss": 0.4602,
"step": 245
},
{
"epoch": 0.7409638554216867,
"grad_norm": 0.11017356067895889,
"learning_rate": 7.1167186253433474e-06,
"loss": 0.3412,
"step": 246
},
{
"epoch": 0.7439759036144579,
"grad_norm": 0.1423342525959015,
"learning_rate": 7.094934455424889e-06,
"loss": 0.4518,
"step": 247
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.17157544195652008,
"learning_rate": 7.073101944704209e-06,
"loss": 0.3862,
"step": 248
},
{
"epoch": 0.75,
"grad_norm": 0.12028568983078003,
"learning_rate": 7.051221596968471e-06,
"loss": 0.364,
"step": 249
},
{
"epoch": 0.75,
"eval_loss": 0.3997645974159241,
"eval_runtime": 48.0252,
"eval_samples_per_second": 7.663,
"eval_steps_per_second": 0.958,
"step": 249
},
{
"epoch": 0.7530120481927711,
"grad_norm": 0.11746224015951157,
"learning_rate": 7.029293917108678e-06,
"loss": 0.2695,
"step": 250
},
{
"epoch": 0.7560240963855421,
"grad_norm": 0.11289820075035095,
"learning_rate": 7.0073194111080315e-06,
"loss": 0.2859,
"step": 251
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.1417943686246872,
"learning_rate": 6.985298586030241e-06,
"loss": 0.3099,
"step": 252
},
{
"epoch": 0.7620481927710844,
"grad_norm": 0.1480390578508377,
"learning_rate": 6.963231950007845e-06,
"loss": 0.4117,
"step": 253
},
{
"epoch": 0.7650602409638554,
"grad_norm": 0.13671045005321503,
"learning_rate": 6.941120012230464e-06,
"loss": 0.3894,
"step": 254
},
{
"epoch": 0.7680722891566265,
"grad_norm": 0.1379365772008896,
"learning_rate": 6.918963282933063e-06,
"loss": 0.3894,
"step": 255
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.13510343432426453,
"learning_rate": 6.896762273384179e-06,
"loss": 0.4063,
"step": 256
},
{
"epoch": 0.7740963855421686,
"grad_norm": 0.10843163728713989,
"learning_rate": 6.8745174958741164e-06,
"loss": 0.2694,
"step": 257
},
{
"epoch": 0.7771084337349398,
"grad_norm": 0.1143522784113884,
"learning_rate": 6.852229463703131e-06,
"loss": 0.3443,
"step": 258
},
{
"epoch": 0.7801204819277109,
"grad_norm": 0.1380804479122162,
"learning_rate": 6.829898691169581e-06,
"loss": 0.358,
"step": 259
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.08340010046958923,
"learning_rate": 6.8075256935580655e-06,
"loss": 0.1889,
"step": 260
},
{
"epoch": 0.786144578313253,
"grad_norm": 0.13955867290496826,
"learning_rate": 6.78511098712753e-06,
"loss": 0.3529,
"step": 261
},
{
"epoch": 0.7891566265060241,
"grad_norm": 0.13800843060016632,
"learning_rate": 6.762655089099353e-06,
"loss": 0.3343,
"step": 262
},
{
"epoch": 0.7921686746987951,
"grad_norm": 0.1281561404466629,
"learning_rate": 6.740158517645418e-06,
"loss": 0.3782,
"step": 263
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.11828935146331787,
"learning_rate": 6.717621791876147e-06,
"loss": 0.3209,
"step": 264
},
{
"epoch": 0.7981927710843374,
"grad_norm": 0.1325826644897461,
"learning_rate": 6.695045431828524e-06,
"loss": 0.3042,
"step": 265
},
{
"epoch": 0.8012048192771084,
"grad_norm": 0.16716520488262177,
"learning_rate": 6.672429958454103e-06,
"loss": 0.4642,
"step": 266
},
{
"epoch": 0.8042168674698795,
"grad_norm": 0.1317368745803833,
"learning_rate": 6.649775893606982e-06,
"loss": 0.4527,
"step": 267
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.10963856428861618,
"learning_rate": 6.627083760031755e-06,
"loss": 0.2991,
"step": 268
},
{
"epoch": 0.8102409638554217,
"grad_norm": 0.11349374055862427,
"learning_rate": 6.604354081351461e-06,
"loss": 0.2648,
"step": 269
},
{
"epoch": 0.8132530120481928,
"grad_norm": 0.14128448069095612,
"learning_rate": 6.5815873820554925e-06,
"loss": 0.3344,
"step": 270
},
{
"epoch": 0.8162650602409639,
"grad_norm": 0.11427152901887894,
"learning_rate": 6.558784187487495e-06,
"loss": 0.2597,
"step": 271
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.13017497956752777,
"learning_rate": 6.535945023833249e-06,
"loss": 0.3016,
"step": 272
},
{
"epoch": 0.822289156626506,
"grad_norm": 0.11172367632389069,
"learning_rate": 6.513070418108525e-06,
"loss": 0.3508,
"step": 273
},
{
"epoch": 0.8253012048192772,
"grad_norm": 0.1552492380142212,
"learning_rate": 6.490160898146919e-06,
"loss": 0.4058,
"step": 274
},
{
"epoch": 0.8283132530120482,
"grad_norm": 0.18851318955421448,
"learning_rate": 6.467216992587679e-06,
"loss": 0.5397,
"step": 275
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.13263428211212158,
"learning_rate": 6.444239230863505e-06,
"loss": 0.3756,
"step": 276
},
{
"epoch": 0.8343373493975904,
"grad_norm": 0.15950614213943481,
"learning_rate": 6.421228143188325e-06,
"loss": 0.3658,
"step": 277
},
{
"epoch": 0.8373493975903614,
"grad_norm": 0.15785080194473267,
"learning_rate": 6.398184260545072e-06,
"loss": 0.5645,
"step": 278
},
{
"epoch": 0.8403614457831325,
"grad_norm": 0.16358067095279694,
"learning_rate": 6.375108114673425e-06,
"loss": 0.4303,
"step": 279
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.14203619956970215,
"learning_rate": 6.3520002380575395e-06,
"loss": 0.3486,
"step": 280
},
{
"epoch": 0.8463855421686747,
"grad_norm": 0.1213625818490982,
"learning_rate": 6.32886116391376e-06,
"loss": 0.3574,
"step": 281
},
{
"epoch": 0.8493975903614458,
"grad_norm": 0.12014158070087433,
"learning_rate": 6.305691426178316e-06,
"loss": 0.3042,
"step": 282
},
{
"epoch": 0.8524096385542169,
"grad_norm": 0.1291957050561905,
"learning_rate": 6.282491559495005e-06,
"loss": 0.3114,
"step": 283
},
{
"epoch": 0.8554216867469879,
"grad_norm": 0.174629345536232,
"learning_rate": 6.259262099202849e-06,
"loss": 0.4871,
"step": 284
},
{
"epoch": 0.858433734939759,
"grad_norm": 0.12753546237945557,
"learning_rate": 6.23600358132375e-06,
"loss": 0.2086,
"step": 285
},
{
"epoch": 0.8614457831325302,
"grad_norm": 0.13646778464317322,
"learning_rate": 6.212716542550112e-06,
"loss": 0.329,
"step": 286
},
{
"epoch": 0.8644578313253012,
"grad_norm": 0.15520715713500977,
"learning_rate": 6.189401520232464e-06,
"loss": 0.2789,
"step": 287
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.14368513226509094,
"learning_rate": 6.166059052367055e-06,
"loss": 0.3816,
"step": 288
},
{
"epoch": 0.8704819277108434,
"grad_norm": 0.19336989521980286,
"learning_rate": 6.142689677583447e-06,
"loss": 0.3221,
"step": 289
},
{
"epoch": 0.8734939759036144,
"grad_norm": 0.15267544984817505,
"learning_rate": 6.119293935132076e-06,
"loss": 0.3729,
"step": 290
},
{
"epoch": 0.8765060240963856,
"grad_norm": 0.1431257575750351,
"learning_rate": 6.095872364871818e-06,
"loss": 0.3342,
"step": 291
},
{
"epoch": 0.8795180722891566,
"grad_norm": 0.16153433918952942,
"learning_rate": 6.072425507257528e-06,
"loss": 0.5453,
"step": 292
},
{
"epoch": 0.8825301204819277,
"grad_norm": 0.12874870002269745,
"learning_rate": 6.048953903327568e-06,
"loss": 0.388,
"step": 293
},
{
"epoch": 0.8855421686746988,
"grad_norm": 0.2061174362897873,
"learning_rate": 6.025458094691323e-06,
"loss": 0.4939,
"step": 294
},
{
"epoch": 0.8885542168674698,
"grad_norm": 0.10293994843959808,
"learning_rate": 6.0019386235167055e-06,
"loss": 0.255,
"step": 295
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.12315797805786133,
"learning_rate": 5.978396032517641e-06,
"loss": 0.3047,
"step": 296
},
{
"epoch": 0.8945783132530121,
"grad_norm": 0.12810122966766357,
"learning_rate": 5.9548308649415486e-06,
"loss": 0.3818,
"step": 297
},
{
"epoch": 0.8975903614457831,
"grad_norm": 0.13862572610378265,
"learning_rate": 5.931243664556803e-06,
"loss": 0.4595,
"step": 298
},
{
"epoch": 0.9006024096385542,
"grad_norm": 0.1489073932170868,
"learning_rate": 5.90763497564019e-06,
"loss": 0.5371,
"step": 299
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.15324267745018005,
"learning_rate": 5.884005342964343e-06,
"loss": 0.3644,
"step": 300
},
{
"epoch": 0.9066265060240963,
"grad_norm": 0.5843567252159119,
"learning_rate": 5.860355311785175e-06,
"loss": 0.3839,
"step": 301
},
{
"epoch": 0.9096385542168675,
"grad_norm": 0.1056603267788887,
"learning_rate": 5.836685427829296e-06,
"loss": 0.3114,
"step": 302
},
{
"epoch": 0.9126506024096386,
"grad_norm": 0.1388939917087555,
"learning_rate": 5.812996237281423e-06,
"loss": 0.3585,
"step": 303
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.1273452639579773,
"learning_rate": 5.7892882867717705e-06,
"loss": 0.3699,
"step": 304
},
{
"epoch": 0.9186746987951807,
"grad_norm": 0.15796081721782684,
"learning_rate": 5.765562123363445e-06,
"loss": 0.319,
"step": 305
},
{
"epoch": 0.9216867469879518,
"grad_norm": 0.15159745514392853,
"learning_rate": 5.7418182945398136e-06,
"loss": 0.4136,
"step": 306
},
{
"epoch": 0.9246987951807228,
"grad_norm": 0.11746060848236084,
"learning_rate": 5.718057348191874e-06,
"loss": 0.3404,
"step": 307
},
{
"epoch": 0.927710843373494,
"grad_norm": 0.13623090088367462,
"learning_rate": 5.6942798326056205e-06,
"loss": 0.2705,
"step": 308
},
{
"epoch": 0.9307228915662651,
"grad_norm": 0.16252101957798004,
"learning_rate": 5.670486296449373e-06,
"loss": 0.4949,
"step": 309
},
{
"epoch": 0.9337349397590361,
"grad_norm": 0.1467500627040863,
"learning_rate": 5.646677288761132e-06,
"loss": 0.437,
"step": 310
},
{
"epoch": 0.9367469879518072,
"grad_norm": 0.13205009698867798,
"learning_rate": 5.622853358935908e-06,
"loss": 0.3296,
"step": 311
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.15937268733978271,
"learning_rate": 5.599015056713037e-06,
"loss": 0.3097,
"step": 312
},
{
"epoch": 0.9427710843373494,
"grad_norm": 0.14837417006492615,
"learning_rate": 5.575162932163501e-06,
"loss": 0.2976,
"step": 313
},
{
"epoch": 0.9457831325301205,
"grad_norm": 0.1272152066230774,
"learning_rate": 5.551297535677236e-06,
"loss": 0.2688,
"step": 314
},
{
"epoch": 0.9487951807228916,
"grad_norm": 0.15668314695358276,
"learning_rate": 5.527419417950424e-06,
"loss": 0.3275,
"step": 315
},
{
"epoch": 0.9518072289156626,
"grad_norm": 0.16511787474155426,
"learning_rate": 5.503529129972792e-06,
"loss": 0.4056,
"step": 316
},
{
"epoch": 0.9548192771084337,
"grad_norm": 0.16846664249897003,
"learning_rate": 5.479627223014902e-06,
"loss": 0.413,
"step": 317
},
{
"epoch": 0.9578313253012049,
"grad_norm": 0.13430540263652802,
"learning_rate": 5.455714248615417e-06,
"loss": 0.3085,
"step": 318
},
{
"epoch": 0.9608433734939759,
"grad_norm": 0.13739560544490814,
"learning_rate": 5.431790758568388e-06,
"loss": 0.3388,
"step": 319
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.15051551163196564,
"learning_rate": 5.4078573049105135e-06,
"loss": 0.3835,
"step": 320
},
{
"epoch": 0.9668674698795181,
"grad_norm": 0.13513195514678955,
"learning_rate": 5.383914439908403e-06,
"loss": 0.2506,
"step": 321
},
{
"epoch": 0.9698795180722891,
"grad_norm": 0.17084147036075592,
"learning_rate": 5.359962716045836e-06,
"loss": 0.4237,
"step": 322
},
{
"epoch": 0.9728915662650602,
"grad_norm": 0.154204323887825,
"learning_rate": 5.336002686011007e-06,
"loss": 0.3441,
"step": 323
},
{
"epoch": 0.9759036144578314,
"grad_norm": 0.14598777890205383,
"learning_rate": 5.312034902683779e-06,
"loss": 0.3549,
"step": 324
},
{
"epoch": 0.9789156626506024,
"grad_norm": 0.14227163791656494,
"learning_rate": 5.288059919122922e-06,
"loss": 0.3104,
"step": 325
},
{
"epoch": 0.9819277108433735,
"grad_norm": 0.11683394759893417,
"learning_rate": 5.2640782885533515e-06,
"loss": 0.2984,
"step": 326
},
{
"epoch": 0.9849397590361446,
"grad_norm": 0.2088451385498047,
"learning_rate": 5.240090564353365e-06,
"loss": 0.5566,
"step": 327
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.15885204076766968,
"learning_rate": 5.21609730004187e-06,
"loss": 0.397,
"step": 328
},
{
"epoch": 0.9909638554216867,
"grad_norm": 0.12600690126419067,
"learning_rate": 5.1920990492656135e-06,
"loss": 0.2872,
"step": 329
},
{
"epoch": 0.9939759036144579,
"grad_norm": 0.12452724575996399,
"learning_rate": 5.168096365786402e-06,
"loss": 0.2311,
"step": 330
},
{
"epoch": 0.9969879518072289,
"grad_norm": 0.2103230059146881,
"learning_rate": 5.144089803468333e-06,
"loss": 0.3784,
"step": 331
},
{
"epoch": 1.0,
"grad_norm": 0.15288111567497253,
"learning_rate": 5.1200799162650035e-06,
"loss": 0.3378,
"step": 332
},
{
"epoch": 1.0,
"eval_loss": 0.38105618953704834,
"eval_runtime": 47.9441,
"eval_samples_per_second": 7.676,
"eval_steps_per_second": 0.959,
"step": 332
},
{
"epoch": 1.0030120481927711,
"grad_norm": 0.1693660169839859,
"learning_rate": 5.096067258206735e-06,
"loss": 0.3782,
"step": 333
},
{
"epoch": 1.0060240963855422,
"grad_norm": 0.13185366988182068,
"learning_rate": 5.072052383387787e-06,
"loss": 0.4386,
"step": 334
},
{
"epoch": 1.0090361445783131,
"grad_norm": 0.2009076029062271,
"learning_rate": 5.048035845953569e-06,
"loss": 0.5079,
"step": 335
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.13641564548015594,
"learning_rate": 5.024018200087855e-06,
"loss": 0.2156,
"step": 336
},
{
"epoch": 1.0150602409638554,
"grad_norm": 0.162125363945961,
"learning_rate": 5e-06,
"loss": 0.2569,
"step": 337
},
{
"epoch": 1.0180722891566265,
"grad_norm": 0.14947426319122314,
"learning_rate": 4.975981799912147e-06,
"loss": 0.2846,
"step": 338
},
{
"epoch": 1.0210843373493976,
"grad_norm": 0.1346963495016098,
"learning_rate": 4.951964154046432e-06,
"loss": 0.3302,
"step": 339
},
{
"epoch": 1.0030120481927711,
"grad_norm": 0.17184017598628998,
"learning_rate": 4.927947616612216e-06,
"loss": 0.4225,
"step": 340
},
{
"epoch": 1.0060240963855422,
"grad_norm": 0.10629676282405853,
"learning_rate": 4.903932741793266e-06,
"loss": 0.2626,
"step": 341
},
{
"epoch": 1.0090361445783131,
"grad_norm": 0.11881528049707413,
"learning_rate": 4.879920083734997e-06,
"loss": 0.2189,
"step": 342
},
{
"epoch": 1.0120481927710843,
"grad_norm": 0.1317717581987381,
"learning_rate": 4.855910196531669e-06,
"loss": 0.247,
"step": 343
},
{
"epoch": 1.0150602409638554,
"grad_norm": 0.1618192493915558,
"learning_rate": 4.8319036342135985e-06,
"loss": 0.2762,
"step": 344
},
{
"epoch": 1.0180722891566265,
"grad_norm": 0.16160622239112854,
"learning_rate": 4.807900950734388e-06,
"loss": 0.3957,
"step": 345
},
{
"epoch": 1.0210843373493976,
"grad_norm": 0.14416123926639557,
"learning_rate": 4.78390269995813e-06,
"loss": 0.2766,
"step": 346
},
{
"epoch": 1.0240963855421688,
"grad_norm": 0.10820020735263824,
"learning_rate": 4.759909435646636e-06,
"loss": 0.1951,
"step": 347
},
{
"epoch": 1.0271084337349397,
"grad_norm": 0.16407382488250732,
"learning_rate": 4.735921711446649e-06,
"loss": 0.278,
"step": 348
},
{
"epoch": 1.0301204819277108,
"grad_norm": 0.17282168567180634,
"learning_rate": 4.711940080877079e-06,
"loss": 0.3751,
"step": 349
},
{
"epoch": 1.033132530120482,
"grad_norm": 0.1287918984889984,
"learning_rate": 4.687965097316223e-06,
"loss": 0.2509,
"step": 350
},
{
"epoch": 1.036144578313253,
"grad_norm": 0.11395692825317383,
"learning_rate": 4.6639973139889944e-06,
"loss": 0.2757,
"step": 351
},
{
"epoch": 1.0391566265060241,
"grad_norm": 0.142571821808815,
"learning_rate": 4.640037283954165e-06,
"loss": 0.2886,
"step": 352
},
{
"epoch": 1.0421686746987953,
"grad_norm": 0.12164673209190369,
"learning_rate": 4.616085560091596e-06,
"loss": 0.2117,
"step": 353
},
{
"epoch": 1.0451807228915662,
"grad_norm": 0.15279339253902435,
"learning_rate": 4.592142695089489e-06,
"loss": 0.3379,
"step": 354
},
{
"epoch": 1.0481927710843373,
"grad_norm": 0.14959678053855896,
"learning_rate": 4.568209241431615e-06,
"loss": 0.3164,
"step": 355
},
{
"epoch": 1.0512048192771084,
"grad_norm": 0.13712869584560394,
"learning_rate": 4.544285751384585e-06,
"loss": 0.2908,
"step": 356
},
{
"epoch": 1.0542168674698795,
"grad_norm": 0.22040824592113495,
"learning_rate": 4.520372776985101e-06,
"loss": 0.5508,
"step": 357
},
{
"epoch": 1.0572289156626506,
"grad_norm": 0.18658672273159027,
"learning_rate": 4.496470870027209e-06,
"loss": 0.5689,
"step": 358
},
{
"epoch": 1.0602409638554218,
"grad_norm": 0.1375323086977005,
"learning_rate": 4.472580582049578e-06,
"loss": 0.3468,
"step": 359
},
{
"epoch": 1.0632530120481927,
"grad_norm": 0.14857372641563416,
"learning_rate": 4.448702464322764e-06,
"loss": 0.3003,
"step": 360
},
{
"epoch": 1.0662650602409638,
"grad_norm": 0.16661293804645538,
"learning_rate": 4.4248370678364995e-06,
"loss": 0.3879,
"step": 361
},
{
"epoch": 1.069277108433735,
"grad_norm": 0.19231560826301575,
"learning_rate": 4.400984943286965e-06,
"loss": 0.3818,
"step": 362
},
{
"epoch": 1.072289156626506,
"grad_norm": 0.149313285946846,
"learning_rate": 4.377146641064093e-06,
"loss": 0.4512,
"step": 363
},
{
"epoch": 1.0753012048192772,
"grad_norm": 0.12603868544101715,
"learning_rate": 4.3533227112388694e-06,
"loss": 0.2365,
"step": 364
},
{
"epoch": 1.0783132530120483,
"grad_norm": 0.2094918042421341,
"learning_rate": 4.329513703550628e-06,
"loss": 0.4479,
"step": 365
},
{
"epoch": 1.0813253012048192,
"grad_norm": 0.18627941608428955,
"learning_rate": 4.305720167394381e-06,
"loss": 0.463,
"step": 366
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.16777126491069794,
"learning_rate": 4.2819426518081265e-06,
"loss": 0.3307,
"step": 367
},
{
"epoch": 1.0873493975903614,
"grad_norm": 0.13657556474208832,
"learning_rate": 4.258181705460188e-06,
"loss": 0.2546,
"step": 368
},
{
"epoch": 1.0903614457831325,
"grad_norm": 0.13052892684936523,
"learning_rate": 4.234437876636557e-06,
"loss": 0.2653,
"step": 369
},
{
"epoch": 1.0933734939759037,
"grad_norm": 0.2776612341403961,
"learning_rate": 4.21071171322823e-06,
"loss": 0.4,
"step": 370
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.17850278317928314,
"learning_rate": 4.1870037627185785e-06,
"loss": 0.2607,
"step": 371
},
{
"epoch": 1.0993975903614457,
"grad_norm": 0.20225200057029724,
"learning_rate": 4.163314572170704e-06,
"loss": 0.4746,
"step": 372
},
{
"epoch": 1.1024096385542168,
"grad_norm": 0.22072741389274597,
"learning_rate": 4.139644688214827e-06,
"loss": 0.5159,
"step": 373
},
{
"epoch": 1.105421686746988,
"grad_norm": 0.16861790418624878,
"learning_rate": 4.115994657035659e-06,
"loss": 0.4456,
"step": 374
},
{
"epoch": 1.108433734939759,
"grad_norm": 0.227167546749115,
"learning_rate": 4.0923650243598104e-06,
"loss": 0.3914,
"step": 375
},
{
"epoch": 1.1114457831325302,
"grad_norm": 0.19060221314430237,
"learning_rate": 4.0687563354431986e-06,
"loss": 0.3424,
"step": 376
},
{
"epoch": 1.1144578313253013,
"grad_norm": 0.17396700382232666,
"learning_rate": 4.045169135058452e-06,
"loss": 0.4925,
"step": 377
},
{
"epoch": 1.1174698795180722,
"grad_norm": 0.17985737323760986,
"learning_rate": 4.021603967482361e-06,
"loss": 0.256,
"step": 378
},
{
"epoch": 1.1204819277108433,
"grad_norm": 0.15050223469734192,
"learning_rate": 3.998061376483298e-06,
"loss": 0.2767,
"step": 379
},
{
"epoch": 1.1234939759036144,
"grad_norm": 0.14838173985481262,
"learning_rate": 3.974541905308679e-06,
"loss": 0.3555,
"step": 380
},
{
"epoch": 1.1265060240963856,
"grad_norm": 0.16787950694561005,
"learning_rate": 3.951046096672434e-06,
"loss": 0.235,
"step": 381
},
{
"epoch": 1.1295180722891567,
"grad_norm": 0.15755508840084076,
"learning_rate": 3.927574492742473e-06,
"loss": 0.3667,
"step": 382
},
{
"epoch": 1.1325301204819278,
"grad_norm": 0.20158375799655914,
"learning_rate": 3.904127635128184e-06,
"loss": 0.4655,
"step": 383
},
{
"epoch": 1.1355421686746987,
"grad_norm": 0.17723795771598816,
"learning_rate": 3.880706064867927e-06,
"loss": 0.4452,
"step": 384
},
{
"epoch": 1.1385542168674698,
"grad_norm": 0.19107376039028168,
"learning_rate": 3.857310322416555e-06,
"loss": 0.3636,
"step": 385
},
{
"epoch": 1.141566265060241,
"grad_norm": 0.17358338832855225,
"learning_rate": 3.833940947632947e-06,
"loss": 0.2636,
"step": 386
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.18460644781589508,
"learning_rate": 3.8105984797675364e-06,
"loss": 0.4849,
"step": 387
},
{
"epoch": 1.1475903614457832,
"grad_norm": 0.15454675257205963,
"learning_rate": 3.7872834574498894e-06,
"loss": 0.3643,
"step": 388
},
{
"epoch": 1.1506024096385543,
"grad_norm": 0.16667881608009338,
"learning_rate": 3.7639964186762506e-06,
"loss": 0.4057,
"step": 389
},
{
"epoch": 1.1536144578313252,
"grad_norm": 0.12590649724006653,
"learning_rate": 3.740737900797151e-06,
"loss": 0.1882,
"step": 390
},
{
"epoch": 1.1566265060240963,
"grad_norm": 0.17524027824401855,
"learning_rate": 3.7175084405049978e-06,
"loss": 0.3662,
"step": 391
},
{
"epoch": 1.1596385542168675,
"grad_norm": 0.1579742133617401,
"learning_rate": 3.6943085738216855e-06,
"loss": 0.3395,
"step": 392
},
{
"epoch": 1.1626506024096386,
"grad_norm": 0.23439815640449524,
"learning_rate": 3.6711388360862417e-06,
"loss": 0.4181,
"step": 393
},
{
"epoch": 1.1656626506024097,
"grad_norm": 0.1944400668144226,
"learning_rate": 3.6479997619424605e-06,
"loss": 0.3925,
"step": 394
},
{
"epoch": 1.1686746987951806,
"grad_norm": 0.16743271052837372,
"learning_rate": 3.6248918853265756e-06,
"loss": 0.3325,
"step": 395
},
{
"epoch": 1.1716867469879517,
"grad_norm": 0.20892484486103058,
"learning_rate": 3.6018157394549287e-06,
"loss": 0.4283,
"step": 396
},
{
"epoch": 1.1746987951807228,
"grad_norm": 0.15310931205749512,
"learning_rate": 3.5787718568116764e-06,
"loss": 0.3372,
"step": 397
},
{
"epoch": 1.177710843373494,
"grad_norm": 0.16459712386131287,
"learning_rate": 3.5557607691364983e-06,
"loss": 0.4035,
"step": 398
},
{
"epoch": 1.180722891566265,
"grad_norm": 0.160426527261734,
"learning_rate": 3.5327830074123214e-06,
"loss": 0.39,
"step": 399
},
{
"epoch": 1.1837349397590362,
"grad_norm": 0.16313065588474274,
"learning_rate": 3.509839101853082e-06,
"loss": 0.3013,
"step": 400
},
{
"epoch": 1.1867469879518073,
"grad_norm": 0.14337725937366486,
"learning_rate": 3.486929581891476e-06,
"loss": 0.2797,
"step": 401
},
{
"epoch": 1.1897590361445782,
"grad_norm": 0.11817906051874161,
"learning_rate": 3.464054976166753e-06,
"loss": 0.2863,
"step": 402
},
{
"epoch": 1.1927710843373494,
"grad_norm": 0.2168179452419281,
"learning_rate": 3.441215812512508e-06,
"loss": 0.5059,
"step": 403
},
{
"epoch": 1.1957831325301205,
"grad_norm": 0.14408406615257263,
"learning_rate": 3.41841261794451e-06,
"loss": 0.2295,
"step": 404
},
{
"epoch": 1.1987951807228916,
"grad_norm": 0.14556995034217834,
"learning_rate": 3.3956459186485414e-06,
"loss": 0.3999,
"step": 405
},
{
"epoch": 1.2018072289156627,
"grad_norm": 0.22519560158252716,
"learning_rate": 3.372916239968246e-06,
"loss": 0.5036,
"step": 406
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.17916397750377655,
"learning_rate": 3.3502241063930196e-06,
"loss": 0.4047,
"step": 407
},
{
"epoch": 1.2078313253012047,
"grad_norm": 0.20149105787277222,
"learning_rate": 3.327570041545897e-06,
"loss": 0.3828,
"step": 408
},
{
"epoch": 1.2108433734939759,
"grad_norm": 0.2073061168193817,
"learning_rate": 3.304954568171478e-06,
"loss": 0.4249,
"step": 409
},
{
"epoch": 1.213855421686747,
"grad_norm": 0.16353848576545715,
"learning_rate": 3.282378208123856e-06,
"loss": 0.3528,
"step": 410
},
{
"epoch": 1.216867469879518,
"grad_norm": 0.1294896900653839,
"learning_rate": 3.259841482354582e-06,
"loss": 0.229,
"step": 411
},
{
"epoch": 1.2198795180722892,
"grad_norm": 0.1823435127735138,
"learning_rate": 3.2373449109006476e-06,
"loss": 0.2962,
"step": 412
},
{
"epoch": 1.2228915662650603,
"grad_norm": 0.1962028592824936,
"learning_rate": 3.21488901287247e-06,
"loss": 0.3654,
"step": 413
},
{
"epoch": 1.2259036144578312,
"grad_norm": 0.18562830984592438,
"learning_rate": 3.192474306441936e-06,
"loss": 0.5603,
"step": 414
},
{
"epoch": 1.2289156626506024,
"grad_norm": 0.19218891859054565,
"learning_rate": 3.170101308830421e-06,
"loss": 0.5087,
"step": 415
},
{
"epoch": 1.2289156626506024,
"eval_loss": 0.36883148550987244,
"eval_runtime": 48.0542,
"eval_samples_per_second": 7.658,
"eval_steps_per_second": 0.957,
"step": 415
},
{
"epoch": 1.2319277108433735,
"grad_norm": 0.15993016958236694,
"learning_rate": 3.1477705362968702e-06,
"loss": 0.3727,
"step": 416
},
{
"epoch": 1.2349397590361446,
"grad_norm": 0.1381690949201584,
"learning_rate": 3.1254825041258852e-06,
"loss": 0.2371,
"step": 417
},
{
"epoch": 1.2379518072289157,
"grad_norm": 0.14461062848567963,
"learning_rate": 3.103237726615822e-06,
"loss": 0.3255,
"step": 418
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.12627124786376953,
"learning_rate": 3.081036717066938e-06,
"loss": 0.2406,
"step": 419
},
{
"epoch": 1.2439759036144578,
"grad_norm": 0.170866921544075,
"learning_rate": 3.0588799877695375e-06,
"loss": 0.3002,
"step": 420
},
{
"epoch": 1.2469879518072289,
"grad_norm": 0.1740315705537796,
"learning_rate": 3.036768049992157e-06,
"loss": 0.4274,
"step": 421
},
{
"epoch": 1.25,
"grad_norm": 0.19806380569934845,
"learning_rate": 3.0147014139697596e-06,
"loss": 0.4143,
"step": 422
},
{
"epoch": 1.2530120481927711,
"grad_norm": 0.22980116307735443,
"learning_rate": 2.99268058889197e-06,
"loss": 0.2678,
"step": 423
},
{
"epoch": 1.2560240963855422,
"grad_norm": 0.21285657584667206,
"learning_rate": 2.9707060828913226e-06,
"loss": 0.4425,
"step": 424
},
{
"epoch": 1.2590361445783134,
"grad_norm": 0.15227141976356506,
"learning_rate": 2.9487784030315297e-06,
"loss": 0.2836,
"step": 425
},
{
"epoch": 1.2620481927710843,
"grad_norm": 0.1427774429321289,
"learning_rate": 2.9268980552957917e-06,
"loss": 0.3114,
"step": 426
},
{
"epoch": 1.2650602409638554,
"grad_norm": 0.1547195166349411,
"learning_rate": 2.905065544575114e-06,
"loss": 0.3177,
"step": 427
},
{
"epoch": 1.2680722891566265,
"grad_norm": 0.15522047877311707,
"learning_rate": 2.8832813746566546e-06,
"loss": 0.3496,
"step": 428
},
{
"epoch": 1.2710843373493976,
"grad_norm": 0.18824540078639984,
"learning_rate": 2.86154604821211e-06,
"loss": 0.3926,
"step": 429
},
{
"epoch": 1.2740963855421688,
"grad_norm": 0.13229520618915558,
"learning_rate": 2.8398600667861032e-06,
"loss": 0.2684,
"step": 430
},
{
"epoch": 1.2771084337349397,
"grad_norm": 0.17028671503067017,
"learning_rate": 2.8182239307846195e-06,
"loss": 0.4819,
"step": 431
},
{
"epoch": 1.2801204819277108,
"grad_norm": 0.30059295892715454,
"learning_rate": 2.796638139463456e-06,
"loss": 0.417,
"step": 432
},
{
"epoch": 1.283132530120482,
"grad_norm": 0.18978437781333923,
"learning_rate": 2.7751031909167046e-06,
"loss": 0.4118,
"step": 433
},
{
"epoch": 1.286144578313253,
"grad_norm": 0.1798931062221527,
"learning_rate": 2.7536195820652506e-06,
"loss": 0.3801,
"step": 434
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.19128261506557465,
"learning_rate": 2.73218780864531e-06,
"loss": 0.3277,
"step": 435
},
{
"epoch": 1.2921686746987953,
"grad_norm": 0.22201856970787048,
"learning_rate": 2.710808365197e-06,
"loss": 0.3294,
"step": 436
},
{
"epoch": 1.2951807228915664,
"grad_norm": 0.2222212851047516,
"learning_rate": 2.689481745052908e-06,
"loss": 0.4099,
"step": 437
},
{
"epoch": 1.2981927710843373,
"grad_norm": 0.11937326937913895,
"learning_rate": 2.6682084403267305e-06,
"loss": 0.1828,
"step": 438
},
{
"epoch": 1.3012048192771084,
"grad_norm": 0.25063556432724,
"learning_rate": 2.6469889419018985e-06,
"loss": 0.3695,
"step": 439
},
{
"epoch": 1.3042168674698795,
"grad_norm": 0.15843240916728973,
"learning_rate": 2.6258237394202556e-06,
"loss": 0.3532,
"step": 440
},
{
"epoch": 1.3072289156626506,
"grad_norm": 0.16287773847579956,
"learning_rate": 2.60471332127077e-06,
"loss": 0.4453,
"step": 441
},
{
"epoch": 1.3102409638554218,
"grad_norm": 0.2112409770488739,
"learning_rate": 2.5836581745782474e-06,
"loss": 0.4007,
"step": 442
},
{
"epoch": 1.3132530120481927,
"grad_norm": 0.19147521257400513,
"learning_rate": 2.5626587851921053e-06,
"loss": 0.3765,
"step": 443
},
{
"epoch": 1.3162650602409638,
"grad_norm": 0.1624053716659546,
"learning_rate": 2.541715637675156e-06,
"loss": 0.3615,
"step": 444
},
{
"epoch": 1.319277108433735,
"grad_norm": 0.1541513204574585,
"learning_rate": 2.520829215292426e-06,
"loss": 0.3372,
"step": 445
},
{
"epoch": 1.322289156626506,
"grad_norm": 0.21061226725578308,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.5027,
"step": 446
},
{
"epoch": 1.3253012048192772,
"grad_norm": 0.19752328097820282,
"learning_rate": 2.4792284724339077e-06,
"loss": 0.2708,
"step": 447
},
{
"epoch": 1.3283132530120483,
"grad_norm": 0.15623369812965393,
"learning_rate": 2.4585151118990286e-06,
"loss": 0.364,
"step": 448
},
{
"epoch": 1.3313253012048194,
"grad_norm": 0.15290258824825287,
"learning_rate": 2.4378603963580293e-06,
"loss": 0.2702,
"step": 449
},
{
"epoch": 1.3343373493975903,
"grad_norm": 0.15025874972343445,
"learning_rate": 2.417264802420343e-06,
"loss": 0.3501,
"step": 450
},
{
"epoch": 1.3373493975903614,
"grad_norm": 0.16519276797771454,
"learning_rate": 2.396728805331167e-06,
"loss": 0.2809,
"step": 451
},
{
"epoch": 1.3403614457831325,
"grad_norm": 0.17497073113918304,
"learning_rate": 2.3762528789604887e-06,
"loss": 0.4417,
"step": 452
},
{
"epoch": 1.3433734939759037,
"grad_norm": 0.25110098719596863,
"learning_rate": 2.3558374957921678e-06,
"loss": 0.3327,
"step": 453
},
{
"epoch": 1.3463855421686746,
"grad_norm": 0.24098452925682068,
"learning_rate": 2.3354831269130133e-06,
"loss": 0.3,
"step": 454
},
{
"epoch": 1.3493975903614457,
"grad_norm": 0.17397959530353546,
"learning_rate": 2.3151902420019357e-06,
"loss": 0.268,
"step": 455
},
{
"epoch": 1.3524096385542168,
"grad_norm": 0.14491979777812958,
"learning_rate": 2.2949593093190863e-06,
"loss": 0.3696,
"step": 456
},
{
"epoch": 1.355421686746988,
"grad_norm": 0.15514716506004333,
"learning_rate": 2.274790795695071e-06,
"loss": 0.2586,
"step": 457
},
{
"epoch": 1.358433734939759,
"grad_norm": 0.23854245245456696,
"learning_rate": 2.2546851665201692e-06,
"loss": 0.3591,
"step": 458
},
{
"epoch": 1.3614457831325302,
"grad_norm": 0.2554926872253418,
"learning_rate": 2.2346428857335904e-06,
"loss": 0.4179,
"step": 459
},
{
"epoch": 1.3644578313253013,
"grad_norm": 0.14370888471603394,
"learning_rate": 2.2146644158127827e-06,
"loss": 0.2502,
"step": 460
},
{
"epoch": 1.3674698795180724,
"grad_norm": 0.1868659108877182,
"learning_rate": 2.1947502177627437e-06,
"loss": 0.3848,
"step": 461
},
{
"epoch": 1.3704819277108433,
"grad_norm": 0.1326580047607422,
"learning_rate": 2.1749007511054005e-06,
"loss": 0.2471,
"step": 462
},
{
"epoch": 1.3734939759036144,
"grad_norm": 0.13474349677562714,
"learning_rate": 2.1551164738689896e-06,
"loss": 0.275,
"step": 463
},
{
"epoch": 1.3765060240963856,
"grad_norm": 0.1436898559331894,
"learning_rate": 2.1353978425775006e-06,
"loss": 0.2391,
"step": 464
},
{
"epoch": 1.3795180722891567,
"grad_norm": 0.15353891253471375,
"learning_rate": 2.1157453122401385e-06,
"loss": 0.2646,
"step": 465
},
{
"epoch": 1.3825301204819276,
"grad_norm": 0.18674910068511963,
"learning_rate": 2.0961593363408154e-06,
"loss": 0.3587,
"step": 466
},
{
"epoch": 1.3855421686746987,
"grad_norm": 0.20069798827171326,
"learning_rate": 2.076640366827703e-06,
"loss": 0.337,
"step": 467
},
{
"epoch": 1.3885542168674698,
"grad_norm": 0.20228977501392365,
"learning_rate": 2.0571888541027857e-06,
"loss": 0.356,
"step": 468
},
{
"epoch": 1.391566265060241,
"grad_norm": 0.18577708303928375,
"learning_rate": 2.0378052470114822e-06,
"loss": 0.3807,
"step": 469
},
{
"epoch": 1.394578313253012,
"grad_norm": 0.2242514193058014,
"learning_rate": 2.018489992832283e-06,
"loss": 0.3577,
"step": 470
},
{
"epoch": 1.3975903614457832,
"grad_norm": 0.16116055846214294,
"learning_rate": 1.999243537266424e-06,
"loss": 0.4194,
"step": 471
},
{
"epoch": 1.4006024096385543,
"grad_norm": 0.19547365605831146,
"learning_rate": 1.980066324427613e-06,
"loss": 0.4161,
"step": 472
},
{
"epoch": 1.4036144578313254,
"grad_norm": 0.13572612404823303,
"learning_rate": 1.960958796831769e-06,
"loss": 0.2878,
"step": 473
},
{
"epoch": 1.4066265060240963,
"grad_norm": 0.1661803424358368,
"learning_rate": 1.9419213953868236e-06,
"loss": 0.3692,
"step": 474
},
{
"epoch": 1.4096385542168675,
"grad_norm": 0.1443498730659485,
"learning_rate": 1.9229545593825367e-06,
"loss": 0.2833,
"step": 475
},
{
"epoch": 1.4126506024096386,
"grad_norm": 0.1970563381910324,
"learning_rate": 1.9040587264803673e-06,
"loss": 0.3256,
"step": 476
},
{
"epoch": 1.4156626506024097,
"grad_norm": 0.15597006678581238,
"learning_rate": 1.8852343327033717e-06,
"loss": 0.2241,
"step": 477
},
{
"epoch": 1.4186746987951806,
"grad_norm": 0.1828545331954956,
"learning_rate": 1.8664818124261375e-06,
"loss": 0.3341,
"step": 478
},
{
"epoch": 1.4216867469879517,
"grad_norm": 0.23139140009880066,
"learning_rate": 1.8478015983647718e-06,
"loss": 0.4646,
"step": 479
},
{
"epoch": 1.4246987951807228,
"grad_norm": 0.2196948230266571,
"learning_rate": 1.8291941215669024e-06,
"loss": 0.4086,
"step": 480
},
{
"epoch": 1.427710843373494,
"grad_norm": 0.25731542706489563,
"learning_rate": 1.8106598114017398e-06,
"loss": 0.3933,
"step": 481
},
{
"epoch": 1.430722891566265,
"grad_norm": 0.23287609219551086,
"learning_rate": 1.7921990955501705e-06,
"loss": 0.4715,
"step": 482
},
{
"epoch": 1.4337349397590362,
"grad_norm": 0.18525956571102142,
"learning_rate": 1.7738123999948853e-06,
"loss": 0.2407,
"step": 483
},
{
"epoch": 1.4367469879518073,
"grad_norm": 0.16159279644489288,
"learning_rate": 1.755500149010549e-06,
"loss": 0.3681,
"step": 484
},
{
"epoch": 1.4397590361445782,
"grad_norm": 0.1668546348810196,
"learning_rate": 1.737262765154008e-06,
"loss": 0.3868,
"step": 485
},
{
"epoch": 1.4427710843373494,
"grad_norm": 0.15711194276809692,
"learning_rate": 1.7191006692545493e-06,
"loss": 0.3467,
"step": 486
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.16672903299331665,
"learning_rate": 1.7010142804041785e-06,
"loss": 0.3285,
"step": 487
},
{
"epoch": 1.4487951807228916,
"grad_norm": 0.11567605286836624,
"learning_rate": 1.6830040159479521e-06,
"loss": 0.1985,
"step": 488
},
{
"epoch": 1.4518072289156627,
"grad_norm": 0.13821400701999664,
"learning_rate": 1.66507029147436e-06,
"loss": 0.2703,
"step": 489
},
{
"epoch": 1.4548192771084336,
"grad_norm": 0.1859131008386612,
"learning_rate": 1.6472135208057128e-06,
"loss": 0.3096,
"step": 490
},
{
"epoch": 1.4578313253012047,
"grad_norm": 0.20247076451778412,
"learning_rate": 1.629434115988614e-06,
"loss": 0.3022,
"step": 491
},
{
"epoch": 1.4608433734939759,
"grad_norm": 0.20361284911632538,
"learning_rate": 1.611732487284437e-06,
"loss": 0.4198,
"step": 492
},
{
"epoch": 1.463855421686747,
"grad_norm": 0.18198885023593903,
"learning_rate": 1.5941090431598654e-06,
"loss": 0.3556,
"step": 493
},
{
"epoch": 1.466867469879518,
"grad_norm": 0.19117406010627747,
"learning_rate": 1.5765641902774704e-06,
"loss": 0.3378,
"step": 494
},
{
"epoch": 1.4698795180722892,
"grad_norm": 0.15239785611629486,
"learning_rate": 1.5590983334863191e-06,
"loss": 0.2982,
"step": 495
},
{
"epoch": 1.4728915662650603,
"grad_norm": 0.18119259178638458,
"learning_rate": 1.5417118758126408e-06,
"loss": 0.3394,
"step": 496
},
{
"epoch": 1.4759036144578312,
"grad_norm": 0.22491803765296936,
"learning_rate": 1.524405218450517e-06,
"loss": 0.2992,
"step": 497
},
{
"epoch": 1.4789156626506024,
"grad_norm": 0.15981170535087585,
"learning_rate": 1.5071787607526366e-06,
"loss": 0.3029,
"step": 498
},
{
"epoch": 1.4789156626506024,
"eval_loss": 0.36260896921157837,
"eval_runtime": 127.8777,
"eval_samples_per_second": 2.878,
"eval_steps_per_second": 0.36,
"step": 498
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.21199637651443481,
"learning_rate": 1.4900329002210684e-06,
"loss": 0.3791,
"step": 499
},
{
"epoch": 1.4849397590361446,
"grad_norm": 0.2165050506591797,
"learning_rate": 1.472968032498095e-06,
"loss": 0.4786,
"step": 500
},
{
"epoch": 1.4879518072289157,
"grad_norm": 0.18277530372142792,
"learning_rate": 1.4559845513570859e-06,
"loss": 0.3564,
"step": 501
},
{
"epoch": 1.4909638554216866,
"grad_norm": 0.1746102273464203,
"learning_rate": 1.439082848693406e-06,
"loss": 0.4017,
"step": 502
},
{
"epoch": 1.4939759036144578,
"grad_norm": 0.14120972156524658,
"learning_rate": 1.4222633145153758e-06,
"loss": 0.2722,
"step": 503
},
{
"epoch": 1.4969879518072289,
"grad_norm": 0.18486057221889496,
"learning_rate": 1.4055263369352673e-06,
"loss": 0.3194,
"step": 504
},
{
"epoch": 1.5,
"grad_norm": 0.16988879442214966,
"learning_rate": 1.388872302160353e-06,
"loss": 0.2568,
"step": 505
},
{
"epoch": 1.5030120481927711,
"grad_norm": 0.18600550293922424,
"learning_rate": 1.3723015944839947e-06,
"loss": 0.3502,
"step": 506
},
{
"epoch": 1.5060240963855422,
"grad_norm": 0.15733253955841064,
"learning_rate": 1.35581459627677e-06,
"loss": 0.2692,
"step": 507
},
{
"epoch": 1.5090361445783134,
"grad_norm": 0.15347357094287872,
"learning_rate": 1.339411687977657e-06,
"loss": 0.2666,
"step": 508
},
{
"epoch": 1.5120481927710845,
"grad_norm": 0.32168644666671753,
"learning_rate": 1.3230932480852487e-06,
"loss": 0.2397,
"step": 509
},
{
"epoch": 1.5150602409638554,
"grad_norm": 0.17254099249839783,
"learning_rate": 1.3068596531490253e-06,
"loss": 0.2868,
"step": 510
},
{
"epoch": 1.5180722891566265,
"grad_norm": 0.20810185372829437,
"learning_rate": 1.290711277760658e-06,
"loss": 0.3383,
"step": 511
},
{
"epoch": 1.5210843373493976,
"grad_norm": 0.18230295181274414,
"learning_rate": 1.2746484945453691e-06,
"loss": 0.3909,
"step": 512
},
{
"epoch": 1.5240963855421685,
"grad_norm": 0.22496020793914795,
"learning_rate": 1.2586716741533389e-06,
"loss": 0.4322,
"step": 513
},
{
"epoch": 1.5271084337349397,
"grad_norm": 0.1773802936077118,
"learning_rate": 1.2427811852511396e-06,
"loss": 0.3379,
"step": 514
},
{
"epoch": 1.5301204819277108,
"grad_norm": 0.22131404280662537,
"learning_rate": 1.226977394513247e-06,
"loss": 0.5291,
"step": 515
},
{
"epoch": 1.533132530120482,
"grad_norm": 0.13072459399700165,
"learning_rate": 1.2112606666135602e-06,
"loss": 0.2691,
"step": 516
},
{
"epoch": 1.536144578313253,
"grad_norm": 0.16776853799819946,
"learning_rate": 1.1956313642169974e-06,
"loss": 0.3923,
"step": 517
},
{
"epoch": 1.5391566265060241,
"grad_norm": 0.21626092493534088,
"learning_rate": 1.1800898479711293e-06,
"loss": 0.383,
"step": 518
},
{
"epoch": 1.5421686746987953,
"grad_norm": 0.1677713394165039,
"learning_rate": 1.1646364764978468e-06,
"loss": 0.2859,
"step": 519
},
{
"epoch": 1.5451807228915664,
"grad_norm": 0.15191853046417236,
"learning_rate": 1.1492716063850973e-06,
"loss": 0.3017,
"step": 520
},
{
"epoch": 1.5481927710843375,
"grad_norm": 0.17901447415351868,
"learning_rate": 1.1339955921786504e-06,
"loss": 0.3612,
"step": 521
},
{
"epoch": 1.5512048192771084,
"grad_norm": 0.14615051448345184,
"learning_rate": 1.1188087863739173e-06,
"loss": 0.2722,
"step": 522
},
{
"epoch": 1.5542168674698795,
"grad_norm": 0.1234322264790535,
"learning_rate": 1.1037115394078162e-06,
"loss": 0.1978,
"step": 523
},
{
"epoch": 1.5572289156626506,
"grad_norm": 0.16058236360549927,
"learning_rate": 1.0887041996506858e-06,
"loss": 0.1943,
"step": 524
},
{
"epoch": 1.5602409638554215,
"grad_norm": 0.19182521104812622,
"learning_rate": 1.0737871133982524e-06,
"loss": 0.3489,
"step": 525
},
{
"epoch": 1.5632530120481927,
"grad_norm": 0.1292957067489624,
"learning_rate": 1.0589606248636291e-06,
"loss": 0.1982,
"step": 526
},
{
"epoch": 1.5662650602409638,
"grad_norm": 0.17571797966957092,
"learning_rate": 1.0442250761693829e-06,
"loss": 0.3113,
"step": 527
},
{
"epoch": 1.569277108433735,
"grad_norm": 0.1693243682384491,
"learning_rate": 1.0295808073396352e-06,
"loss": 0.3196,
"step": 528
},
{
"epoch": 1.572289156626506,
"grad_norm": 0.18643434345722198,
"learning_rate": 1.015028156292212e-06,
"loss": 0.3751,
"step": 529
},
{
"epoch": 1.5753012048192772,
"grad_norm": 0.1578662097454071,
"learning_rate": 1.0005674588308566e-06,
"loss": 0.4102,
"step": 530
},
{
"epoch": 1.5783132530120483,
"grad_norm": 0.16692018508911133,
"learning_rate": 9.861990486374695e-07,
"loss": 0.3071,
"step": 531
},
{
"epoch": 1.5813253012048194,
"grad_norm": 0.14578799903392792,
"learning_rate": 9.719232572644189e-07,
"loss": 0.3543,
"step": 532
},
{
"epoch": 1.5843373493975905,
"grad_norm": 0.26533403992652893,
"learning_rate": 9.577404141268815e-07,
"loss": 0.4793,
"step": 533
},
{
"epoch": 1.5873493975903614,
"grad_norm": 0.18731725215911865,
"learning_rate": 9.436508464952471e-07,
"loss": 0.3111,
"step": 534
},
{
"epoch": 1.5903614457831325,
"grad_norm": 0.15878336131572723,
"learning_rate": 9.296548794875659e-07,
"loss": 0.3838,
"step": 535
},
{
"epoch": 1.5933734939759037,
"grad_norm": 0.14642632007598877,
"learning_rate": 9.157528360620416e-07,
"loss": 0.2276,
"step": 536
},
{
"epoch": 1.5963855421686746,
"grad_norm": 0.18535569310188293,
"learning_rate": 9.019450370095867e-07,
"loss": 0.3032,
"step": 537
},
{
"epoch": 1.5993975903614457,
"grad_norm": 0.12464189529418945,
"learning_rate": 8.882318009464124e-07,
"loss": 0.247,
"step": 538
},
{
"epoch": 1.6024096385542168,
"grad_norm": 0.2229868322610855,
"learning_rate": 8.74613444306684e-07,
"loss": 0.3365,
"step": 539
},
{
"epoch": 1.605421686746988,
"grad_norm": 0.16516928374767303,
"learning_rate": 8.61090281335214e-07,
"loss": 0.2678,
"step": 540
},
{
"epoch": 1.608433734939759,
"grad_norm": 0.18349656462669373,
"learning_rate": 8.476626240802099e-07,
"loss": 0.2536,
"step": 541
},
{
"epoch": 1.6114457831325302,
"grad_norm": 0.20605145394802094,
"learning_rate": 8.343307823860819e-07,
"loss": 0.4024,
"step": 542
},
{
"epoch": 1.6144578313253013,
"grad_norm": 0.16944725811481476,
"learning_rate": 8.210950638862813e-07,
"loss": 0.3163,
"step": 543
},
{
"epoch": 1.6174698795180724,
"grad_norm": 0.1750240921974182,
"learning_rate": 8.079557739962129e-07,
"loss": 0.3426,
"step": 544
},
{
"epoch": 1.6204819277108435,
"grad_norm": 0.15269635617733002,
"learning_rate": 7.949132159061784e-07,
"loss": 0.2996,
"step": 545
},
{
"epoch": 1.6234939759036144,
"grad_norm": 0.19370746612548828,
"learning_rate": 7.819676905743872e-07,
"loss": 0.3121,
"step": 546
},
{
"epoch": 1.6265060240963856,
"grad_norm": 0.2879127562046051,
"learning_rate": 7.691194967200099e-07,
"loss": 0.4256,
"step": 547
},
{
"epoch": 1.6295180722891565,
"grad_norm": 0.15857596695423126,
"learning_rate": 7.563689308162803e-07,
"loss": 0.2879,
"step": 548
},
{
"epoch": 1.6325301204819276,
"grad_norm": 0.25081801414489746,
"learning_rate": 7.43716287083664e-07,
"loss": 0.5172,
"step": 549
},
{
"epoch": 1.6355421686746987,
"grad_norm": 0.3084939122200012,
"learning_rate": 7.31161857483057e-07,
"loss": 0.3093,
"step": 550
},
{
"epoch": 1.6385542168674698,
"grad_norm": 0.21095550060272217,
"learning_rate": 7.187059317090622e-07,
"loss": 0.2975,
"step": 551
},
{
"epoch": 1.641566265060241,
"grad_norm": 0.14392945170402527,
"learning_rate": 7.063487971832922e-07,
"loss": 0.3632,
"step": 552
},
{
"epoch": 1.644578313253012,
"grad_norm": 0.1537638157606125,
"learning_rate": 6.940907390477458e-07,
"loss": 0.2379,
"step": 553
},
{
"epoch": 1.6475903614457832,
"grad_norm": 0.20836707949638367,
"learning_rate": 6.819320401582258e-07,
"loss": 0.384,
"step": 554
},
{
"epoch": 1.6506024096385543,
"grad_norm": 0.20172376930713654,
"learning_rate": 6.698729810778065e-07,
"loss": 0.3071,
"step": 555
},
{
"epoch": 1.6536144578313254,
"grad_norm": 0.18422287702560425,
"learning_rate": 6.579138400703716e-07,
"loss": 0.2829,
"step": 556
},
{
"epoch": 1.6566265060240963,
"grad_norm": 0.17231068015098572,
"learning_rate": 6.460548930941801e-07,
"loss": 0.3375,
"step": 557
},
{
"epoch": 1.6596385542168675,
"grad_norm": 0.1713942289352417,
"learning_rate": 6.342964137955071e-07,
"loss": 0.2688,
"step": 558
},
{
"epoch": 1.6626506024096386,
"grad_norm": 0.1761113405227661,
"learning_rate": 6.226386735023271e-07,
"loss": 0.243,
"step": 559
},
{
"epoch": 1.6656626506024095,
"grad_norm": 0.18430888652801514,
"learning_rate": 6.110819412180535e-07,
"loss": 0.2764,
"step": 560
},
{
"epoch": 1.6686746987951806,
"grad_norm": 0.20808419585227966,
"learning_rate": 5.99626483615331e-07,
"loss": 0.3522,
"step": 561
},
{
"epoch": 1.6716867469879517,
"grad_norm": 0.1984253227710724,
"learning_rate": 5.882725650298787e-07,
"loss": 0.3782,
"step": 562
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.1662902534008026,
"learning_rate": 5.770204474543978e-07,
"loss": 0.3391,
"step": 563
},
{
"epoch": 1.677710843373494,
"grad_norm": 0.18588629364967346,
"learning_rate": 5.658703905325186e-07,
"loss": 0.392,
"step": 564
},
{
"epoch": 1.680722891566265,
"grad_norm": 0.21193590760231018,
"learning_rate": 5.548226515528133e-07,
"loss": 0.3875,
"step": 565
},
{
"epoch": 1.6837349397590362,
"grad_norm": 0.156774640083313,
"learning_rate": 5.438774854428614e-07,
"loss": 0.3524,
"step": 566
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.1758192926645279,
"learning_rate": 5.330351447633603e-07,
"loss": 0.4382,
"step": 567
},
{
"epoch": 1.6897590361445785,
"grad_norm": 0.20016002655029297,
"learning_rate": 5.222958797023036e-07,
"loss": 0.3239,
"step": 568
},
{
"epoch": 1.6927710843373494,
"grad_norm": 0.16859550774097443,
"learning_rate": 5.11659938069205e-07,
"loss": 0.3539,
"step": 569
},
{
"epoch": 1.6957831325301205,
"grad_norm": 0.22717247903347015,
"learning_rate": 5.011275652893782e-07,
"loss": 0.3754,
"step": 570
},
{
"epoch": 1.6987951807228916,
"grad_norm": 0.17677092552185059,
"learning_rate": 4.906990043982813e-07,
"loss": 0.3214,
"step": 571
},
{
"epoch": 1.7018072289156625,
"grad_norm": 0.33064964413642883,
"learning_rate": 4.803744960358992e-07,
"loss": 0.4244,
"step": 572
},
{
"epoch": 1.7048192771084336,
"grad_norm": 0.14009463787078857,
"learning_rate": 4.701542784411994e-07,
"loss": 0.289,
"step": 573
},
{
"epoch": 1.7078313253012047,
"grad_norm": 0.20244641602039337,
"learning_rate": 4.6003858744662564e-07,
"loss": 0.4061,
"step": 574
},
{
"epoch": 1.7108433734939759,
"grad_norm": 0.14781823754310608,
"learning_rate": 4.500276564726652e-07,
"loss": 0.3398,
"step": 575
},
{
"epoch": 1.713855421686747,
"grad_norm": 0.16534654796123505,
"learning_rate": 4.401217165224564e-07,
"loss": 0.3205,
"step": 576
},
{
"epoch": 1.716867469879518,
"grad_norm": 0.2676438093185425,
"learning_rate": 4.3032099617645874e-07,
"loss": 0.4988,
"step": 577
},
{
"epoch": 1.7198795180722892,
"grad_norm": 0.20747099816799164,
"learning_rate": 4.2062572158718284e-07,
"loss": 0.3462,
"step": 578
},
{
"epoch": 1.7228915662650603,
"grad_norm": 0.20796717703342438,
"learning_rate": 4.1103611647396734e-07,
"loss": 0.3622,
"step": 579
},
{
"epoch": 1.7259036144578315,
"grad_norm": 0.1525891274213791,
"learning_rate": 4.0155240211781966e-07,
"loss": 0.3539,
"step": 580
},
{
"epoch": 1.7289156626506024,
"grad_norm": 0.1623997986316681,
"learning_rate": 3.921747973563056e-07,
"loss": 0.3895,
"step": 581
},
{
"epoch": 1.7289156626506024,
"eval_loss": 0.36007678508758545,
"eval_runtime": 79.1746,
"eval_samples_per_second": 4.648,
"eval_steps_per_second": 0.581,
"step": 581
},
{
"epoch": 1.7319277108433735,
"grad_norm": 0.20012828707695007,
"learning_rate": 3.829035185785035e-07,
"loss": 0.4253,
"step": 582
},
{
"epoch": 1.7349397590361446,
"grad_norm": 0.16124022006988525,
"learning_rate": 3.737387797200126e-07,
"loss": 0.2732,
"step": 583
},
{
"epoch": 1.7379518072289155,
"grad_norm": 0.19511763751506805,
"learning_rate": 3.646807922580098e-07,
"loss": 0.3539,
"step": 584
},
{
"epoch": 1.7409638554216866,
"grad_norm": 0.19982604682445526,
"learning_rate": 3.557297652063768e-07,
"loss": 0.3382,
"step": 585
},
{
"epoch": 1.7439759036144578,
"grad_norm": 0.15003900229930878,
"learning_rate": 3.4688590511087304e-07,
"loss": 0.3204,
"step": 586
},
{
"epoch": 1.7469879518072289,
"grad_norm": 0.1792009174823761,
"learning_rate": 3.3814941604437155e-07,
"loss": 0.2802,
"step": 587
},
{
"epoch": 1.75,
"grad_norm": 0.38043561577796936,
"learning_rate": 3.2952049960214785e-07,
"loss": 0.2556,
"step": 588
},
{
"epoch": 1.7530120481927711,
"grad_norm": 0.14868871867656708,
"learning_rate": 3.20999354897229e-07,
"loss": 0.3935,
"step": 589
},
{
"epoch": 1.7560240963855422,
"grad_norm": 0.10944348573684692,
"learning_rate": 3.1258617855580155e-07,
"loss": 0.146,
"step": 590
},
{
"epoch": 1.7590361445783134,
"grad_norm": 0.16262729465961456,
"learning_rate": 3.0428116471267146e-07,
"loss": 0.2532,
"step": 591
},
{
"epoch": 1.7620481927710845,
"grad_norm": 0.1523403823375702,
"learning_rate": 2.9608450500678566e-07,
"loss": 0.3212,
"step": 592
},
{
"epoch": 1.7650602409638554,
"grad_norm": 0.1569455862045288,
"learning_rate": 2.879963885768083e-07,
"loss": 0.291,
"step": 593
},
{
"epoch": 1.7680722891566265,
"grad_norm": 0.21074244379997253,
"learning_rate": 2.800170020567566e-07,
"loss": 0.294,
"step": 594
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.17102384567260742,
"learning_rate": 2.721465295716996e-07,
"loss": 0.269,
"step": 595
},
{
"epoch": 1.7740963855421685,
"grad_norm": 0.17632022500038147,
"learning_rate": 2.643851527335006e-07,
"loss": 0.3749,
"step": 596
},
{
"epoch": 1.7771084337349397,
"grad_norm": 0.15809699892997742,
"learning_rate": 2.5673305063663335e-07,
"loss": 0.3419,
"step": 597
},
{
"epoch": 1.7801204819277108,
"grad_norm": 0.16983191668987274,
"learning_rate": 2.4919039985404626e-07,
"loss": 0.2738,
"step": 598
},
{
"epoch": 1.783132530120482,
"grad_norm": 0.18493536114692688,
"learning_rate": 2.4175737443308976e-07,
"loss": 0.2491,
"step": 599
},
{
"epoch": 1.786144578313253,
"grad_norm": 0.1757514476776123,
"learning_rate": 2.3443414589149838e-07,
"loss": 0.3816,
"step": 600
},
{
"epoch": 1.7891566265060241,
"grad_norm": 0.16976556181907654,
"learning_rate": 2.272208832134326e-07,
"loss": 0.2902,
"step": 601
},
{
"epoch": 1.7921686746987953,
"grad_norm": 0.16954952478408813,
"learning_rate": 2.201177528455828e-07,
"loss": 0.2708,
"step": 602
},
{
"epoch": 1.7951807228915664,
"grad_norm": 0.15355940163135529,
"learning_rate": 2.131249186933243e-07,
"loss": 0.2943,
"step": 603
},
{
"epoch": 1.7981927710843375,
"grad_norm": 0.14185413718223572,
"learning_rate": 2.0624254211693894e-07,
"loss": 0.302,
"step": 604
},
{
"epoch": 1.8012048192771084,
"grad_norm": 0.1535930335521698,
"learning_rate": 1.994707819278896e-07,
"loss": 0.3399,
"step": 605
},
{
"epoch": 1.8042168674698795,
"grad_norm": 0.1895059496164322,
"learning_rate": 1.9280979438515479e-07,
"loss": 0.4095,
"step": 606
},
{
"epoch": 1.8072289156626506,
"grad_norm": 0.21343795955181122,
"learning_rate": 1.8625973319162605e-07,
"loss": 0.3596,
"step": 607
},
{
"epoch": 1.8102409638554215,
"grad_norm": 0.16654372215270996,
"learning_rate": 1.7982074949055794e-07,
"loss": 0.2931,
"step": 608
},
{
"epoch": 1.8132530120481927,
"grad_norm": 0.19126953184604645,
"learning_rate": 1.7349299186208258e-07,
"loss": 0.3195,
"step": 609
},
{
"epoch": 1.8162650602409638,
"grad_norm": 0.22678323090076447,
"learning_rate": 1.6727660631977894e-07,
"loss": 0.3572,
"step": 610
},
{
"epoch": 1.819277108433735,
"grad_norm": 0.15590119361877441,
"learning_rate": 1.6117173630730787e-07,
"loss": 0.3264,
"step": 611
},
{
"epoch": 1.822289156626506,
"grad_norm": 0.16644251346588135,
"learning_rate": 1.5517852269509692e-07,
"loss": 0.4044,
"step": 612
},
{
"epoch": 1.8253012048192772,
"grad_norm": 0.18721044063568115,
"learning_rate": 1.492971037770924e-07,
"loss": 0.3631,
"step": 613
},
{
"epoch": 1.8283132530120483,
"grad_norm": 0.2001548856496811,
"learning_rate": 1.435276152675691e-07,
"loss": 0.2765,
"step": 614
},
{
"epoch": 1.8313253012048194,
"grad_norm": 0.18621757626533508,
"learning_rate": 1.378701902979962e-07,
"loss": 0.4584,
"step": 615
},
{
"epoch": 1.8343373493975905,
"grad_norm": 0.2068716138601303,
"learning_rate": 1.323249594139664e-07,
"loss": 0.501,
"step": 616
},
{
"epoch": 1.8373493975903614,
"grad_norm": 0.1547180563211441,
"learning_rate": 1.2689205057218602e-07,
"loss": 0.3241,
"step": 617
},
{
"epoch": 1.8403614457831325,
"grad_norm": 0.1633985936641693,
"learning_rate": 1.2157158913751687e-07,
"loss": 0.3025,
"step": 618
},
{
"epoch": 1.8433734939759037,
"grad_norm": 0.18659210205078125,
"learning_rate": 1.1636369788008973e-07,
"loss": 0.3649,
"step": 619
},
{
"epoch": 1.8463855421686746,
"grad_norm": 0.19178517162799835,
"learning_rate": 1.1126849697246533e-07,
"loss": 0.2552,
"step": 620
},
{
"epoch": 1.8493975903614457,
"grad_norm": 0.18525345623493195,
"learning_rate": 1.0628610398686679e-07,
"loss": 0.3322,
"step": 621
},
{
"epoch": 1.8524096385542168,
"grad_norm": 0.1565900444984436,
"learning_rate": 1.014166338924627e-07,
"loss": 0.2165,
"step": 622
},
{
"epoch": 1.855421686746988,
"grad_norm": 0.1916026771068573,
"learning_rate": 9.666019905271662e-08,
"loss": 0.3924,
"step": 623
},
{
"epoch": 1.858433734939759,
"grad_norm": 0.1600860208272934,
"learning_rate": 9.201690922279405e-08,
"loss": 0.282,
"step": 624
},
{
"epoch": 1.8614457831325302,
"grad_norm": 0.23213285207748413,
"learning_rate": 8.748687154702673e-08,
"loss": 0.3577,
"step": 625
},
{
"epoch": 1.8644578313253013,
"grad_norm": 0.14008109271526337,
"learning_rate": 8.307019055644517e-08,
"loss": 0.2977,
"step": 626
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.148224338889122,
"learning_rate": 7.876696816636276e-08,
"loss": 0.2515,
"step": 627
},
{
"epoch": 1.8704819277108435,
"grad_norm": 0.17251333594322205,
"learning_rate": 7.45773036740255e-08,
"loss": 0.3083,
"step": 628
},
{
"epoch": 1.8734939759036144,
"grad_norm": 0.15862807631492615,
"learning_rate": 7.050129375632098e-08,
"loss": 0.3082,
"step": 629
},
{
"epoch": 1.8765060240963856,
"grad_norm": 0.14815159142017365,
"learning_rate": 6.65390324675469e-08,
"loss": 0.2304,
"step": 630
},
{
"epoch": 1.8795180722891565,
"grad_norm": 0.26052090525627136,
"learning_rate": 6.269061123724163e-08,
"loss": 0.3544,
"step": 631
},
{
"epoch": 1.8825301204819276,
"grad_norm": 0.19842372834682465,
"learning_rate": 5.895611886807317e-08,
"loss": 0.4025,
"step": 632
},
{
"epoch": 1.8855421686746987,
"grad_norm": 0.18247205018997192,
"learning_rate": 5.533564153379134e-08,
"loss": 0.3334,
"step": 633
},
{
"epoch": 1.8885542168674698,
"grad_norm": 0.14497226476669312,
"learning_rate": 5.182926277723821e-08,
"loss": 0.2822,
"step": 634
},
{
"epoch": 1.891566265060241,
"grad_norm": 0.1699623316526413,
"learning_rate": 4.843706350842081e-08,
"loss": 0.3483,
"step": 635
},
{
"epoch": 1.894578313253012,
"grad_norm": 0.37397855520248413,
"learning_rate": 4.515912200264427e-08,
"loss": 0.3952,
"step": 636
},
{
"epoch": 1.8975903614457832,
"grad_norm": 0.2163594514131546,
"learning_rate": 4.19955138987066e-08,
"loss": 0.2991,
"step": 637
},
{
"epoch": 1.9006024096385543,
"grad_norm": 0.13354437053203583,
"learning_rate": 3.894631219715006e-08,
"loss": 0.1968,
"step": 638
},
{
"epoch": 1.9036144578313254,
"grad_norm": 0.19678649306297302,
"learning_rate": 3.601158725858034e-08,
"loss": 0.3328,
"step": 639
},
{
"epoch": 1.9066265060240963,
"grad_norm": 0.19473896920681,
"learning_rate": 3.3191406802041693e-08,
"loss": 0.3913,
"step": 640
},
{
"epoch": 1.9096385542168675,
"grad_norm": 0.18177784979343414,
"learning_rate": 3.048583590345266e-08,
"loss": 0.293,
"step": 641
},
{
"epoch": 1.9126506024096386,
"grad_norm": 0.13523522019386292,
"learning_rate": 2.7894936994106724e-08,
"loss": 0.2142,
"step": 642
},
{
"epoch": 1.9156626506024095,
"grad_norm": 0.16804173588752747,
"learning_rate": 2.5418769859231194e-08,
"loss": 0.3141,
"step": 643
},
{
"epoch": 1.9186746987951806,
"grad_norm": 0.2132112681865692,
"learning_rate": 2.3057391636606698e-08,
"loss": 0.2868,
"step": 644
},
{
"epoch": 1.9216867469879517,
"grad_norm": 0.2761172950267792,
"learning_rate": 2.081085681524986e-08,
"loss": 0.3607,
"step": 645
},
{
"epoch": 1.9246987951807228,
"grad_norm": 0.22394540905952454,
"learning_rate": 1.8679217234154335e-08,
"loss": 0.3675,
"step": 646
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.15545201301574707,
"learning_rate": 1.6662522081097308e-08,
"loss": 0.2797,
"step": 647
},
{
"epoch": 1.930722891566265,
"grad_norm": 0.21229010820388794,
"learning_rate": 1.4760817891500966e-08,
"loss": 0.3739,
"step": 648
},
{
"epoch": 1.9337349397590362,
"grad_norm": 0.1342974752187729,
"learning_rate": 1.2974148547362231e-08,
"loss": 0.2724,
"step": 649
},
{
"epoch": 1.9367469879518073,
"grad_norm": 0.16407661139965057,
"learning_rate": 1.1302555276238581e-08,
"loss": 0.2649,
"step": 650
},
{
"epoch": 1.9397590361445785,
"grad_norm": 0.16028320789337158,
"learning_rate": 9.746076650294922e-09,
"loss": 0.3101,
"step": 651
},
{
"epoch": 1.9427710843373494,
"grad_norm": 0.160682812333107,
"learning_rate": 8.304748585417077e-09,
"loss": 0.2489,
"step": 652
},
{
"epoch": 1.9457831325301205,
"grad_norm": 0.16517463326454163,
"learning_rate": 6.978604340380779e-09,
"loss": 0.2734,
"step": 653
},
{
"epoch": 1.9487951807228916,
"grad_norm": 0.21058174967765808,
"learning_rate": 5.767674516083954e-09,
"loss": 0.3024,
"step": 654
},
{
"epoch": 1.9518072289156625,
"grad_norm": 0.13643121719360352,
"learning_rate": 4.671987054842842e-09,
"loss": 0.2566,
"step": 655
},
{
"epoch": 1.9548192771084336,
"grad_norm": 0.12848874926567078,
"learning_rate": 3.6915672397436208e-09,
"loss": 0.3298,
"step": 656
},
{
"epoch": 1.9578313253012047,
"grad_norm": 0.20304584503173828,
"learning_rate": 2.8264376940634332e-09,
"loss": 0.4233,
"step": 657
},
{
"epoch": 1.9608433734939759,
"grad_norm": 0.1930498331785202,
"learning_rate": 2.076618380744133e-09,
"loss": 0.3333,
"step": 658
},
{
"epoch": 1.963855421686747,
"grad_norm": 0.27355119585990906,
"learning_rate": 1.4421266019348789e-09,
"loss": 0.2296,
"step": 659
},
{
"epoch": 1.966867469879518,
"grad_norm": 0.13266071677207947,
"learning_rate": 9.229769985902304e-10,
"loss": 0.2616,
"step": 660
},
{
"epoch": 1.9698795180722892,
"grad_norm": 0.16650208830833435,
"learning_rate": 5.191815501343067e-10,
"loss": 0.2971,
"step": 661
},
{
"epoch": 1.9728915662650603,
"grad_norm": 0.1883377730846405,
"learning_rate": 2.307495741843413e-10,
"loss": 0.3542,
"step": 662
},
{
"epoch": 1.9759036144578315,
"grad_norm": 0.17213378846645355,
"learning_rate": 5.768772633363284e-11,
"loss": 0.2823,
"step": 663
},
{
"epoch": 1.9789156626506024,
"grad_norm": 0.16032862663269043,
"learning_rate": 0.0,
"loss": 0.3168,
"step": 664
},
{
"epoch": 1.9789156626506024,
"eval_loss": 0.35977768898010254,
"eval_runtime": 48.6413,
"eval_samples_per_second": 7.566,
"eval_steps_per_second": 0.946,
"step": 664
}
],
"logging_steps": 1,
"max_steps": 664,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 166,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6730319840173097e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}