inferno-math-stage1-ckpt1400 / trainer_state.json
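Below is the raw trainer_state.json for this checkpoint (global_step 1400, with an evaluation logged every 100 steps). As a minimal sketch of how the log might be inspected, assuming only that the file has been downloaded locally under its own name "trainer_state.json", the Python snippet here splits log_history into the per-step training entries and the periodic eval entries and prints the eval losses; every field name used is taken from the JSON that follows, nothing else is assumed.

import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training entries carry "loss"; the periodic entries written every
# eval_steps=100 carry "eval_loss" instead.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"training log points: {len(train_logs)}, eval points: {len(eval_logs)}")
print(f"checkpoint at global_step {state['global_step']}, epoch {state['epoch']:.4f}")
for e in eval_logs:
    print(f"step {e['step']:>5}  eval_loss {e['eval_loss']:.4f}  "
          f"{e['eval_samples_per_second']:.1f} samples/s")

The raw JSON follows.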
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9430784776018861,
"eval_steps": 100,
"global_step": 1400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006736274840013472,
"grad_norm": 3.42309308052063,
"learning_rate": 1.0067114093959731e-07,
"loss": 0.4257,
"step": 1
},
{
"epoch": 0.0013472549680026945,
"grad_norm": 3.701201915740967,
"learning_rate": 2.0134228187919462e-07,
"loss": 0.4285,
"step": 2
},
{
"epoch": 0.0020208824520040417,
"grad_norm": 4.045602321624756,
"learning_rate": 3.0201342281879193e-07,
"loss": 0.4232,
"step": 3
},
{
"epoch": 0.002694509936005389,
"grad_norm": 3.859919786453247,
"learning_rate": 4.0268456375838924e-07,
"loss": 0.4029,
"step": 4
},
{
"epoch": 0.003368137420006736,
"grad_norm": 4.171447277069092,
"learning_rate": 5.033557046979866e-07,
"loss": 0.4158,
"step": 5
},
{
"epoch": 0.0040417649040080834,
"grad_norm": 3.556626796722412,
"learning_rate": 6.040268456375839e-07,
"loss": 0.3945,
"step": 6
},
{
"epoch": 0.004715392388009431,
"grad_norm": 3.78082537651062,
"learning_rate": 7.046979865771813e-07,
"loss": 0.372,
"step": 7
},
{
"epoch": 0.005389019872010778,
"grad_norm": 3.459005355834961,
"learning_rate": 8.053691275167785e-07,
"loss": 0.392,
"step": 8
},
{
"epoch": 0.006062647356012125,
"grad_norm": 3.6338694095611572,
"learning_rate": 9.060402684563759e-07,
"loss": 0.4221,
"step": 9
},
{
"epoch": 0.006736274840013472,
"grad_norm": 3.6951706409454346,
"learning_rate": 1.006711409395973e-06,
"loss": 0.4504,
"step": 10
},
{
"epoch": 0.00740990232401482,
"grad_norm": 2.708463668823242,
"learning_rate": 1.1073825503355705e-06,
"loss": 0.3848,
"step": 11
},
{
"epoch": 0.008083529808016167,
"grad_norm": 2.9015040397644043,
"learning_rate": 1.2080536912751677e-06,
"loss": 0.3936,
"step": 12
},
{
"epoch": 0.008757157292017514,
"grad_norm": 3.133338212966919,
"learning_rate": 1.3087248322147651e-06,
"loss": 0.3844,
"step": 13
},
{
"epoch": 0.009430784776018861,
"grad_norm": 2.1770880222320557,
"learning_rate": 1.4093959731543626e-06,
"loss": 0.3474,
"step": 14
},
{
"epoch": 0.010104412260020209,
"grad_norm": 1.66121244430542,
"learning_rate": 1.5100671140939598e-06,
"loss": 0.3646,
"step": 15
},
{
"epoch": 0.010778039744021556,
"grad_norm": 1.725306510925293,
"learning_rate": 1.610738255033557e-06,
"loss": 0.3739,
"step": 16
},
{
"epoch": 0.011451667228022903,
"grad_norm": 1.5393097400665283,
"learning_rate": 1.7114093959731544e-06,
"loss": 0.2967,
"step": 17
},
{
"epoch": 0.01212529471202425,
"grad_norm": 1.6653029918670654,
"learning_rate": 1.8120805369127518e-06,
"loss": 0.3491,
"step": 18
},
{
"epoch": 0.012798922196025598,
"grad_norm": 1.4329285621643066,
"learning_rate": 1.912751677852349e-06,
"loss": 0.3438,
"step": 19
},
{
"epoch": 0.013472549680026945,
"grad_norm": 1.1590880155563354,
"learning_rate": 2.013422818791946e-06,
"loss": 0.2907,
"step": 20
},
{
"epoch": 0.014146177164028292,
"grad_norm": 1.4018336534500122,
"learning_rate": 2.1140939597315434e-06,
"loss": 0.3504,
"step": 21
},
{
"epoch": 0.01481980464802964,
"grad_norm": 1.201278805732727,
"learning_rate": 2.214765100671141e-06,
"loss": 0.3176,
"step": 22
},
{
"epoch": 0.015493432132030987,
"grad_norm": 1.14249849319458,
"learning_rate": 2.3154362416107382e-06,
"loss": 0.3079,
"step": 23
},
{
"epoch": 0.016167059616032334,
"grad_norm": 1.0337632894515991,
"learning_rate": 2.4161073825503354e-06,
"loss": 0.3039,
"step": 24
},
{
"epoch": 0.016840687100033683,
"grad_norm": 0.9944117665290833,
"learning_rate": 2.516778523489933e-06,
"loss": 0.297,
"step": 25
},
{
"epoch": 0.017514314584035028,
"grad_norm": 0.946663498878479,
"learning_rate": 2.6174496644295303e-06,
"loss": 0.3315,
"step": 26
},
{
"epoch": 0.018187942068036377,
"grad_norm": 1.056069254875183,
"learning_rate": 2.7181208053691275e-06,
"loss": 0.3274,
"step": 27
},
{
"epoch": 0.018861569552037723,
"grad_norm": 0.9784092903137207,
"learning_rate": 2.818791946308725e-06,
"loss": 0.3399,
"step": 28
},
{
"epoch": 0.01953519703603907,
"grad_norm": 1.07163667678833,
"learning_rate": 2.9194630872483223e-06,
"loss": 0.3361,
"step": 29
},
{
"epoch": 0.020208824520040417,
"grad_norm": 0.9870592951774597,
"learning_rate": 3.0201342281879195e-06,
"loss": 0.3026,
"step": 30
},
{
"epoch": 0.020882452004041766,
"grad_norm": 0.9180539846420288,
"learning_rate": 3.120805369127517e-06,
"loss": 0.2716,
"step": 31
},
{
"epoch": 0.02155607948804311,
"grad_norm": 0.8827613592147827,
"learning_rate": 3.221476510067114e-06,
"loss": 0.2623,
"step": 32
},
{
"epoch": 0.02222970697204446,
"grad_norm": 0.8390945196151733,
"learning_rate": 3.3221476510067116e-06,
"loss": 0.2792,
"step": 33
},
{
"epoch": 0.022903334456045806,
"grad_norm": 0.8577262163162231,
"learning_rate": 3.4228187919463088e-06,
"loss": 0.2906,
"step": 34
},
{
"epoch": 0.023576961940047155,
"grad_norm": 0.7939577102661133,
"learning_rate": 3.523489932885906e-06,
"loss": 0.2569,
"step": 35
},
{
"epoch": 0.0242505894240485,
"grad_norm": 0.8591914772987366,
"learning_rate": 3.6241610738255036e-06,
"loss": 0.3072,
"step": 36
},
{
"epoch": 0.02492421690804985,
"grad_norm": 0.8437011241912842,
"learning_rate": 3.724832214765101e-06,
"loss": 0.3002,
"step": 37
},
{
"epoch": 0.025597844392051195,
"grad_norm": 0.8370192646980286,
"learning_rate": 3.825503355704698e-06,
"loss": 0.2693,
"step": 38
},
{
"epoch": 0.026271471876052544,
"grad_norm": 0.7814688086509705,
"learning_rate": 3.926174496644295e-06,
"loss": 0.2816,
"step": 39
},
{
"epoch": 0.02694509936005389,
"grad_norm": 0.8348170518875122,
"learning_rate": 4.026845637583892e-06,
"loss": 0.251,
"step": 40
},
{
"epoch": 0.02761872684405524,
"grad_norm": 0.7987892627716064,
"learning_rate": 4.12751677852349e-06,
"loss": 0.2637,
"step": 41
},
{
"epoch": 0.028292354328056584,
"grad_norm": 0.8840119242668152,
"learning_rate": 4.228187919463087e-06,
"loss": 0.315,
"step": 42
},
{
"epoch": 0.028965981812057933,
"grad_norm": 0.7633718848228455,
"learning_rate": 4.328859060402685e-06,
"loss": 0.2988,
"step": 43
},
{
"epoch": 0.02963960929605928,
"grad_norm": 0.7476988434791565,
"learning_rate": 4.429530201342282e-06,
"loss": 0.2856,
"step": 44
},
{
"epoch": 0.030313236780060628,
"grad_norm": 0.7812101244926453,
"learning_rate": 4.530201342281879e-06,
"loss": 0.2622,
"step": 45
},
{
"epoch": 0.030986864264061973,
"grad_norm": 0.7842202186584473,
"learning_rate": 4.6308724832214765e-06,
"loss": 0.3366,
"step": 46
},
{
"epoch": 0.03166049174806332,
"grad_norm": 0.7462322115898132,
"learning_rate": 4.731543624161074e-06,
"loss": 0.2825,
"step": 47
},
{
"epoch": 0.03233411923206467,
"grad_norm": 0.7542632818222046,
"learning_rate": 4.832214765100671e-06,
"loss": 0.2549,
"step": 48
},
{
"epoch": 0.033007746716066017,
"grad_norm": 0.7200729250907898,
"learning_rate": 4.932885906040269e-06,
"loss": 0.2593,
"step": 49
},
{
"epoch": 0.033681374200067365,
"grad_norm": 0.8686407208442688,
"learning_rate": 5.033557046979866e-06,
"loss": 0.2828,
"step": 50
},
{
"epoch": 0.03435500168406871,
"grad_norm": 0.734254002571106,
"learning_rate": 5.134228187919463e-06,
"loss": 0.2841,
"step": 51
},
{
"epoch": 0.035028629168070056,
"grad_norm": 0.7483602166175842,
"learning_rate": 5.2348993288590606e-06,
"loss": 0.2956,
"step": 52
},
{
"epoch": 0.035702256652071405,
"grad_norm": 0.7722125053405762,
"learning_rate": 5.335570469798658e-06,
"loss": 0.2817,
"step": 53
},
{
"epoch": 0.036375884136072754,
"grad_norm": 0.7247833609580994,
"learning_rate": 5.436241610738255e-06,
"loss": 0.2589,
"step": 54
},
{
"epoch": 0.037049511620074096,
"grad_norm": 0.8258161544799805,
"learning_rate": 5.536912751677853e-06,
"loss": 0.2929,
"step": 55
},
{
"epoch": 0.037723139104075445,
"grad_norm": 0.784130334854126,
"learning_rate": 5.63758389261745e-06,
"loss": 0.2639,
"step": 56
},
{
"epoch": 0.038396766588076794,
"grad_norm": 0.8519976735115051,
"learning_rate": 5.738255033557047e-06,
"loss": 0.2611,
"step": 57
},
{
"epoch": 0.03907039407207814,
"grad_norm": 0.7617088556289673,
"learning_rate": 5.838926174496645e-06,
"loss": 0.3038,
"step": 58
},
{
"epoch": 0.039744021556079485,
"grad_norm": 0.7174592018127441,
"learning_rate": 5.939597315436242e-06,
"loss": 0.2451,
"step": 59
},
{
"epoch": 0.040417649040080834,
"grad_norm": 0.7933776378631592,
"learning_rate": 6.040268456375839e-06,
"loss": 0.2979,
"step": 60
},
{
"epoch": 0.04109127652408218,
"grad_norm": 0.7308351993560791,
"learning_rate": 6.140939597315437e-06,
"loss": 0.2547,
"step": 61
},
{
"epoch": 0.04176490400808353,
"grad_norm": 0.8782221674919128,
"learning_rate": 6.241610738255034e-06,
"loss": 0.2948,
"step": 62
},
{
"epoch": 0.042438531492084874,
"grad_norm": 0.7220450043678284,
"learning_rate": 6.342281879194631e-06,
"loss": 0.2397,
"step": 63
},
{
"epoch": 0.04311215897608622,
"grad_norm": 0.8042862415313721,
"learning_rate": 6.442953020134228e-06,
"loss": 0.2963,
"step": 64
},
{
"epoch": 0.04378578646008757,
"grad_norm": 0.6996918320655823,
"learning_rate": 6.543624161073825e-06,
"loss": 0.2542,
"step": 65
},
{
"epoch": 0.04445941394408892,
"grad_norm": 0.7606627941131592,
"learning_rate": 6.644295302013423e-06,
"loss": 0.285,
"step": 66
},
{
"epoch": 0.04513304142809026,
"grad_norm": 0.8591688275337219,
"learning_rate": 6.74496644295302e-06,
"loss": 0.2671,
"step": 67
},
{
"epoch": 0.04580666891209161,
"grad_norm": 0.8488709330558777,
"learning_rate": 6.8456375838926175e-06,
"loss": 0.2751,
"step": 68
},
{
"epoch": 0.04648029639609296,
"grad_norm": 0.7567676305770874,
"learning_rate": 6.946308724832215e-06,
"loss": 0.301,
"step": 69
},
{
"epoch": 0.04715392388009431,
"grad_norm": 0.7121560573577881,
"learning_rate": 7.046979865771812e-06,
"loss": 0.2557,
"step": 70
},
{
"epoch": 0.04782755136409565,
"grad_norm": 0.7666682600975037,
"learning_rate": 7.147651006711409e-06,
"loss": 0.2582,
"step": 71
},
{
"epoch": 0.048501178848097,
"grad_norm": 0.7414038181304932,
"learning_rate": 7.248322147651007e-06,
"loss": 0.262,
"step": 72
},
{
"epoch": 0.04917480633209835,
"grad_norm": 0.8357811570167542,
"learning_rate": 7.348993288590604e-06,
"loss": 0.2591,
"step": 73
},
{
"epoch": 0.0498484338160997,
"grad_norm": 0.7933549880981445,
"learning_rate": 7.449664429530202e-06,
"loss": 0.282,
"step": 74
},
{
"epoch": 0.05052206130010104,
"grad_norm": 0.7420201301574707,
"learning_rate": 7.5503355704698e-06,
"loss": 0.2469,
"step": 75
},
{
"epoch": 0.05119568878410239,
"grad_norm": 0.7670828104019165,
"learning_rate": 7.651006711409396e-06,
"loss": 0.295,
"step": 76
},
{
"epoch": 0.05186931626810374,
"grad_norm": 0.722752571105957,
"learning_rate": 7.751677852348993e-06,
"loss": 0.2301,
"step": 77
},
{
"epoch": 0.05254294375210509,
"grad_norm": 0.7430191040039062,
"learning_rate": 7.85234899328859e-06,
"loss": 0.2875,
"step": 78
},
{
"epoch": 0.05321657123610643,
"grad_norm": 0.6979767084121704,
"learning_rate": 7.953020134228188e-06,
"loss": 0.2326,
"step": 79
},
{
"epoch": 0.05389019872010778,
"grad_norm": 0.7197319269180298,
"learning_rate": 8.053691275167785e-06,
"loss": 0.2413,
"step": 80
},
{
"epoch": 0.05456382620410913,
"grad_norm": 0.7689131498336792,
"learning_rate": 8.154362416107382e-06,
"loss": 0.2868,
"step": 81
},
{
"epoch": 0.05523745368811048,
"grad_norm": 0.7233304381370544,
"learning_rate": 8.25503355704698e-06,
"loss": 0.2586,
"step": 82
},
{
"epoch": 0.05591108117211182,
"grad_norm": 0.8464373350143433,
"learning_rate": 8.355704697986576e-06,
"loss": 0.2998,
"step": 83
},
{
"epoch": 0.05658470865611317,
"grad_norm": 0.8020244240760803,
"learning_rate": 8.456375838926174e-06,
"loss": 0.3323,
"step": 84
},
{
"epoch": 0.05725833614011452,
"grad_norm": 0.9260913729667664,
"learning_rate": 8.55704697986577e-06,
"loss": 0.3353,
"step": 85
},
{
"epoch": 0.057931963624115866,
"grad_norm": 0.824252188205719,
"learning_rate": 8.65771812080537e-06,
"loss": 0.2778,
"step": 86
},
{
"epoch": 0.05860559110811721,
"grad_norm": 0.7277565598487854,
"learning_rate": 8.758389261744967e-06,
"loss": 0.2863,
"step": 87
},
{
"epoch": 0.05927921859211856,
"grad_norm": 0.7575395107269287,
"learning_rate": 8.859060402684564e-06,
"loss": 0.2192,
"step": 88
},
{
"epoch": 0.059952846076119906,
"grad_norm": 0.7741091251373291,
"learning_rate": 8.959731543624161e-06,
"loss": 0.2808,
"step": 89
},
{
"epoch": 0.060626473560121255,
"grad_norm": 0.7291881442070007,
"learning_rate": 9.060402684563759e-06,
"loss": 0.2624,
"step": 90
},
{
"epoch": 0.0613001010441226,
"grad_norm": 0.7662385106086731,
"learning_rate": 9.161073825503356e-06,
"loss": 0.2803,
"step": 91
},
{
"epoch": 0.061973728528123946,
"grad_norm": 0.7009522914886475,
"learning_rate": 9.261744966442953e-06,
"loss": 0.26,
"step": 92
},
{
"epoch": 0.06264735601212529,
"grad_norm": 0.8707520365715027,
"learning_rate": 9.36241610738255e-06,
"loss": 0.3179,
"step": 93
},
{
"epoch": 0.06332098349612664,
"grad_norm": 0.8629103302955627,
"learning_rate": 9.463087248322147e-06,
"loss": 0.3065,
"step": 94
},
{
"epoch": 0.06399461098012799,
"grad_norm": 0.8592970371246338,
"learning_rate": 9.563758389261745e-06,
"loss": 0.2574,
"step": 95
},
{
"epoch": 0.06466823846412934,
"grad_norm": 0.8038861751556396,
"learning_rate": 9.664429530201342e-06,
"loss": 0.2699,
"step": 96
},
{
"epoch": 0.06534186594813068,
"grad_norm": 0.7168505787849426,
"learning_rate": 9.765100671140939e-06,
"loss": 0.2507,
"step": 97
},
{
"epoch": 0.06601549343213203,
"grad_norm": 0.7545929551124573,
"learning_rate": 9.865771812080538e-06,
"loss": 0.2922,
"step": 98
},
{
"epoch": 0.06668912091613338,
"grad_norm": 0.7718814611434937,
"learning_rate": 9.966442953020135e-06,
"loss": 0.254,
"step": 99
},
{
"epoch": 0.06736274840013473,
"grad_norm": 0.8245450854301453,
"learning_rate": 1.0067114093959732e-05,
"loss": 0.2869,
"step": 100
},
{
"epoch": 0.06736274840013473,
"eval_loss": 0.2735002040863037,
"eval_runtime": 104.2064,
"eval_samples_per_second": 47.982,
"eval_steps_per_second": 3.004,
"step": 100
},
{
"epoch": 0.06803637588413607,
"grad_norm": 0.9367719888687134,
"learning_rate": 1.016778523489933e-05,
"loss": 0.3273,
"step": 101
},
{
"epoch": 0.06871000336813742,
"grad_norm": 0.7697410583496094,
"learning_rate": 1.0268456375838927e-05,
"loss": 0.2493,
"step": 102
},
{
"epoch": 0.06938363085213876,
"grad_norm": 0.7449803948402405,
"learning_rate": 1.0369127516778524e-05,
"loss": 0.2558,
"step": 103
},
{
"epoch": 0.07005725833614011,
"grad_norm": 0.7809726595878601,
"learning_rate": 1.0469798657718121e-05,
"loss": 0.3079,
"step": 104
},
{
"epoch": 0.07073088582014146,
"grad_norm": 0.8014216423034668,
"learning_rate": 1.0570469798657718e-05,
"loss": 0.2774,
"step": 105
},
{
"epoch": 0.07140451330414281,
"grad_norm": 0.7782856225967407,
"learning_rate": 1.0671140939597316e-05,
"loss": 0.2874,
"step": 106
},
{
"epoch": 0.07207814078814416,
"grad_norm": 0.7489345669746399,
"learning_rate": 1.0771812080536913e-05,
"loss": 0.2618,
"step": 107
},
{
"epoch": 0.07275176827214551,
"grad_norm": 0.7881894111633301,
"learning_rate": 1.087248322147651e-05,
"loss": 0.2914,
"step": 108
},
{
"epoch": 0.07342539575614684,
"grad_norm": 0.8149965405464172,
"learning_rate": 1.0973154362416109e-05,
"loss": 0.2904,
"step": 109
},
{
"epoch": 0.07409902324014819,
"grad_norm": 0.8088157176971436,
"learning_rate": 1.1073825503355706e-05,
"loss": 0.3084,
"step": 110
},
{
"epoch": 0.07477265072414954,
"grad_norm": 0.804861843585968,
"learning_rate": 1.1174496644295303e-05,
"loss": 0.2919,
"step": 111
},
{
"epoch": 0.07544627820815089,
"grad_norm": 0.7035599946975708,
"learning_rate": 1.12751677852349e-05,
"loss": 0.2971,
"step": 112
},
{
"epoch": 0.07611990569215224,
"grad_norm": 0.8036991357803345,
"learning_rate": 1.1375838926174498e-05,
"loss": 0.2857,
"step": 113
},
{
"epoch": 0.07679353317615359,
"grad_norm": 0.6793683767318726,
"learning_rate": 1.1476510067114095e-05,
"loss": 0.2742,
"step": 114
},
{
"epoch": 0.07746716066015494,
"grad_norm": 0.7865248918533325,
"learning_rate": 1.1577181208053692e-05,
"loss": 0.3156,
"step": 115
},
{
"epoch": 0.07814078814415629,
"grad_norm": 0.6990460157394409,
"learning_rate": 1.167785234899329e-05,
"loss": 0.281,
"step": 116
},
{
"epoch": 0.07881441562815762,
"grad_norm": 0.7218809723854065,
"learning_rate": 1.1778523489932886e-05,
"loss": 0.2584,
"step": 117
},
{
"epoch": 0.07948804311215897,
"grad_norm": 0.6970985531806946,
"learning_rate": 1.1879194630872484e-05,
"loss": 0.2438,
"step": 118
},
{
"epoch": 0.08016167059616032,
"grad_norm": 0.7687243819236755,
"learning_rate": 1.1979865771812081e-05,
"loss": 0.2846,
"step": 119
},
{
"epoch": 0.08083529808016167,
"grad_norm": 0.6929764151573181,
"learning_rate": 1.2080536912751678e-05,
"loss": 0.2611,
"step": 120
},
{
"epoch": 0.08150892556416302,
"grad_norm": 0.729848325252533,
"learning_rate": 1.2181208053691277e-05,
"loss": 0.3007,
"step": 121
},
{
"epoch": 0.08218255304816437,
"grad_norm": 0.7301985025405884,
"learning_rate": 1.2281879194630874e-05,
"loss": 0.2847,
"step": 122
},
{
"epoch": 0.08285618053216572,
"grad_norm": 0.7333296537399292,
"learning_rate": 1.2382550335570471e-05,
"loss": 0.2674,
"step": 123
},
{
"epoch": 0.08352980801616706,
"grad_norm": 0.7411990165710449,
"learning_rate": 1.2483221476510069e-05,
"loss": 0.2777,
"step": 124
},
{
"epoch": 0.08420343550016841,
"grad_norm": 0.6465498805046082,
"learning_rate": 1.2583892617449664e-05,
"loss": 0.2579,
"step": 125
},
{
"epoch": 0.08487706298416975,
"grad_norm": 0.6950599551200867,
"learning_rate": 1.2684563758389261e-05,
"loss": 0.3164,
"step": 126
},
{
"epoch": 0.0855506904681711,
"grad_norm": 0.6696597337722778,
"learning_rate": 1.2785234899328858e-05,
"loss": 0.2564,
"step": 127
},
{
"epoch": 0.08622431795217245,
"grad_norm": 0.6537868976593018,
"learning_rate": 1.2885906040268456e-05,
"loss": 0.2375,
"step": 128
},
{
"epoch": 0.0868979454361738,
"grad_norm": 0.7363224029541016,
"learning_rate": 1.2986577181208053e-05,
"loss": 0.2589,
"step": 129
},
{
"epoch": 0.08757157292017514,
"grad_norm": 0.7354284524917603,
"learning_rate": 1.308724832214765e-05,
"loss": 0.3049,
"step": 130
},
{
"epoch": 0.0882452004041765,
"grad_norm": 0.6521575450897217,
"learning_rate": 1.3187919463087247e-05,
"loss": 0.2385,
"step": 131
},
{
"epoch": 0.08891882788817784,
"grad_norm": 0.6530443429946899,
"learning_rate": 1.3288590604026846e-05,
"loss": 0.2588,
"step": 132
},
{
"epoch": 0.08959245537217919,
"grad_norm": 0.7331404089927673,
"learning_rate": 1.3389261744966443e-05,
"loss": 0.3061,
"step": 133
},
{
"epoch": 0.09026608285618053,
"grad_norm": 0.7427138090133667,
"learning_rate": 1.348993288590604e-05,
"loss": 0.3513,
"step": 134
},
{
"epoch": 0.09093971034018188,
"grad_norm": 0.6774203181266785,
"learning_rate": 1.3590604026845638e-05,
"loss": 0.2639,
"step": 135
},
{
"epoch": 0.09161333782418322,
"grad_norm": 0.6679060459136963,
"learning_rate": 1.3691275167785235e-05,
"loss": 0.2503,
"step": 136
},
{
"epoch": 0.09228696530818457,
"grad_norm": 0.6390411853790283,
"learning_rate": 1.3791946308724832e-05,
"loss": 0.2298,
"step": 137
},
{
"epoch": 0.09296059279218592,
"grad_norm": 0.7115532159805298,
"learning_rate": 1.389261744966443e-05,
"loss": 0.255,
"step": 138
},
{
"epoch": 0.09363422027618727,
"grad_norm": 0.6546367406845093,
"learning_rate": 1.3993288590604027e-05,
"loss": 0.2623,
"step": 139
},
{
"epoch": 0.09430784776018862,
"grad_norm": 0.7526003122329712,
"learning_rate": 1.4093959731543624e-05,
"loss": 0.2701,
"step": 140
},
{
"epoch": 0.09498147524418997,
"grad_norm": 0.7417687773704529,
"learning_rate": 1.4194630872483221e-05,
"loss": 0.2488,
"step": 141
},
{
"epoch": 0.0956551027281913,
"grad_norm": 0.6994727849960327,
"learning_rate": 1.4295302013422818e-05,
"loss": 0.2861,
"step": 142
},
{
"epoch": 0.09632873021219265,
"grad_norm": 0.7503766417503357,
"learning_rate": 1.4395973154362415e-05,
"loss": 0.3002,
"step": 143
},
{
"epoch": 0.097002357696194,
"grad_norm": 0.6777353882789612,
"learning_rate": 1.4496644295302014e-05,
"loss": 0.2548,
"step": 144
},
{
"epoch": 0.09767598518019535,
"grad_norm": 0.8131176829338074,
"learning_rate": 1.4597315436241612e-05,
"loss": 0.2736,
"step": 145
},
{
"epoch": 0.0983496126641967,
"grad_norm": 0.6841787099838257,
"learning_rate": 1.4697986577181209e-05,
"loss": 0.2647,
"step": 146
},
{
"epoch": 0.09902324014819805,
"grad_norm": 0.673572838306427,
"learning_rate": 1.4798657718120806e-05,
"loss": 0.2414,
"step": 147
},
{
"epoch": 0.0996968676321994,
"grad_norm": 0.6950225234031677,
"learning_rate": 1.4899328859060403e-05,
"loss": 0.268,
"step": 148
},
{
"epoch": 0.10037049511620075,
"grad_norm": 0.7058023810386658,
"learning_rate": 1.5e-05,
"loss": 0.2682,
"step": 149
},
{
"epoch": 0.10104412260020208,
"grad_norm": 0.7642398476600647,
"learning_rate": 1.4999979233262118e-05,
"loss": 0.2871,
"step": 150
},
{
"epoch": 0.10171775008420343,
"grad_norm": 0.7045179605484009,
"learning_rate": 1.4999916933163468e-05,
"loss": 0.2589,
"step": 151
},
{
"epoch": 0.10239137756820478,
"grad_norm": 0.6908326148986816,
"learning_rate": 1.499981310004906e-05,
"loss": 0.2727,
"step": 152
},
{
"epoch": 0.10306500505220613,
"grad_norm": 0.7265616655349731,
"learning_rate": 1.4999667734493901e-05,
"loss": 0.3177,
"step": 153
},
{
"epoch": 0.10373863253620748,
"grad_norm": 0.630407452583313,
"learning_rate": 1.4999480837302995e-05,
"loss": 0.2636,
"step": 154
},
{
"epoch": 0.10441226002020883,
"grad_norm": 0.6864127516746521,
"learning_rate": 1.4999252409511335e-05,
"loss": 0.3013,
"step": 155
},
{
"epoch": 0.10508588750421018,
"grad_norm": 0.7556886076927185,
"learning_rate": 1.4998982452383916e-05,
"loss": 0.279,
"step": 156
},
{
"epoch": 0.10575951498821153,
"grad_norm": 0.7267988324165344,
"learning_rate": 1.4998670967415701e-05,
"loss": 0.2528,
"step": 157
},
{
"epoch": 0.10643314247221286,
"grad_norm": 0.6894652843475342,
"learning_rate": 1.4998317956331634e-05,
"loss": 0.2833,
"step": 158
},
{
"epoch": 0.10710676995621421,
"grad_norm": 0.7065450549125671,
"learning_rate": 1.4997923421086613e-05,
"loss": 0.3159,
"step": 159
},
{
"epoch": 0.10778039744021556,
"grad_norm": 0.6692951321601868,
"learning_rate": 1.49974873638655e-05,
"loss": 0.2747,
"step": 160
},
{
"epoch": 0.10845402492421691,
"grad_norm": 0.589299738407135,
"learning_rate": 1.4997009787083088e-05,
"loss": 0.2436,
"step": 161
},
{
"epoch": 0.10912765240821826,
"grad_norm": 0.6986613869667053,
"learning_rate": 1.49964906933841e-05,
"loss": 0.2893,
"step": 162
},
{
"epoch": 0.1098012798922196,
"grad_norm": 0.6756588220596313,
"learning_rate": 1.4995930085643173e-05,
"loss": 0.3076,
"step": 163
},
{
"epoch": 0.11047490737622095,
"grad_norm": 0.6988603472709656,
"learning_rate": 1.4995327966964838e-05,
"loss": 0.2646,
"step": 164
},
{
"epoch": 0.1111485348602223,
"grad_norm": 0.6961201429367065,
"learning_rate": 1.4994684340683506e-05,
"loss": 0.2984,
"step": 165
},
{
"epoch": 0.11182216234422364,
"grad_norm": 0.7064459323883057,
"learning_rate": 1.4993999210363444e-05,
"loss": 0.3186,
"step": 166
},
{
"epoch": 0.11249578982822499,
"grad_norm": 0.6374897360801697,
"learning_rate": 1.4993272579798773e-05,
"loss": 0.2833,
"step": 167
},
{
"epoch": 0.11316941731222634,
"grad_norm": 0.6672942638397217,
"learning_rate": 1.4992504453013422e-05,
"loss": 0.2891,
"step": 168
},
{
"epoch": 0.11384304479622769,
"grad_norm": 0.6631248593330383,
"learning_rate": 1.499169483426112e-05,
"loss": 0.2512,
"step": 169
},
{
"epoch": 0.11451667228022903,
"grad_norm": 0.7132297158241272,
"learning_rate": 1.4990843728025367e-05,
"loss": 0.2988,
"step": 170
},
{
"epoch": 0.11519029976423038,
"grad_norm": 0.6612878441810608,
"learning_rate": 1.4989951139019425e-05,
"loss": 0.283,
"step": 171
},
{
"epoch": 0.11586392724823173,
"grad_norm": 0.6382921934127808,
"learning_rate": 1.4989017072186267e-05,
"loss": 0.2597,
"step": 172
},
{
"epoch": 0.11653755473223308,
"grad_norm": 0.5888513922691345,
"learning_rate": 1.498804153269856e-05,
"loss": 0.243,
"step": 173
},
{
"epoch": 0.11721118221623442,
"grad_norm": 0.7310932874679565,
"learning_rate": 1.498702452595865e-05,
"loss": 0.2871,
"step": 174
},
{
"epoch": 0.11788480970023577,
"grad_norm": 0.6769680380821228,
"learning_rate": 1.4985966057598512e-05,
"loss": 0.2896,
"step": 175
},
{
"epoch": 0.11855843718423711,
"grad_norm": 0.7013587355613708,
"learning_rate": 1.4984866133479729e-05,
"loss": 0.2913,
"step": 176
},
{
"epoch": 0.11923206466823846,
"grad_norm": 0.7067077159881592,
"learning_rate": 1.4983724759693456e-05,
"loss": 0.2931,
"step": 177
},
{
"epoch": 0.11990569215223981,
"grad_norm": 0.6384806632995605,
"learning_rate": 1.498254194256039e-05,
"loss": 0.2433,
"step": 178
},
{
"epoch": 0.12057931963624116,
"grad_norm": 0.733525276184082,
"learning_rate": 1.4981317688630729e-05,
"loss": 0.314,
"step": 179
},
{
"epoch": 0.12125294712024251,
"grad_norm": 0.6598628759384155,
"learning_rate": 1.4980052004684146e-05,
"loss": 0.281,
"step": 180
},
{
"epoch": 0.12192657460424386,
"grad_norm": 0.616263210773468,
"learning_rate": 1.4978744897729741e-05,
"loss": 0.2616,
"step": 181
},
{
"epoch": 0.1226002020882452,
"grad_norm": 0.6175768971443176,
"learning_rate": 1.4977396375006006e-05,
"loss": 0.2624,
"step": 182
},
{
"epoch": 0.12327382957224654,
"grad_norm": 0.676030695438385,
"learning_rate": 1.4976006443980785e-05,
"loss": 0.287,
"step": 183
},
{
"epoch": 0.12394745705624789,
"grad_norm": 0.6331183314323425,
"learning_rate": 1.4974575112351235e-05,
"loss": 0.2647,
"step": 184
},
{
"epoch": 0.12462108454024924,
"grad_norm": 0.656204104423523,
"learning_rate": 1.497310238804378e-05,
"loss": 0.2755,
"step": 185
},
{
"epoch": 0.12529471202425058,
"grad_norm": 0.6582143306732178,
"learning_rate": 1.4971588279214065e-05,
"loss": 0.2774,
"step": 186
},
{
"epoch": 0.12596833950825193,
"grad_norm": 0.6152216792106628,
"learning_rate": 1.4970032794246918e-05,
"loss": 0.2694,
"step": 187
},
{
"epoch": 0.12664196699225327,
"grad_norm": 0.5943458676338196,
"learning_rate": 1.4968435941756303e-05,
"loss": 0.2698,
"step": 188
},
{
"epoch": 0.12731559447625462,
"grad_norm": 0.7527596354484558,
"learning_rate": 1.496679773058526e-05,
"loss": 0.2996,
"step": 189
},
{
"epoch": 0.12798922196025597,
"grad_norm": 0.6229069828987122,
"learning_rate": 1.4965118169805868e-05,
"loss": 0.275,
"step": 190
},
{
"epoch": 0.12866284944425732,
"grad_norm": 0.620919406414032,
"learning_rate": 1.4963397268719198e-05,
"loss": 0.2956,
"step": 191
},
{
"epoch": 0.12933647692825867,
"grad_norm": 0.6090366244316101,
"learning_rate": 1.4961635036855249e-05,
"loss": 0.258,
"step": 192
},
{
"epoch": 0.13001010441226002,
"grad_norm": 0.5942346453666687,
"learning_rate": 1.4959831483972901e-05,
"loss": 0.266,
"step": 193
},
{
"epoch": 0.13068373189626137,
"grad_norm": 0.6019350290298462,
"learning_rate": 1.4957986620059866e-05,
"loss": 0.256,
"step": 194
},
{
"epoch": 0.13135735938026272,
"grad_norm": 0.6708882451057434,
"learning_rate": 1.4956100455332623e-05,
"loss": 0.2924,
"step": 195
},
{
"epoch": 0.13203098686426407,
"grad_norm": 0.7132793068885803,
"learning_rate": 1.4954173000236369e-05,
"loss": 0.3174,
"step": 196
},
{
"epoch": 0.13270461434826542,
"grad_norm": 0.602311909198761,
"learning_rate": 1.495220426544496e-05,
"loss": 0.2388,
"step": 197
},
{
"epoch": 0.13337824183226676,
"grad_norm": 0.5862560868263245,
"learning_rate": 1.495019426186085e-05,
"loss": 0.2382,
"step": 198
},
{
"epoch": 0.1340518693162681,
"grad_norm": 0.6618714332580566,
"learning_rate": 1.4948143000615028e-05,
"loss": 0.2654,
"step": 199
},
{
"epoch": 0.13472549680026946,
"grad_norm": 0.6195774078369141,
"learning_rate": 1.4946050493066965e-05,
"loss": 0.2696,
"step": 200
},
{
"epoch": 0.13472549680026946,
"eval_loss": 0.2768155038356781,
"eval_runtime": 105.0569,
"eval_samples_per_second": 47.593,
"eval_steps_per_second": 2.979,
"step": 200
},
{
"epoch": 0.1353991242842708,
"grad_norm": 0.5954621434211731,
"learning_rate": 1.4943916750804537e-05,
"loss": 0.2625,
"step": 201
},
{
"epoch": 0.13607275176827213,
"grad_norm": 0.610717236995697,
"learning_rate": 1.494174178564398e-05,
"loss": 0.2953,
"step": 202
},
{
"epoch": 0.13674637925227348,
"grad_norm": 0.6930943727493286,
"learning_rate": 1.4939525609629809e-05,
"loss": 0.2774,
"step": 203
},
{
"epoch": 0.13742000673627483,
"grad_norm": 0.6402983069419861,
"learning_rate": 1.4937268235034754e-05,
"loss": 0.2814,
"step": 204
},
{
"epoch": 0.13809363422027618,
"grad_norm": 0.6476616859436035,
"learning_rate": 1.4934969674359698e-05,
"loss": 0.2829,
"step": 205
},
{
"epoch": 0.13876726170427753,
"grad_norm": 0.6163775324821472,
"learning_rate": 1.49326299403336e-05,
"loss": 0.2682,
"step": 206
},
{
"epoch": 0.13944088918827888,
"grad_norm": 0.6615155935287476,
"learning_rate": 1.4930249045913437e-05,
"loss": 0.2656,
"step": 207
},
{
"epoch": 0.14011451667228023,
"grad_norm": 0.6666435599327087,
"learning_rate": 1.4927827004284117e-05,
"loss": 0.2972,
"step": 208
},
{
"epoch": 0.14078814415628157,
"grad_norm": 0.6047382950782776,
"learning_rate": 1.4925363828858407e-05,
"loss": 0.2527,
"step": 209
},
{
"epoch": 0.14146177164028292,
"grad_norm": 0.6405648589134216,
"learning_rate": 1.4922859533276882e-05,
"loss": 0.2589,
"step": 210
},
{
"epoch": 0.14213539912428427,
"grad_norm": 0.6201145648956299,
"learning_rate": 1.4920314131407817e-05,
"loss": 0.2419,
"step": 211
},
{
"epoch": 0.14280902660828562,
"grad_norm": 0.6683364510536194,
"learning_rate": 1.4917727637347132e-05,
"loss": 0.2973,
"step": 212
},
{
"epoch": 0.14348265409228697,
"grad_norm": 0.5999878644943237,
"learning_rate": 1.4915100065418302e-05,
"loss": 0.2714,
"step": 213
},
{
"epoch": 0.14415628157628832,
"grad_norm": 0.6046174764633179,
"learning_rate": 1.491243143017229e-05,
"loss": 0.2841,
"step": 214
},
{
"epoch": 0.14482990906028967,
"grad_norm": 0.6034740209579468,
"learning_rate": 1.4909721746387454e-05,
"loss": 0.2896,
"step": 215
},
{
"epoch": 0.14550353654429102,
"grad_norm": 0.6835145354270935,
"learning_rate": 1.4906971029069473e-05,
"loss": 0.2778,
"step": 216
},
{
"epoch": 0.14617716402829237,
"grad_norm": 0.6769616603851318,
"learning_rate": 1.490417929345126e-05,
"loss": 0.2697,
"step": 217
},
{
"epoch": 0.1468507915122937,
"grad_norm": 0.6558434367179871,
"learning_rate": 1.4901346554992879e-05,
"loss": 0.2708,
"step": 218
},
{
"epoch": 0.14752441899629504,
"grad_norm": 0.6363021731376648,
"learning_rate": 1.489847282938146e-05,
"loss": 0.297,
"step": 219
},
{
"epoch": 0.14819804648029639,
"grad_norm": 0.6437724828720093,
"learning_rate": 1.4895558132531112e-05,
"loss": 0.2827,
"step": 220
},
{
"epoch": 0.14887167396429773,
"grad_norm": 0.6295124292373657,
"learning_rate": 1.4892602480582836e-05,
"loss": 0.2998,
"step": 221
},
{
"epoch": 0.14954530144829908,
"grad_norm": 0.634768545627594,
"learning_rate": 1.4889605889904426e-05,
"loss": 0.2686,
"step": 222
},
{
"epoch": 0.15021892893230043,
"grad_norm": 0.624239981174469,
"learning_rate": 1.4886568377090396e-05,
"loss": 0.3161,
"step": 223
},
{
"epoch": 0.15089255641630178,
"grad_norm": 0.6285136342048645,
"learning_rate": 1.4883489958961875e-05,
"loss": 0.3089,
"step": 224
},
{
"epoch": 0.15156618390030313,
"grad_norm": 0.6140178442001343,
"learning_rate": 1.4880370652566516e-05,
"loss": 0.2888,
"step": 225
},
{
"epoch": 0.15223981138430448,
"grad_norm": 0.5987722873687744,
"learning_rate": 1.4877210475178403e-05,
"loss": 0.2586,
"step": 226
},
{
"epoch": 0.15291343886830583,
"grad_norm": 0.6315680146217346,
"learning_rate": 1.487400944429796e-05,
"loss": 0.2876,
"step": 227
},
{
"epoch": 0.15358706635230718,
"grad_norm": 0.6932382583618164,
"learning_rate": 1.487076757765184e-05,
"loss": 0.2886,
"step": 228
},
{
"epoch": 0.15426069383630853,
"grad_norm": 0.5736963748931885,
"learning_rate": 1.4867484893192847e-05,
"loss": 0.2524,
"step": 229
},
{
"epoch": 0.15493432132030988,
"grad_norm": 0.6102257370948792,
"learning_rate": 1.4864161409099814e-05,
"loss": 0.2518,
"step": 230
},
{
"epoch": 0.15560794880431122,
"grad_norm": 0.5340930819511414,
"learning_rate": 1.4860797143777526e-05,
"loss": 0.2466,
"step": 231
},
{
"epoch": 0.15628157628831257,
"grad_norm": 0.6170995831489563,
"learning_rate": 1.4857392115856597e-05,
"loss": 0.2588,
"step": 232
},
{
"epoch": 0.15695520377231392,
"grad_norm": 0.5439332127571106,
"learning_rate": 1.4853946344193386e-05,
"loss": 0.2377,
"step": 233
},
{
"epoch": 0.15762883125631524,
"grad_norm": 0.6084430813789368,
"learning_rate": 1.4850459847869866e-05,
"loss": 0.2514,
"step": 234
},
{
"epoch": 0.1583024587403166,
"grad_norm": 0.6239585280418396,
"learning_rate": 1.4846932646193554e-05,
"loss": 0.2892,
"step": 235
},
{
"epoch": 0.15897608622431794,
"grad_norm": 0.6361899375915527,
"learning_rate": 1.4843364758697371e-05,
"loss": 0.264,
"step": 236
},
{
"epoch": 0.1596497137083193,
"grad_norm": 0.5994705557823181,
"learning_rate": 1.4839756205139555e-05,
"loss": 0.2756,
"step": 237
},
{
"epoch": 0.16032334119232064,
"grad_norm": 0.6532281041145325,
"learning_rate": 1.4836107005503543e-05,
"loss": 0.3262,
"step": 238
},
{
"epoch": 0.160996968676322,
"grad_norm": 0.6311124563217163,
"learning_rate": 1.483241717999786e-05,
"loss": 0.3137,
"step": 239
},
{
"epoch": 0.16167059616032334,
"grad_norm": 0.5731788873672485,
"learning_rate": 1.4828686749056007e-05,
"loss": 0.2476,
"step": 240
},
{
"epoch": 0.1623442236443247,
"grad_norm": 0.5689460039138794,
"learning_rate": 1.4824915733336355e-05,
"loss": 0.2717,
"step": 241
},
{
"epoch": 0.16301785112832604,
"grad_norm": 0.6340669989585876,
"learning_rate": 1.4821104153722023e-05,
"loss": 0.2756,
"step": 242
},
{
"epoch": 0.16369147861232738,
"grad_norm": 0.6497682929039001,
"learning_rate": 1.4817252031320766e-05,
"loss": 0.3197,
"step": 243
},
{
"epoch": 0.16436510609632873,
"grad_norm": 0.6404630541801453,
"learning_rate": 1.481335938746485e-05,
"loss": 0.2641,
"step": 244
},
{
"epoch": 0.16503873358033008,
"grad_norm": 0.5862687230110168,
"learning_rate": 1.480942624371095e-05,
"loss": 0.261,
"step": 245
},
{
"epoch": 0.16571236106433143,
"grad_norm": 0.6154356598854065,
"learning_rate": 1.4805452621840015e-05,
"loss": 0.2856,
"step": 246
},
{
"epoch": 0.16638598854833278,
"grad_norm": 0.7411592602729797,
"learning_rate": 1.4801438543857154e-05,
"loss": 0.2838,
"step": 247
},
{
"epoch": 0.16705961603233413,
"grad_norm": 0.6304882764816284,
"learning_rate": 1.479738403199152e-05,
"loss": 0.3102,
"step": 248
},
{
"epoch": 0.16773324351633548,
"grad_norm": 0.5838252305984497,
"learning_rate": 1.479328910869617e-05,
"loss": 0.3074,
"step": 249
},
{
"epoch": 0.16840687100033683,
"grad_norm": 0.6592857241630554,
"learning_rate": 1.4789153796647957e-05,
"loss": 0.2482,
"step": 250
},
{
"epoch": 0.16908049848433815,
"grad_norm": 0.6678220629692078,
"learning_rate": 1.4784978118747404e-05,
"loss": 0.2858,
"step": 251
},
{
"epoch": 0.1697541259683395,
"grad_norm": 0.7072235345840454,
"learning_rate": 1.4780762098118564e-05,
"loss": 0.317,
"step": 252
},
{
"epoch": 0.17042775345234085,
"grad_norm": 0.6481045484542847,
"learning_rate": 1.4776505758108901e-05,
"loss": 0.3074,
"step": 253
},
{
"epoch": 0.1711013809363422,
"grad_norm": 0.573128342628479,
"learning_rate": 1.477220912228916e-05,
"loss": 0.2421,
"step": 254
},
{
"epoch": 0.17177500842034354,
"grad_norm": 0.5758487582206726,
"learning_rate": 1.4767872214453241e-05,
"loss": 0.2874,
"step": 255
},
{
"epoch": 0.1724486359043449,
"grad_norm": 0.5688092112541199,
"learning_rate": 1.4763495058618056e-05,
"loss": 0.2897,
"step": 256
},
{
"epoch": 0.17312226338834624,
"grad_norm": 0.607288658618927,
"learning_rate": 1.4759077679023406e-05,
"loss": 0.2707,
"step": 257
},
{
"epoch": 0.1737958908723476,
"grad_norm": 0.6363064646720886,
"learning_rate": 1.4754620100131838e-05,
"loss": 0.2977,
"step": 258
},
{
"epoch": 0.17446951835634894,
"grad_norm": 0.6312716007232666,
"learning_rate": 1.475012234662852e-05,
"loss": 0.2794,
"step": 259
},
{
"epoch": 0.1751431458403503,
"grad_norm": 0.6589624285697937,
"learning_rate": 1.4745584443421097e-05,
"loss": 0.3483,
"step": 260
},
{
"epoch": 0.17581677332435164,
"grad_norm": 0.5797691345214844,
"learning_rate": 1.4741006415639555e-05,
"loss": 0.3013,
"step": 261
},
{
"epoch": 0.176490400808353,
"grad_norm": 0.5717487335205078,
"learning_rate": 1.473638828863608e-05,
"loss": 0.2725,
"step": 262
},
{
"epoch": 0.17716402829235434,
"grad_norm": 0.6161592602729797,
"learning_rate": 1.4731730087984924e-05,
"loss": 0.3049,
"step": 263
},
{
"epoch": 0.17783765577635569,
"grad_norm": 0.6334370970726013,
"learning_rate": 1.4727031839482251e-05,
"loss": 0.2844,
"step": 264
},
{
"epoch": 0.17851128326035703,
"grad_norm": 0.576859176158905,
"learning_rate": 1.472229356914601e-05,
"loss": 0.244,
"step": 265
},
{
"epoch": 0.17918491074435838,
"grad_norm": 0.6241918802261353,
"learning_rate": 1.4717515303215776e-05,
"loss": 0.2838,
"step": 266
},
{
"epoch": 0.1798585382283597,
"grad_norm": 0.5989061594009399,
"learning_rate": 1.4712697068152619e-05,
"loss": 0.2984,
"step": 267
},
{
"epoch": 0.18053216571236105,
"grad_norm": 0.5685368180274963,
"learning_rate": 1.4707838890638941e-05,
"loss": 0.2787,
"step": 268
},
{
"epoch": 0.1812057931963624,
"grad_norm": 0.6349403262138367,
"learning_rate": 1.4702940797578345e-05,
"loss": 0.3078,
"step": 269
},
{
"epoch": 0.18187942068036375,
"grad_norm": 0.6529637575149536,
"learning_rate": 1.4698002816095473e-05,
"loss": 0.307,
"step": 270
},
{
"epoch": 0.1825530481643651,
"grad_norm": 0.5679558515548706,
"learning_rate": 1.4693024973535863e-05,
"loss": 0.25,
"step": 271
},
{
"epoch": 0.18322667564836645,
"grad_norm": 0.5999310612678528,
"learning_rate": 1.4688007297465796e-05,
"loss": 0.259,
"step": 272
},
{
"epoch": 0.1839003031323678,
"grad_norm": 0.6034629344940186,
"learning_rate": 1.4682949815672146e-05,
"loss": 0.3071,
"step": 273
},
{
"epoch": 0.18457393061636915,
"grad_norm": 0.610670268535614,
"learning_rate": 1.467785255616221e-05,
"loss": 0.2913,
"step": 274
},
{
"epoch": 0.1852475581003705,
"grad_norm": 0.628016471862793,
"learning_rate": 1.4672715547163584e-05,
"loss": 0.2839,
"step": 275
},
{
"epoch": 0.18592118558437185,
"grad_norm": 0.6297721862792969,
"learning_rate": 1.4667538817123977e-05,
"loss": 0.3403,
"step": 276
},
{
"epoch": 0.1865948130683732,
"grad_norm": 0.540552020072937,
"learning_rate": 1.4662322394711067e-05,
"loss": 0.2454,
"step": 277
},
{
"epoch": 0.18726844055237454,
"grad_norm": 0.513788640499115,
"learning_rate": 1.4657066308812342e-05,
"loss": 0.233,
"step": 278
},
{
"epoch": 0.1879420680363759,
"grad_norm": 0.6221415996551514,
"learning_rate": 1.4651770588534937e-05,
"loss": 0.2969,
"step": 279
},
{
"epoch": 0.18861569552037724,
"grad_norm": 0.5859697461128235,
"learning_rate": 1.4646435263205475e-05,
"loss": 0.2771,
"step": 280
},
{
"epoch": 0.1892893230043786,
"grad_norm": 0.5720670819282532,
"learning_rate": 1.4641060362369904e-05,
"loss": 0.2758,
"step": 281
},
{
"epoch": 0.18996295048837994,
"grad_norm": 0.5609393119812012,
"learning_rate": 1.4635645915793333e-05,
"loss": 0.256,
"step": 282
},
{
"epoch": 0.19063657797238126,
"grad_norm": 0.5734854340553284,
"learning_rate": 1.4630191953459862e-05,
"loss": 0.3233,
"step": 283
},
{
"epoch": 0.1913102054563826,
"grad_norm": 0.570590615272522,
"learning_rate": 1.4624698505572432e-05,
"loss": 0.2757,
"step": 284
},
{
"epoch": 0.19198383294038396,
"grad_norm": 0.623126208782196,
"learning_rate": 1.4619165602552637e-05,
"loss": 0.2964,
"step": 285
},
{
"epoch": 0.1926574604243853,
"grad_norm": 0.5599439144134521,
"learning_rate": 1.4613593275040572e-05,
"loss": 0.2582,
"step": 286
},
{
"epoch": 0.19333108790838666,
"grad_norm": 0.5614957809448242,
"learning_rate": 1.4607981553894654e-05,
"loss": 0.27,
"step": 287
},
{
"epoch": 0.194004715392388,
"grad_norm": 0.5625648498535156,
"learning_rate": 1.4602330470191453e-05,
"loss": 0.2751,
"step": 288
},
{
"epoch": 0.19467834287638935,
"grad_norm": 0.5504026412963867,
"learning_rate": 1.4596640055225521e-05,
"loss": 0.2429,
"step": 289
},
{
"epoch": 0.1953519703603907,
"grad_norm": 0.5794048309326172,
"learning_rate": 1.4590910340509224e-05,
"loss": 0.2882,
"step": 290
},
{
"epoch": 0.19602559784439205,
"grad_norm": 0.550942599773407,
"learning_rate": 1.4585141357772554e-05,
"loss": 0.2604,
"step": 291
},
{
"epoch": 0.1966992253283934,
"grad_norm": 0.6088408827781677,
"learning_rate": 1.4579333138962966e-05,
"loss": 0.2993,
"step": 292
},
{
"epoch": 0.19737285281239475,
"grad_norm": 0.6309805512428284,
"learning_rate": 1.4573485716245193e-05,
"loss": 0.297,
"step": 293
},
{
"epoch": 0.1980464802963961,
"grad_norm": 0.6433154344558716,
"learning_rate": 1.456759912200108e-05,
"loss": 0.2919,
"step": 294
},
{
"epoch": 0.19872010778039745,
"grad_norm": 0.6373067498207092,
"learning_rate": 1.456167338882938e-05,
"loss": 0.2719,
"step": 295
},
{
"epoch": 0.1993937352643988,
"grad_norm": 0.5514649748802185,
"learning_rate": 1.4555708549545607e-05,
"loss": 0.2638,
"step": 296
},
{
"epoch": 0.20006736274840015,
"grad_norm": 0.5804110169410706,
"learning_rate": 1.4549704637181827e-05,
"loss": 0.2828,
"step": 297
},
{
"epoch": 0.2007409902324015,
"grad_norm": 0.5397315621376038,
"learning_rate": 1.4543661684986484e-05,
"loss": 0.2712,
"step": 298
},
{
"epoch": 0.20141461771640282,
"grad_norm": 0.6435424089431763,
"learning_rate": 1.4537579726424221e-05,
"loss": 0.3095,
"step": 299
},
{
"epoch": 0.20208824520040417,
"grad_norm": 0.5241397023200989,
"learning_rate": 1.453145879517569e-05,
"loss": 0.2635,
"step": 300
},
{
"epoch": 0.20208824520040417,
"eval_loss": 0.2736159861087799,
"eval_runtime": 107.1602,
"eval_samples_per_second": 46.659,
"eval_steps_per_second": 2.921,
"step": 300
},
{
"epoch": 0.20276187268440551,
"grad_norm": 0.5774008631706238,
"learning_rate": 1.4525298925137362e-05,
"loss": 0.2752,
"step": 301
},
{
"epoch": 0.20343550016840686,
"grad_norm": 0.5994575619697571,
"learning_rate": 1.4519100150421343e-05,
"loss": 0.3073,
"step": 302
},
{
"epoch": 0.2041091276524082,
"grad_norm": 0.5691470503807068,
"learning_rate": 1.4512862505355195e-05,
"loss": 0.2846,
"step": 303
},
{
"epoch": 0.20478275513640956,
"grad_norm": 0.5722606182098389,
"learning_rate": 1.450658602448172e-05,
"loss": 0.2549,
"step": 304
},
{
"epoch": 0.2054563826204109,
"grad_norm": 0.632279634475708,
"learning_rate": 1.45002707425588e-05,
"loss": 0.3197,
"step": 305
},
{
"epoch": 0.20613001010441226,
"grad_norm": 0.5538962483406067,
"learning_rate": 1.449391669455918e-05,
"loss": 0.2656,
"step": 306
},
{
"epoch": 0.2068036375884136,
"grad_norm": 0.5925297737121582,
"learning_rate": 1.4487523915670286e-05,
"loss": 0.2821,
"step": 307
},
{
"epoch": 0.20747726507241496,
"grad_norm": 0.6299713850021362,
"learning_rate": 1.448109244129403e-05,
"loss": 0.3116,
"step": 308
},
{
"epoch": 0.2081508925564163,
"grad_norm": 0.6114513874053955,
"learning_rate": 1.447462230704661e-05,
"loss": 0.285,
"step": 309
},
{
"epoch": 0.20882452004041765,
"grad_norm": 0.5723987817764282,
"learning_rate": 1.4468113548758313e-05,
"loss": 0.278,
"step": 310
},
{
"epoch": 0.209498147524419,
"grad_norm": 0.5769573450088501,
"learning_rate": 1.4461566202473322e-05,
"loss": 0.2892,
"step": 311
},
{
"epoch": 0.21017177500842035,
"grad_norm": 0.6040593981742859,
"learning_rate": 1.4454980304449506e-05,
"loss": 0.3123,
"step": 312
},
{
"epoch": 0.2108454024924217,
"grad_norm": 0.5362566113471985,
"learning_rate": 1.4448355891158235e-05,
"loss": 0.24,
"step": 313
},
{
"epoch": 0.21151902997642305,
"grad_norm": 0.560070812702179,
"learning_rate": 1.4441692999284159e-05,
"loss": 0.2663,
"step": 314
},
{
"epoch": 0.21219265746042437,
"grad_norm": 0.6649965047836304,
"learning_rate": 1.443499166572502e-05,
"loss": 0.3441,
"step": 315
},
{
"epoch": 0.21286628494442572,
"grad_norm": 0.5337359309196472,
"learning_rate": 1.4428251927591445e-05,
"loss": 0.253,
"step": 316
},
{
"epoch": 0.21353991242842707,
"grad_norm": 0.6185274720191956,
"learning_rate": 1.4421473822206729e-05,
"loss": 0.305,
"step": 317
},
{
"epoch": 0.21421353991242842,
"grad_norm": 0.5125210881233215,
"learning_rate": 1.4414657387106646e-05,
"loss": 0.2774,
"step": 318
},
{
"epoch": 0.21488716739642977,
"grad_norm": 0.5758813619613647,
"learning_rate": 1.4407802660039226e-05,
"loss": 0.2484,
"step": 319
},
{
"epoch": 0.21556079488043112,
"grad_norm": 0.5220269560813904,
"learning_rate": 1.4400909678964556e-05,
"loss": 0.2399,
"step": 320
},
{
"epoch": 0.21623442236443247,
"grad_norm": 0.5919392704963684,
"learning_rate": 1.4393978482054561e-05,
"loss": 0.2924,
"step": 321
},
{
"epoch": 0.21690804984843381,
"grad_norm": 0.5359899997711182,
"learning_rate": 1.4387009107692808e-05,
"loss": 0.2493,
"step": 322
},
{
"epoch": 0.21758167733243516,
"grad_norm": 0.568356454372406,
"learning_rate": 1.4380001594474267e-05,
"loss": 0.2877,
"step": 323
},
{
"epoch": 0.2182553048164365,
"grad_norm": 0.5183501243591309,
"learning_rate": 1.4372955981205127e-05,
"loss": 0.262,
"step": 324
},
{
"epoch": 0.21892893230043786,
"grad_norm": 0.5353648662567139,
"learning_rate": 1.436587230690256e-05,
"loss": 0.269,
"step": 325
},
{
"epoch": 0.2196025597844392,
"grad_norm": 0.5863710641860962,
"learning_rate": 1.4358750610794522e-05,
"loss": 0.2933,
"step": 326
},
{
"epoch": 0.22027618726844056,
"grad_norm": 0.5193360447883606,
"learning_rate": 1.4351590932319506e-05,
"loss": 0.2539,
"step": 327
},
{
"epoch": 0.2209498147524419,
"grad_norm": 0.521597146987915,
"learning_rate": 1.4344393311126367e-05,
"loss": 0.24,
"step": 328
},
{
"epoch": 0.22162344223644326,
"grad_norm": 0.5621289014816284,
"learning_rate": 1.4337157787074063e-05,
"loss": 0.2647,
"step": 329
},
{
"epoch": 0.2222970697204446,
"grad_norm": 0.6134183406829834,
"learning_rate": 1.432988440023146e-05,
"loss": 0.2846,
"step": 330
},
{
"epoch": 0.22297069720444593,
"grad_norm": 0.5819990634918213,
"learning_rate": 1.4322573190877091e-05,
"loss": 0.2725,
"step": 331
},
{
"epoch": 0.22364432468844728,
"grad_norm": 0.6009438037872314,
"learning_rate": 1.4315224199498952e-05,
"loss": 0.2507,
"step": 332
},
{
"epoch": 0.22431795217244863,
"grad_norm": 0.5484105944633484,
"learning_rate": 1.4307837466794258e-05,
"loss": 0.2715,
"step": 333
},
{
"epoch": 0.22499157965644997,
"grad_norm": 0.5025244951248169,
"learning_rate": 1.4300413033669241e-05,
"loss": 0.2257,
"step": 334
},
{
"epoch": 0.22566520714045132,
"grad_norm": 0.5583484172821045,
"learning_rate": 1.4292950941238898e-05,
"loss": 0.3015,
"step": 335
},
{
"epoch": 0.22633883462445267,
"grad_norm": 0.5975006222724915,
"learning_rate": 1.4285451230826783e-05,
"loss": 0.2924,
"step": 336
},
{
"epoch": 0.22701246210845402,
"grad_norm": 0.6017155051231384,
"learning_rate": 1.4277913943964763e-05,
"loss": 0.2928,
"step": 337
},
{
"epoch": 0.22768608959245537,
"grad_norm": 0.5619384050369263,
"learning_rate": 1.4270339122392808e-05,
"loss": 0.2744,
"step": 338
},
{
"epoch": 0.22835971707645672,
"grad_norm": 0.576554536819458,
"learning_rate": 1.4262726808058735e-05,
"loss": 0.3019,
"step": 339
},
{
"epoch": 0.22903334456045807,
"grad_norm": 0.5621641874313354,
"learning_rate": 1.4255077043117994e-05,
"loss": 0.2801,
"step": 340
},
{
"epoch": 0.22970697204445942,
"grad_norm": 0.5104705095291138,
"learning_rate": 1.424738986993343e-05,
"loss": 0.2572,
"step": 341
},
{
"epoch": 0.23038059952846077,
"grad_norm": 0.5731213688850403,
"learning_rate": 1.4239665331075048e-05,
"loss": 0.2545,
"step": 342
},
{
"epoch": 0.23105422701246212,
"grad_norm": 0.6381127238273621,
"learning_rate": 1.4231903469319772e-05,
"loss": 0.3023,
"step": 343
},
{
"epoch": 0.23172785449646346,
"grad_norm": 0.5358138680458069,
"learning_rate": 1.4224104327651213e-05,
"loss": 0.2597,
"step": 344
},
{
"epoch": 0.2324014819804648,
"grad_norm": 0.5517827272415161,
"learning_rate": 1.4216267949259437e-05,
"loss": 0.2669,
"step": 345
},
{
"epoch": 0.23307510946446616,
"grad_norm": 0.5380638241767883,
"learning_rate": 1.4208394377540712e-05,
"loss": 0.2706,
"step": 346
},
{
"epoch": 0.23374873694846748,
"grad_norm": 0.6162987351417542,
"learning_rate": 1.4200483656097278e-05,
"loss": 0.2721,
"step": 347
},
{
"epoch": 0.23442236443246883,
"grad_norm": 0.6142714619636536,
"learning_rate": 1.4192535828737102e-05,
"loss": 0.3158,
"step": 348
},
{
"epoch": 0.23509599191647018,
"grad_norm": 0.6231828331947327,
"learning_rate": 1.4184550939473644e-05,
"loss": 0.3022,
"step": 349
},
{
"epoch": 0.23576961940047153,
"grad_norm": 0.5371239185333252,
"learning_rate": 1.4176529032525584e-05,
"loss": 0.2372,
"step": 350
},
{
"epoch": 0.23644324688447288,
"grad_norm": 0.5987442135810852,
"learning_rate": 1.4168470152316624e-05,
"loss": 0.2856,
"step": 351
},
{
"epoch": 0.23711687436847423,
"grad_norm": 0.5490831732749939,
"learning_rate": 1.41603743434752e-05,
"loss": 0.2352,
"step": 352
},
{
"epoch": 0.23779050185247558,
"grad_norm": 0.5611885786056519,
"learning_rate": 1.415224165083426e-05,
"loss": 0.2763,
"step": 353
},
{
"epoch": 0.23846412933647693,
"grad_norm": 0.5451275706291199,
"learning_rate": 1.4144072119431e-05,
"loss": 0.2725,
"step": 354
},
{
"epoch": 0.23913775682047828,
"grad_norm": 0.5789247155189514,
"learning_rate": 1.413586579450662e-05,
"loss": 0.2604,
"step": 355
},
{
"epoch": 0.23981138430447962,
"grad_norm": 0.6164606213569641,
"learning_rate": 1.4127622721506087e-05,
"loss": 0.2932,
"step": 356
},
{
"epoch": 0.24048501178848097,
"grad_norm": 0.5564325451850891,
"learning_rate": 1.4119342946077864e-05,
"loss": 0.2735,
"step": 357
},
{
"epoch": 0.24115863927248232,
"grad_norm": 0.6473014950752258,
"learning_rate": 1.4111026514073657e-05,
"loss": 0.2808,
"step": 358
},
{
"epoch": 0.24183226675648367,
"grad_norm": 0.5950415730476379,
"learning_rate": 1.4102673471548186e-05,
"loss": 0.2819,
"step": 359
},
{
"epoch": 0.24250589424048502,
"grad_norm": 0.576295793056488,
"learning_rate": 1.4094283864758896e-05,
"loss": 0.2818,
"step": 360
},
{
"epoch": 0.24317952172448637,
"grad_norm": 0.5290201306343079,
"learning_rate": 1.4085857740165727e-05,
"loss": 0.2731,
"step": 361
},
{
"epoch": 0.24385314920848772,
"grad_norm": 0.5469079613685608,
"learning_rate": 1.4077395144430845e-05,
"loss": 0.2533,
"step": 362
},
{
"epoch": 0.24452677669248907,
"grad_norm": 0.553629457950592,
"learning_rate": 1.4068896124418383e-05,
"loss": 0.2784,
"step": 363
},
{
"epoch": 0.2452004041764904,
"grad_norm": 0.5426369905471802,
"learning_rate": 1.4060360727194188e-05,
"loss": 0.2687,
"step": 364
},
{
"epoch": 0.24587403166049174,
"grad_norm": 0.5466113686561584,
"learning_rate": 1.4051789000025555e-05,
"loss": 0.2721,
"step": 365
},
{
"epoch": 0.2465476591444931,
"grad_norm": 0.5685258507728577,
"learning_rate": 1.4043180990380968e-05,
"loss": 0.283,
"step": 366
},
{
"epoch": 0.24722128662849444,
"grad_norm": 0.5648797154426575,
"learning_rate": 1.4034536745929835e-05,
"loss": 0.2579,
"step": 367
},
{
"epoch": 0.24789491411249578,
"grad_norm": 0.5363840460777283,
"learning_rate": 1.4025856314542223e-05,
"loss": 0.2577,
"step": 368
},
{
"epoch": 0.24856854159649713,
"grad_norm": 0.5171375870704651,
"learning_rate": 1.40171397442886e-05,
"loss": 0.2351,
"step": 369
},
{
"epoch": 0.24924216908049848,
"grad_norm": 0.646500825881958,
"learning_rate": 1.4008387083439554e-05,
"loss": 0.3039,
"step": 370
},
{
"epoch": 0.24991579656449983,
"grad_norm": 0.5827479362487793,
"learning_rate": 1.3999598380465552e-05,
"loss": 0.2913,
"step": 371
},
{
"epoch": 0.25058942404850115,
"grad_norm": 0.5602329969406128,
"learning_rate": 1.3990773684036636e-05,
"loss": 0.2822,
"step": 372
},
{
"epoch": 0.2512630515325025,
"grad_norm": 0.5731973648071289,
"learning_rate": 1.3981913043022187e-05,
"loss": 0.2638,
"step": 373
},
{
"epoch": 0.25193667901650385,
"grad_norm": 0.6127945780754089,
"learning_rate": 1.397301650649063e-05,
"loss": 0.314,
"step": 374
},
{
"epoch": 0.2526103065005052,
"grad_norm": 0.5554071664810181,
"learning_rate": 1.396408412370918e-05,
"loss": 0.2575,
"step": 375
},
{
"epoch": 0.25328393398450655,
"grad_norm": 0.5913053750991821,
"learning_rate": 1.3955115944143558e-05,
"loss": 0.2669,
"step": 376
},
{
"epoch": 0.2539575614685079,
"grad_norm": 0.6104479432106018,
"learning_rate": 1.3946112017457715e-05,
"loss": 0.2575,
"step": 377
},
{
"epoch": 0.25463118895250925,
"grad_norm": 0.6109972596168518,
"learning_rate": 1.393707239351357e-05,
"loss": 0.3141,
"step": 378
},
{
"epoch": 0.2553048164365106,
"grad_norm": 0.605560302734375,
"learning_rate": 1.3927997122370724e-05,
"loss": 0.2869,
"step": 379
},
{
"epoch": 0.25597844392051194,
"grad_norm": 0.5215985774993896,
"learning_rate": 1.3918886254286182e-05,
"loss": 0.2464,
"step": 380
},
{
"epoch": 0.2566520714045133,
"grad_norm": 0.5480206608772278,
"learning_rate": 1.3909739839714081e-05,
"loss": 0.2713,
"step": 381
},
{
"epoch": 0.25732569888851464,
"grad_norm": 0.5150758028030396,
"learning_rate": 1.3900557929305408e-05,
"loss": 0.2537,
"step": 382
},
{
"epoch": 0.257999326372516,
"grad_norm": 0.606860876083374,
"learning_rate": 1.3891340573907715e-05,
"loss": 0.2929,
"step": 383
},
{
"epoch": 0.25867295385651734,
"grad_norm": 0.5383312106132507,
"learning_rate": 1.3882087824564841e-05,
"loss": 0.2778,
"step": 384
},
{
"epoch": 0.2593465813405187,
"grad_norm": 0.5356404185295105,
"learning_rate": 1.3872799732516635e-05,
"loss": 0.2318,
"step": 385
},
{
"epoch": 0.26002020882452004,
"grad_norm": 0.5665723085403442,
"learning_rate": 1.386347634919866e-05,
"loss": 0.2898,
"step": 386
},
{
"epoch": 0.2606938363085214,
"grad_norm": 0.5390300750732422,
"learning_rate": 1.3854117726241922e-05,
"loss": 0.2789,
"step": 387
},
{
"epoch": 0.26136746379252274,
"grad_norm": 0.5479271411895752,
"learning_rate": 1.3844723915472568e-05,
"loss": 0.2552,
"step": 388
},
{
"epoch": 0.2620410912765241,
"grad_norm": 0.6038428544998169,
"learning_rate": 1.3835294968911615e-05,
"loss": 0.3018,
"step": 389
},
{
"epoch": 0.26271471876052543,
"grad_norm": 0.5380761027336121,
"learning_rate": 1.3825830938774653e-05,
"loss": 0.2683,
"step": 390
},
{
"epoch": 0.2633883462445268,
"grad_norm": 0.5072317719459534,
"learning_rate": 1.3816331877471562e-05,
"loss": 0.2728,
"step": 391
},
{
"epoch": 0.26406197372852813,
"grad_norm": 0.5953329205513,
"learning_rate": 1.3806797837606206e-05,
"loss": 0.2644,
"step": 392
},
{
"epoch": 0.2647356012125295,
"grad_norm": 0.5941304564476013,
"learning_rate": 1.3797228871976162e-05,
"loss": 0.2841,
"step": 393
},
{
"epoch": 0.26540922869653083,
"grad_norm": 0.6646502614021301,
"learning_rate": 1.378762503357242e-05,
"loss": 0.2966,
"step": 394
},
{
"epoch": 0.2660828561805322,
"grad_norm": 0.545456051826477,
"learning_rate": 1.377798637557908e-05,
"loss": 0.2481,
"step": 395
},
{
"epoch": 0.26675648366453353,
"grad_norm": 0.5886520147323608,
"learning_rate": 1.3768312951373076e-05,
"loss": 0.2735,
"step": 396
},
{
"epoch": 0.2674301111485349,
"grad_norm": 0.5731514096260071,
"learning_rate": 1.3758604814523863e-05,
"loss": 0.2953,
"step": 397
},
{
"epoch": 0.2681037386325362,
"grad_norm": 0.5029922723770142,
"learning_rate": 1.3748862018793131e-05,
"loss": 0.228,
"step": 398
},
{
"epoch": 0.2687773661165376,
"grad_norm": 0.557115375995636,
"learning_rate": 1.3739084618134502e-05,
"loss": 0.2861,
"step": 399
},
{
"epoch": 0.2694509936005389,
"grad_norm": 0.5246098041534424,
"learning_rate": 1.3729272666693235e-05,
"loss": 0.2705,
"step": 400
},
{
"epoch": 0.2694509936005389,
"eval_loss": 0.2706840932369232,
"eval_runtime": 105.373,
"eval_samples_per_second": 47.451,
"eval_steps_per_second": 2.97,
"step": 400
},
{
"epoch": 0.2701246210845403,
"grad_norm": 0.5355361104011536,
"learning_rate": 1.371942621880592e-05,
"loss": 0.249,
"step": 401
},
{
"epoch": 0.2707982485685416,
"grad_norm": 0.5726237297058105,
"learning_rate": 1.3709545329000187e-05,
"loss": 0.2849,
"step": 402
},
{
"epoch": 0.27147187605254297,
"grad_norm": 0.5560792088508606,
"learning_rate": 1.3699630051994395e-05,
"loss": 0.2397,
"step": 403
},
{
"epoch": 0.27214550353654426,
"grad_norm": 0.509462833404541,
"learning_rate": 1.3689680442697332e-05,
"loss": 0.2412,
"step": 404
},
{
"epoch": 0.2728191310205456,
"grad_norm": 0.5348261594772339,
"learning_rate": 1.3679696556207913e-05,
"loss": 0.2588,
"step": 405
},
{
"epoch": 0.27349275850454696,
"grad_norm": 0.5228528380393982,
"learning_rate": 1.3669678447814871e-05,
"loss": 0.2482,
"step": 406
},
{
"epoch": 0.2741663859885483,
"grad_norm": 0.5533547401428223,
"learning_rate": 1.3659626172996459e-05,
"loss": 0.2581,
"step": 407
},
{
"epoch": 0.27484001347254966,
"grad_norm": 0.538163959980011,
"learning_rate": 1.3649539787420126e-05,
"loss": 0.2444,
"step": 408
},
{
"epoch": 0.275513640956551,
"grad_norm": 0.6091170907020569,
"learning_rate": 1.3639419346942227e-05,
"loss": 0.2963,
"step": 409
},
{
"epoch": 0.27618726844055236,
"grad_norm": 0.5507506728172302,
"learning_rate": 1.3629264907607709e-05,
"loss": 0.2835,
"step": 410
},
{
"epoch": 0.2768608959245537,
"grad_norm": 0.5167334079742432,
"learning_rate": 1.361907652564979e-05,
"loss": 0.2751,
"step": 411
},
{
"epoch": 0.27753452340855506,
"grad_norm": 0.6182762384414673,
"learning_rate": 1.3608854257489656e-05,
"loss": 0.2953,
"step": 412
},
{
"epoch": 0.2782081508925564,
"grad_norm": 0.6356998085975647,
"learning_rate": 1.3598598159736155e-05,
"loss": 0.2586,
"step": 413
},
{
"epoch": 0.27888177837655775,
"grad_norm": 0.5957326889038086,
"learning_rate": 1.358830828918547e-05,
"loss": 0.283,
"step": 414
},
{
"epoch": 0.2795554058605591,
"grad_norm": 0.5173368453979492,
"learning_rate": 1.3577984702820811e-05,
"loss": 0.2403,
"step": 415
},
{
"epoch": 0.28022903334456045,
"grad_norm": 0.5449368357658386,
"learning_rate": 1.3567627457812107e-05,
"loss": 0.2641,
"step": 416
},
{
"epoch": 0.2809026608285618,
"grad_norm": 0.6340479850769043,
"learning_rate": 1.355723661151567e-05,
"loss": 0.3286,
"step": 417
},
{
"epoch": 0.28157628831256315,
"grad_norm": 0.49671491980552673,
"learning_rate": 1.3546812221473898e-05,
"loss": 0.2585,
"step": 418
},
{
"epoch": 0.2822499157965645,
"grad_norm": 0.5974727272987366,
"learning_rate": 1.3536354345414944e-05,
"loss": 0.2674,
"step": 419
},
{
"epoch": 0.28292354328056585,
"grad_norm": 0.5984825491905212,
"learning_rate": 1.35258630412524e-05,
"loss": 0.2548,
"step": 420
},
{
"epoch": 0.2835971707645672,
"grad_norm": 0.5152942538261414,
"learning_rate": 1.3515338367084975e-05,
"loss": 0.2323,
"step": 421
},
{
"epoch": 0.28427079824856855,
"grad_norm": 0.5210486054420471,
"learning_rate": 1.3504780381196178e-05,
"loss": 0.2538,
"step": 422
},
{
"epoch": 0.2849444257325699,
"grad_norm": 0.6852086782455444,
"learning_rate": 1.3494189142053988e-05,
"loss": 0.3409,
"step": 423
},
{
"epoch": 0.28561805321657124,
"grad_norm": 0.5637288689613342,
"learning_rate": 1.3483564708310535e-05,
"loss": 0.2435,
"step": 424
},
{
"epoch": 0.2862916807005726,
"grad_norm": 0.565467357635498,
"learning_rate": 1.3472907138801775e-05,
"loss": 0.2699,
"step": 425
},
{
"epoch": 0.28696530818457394,
"grad_norm": 0.6443371176719666,
"learning_rate": 1.346221649254716e-05,
"loss": 0.3226,
"step": 426
},
{
"epoch": 0.2876389356685753,
"grad_norm": 0.5877301096916199,
"learning_rate": 1.3451492828749317e-05,
"loss": 0.2626,
"step": 427
},
{
"epoch": 0.28831256315257664,
"grad_norm": 0.635368824005127,
"learning_rate": 1.3440736206793717e-05,
"loss": 0.2808,
"step": 428
},
{
"epoch": 0.288986190636578,
"grad_norm": 0.5623096823692322,
"learning_rate": 1.3429946686248346e-05,
"loss": 0.2583,
"step": 429
},
{
"epoch": 0.28965981812057934,
"grad_norm": 0.5355499386787415,
"learning_rate": 1.341912432686338e-05,
"loss": 0.2425,
"step": 430
},
{
"epoch": 0.2903334456045807,
"grad_norm": 0.5870991349220276,
"learning_rate": 1.3408269188570837e-05,
"loss": 0.2638,
"step": 431
},
{
"epoch": 0.29100707308858204,
"grad_norm": 0.5296127796173096,
"learning_rate": 1.3397381331484273e-05,
"loss": 0.2587,
"step": 432
},
{
"epoch": 0.2916807005725834,
"grad_norm": 0.5635933876037598,
"learning_rate": 1.3386460815898427e-05,
"loss": 0.2966,
"step": 433
},
{
"epoch": 0.29235432805658473,
"grad_norm": 0.5246622562408447,
"learning_rate": 1.3375507702288894e-05,
"loss": 0.2513,
"step": 434
},
{
"epoch": 0.2930279555405861,
"grad_norm": 0.6050205826759338,
"learning_rate": 1.3364522051311793e-05,
"loss": 0.3016,
"step": 435
},
{
"epoch": 0.2937015830245874,
"grad_norm": 0.5831138491630554,
"learning_rate": 1.3353503923803424e-05,
"loss": 0.312,
"step": 436
},
{
"epoch": 0.2943752105085887,
"grad_norm": 0.5354754328727722,
"learning_rate": 1.3342453380779939e-05,
"loss": 0.2743,
"step": 437
},
{
"epoch": 0.2950488379925901,
"grad_norm": 0.6059128642082214,
"learning_rate": 1.3331370483437e-05,
"loss": 0.2836,
"step": 438
},
{
"epoch": 0.2957224654765914,
"grad_norm": 0.6208754181861877,
"learning_rate": 1.332025529314944e-05,
"loss": 0.3069,
"step": 439
},
{
"epoch": 0.29639609296059277,
"grad_norm": 0.5791683197021484,
"learning_rate": 1.3309107871470922e-05,
"loss": 0.2904,
"step": 440
},
{
"epoch": 0.2970697204445941,
"grad_norm": 0.5765690803527832,
"learning_rate": 1.3297928280133606e-05,
"loss": 0.3015,
"step": 441
},
{
"epoch": 0.29774334792859547,
"grad_norm": 0.5978572368621826,
"learning_rate": 1.3286716581047791e-05,
"loss": 0.2827,
"step": 442
},
{
"epoch": 0.2984169754125968,
"grad_norm": 0.5690959692001343,
"learning_rate": 1.3275472836301592e-05,
"loss": 0.2819,
"step": 443
},
{
"epoch": 0.29909060289659817,
"grad_norm": 0.5888264775276184,
"learning_rate": 1.3264197108160582e-05,
"loss": 0.297,
"step": 444
},
{
"epoch": 0.2997642303805995,
"grad_norm": 0.566338837146759,
"learning_rate": 1.3252889459067452e-05,
"loss": 0.2703,
"step": 445
},
{
"epoch": 0.30043785786460087,
"grad_norm": 0.5249893665313721,
"learning_rate": 1.3241549951641663e-05,
"loss": 0.252,
"step": 446
},
{
"epoch": 0.3011114853486022,
"grad_norm": 0.6007825136184692,
"learning_rate": 1.3230178648679102e-05,
"loss": 0.2696,
"step": 447
},
{
"epoch": 0.30178511283260356,
"grad_norm": 0.5482873916625977,
"learning_rate": 1.3218775613151737e-05,
"loss": 0.2523,
"step": 448
},
{
"epoch": 0.3024587403166049,
"grad_norm": 0.6056886315345764,
"learning_rate": 1.3207340908207258e-05,
"loss": 0.2616,
"step": 449
},
{
"epoch": 0.30313236780060626,
"grad_norm": 0.5885447859764099,
"learning_rate": 1.319587459716874e-05,
"loss": 0.2976,
"step": 450
},
{
"epoch": 0.3038059952846076,
"grad_norm": 0.5747894644737244,
"learning_rate": 1.318437674353428e-05,
"loss": 0.2898,
"step": 451
},
{
"epoch": 0.30447962276860896,
"grad_norm": 0.569401741027832,
"learning_rate": 1.3172847410976658e-05,
"loss": 0.3104,
"step": 452
},
{
"epoch": 0.3051532502526103,
"grad_norm": 0.5612210631370544,
"learning_rate": 1.3161286663342972e-05,
"loss": 0.2825,
"step": 453
},
{
"epoch": 0.30582687773661166,
"grad_norm": 0.5914261937141418,
"learning_rate": 1.3149694564654295e-05,
"loss": 0.2781,
"step": 454
},
{
"epoch": 0.306500505220613,
"grad_norm": 0.5259233713150024,
"learning_rate": 1.3138071179105314e-05,
"loss": 0.2542,
"step": 455
},
{
"epoch": 0.30717413270461436,
"grad_norm": 0.5168178081512451,
"learning_rate": 1.3126416571063972e-05,
"loss": 0.2514,
"step": 456
},
{
"epoch": 0.3078477601886157,
"grad_norm": 0.5078200101852417,
"learning_rate": 1.3114730805071123e-05,
"loss": 0.2422,
"step": 457
},
{
"epoch": 0.30852138767261705,
"grad_norm": 0.5727274417877197,
"learning_rate": 1.3103013945840166e-05,
"loss": 0.2809,
"step": 458
},
{
"epoch": 0.3091950151566184,
"grad_norm": 0.5502845048904419,
"learning_rate": 1.309126605825668e-05,
"loss": 0.2552,
"step": 459
},
{
"epoch": 0.30986864264061975,
"grad_norm": 0.5696067214012146,
"learning_rate": 1.3079487207378084e-05,
"loss": 0.2959,
"step": 460
},
{
"epoch": 0.3105422701246211,
"grad_norm": 0.5644879341125488,
"learning_rate": 1.3067677458433258e-05,
"loss": 0.2713,
"step": 461
},
{
"epoch": 0.31121589760862245,
"grad_norm": 0.5638664364814758,
"learning_rate": 1.3055836876822196e-05,
"loss": 0.2687,
"step": 462
},
{
"epoch": 0.3118895250926238,
"grad_norm": 0.5337838530540466,
"learning_rate": 1.3043965528115625e-05,
"loss": 0.2238,
"step": 463
},
{
"epoch": 0.31256315257662515,
"grad_norm": 0.5844706892967224,
"learning_rate": 1.3032063478054666e-05,
"loss": 0.268,
"step": 464
},
{
"epoch": 0.3132367800606265,
"grad_norm": 0.6730402112007141,
"learning_rate": 1.3020130792550456e-05,
"loss": 0.2976,
"step": 465
},
{
"epoch": 0.31391040754462785,
"grad_norm": 0.5756520628929138,
"learning_rate": 1.3008167537683776e-05,
"loss": 0.2859,
"step": 466
},
{
"epoch": 0.3145840350286292,
"grad_norm": 0.5855886340141296,
"learning_rate": 1.2996173779704704e-05,
"loss": 0.2997,
"step": 467
},
{
"epoch": 0.3152576625126305,
"grad_norm": 0.5359857082366943,
"learning_rate": 1.2984149585032237e-05,
"loss": 0.2814,
"step": 468
},
{
"epoch": 0.31593128999663184,
"grad_norm": 0.5448024868965149,
"learning_rate": 1.2972095020253912e-05,
"loss": 0.2681,
"step": 469
},
{
"epoch": 0.3166049174806332,
"grad_norm": 0.518844723701477,
"learning_rate": 1.296001015212547e-05,
"loss": 0.2538,
"step": 470
},
{
"epoch": 0.31727854496463453,
"grad_norm": 0.5422329306602478,
"learning_rate": 1.2947895047570446e-05,
"loss": 0.2346,
"step": 471
},
{
"epoch": 0.3179521724486359,
"grad_norm": 0.5567420721054077,
"learning_rate": 1.2935749773679833e-05,
"loss": 0.259,
"step": 472
},
{
"epoch": 0.31862579993263723,
"grad_norm": 0.5199055671691895,
"learning_rate": 1.2923574397711684e-05,
"loss": 0.2273,
"step": 473
},
{
"epoch": 0.3192994274166386,
"grad_norm": 0.552947461605072,
"learning_rate": 1.291136898709076e-05,
"loss": 0.2541,
"step": 474
},
{
"epoch": 0.31997305490063993,
"grad_norm": 0.537124752998352,
"learning_rate": 1.2899133609408146e-05,
"loss": 0.2709,
"step": 475
},
{
"epoch": 0.3206466823846413,
"grad_norm": 0.5493146777153015,
"learning_rate": 1.2886868332420873e-05,
"loss": 0.2838,
"step": 476
},
{
"epoch": 0.32132030986864263,
"grad_norm": 0.6109126806259155,
"learning_rate": 1.2874573224051556e-05,
"loss": 0.3088,
"step": 477
},
{
"epoch": 0.321993937352644,
"grad_norm": 0.5879717469215393,
"learning_rate": 1.2862248352388005e-05,
"loss": 0.282,
"step": 478
},
{
"epoch": 0.3226675648366453,
"grad_norm": 0.5227838754653931,
"learning_rate": 1.2849893785682852e-05,
"loss": 0.2646,
"step": 479
},
{
"epoch": 0.3233411923206467,
"grad_norm": 0.4744933545589447,
"learning_rate": 1.2837509592353181e-05,
"loss": 0.2219,
"step": 480
},
{
"epoch": 0.324014819804648,
"grad_norm": 0.508222758769989,
"learning_rate": 1.2825095840980133e-05,
"loss": 0.2698,
"step": 481
},
{
"epoch": 0.3246884472886494,
"grad_norm": 0.5351443290710449,
"learning_rate": 1.2812652600308544e-05,
"loss": 0.2617,
"step": 482
},
{
"epoch": 0.3253620747726507,
"grad_norm": 0.5842475295066833,
"learning_rate": 1.2800179939246552e-05,
"loss": 0.2496,
"step": 483
},
{
"epoch": 0.32603570225665207,
"grad_norm": 0.5165258646011353,
"learning_rate": 1.2787677926865216e-05,
"loss": 0.2399,
"step": 484
},
{
"epoch": 0.3267093297406534,
"grad_norm": 0.5721768736839294,
"learning_rate": 1.2775146632398142e-05,
"loss": 0.2754,
"step": 485
},
{
"epoch": 0.32738295722465477,
"grad_norm": 0.47171083092689514,
"learning_rate": 1.2762586125241093e-05,
"loss": 0.2107,
"step": 486
},
{
"epoch": 0.3280565847086561,
"grad_norm": 0.5318099856376648,
"learning_rate": 1.2749996474951603e-05,
"loss": 0.2422,
"step": 487
},
{
"epoch": 0.32873021219265747,
"grad_norm": 0.5478540062904358,
"learning_rate": 1.2737377751248598e-05,
"loss": 0.2634,
"step": 488
},
{
"epoch": 0.3294038396766588,
"grad_norm": 0.4972551167011261,
"learning_rate": 1.2724730024012002e-05,
"loss": 0.232,
"step": 489
},
{
"epoch": 0.33007746716066017,
"grad_norm": 0.6141415238380432,
"learning_rate": 1.2712053363282363e-05,
"loss": 0.2998,
"step": 490
},
{
"epoch": 0.3307510946446615,
"grad_norm": 0.5177733302116394,
"learning_rate": 1.2699347839260448e-05,
"loss": 0.2574,
"step": 491
},
{
"epoch": 0.33142472212866286,
"grad_norm": 0.5531916618347168,
"learning_rate": 1.268661352230687e-05,
"loss": 0.2719,
"step": 492
},
{
"epoch": 0.3320983496126642,
"grad_norm": 0.5089963674545288,
"learning_rate": 1.2673850482941687e-05,
"loss": 0.2508,
"step": 493
},
{
"epoch": 0.33277197709666556,
"grad_norm": 0.557072103023529,
"learning_rate": 1.2661058791844016e-05,
"loss": 0.2823,
"step": 494
},
{
"epoch": 0.3334456045806669,
"grad_norm": 0.6557756662368774,
"learning_rate": 1.2648238519851644e-05,
"loss": 0.2821,
"step": 495
},
{
"epoch": 0.33411923206466826,
"grad_norm": 0.5633836984634399,
"learning_rate": 1.2635389737960632e-05,
"loss": 0.2576,
"step": 496
},
{
"epoch": 0.3347928595486696,
"grad_norm": 0.594456136226654,
"learning_rate": 1.262251251732492e-05,
"loss": 0.2985,
"step": 497
},
{
"epoch": 0.33546648703267096,
"grad_norm": 0.5753186345100403,
"learning_rate": 1.2609606929255942e-05,
"loss": 0.2775,
"step": 498
},
{
"epoch": 0.3361401145166723,
"grad_norm": 0.6262162327766418,
"learning_rate": 1.259667304522222e-05,
"loss": 0.3254,
"step": 499
},
{
"epoch": 0.33681374200067365,
"grad_norm": 0.5529574155807495,
"learning_rate": 1.2583710936848977e-05,
"loss": 0.2711,
"step": 500
},
{
"epoch": 0.33681374200067365,
"eval_loss": 0.2681807279586792,
"eval_runtime": 104.7062,
"eval_samples_per_second": 47.753,
"eval_steps_per_second": 2.989,
"step": 500
},
{
"epoch": 0.33748736948467495,
"grad_norm": 0.6187270283699036,
"learning_rate": 1.2570720675917734e-05,
"loss": 0.3082,
"step": 501
},
{
"epoch": 0.3381609969686763,
"grad_norm": 0.5153407454490662,
"learning_rate": 1.2557702334365916e-05,
"loss": 0.26,
"step": 502
},
{
"epoch": 0.33883462445267765,
"grad_norm": 0.5447744727134705,
"learning_rate": 1.2544655984286451e-05,
"loss": 0.2641,
"step": 503
},
{
"epoch": 0.339508251936679,
"grad_norm": 0.5450101494789124,
"learning_rate": 1.253158169792738e-05,
"loss": 0.276,
"step": 504
},
{
"epoch": 0.34018187942068034,
"grad_norm": 0.6855320930480957,
"learning_rate": 1.2518479547691437e-05,
"loss": 0.3589,
"step": 505
},
{
"epoch": 0.3408555069046817,
"grad_norm": 0.52507483959198,
"learning_rate": 1.250534960613567e-05,
"loss": 0.2489,
"step": 506
},
{
"epoch": 0.34152913438868304,
"grad_norm": 0.5259436964988708,
"learning_rate": 1.2492191945971028e-05,
"loss": 0.2568,
"step": 507
},
{
"epoch": 0.3422027618726844,
"grad_norm": 0.5746189951896667,
"learning_rate": 1.2479006640061958e-05,
"loss": 0.2878,
"step": 508
},
{
"epoch": 0.34287638935668574,
"grad_norm": 0.5484218001365662,
"learning_rate": 1.2465793761426005e-05,
"loss": 0.3059,
"step": 509
},
{
"epoch": 0.3435500168406871,
"grad_norm": 0.5747763514518738,
"learning_rate": 1.24525533832334e-05,
"loss": 0.2505,
"step": 510
},
{
"epoch": 0.34422364432468844,
"grad_norm": 0.5692996382713318,
"learning_rate": 1.2439285578806678e-05,
"loss": 0.3077,
"step": 511
},
{
"epoch": 0.3448972718086898,
"grad_norm": 0.5282084345817566,
"learning_rate": 1.2425990421620235e-05,
"loss": 0.2763,
"step": 512
},
{
"epoch": 0.34557089929269114,
"grad_norm": 0.4825171232223511,
"learning_rate": 1.241266798529995e-05,
"loss": 0.2423,
"step": 513
},
{
"epoch": 0.3462445267766925,
"grad_norm": 0.5359032154083252,
"learning_rate": 1.239931834362277e-05,
"loss": 0.2796,
"step": 514
},
{
"epoch": 0.34691815426069383,
"grad_norm": 0.473827600479126,
"learning_rate": 1.2385941570516297e-05,
"loss": 0.2531,
"step": 515
},
{
"epoch": 0.3475917817446952,
"grad_norm": 0.4639384150505066,
"learning_rate": 1.2372537740058382e-05,
"loss": 0.2326,
"step": 516
},
{
"epoch": 0.34826540922869653,
"grad_norm": 0.5909863710403442,
"learning_rate": 1.2359106926476714e-05,
"loss": 0.2824,
"step": 517
},
{
"epoch": 0.3489390367126979,
"grad_norm": 0.5261175036430359,
"learning_rate": 1.234564920414841e-05,
"loss": 0.2757,
"step": 518
},
{
"epoch": 0.34961266419669923,
"grad_norm": 0.577748715877533,
"learning_rate": 1.2332164647599599e-05,
"loss": 0.2619,
"step": 519
},
{
"epoch": 0.3502862916807006,
"grad_norm": 0.5614107251167297,
"learning_rate": 1.2318653331505015e-05,
"loss": 0.2928,
"step": 520
},
{
"epoch": 0.35095991916470193,
"grad_norm": 0.5660324692726135,
"learning_rate": 1.2305115330687585e-05,
"loss": 0.2797,
"step": 521
},
{
"epoch": 0.3516335466487033,
"grad_norm": 0.5362821817398071,
"learning_rate": 1.2291550720117997e-05,
"loss": 0.2931,
"step": 522
},
{
"epoch": 0.3523071741327046,
"grad_norm": 0.5424318909645081,
"learning_rate": 1.2277959574914317e-05,
"loss": 0.2709,
"step": 523
},
{
"epoch": 0.352980801616706,
"grad_norm": 0.5283873081207275,
"learning_rate": 1.226434197034154e-05,
"loss": 0.2478,
"step": 524
},
{
"epoch": 0.3536544291007073,
"grad_norm": 0.5451403260231018,
"learning_rate": 1.2250697981811195e-05,
"loss": 0.2684,
"step": 525
},
{
"epoch": 0.3543280565847087,
"grad_norm": 0.5320309400558472,
"learning_rate": 1.2237027684880914e-05,
"loss": 0.2678,
"step": 526
},
{
"epoch": 0.35500168406871,
"grad_norm": 0.558335542678833,
"learning_rate": 1.2223331155254026e-05,
"loss": 0.2715,
"step": 527
},
{
"epoch": 0.35567531155271137,
"grad_norm": 0.5011473298072815,
"learning_rate": 1.220960846877913e-05,
"loss": 0.2535,
"step": 528
},
{
"epoch": 0.3563489390367127,
"grad_norm": 0.5432257056236267,
"learning_rate": 1.2195859701449672e-05,
"loss": 0.2802,
"step": 529
},
{
"epoch": 0.35702256652071407,
"grad_norm": 0.5836246013641357,
"learning_rate": 1.2182084929403531e-05,
"loss": 0.3088,
"step": 530
},
{
"epoch": 0.3576961940047154,
"grad_norm": 0.5858445167541504,
"learning_rate": 1.2168284228922597e-05,
"loss": 0.2751,
"step": 531
},
{
"epoch": 0.35836982148871677,
"grad_norm": 0.556725800037384,
"learning_rate": 1.2154457676432344e-05,
"loss": 0.2693,
"step": 532
},
{
"epoch": 0.35904344897271806,
"grad_norm": 0.5822067260742188,
"learning_rate": 1.2140605348501409e-05,
"loss": 0.3145,
"step": 533
},
{
"epoch": 0.3597170764567194,
"grad_norm": 0.5754439830780029,
"learning_rate": 1.212672732184117e-05,
"loss": 0.3009,
"step": 534
},
{
"epoch": 0.36039070394072076,
"grad_norm": 0.5826534032821655,
"learning_rate": 1.2112823673305317e-05,
"loss": 0.3112,
"step": 535
},
{
"epoch": 0.3610643314247221,
"grad_norm": 0.5259435176849365,
"learning_rate": 1.209889447988943e-05,
"loss": 0.2572,
"step": 536
},
{
"epoch": 0.36173795890872346,
"grad_norm": 0.5303089022636414,
"learning_rate": 1.2084939818730554e-05,
"loss": 0.2745,
"step": 537
},
{
"epoch": 0.3624115863927248,
"grad_norm": 0.4945959150791168,
"learning_rate": 1.2070959767106762e-05,
"loss": 0.2624,
"step": 538
},
{
"epoch": 0.36308521387672615,
"grad_norm": 0.5212944149971008,
"learning_rate": 1.2056954402436743e-05,
"loss": 0.2367,
"step": 539
},
{
"epoch": 0.3637588413607275,
"grad_norm": 0.5474100708961487,
"learning_rate": 1.2042923802279356e-05,
"loss": 0.2922,
"step": 540
},
{
"epoch": 0.36443246884472885,
"grad_norm": 0.5586138963699341,
"learning_rate": 1.2028868044333218e-05,
"loss": 0.2779,
"step": 541
},
{
"epoch": 0.3651060963287302,
"grad_norm": 0.4587612450122833,
"learning_rate": 1.2014787206436256e-05,
"loss": 0.2291,
"step": 542
},
{
"epoch": 0.36577972381273155,
"grad_norm": 0.5979660749435425,
"learning_rate": 1.200068136656529e-05,
"loss": 0.2663,
"step": 543
},
{
"epoch": 0.3664533512967329,
"grad_norm": 0.5004269480705261,
"learning_rate": 1.1986550602835595e-05,
"loss": 0.2325,
"step": 544
},
{
"epoch": 0.36712697878073425,
"grad_norm": 0.5056456327438354,
"learning_rate": 1.1972394993500466e-05,
"loss": 0.2691,
"step": 545
},
{
"epoch": 0.3678006062647356,
"grad_norm": 0.5447576642036438,
"learning_rate": 1.1958214616950794e-05,
"loss": 0.272,
"step": 546
},
{
"epoch": 0.36847423374873695,
"grad_norm": 0.5720804929733276,
"learning_rate": 1.1944009551714623e-05,
"loss": 0.2651,
"step": 547
},
{
"epoch": 0.3691478612327383,
"grad_norm": 0.5342965722084045,
"learning_rate": 1.1929779876456713e-05,
"loss": 0.2681,
"step": 548
},
{
"epoch": 0.36982148871673964,
"grad_norm": 0.5355931520462036,
"learning_rate": 1.191552566997812e-05,
"loss": 0.2504,
"step": 549
},
{
"epoch": 0.370495116200741,
"grad_norm": 0.6217589378356934,
"learning_rate": 1.1901247011215733e-05,
"loss": 0.2704,
"step": 550
},
{
"epoch": 0.37116874368474234,
"grad_norm": 0.6108464002609253,
"learning_rate": 1.1886943979241874e-05,
"loss": 0.2995,
"step": 551
},
{
"epoch": 0.3718423711687437,
"grad_norm": 0.5349010229110718,
"learning_rate": 1.187261665326382e-05,
"loss": 0.2873,
"step": 552
},
{
"epoch": 0.37251599865274504,
"grad_norm": 0.5306320786476135,
"learning_rate": 1.1858265112623388e-05,
"loss": 0.2546,
"step": 553
},
{
"epoch": 0.3731896261367464,
"grad_norm": 0.5984854102134705,
"learning_rate": 1.18438894367965e-05,
"loss": 0.3019,
"step": 554
},
{
"epoch": 0.37386325362074774,
"grad_norm": 0.5498750805854797,
"learning_rate": 1.1829489705392727e-05,
"loss": 0.2702,
"step": 555
},
{
"epoch": 0.3745368811047491,
"grad_norm": 0.5973288416862488,
"learning_rate": 1.1815065998154849e-05,
"loss": 0.2947,
"step": 556
},
{
"epoch": 0.37521050858875044,
"grad_norm": 0.5865532755851746,
"learning_rate": 1.180061839495843e-05,
"loss": 0.3207,
"step": 557
},
{
"epoch": 0.3758841360727518,
"grad_norm": 0.5075846314430237,
"learning_rate": 1.1786146975811359e-05,
"loss": 0.2474,
"step": 558
},
{
"epoch": 0.37655776355675313,
"grad_norm": 0.5501227378845215,
"learning_rate": 1.1771651820853417e-05,
"loss": 0.274,
"step": 559
},
{
"epoch": 0.3772313910407545,
"grad_norm": 0.5292581915855408,
"learning_rate": 1.1757133010355821e-05,
"loss": 0.2546,
"step": 560
},
{
"epoch": 0.37790501852475583,
"grad_norm": 0.5926501750946045,
"learning_rate": 1.1742590624720796e-05,
"loss": 0.2847,
"step": 561
},
{
"epoch": 0.3785786460087572,
"grad_norm": 0.5264430046081543,
"learning_rate": 1.1728024744481117e-05,
"loss": 0.2634,
"step": 562
},
{
"epoch": 0.37925227349275853,
"grad_norm": 0.5014563798904419,
"learning_rate": 1.171343545029967e-05,
"loss": 0.2301,
"step": 563
},
{
"epoch": 0.3799259009767599,
"grad_norm": 0.48584073781967163,
"learning_rate": 1.1698822822969001e-05,
"loss": 0.2482,
"step": 564
},
{
"epoch": 0.38059952846076117,
"grad_norm": 0.5884197354316711,
"learning_rate": 1.1684186943410867e-05,
"loss": 0.286,
"step": 565
},
{
"epoch": 0.3812731559447625,
"grad_norm": 0.556430459022522,
"learning_rate": 1.16695278926758e-05,
"loss": 0.2496,
"step": 566
},
{
"epoch": 0.38194678342876387,
"grad_norm": 0.5392268300056458,
"learning_rate": 1.165484575194264e-05,
"loss": 0.2786,
"step": 567
},
{
"epoch": 0.3826204109127652,
"grad_norm": 0.5491148233413696,
"learning_rate": 1.1640140602518102e-05,
"loss": 0.2289,
"step": 568
},
{
"epoch": 0.38329403839676657,
"grad_norm": 0.5565954446792603,
"learning_rate": 1.162541252583631e-05,
"loss": 0.2614,
"step": 569
},
{
"epoch": 0.3839676658807679,
"grad_norm": 0.5307971239089966,
"learning_rate": 1.1610661603458363e-05,
"loss": 0.2577,
"step": 570
},
{
"epoch": 0.38464129336476927,
"grad_norm": 0.5446802377700806,
"learning_rate": 1.159588791707187e-05,
"loss": 0.292,
"step": 571
},
{
"epoch": 0.3853149208487706,
"grad_norm": 0.5837084054946899,
"learning_rate": 1.1581091548490505e-05,
"loss": 0.2771,
"step": 572
},
{
"epoch": 0.38598854833277196,
"grad_norm": 0.5611515045166016,
"learning_rate": 1.156627257965355e-05,
"loss": 0.2602,
"step": 573
},
{
"epoch": 0.3866621758167733,
"grad_norm": 0.5338358879089355,
"learning_rate": 1.155143109262544e-05,
"loss": 0.2573,
"step": 574
},
{
"epoch": 0.38733580330077466,
"grad_norm": 0.4791894853115082,
"learning_rate": 1.1536567169595316e-05,
"loss": 0.2411,
"step": 575
},
{
"epoch": 0.388009430784776,
"grad_norm": 0.5701311826705933,
"learning_rate": 1.1521680892876563e-05,
"loss": 0.2973,
"step": 576
},
{
"epoch": 0.38868305826877736,
"grad_norm": 0.4976153075695038,
"learning_rate": 1.1506772344906356e-05,
"loss": 0.2716,
"step": 577
},
{
"epoch": 0.3893566857527787,
"grad_norm": 0.5492983460426331,
"learning_rate": 1.1491841608245204e-05,
"loss": 0.2621,
"step": 578
},
{
"epoch": 0.39003031323678006,
"grad_norm": 0.5490813255310059,
"learning_rate": 1.1476888765576493e-05,
"loss": 0.2687,
"step": 579
},
{
"epoch": 0.3907039407207814,
"grad_norm": 0.5402075052261353,
"learning_rate": 1.1461913899706025e-05,
"loss": 0.3112,
"step": 580
},
{
"epoch": 0.39137756820478276,
"grad_norm": 0.5017600059509277,
"learning_rate": 1.1446917093561564e-05,
"loss": 0.2242,
"step": 581
},
{
"epoch": 0.3920511956887841,
"grad_norm": 0.5590758919715881,
"learning_rate": 1.1431898430192375e-05,
"loss": 0.2569,
"step": 582
},
{
"epoch": 0.39272482317278545,
"grad_norm": 0.5497624278068542,
"learning_rate": 1.1416857992768764e-05,
"loss": 0.3114,
"step": 583
},
{
"epoch": 0.3933984506567868,
"grad_norm": 0.5833696126937866,
"learning_rate": 1.1401795864581616e-05,
"loss": 0.2999,
"step": 584
},
{
"epoch": 0.39407207814078815,
"grad_norm": 0.5114924907684326,
"learning_rate": 1.1386712129041937e-05,
"loss": 0.2428,
"step": 585
},
{
"epoch": 0.3947457056247895,
"grad_norm": 0.5477609038352966,
"learning_rate": 1.1371606869680388e-05,
"loss": 0.2722,
"step": 586
},
{
"epoch": 0.39541933310879085,
"grad_norm": 0.5121515393257141,
"learning_rate": 1.1356480170146826e-05,
"loss": 0.2376,
"step": 587
},
{
"epoch": 0.3960929605927922,
"grad_norm": 0.502560019493103,
"learning_rate": 1.1341332114209838e-05,
"loss": 0.2737,
"step": 588
},
{
"epoch": 0.39676658807679355,
"grad_norm": 0.5239719748497009,
"learning_rate": 1.1326162785756281e-05,
"loss": 0.2563,
"step": 589
},
{
"epoch": 0.3974402155607949,
"grad_norm": 0.5645294189453125,
"learning_rate": 1.131097226879081e-05,
"loss": 0.308,
"step": 590
},
{
"epoch": 0.39811384304479625,
"grad_norm": 0.5425258278846741,
"learning_rate": 1.1295760647435424e-05,
"loss": 0.2388,
"step": 591
},
{
"epoch": 0.3987874705287976,
"grad_norm": 0.5374796390533447,
"learning_rate": 1.1280528005928988e-05,
"loss": 0.2774,
"step": 592
},
{
"epoch": 0.39946109801279894,
"grad_norm": 0.5628758072853088,
"learning_rate": 1.1265274428626775e-05,
"loss": 0.2689,
"step": 593
},
{
"epoch": 0.4001347254968003,
"grad_norm": 0.5226148366928101,
"learning_rate": 1.125e-05,
"loss": 0.2713,
"step": 594
},
{
"epoch": 0.40080835298080164,
"grad_norm": 0.5630069971084595,
"learning_rate": 1.1234704804635342e-05,
"loss": 0.3279,
"step": 595
},
{
"epoch": 0.401481980464803,
"grad_norm": 0.508704423904419,
"learning_rate": 1.1219388927234482e-05,
"loss": 0.2623,
"step": 596
},
{
"epoch": 0.40215560794880434,
"grad_norm": 0.5345742702484131,
"learning_rate": 1.1204052452613638e-05,
"loss": 0.2865,
"step": 597
},
{
"epoch": 0.40282923543280563,
"grad_norm": 0.5258358120918274,
"learning_rate": 1.1188695465703092e-05,
"loss": 0.2721,
"step": 598
},
{
"epoch": 0.403502862916807,
"grad_norm": 0.5306556820869446,
"learning_rate": 1.1173318051546713e-05,
"loss": 0.2753,
"step": 599
},
{
"epoch": 0.40417649040080833,
"grad_norm": 0.49859175086021423,
"learning_rate": 1.1157920295301498e-05,
"loss": 0.2594,
"step": 600
},
{
"epoch": 0.40417649040080833,
"eval_loss": 0.2652011811733246,
"eval_runtime": 105.8884,
"eval_samples_per_second": 47.22,
"eval_steps_per_second": 2.956,
"step": 600
},
{
"epoch": 0.4048501178848097,
"grad_norm": 0.558407723903656,
"learning_rate": 1.114250228223709e-05,
"loss": 0.256,
"step": 601
},
{
"epoch": 0.40552374536881103,
"grad_norm": 0.508040726184845,
"learning_rate": 1.1127064097735315e-05,
"loss": 0.2575,
"step": 602
},
{
"epoch": 0.4061973728528124,
"grad_norm": 0.5474634766578674,
"learning_rate": 1.1111605827289698e-05,
"loss": 0.2805,
"step": 603
},
{
"epoch": 0.4068710003368137,
"grad_norm": 0.519263505935669,
"learning_rate": 1.1096127556505e-05,
"loss": 0.2534,
"step": 604
},
{
"epoch": 0.4075446278208151,
"grad_norm": 0.5802994966506958,
"learning_rate": 1.1080629371096738e-05,
"loss": 0.2756,
"step": 605
},
{
"epoch": 0.4082182553048164,
"grad_norm": 0.5730322599411011,
"learning_rate": 1.1065111356890712e-05,
"loss": 0.2888,
"step": 606
},
{
"epoch": 0.4088918827888178,
"grad_norm": 0.5447918176651001,
"learning_rate": 1.1049573599822537e-05,
"loss": 0.2848,
"step": 607
},
{
"epoch": 0.4095655102728191,
"grad_norm": 0.5072281360626221,
"learning_rate": 1.1034016185937149e-05,
"loss": 0.2972,
"step": 608
},
{
"epoch": 0.41023913775682047,
"grad_norm": 0.6098499298095703,
"learning_rate": 1.1018439201388346e-05,
"loss": 0.299,
"step": 609
},
{
"epoch": 0.4109127652408218,
"grad_norm": 0.594445526599884,
"learning_rate": 1.1002842732438301e-05,
"loss": 0.2778,
"step": 610
},
{
"epoch": 0.41158639272482317,
"grad_norm": 0.5406931638717651,
"learning_rate": 1.0987226865457091e-05,
"loss": 0.2948,
"step": 611
},
{
"epoch": 0.4122600202088245,
"grad_norm": 0.5487210750579834,
"learning_rate": 1.0971591686922211e-05,
"loss": 0.256,
"step": 612
},
{
"epoch": 0.41293364769282587,
"grad_norm": 0.5063245296478271,
"learning_rate": 1.0955937283418104e-05,
"loss": 0.2481,
"step": 613
},
{
"epoch": 0.4136072751768272,
"grad_norm": 0.5232447981834412,
"learning_rate": 1.0940263741635678e-05,
"loss": 0.2436,
"step": 614
},
{
"epoch": 0.41428090266082856,
"grad_norm": 0.5449836254119873,
"learning_rate": 1.092457114837182e-05,
"loss": 0.2621,
"step": 615
},
{
"epoch": 0.4149545301448299,
"grad_norm": 0.5582854151725769,
"learning_rate": 1.090885959052892e-05,
"loss": 0.2885,
"step": 616
},
{
"epoch": 0.41562815762883126,
"grad_norm": 0.5433541536331177,
"learning_rate": 1.0893129155114396e-05,
"loss": 0.2659,
"step": 617
},
{
"epoch": 0.4163017851128326,
"grad_norm": 0.5937801599502563,
"learning_rate": 1.0877379929240198e-05,
"loss": 0.2968,
"step": 618
},
{
"epoch": 0.41697541259683396,
"grad_norm": 0.4904331564903259,
"learning_rate": 1.0861612000122341e-05,
"loss": 0.2508,
"step": 619
},
{
"epoch": 0.4176490400808353,
"grad_norm": 0.5370484590530396,
"learning_rate": 1.0845825455080411e-05,
"loss": 0.2564,
"step": 620
},
{
"epoch": 0.41832266756483666,
"grad_norm": 0.535376250743866,
"learning_rate": 1.0830020381537088e-05,
"loss": 0.2796,
"step": 621
},
{
"epoch": 0.418996295048838,
"grad_norm": 0.5508119463920593,
"learning_rate": 1.0814196867017656e-05,
"loss": 0.281,
"step": 622
},
{
"epoch": 0.41966992253283936,
"grad_norm": 0.525283694267273,
"learning_rate": 1.079835499914952e-05,
"loss": 0.2306,
"step": 623
},
{
"epoch": 0.4203435500168407,
"grad_norm": 0.5157189965248108,
"learning_rate": 1.078249486566173e-05,
"loss": 0.2679,
"step": 624
},
{
"epoch": 0.42101717750084205,
"grad_norm": 0.6008614301681519,
"learning_rate": 1.0766616554384477e-05,
"loss": 0.2815,
"step": 625
},
{
"epoch": 0.4216908049848434,
"grad_norm": 0.5147749185562134,
"learning_rate": 1.0750720153248626e-05,
"loss": 0.2587,
"step": 626
},
{
"epoch": 0.42236443246884475,
"grad_norm": 0.5508129596710205,
"learning_rate": 1.073480575028521e-05,
"loss": 0.2788,
"step": 627
},
{
"epoch": 0.4230380599528461,
"grad_norm": 0.5465036034584045,
"learning_rate": 1.0718873433624966e-05,
"loss": 0.2606,
"step": 628
},
{
"epoch": 0.42371168743684745,
"grad_norm": 0.5761625170707703,
"learning_rate": 1.070292329149782e-05,
"loss": 0.3149,
"step": 629
},
{
"epoch": 0.42438531492084874,
"grad_norm": 0.5194136500358582,
"learning_rate": 1.0686955412232419e-05,
"loss": 0.2305,
"step": 630
},
{
"epoch": 0.4250589424048501,
"grad_norm": 0.5823161602020264,
"learning_rate": 1.0670969884255636e-05,
"loss": 0.2495,
"step": 631
},
{
"epoch": 0.42573256988885144,
"grad_norm": 0.5550847053527832,
"learning_rate": 1.0654966796092073e-05,
"loss": 0.2539,
"step": 632
},
{
"epoch": 0.4264061973728528,
"grad_norm": 0.5327949523925781,
"learning_rate": 1.0638946236363578e-05,
"loss": 0.2655,
"step": 633
},
{
"epoch": 0.42707982485685414,
"grad_norm": 0.5146956443786621,
"learning_rate": 1.0622908293788758e-05,
"loss": 0.2599,
"step": 634
},
{
"epoch": 0.4277534523408555,
"grad_norm": 0.5790160894393921,
"learning_rate": 1.0606853057182481e-05,
"loss": 0.298,
"step": 635
},
{
"epoch": 0.42842707982485684,
"grad_norm": 0.5627730488777161,
"learning_rate": 1.059078061545538e-05,
"loss": 0.2622,
"step": 636
},
{
"epoch": 0.4291007073088582,
"grad_norm": 0.619365394115448,
"learning_rate": 1.0574691057613376e-05,
"loss": 0.2905,
"step": 637
},
{
"epoch": 0.42977433479285954,
"grad_norm": 0.5521032810211182,
"learning_rate": 1.0558584472757167e-05,
"loss": 0.2705,
"step": 638
},
{
"epoch": 0.4304479622768609,
"grad_norm": 0.5045711398124695,
"learning_rate": 1.0542460950081747e-05,
"loss": 0.2289,
"step": 639
},
{
"epoch": 0.43112158976086223,
"grad_norm": 0.5129411816596985,
"learning_rate": 1.0526320578875909e-05,
"loss": 0.2572,
"step": 640
},
{
"epoch": 0.4317952172448636,
"grad_norm": 0.5294272899627686,
"learning_rate": 1.0510163448521747e-05,
"loss": 0.2702,
"step": 641
},
{
"epoch": 0.43246884472886493,
"grad_norm": 0.5448393225669861,
"learning_rate": 1.0493989648494165e-05,
"loss": 0.2808,
"step": 642
},
{
"epoch": 0.4331424722128663,
"grad_norm": 0.5107436776161194,
"learning_rate": 1.0477799268360384e-05,
"loss": 0.248,
"step": 643
},
{
"epoch": 0.43381609969686763,
"grad_norm": 0.5598347187042236,
"learning_rate": 1.0461592397779435e-05,
"loss": 0.2342,
"step": 644
},
{
"epoch": 0.434489727180869,
"grad_norm": 0.5707139372825623,
"learning_rate": 1.0445369126501676e-05,
"loss": 0.2764,
"step": 645
},
{
"epoch": 0.4351633546648703,
"grad_norm": 0.48345211148262024,
"learning_rate": 1.0429129544368283e-05,
"loss": 0.2215,
"step": 646
},
{
"epoch": 0.4358369821488717,
"grad_norm": 0.5131022930145264,
"learning_rate": 1.0412873741310763e-05,
"loss": 0.2423,
"step": 647
},
{
"epoch": 0.436510609632873,
"grad_norm": 0.5428949594497681,
"learning_rate": 1.0396601807350452e-05,
"loss": 0.2331,
"step": 648
},
{
"epoch": 0.4371842371168744,
"grad_norm": 0.47753867506980896,
"learning_rate": 1.038031383259801e-05,
"loss": 0.2552,
"step": 649
},
{
"epoch": 0.4378578646008757,
"grad_norm": 0.48779332637786865,
"learning_rate": 1.0364009907252937e-05,
"loss": 0.2499,
"step": 650
},
{
"epoch": 0.4385314920848771,
"grad_norm": 0.4910006523132324,
"learning_rate": 1.0347690121603047e-05,
"loss": 0.2498,
"step": 651
},
{
"epoch": 0.4392051195688784,
"grad_norm": 0.5575456023216248,
"learning_rate": 1.0331354566024005e-05,
"loss": 0.2503,
"step": 652
},
{
"epoch": 0.43987874705287977,
"grad_norm": 0.5806515216827393,
"learning_rate": 1.0315003330978799e-05,
"loss": 0.254,
"step": 653
},
{
"epoch": 0.4405523745368811,
"grad_norm": 0.5564923882484436,
"learning_rate": 1.0298636507017241e-05,
"loss": 0.2804,
"step": 654
},
{
"epoch": 0.44122600202088247,
"grad_norm": 0.5716164708137512,
"learning_rate": 1.0282254184775473e-05,
"loss": 0.2844,
"step": 655
},
{
"epoch": 0.4418996295048838,
"grad_norm": 0.5606719255447388,
"learning_rate": 1.0265856454975473e-05,
"loss": 0.2576,
"step": 656
},
{
"epoch": 0.44257325698888517,
"grad_norm": 0.5467285513877869,
"learning_rate": 1.0249443408424535e-05,
"loss": 0.2782,
"step": 657
},
{
"epoch": 0.4432468844728865,
"grad_norm": 0.569665253162384,
"learning_rate": 1.0233015136014773e-05,
"loss": 0.272,
"step": 658
},
{
"epoch": 0.44392051195688786,
"grad_norm": 0.5965842604637146,
"learning_rate": 1.021657172872262e-05,
"loss": 0.3023,
"step": 659
},
{
"epoch": 0.4445941394408892,
"grad_norm": 0.5759636163711548,
"learning_rate": 1.0200113277608326e-05,
"loss": 0.2621,
"step": 660
},
{
"epoch": 0.44526776692489056,
"grad_norm": 0.5999960899353027,
"learning_rate": 1.0183639873815448e-05,
"loss": 0.2976,
"step": 661
},
{
"epoch": 0.44594139440889186,
"grad_norm": 0.5440315008163452,
"learning_rate": 1.0167151608570346e-05,
"loss": 0.2889,
"step": 662
},
{
"epoch": 0.4466150218928932,
"grad_norm": 0.4932374358177185,
"learning_rate": 1.0150648573181685e-05,
"loss": 0.2271,
"step": 663
},
{
"epoch": 0.44728864937689455,
"grad_norm": 0.5871284604072571,
"learning_rate": 1.0134130859039921e-05,
"loss": 0.3202,
"step": 664
},
{
"epoch": 0.4479622768608959,
"grad_norm": 0.5287674069404602,
"learning_rate": 1.0117598557616796e-05,
"loss": 0.2486,
"step": 665
},
{
"epoch": 0.44863590434489725,
"grad_norm": 0.588444709777832,
"learning_rate": 1.0101051760464837e-05,
"loss": 0.2555,
"step": 666
},
{
"epoch": 0.4493095318288986,
"grad_norm": 0.5376453399658203,
"learning_rate": 1.0084490559216843e-05,
"loss": 0.2506,
"step": 667
},
{
"epoch": 0.44998315931289995,
"grad_norm": 0.5496957898139954,
"learning_rate": 1.006791504558538e-05,
"loss": 0.2616,
"step": 668
},
{
"epoch": 0.4506567867969013,
"grad_norm": 0.523008406162262,
"learning_rate": 1.0051325311362278e-05,
"loss": 0.2597,
"step": 669
},
{
"epoch": 0.45133041428090265,
"grad_norm": 0.5686816573143005,
"learning_rate": 1.0034721448418105e-05,
"loss": 0.2665,
"step": 670
},
{
"epoch": 0.452004041764904,
"grad_norm": 0.5065593719482422,
"learning_rate": 1.0018103548701688e-05,
"loss": 0.2566,
"step": 671
},
{
"epoch": 0.45267766924890535,
"grad_norm": 0.5687103867530823,
"learning_rate": 1.0001471704239577e-05,
"loss": 0.2628,
"step": 672
},
{
"epoch": 0.4533512967329067,
"grad_norm": 0.5782075524330139,
"learning_rate": 9.984826007135544e-06,
"loss": 0.2732,
"step": 673
},
{
"epoch": 0.45402492421690804,
"grad_norm": 0.5679803490638733,
"learning_rate": 9.968166549570075e-06,
"loss": 0.2664,
"step": 674
},
{
"epoch": 0.4546985517009094,
"grad_norm": 0.5293748378753662,
"learning_rate": 9.951493423799866e-06,
"loss": 0.2498,
"step": 675
},
{
"epoch": 0.45537217918491074,
"grad_norm": 0.5444015264511108,
"learning_rate": 9.934806722157294e-06,
"loss": 0.2549,
"step": 676
},
{
"epoch": 0.4560458066689121,
"grad_norm": 0.5367648601531982,
"learning_rate": 9.918106537049921e-06,
"loss": 0.2623,
"step": 677
},
{
"epoch": 0.45671943415291344,
"grad_norm": 0.5820662975311279,
"learning_rate": 9.901392960959983e-06,
"loss": 0.2771,
"step": 678
},
{
"epoch": 0.4573930616369148,
"grad_norm": 0.5573861598968506,
"learning_rate": 9.884666086443862e-06,
"loss": 0.2614,
"step": 679
},
{
"epoch": 0.45806668912091614,
"grad_norm": 0.6296043992042542,
"learning_rate": 9.867926006131597e-06,
"loss": 0.3102,
"step": 680
},
{
"epoch": 0.4587403166049175,
"grad_norm": 0.5795363187789917,
"learning_rate": 9.851172812726344e-06,
"loss": 0.3059,
"step": 681
},
{
"epoch": 0.45941394408891884,
"grad_norm": 0.48046785593032837,
"learning_rate": 9.834406599003885e-06,
"loss": 0.2323,
"step": 682
},
{
"epoch": 0.4600875715729202,
"grad_norm": 0.4878872036933899,
"learning_rate": 9.817627457812105e-06,
"loss": 0.2467,
"step": 683
},
{
"epoch": 0.46076119905692153,
"grad_norm": 0.5333375334739685,
"learning_rate": 9.800835482070479e-06,
"loss": 0.2282,
"step": 684
},
{
"epoch": 0.4614348265409229,
"grad_norm": 0.543725848197937,
"learning_rate": 9.784030764769553e-06,
"loss": 0.2427,
"step": 685
},
{
"epoch": 0.46210845402492423,
"grad_norm": 0.5145445466041565,
"learning_rate": 9.76721339897044e-06,
"loss": 0.2291,
"step": 686
},
{
"epoch": 0.4627820815089256,
"grad_norm": 0.5099066495895386,
"learning_rate": 9.75038347780429e-06,
"loss": 0.245,
"step": 687
},
{
"epoch": 0.46345570899292693,
"grad_norm": 0.5599386096000671,
"learning_rate": 9.73354109447179e-06,
"loss": 0.2994,
"step": 688
},
{
"epoch": 0.4641293364769283,
"grad_norm": 0.5298258662223816,
"learning_rate": 9.716686342242632e-06,
"loss": 0.231,
"step": 689
},
{
"epoch": 0.4648029639609296,
"grad_norm": 0.5349884033203125,
"learning_rate": 9.69981931445501e-06,
"loss": 0.2436,
"step": 690
},
{
"epoch": 0.465476591444931,
"grad_norm": 0.5078858137130737,
"learning_rate": 9.682940104515097e-06,
"loss": 0.2735,
"step": 691
},
{
"epoch": 0.4661502189289323,
"grad_norm": 0.5433405637741089,
"learning_rate": 9.666048805896524e-06,
"loss": 0.2472,
"step": 692
},
{
"epoch": 0.4668238464129337,
"grad_norm": 0.5337989926338196,
"learning_rate": 9.649145512139876e-06,
"loss": 0.2815,
"step": 693
},
{
"epoch": 0.46749747389693497,
"grad_norm": 0.491817831993103,
"learning_rate": 9.632230316852153e-06,
"loss": 0.2712,
"step": 694
},
{
"epoch": 0.4681711013809363,
"grad_norm": 0.5814330577850342,
"learning_rate": 9.615303313706271e-06,
"loss": 0.2931,
"step": 695
},
{
"epoch": 0.46884472886493767,
"grad_norm": 0.5358330607414246,
"learning_rate": 9.598364596440534e-06,
"loss": 0.2546,
"step": 696
},
{
"epoch": 0.469518356348939,
"grad_norm": 0.5111145377159119,
"learning_rate": 9.581414258858116e-06,
"loss": 0.2607,
"step": 697
},
{
"epoch": 0.47019198383294036,
"grad_norm": 0.5266521573066711,
"learning_rate": 9.564452394826538e-06,
"loss": 0.2554,
"step": 698
},
{
"epoch": 0.4708656113169417,
"grad_norm": 0.5091780424118042,
"learning_rate": 9.54747909827716e-06,
"loss": 0.2723,
"step": 699
},
{
"epoch": 0.47153923880094306,
"grad_norm": 0.5414915680885315,
"learning_rate": 9.530494463204646e-06,
"loss": 0.2577,
"step": 700
},
{
"epoch": 0.47153923880094306,
"eval_loss": 0.26179420948028564,
"eval_runtime": 105.0708,
"eval_samples_per_second": 47.587,
"eval_steps_per_second": 2.979,
"step": 700
},
{
"epoch": 0.4722128662849444,
"grad_norm": 0.505789577960968,
"learning_rate": 9.513498583666456e-06,
"loss": 0.2448,
"step": 701
},
{
"epoch": 0.47288649376894576,
"grad_norm": 0.46454617381095886,
"learning_rate": 9.496491553782314e-06,
"loss": 0.221,
"step": 702
},
{
"epoch": 0.4735601212529471,
"grad_norm": 0.5358849763870239,
"learning_rate": 9.479473467733697e-06,
"loss": 0.2872,
"step": 703
},
{
"epoch": 0.47423374873694846,
"grad_norm": 0.5496987700462341,
"learning_rate": 9.462444419763306e-06,
"loss": 0.2464,
"step": 704
},
{
"epoch": 0.4749073762209498,
"grad_norm": 0.5485591292381287,
"learning_rate": 9.445404504174546e-06,
"loss": 0.2695,
"step": 705
},
{
"epoch": 0.47558100370495116,
"grad_norm": 0.5437228679656982,
"learning_rate": 9.42835381533101e-06,
"loss": 0.2823,
"step": 706
},
{
"epoch": 0.4762546311889525,
"grad_norm": 0.5094515085220337,
"learning_rate": 9.411292447655948e-06,
"loss": 0.2401,
"step": 707
},
{
"epoch": 0.47692825867295385,
"grad_norm": 0.5395442843437195,
"learning_rate": 9.394220495631744e-06,
"loss": 0.2659,
"step": 708
},
{
"epoch": 0.4776018861569552,
"grad_norm": 0.4930800795555115,
"learning_rate": 9.377138053799399e-06,
"loss": 0.2383,
"step": 709
},
{
"epoch": 0.47827551364095655,
"grad_norm": 0.5237337350845337,
"learning_rate": 9.360045216758008e-06,
"loss": 0.2527,
"step": 710
},
{
"epoch": 0.4789491411249579,
"grad_norm": 0.5243161916732788,
"learning_rate": 9.342942079164223e-06,
"loss": 0.2515,
"step": 711
},
{
"epoch": 0.47962276860895925,
"grad_norm": 0.5414012670516968,
"learning_rate": 9.325828735731747e-06,
"loss": 0.275,
"step": 712
},
{
"epoch": 0.4802963960929606,
"grad_norm": 0.547073245048523,
"learning_rate": 9.308705281230796e-06,
"loss": 0.276,
"step": 713
},
{
"epoch": 0.48097002357696195,
"grad_norm": 0.49008458852767944,
"learning_rate": 9.291571810487584e-06,
"loss": 0.246,
"step": 714
},
{
"epoch": 0.4816436510609633,
"grad_norm": 0.5415433645248413,
"learning_rate": 9.27442841838379e-06,
"loss": 0.2658,
"step": 715
},
{
"epoch": 0.48231727854496464,
"grad_norm": 0.5856931209564209,
"learning_rate": 9.257275199856032e-06,
"loss": 0.2675,
"step": 716
},
{
"epoch": 0.482990906028966,
"grad_norm": 0.5154370665550232,
"learning_rate": 9.24011224989535e-06,
"loss": 0.2422,
"step": 717
},
{
"epoch": 0.48366453351296734,
"grad_norm": 0.5306107401847839,
"learning_rate": 9.222939663546677e-06,
"loss": 0.2687,
"step": 718
},
{
"epoch": 0.4843381609969687,
"grad_norm": 0.4880635142326355,
"learning_rate": 9.2057575359083e-06,
"loss": 0.2276,
"step": 719
},
{
"epoch": 0.48501178848097004,
"grad_norm": 0.6055603623390198,
"learning_rate": 9.18856596213135e-06,
"loss": 0.2907,
"step": 720
},
{
"epoch": 0.4856854159649714,
"grad_norm": 0.5602757930755615,
"learning_rate": 9.171365037419272e-06,
"loss": 0.2511,
"step": 721
},
{
"epoch": 0.48635904344897274,
"grad_norm": 0.5492405295372009,
"learning_rate": 9.15415485702729e-06,
"loss": 0.246,
"step": 722
},
{
"epoch": 0.4870326709329741,
"grad_norm": 0.6091371178627014,
"learning_rate": 9.136935516261887e-06,
"loss": 0.3003,
"step": 723
},
{
"epoch": 0.48770629841697544,
"grad_norm": 0.5400590300559998,
"learning_rate": 9.119707110480272e-06,
"loss": 0.2576,
"step": 724
},
{
"epoch": 0.4883799259009768,
"grad_norm": 0.5183984041213989,
"learning_rate": 9.10246973508985e-06,
"loss": 0.2519,
"step": 725
},
{
"epoch": 0.48905355338497813,
"grad_norm": 0.5791885256767273,
"learning_rate": 9.08522348554771e-06,
"loss": 0.269,
"step": 726
},
{
"epoch": 0.48972718086897943,
"grad_norm": 0.5196906328201294,
"learning_rate": 9.067968457360073e-06,
"loss": 0.2681,
"step": 727
},
{
"epoch": 0.4904008083529808,
"grad_norm": 0.5393977165222168,
"learning_rate": 9.050704746081779e-06,
"loss": 0.2487,
"step": 728
},
{
"epoch": 0.4910744358369821,
"grad_norm": 0.5441868305206299,
"learning_rate": 9.033432447315751e-06,
"loss": 0.2603,
"step": 729
},
{
"epoch": 0.4917480633209835,
"grad_norm": 0.4999203383922577,
"learning_rate": 9.016151656712473e-06,
"loss": 0.2569,
"step": 730
},
{
"epoch": 0.4924216908049848,
"grad_norm": 0.5059922933578491,
"learning_rate": 8.998862469969452e-06,
"loss": 0.2428,
"step": 731
},
{
"epoch": 0.4930953182889862,
"grad_norm": 0.5794141292572021,
"learning_rate": 8.981564982830683e-06,
"loss": 0.2901,
"step": 732
},
{
"epoch": 0.4937689457729875,
"grad_norm": 0.5344904065132141,
"learning_rate": 8.964259291086141e-06,
"loss": 0.278,
"step": 733
},
{
"epoch": 0.49444257325698887,
"grad_norm": 0.5577378273010254,
"learning_rate": 8.946945490571227e-06,
"loss": 0.2753,
"step": 734
},
{
"epoch": 0.4951162007409902,
"grad_norm": 0.48888590931892395,
"learning_rate": 8.92962367716625e-06,
"loss": 0.2565,
"step": 735
},
{
"epoch": 0.49578982822499157,
"grad_norm": 0.5605798363685608,
"learning_rate": 8.912293946795895e-06,
"loss": 0.274,
"step": 736
},
{
"epoch": 0.4964634557089929,
"grad_norm": 0.5351974964141846,
"learning_rate": 8.894956395428685e-06,
"loss": 0.259,
"step": 737
},
{
"epoch": 0.49713708319299427,
"grad_norm": 0.530037522315979,
"learning_rate": 8.877611119076454e-06,
"loss": 0.2468,
"step": 738
},
{
"epoch": 0.4978107106769956,
"grad_norm": 0.5955355763435364,
"learning_rate": 8.860258213793819e-06,
"loss": 0.2702,
"step": 739
},
{
"epoch": 0.49848433816099696,
"grad_norm": 0.5594556927680969,
"learning_rate": 8.842897775677645e-06,
"loss": 0.2796,
"step": 740
},
{
"epoch": 0.4991579656449983,
"grad_norm": 0.5318235158920288,
"learning_rate": 8.825529900866507e-06,
"loss": 0.2721,
"step": 741
},
{
"epoch": 0.49983159312899966,
"grad_norm": 0.6066297888755798,
"learning_rate": 8.808154685540164e-06,
"loss": 0.2814,
"step": 742
},
{
"epoch": 0.500505220613001,
"grad_norm": 0.520949125289917,
"learning_rate": 8.790772225919031e-06,
"loss": 0.2479,
"step": 743
},
{
"epoch": 0.5011788480970023,
"grad_norm": 0.532832682132721,
"learning_rate": 8.77338261826364e-06,
"loss": 0.2717,
"step": 744
},
{
"epoch": 0.5018524755810037,
"grad_norm": 0.4917290210723877,
"learning_rate": 8.755985958874096e-06,
"loss": 0.2331,
"step": 745
},
{
"epoch": 0.502526103065005,
"grad_norm": 0.6336959004402161,
"learning_rate": 8.73858234408957e-06,
"loss": 0.3059,
"step": 746
},
{
"epoch": 0.5031997305490064,
"grad_norm": 0.5722649693489075,
"learning_rate": 8.72117187028774e-06,
"loss": 0.2682,
"step": 747
},
{
"epoch": 0.5038733580330077,
"grad_norm": 0.47712576389312744,
"learning_rate": 8.70375463388427e-06,
"loss": 0.2468,
"step": 748
},
{
"epoch": 0.504546985517009,
"grad_norm": 0.49866771697998047,
"learning_rate": 8.68633073133228e-06,
"loss": 0.2609,
"step": 749
},
{
"epoch": 0.5052206130010104,
"grad_norm": 0.5410306453704834,
"learning_rate": 8.6689002591218e-06,
"loss": 0.2733,
"step": 750
},
{
"epoch": 0.5058942404850117,
"grad_norm": 0.5518447160720825,
"learning_rate": 8.651463313779241e-06,
"loss": 0.2525,
"step": 751
},
{
"epoch": 0.5065678679690131,
"grad_norm": 0.5311466455459595,
"learning_rate": 8.634019991866863e-06,
"loss": 0.275,
"step": 752
},
{
"epoch": 0.5072414954530144,
"grad_norm": 0.5381631255149841,
"learning_rate": 8.61657038998224e-06,
"loss": 0.275,
"step": 753
},
{
"epoch": 0.5079151229370158,
"grad_norm": 0.48526835441589355,
"learning_rate": 8.599114604757716e-06,
"loss": 0.2431,
"step": 754
},
{
"epoch": 0.5085887504210171,
"grad_norm": 0.5347431302070618,
"learning_rate": 8.581652732859887e-06,
"loss": 0.2731,
"step": 755
},
{
"epoch": 0.5092623779050185,
"grad_norm": 0.5098583102226257,
"learning_rate": 8.56418487098905e-06,
"loss": 0.294,
"step": 756
},
{
"epoch": 0.5099360053890198,
"grad_norm": 0.499496191740036,
"learning_rate": 8.54671111587867e-06,
"loss": 0.2294,
"step": 757
},
{
"epoch": 0.5106096328730212,
"grad_norm": 0.5586072206497192,
"learning_rate": 8.529231564294858e-06,
"loss": 0.2506,
"step": 758
},
{
"epoch": 0.5112832603570225,
"grad_norm": 0.5203363299369812,
"learning_rate": 8.51174631303581e-06,
"loss": 0.2505,
"step": 759
},
{
"epoch": 0.5119568878410239,
"grad_norm": 0.5142697095870972,
"learning_rate": 8.494255458931304e-06,
"loss": 0.2456,
"step": 760
},
{
"epoch": 0.5126305153250252,
"grad_norm": 0.4652908444404602,
"learning_rate": 8.476759098842129e-06,
"loss": 0.2085,
"step": 761
},
{
"epoch": 0.5133041428090266,
"grad_norm": 0.5014703273773193,
"learning_rate": 8.459257329659571e-06,
"loss": 0.239,
"step": 762
},
{
"epoch": 0.5139777702930279,
"grad_norm": 0.5147262215614319,
"learning_rate": 8.441750248304872e-06,
"loss": 0.2727,
"step": 763
},
{
"epoch": 0.5146513977770293,
"grad_norm": 0.564335823059082,
"learning_rate": 8.424237951728689e-06,
"loss": 0.2983,
"step": 764
},
{
"epoch": 0.5153250252610306,
"grad_norm": 0.5217107534408569,
"learning_rate": 8.406720536910568e-06,
"loss": 0.238,
"step": 765
},
{
"epoch": 0.515998652745032,
"grad_norm": 0.529780924320221,
"learning_rate": 8.389198100858385e-06,
"loss": 0.271,
"step": 766
},
{
"epoch": 0.5166722802290333,
"grad_norm": 0.5005664229393005,
"learning_rate": 8.371670740607833e-06,
"loss": 0.265,
"step": 767
},
{
"epoch": 0.5173459077130347,
"grad_norm": 0.4695169925689697,
"learning_rate": 8.354138553221869e-06,
"loss": 0.225,
"step": 768
},
{
"epoch": 0.518019535197036,
"grad_norm": 0.6260945200920105,
"learning_rate": 8.336601635790184e-06,
"loss": 0.2725,
"step": 769
},
{
"epoch": 0.5186931626810374,
"grad_norm": 0.5363501310348511,
"learning_rate": 8.319060085428664e-06,
"loss": 0.2631,
"step": 770
},
{
"epoch": 0.5193667901650387,
"grad_norm": 0.5340143442153931,
"learning_rate": 8.301513999278851e-06,
"loss": 0.2829,
"step": 771
},
{
"epoch": 0.5200404176490401,
"grad_norm": 0.5355620384216309,
"learning_rate": 8.283963474507402e-06,
"loss": 0.2675,
"step": 772
},
{
"epoch": 0.5207140451330414,
"grad_norm": 0.5030906796455383,
"learning_rate": 8.266408608305555e-06,
"loss": 0.2243,
"step": 773
},
{
"epoch": 0.5213876726170428,
"grad_norm": 0.5517938137054443,
"learning_rate": 8.248849497888598e-06,
"loss": 0.2554,
"step": 774
},
{
"epoch": 0.5220613001010441,
"grad_norm": 0.47788354754447937,
"learning_rate": 8.231286240495305e-06,
"loss": 0.2258,
"step": 775
},
{
"epoch": 0.5227349275850455,
"grad_norm": 0.550268828868866,
"learning_rate": 8.213718933387438e-06,
"loss": 0.2586,
"step": 776
},
{
"epoch": 0.5234085550690468,
"grad_norm": 0.5247451066970825,
"learning_rate": 8.196147673849165e-06,
"loss": 0.2491,
"step": 777
},
{
"epoch": 0.5240821825530482,
"grad_norm": 0.49666067957878113,
"learning_rate": 8.17857255918655e-06,
"loss": 0.2501,
"step": 778
},
{
"epoch": 0.5247558100370495,
"grad_norm": 0.5575336217880249,
"learning_rate": 8.160993686727015e-06,
"loss": 0.3047,
"step": 779
},
{
"epoch": 0.5254294375210509,
"grad_norm": 0.5327598452568054,
"learning_rate": 8.143411153818773e-06,
"loss": 0.289,
"step": 780
},
{
"epoch": 0.5261030650050522,
"grad_norm": 0.4978947043418884,
"learning_rate": 8.125825057830323e-06,
"loss": 0.2817,
"step": 781
},
{
"epoch": 0.5267766924890536,
"grad_norm": 0.5068449378013611,
"learning_rate": 8.108235496149892e-06,
"loss": 0.2549,
"step": 782
},
{
"epoch": 0.5274503199730549,
"grad_norm": 0.5815426111221313,
"learning_rate": 8.090642566184896e-06,
"loss": 0.3215,
"step": 783
},
{
"epoch": 0.5281239474570563,
"grad_norm": 0.528716504573822,
"learning_rate": 8.073046365361404e-06,
"loss": 0.2405,
"step": 784
},
{
"epoch": 0.5287975749410576,
"grad_norm": 0.5129048824310303,
"learning_rate": 8.0554469911236e-06,
"loss": 0.2696,
"step": 785
},
{
"epoch": 0.529471202425059,
"grad_norm": 0.5234351754188538,
"learning_rate": 8.037844540933245e-06,
"loss": 0.2608,
"step": 786
},
{
"epoch": 0.5301448299090603,
"grad_norm": 0.531194269657135,
"learning_rate": 8.020239112269131e-06,
"loss": 0.2826,
"step": 787
},
{
"epoch": 0.5308184573930617,
"grad_norm": 0.5546161532402039,
"learning_rate": 8.002630802626538e-06,
"loss": 0.2635,
"step": 788
},
{
"epoch": 0.531492084877063,
"grad_norm": 0.5576707124710083,
"learning_rate": 7.985019709516714e-06,
"loss": 0.2591,
"step": 789
},
{
"epoch": 0.5321657123610644,
"grad_norm": 0.5075989961624146,
"learning_rate": 7.967405930466305e-06,
"loss": 0.2751,
"step": 790
},
{
"epoch": 0.5328393398450657,
"grad_norm": 0.547538161277771,
"learning_rate": 7.94978956301685e-06,
"loss": 0.2767,
"step": 791
},
{
"epoch": 0.5335129673290671,
"grad_norm": 0.6105408072471619,
"learning_rate": 7.932170704724202e-06,
"loss": 0.3202,
"step": 792
},
{
"epoch": 0.5341865948130684,
"grad_norm": 0.517285943031311,
"learning_rate": 7.914549453158025e-06,
"loss": 0.2497,
"step": 793
},
{
"epoch": 0.5348602222970698,
"grad_norm": 0.5324558615684509,
"learning_rate": 7.896925905901223e-06,
"loss": 0.2804,
"step": 794
},
{
"epoch": 0.5355338497810711,
"grad_norm": 0.5467241406440735,
"learning_rate": 7.879300160549423e-06,
"loss": 0.274,
"step": 795
},
{
"epoch": 0.5362074772650725,
"grad_norm": 0.5673408508300781,
"learning_rate": 7.86167231471042e-06,
"loss": 0.2681,
"step": 796
},
{
"epoch": 0.5368811047490738,
"grad_norm": 0.5435929298400879,
"learning_rate": 7.844042466003643e-06,
"loss": 0.2456,
"step": 797
},
{
"epoch": 0.5375547322330751,
"grad_norm": 0.5365129113197327,
"learning_rate": 7.826410712059607e-06,
"loss": 0.2433,
"step": 798
},
{
"epoch": 0.5382283597170765,
"grad_norm": 0.556115984916687,
"learning_rate": 7.808777150519384e-06,
"loss": 0.2723,
"step": 799
},
{
"epoch": 0.5389019872010778,
"grad_norm": 0.6075104475021362,
"learning_rate": 7.791141879034055e-06,
"loss": 0.3197,
"step": 800
},
{
"epoch": 0.5389019872010778,
"eval_loss": 0.25853946805000305,
"eval_runtime": 105.3349,
"eval_samples_per_second": 47.468,
"eval_steps_per_second": 2.971,
"step": 800
},
{
"epoch": 0.5395756146850792,
"grad_norm": 0.5173077583312988,
"learning_rate": 7.773504995264167e-06,
"loss": 0.2458,
"step": 801
},
{
"epoch": 0.5402492421690805,
"grad_norm": 0.5317369699478149,
"learning_rate": 7.755866596879203e-06,
"loss": 0.2535,
"step": 802
},
{
"epoch": 0.5409228696530819,
"grad_norm": 0.5028438568115234,
"learning_rate": 7.738226781557024e-06,
"loss": 0.2558,
"step": 803
},
{
"epoch": 0.5415964971370832,
"grad_norm": 0.4917846918106079,
"learning_rate": 7.720585646983346e-06,
"loss": 0.2567,
"step": 804
},
{
"epoch": 0.5422701246210846,
"grad_norm": 0.5413616299629211,
"learning_rate": 7.702943290851183e-06,
"loss": 0.3068,
"step": 805
},
{
"epoch": 0.5429437521050859,
"grad_norm": 0.5557405352592468,
"learning_rate": 7.685299810860319e-06,
"loss": 0.2807,
"step": 806
},
{
"epoch": 0.5436173795890872,
"grad_norm": 0.5536317229270935,
"learning_rate": 7.667655304716762e-06,
"loss": 0.2535,
"step": 807
},
{
"epoch": 0.5442910070730885,
"grad_norm": 0.6285427808761597,
"learning_rate": 7.650009870132202e-06,
"loss": 0.2687,
"step": 808
},
{
"epoch": 0.5449646345570899,
"grad_norm": 0.5142940282821655,
"learning_rate": 7.632363604823466e-06,
"loss": 0.2328,
"step": 809
},
{
"epoch": 0.5456382620410912,
"grad_norm": 0.5419033765792847,
"learning_rate": 7.614716606511986e-06,
"loss": 0.2687,
"step": 810
},
{
"epoch": 0.5463118895250926,
"grad_norm": 0.5078312158584595,
"learning_rate": 7.597068972923254e-06,
"loss": 0.2429,
"step": 811
},
{
"epoch": 0.5469855170090939,
"grad_norm": 0.5140127539634705,
"learning_rate": 7.579420801786278e-06,
"loss": 0.2358,
"step": 812
},
{
"epoch": 0.5476591444930953,
"grad_norm": 0.5336434841156006,
"learning_rate": 7.561772190833041e-06,
"loss": 0.2561,
"step": 813
},
{
"epoch": 0.5483327719770966,
"grad_norm": 0.4892539978027344,
"learning_rate": 7.544123237797967e-06,
"loss": 0.2447,
"step": 814
},
{
"epoch": 0.549006399461098,
"grad_norm": 0.5128865838050842,
"learning_rate": 7.526474040417368e-06,
"loss": 0.2305,
"step": 815
},
{
"epoch": 0.5496800269450993,
"grad_norm": 0.5284186601638794,
"learning_rate": 7.508824696428914e-06,
"loss": 0.2665,
"step": 816
},
{
"epoch": 0.5503536544291007,
"grad_norm": 0.49982714653015137,
"learning_rate": 7.491175303571087e-06,
"loss": 0.2361,
"step": 817
},
{
"epoch": 0.551027281913102,
"grad_norm": 0.5274138450622559,
"learning_rate": 7.473525959582631e-06,
"loss": 0.2542,
"step": 818
},
{
"epoch": 0.5517009093971034,
"grad_norm": 0.5714825987815857,
"learning_rate": 7.4558767622020345e-06,
"loss": 0.287,
"step": 819
},
{
"epoch": 0.5523745368811047,
"grad_norm": 0.5137256979942322,
"learning_rate": 7.438227809166959e-06,
"loss": 0.2416,
"step": 820
},
{
"epoch": 0.5530481643651061,
"grad_norm": 0.5832123756408691,
"learning_rate": 7.4205791982137215e-06,
"loss": 0.2589,
"step": 821
},
{
"epoch": 0.5537217918491074,
"grad_norm": 0.6384348273277283,
"learning_rate": 7.402931027076746e-06,
"loss": 0.3011,
"step": 822
},
{
"epoch": 0.5543954193331088,
"grad_norm": 0.5485447645187378,
"learning_rate": 7.385283393488017e-06,
"loss": 0.2596,
"step": 823
},
{
"epoch": 0.5550690468171101,
"grad_norm": 0.5725424885749817,
"learning_rate": 7.367636395176536e-06,
"loss": 0.278,
"step": 824
},
{
"epoch": 0.5557426743011115,
"grad_norm": 0.49892446398735046,
"learning_rate": 7.349990129867802e-06,
"loss": 0.2308,
"step": 825
},
{
"epoch": 0.5564163017851128,
"grad_norm": 0.5304402709007263,
"learning_rate": 7.332344695283239e-06,
"loss": 0.2661,
"step": 826
},
{
"epoch": 0.5570899292691142,
"grad_norm": 0.5314590334892273,
"learning_rate": 7.314700189139683e-06,
"loss": 0.2545,
"step": 827
},
{
"epoch": 0.5577635567531155,
"grad_norm": 0.5156052112579346,
"learning_rate": 7.297056709148819e-06,
"loss": 0.2513,
"step": 828
},
{
"epoch": 0.5584371842371169,
"grad_norm": 0.5569677352905273,
"learning_rate": 7.279414353016655e-06,
"loss": 0.2701,
"step": 829
},
{
"epoch": 0.5591108117211182,
"grad_norm": 0.5068705081939697,
"learning_rate": 7.261773218442978e-06,
"loss": 0.2578,
"step": 830
},
{
"epoch": 0.5597844392051196,
"grad_norm": 0.5413905382156372,
"learning_rate": 7.244133403120797e-06,
"loss": 0.2657,
"step": 831
},
{
"epoch": 0.5604580666891209,
"grad_norm": 0.5509982109069824,
"learning_rate": 7.226495004735833e-06,
"loss": 0.2421,
"step": 832
},
{
"epoch": 0.5611316941731223,
"grad_norm": 0.5037456750869751,
"learning_rate": 7.208858120965949e-06,
"loss": 0.2366,
"step": 833
},
{
"epoch": 0.5618053216571236,
"grad_norm": 0.45753926038742065,
"learning_rate": 7.191222849480618e-06,
"loss": 0.2295,
"step": 834
},
{
"epoch": 0.562478949141125,
"grad_norm": 0.5005747079849243,
"learning_rate": 7.1735892879403955e-06,
"loss": 0.2431,
"step": 835
},
{
"epoch": 0.5631525766251263,
"grad_norm": 0.6139580607414246,
"learning_rate": 7.155957533996361e-06,
"loss": 0.2954,
"step": 836
},
{
"epoch": 0.5638262041091276,
"grad_norm": 0.4900098443031311,
"learning_rate": 7.1383276852895805e-06,
"loss": 0.2472,
"step": 837
},
{
"epoch": 0.564499831593129,
"grad_norm": 0.5588510632514954,
"learning_rate": 7.120699839450578e-06,
"loss": 0.2963,
"step": 838
},
{
"epoch": 0.5651734590771303,
"grad_norm": 0.45477819442749023,
"learning_rate": 7.103074094098776e-06,
"loss": 0.2459,
"step": 839
},
{
"epoch": 0.5658470865611317,
"grad_norm": 0.5369901061058044,
"learning_rate": 7.085450546841977e-06,
"loss": 0.2378,
"step": 840
},
{
"epoch": 0.566520714045133,
"grad_norm": 0.5580633878707886,
"learning_rate": 7.0678292952757986e-06,
"loss": 0.2466,
"step": 841
},
{
"epoch": 0.5671943415291344,
"grad_norm": 0.5392370223999023,
"learning_rate": 7.050210436983152e-06,
"loss": 0.2847,
"step": 842
},
{
"epoch": 0.5678679690131357,
"grad_norm": 0.5429926514625549,
"learning_rate": 7.032594069533694e-06,
"loss": 0.2589,
"step": 843
},
{
"epoch": 0.5685415964971371,
"grad_norm": 0.529365062713623,
"learning_rate": 7.0149802904832865e-06,
"loss": 0.2692,
"step": 844
},
{
"epoch": 0.5692152239811384,
"grad_norm": 0.5019341707229614,
"learning_rate": 6.997369197373462e-06,
"loss": 0.2501,
"step": 845
},
{
"epoch": 0.5698888514651398,
"grad_norm": 0.5088992714881897,
"learning_rate": 6.979760887730873e-06,
"loss": 0.2741,
"step": 846
},
{
"epoch": 0.5705624789491411,
"grad_norm": 0.5390922427177429,
"learning_rate": 6.962155459066755e-06,
"loss": 0.2653,
"step": 847
},
{
"epoch": 0.5712361064331425,
"grad_norm": 0.5300227403640747,
"learning_rate": 6.9445530088764015e-06,
"loss": 0.2356,
"step": 848
},
{
"epoch": 0.5719097339171438,
"grad_norm": 0.5471487641334534,
"learning_rate": 6.926953634638598e-06,
"loss": 0.2434,
"step": 849
},
{
"epoch": 0.5725833614011452,
"grad_norm": 0.49165770411491394,
"learning_rate": 6.909357433815104e-06,
"loss": 0.2539,
"step": 850
},
{
"epoch": 0.5732569888851465,
"grad_norm": 0.5154786705970764,
"learning_rate": 6.891764503850109e-06,
"loss": 0.2525,
"step": 851
},
{
"epoch": 0.5739306163691479,
"grad_norm": 0.5185630321502686,
"learning_rate": 6.874174942169674e-06,
"loss": 0.2709,
"step": 852
},
{
"epoch": 0.5746042438531492,
"grad_norm": 0.5015746355056763,
"learning_rate": 6.856588846181228e-06,
"loss": 0.2522,
"step": 853
},
{
"epoch": 0.5752778713371506,
"grad_norm": 0.5378702282905579,
"learning_rate": 6.839006313272989e-06,
"loss": 0.2634,
"step": 854
},
{
"epoch": 0.5759514988211519,
"grad_norm": 0.5816572308540344,
"learning_rate": 6.82142744081345e-06,
"loss": 0.3396,
"step": 855
},
{
"epoch": 0.5766251263051533,
"grad_norm": 0.5909308791160583,
"learning_rate": 6.803852326150838e-06,
"loss": 0.2834,
"step": 856
},
{
"epoch": 0.5772987537891546,
"grad_norm": 0.5006569623947144,
"learning_rate": 6.786281066612564e-06,
"loss": 0.212,
"step": 857
},
{
"epoch": 0.577972381273156,
"grad_norm": 0.5730767846107483,
"learning_rate": 6.768713759504694e-06,
"loss": 0.2998,
"step": 858
},
{
"epoch": 0.5786460087571573,
"grad_norm": 0.5159865617752075,
"learning_rate": 6.751150502111406e-06,
"loss": 0.2685,
"step": 859
},
{
"epoch": 0.5793196362411587,
"grad_norm": 0.5225328803062439,
"learning_rate": 6.733591391694444e-06,
"loss": 0.2404,
"step": 860
},
{
"epoch": 0.57999326372516,
"grad_norm": 0.540481686592102,
"learning_rate": 6.7160365254926005e-06,
"loss": 0.265,
"step": 861
},
{
"epoch": 0.5806668912091614,
"grad_norm": 0.5876161456108093,
"learning_rate": 6.698486000721151e-06,
"loss": 0.2758,
"step": 862
},
{
"epoch": 0.5813405186931627,
"grad_norm": 0.5269771218299866,
"learning_rate": 6.680939914571336e-06,
"loss": 0.2497,
"step": 863
},
{
"epoch": 0.5820141461771641,
"grad_norm": 0.5683711171150208,
"learning_rate": 6.663398364209817e-06,
"loss": 0.2895,
"step": 864
},
{
"epoch": 0.5826877736611654,
"grad_norm": 0.5690784454345703,
"learning_rate": 6.645861446778131e-06,
"loss": 0.2927,
"step": 865
},
{
"epoch": 0.5833614011451668,
"grad_norm": 0.4923837184906006,
"learning_rate": 6.628329259392169e-06,
"loss": 0.2294,
"step": 866
},
{
"epoch": 0.5840350286291681,
"grad_norm": 0.5871672630310059,
"learning_rate": 6.610801899141618e-06,
"loss": 0.2883,
"step": 867
},
{
"epoch": 0.5847086561131695,
"grad_norm": 0.5314139127731323,
"learning_rate": 6.593279463089433e-06,
"loss": 0.2698,
"step": 868
},
{
"epoch": 0.5853822835971708,
"grad_norm": 0.4713616967201233,
"learning_rate": 6.575762048271311e-06,
"loss": 0.2551,
"step": 869
},
{
"epoch": 0.5860559110811722,
"grad_norm": 0.5604876279830933,
"learning_rate": 6.558249751695129e-06,
"loss": 0.2507,
"step": 870
},
{
"epoch": 0.5867295385651735,
"grad_norm": 0.5332925319671631,
"learning_rate": 6.54074267034043e-06,
"loss": 0.2921,
"step": 871
},
{
"epoch": 0.5874031660491748,
"grad_norm": 0.5870206356048584,
"learning_rate": 6.523240901157874e-06,
"loss": 0.305,
"step": 872
},
{
"epoch": 0.5880767935331761,
"grad_norm": 0.5209013223648071,
"learning_rate": 6.505744541068696e-06,
"loss": 0.2504,
"step": 873
},
{
"epoch": 0.5887504210171774,
"grad_norm": 0.5347055196762085,
"learning_rate": 6.488253686964189e-06,
"loss": 0.26,
"step": 874
},
{
"epoch": 0.5894240485011788,
"grad_norm": 0.5568848252296448,
"learning_rate": 6.470768435705146e-06,
"loss": 0.2506,
"step": 875
},
{
"epoch": 0.5900976759851801,
"grad_norm": 0.4880235493183136,
"learning_rate": 6.45328888412133e-06,
"loss": 0.2549,
"step": 876
},
{
"epoch": 0.5907713034691815,
"grad_norm": 0.5328478217124939,
"learning_rate": 6.435815129010952e-06,
"loss": 0.2892,
"step": 877
},
{
"epoch": 0.5914449309531828,
"grad_norm": 0.5507891178131104,
"learning_rate": 6.418347267140113e-06,
"loss": 0.295,
"step": 878
},
{
"epoch": 0.5921185584371842,
"grad_norm": 0.5917878150939941,
"learning_rate": 6.400885395242284e-06,
"loss": 0.2775,
"step": 879
},
{
"epoch": 0.5927921859211855,
"grad_norm": 0.5396655201911926,
"learning_rate": 6.383429610017763e-06,
"loss": 0.2601,
"step": 880
},
{
"epoch": 0.5934658134051869,
"grad_norm": 0.5640776753425598,
"learning_rate": 6.3659800081331375e-06,
"loss": 0.2532,
"step": 881
},
{
"epoch": 0.5941394408891882,
"grad_norm": 0.5693733096122742,
"learning_rate": 6.348536686220761e-06,
"loss": 0.276,
"step": 882
},
{
"epoch": 0.5948130683731896,
"grad_norm": 0.49299901723861694,
"learning_rate": 6.331099740878201e-06,
"loss": 0.2197,
"step": 883
},
{
"epoch": 0.5954866958571909,
"grad_norm": 0.5112996697425842,
"learning_rate": 6.3136692686677204e-06,
"loss": 0.2685,
"step": 884
},
{
"epoch": 0.5961603233411923,
"grad_norm": 0.5770703554153442,
"learning_rate": 6.2962453661157305e-06,
"loss": 0.2439,
"step": 885
},
{
"epoch": 0.5968339508251936,
"grad_norm": 0.5604544878005981,
"learning_rate": 6.2788281297122605e-06,
"loss": 0.2603,
"step": 886
},
{
"epoch": 0.597507578309195,
"grad_norm": 0.5164006948471069,
"learning_rate": 6.261417655910432e-06,
"loss": 0.2419,
"step": 887
},
{
"epoch": 0.5981812057931963,
"grad_norm": 0.5085450410842896,
"learning_rate": 6.244014041125906e-06,
"loss": 0.2714,
"step": 888
},
{
"epoch": 0.5988548332771977,
"grad_norm": 0.5820232629776001,
"learning_rate": 6.226617381736361e-06,
"loss": 0.2909,
"step": 889
},
{
"epoch": 0.599528460761199,
"grad_norm": 0.5919815301895142,
"learning_rate": 6.209227774080969e-06,
"loss": 0.3283,
"step": 890
},
{
"epoch": 0.6002020882452004,
"grad_norm": 0.5612049102783203,
"learning_rate": 6.191845314459836e-06,
"loss": 0.2623,
"step": 891
},
{
"epoch": 0.6008757157292017,
"grad_norm": 0.5206785798072815,
"learning_rate": 6.174470099133495e-06,
"loss": 0.2391,
"step": 892
},
{
"epoch": 0.6015493432132031,
"grad_norm": 0.5109294652938843,
"learning_rate": 6.157102224322357e-06,
"loss": 0.2435,
"step": 893
},
{
"epoch": 0.6022229706972044,
"grad_norm": 0.5124114155769348,
"learning_rate": 6.13974178620618e-06,
"loss": 0.2508,
"step": 894
},
{
"epoch": 0.6028965981812058,
"grad_norm": 0.538691520690918,
"learning_rate": 6.1223888809235475e-06,
"loss": 0.2742,
"step": 895
},
{
"epoch": 0.6035702256652071,
"grad_norm": 0.4782629609107971,
"learning_rate": 6.105043604571319e-06,
"loss": 0.215,
"step": 896
},
{
"epoch": 0.6042438531492085,
"grad_norm": 0.48708873987197876,
"learning_rate": 6.087706053204106e-06,
"loss": 0.2685,
"step": 897
},
{
"epoch": 0.6049174806332098,
"grad_norm": 0.5199108719825745,
"learning_rate": 6.070376322833751e-06,
"loss": 0.2522,
"step": 898
},
{
"epoch": 0.6055911081172112,
"grad_norm": 0.5264055728912354,
"learning_rate": 6.053054509428774e-06,
"loss": 0.2702,
"step": 899
},
{
"epoch": 0.6062647356012125,
"grad_norm": 0.5014949440956116,
"learning_rate": 6.035740708913861e-06,
"loss": 0.2592,
"step": 900
},
{
"epoch": 0.6062647356012125,
"eval_loss": 0.255189448595047,
"eval_runtime": 106.7863,
"eval_samples_per_second": 46.823,
"eval_steps_per_second": 2.931,
"step": 900
},
{
"epoch": 0.6069383630852139,
"grad_norm": 0.5549845695495605,
"learning_rate": 6.01843501716932e-06,
"loss": 0.2676,
"step": 901
},
{
"epoch": 0.6076119905692152,
"grad_norm": 0.5285577178001404,
"learning_rate": 6.001137530030551e-06,
"loss": 0.287,
"step": 902
},
{
"epoch": 0.6082856180532166,
"grad_norm": 0.5555633306503296,
"learning_rate": 5.983848343287529e-06,
"loss": 0.27,
"step": 903
},
{
"epoch": 0.6089592455372179,
"grad_norm": 0.4878551661968231,
"learning_rate": 5.966567552684248e-06,
"loss": 0.2132,
"step": 904
},
{
"epoch": 0.6096328730212193,
"grad_norm": 0.5712552070617676,
"learning_rate": 5.949295253918223e-06,
"loss": 0.264,
"step": 905
},
{
"epoch": 0.6103065005052206,
"grad_norm": 0.5029177665710449,
"learning_rate": 5.932031542639929e-06,
"loss": 0.2327,
"step": 906
},
{
"epoch": 0.610980127989222,
"grad_norm": 0.5280793309211731,
"learning_rate": 5.914776514452292e-06,
"loss": 0.2666,
"step": 907
},
{
"epoch": 0.6116537554732233,
"grad_norm": 0.5493948459625244,
"learning_rate": 5.897530264910151e-06,
"loss": 0.2747,
"step": 908
},
{
"epoch": 0.6123273829572247,
"grad_norm": 0.5202659964561462,
"learning_rate": 5.880292889519733e-06,
"loss": 0.2648,
"step": 909
},
{
"epoch": 0.613001010441226,
"grad_norm": 0.5150463581085205,
"learning_rate": 5.863064483738114e-06,
"loss": 0.2465,
"step": 910
},
{
"epoch": 0.6136746379252274,
"grad_norm": 0.5893501043319702,
"learning_rate": 5.845845142972711e-06,
"loss": 0.258,
"step": 911
},
{
"epoch": 0.6143482654092287,
"grad_norm": 0.5318591594696045,
"learning_rate": 5.828634962580728e-06,
"loss": 0.2566,
"step": 912
},
{
"epoch": 0.6150218928932301,
"grad_norm": 0.5880672335624695,
"learning_rate": 5.811434037868652e-06,
"loss": 0.2776,
"step": 913
},
{
"epoch": 0.6156955203772314,
"grad_norm": 0.536673903465271,
"learning_rate": 5.794242464091703e-06,
"loss": 0.2655,
"step": 914
},
{
"epoch": 0.6163691478612328,
"grad_norm": 0.53472501039505,
"learning_rate": 5.777060336453324e-06,
"loss": 0.2465,
"step": 915
},
{
"epoch": 0.6170427753452341,
"grad_norm": 0.5193040370941162,
"learning_rate": 5.75988775010465e-06,
"loss": 0.2597,
"step": 916
},
{
"epoch": 0.6177164028292355,
"grad_norm": 0.5033950209617615,
"learning_rate": 5.742724800143967e-06,
"loss": 0.2564,
"step": 917
},
{
"epoch": 0.6183900303132368,
"grad_norm": 0.479815274477005,
"learning_rate": 5.725571581616212e-06,
"loss": 0.2359,
"step": 918
},
{
"epoch": 0.6190636577972382,
"grad_norm": 0.6129284501075745,
"learning_rate": 5.708428189512418e-06,
"loss": 0.2789,
"step": 919
},
{
"epoch": 0.6197372852812395,
"grad_norm": 0.5521009564399719,
"learning_rate": 5.691294718769205e-06,
"loss": 0.2605,
"step": 920
},
{
"epoch": 0.6204109127652409,
"grad_norm": 0.5177654027938843,
"learning_rate": 5.674171264268255e-06,
"loss": 0.2519,
"step": 921
},
{
"epoch": 0.6210845402492422,
"grad_norm": 0.5175455808639526,
"learning_rate": 5.657057920835781e-06,
"loss": 0.2247,
"step": 922
},
{
"epoch": 0.6217581677332435,
"grad_norm": 0.5852333903312683,
"learning_rate": 5.639954783241994e-06,
"loss": 0.2767,
"step": 923
},
{
"epoch": 0.6224317952172449,
"grad_norm": 0.508068323135376,
"learning_rate": 5.622861946200602e-06,
"loss": 0.2584,
"step": 924
},
{
"epoch": 0.6231054227012462,
"grad_norm": 0.5253134965896606,
"learning_rate": 5.605779504368256e-06,
"loss": 0.2479,
"step": 925
},
{
"epoch": 0.6237790501852476,
"grad_norm": 0.5303956866264343,
"learning_rate": 5.588707552344052e-06,
"loss": 0.2445,
"step": 926
},
{
"epoch": 0.624452677669249,
"grad_norm": 0.5583487749099731,
"learning_rate": 5.571646184668989e-06,
"loss": 0.2703,
"step": 927
},
{
"epoch": 0.6251263051532503,
"grad_norm": 0.48656296730041504,
"learning_rate": 5.5545954958254535e-06,
"loss": 0.22,
"step": 928
},
{
"epoch": 0.6257999326372516,
"grad_norm": 0.5838629603385925,
"learning_rate": 5.537555580236696e-06,
"loss": 0.2995,
"step": 929
},
{
"epoch": 0.626473560121253,
"grad_norm": 0.4895997643470764,
"learning_rate": 5.520526532266303e-06,
"loss": 0.2508,
"step": 930
},
{
"epoch": 0.6271471876052543,
"grad_norm": 0.5736584663391113,
"learning_rate": 5.503508446217687e-06,
"loss": 0.2738,
"step": 931
},
{
"epoch": 0.6278208150892557,
"grad_norm": 0.507853627204895,
"learning_rate": 5.486501416333547e-06,
"loss": 0.2342,
"step": 932
},
{
"epoch": 0.628494442573257,
"grad_norm": 0.5749799013137817,
"learning_rate": 5.469505536795354e-06,
"loss": 0.2505,
"step": 933
},
{
"epoch": 0.6291680700572584,
"grad_norm": 0.5327485203742981,
"learning_rate": 5.452520901722843e-06,
"loss": 0.2444,
"step": 934
},
{
"epoch": 0.6298416975412597,
"grad_norm": 0.5296816229820251,
"learning_rate": 5.435547605173464e-06,
"loss": 0.2369,
"step": 935
},
{
"epoch": 0.630515325025261,
"grad_norm": 0.568265974521637,
"learning_rate": 5.4185857411418856e-06,
"loss": 0.2668,
"step": 936
},
{
"epoch": 0.6311889525092623,
"grad_norm": 0.571258544921875,
"learning_rate": 5.401635403559467e-06,
"loss": 0.2651,
"step": 937
},
{
"epoch": 0.6318625799932637,
"grad_norm": 0.5336675047874451,
"learning_rate": 5.384696686293728e-06,
"loss": 0.2571,
"step": 938
},
{
"epoch": 0.632536207477265,
"grad_norm": 0.5422372221946716,
"learning_rate": 5.367769683147849e-06,
"loss": 0.2474,
"step": 939
},
{
"epoch": 0.6332098349612664,
"grad_norm": 0.5360538363456726,
"learning_rate": 5.350854487860127e-06,
"loss": 0.2612,
"step": 940
},
{
"epoch": 0.6338834624452677,
"grad_norm": 0.5526731014251709,
"learning_rate": 5.333951194103476e-06,
"loss": 0.291,
"step": 941
},
{
"epoch": 0.6345570899292691,
"grad_norm": 0.501751720905304,
"learning_rate": 5.317059895484905e-06,
"loss": 0.2305,
"step": 942
},
{
"epoch": 0.6352307174132704,
"grad_norm": 0.5227620005607605,
"learning_rate": 5.300180685544992e-06,
"loss": 0.2425,
"step": 943
},
{
"epoch": 0.6359043448972718,
"grad_norm": 0.4993986189365387,
"learning_rate": 5.28331365775737e-06,
"loss": 0.2426,
"step": 944
},
{
"epoch": 0.6365779723812731,
"grad_norm": 0.5128391981124878,
"learning_rate": 5.266458905528214e-06,
"loss": 0.2635,
"step": 945
},
{
"epoch": 0.6372515998652745,
"grad_norm": 0.5762251615524292,
"learning_rate": 5.2496165221957105e-06,
"loss": 0.2652,
"step": 946
},
{
"epoch": 0.6379252273492758,
"grad_norm": 0.48678985238075256,
"learning_rate": 5.232786601029562e-06,
"loss": 0.2518,
"step": 947
},
{
"epoch": 0.6385988548332772,
"grad_norm": 0.5538300275802612,
"learning_rate": 5.215969235230447e-06,
"loss": 0.2489,
"step": 948
},
{
"epoch": 0.6392724823172785,
"grad_norm": 0.5497295260429382,
"learning_rate": 5.199164517929521e-06,
"loss": 0.2454,
"step": 949
},
{
"epoch": 0.6399461098012799,
"grad_norm": 0.49725213646888733,
"learning_rate": 5.182372542187895e-06,
"loss": 0.2555,
"step": 950
},
{
"epoch": 0.6406197372852812,
"grad_norm": 0.533641517162323,
"learning_rate": 5.165593400996114e-06,
"loss": 0.2927,
"step": 951
},
{
"epoch": 0.6412933647692826,
"grad_norm": 0.5489848256111145,
"learning_rate": 5.148827187273657e-06,
"loss": 0.2801,
"step": 952
},
{
"epoch": 0.6419669922532839,
"grad_norm": 0.5017451643943787,
"learning_rate": 5.132073993868406e-06,
"loss": 0.264,
"step": 953
},
{
"epoch": 0.6426406197372853,
"grad_norm": 0.5372846722602844,
"learning_rate": 5.115333913556137e-06,
"loss": 0.2721,
"step": 954
},
{
"epoch": 0.6433142472212866,
"grad_norm": 0.5356566309928894,
"learning_rate": 5.098607039040019e-06,
"loss": 0.2608,
"step": 955
},
{
"epoch": 0.643987874705288,
"grad_norm": 0.5957320928573608,
"learning_rate": 5.081893462950079e-06,
"loss": 0.2601,
"step": 956
},
{
"epoch": 0.6446615021892893,
"grad_norm": 0.5288376212120056,
"learning_rate": 5.0651932778427074e-06,
"loss": 0.2587,
"step": 957
},
{
"epoch": 0.6453351296732907,
"grad_norm": 0.5290555953979492,
"learning_rate": 5.048506576200137e-06,
"loss": 0.2756,
"step": 958
},
{
"epoch": 0.646008757157292,
"grad_norm": 0.5248770117759705,
"learning_rate": 5.031833450429925e-06,
"loss": 0.2451,
"step": 959
},
{
"epoch": 0.6466823846412934,
"grad_norm": 0.5826844573020935,
"learning_rate": 5.0151739928644585e-06,
"loss": 0.2619,
"step": 960
},
{
"epoch": 0.6473560121252947,
"grad_norm": 0.5782036185264587,
"learning_rate": 4.998528295760426e-06,
"loss": 0.2751,
"step": 961
},
{
"epoch": 0.648029639609296,
"grad_norm": 0.5718022584915161,
"learning_rate": 4.981896451298311e-06,
"loss": 0.2754,
"step": 962
},
{
"epoch": 0.6487032670932974,
"grad_norm": 0.5494672060012817,
"learning_rate": 4.965278551581896e-06,
"loss": 0.2612,
"step": 963
},
{
"epoch": 0.6493768945772987,
"grad_norm": 0.5269261002540588,
"learning_rate": 4.948674688637724e-06,
"loss": 0.2498,
"step": 964
},
{
"epoch": 0.6500505220613001,
"grad_norm": 0.5737338662147522,
"learning_rate": 4.932084954414619e-06,
"loss": 0.2512,
"step": 965
},
{
"epoch": 0.6507241495453014,
"grad_norm": 0.6112325191497803,
"learning_rate": 4.915509440783158e-06,
"loss": 0.2436,
"step": 966
},
{
"epoch": 0.6513977770293028,
"grad_norm": 0.5490378737449646,
"learning_rate": 4.898948239535162e-06,
"loss": 0.2666,
"step": 967
},
{
"epoch": 0.6520714045133041,
"grad_norm": 0.49087807536125183,
"learning_rate": 4.882401442383205e-06,
"loss": 0.2307,
"step": 968
},
{
"epoch": 0.6527450319973055,
"grad_norm": 0.5699329972267151,
"learning_rate": 4.865869140960081e-06,
"loss": 0.2788,
"step": 969
},
{
"epoch": 0.6534186594813068,
"grad_norm": 0.5976101756095886,
"learning_rate": 4.8493514268183154e-06,
"loss": 0.295,
"step": 970
},
{
"epoch": 0.6540922869653082,
"grad_norm": 0.5223735570907593,
"learning_rate": 4.8328483914296545e-06,
"loss": 0.2524,
"step": 971
},
{
"epoch": 0.6547659144493095,
"grad_norm": 0.521709680557251,
"learning_rate": 4.816360126184552e-06,
"loss": 0.256,
"step": 972
},
{
"epoch": 0.6554395419333109,
"grad_norm": 0.6724926829338074,
"learning_rate": 4.799886722391676e-06,
"loss": 0.3489,
"step": 973
},
{
"epoch": 0.6561131694173122,
"grad_norm": 0.5592423677444458,
"learning_rate": 4.783428271277383e-06,
"loss": 0.2486,
"step": 974
},
{
"epoch": 0.6567867969013136,
"grad_norm": 0.5178484320640564,
"learning_rate": 4.766984863985229e-06,
"loss": 0.231,
"step": 975
},
{
"epoch": 0.6574604243853149,
"grad_norm": 0.5621674060821533,
"learning_rate": 4.750556591575467e-06,
"loss": 0.286,
"step": 976
},
{
"epoch": 0.6581340518693163,
"grad_norm": 0.5048483610153198,
"learning_rate": 4.734143545024527e-06,
"loss": 0.2308,
"step": 977
},
{
"epoch": 0.6588076793533176,
"grad_norm": 0.5381827354431152,
"learning_rate": 4.7177458152245286e-06,
"loss": 0.262,
"step": 978
},
{
"epoch": 0.659481306837319,
"grad_norm": 0.6153239607810974,
"learning_rate": 4.701363492982763e-06,
"loss": 0.2889,
"step": 979
},
{
"epoch": 0.6601549343213203,
"grad_norm": 0.5119926333427429,
"learning_rate": 4.684996669021202e-06,
"loss": 0.2313,
"step": 980
},
{
"epoch": 0.6608285618053217,
"grad_norm": 0.5575754046440125,
"learning_rate": 4.668645433975994e-06,
"loss": 0.2926,
"step": 981
},
{
"epoch": 0.661502189289323,
"grad_norm": 0.5210109949111938,
"learning_rate": 4.652309878396955e-06,
"loss": 0.2567,
"step": 982
},
{
"epoch": 0.6621758167733244,
"grad_norm": 0.5215359330177307,
"learning_rate": 4.635990092747066e-06,
"loss": 0.2542,
"step": 983
},
{
"epoch": 0.6628494442573257,
"grad_norm": 0.5880634188652039,
"learning_rate": 4.619686167401991e-06,
"loss": 0.3099,
"step": 984
},
{
"epoch": 0.6635230717413271,
"grad_norm": 0.6169697046279907,
"learning_rate": 4.603398192649549e-06,
"loss": 0.3095,
"step": 985
},
{
"epoch": 0.6641966992253284,
"grad_norm": 0.5105169415473938,
"learning_rate": 4.5871262586892365e-06,
"loss": 0.2439,
"step": 986
},
{
"epoch": 0.6648703267093298,
"grad_norm": 0.5112780928611755,
"learning_rate": 4.5708704556317195e-06,
"loss": 0.2843,
"step": 987
},
{
"epoch": 0.6655439541933311,
"grad_norm": 0.5523808002471924,
"learning_rate": 4.554630873498325e-06,
"loss": 0.2779,
"step": 988
},
{
"epoch": 0.6662175816773325,
"grad_norm": 0.49872297048568726,
"learning_rate": 4.538407602220566e-06,
"loss": 0.2385,
"step": 989
},
{
"epoch": 0.6668912091613338,
"grad_norm": 0.4888902008533478,
"learning_rate": 4.522200731639616e-06,
"loss": 0.2541,
"step": 990
},
{
"epoch": 0.6675648366453352,
"grad_norm": 0.5053279995918274,
"learning_rate": 4.506010351505834e-06,
"loss": 0.2465,
"step": 991
},
{
"epoch": 0.6682384641293365,
"grad_norm": 0.5656309723854065,
"learning_rate": 4.489836551478254e-06,
"loss": 0.2878,
"step": 992
},
{
"epoch": 0.6689120916133379,
"grad_norm": 0.5291764736175537,
"learning_rate": 4.473679421124092e-06,
"loss": 0.2803,
"step": 993
},
{
"epoch": 0.6695857190973392,
"grad_norm": 0.5425894260406494,
"learning_rate": 4.457539049918253e-06,
"loss": 0.2758,
"step": 994
},
{
"epoch": 0.6702593465813406,
"grad_norm": 0.5237170457839966,
"learning_rate": 4.441415527242835e-06,
"loss": 0.2615,
"step": 995
},
{
"epoch": 0.6709329740653419,
"grad_norm": 0.48956528306007385,
"learning_rate": 4.425308942386624e-06,
"loss": 0.2502,
"step": 996
},
{
"epoch": 0.6716066015493433,
"grad_norm": 0.5325567722320557,
"learning_rate": 4.409219384544621e-06,
"loss": 0.2663,
"step": 997
},
{
"epoch": 0.6722802290333446,
"grad_norm": 0.5196683406829834,
"learning_rate": 4.3931469428175195e-06,
"loss": 0.2785,
"step": 998
},
{
"epoch": 0.672953856517346,
"grad_norm": 0.5358217358589172,
"learning_rate": 4.377091706211243e-06,
"loss": 0.2701,
"step": 999
},
{
"epoch": 0.6736274840013473,
"grad_norm": 0.5513088703155518,
"learning_rate": 4.3610537636364256e-06,
"loss": 0.2583,
"step": 1000
},
{
"epoch": 0.6736274840013473,
"eval_loss": 0.2523915767669678,
"eval_runtime": 104.4369,
"eval_samples_per_second": 47.876,
"eval_steps_per_second": 2.997,
"step": 1000
},
{
"epoch": 0.6743011114853485,
"grad_norm": 0.599454402923584,
"learning_rate": 4.345033203907931e-06,
"loss": 0.3127,
"step": 1001
},
{
"epoch": 0.6749747389693499,
"grad_norm": 0.5195282697677612,
"learning_rate": 4.329030115744368e-06,
"loss": 0.2336,
"step": 1002
},
{
"epoch": 0.6756483664533512,
"grad_norm": 0.5394783616065979,
"learning_rate": 4.313044587767581e-06,
"loss": 0.2266,
"step": 1003
},
{
"epoch": 0.6763219939373526,
"grad_norm": 0.502860963344574,
"learning_rate": 4.297076708502179e-06,
"loss": 0.2226,
"step": 1004
},
{
"epoch": 0.6769956214213539,
"grad_norm": 0.5646010637283325,
"learning_rate": 4.281126566375035e-06,
"loss": 0.2612,
"step": 1005
},
{
"epoch": 0.6776692489053553,
"grad_norm": 0.5330033898353577,
"learning_rate": 4.265194249714788e-06,
"loss": 0.27,
"step": 1006
},
{
"epoch": 0.6783428763893566,
"grad_norm": 0.5152742266654968,
"learning_rate": 4.249279846751376e-06,
"loss": 0.2522,
"step": 1007
},
{
"epoch": 0.679016503873358,
"grad_norm": 0.5699672698974609,
"learning_rate": 4.233383445615524e-06,
"loss": 0.3023,
"step": 1008
},
{
"epoch": 0.6796901313573593,
"grad_norm": 0.5329395532608032,
"learning_rate": 4.21750513433827e-06,
"loss": 0.2413,
"step": 1009
},
{
"epoch": 0.6803637588413607,
"grad_norm": 0.4887201488018036,
"learning_rate": 4.201645000850481e-06,
"loss": 0.24,
"step": 1010
},
{
"epoch": 0.681037386325362,
"grad_norm": 0.49501362442970276,
"learning_rate": 4.1858031329823445e-06,
"loss": 0.2288,
"step": 1011
},
{
"epoch": 0.6817110138093634,
"grad_norm": 0.48089247941970825,
"learning_rate": 4.169979618462912e-06,
"loss": 0.2311,
"step": 1012
},
{
"epoch": 0.6823846412933647,
"grad_norm": 0.5128735899925232,
"learning_rate": 4.154174544919591e-06,
"loss": 0.2342,
"step": 1013
},
{
"epoch": 0.6830582687773661,
"grad_norm": 0.5249293446540833,
"learning_rate": 4.13838799987766e-06,
"loss": 0.2799,
"step": 1014
},
{
"epoch": 0.6837318962613674,
"grad_norm": 0.5358514785766602,
"learning_rate": 4.122620070759805e-06,
"loss": 0.2569,
"step": 1015
},
{
"epoch": 0.6844055237453688,
"grad_norm": 0.4961945712566376,
"learning_rate": 4.106870844885606e-06,
"loss": 0.2856,
"step": 1016
},
{
"epoch": 0.6850791512293701,
"grad_norm": 0.5068737268447876,
"learning_rate": 4.091140409471082e-06,
"loss": 0.247,
"step": 1017
},
{
"epoch": 0.6857527787133715,
"grad_norm": 0.5845658779144287,
"learning_rate": 4.0754288516281805e-06,
"loss": 0.3199,
"step": 1018
},
{
"epoch": 0.6864264061973728,
"grad_norm": 0.5782644152641296,
"learning_rate": 4.05973625836432e-06,
"loss": 0.277,
"step": 1019
},
{
"epoch": 0.6871000336813742,
"grad_norm": 0.4946306347846985,
"learning_rate": 4.044062716581894e-06,
"loss": 0.2596,
"step": 1020
},
{
"epoch": 0.6877736611653755,
"grad_norm": 0.4905906915664673,
"learning_rate": 4.02840831307779e-06,
"loss": 0.243,
"step": 1021
},
{
"epoch": 0.6884472886493769,
"grad_norm": 0.5413460731506348,
"learning_rate": 4.012773134542911e-06,
"loss": 0.2787,
"step": 1022
},
{
"epoch": 0.6891209161333782,
"grad_norm": 0.5733334422111511,
"learning_rate": 3.997157267561701e-06,
"loss": 0.2473,
"step": 1023
},
{
"epoch": 0.6897945436173796,
"grad_norm": 0.5300191044807434,
"learning_rate": 3.981560798611655e-06,
"loss": 0.2451,
"step": 1024
},
{
"epoch": 0.6904681711013809,
"grad_norm": 0.5103181600570679,
"learning_rate": 3.965983814062852e-06,
"loss": 0.2519,
"step": 1025
},
{
"epoch": 0.6911417985853823,
"grad_norm": 0.6034629344940186,
"learning_rate": 3.950426400177465e-06,
"loss": 0.2702,
"step": 1026
},
{
"epoch": 0.6918154260693836,
"grad_norm": 0.5429885387420654,
"learning_rate": 3.934888643109288e-06,
"loss": 0.2549,
"step": 1027
},
{
"epoch": 0.692489053553385,
"grad_norm": 0.5191195011138916,
"learning_rate": 3.919370628903266e-06,
"loss": 0.263,
"step": 1028
},
{
"epoch": 0.6931626810373863,
"grad_norm": 0.5467904210090637,
"learning_rate": 3.903872443495005e-06,
"loss": 0.2502,
"step": 1029
},
{
"epoch": 0.6938363085213877,
"grad_norm": 0.5506168007850647,
"learning_rate": 3.888394172710305e-06,
"loss": 0.2731,
"step": 1030
},
{
"epoch": 0.694509936005389,
"grad_norm": 0.5254278182983398,
"learning_rate": 3.872935902264689e-06,
"loss": 0.2547,
"step": 1031
},
{
"epoch": 0.6951835634893904,
"grad_norm": 0.5569198131561279,
"learning_rate": 3.857497717762911e-06,
"loss": 0.2644,
"step": 1032
},
{
"epoch": 0.6958571909733917,
"grad_norm": 0.5069310069084167,
"learning_rate": 3.8420797046985024e-06,
"loss": 0.2643,
"step": 1033
},
{
"epoch": 0.6965308184573931,
"grad_norm": 0.539691686630249,
"learning_rate": 3.826681948453288e-06,
"loss": 0.259,
"step": 1034
},
{
"epoch": 0.6972044459413944,
"grad_norm": 0.49122515320777893,
"learning_rate": 3.8113045342969083e-06,
"loss": 0.2326,
"step": 1035
},
{
"epoch": 0.6978780734253958,
"grad_norm": 0.5575307011604309,
"learning_rate": 3.7959475473863624e-06,
"loss": 0.262,
"step": 1036
},
{
"epoch": 0.6985517009093971,
"grad_norm": 0.48936426639556885,
"learning_rate": 3.7806110727655185e-06,
"loss": 0.2561,
"step": 1037
},
{
"epoch": 0.6992253283933985,
"grad_norm": 0.5563534498214722,
"learning_rate": 3.76529519536466e-06,
"loss": 0.2706,
"step": 1038
},
{
"epoch": 0.6998989558773998,
"grad_norm": 0.567457377910614,
"learning_rate": 3.750000000000002e-06,
"loss": 0.2453,
"step": 1039
},
{
"epoch": 0.7005725833614012,
"grad_norm": 0.5170741677284241,
"learning_rate": 3.7347255713732236e-06,
"loss": 0.2199,
"step": 1040
},
{
"epoch": 0.7012462108454025,
"grad_norm": 0.5691453218460083,
"learning_rate": 3.7194719940710135e-06,
"loss": 0.2831,
"step": 1041
},
{
"epoch": 0.7019198383294039,
"grad_norm": 0.5637477040290833,
"learning_rate": 3.7042393525645793e-06,
"loss": 0.2747,
"step": 1042
},
{
"epoch": 0.7025934658134052,
"grad_norm": 0.5031242966651917,
"learning_rate": 3.689027731209191e-06,
"loss": 0.2321,
"step": 1043
},
{
"epoch": 0.7032670932974066,
"grad_norm": 0.6389548182487488,
"learning_rate": 3.6738372142437223e-06,
"loss": 0.2598,
"step": 1044
},
{
"epoch": 0.7039407207814079,
"grad_norm": 0.5508800148963928,
"learning_rate": 3.6586678857901624e-06,
"loss": 0.2607,
"step": 1045
},
{
"epoch": 0.7046143482654093,
"grad_norm": 0.5346333980560303,
"learning_rate": 3.6435198298531762e-06,
"loss": 0.2484,
"step": 1046
},
{
"epoch": 0.7052879757494106,
"grad_norm": 0.5504537224769592,
"learning_rate": 3.6283931303196123e-06,
"loss": 0.2751,
"step": 1047
},
{
"epoch": 0.705961603233412,
"grad_norm": 0.5008198618888855,
"learning_rate": 3.6132878709580612e-06,
"loss": 0.235,
"step": 1048
},
{
"epoch": 0.7066352307174133,
"grad_norm": 0.5508736371994019,
"learning_rate": 3.5982041354183843e-06,
"loss": 0.2627,
"step": 1049
},
{
"epoch": 0.7073088582014146,
"grad_norm": 0.5439531207084656,
"learning_rate": 3.583142007231235e-06,
"loss": 0.2524,
"step": 1050
},
{
"epoch": 0.707982485685416,
"grad_norm": 0.5159290432929993,
"learning_rate": 3.5681015698076254e-06,
"loss": 0.2323,
"step": 1051
},
{
"epoch": 0.7086561131694173,
"grad_norm": 0.5393319725990295,
"learning_rate": 3.5530829064384378e-06,
"loss": 0.2732,
"step": 1052
},
{
"epoch": 0.7093297406534187,
"grad_norm": 0.5629071593284607,
"learning_rate": 3.5380861002939764e-06,
"loss": 0.2651,
"step": 1053
},
{
"epoch": 0.71000336813742,
"grad_norm": 0.4863939881324768,
"learning_rate": 3.523111234423509e-06,
"loss": 0.2181,
"step": 1054
},
{
"epoch": 0.7106769956214214,
"grad_norm": 0.4968824088573456,
"learning_rate": 3.508158391754798e-06,
"loss": 0.22,
"step": 1055
},
{
"epoch": 0.7113506231054227,
"grad_norm": 0.49191296100616455,
"learning_rate": 3.493227655093645e-06,
"loss": 0.246,
"step": 1056
},
{
"epoch": 0.7120242505894241,
"grad_norm": 0.526577353477478,
"learning_rate": 3.4783191071234387e-06,
"loss": 0.2494,
"step": 1057
},
{
"epoch": 0.7126978780734254,
"grad_norm": 0.5475011467933655,
"learning_rate": 3.463432830404685e-06,
"loss": 0.2609,
"step": 1058
},
{
"epoch": 0.7133715055574268,
"grad_norm": 0.5295203328132629,
"learning_rate": 3.448568907374563e-06,
"loss": 0.2494,
"step": 1059
},
{
"epoch": 0.7140451330414281,
"grad_norm": 0.5042027831077576,
"learning_rate": 3.4337274203464523e-06,
"loss": 0.2266,
"step": 1060
},
{
"epoch": 0.7147187605254295,
"grad_norm": 0.5058079957962036,
"learning_rate": 3.4189084515094974e-06,
"loss": 0.2344,
"step": 1061
},
{
"epoch": 0.7153923880094308,
"grad_norm": 0.5442999601364136,
"learning_rate": 3.40411208292813e-06,
"loss": 0.2545,
"step": 1062
},
{
"epoch": 0.7160660154934322,
"grad_norm": 0.578435480594635,
"learning_rate": 3.3893383965416355e-06,
"loss": 0.2534,
"step": 1063
},
{
"epoch": 0.7167396429774335,
"grad_norm": 0.5491860508918762,
"learning_rate": 3.37458747416369e-06,
"loss": 0.3073,
"step": 1064
},
{
"epoch": 0.7174132704614348,
"grad_norm": 0.49808141589164734,
"learning_rate": 3.3598593974818997e-06,
"loss": 0.2254,
"step": 1065
},
{
"epoch": 0.7180868979454361,
"grad_norm": 0.5253027081489563,
"learning_rate": 3.345154248057359e-06,
"loss": 0.2227,
"step": 1066
},
{
"epoch": 0.7187605254294375,
"grad_norm": 0.5097954273223877,
"learning_rate": 3.3304721073242004e-06,
"loss": 0.2159,
"step": 1067
},
{
"epoch": 0.7194341529134388,
"grad_norm": 0.5558974146842957,
"learning_rate": 3.3158130565891347e-06,
"loss": 0.2458,
"step": 1068
},
{
"epoch": 0.7201077803974402,
"grad_norm": 0.529330849647522,
"learning_rate": 3.3011771770310014e-06,
"loss": 0.2666,
"step": 1069
},
{
"epoch": 0.7207814078814415,
"grad_norm": 0.5007720589637756,
"learning_rate": 3.286564549700333e-06,
"loss": 0.2415,
"step": 1070
},
{
"epoch": 0.7214550353654429,
"grad_norm": 0.6243747472763062,
"learning_rate": 3.271975255518884e-06,
"loss": 0.291,
"step": 1071
},
{
"epoch": 0.7221286628494442,
"grad_norm": 0.5337501168251038,
"learning_rate": 3.2574093752792068e-06,
"loss": 0.2675,
"step": 1072
},
{
"epoch": 0.7228022903334456,
"grad_norm": 0.6054463982582092,
"learning_rate": 3.2428669896441833e-06,
"loss": 0.3009,
"step": 1073
},
{
"epoch": 0.7234759178174469,
"grad_norm": 0.5312137007713318,
"learning_rate": 3.228348179146586e-06,
"loss": 0.2513,
"step": 1074
},
{
"epoch": 0.7241495453014483,
"grad_norm": 0.510999858379364,
"learning_rate": 3.2138530241886403e-06,
"loss": 0.2454,
"step": 1075
},
{
"epoch": 0.7248231727854496,
"grad_norm": 0.5202507972717285,
"learning_rate": 3.199381605041571e-06,
"loss": 0.2348,
"step": 1076
},
{
"epoch": 0.725496800269451,
"grad_norm": 0.5304343700408936,
"learning_rate": 3.18493400184515e-06,
"loss": 0.2677,
"step": 1077
},
{
"epoch": 0.7261704277534523,
"grad_norm": 0.5709294676780701,
"learning_rate": 3.1705102946072746e-06,
"loss": 0.2855,
"step": 1078
},
{
"epoch": 0.7268440552374537,
"grad_norm": 0.5579668879508972,
"learning_rate": 3.156110563203498e-06,
"loss": 0.2858,
"step": 1079
},
{
"epoch": 0.727517682721455,
"grad_norm": 0.6169936060905457,
"learning_rate": 3.141734887376612e-06,
"loss": 0.2939,
"step": 1080
},
{
"epoch": 0.7281913102054564,
"grad_norm": 0.556327223777771,
"learning_rate": 3.127383346736184e-06,
"loss": 0.2797,
"step": 1081
},
{
"epoch": 0.7288649376894577,
"grad_norm": 0.4888077974319458,
"learning_rate": 3.1130560207581275e-06,
"loss": 0.2147,
"step": 1082
},
{
"epoch": 0.729538565173459,
"grad_norm": 0.568587601184845,
"learning_rate": 3.098752988784268e-06,
"loss": 0.2786,
"step": 1083
},
{
"epoch": 0.7302121926574604,
"grad_norm": 0.5443982481956482,
"learning_rate": 3.084474330021882e-06,
"loss": 0.2445,
"step": 1084
},
{
"epoch": 0.7308858201414618,
"grad_norm": 0.4714532494544983,
"learning_rate": 3.070220123543288e-06,
"loss": 0.2044,
"step": 1085
},
{
"epoch": 0.7315594476254631,
"grad_norm": 0.5746622085571289,
"learning_rate": 3.0559904482853808e-06,
"loss": 0.2627,
"step": 1086
},
{
"epoch": 0.7322330751094644,
"grad_norm": 0.5493502616882324,
"learning_rate": 3.041785383049206e-06,
"loss": 0.2564,
"step": 1087
},
{
"epoch": 0.7329067025934658,
"grad_norm": 0.5477399826049805,
"learning_rate": 3.027605006499536e-06,
"loss": 0.252,
"step": 1088
},
{
"epoch": 0.7335803300774671,
"grad_norm": 0.507681667804718,
"learning_rate": 3.013449397164407e-06,
"loss": 0.246,
"step": 1089
},
{
"epoch": 0.7342539575614685,
"grad_norm": 0.5245915651321411,
"learning_rate": 2.99931863343471e-06,
"loss": 0.2374,
"step": 1090
},
{
"epoch": 0.7349275850454698,
"grad_norm": 0.526141345500946,
"learning_rate": 2.985212793563745e-06,
"loss": 0.2358,
"step": 1091
},
{
"epoch": 0.7356012125294712,
"grad_norm": 0.5303114652633667,
"learning_rate": 2.971131955666782e-06,
"loss": 0.232,
"step": 1092
},
{
"epoch": 0.7362748400134725,
"grad_norm": 0.5322446227073669,
"learning_rate": 2.957076197720644e-06,
"loss": 0.2536,
"step": 1093
},
{
"epoch": 0.7369484674974739,
"grad_norm": 0.6054010987281799,
"learning_rate": 2.9430455975632593e-06,
"loss": 0.2825,
"step": 1094
},
{
"epoch": 0.7376220949814752,
"grad_norm": 0.4822597801685333,
"learning_rate": 2.9290402328932374e-06,
"loss": 0.2158,
"step": 1095
},
{
"epoch": 0.7382957224654766,
"grad_norm": 0.5092408061027527,
"learning_rate": 2.9150601812694477e-06,
"loss": 0.2434,
"step": 1096
},
{
"epoch": 0.7389693499494779,
"grad_norm": 0.48755231499671936,
"learning_rate": 2.901105520110569e-06,
"loss": 0.2489,
"step": 1097
},
{
"epoch": 0.7396429774334793,
"grad_norm": 0.5457514524459839,
"learning_rate": 2.887176326694684e-06,
"loss": 0.269,
"step": 1098
},
{
"epoch": 0.7403166049174806,
"grad_norm": 0.5498961210250854,
"learning_rate": 2.8732726781588325e-06,
"loss": 0.2446,
"step": 1099
},
{
"epoch": 0.740990232401482,
"grad_norm": 0.5210698246955872,
"learning_rate": 2.859394651498592e-06,
"loss": 0.2447,
"step": 1100
},
{
"epoch": 0.740990232401482,
"eval_loss": 0.25004303455352783,
"eval_runtime": 104.4563,
"eval_samples_per_second": 47.867,
"eval_steps_per_second": 2.996,
"step": 1100
},
{
"epoch": 0.7416638598854833,
"grad_norm": 0.517212986946106,
"learning_rate": 2.8455423235676586e-06,
"loss": 0.252,
"step": 1101
},
{
"epoch": 0.7423374873694847,
"grad_norm": 0.5591882467269897,
"learning_rate": 2.8317157710774066e-06,
"loss": 0.2567,
"step": 1102
},
{
"epoch": 0.743011114853486,
"grad_norm": 0.5390484929084778,
"learning_rate": 2.8179150705964713e-06,
"loss": 0.2752,
"step": 1103
},
{
"epoch": 0.7436847423374874,
"grad_norm": 0.5284495949745178,
"learning_rate": 2.8041402985503294e-06,
"loss": 0.2248,
"step": 1104
},
{
"epoch": 0.7443583698214887,
"grad_norm": 0.5000547170639038,
"learning_rate": 2.7903915312208696e-06,
"loss": 0.2352,
"step": 1105
},
{
"epoch": 0.7450319973054901,
"grad_norm": 0.5302792191505432,
"learning_rate": 2.7766688447459735e-06,
"loss": 0.2328,
"step": 1106
},
{
"epoch": 0.7457056247894914,
"grad_norm": 0.5096173286437988,
"learning_rate": 2.762972315119088e-06,
"loss": 0.2408,
"step": 1107
},
{
"epoch": 0.7463792522734928,
"grad_norm": 0.5064148902893066,
"learning_rate": 2.7493020181888058e-06,
"loss": 0.2385,
"step": 1108
},
{
"epoch": 0.7470528797574941,
"grad_norm": 0.508243203163147,
"learning_rate": 2.735658029658461e-06,
"loss": 0.2482,
"step": 1109
},
{
"epoch": 0.7477265072414955,
"grad_norm": 0.5063915848731995,
"learning_rate": 2.7220404250856833e-06,
"loss": 0.2661,
"step": 1110
},
{
"epoch": 0.7484001347254968,
"grad_norm": 0.5426428318023682,
"learning_rate": 2.7084492798820035e-06,
"loss": 0.2527,
"step": 1111
},
{
"epoch": 0.7490737622094982,
"grad_norm": 0.5647068023681641,
"learning_rate": 2.6948846693124188e-06,
"loss": 0.2906,
"step": 1112
},
{
"epoch": 0.7497473896934995,
"grad_norm": 0.5713849663734436,
"learning_rate": 2.681346668494985e-06,
"loss": 0.258,
"step": 1113
},
{
"epoch": 0.7504210171775009,
"grad_norm": 0.5479983687400818,
"learning_rate": 2.6678353524004027e-06,
"loss": 0.2393,
"step": 1114
},
{
"epoch": 0.7510946446615022,
"grad_norm": 0.5582695007324219,
"learning_rate": 2.654350795851593e-06,
"loss": 0.2351,
"step": 1115
},
{
"epoch": 0.7517682721455036,
"grad_norm": 0.5141502618789673,
"learning_rate": 2.640893073523286e-06,
"loss": 0.2587,
"step": 1116
},
{
"epoch": 0.7524418996295049,
"grad_norm": 0.5738133788108826,
"learning_rate": 2.6274622599416197e-06,
"loss": 0.2719,
"step": 1117
},
{
"epoch": 0.7531155271135063,
"grad_norm": 0.5252017974853516,
"learning_rate": 2.614058429483703e-06,
"loss": 0.2979,
"step": 1118
},
{
"epoch": 0.7537891545975076,
"grad_norm": 0.5406326055526733,
"learning_rate": 2.600681656377229e-06,
"loss": 0.2803,
"step": 1119
},
{
"epoch": 0.754462782081509,
"grad_norm": 0.5155901312828064,
"learning_rate": 2.587332014700051e-06,
"loss": 0.2645,
"step": 1120
},
{
"epoch": 0.7551364095655103,
"grad_norm": 0.49665388464927673,
"learning_rate": 2.5740095783797656e-06,
"loss": 0.2482,
"step": 1121
},
{
"epoch": 0.7558100370495117,
"grad_norm": 0.585488498210907,
"learning_rate": 2.560714421193323e-06,
"loss": 0.3037,
"step": 1122
},
{
"epoch": 0.756483664533513,
"grad_norm": 0.5360546708106995,
"learning_rate": 2.547446616766597e-06,
"loss": 0.2697,
"step": 1123
},
{
"epoch": 0.7571572920175144,
"grad_norm": 0.5727128982543945,
"learning_rate": 2.534206238573997e-06,
"loss": 0.2627,
"step": 1124
},
{
"epoch": 0.7578309195015157,
"grad_norm": 0.5131103992462158,
"learning_rate": 2.5209933599380443e-06,
"loss": 0.2576,
"step": 1125
},
{
"epoch": 0.7585045469855171,
"grad_norm": 0.5608295798301697,
"learning_rate": 2.507808054028972e-06,
"loss": 0.2851,
"step": 1126
},
{
"epoch": 0.7591781744695184,
"grad_norm": 0.47002115845680237,
"learning_rate": 2.4946503938643306e-06,
"loss": 0.2293,
"step": 1127
},
{
"epoch": 0.7598518019535198,
"grad_norm": 0.5585823655128479,
"learning_rate": 2.4815204523085656e-06,
"loss": 0.2893,
"step": 1128
},
{
"epoch": 0.7605254294375211,
"grad_norm": 0.5366945266723633,
"learning_rate": 2.4684183020726213e-06,
"loss": 0.2358,
"step": 1129
},
{
"epoch": 0.7611990569215223,
"grad_norm": 0.5874181985855103,
"learning_rate": 2.4553440157135496e-06,
"loss": 0.2795,
"step": 1130
},
{
"epoch": 0.7618726844055237,
"grad_norm": 0.4842762351036072,
"learning_rate": 2.442297665634085e-06,
"loss": 0.2238,
"step": 1131
},
{
"epoch": 0.762546311889525,
"grad_norm": 0.5179473161697388,
"learning_rate": 2.4292793240822682e-06,
"loss": 0.236,
"step": 1132
},
{
"epoch": 0.7632199393735264,
"grad_norm": 0.5912408232688904,
"learning_rate": 2.4162890631510233e-06,
"loss": 0.2599,
"step": 1133
},
{
"epoch": 0.7638935668575277,
"grad_norm": 0.541069746017456,
"learning_rate": 2.4033269547777788e-06,
"loss": 0.2805,
"step": 1134
},
{
"epoch": 0.7645671943415291,
"grad_norm": 0.5376386046409607,
"learning_rate": 2.3903930707440584e-06,
"loss": 0.2604,
"step": 1135
},
{
"epoch": 0.7652408218255304,
"grad_norm": 0.5389772057533264,
"learning_rate": 2.3774874826750796e-06,
"loss": 0.2417,
"step": 1136
},
{
"epoch": 0.7659144493095318,
"grad_norm": 0.49994543194770813,
"learning_rate": 2.364610262039369e-06,
"loss": 0.237,
"step": 1137
},
{
"epoch": 0.7665880767935331,
"grad_norm": 0.5099679827690125,
"learning_rate": 2.351761480148358e-06,
"loss": 0.2376,
"step": 1138
},
{
"epoch": 0.7672617042775345,
"grad_norm": 0.5313609838485718,
"learning_rate": 2.3389412081559842e-06,
"loss": 0.2559,
"step": 1139
},
{
"epoch": 0.7679353317615358,
"grad_norm": 0.5471706390380859,
"learning_rate": 2.326149517058314e-06,
"loss": 0.2667,
"step": 1140
},
{
"epoch": 0.7686089592455372,
"grad_norm": 0.4816801846027374,
"learning_rate": 2.313386477693131e-06,
"loss": 0.2245,
"step": 1141
},
{
"epoch": 0.7692825867295385,
"grad_norm": 0.5714917182922363,
"learning_rate": 2.3006521607395516e-06,
"loss": 0.3004,
"step": 1142
},
{
"epoch": 0.7699562142135399,
"grad_norm": 0.5020681619644165,
"learning_rate": 2.2879466367176393e-06,
"loss": 0.2477,
"step": 1143
},
{
"epoch": 0.7706298416975412,
"grad_norm": 0.5577126145362854,
"learning_rate": 2.275269975987998e-06,
"loss": 0.2691,
"step": 1144
},
{
"epoch": 0.7713034691815426,
"grad_norm": 0.481965035200119,
"learning_rate": 2.262622248751405e-06,
"loss": 0.2481,
"step": 1145
},
{
"epoch": 0.7719770966655439,
"grad_norm": 0.5549229979515076,
"learning_rate": 2.250003525048398e-06,
"loss": 0.2568,
"step": 1146
},
{
"epoch": 0.7726507241495453,
"grad_norm": 0.48651793599128723,
"learning_rate": 2.2374138747589086e-06,
"loss": 0.2255,
"step": 1147
},
{
"epoch": 0.7733243516335466,
"grad_norm": 0.5344985723495483,
"learning_rate": 2.224853367601858e-06,
"loss": 0.2485,
"step": 1148
},
{
"epoch": 0.773997979117548,
"grad_norm": 0.5281147360801697,
"learning_rate": 2.212322073134783e-06,
"loss": 0.2634,
"step": 1149
},
{
"epoch": 0.7746716066015493,
"grad_norm": 0.5449070930480957,
"learning_rate": 2.199820060753449e-06,
"loss": 0.2726,
"step": 1150
},
{
"epoch": 0.7753452340855507,
"grad_norm": 0.5562757253646851,
"learning_rate": 2.187347399691457e-06,
"loss": 0.2837,
"step": 1151
},
{
"epoch": 0.776018861569552,
"grad_norm": 0.5350236892700195,
"learning_rate": 2.1749041590198664e-06,
"loss": 0.2456,
"step": 1152
},
{
"epoch": 0.7766924890535534,
"grad_norm": 0.5433318614959717,
"learning_rate": 2.1624904076468215e-06,
"loss": 0.2465,
"step": 1153
},
{
"epoch": 0.7773661165375547,
"grad_norm": 0.5183177590370178,
"learning_rate": 2.1501062143171506e-06,
"loss": 0.245,
"step": 1154
},
{
"epoch": 0.7780397440215561,
"grad_norm": 0.5763646960258484,
"learning_rate": 2.137751647611997e-06,
"loss": 0.2403,
"step": 1155
},
{
"epoch": 0.7787133715055574,
"grad_norm": 0.47587332129478455,
"learning_rate": 2.125426775948446e-06,
"loss": 0.2331,
"step": 1156
},
{
"epoch": 0.7793869989895588,
"grad_norm": 0.4680344760417938,
"learning_rate": 2.113131667579127e-06,
"loss": 0.2246,
"step": 1157
},
{
"epoch": 0.7800606264735601,
"grad_norm": 0.5854060649871826,
"learning_rate": 2.1008663905918553e-06,
"loss": 0.3072,
"step": 1158
},
{
"epoch": 0.7807342539575615,
"grad_norm": 0.48982059955596924,
"learning_rate": 2.088631012909242e-06,
"loss": 0.2257,
"step": 1159
},
{
"epoch": 0.7814078814415628,
"grad_norm": 0.522206723690033,
"learning_rate": 2.0764256022883174e-06,
"loss": 0.2607,
"step": 1160
},
{
"epoch": 0.7820815089255642,
"grad_norm": 0.524056077003479,
"learning_rate": 2.0642502263201687e-06,
"loss": 0.2478,
"step": 1161
},
{
"epoch": 0.7827551364095655,
"grad_norm": 0.49944427609443665,
"learning_rate": 2.052104952429555e-06,
"loss": 0.2371,
"step": 1162
},
{
"epoch": 0.7834287638935669,
"grad_norm": 0.5648651719093323,
"learning_rate": 2.0399898478745307e-06,
"loss": 0.2341,
"step": 1163
},
{
"epoch": 0.7841023913775682,
"grad_norm": 0.5346094965934753,
"learning_rate": 2.027904979746088e-06,
"loss": 0.2694,
"step": 1164
},
{
"epoch": 0.7847760188615696,
"grad_norm": 0.5461552739143372,
"learning_rate": 2.0158504149677643e-06,
"loss": 0.2178,
"step": 1165
},
{
"epoch": 0.7854496463455709,
"grad_norm": 0.5188043713569641,
"learning_rate": 2.003826220295295e-06,
"loss": 0.2449,
"step": 1166
},
{
"epoch": 0.7861232738295723,
"grad_norm": 0.5148183703422546,
"learning_rate": 1.9918324623162253e-06,
"loss": 0.2381,
"step": 1167
},
{
"epoch": 0.7867969013135736,
"grad_norm": 0.550039529800415,
"learning_rate": 1.979869207449545e-06,
"loss": 0.2633,
"step": 1168
},
{
"epoch": 0.787470528797575,
"grad_norm": 0.5604125261306763,
"learning_rate": 1.9679365219453337e-06,
"loss": 0.2605,
"step": 1169
},
{
"epoch": 0.7881441562815763,
"grad_norm": 0.5368652939796448,
"learning_rate": 1.9560344718843746e-06,
"loss": 0.2725,
"step": 1170
},
{
"epoch": 0.7888177837655777,
"grad_norm": 0.5414903163909912,
"learning_rate": 1.9441631231778063e-06,
"loss": 0.2505,
"step": 1171
},
{
"epoch": 0.789491411249579,
"grad_norm": 0.4991462528705597,
"learning_rate": 1.932322541566743e-06,
"loss": 0.2455,
"step": 1172
},
{
"epoch": 0.7901650387335803,
"grad_norm": 0.4789801836013794,
"learning_rate": 1.920512792621917e-06,
"loss": 0.2252,
"step": 1173
},
{
"epoch": 0.7908386662175817,
"grad_norm": 0.572302520275116,
"learning_rate": 1.908733941743322e-06,
"loss": 0.2803,
"step": 1174
},
{
"epoch": 0.791512293701583,
"grad_norm": 0.5717800259590149,
"learning_rate": 1.8969860541598358e-06,
"loss": 0.2782,
"step": 1175
},
{
"epoch": 0.7921859211855844,
"grad_norm": 0.5871665477752686,
"learning_rate": 1.885269194928876e-06,
"loss": 0.2884,
"step": 1176
},
{
"epoch": 0.7928595486695857,
"grad_norm": 0.5179949402809143,
"learning_rate": 1.8735834289360281e-06,
"loss": 0.2484,
"step": 1177
},
{
"epoch": 0.7935331761535871,
"grad_norm": 0.5230392813682556,
"learning_rate": 1.8619288208946858e-06,
"loss": 0.244,
"step": 1178
},
{
"epoch": 0.7942068036375884,
"grad_norm": 0.4939616024494171,
"learning_rate": 1.850305435345704e-06,
"loss": 0.2202,
"step": 1179
},
{
"epoch": 0.7948804311215898,
"grad_norm": 0.5704658031463623,
"learning_rate": 1.8387133366570284e-06,
"loss": 0.2999,
"step": 1180
},
{
"epoch": 0.7955540586055911,
"grad_norm": 0.5412735342979431,
"learning_rate": 1.8271525890233412e-06,
"loss": 0.254,
"step": 1181
},
{
"epoch": 0.7962276860895925,
"grad_norm": 0.5522796511650085,
"learning_rate": 1.8156232564657204e-06,
"loss": 0.256,
"step": 1182
},
{
"epoch": 0.7969013135735938,
"grad_norm": 0.5172164440155029,
"learning_rate": 1.8041254028312604e-06,
"loss": 0.2408,
"step": 1183
},
{
"epoch": 0.7975749410575952,
"grad_norm": 0.5235007405281067,
"learning_rate": 1.792659091792742e-06,
"loss": 0.2455,
"step": 1184
},
{
"epoch": 0.7982485685415965,
"grad_norm": 0.5019668340682983,
"learning_rate": 1.781224386848265e-06,
"loss": 0.2212,
"step": 1185
},
{
"epoch": 0.7989221960255979,
"grad_norm": 0.5576637387275696,
"learning_rate": 1.7698213513208983e-06,
"loss": 0.2655,
"step": 1186
},
{
"epoch": 0.7995958235095992,
"grad_norm": 0.5962572693824768,
"learning_rate": 1.758450048358339e-06,
"loss": 0.2673,
"step": 1187
},
{
"epoch": 0.8002694509936006,
"grad_norm": 0.5175941586494446,
"learning_rate": 1.7471105409325507e-06,
"loss": 0.2609,
"step": 1188
},
{
"epoch": 0.8009430784776019,
"grad_norm": 0.5470423698425293,
"learning_rate": 1.7358028918394187e-06,
"loss": 0.2781,
"step": 1189
},
{
"epoch": 0.8016167059616033,
"grad_norm": 0.5484721660614014,
"learning_rate": 1.7245271636984072e-06,
"loss": 0.2503,
"step": 1190
},
{
"epoch": 0.8022903334456046,
"grad_norm": 0.5539147257804871,
"learning_rate": 1.7132834189522075e-06,
"loss": 0.2697,
"step": 1191
},
{
"epoch": 0.802963960929606,
"grad_norm": 0.5356603860855103,
"learning_rate": 1.7020717198663948e-06,
"loss": 0.2343,
"step": 1192
},
{
"epoch": 0.8036375884136073,
"grad_norm": 0.5115563273429871,
"learning_rate": 1.690892128529078e-06,
"loss": 0.2507,
"step": 1193
},
{
"epoch": 0.8043112158976087,
"grad_norm": 0.5782252550125122,
"learning_rate": 1.6797447068505604e-06,
"loss": 0.2993,
"step": 1194
},
{
"epoch": 0.8049848433816099,
"grad_norm": 0.4831259548664093,
"learning_rate": 1.6686295165630005e-06,
"loss": 0.2095,
"step": 1195
},
{
"epoch": 0.8056584708656113,
"grad_norm": 0.5409023761749268,
"learning_rate": 1.6575466192200609e-06,
"loss": 0.2591,
"step": 1196
},
{
"epoch": 0.8063320983496126,
"grad_norm": 0.510886013507843,
"learning_rate": 1.6464960761965773e-06,
"loss": 0.2221,
"step": 1197
},
{
"epoch": 0.807005725833614,
"grad_norm": 0.5136117935180664,
"learning_rate": 1.635477948688209e-06,
"loss": 0.2306,
"step": 1198
},
{
"epoch": 0.8076793533176153,
"grad_norm": 0.5284943580627441,
"learning_rate": 1.624492297711106e-06,
"loss": 0.2497,
"step": 1199
},
{
"epoch": 0.8083529808016167,
"grad_norm": 0.547935426235199,
"learning_rate": 1.6135391841015749e-06,
"loss": 0.237,
"step": 1200
},
{
"epoch": 0.8083529808016167,
"eval_loss": 0.24800407886505127,
"eval_runtime": 104.1907,
"eval_samples_per_second": 47.989,
"eval_steps_per_second": 3.004,
"step": 1200
},
{
"epoch": 0.809026608285618,
"grad_norm": 0.5304118990898132,
"learning_rate": 1.6026186685157299e-06,
"loss": 0.2637,
"step": 1201
},
{
"epoch": 0.8097002357696194,
"grad_norm": 0.5001785755157471,
"learning_rate": 1.591730811429165e-06,
"loss": 0.2512,
"step": 1202
},
{
"epoch": 0.8103738632536207,
"grad_norm": 0.5340592265129089,
"learning_rate": 1.5808756731366246e-06,
"loss": 0.2356,
"step": 1203
},
{
"epoch": 0.8110474907376221,
"grad_norm": 0.5753365755081177,
"learning_rate": 1.5700533137516538e-06,
"loss": 0.2411,
"step": 1204
},
{
"epoch": 0.8117211182216234,
"grad_norm": 0.5412161946296692,
"learning_rate": 1.559263793206282e-06,
"loss": 0.259,
"step": 1205
},
{
"epoch": 0.8123947457056248,
"grad_norm": 0.5372704267501831,
"learning_rate": 1.5485071712506836e-06,
"loss": 0.2583,
"step": 1206
},
{
"epoch": 0.8130683731896261,
"grad_norm": 0.5177714228630066,
"learning_rate": 1.5377835074528396e-06,
"loss": 0.2566,
"step": 1207
},
{
"epoch": 0.8137420006736275,
"grad_norm": 0.5761350989341736,
"learning_rate": 1.5270928611982252e-06,
"loss": 0.2748,
"step": 1208
},
{
"epoch": 0.8144156281576288,
"grad_norm": 0.4516087770462036,
"learning_rate": 1.5164352916894639e-06,
"loss": 0.2042,
"step": 1209
},
{
"epoch": 0.8150892556416302,
"grad_norm": 0.5373425483703613,
"learning_rate": 1.5058108579460117e-06,
"loss": 0.2473,
"step": 1210
},
{
"epoch": 0.8157628831256315,
"grad_norm": 0.5737300515174866,
"learning_rate": 1.4952196188038232e-06,
"loss": 0.2378,
"step": 1211
},
{
"epoch": 0.8164365106096328,
"grad_norm": 0.5578494071960449,
"learning_rate": 1.4846616329150252e-06,
"loss": 0.2455,
"step": 1212
},
{
"epoch": 0.8171101380936342,
"grad_norm": 0.5392588376998901,
"learning_rate": 1.4741369587476023e-06,
"loss": 0.2587,
"step": 1213
},
{
"epoch": 0.8177837655776355,
"grad_norm": 0.5129286050796509,
"learning_rate": 1.4636456545850584e-06,
"loss": 0.2269,
"step": 1214
},
{
"epoch": 0.8184573930616369,
"grad_norm": 0.5656298398971558,
"learning_rate": 1.4531877785261032e-06,
"loss": 0.25,
"step": 1215
},
{
"epoch": 0.8191310205456382,
"grad_norm": 0.5242322087287903,
"learning_rate": 1.4427633884843321e-06,
"loss": 0.2213,
"step": 1216
},
{
"epoch": 0.8198046480296396,
"grad_norm": 0.5498111248016357,
"learning_rate": 1.432372542187895e-06,
"loss": 0.2842,
"step": 1217
},
{
"epoch": 0.8204782755136409,
"grad_norm": 0.5031629204750061,
"learning_rate": 1.42201529717919e-06,
"loss": 0.2404,
"step": 1218
},
{
"epoch": 0.8211519029976423,
"grad_norm": 0.5014514327049255,
"learning_rate": 1.4116917108145318e-06,
"loss": 0.2447,
"step": 1219
},
{
"epoch": 0.8218255304816436,
"grad_norm": 0.6042701005935669,
"learning_rate": 1.4014018402638454e-06,
"loss": 0.2935,
"step": 1220
},
{
"epoch": 0.822499157965645,
"grad_norm": 0.4977283179759979,
"learning_rate": 1.3911457425103444e-06,
"loss": 0.2463,
"step": 1221
},
{
"epoch": 0.8231727854496463,
"grad_norm": 0.5383139252662659,
"learning_rate": 1.3809234743502109e-06,
"loss": 0.2432,
"step": 1222
},
{
"epoch": 0.8238464129336477,
"grad_norm": 0.4999409019947052,
"learning_rate": 1.3707350923922915e-06,
"loss": 0.2427,
"step": 1223
},
{
"epoch": 0.824520040417649,
"grad_norm": 0.5174874067306519,
"learning_rate": 1.3605806530577725e-06,
"loss": 0.2475,
"step": 1224
},
{
"epoch": 0.8251936679016504,
"grad_norm": 0.545793890953064,
"learning_rate": 1.3504602125798742e-06,
"loss": 0.26,
"step": 1225
},
{
"epoch": 0.8258672953856517,
"grad_norm": 0.5483867526054382,
"learning_rate": 1.340373827003543e-06,
"loss": 0.2454,
"step": 1226
},
{
"epoch": 0.8265409228696531,
"grad_norm": 0.5197309851646423,
"learning_rate": 1.3303215521851303e-06,
"loss": 0.2109,
"step": 1227
},
{
"epoch": 0.8272145503536544,
"grad_norm": 0.5258607864379883,
"learning_rate": 1.3203034437920889e-06,
"loss": 0.2473,
"step": 1228
},
{
"epoch": 0.8278881778376558,
"grad_norm": 0.5303134322166443,
"learning_rate": 1.3103195573026708e-06,
"loss": 0.2348,
"step": 1229
},
{
"epoch": 0.8285618053216571,
"grad_norm": 0.6092815399169922,
"learning_rate": 1.3003699480056073e-06,
"loss": 0.3257,
"step": 1230
},
{
"epoch": 0.8292354328056585,
"grad_norm": 0.5898075103759766,
"learning_rate": 1.2904546709998153e-06,
"loss": 0.2941,
"step": 1231
},
{
"epoch": 0.8299090602896598,
"grad_norm": 0.519887387752533,
"learning_rate": 1.2805737811940814e-06,
"loss": 0.2452,
"step": 1232
},
{
"epoch": 0.8305826877736612,
"grad_norm": 0.5605584979057312,
"learning_rate": 1.2707273333067675e-06,
"loss": 0.254,
"step": 1233
},
{
"epoch": 0.8312563152576625,
"grad_norm": 0.5284910202026367,
"learning_rate": 1.2609153818654983e-06,
"loss": 0.2709,
"step": 1234
},
{
"epoch": 0.8319299427416639,
"grad_norm": 0.5385324358940125,
"learning_rate": 1.2511379812068683e-06,
"loss": 0.2483,
"step": 1235
},
{
"epoch": 0.8326035702256652,
"grad_norm": 0.5575158596038818,
"learning_rate": 1.2413951854761364e-06,
"loss": 0.2434,
"step": 1236
},
{
"epoch": 0.8332771977096666,
"grad_norm": 0.4902471601963043,
"learning_rate": 1.231687048626925e-06,
"loss": 0.2183,
"step": 1237
},
{
"epoch": 0.8339508251936679,
"grad_norm": 0.4895704984664917,
"learning_rate": 1.22201362442092e-06,
"loss": 0.2555,
"step": 1238
},
{
"epoch": 0.8346244526776693,
"grad_norm": 0.5114362835884094,
"learning_rate": 1.2123749664275823e-06,
"loss": 0.2474,
"step": 1239
},
{
"epoch": 0.8352980801616706,
"grad_norm": 0.5052047371864319,
"learning_rate": 1.2027711280238396e-06,
"loss": 0.2158,
"step": 1240
},
{
"epoch": 0.835971707645672,
"grad_norm": 0.549039900302887,
"learning_rate": 1.1932021623937954e-06,
"loss": 0.2728,
"step": 1241
},
{
"epoch": 0.8366453351296733,
"grad_norm": 0.5076087117195129,
"learning_rate": 1.1836681225284401e-06,
"loss": 0.248,
"step": 1242
},
{
"epoch": 0.8373189626136747,
"grad_norm": 0.5593048930168152,
"learning_rate": 1.1741690612253455e-06,
"loss": 0.2778,
"step": 1243
},
{
"epoch": 0.837992590097676,
"grad_norm": 0.557877242565155,
"learning_rate": 1.1647050310883855e-06,
"loss": 0.2744,
"step": 1244
},
{
"epoch": 0.8386662175816774,
"grad_norm": 0.571789562702179,
"learning_rate": 1.155276084527435e-06,
"loss": 0.2623,
"step": 1245
},
{
"epoch": 0.8393398450656787,
"grad_norm": 0.532217800617218,
"learning_rate": 1.1458822737580804e-06,
"loss": 0.2604,
"step": 1246
},
{
"epoch": 0.8400134725496801,
"grad_norm": 0.488652765750885,
"learning_rate": 1.1365236508013396e-06,
"loss": 0.2302,
"step": 1247
},
{
"epoch": 0.8406871000336814,
"grad_norm": 0.48154351115226746,
"learning_rate": 1.1272002674833668e-06,
"loss": 0.2292,
"step": 1248
},
{
"epoch": 0.8413607275176828,
"grad_norm": 0.5416498184204102,
"learning_rate": 1.1179121754351587e-06,
"loss": 0.2675,
"step": 1249
},
{
"epoch": 0.8420343550016841,
"grad_norm": 0.5223404169082642,
"learning_rate": 1.1086594260922873e-06,
"loss": 0.2495,
"step": 1250
},
{
"epoch": 0.8427079824856855,
"grad_norm": 0.4795687198638916,
"learning_rate": 1.0994420706945922e-06,
"loss": 0.2405,
"step": 1251
},
{
"epoch": 0.8433816099696868,
"grad_norm": 0.528590202331543,
"learning_rate": 1.0902601602859192e-06,
"loss": 0.234,
"step": 1252
},
{
"epoch": 0.8440552374536882,
"grad_norm": 0.5488812327384949,
"learning_rate": 1.0811137457138195e-06,
"loss": 0.2309,
"step": 1253
},
{
"epoch": 0.8447288649376895,
"grad_norm": 0.48816734552383423,
"learning_rate": 1.0720028776292775e-06,
"loss": 0.2252,
"step": 1254
},
{
"epoch": 0.8454024924216909,
"grad_norm": 0.4672640860080719,
"learning_rate": 1.0629276064864315e-06,
"loss": 0.2241,
"step": 1255
},
{
"epoch": 0.8460761199056922,
"grad_norm": 0.6129331588745117,
"learning_rate": 1.053887982542286e-06,
"loss": 0.2933,
"step": 1256
},
{
"epoch": 0.8467497473896936,
"grad_norm": 0.4707486033439636,
"learning_rate": 1.0448840558564437e-06,
"loss": 0.2263,
"step": 1257
},
{
"epoch": 0.8474233748736949,
"grad_norm": 0.5238653421401978,
"learning_rate": 1.0359158762908206e-06,
"loss": 0.2251,
"step": 1258
},
{
"epoch": 0.8480970023576961,
"grad_norm": 0.5185546278953552,
"learning_rate": 1.0269834935093692e-06,
"loss": 0.2423,
"step": 1259
},
{
"epoch": 0.8487706298416975,
"grad_norm": 0.551475465297699,
"learning_rate": 1.0180869569778146e-06,
"loss": 0.252,
"step": 1260
},
{
"epoch": 0.8494442573256988,
"grad_norm": 0.5508469343185425,
"learning_rate": 1.0092263159633643e-06,
"loss": 0.2689,
"step": 1261
},
{
"epoch": 0.8501178848097002,
"grad_norm": 0.5417614579200745,
"learning_rate": 1.000401619534449e-06,
"loss": 0.2693,
"step": 1262
},
{
"epoch": 0.8507915122937015,
"grad_norm": 0.5418084859848022,
"learning_rate": 9.91612916560445e-07,
"loss": 0.2355,
"step": 1263
},
{
"epoch": 0.8514651397777029,
"grad_norm": 0.49578985571861267,
"learning_rate": 9.828602557114017e-07,
"loss": 0.2373,
"step": 1264
},
{
"epoch": 0.8521387672617042,
"grad_norm": 0.5037760734558105,
"learning_rate": 9.741436854577778e-07,
"loss": 0.2109,
"step": 1265
},
{
"epoch": 0.8528123947457056,
"grad_norm": 0.5789549946784973,
"learning_rate": 9.654632540701663e-07,
"loss": 0.2314,
"step": 1266
},
{
"epoch": 0.8534860222297069,
"grad_norm": 0.5235623717308044,
"learning_rate": 9.568190096190321e-07,
"loss": 0.2648,
"step": 1267
},
{
"epoch": 0.8541596497137083,
"grad_norm": 0.5276992917060852,
"learning_rate": 9.482109999744456e-07,
"loss": 0.2422,
"step": 1268
},
{
"epoch": 0.8548332771977096,
"grad_norm": 0.5321447253227234,
"learning_rate": 9.396392728058129e-07,
"loss": 0.2257,
"step": 1269
},
{
"epoch": 0.855506904681711,
"grad_norm": 0.5687943696975708,
"learning_rate": 9.311038755816187e-07,
"loss": 0.2534,
"step": 1270
},
{
"epoch": 0.8561805321657123,
"grad_norm": 0.5804548263549805,
"learning_rate": 9.226048555691583e-07,
"loss": 0.2888,
"step": 1271
},
{
"epoch": 0.8568541596497137,
"grad_norm": 0.603203535079956,
"learning_rate": 9.141422598342745e-07,
"loss": 0.2695,
"step": 1272
},
{
"epoch": 0.857527787133715,
"grad_norm": 0.49607640504837036,
"learning_rate": 9.057161352411055e-07,
"loss": 0.2329,
"step": 1273
},
{
"epoch": 0.8582014146177164,
"grad_norm": 0.5365235209465027,
"learning_rate": 8.973265284518168e-07,
"loss": 0.2576,
"step": 1274
},
{
"epoch": 0.8588750421017177,
"grad_norm": 0.4797859191894531,
"learning_rate": 8.889734859263429e-07,
"loss": 0.2337,
"step": 1275
},
{
"epoch": 0.8595486695857191,
"grad_norm": 0.5661630034446716,
"learning_rate": 8.806570539221378e-07,
"loss": 0.2612,
"step": 1276
},
{
"epoch": 0.8602222970697204,
"grad_norm": 0.5369821190834045,
"learning_rate": 8.723772784939132e-07,
"loss": 0.2509,
"step": 1277
},
{
"epoch": 0.8608959245537218,
"grad_norm": 0.6320977807044983,
"learning_rate": 8.641342054933799e-07,
"loss": 0.2837,
"step": 1278
},
{
"epoch": 0.8615695520377231,
"grad_norm": 0.605116069316864,
"learning_rate": 8.559278805690027e-07,
"loss": 0.3332,
"step": 1279
},
{
"epoch": 0.8622431795217245,
"grad_norm": 0.5639594793319702,
"learning_rate": 8.477583491657404e-07,
"loss": 0.267,
"step": 1280
},
{
"epoch": 0.8629168070057258,
"grad_norm": 0.5377295613288879,
"learning_rate": 8.396256565247987e-07,
"loss": 0.2624,
"step": 1281
},
{
"epoch": 0.8635904344897272,
"grad_norm": 0.5511677265167236,
"learning_rate": 8.315298476833749e-07,
"loss": 0.2253,
"step": 1282
},
{
"epoch": 0.8642640619737285,
"grad_norm": 0.5488938093185425,
"learning_rate": 8.234709674744156e-07,
"loss": 0.2745,
"step": 1283
},
{
"epoch": 0.8649376894577299,
"grad_norm": 0.538172721862793,
"learning_rate": 8.154490605263592e-07,
"loss": 0.2664,
"step": 1284
},
{
"epoch": 0.8656113169417312,
"grad_norm": 0.46900901198387146,
"learning_rate": 8.074641712628963e-07,
"loss": 0.2223,
"step": 1285
},
{
"epoch": 0.8662849444257326,
"grad_norm": 0.5262213349342346,
"learning_rate": 7.995163439027223e-07,
"loss": 0.2444,
"step": 1286
},
{
"epoch": 0.8669585719097339,
"grad_norm": 0.5740031003952026,
"learning_rate": 7.916056224592899e-07,
"loss": 0.3013,
"step": 1287
},
{
"epoch": 0.8676321993937353,
"grad_norm": 0.48303380608558655,
"learning_rate": 7.837320507405633e-07,
"loss": 0.2352,
"step": 1288
},
{
"epoch": 0.8683058268777366,
"grad_norm": 0.5323200821876526,
"learning_rate": 7.758956723487872e-07,
"loss": 0.2453,
"step": 1289
},
{
"epoch": 0.868979454361738,
"grad_norm": 0.5278131365776062,
"learning_rate": 7.680965306802288e-07,
"loss": 0.227,
"step": 1290
},
{
"epoch": 0.8696530818457393,
"grad_norm": 0.4973738491535187,
"learning_rate": 7.603346689249515e-07,
"loss": 0.2294,
"step": 1291
},
{
"epoch": 0.8703267093297407,
"grad_norm": 0.4885808825492859,
"learning_rate": 7.526101300665692e-07,
"loss": 0.2251,
"step": 1292
},
{
"epoch": 0.871000336813742,
"grad_norm": 0.5182288289070129,
"learning_rate": 7.44922956882006e-07,
"loss": 0.2279,
"step": 1293
},
{
"epoch": 0.8716739642977434,
"grad_norm": 0.6276636123657227,
"learning_rate": 7.37273191941267e-07,
"loss": 0.2574,
"step": 1294
},
{
"epoch": 0.8723475917817447,
"grad_norm": 0.514959454536438,
"learning_rate": 7.296608776071931e-07,
"loss": 0.2344,
"step": 1295
},
{
"epoch": 0.873021219265746,
"grad_norm": 0.5535395741462708,
"learning_rate": 7.220860560352365e-07,
"loss": 0.2702,
"step": 1296
},
{
"epoch": 0.8736948467497474,
"grad_norm": 0.538652241230011,
"learning_rate": 7.145487691732194e-07,
"loss": 0.2414,
"step": 1297
},
{
"epoch": 0.8743684742337487,
"grad_norm": 0.4960603415966034,
"learning_rate": 7.070490587611014e-07,
"loss": 0.2188,
"step": 1298
},
{
"epoch": 0.8750421017177501,
"grad_norm": 0.6888505220413208,
"learning_rate": 6.995869663307588e-07,
"loss": 0.2467,
"step": 1299
},
{
"epoch": 0.8757157292017514,
"grad_norm": 0.5374876856803894,
"learning_rate": 6.921625332057413e-07,
"loss": 0.2615,
"step": 1300
},
{
"epoch": 0.8757157292017514,
"eval_loss": 0.24685746431350708,
"eval_runtime": 105.528,
"eval_samples_per_second": 47.381,
"eval_steps_per_second": 2.966,
"step": 1300
},
{
"epoch": 0.8763893566857528,
"grad_norm": 0.5609838962554932,
"learning_rate": 6.847758005010493e-07,
"loss": 0.2512,
"step": 1301
},
{
"epoch": 0.8770629841697541,
"grad_norm": 0.5215060114860535,
"learning_rate": 6.774268091229097e-07,
"loss": 0.2401,
"step": 1302
},
{
"epoch": 0.8777366116537555,
"grad_norm": 0.5238656401634216,
"learning_rate": 6.701155997685413e-07,
"loss": 0.2291,
"step": 1303
},
{
"epoch": 0.8784102391377568,
"grad_norm": 0.5399895906448364,
"learning_rate": 6.628422129259371e-07,
"loss": 0.2594,
"step": 1304
},
{
"epoch": 0.8790838666217582,
"grad_norm": 0.5661953687667847,
"learning_rate": 6.556066888736334e-07,
"loss": 0.2781,
"step": 1305
},
{
"epoch": 0.8797574941057595,
"grad_norm": 0.5317832827568054,
"learning_rate": 6.484090676804927e-07,
"loss": 0.2365,
"step": 1306
},
{
"epoch": 0.8804311215897609,
"grad_norm": 0.5083217024803162,
"learning_rate": 6.412493892054802e-07,
"loss": 0.251,
"step": 1307
},
{
"epoch": 0.8811047490737622,
"grad_norm": 0.5453861951828003,
"learning_rate": 6.341276930974377e-07,
"loss": 0.2472,
"step": 1308
},
{
"epoch": 0.8817783765577636,
"grad_norm": 0.524014949798584,
"learning_rate": 6.270440187948734e-07,
"loss": 0.2392,
"step": 1309
},
{
"epoch": 0.8824520040417649,
"grad_norm": 0.5500375628471375,
"learning_rate": 6.19998405525734e-07,
"loss": 0.2429,
"step": 1310
},
{
"epoch": 0.8831256315257663,
"grad_norm": 0.4858246445655823,
"learning_rate": 6.129908923071933e-07,
"loss": 0.2301,
"step": 1311
},
{
"epoch": 0.8837992590097676,
"grad_norm": 0.524882972240448,
"learning_rate": 6.060215179454379e-07,
"loss": 0.265,
"step": 1312
},
{
"epoch": 0.884472886493769,
"grad_norm": 0.47017255425453186,
"learning_rate": 5.990903210354456e-07,
"loss": 0.2178,
"step": 1313
},
{
"epoch": 0.8851465139777703,
"grad_norm": 0.5531392097473145,
"learning_rate": 5.921973399607738e-07,
"loss": 0.2613,
"step": 1314
},
{
"epoch": 0.8858201414617717,
"grad_norm": 0.5758329033851624,
"learning_rate": 5.853426128933548e-07,
"loss": 0.2408,
"step": 1315
},
{
"epoch": 0.886493768945773,
"grad_norm": 0.5558485984802246,
"learning_rate": 5.78526177793271e-07,
"loss": 0.271,
"step": 1316
},
{
"epoch": 0.8871673964297744,
"grad_norm": 0.5643607974052429,
"learning_rate": 5.717480724085564e-07,
"loss": 0.2524,
"step": 1317
},
{
"epoch": 0.8878410239137757,
"grad_norm": 0.5513696670532227,
"learning_rate": 5.650083342749796e-07,
"loss": 0.271,
"step": 1318
},
{
"epoch": 0.8885146513977771,
"grad_norm": 0.5440685749053955,
"learning_rate": 5.583070007158425e-07,
"loss": 0.2397,
"step": 1319
},
{
"epoch": 0.8891882788817784,
"grad_norm": 0.5702263712882996,
"learning_rate": 5.516441088417665e-07,
"loss": 0.2512,
"step": 1320
},
{
"epoch": 0.8898619063657798,
"grad_norm": 0.5015043020248413,
"learning_rate": 5.450196955504946e-07,
"loss": 0.2414,
"step": 1321
},
{
"epoch": 0.8905355338497811,
"grad_norm": 0.5015976428985596,
"learning_rate": 5.384337975266789e-07,
"loss": 0.2394,
"step": 1322
},
{
"epoch": 0.8912091613337825,
"grad_norm": 0.5432824492454529,
"learning_rate": 5.318864512416871e-07,
"loss": 0.2451,
"step": 1323
},
{
"epoch": 0.8918827888177837,
"grad_norm": 0.550553023815155,
"learning_rate": 5.253776929533898e-07,
"loss": 0.229,
"step": 1324
},
{
"epoch": 0.8925564163017851,
"grad_norm": 0.49185919761657715,
"learning_rate": 5.1890755870597e-07,
"loss": 0.2408,
"step": 1325
},
{
"epoch": 0.8932300437857864,
"grad_norm": 0.5123654007911682,
"learning_rate": 5.124760843297144e-07,
"loss": 0.2529,
"step": 1326
},
{
"epoch": 0.8939036712697878,
"grad_norm": 0.5548482537269592,
"learning_rate": 5.060833054408206e-07,
"loss": 0.2219,
"step": 1327
},
{
"epoch": 0.8945772987537891,
"grad_norm": 0.5485543608665466,
"learning_rate": 4.997292574412019e-07,
"loss": 0.281,
"step": 1328
},
{
"epoch": 0.8952509262377905,
"grad_norm": 0.49521514773368835,
"learning_rate": 4.934139755182801e-07,
"loss": 0.2481,
"step": 1329
},
{
"epoch": 0.8959245537217918,
"grad_norm": 0.5502017140388489,
"learning_rate": 4.871374946448077e-07,
"loss": 0.2141,
"step": 1330
},
{
"epoch": 0.8965981812057932,
"grad_norm": 0.5505282878875732,
"learning_rate": 4.808998495786577e-07,
"loss": 0.2761,
"step": 1331
},
{
"epoch": 0.8972718086897945,
"grad_norm": 0.5089839696884155,
"learning_rate": 4.747010748626404e-07,
"loss": 0.2086,
"step": 1332
},
{
"epoch": 0.8979454361737959,
"grad_norm": 0.556952953338623,
"learning_rate": 4.685412048243118e-07,
"loss": 0.2672,
"step": 1333
},
{
"epoch": 0.8986190636577972,
"grad_norm": 0.5195740461349487,
"learning_rate": 4.6242027357577903e-07,
"loss": 0.2466,
"step": 1334
},
{
"epoch": 0.8992926911417986,
"grad_norm": 0.528290867805481,
"learning_rate": 4.5633831501351616e-07,
"loss": 0.24,
"step": 1335
},
{
"epoch": 0.8999663186257999,
"grad_norm": 0.5216066241264343,
"learning_rate": 4.5029536281817386e-07,
"loss": 0.2312,
"step": 1336
},
{
"epoch": 0.9006399461098012,
"grad_norm": 0.473034143447876,
"learning_rate": 4.442914504543924e-07,
"loss": 0.2353,
"step": 1337
},
{
"epoch": 0.9013135735938026,
"grad_norm": 0.5582137703895569,
"learning_rate": 4.3832661117061993e-07,
"loss": 0.2657,
"step": 1338
},
{
"epoch": 0.901987201077804,
"grad_norm": 0.5023413300514221,
"learning_rate": 4.3240087799892357e-07,
"loss": 0.2335,
"step": 1339
},
{
"epoch": 0.9026608285618053,
"grad_norm": 0.5657601952552795,
"learning_rate": 4.2651428375480694e-07,
"loss": 0.2517,
"step": 1340
},
{
"epoch": 0.9033344560458066,
"grad_norm": 0.6110347509384155,
"learning_rate": 4.206668610370362e-07,
"loss": 0.2873,
"step": 1341
},
{
"epoch": 0.904008083529808,
"grad_norm": 0.5331605076789856,
"learning_rate": 4.14858642227447e-07,
"loss": 0.2316,
"step": 1342
},
{
"epoch": 0.9046817110138093,
"grad_norm": 0.5356966853141785,
"learning_rate": 4.090896594907767e-07,
"loss": 0.2124,
"step": 1343
},
{
"epoch": 0.9053553384978107,
"grad_norm": 0.5137231349945068,
"learning_rate": 4.033599447744785e-07,
"loss": 0.2475,
"step": 1344
},
{
"epoch": 0.906028965981812,
"grad_norm": 0.5301010012626648,
"learning_rate": 3.9766952980854755e-07,
"loss": 0.2319,
"step": 1345
},
{
"epoch": 0.9067025934658134,
"grad_norm": 0.5305171012878418,
"learning_rate": 3.9201844610534667e-07,
"loss": 0.2307,
"step": 1346
},
{
"epoch": 0.9073762209498147,
"grad_norm": 0.5359605550765991,
"learning_rate": 3.8640672495942777e-07,
"loss": 0.2406,
"step": 1347
},
{
"epoch": 0.9080498484338161,
"grad_norm": 0.534532904624939,
"learning_rate": 3.8083439744736296e-07,
"loss": 0.263,
"step": 1348
},
{
"epoch": 0.9087234759178174,
"grad_norm": 0.5522202253341675,
"learning_rate": 3.75301494427569e-07,
"loss": 0.257,
"step": 1349
},
{
"epoch": 0.9093971034018188,
"grad_norm": 0.5170401334762573,
"learning_rate": 3.6980804654013794e-07,
"loss": 0.2534,
"step": 1350
},
{
"epoch": 0.9100707308858201,
"grad_norm": 0.542862594127655,
"learning_rate": 3.643540842066692e-07,
"loss": 0.2502,
"step": 1351
},
{
"epoch": 0.9107443583698215,
"grad_norm": 0.5424035787582397,
"learning_rate": 3.5893963763009713e-07,
"loss": 0.2531,
"step": 1352
},
{
"epoch": 0.9114179858538228,
"grad_norm": 0.517169177532196,
"learning_rate": 3.5356473679452524e-07,
"loss": 0.2209,
"step": 1353
},
{
"epoch": 0.9120916133378242,
"grad_norm": 0.536840558052063,
"learning_rate": 3.482294114650639e-07,
"loss": 0.2681,
"step": 1354
},
{
"epoch": 0.9127652408218255,
"grad_norm": 0.6323632001876831,
"learning_rate": 3.4293369118765794e-07,
"loss": 0.3221,
"step": 1355
},
{
"epoch": 0.9134388683058269,
"grad_norm": 0.5508329272270203,
"learning_rate": 3.3767760528893356e-07,
"loss": 0.2675,
"step": 1356
},
{
"epoch": 0.9141124957898282,
"grad_norm": 0.5150956511497498,
"learning_rate": 3.324611828760241e-07,
"loss": 0.2383,
"step": 1357
},
{
"epoch": 0.9147861232738296,
"grad_norm": 0.5284437537193298,
"learning_rate": 3.272844528364161e-07,
"loss": 0.2326,
"step": 1358
},
{
"epoch": 0.9154597507578309,
"grad_norm": 0.4829590618610382,
"learning_rate": 3.221474438377903e-07,
"loss": 0.2247,
"step": 1359
},
{
"epoch": 0.9161333782418323,
"grad_norm": 0.5343140363693237,
"learning_rate": 3.1705018432785673e-07,
"loss": 0.2778,
"step": 1360
},
{
"epoch": 0.9168070057258336,
"grad_norm": 0.5202133655548096,
"learning_rate": 3.1199270253420397e-07,
"loss": 0.262,
"step": 1361
},
{
"epoch": 0.917480633209835,
"grad_norm": 0.6042024493217468,
"learning_rate": 3.069750264641369e-07,
"loss": 0.3138,
"step": 1362
},
{
"epoch": 0.9181542606938363,
"grad_norm": 0.4585079550743103,
"learning_rate": 3.0199718390452825e-07,
"loss": 0.1988,
"step": 1363
},
{
"epoch": 0.9188278881778377,
"grad_norm": 0.5287424921989441,
"learning_rate": 2.9705920242165565e-07,
"loss": 0.2417,
"step": 1364
},
{
"epoch": 0.919501515661839,
"grad_norm": 0.5599202513694763,
"learning_rate": 2.9216110936105906e-07,
"loss": 0.2709,
"step": 1365
},
{
"epoch": 0.9201751431458404,
"grad_norm": 0.5495063066482544,
"learning_rate": 2.8730293184738105e-07,
"loss": 0.2546,
"step": 1366
},
{
"epoch": 0.9208487706298417,
"grad_norm": 0.529340386390686,
"learning_rate": 2.8248469678422346e-07,
"loss": 0.2454,
"step": 1367
},
{
"epoch": 0.9215223981138431,
"grad_norm": 0.5447036027908325,
"learning_rate": 2.7770643085399004e-07,
"loss": 0.2703,
"step": 1368
},
{
"epoch": 0.9221960255978444,
"grad_norm": 0.5069358348846436,
"learning_rate": 2.729681605177492e-07,
"loss": 0.2529,
"step": 1369
},
{
"epoch": 0.9228696530818458,
"grad_norm": 0.5129917860031128,
"learning_rate": 2.6826991201507724e-07,
"loss": 0.2237,
"step": 1370
},
{
"epoch": 0.9235432805658471,
"grad_norm": 0.5351532697677612,
"learning_rate": 2.636117113639194e-07,
"loss": 0.2592,
"step": 1371
},
{
"epoch": 0.9242169080498485,
"grad_norm": 0.5014567375183105,
"learning_rate": 2.589935843604452e-07,
"loss": 0.2112,
"step": 1372
},
{
"epoch": 0.9248905355338498,
"grad_norm": 0.5409959554672241,
"learning_rate": 2.54415556578903e-07,
"loss": 0.259,
"step": 1373
},
{
"epoch": 0.9255641630178512,
"grad_norm": 0.547943115234375,
"learning_rate": 2.4987765337148e-07,
"loss": 0.262,
"step": 1374
},
{
"epoch": 0.9262377905018525,
"grad_norm": 0.5011503100395203,
"learning_rate": 2.453798998681625e-07,
"loss": 0.2436,
"step": 1375
},
{
"epoch": 0.9269114179858539,
"grad_norm": 0.5170352458953857,
"learning_rate": 2.4092232097659486e-07,
"loss": 0.2529,
"step": 1376
},
{
"epoch": 0.9275850454698552,
"grad_norm": 0.5595082640647888,
"learning_rate": 2.3650494138194257e-07,
"loss": 0.2843,
"step": 1377
},
{
"epoch": 0.9282586729538566,
"grad_norm": 0.5302537679672241,
"learning_rate": 2.3212778554675766e-07,
"loss": 0.2382,
"step": 1378
},
{
"epoch": 0.9289323004378579,
"grad_norm": 0.5083282589912415,
"learning_rate": 2.277908777108387e-07,
"loss": 0.2587,
"step": 1379
},
{
"epoch": 0.9296059279218593,
"grad_norm": 0.512374222278595,
"learning_rate": 2.2349424189109984e-07,
"loss": 0.2337,
"step": 1380
},
{
"epoch": 0.9302795554058606,
"grad_norm": 0.5485064387321472,
"learning_rate": 2.192379018814372e-07,
"loss": 0.263,
"step": 1381
},
{
"epoch": 0.930953182889862,
"grad_norm": 0.5666427612304688,
"learning_rate": 2.150218812525953e-07,
"loss": 0.2683,
"step": 1382
},
{
"epoch": 0.9316268103738633,
"grad_norm": 0.5534345507621765,
"learning_rate": 2.1084620335204225e-07,
"loss": 0.2069,
"step": 1383
},
{
"epoch": 0.9323004378578647,
"grad_norm": 0.5635040402412415,
"learning_rate": 2.0671089130383152e-07,
"loss": 0.3081,
"step": 1384
},
{
"epoch": 0.932974065341866,
"grad_norm": 0.5579578280448914,
"learning_rate": 2.0261596800848132e-07,
"loss": 0.2694,
"step": 1385
},
{
"epoch": 0.9336476928258673,
"grad_norm": 0.5865366458892822,
"learning_rate": 1.9856145614284616e-07,
"loss": 0.261,
"step": 1386
},
{
"epoch": 0.9343213203098687,
"grad_norm": 0.5374801754951477,
"learning_rate": 1.9454737815998546e-07,
"loss": 0.258,
"step": 1387
},
{
"epoch": 0.9349949477938699,
"grad_norm": 0.4848352372646332,
"learning_rate": 1.9057375628905112e-07,
"loss": 0.2121,
"step": 1388
},
{
"epoch": 0.9356685752778713,
"grad_norm": 0.5470321774482727,
"learning_rate": 1.8664061253514997e-07,
"loss": 0.2556,
"step": 1389
},
{
"epoch": 0.9363422027618726,
"grad_norm": 0.4898010194301605,
"learning_rate": 1.8274796867923578e-07,
"loss": 0.2436,
"step": 1390
},
{
"epoch": 0.937015830245874,
"grad_norm": 0.5198950171470642,
"learning_rate": 1.788958462779766e-07,
"loss": 0.2523,
"step": 1391
},
{
"epoch": 0.9376894577298753,
"grad_norm": 0.5553786754608154,
"learning_rate": 1.750842666636443e-07,
"loss": 0.2764,
"step": 1392
},
{
"epoch": 0.9383630852138767,
"grad_norm": 0.5620520114898682,
"learning_rate": 1.7131325094399352e-07,
"loss": 0.249,
"step": 1393
},
{
"epoch": 0.939036712697878,
"grad_norm": 0.5561796426773071,
"learning_rate": 1.6758282000214202e-07,
"loss": 0.2581,
"step": 1394
},
{
"epoch": 0.9397103401818794,
"grad_norm": 0.5772345066070557,
"learning_rate": 1.6389299449645734e-07,
"loss": 0.2718,
"step": 1395
},
{
"epoch": 0.9403839676658807,
"grad_norm": 0.5675050020217896,
"learning_rate": 1.6024379486044517e-07,
"loss": 0.2668,
"step": 1396
},
{
"epoch": 0.9410575951498821,
"grad_norm": 0.517271876335144,
"learning_rate": 1.5663524130262867e-07,
"loss": 0.2287,
"step": 1397
},
{
"epoch": 0.9417312226338834,
"grad_norm": 0.5633882284164429,
"learning_rate": 1.5306735380644698e-07,
"loss": 0.2676,
"step": 1398
},
{
"epoch": 0.9424048501178848,
"grad_norm": 0.5311648845672607,
"learning_rate": 1.4954015213013427e-07,
"loss": 0.2269,
"step": 1399
},
{
"epoch": 0.9430784776018861,
"grad_norm": 0.4805348813533783,
"learning_rate": 1.4605365580661668e-07,
"loss": 0.2116,
"step": 1400
},
{
"epoch": 0.9430784776018861,
"eval_loss": 0.24632865190505981,
"eval_runtime": 107.1175,
"eval_samples_per_second": 46.678,
"eval_steps_per_second": 2.922,
"step": 1400
}
],
"logging_steps": 1,
"max_steps": 1484,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.577960594384093e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}