{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1858,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005382131324004305,
"grad_norm": 24.46331800159023,
"learning_rate": 5.376344086021506e-08,
"loss": 1.3638,
"step": 1
},
{
"epoch": 0.002691065662002153,
"grad_norm": 24.232442902094746,
"learning_rate": 2.688172043010753e-07,
"loss": 1.3692,
"step": 5
},
{
"epoch": 0.005382131324004306,
"grad_norm": 14.752612983186452,
"learning_rate": 5.376344086021506e-07,
"loss": 1.3038,
"step": 10
},
{
"epoch": 0.008073196986006458,
"grad_norm": 11.800751606437952,
"learning_rate": 8.064516129032258e-07,
"loss": 1.166,
"step": 15
},
{
"epoch": 0.010764262648008612,
"grad_norm": 9.554957230427643,
"learning_rate": 1.0752688172043011e-06,
"loss": 1.0338,
"step": 20
},
{
"epoch": 0.013455328310010764,
"grad_norm": 3.628600831390835,
"learning_rate": 1.3440860215053765e-06,
"loss": 0.9317,
"step": 25
},
{
"epoch": 0.016146393972012917,
"grad_norm": 3.2255977477470297,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.9012,
"step": 30
},
{
"epoch": 0.01883745963401507,
"grad_norm": 3.0289714775630516,
"learning_rate": 1.881720430107527e-06,
"loss": 0.8827,
"step": 35
},
{
"epoch": 0.021528525296017224,
"grad_norm": 3.1823522788295704,
"learning_rate": 2.1505376344086023e-06,
"loss": 0.8644,
"step": 40
},
{
"epoch": 0.024219590958019375,
"grad_norm": 2.8342525430348315,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.8452,
"step": 45
},
{
"epoch": 0.02691065662002153,
"grad_norm": 3.052168326380914,
"learning_rate": 2.688172043010753e-06,
"loss": 0.8347,
"step": 50
},
{
"epoch": 0.029601722282023683,
"grad_norm": 2.8700531898378,
"learning_rate": 2.9569892473118283e-06,
"loss": 0.8117,
"step": 55
},
{
"epoch": 0.03229278794402583,
"grad_norm": 2.9876737446045385,
"learning_rate": 3.225806451612903e-06,
"loss": 0.815,
"step": 60
},
{
"epoch": 0.03498385360602799,
"grad_norm": 3.0224919131706023,
"learning_rate": 3.494623655913979e-06,
"loss": 0.7953,
"step": 65
},
{
"epoch": 0.03767491926803014,
"grad_norm": 3.0473808806620517,
"learning_rate": 3.763440860215054e-06,
"loss": 0.7859,
"step": 70
},
{
"epoch": 0.040365984930032295,
"grad_norm": 3.096384352129974,
"learning_rate": 4.032258064516129e-06,
"loss": 0.7971,
"step": 75
},
{
"epoch": 0.04305705059203445,
"grad_norm": 3.1948521945715074,
"learning_rate": 4.3010752688172045e-06,
"loss": 0.7941,
"step": 80
},
{
"epoch": 0.045748116254036596,
"grad_norm": 3.0704452147023473,
"learning_rate": 4.56989247311828e-06,
"loss": 0.7655,
"step": 85
},
{
"epoch": 0.04843918191603875,
"grad_norm": 3.213509560134052,
"learning_rate": 4.838709677419355e-06,
"loss": 0.7705,
"step": 90
},
{
"epoch": 0.051130247578040904,
"grad_norm": 3.551334383153254,
"learning_rate": 5.1075268817204305e-06,
"loss": 0.7731,
"step": 95
},
{
"epoch": 0.05382131324004306,
"grad_norm": 3.3115985648508595,
"learning_rate": 5.376344086021506e-06,
"loss": 0.7421,
"step": 100
},
{
"epoch": 0.05651237890204521,
"grad_norm": 3.024863181759173,
"learning_rate": 5.645161290322582e-06,
"loss": 0.7446,
"step": 105
},
{
"epoch": 0.059203444564047365,
"grad_norm": 3.003509243588486,
"learning_rate": 5.9139784946236566e-06,
"loss": 0.7168,
"step": 110
},
{
"epoch": 0.06189451022604951,
"grad_norm": 2.8904939122926785,
"learning_rate": 6.182795698924732e-06,
"loss": 0.7226,
"step": 115
},
{
"epoch": 0.06458557588805167,
"grad_norm": 3.0907313428716585,
"learning_rate": 6.451612903225806e-06,
"loss": 0.7287,
"step": 120
},
{
"epoch": 0.06727664155005382,
"grad_norm": 2.908190969126188,
"learning_rate": 6.720430107526882e-06,
"loss": 0.7362,
"step": 125
},
{
"epoch": 0.06996770721205597,
"grad_norm": 3.0277469501250267,
"learning_rate": 6.989247311827958e-06,
"loss": 0.7127,
"step": 130
},
{
"epoch": 0.07265877287405813,
"grad_norm": 2.9375347724298946,
"learning_rate": 7.258064516129033e-06,
"loss": 0.7057,
"step": 135
},
{
"epoch": 0.07534983853606028,
"grad_norm": 2.7968808634431097,
"learning_rate": 7.526881720430108e-06,
"loss": 0.7212,
"step": 140
},
{
"epoch": 0.07804090419806244,
"grad_norm": 3.0782635414890342,
"learning_rate": 7.795698924731183e-06,
"loss": 0.7121,
"step": 145
},
{
"epoch": 0.08073196986006459,
"grad_norm": 2.957567985639813,
"learning_rate": 8.064516129032258e-06,
"loss": 0.7285,
"step": 150
},
{
"epoch": 0.08342303552206674,
"grad_norm": 2.8558517093443045,
"learning_rate": 8.333333333333334e-06,
"loss": 0.7182,
"step": 155
},
{
"epoch": 0.0861141011840689,
"grad_norm": 2.875223992789043,
"learning_rate": 8.602150537634409e-06,
"loss": 0.7179,
"step": 160
},
{
"epoch": 0.08880516684607104,
"grad_norm": 3.0182318773591517,
"learning_rate": 8.870967741935484e-06,
"loss": 0.702,
"step": 165
},
{
"epoch": 0.09149623250807319,
"grad_norm": 2.7804594099140014,
"learning_rate": 9.13978494623656e-06,
"loss": 0.7105,
"step": 170
},
{
"epoch": 0.09418729817007535,
"grad_norm": 2.8174567099055157,
"learning_rate": 9.408602150537635e-06,
"loss": 0.7095,
"step": 175
},
{
"epoch": 0.0968783638320775,
"grad_norm": 2.803920967572859,
"learning_rate": 9.67741935483871e-06,
"loss": 0.7048,
"step": 180
},
{
"epoch": 0.09956942949407965,
"grad_norm": 2.7583241887420917,
"learning_rate": 9.946236559139786e-06,
"loss": 0.7161,
"step": 185
},
{
"epoch": 0.10226049515608181,
"grad_norm": 2.9304582816909286,
"learning_rate": 9.999858783596665e-06,
"loss": 0.7136,
"step": 190
},
{
"epoch": 0.10495156081808396,
"grad_norm": 2.8592073188711886,
"learning_rate": 9.999285105629308e-06,
"loss": 0.7213,
"step": 195
},
{
"epoch": 0.10764262648008611,
"grad_norm": 2.6445409308883256,
"learning_rate": 9.998270190666602e-06,
"loss": 0.71,
"step": 200
},
{
"epoch": 0.11033369214208827,
"grad_norm": 2.811803987053168,
"learning_rate": 9.99681412828496e-06,
"loss": 0.6946,
"step": 205
},
{
"epoch": 0.11302475780409042,
"grad_norm": 2.60382159101248,
"learning_rate": 9.994917046996472e-06,
"loss": 0.6824,
"step": 210
},
{
"epoch": 0.11571582346609258,
"grad_norm": 2.7632251330599327,
"learning_rate": 9.99257911423757e-06,
"loss": 0.712,
"step": 215
},
{
"epoch": 0.11840688912809473,
"grad_norm": 2.737121694120479,
"learning_rate": 9.989800536354243e-06,
"loss": 0.705,
"step": 220
},
{
"epoch": 0.12109795479009688,
"grad_norm": 2.7950967614991105,
"learning_rate": 9.986581558583824e-06,
"loss": 0.6785,
"step": 225
},
{
"epoch": 0.12378902045209902,
"grad_norm": 2.580895787265613,
"learning_rate": 9.98292246503335e-06,
"loss": 0.6921,
"step": 230
},
{
"epoch": 0.1264800861141012,
"grad_norm": 2.740755897744941,
"learning_rate": 9.978823578654486e-06,
"loss": 0.7008,
"step": 235
},
{
"epoch": 0.12917115177610333,
"grad_norm": 2.5740723463717514,
"learning_rate": 9.97428526121502e-06,
"loss": 0.6851,
"step": 240
},
{
"epoch": 0.1318622174381055,
"grad_norm": 2.5648119982196316,
"learning_rate": 9.969307913266931e-06,
"loss": 0.6918,
"step": 245
},
{
"epoch": 0.13455328310010764,
"grad_norm": 2.6146329921031968,
"learning_rate": 9.963891974111042e-06,
"loss": 0.686,
"step": 250
},
{
"epoch": 0.1372443487621098,
"grad_norm": 2.5693313714525225,
"learning_rate": 9.958037921758241e-06,
"loss": 0.6851,
"step": 255
},
{
"epoch": 0.13993541442411195,
"grad_norm": 2.5758611472453183,
"learning_rate": 9.951746272887298e-06,
"loss": 0.6825,
"step": 260
},
{
"epoch": 0.1426264800861141,
"grad_norm": 2.6882089017132866,
"learning_rate": 9.945017582799256e-06,
"loss": 0.6857,
"step": 265
},
{
"epoch": 0.14531754574811626,
"grad_norm": 2.471441857805247,
"learning_rate": 9.937852445368427e-06,
"loss": 0.7087,
"step": 270
},
{
"epoch": 0.1480086114101184,
"grad_norm": 2.499362888582163,
"learning_rate": 9.930251492989972e-06,
"loss": 0.6834,
"step": 275
},
{
"epoch": 0.15069967707212056,
"grad_norm": 2.4532511507065684,
"learning_rate": 9.922215396524089e-06,
"loss": 0.6841,
"step": 280
},
{
"epoch": 0.1533907427341227,
"grad_norm": 2.476586585638722,
"learning_rate": 9.913744865236798e-06,
"loss": 0.6738,
"step": 285
},
{
"epoch": 0.15608180839612487,
"grad_norm": 2.4741617401097593,
"learning_rate": 9.904840646737346e-06,
"loss": 0.6977,
"step": 290
},
{
"epoch": 0.158772874058127,
"grad_norm": 2.4646734777776405,
"learning_rate": 9.895503526912224e-06,
"loss": 0.6664,
"step": 295
},
{
"epoch": 0.16146393972012918,
"grad_norm": 2.541571671880384,
"learning_rate": 9.885734329855798e-06,
"loss": 0.6578,
"step": 300
},
{
"epoch": 0.16415500538213132,
"grad_norm": 2.512328994032953,
"learning_rate": 9.875533917797579e-06,
"loss": 0.6564,
"step": 305
},
{
"epoch": 0.1668460710441335,
"grad_norm": 2.608219630629651,
"learning_rate": 9.864903191026125e-06,
"loss": 0.6676,
"step": 310
},
{
"epoch": 0.16953713670613563,
"grad_norm": 2.5127310208806226,
"learning_rate": 9.853843087809574e-06,
"loss": 0.675,
"step": 315
},
{
"epoch": 0.1722282023681378,
"grad_norm": 2.954995883472683,
"learning_rate": 9.842354584312841e-06,
"loss": 0.6711,
"step": 320
},
{
"epoch": 0.17491926803013993,
"grad_norm": 2.4877912144296723,
"learning_rate": 9.830438694511454e-06,
"loss": 0.6532,
"step": 325
},
{
"epoch": 0.17761033369214208,
"grad_norm": 2.4109884507809998,
"learning_rate": 9.818096470102067e-06,
"loss": 0.6742,
"step": 330
},
{
"epoch": 0.18030139935414424,
"grad_norm": 2.7208831806314633,
"learning_rate": 9.805329000409634e-06,
"loss": 0.6626,
"step": 335
},
{
"epoch": 0.18299246501614638,
"grad_norm": 2.70446788462542,
"learning_rate": 9.792137412291265e-06,
"loss": 0.6578,
"step": 340
},
{
"epoch": 0.18568353067814855,
"grad_norm": 2.554719643429019,
"learning_rate": 9.778522870036768e-06,
"loss": 0.6386,
"step": 345
},
{
"epoch": 0.1883745963401507,
"grad_norm": 2.548018478530073,
"learning_rate": 9.764486575265893e-06,
"loss": 0.653,
"step": 350
},
{
"epoch": 0.19106566200215286,
"grad_norm": 2.720938193286558,
"learning_rate": 9.750029766822277e-06,
"loss": 0.6579,
"step": 355
},
{
"epoch": 0.193756727664155,
"grad_norm": 3.2604834173029267,
"learning_rate": 9.735153720664096e-06,
"loss": 0.6357,
"step": 360
},
{
"epoch": 0.19644779332615717,
"grad_norm": 2.854496420392015,
"learning_rate": 9.719859749751462e-06,
"loss": 0.643,
"step": 365
},
{
"epoch": 0.1991388589881593,
"grad_norm": 2.5148923925605295,
"learning_rate": 9.704149203930522e-06,
"loss": 0.6314,
"step": 370
},
{
"epoch": 0.20182992465016147,
"grad_norm": 2.6156775549028097,
"learning_rate": 9.688023469814345e-06,
"loss": 0.6291,
"step": 375
},
{
"epoch": 0.20452099031216361,
"grad_norm": 2.4436480149075903,
"learning_rate": 9.671483970660519e-06,
"loss": 0.6391,
"step": 380
},
{
"epoch": 0.20721205597416578,
"grad_norm": 2.6176248575730683,
"learning_rate": 9.654532166245543e-06,
"loss": 0.6451,
"step": 385
},
{
"epoch": 0.20990312163616792,
"grad_norm": 2.6284920127628704,
"learning_rate": 9.637169552735985e-06,
"loss": 0.6457,
"step": 390
},
{
"epoch": 0.21259418729817006,
"grad_norm": 2.5079900288395676,
"learning_rate": 9.619397662556434e-06,
"loss": 0.6363,
"step": 395
},
{
"epoch": 0.21528525296017223,
"grad_norm": 2.510239111931181,
"learning_rate": 9.601218064254245e-06,
"loss": 0.636,
"step": 400
},
{
"epoch": 0.21797631862217437,
"grad_norm": 2.6420717433236294,
"learning_rate": 9.582632362361098e-06,
"loss": 0.638,
"step": 405
},
{
"epoch": 0.22066738428417654,
"grad_norm": 2.4228761926249343,
"learning_rate": 9.563642197251382e-06,
"loss": 0.6373,
"step": 410
},
{
"epoch": 0.22335844994617868,
"grad_norm": 2.482460770882953,
"learning_rate": 9.54424924499742e-06,
"loss": 0.6138,
"step": 415
},
{
"epoch": 0.22604951560818085,
"grad_norm": 2.4554209699636904,
"learning_rate": 9.524455217221537e-06,
"loss": 0.6297,
"step": 420
},
{
"epoch": 0.22874058127018299,
"grad_norm": 2.2267259427452686,
"learning_rate": 9.504261860944984e-06,
"loss": 0.6345,
"step": 425
},
{
"epoch": 0.23143164693218515,
"grad_norm": 2.5062964865846467,
"learning_rate": 9.48367095843376e-06,
"loss": 0.6271,
"step": 430
},
{
"epoch": 0.2341227125941873,
"grad_norm": 2.5483397828557353,
"learning_rate": 9.462684327041298e-06,
"loss": 0.6137,
"step": 435
},
{
"epoch": 0.23681377825618946,
"grad_norm": 6.340986217773637,
"learning_rate": 9.441303819048073e-06,
"loss": 0.6236,
"step": 440
},
{
"epoch": 0.2395048439181916,
"grad_norm": 3.6533277008246787,
"learning_rate": 9.41953132149811e-06,
"loss": 0.6201,
"step": 445
},
{
"epoch": 0.24219590958019377,
"grad_norm": 2.539798703180873,
"learning_rate": 9.397368756032445e-06,
"loss": 0.623,
"step": 450
},
{
"epoch": 0.2448869752421959,
"grad_norm": 2.569421112776594,
"learning_rate": 9.374818078719515e-06,
"loss": 0.6129,
"step": 455
},
{
"epoch": 0.24757804090419805,
"grad_norm": 2.3715746149053953,
"learning_rate": 9.351881279882512e-06,
"loss": 0.6268,
"step": 460
},
{
"epoch": 0.2502691065662002,
"grad_norm": 2.572579471719146,
"learning_rate": 9.328560383923724e-06,
"loss": 0.6161,
"step": 465
},
{
"epoch": 0.2529601722282024,
"grad_norm": 2.4902601550757737,
"learning_rate": 9.304857449145858e-06,
"loss": 0.6244,
"step": 470
},
{
"epoch": 0.2556512378902045,
"grad_norm": 2.4501823079061658,
"learning_rate": 9.280774567570372e-06,
"loss": 0.6287,
"step": 475
},
{
"epoch": 0.25834230355220666,
"grad_norm": 2.5682717978671126,
"learning_rate": 9.256313864752838e-06,
"loss": 0.604,
"step": 480
},
{
"epoch": 0.26103336921420883,
"grad_norm": 2.404883352610733,
"learning_rate": 9.231477499595333e-06,
"loss": 0.6138,
"step": 485
},
{
"epoch": 0.263724434876211,
"grad_norm": 2.562822568126396,
"learning_rate": 9.206267664155906e-06,
"loss": 0.6054,
"step": 490
},
{
"epoch": 0.2664155005382131,
"grad_norm": 2.321411236526974,
"learning_rate": 9.180686583455097e-06,
"loss": 0.5948,
"step": 495
},
{
"epoch": 0.2691065662002153,
"grad_norm": 2.475132775898768,
"learning_rate": 9.154736515279557e-06,
"loss": 0.5905,
"step": 500
},
{
"epoch": 0.27179763186221745,
"grad_norm": 2.3355547839115354,
"learning_rate": 9.12841974998278e-06,
"loss": 0.5813,
"step": 505
},
{
"epoch": 0.2744886975242196,
"grad_norm": 2.742626737076485,
"learning_rate": 9.101738610282956e-06,
"loss": 0.6136,
"step": 510
},
{
"epoch": 0.27717976318622173,
"grad_norm": 2.52218472070317,
"learning_rate": 9.074695451057966e-06,
"loss": 0.6002,
"step": 515
},
{
"epoch": 0.2798708288482239,
"grad_norm": 2.364034819036867,
"learning_rate": 9.047292659137542e-06,
"loss": 0.6055,
"step": 520
},
{
"epoch": 0.28256189451022606,
"grad_norm": 2.5115978707447524,
"learning_rate": 9.019532653092597e-06,
"loss": 0.5978,
"step": 525
},
{
"epoch": 0.2852529601722282,
"grad_norm": 2.4450725301262457,
"learning_rate": 8.99141788302178e-06,
"loss": 0.5748,
"step": 530
},
{
"epoch": 0.28794402583423034,
"grad_norm": 2.5635891607971764,
"learning_rate": 8.962950830335213e-06,
"loss": 0.6019,
"step": 535
},
{
"epoch": 0.2906350914962325,
"grad_norm": 2.6838630282585316,
"learning_rate": 8.93413400753549e-06,
"loss": 0.6119,
"step": 540
},
{
"epoch": 0.2933261571582347,
"grad_norm": 2.559914467821027,
"learning_rate": 8.90496995799592e-06,
"loss": 0.5934,
"step": 545
},
{
"epoch": 0.2960172228202368,
"grad_norm": 2.3172236979073633,
"learning_rate": 8.875461255736055e-06,
"loss": 0.5923,
"step": 550
},
{
"epoch": 0.29870828848223896,
"grad_norm": 2.613084787164224,
"learning_rate": 8.845610505194495e-06,
"loss": 0.5881,
"step": 555
},
{
"epoch": 0.3013993541442411,
"grad_norm": 2.527915276378667,
"learning_rate": 8.815420340999034e-06,
"loss": 0.5877,
"step": 560
},
{
"epoch": 0.3040904198062433,
"grad_norm": 2.566801330647483,
"learning_rate": 8.784893427734117e-06,
"loss": 0.5742,
"step": 565
},
{
"epoch": 0.3067814854682454,
"grad_norm": 2.363950845115375,
"learning_rate": 8.754032459705672e-06,
"loss": 0.5828,
"step": 570
},
{
"epoch": 0.3094725511302476,
"grad_norm": 2.2951780141802307,
"learning_rate": 8.722840160703304e-06,
"loss": 0.5825,
"step": 575
},
{
"epoch": 0.31216361679224974,
"grad_norm": 2.53863020294738,
"learning_rate": 8.691319283759896e-06,
"loss": 0.5751,
"step": 580
},
{
"epoch": 0.3148546824542519,
"grad_norm": 2.357318941568188,
"learning_rate": 8.659472610908628e-06,
"loss": 0.5897,
"step": 585
},
{
"epoch": 0.317545748116254,
"grad_norm": 2.318670881116376,
"learning_rate": 8.627302952937431e-06,
"loss": 0.5649,
"step": 590
},
{
"epoch": 0.3202368137782562,
"grad_norm": 2.2618602596217388,
"learning_rate": 8.594813149140908e-06,
"loss": 0.577,
"step": 595
},
{
"epoch": 0.32292787944025836,
"grad_norm": 2.39290329100585,
"learning_rate": 8.56200606706974e-06,
"loss": 0.5543,
"step": 600
},
{
"epoch": 0.32561894510226047,
"grad_norm": 2.470727799467087,
"learning_rate": 8.528884602277593e-06,
"loss": 0.5823,
"step": 605
},
{
"epoch": 0.32831001076426264,
"grad_norm": 2.4343415189492403,
"learning_rate": 8.495451678065563e-06,
"loss": 0.583,
"step": 610
},
{
"epoch": 0.3310010764262648,
"grad_norm": 2.329527413930226,
"learning_rate": 8.461710245224149e-06,
"loss": 0.5596,
"step": 615
},
{
"epoch": 0.333692142088267,
"grad_norm": 2.710197180873677,
"learning_rate": 8.42766328177284e-06,
"loss": 0.5688,
"step": 620
},
{
"epoch": 0.3363832077502691,
"grad_norm": 2.3295706886393908,
"learning_rate": 8.393313792697251e-06,
"loss": 0.5606,
"step": 625
},
{
"epoch": 0.33907427341227125,
"grad_norm": 2.7411314085447893,
"learning_rate": 8.358664809683926e-06,
"loss": 0.5679,
"step": 630
},
{
"epoch": 0.3417653390742734,
"grad_norm": 2.354293086420231,
"learning_rate": 8.323719390852735e-06,
"loss": 0.5737,
"step": 635
},
{
"epoch": 0.3444564047362756,
"grad_norm": 2.5084140873961083,
"learning_rate": 8.288480620486991e-06,
"loss": 0.5479,
"step": 640
},
{
"epoch": 0.3471474703982777,
"grad_norm": 2.4064859741841427,
"learning_rate": 8.252951608761217e-06,
"loss": 0.5574,
"step": 645
},
{
"epoch": 0.34983853606027987,
"grad_norm": 2.4777423126966185,
"learning_rate": 8.217135491466636e-06,
"loss": 0.5666,
"step": 650
},
{
"epoch": 0.35252960172228204,
"grad_norm": 2.3905287579939416,
"learning_rate": 8.181035429734423e-06,
"loss": 0.5629,
"step": 655
},
{
"epoch": 0.35522066738428415,
"grad_norm": 2.327857140876721,
"learning_rate": 8.144654609756685e-06,
"loss": 0.5372,
"step": 660
},
{
"epoch": 0.3579117330462863,
"grad_norm": 2.290053555489138,
"learning_rate": 8.10799624250527e-06,
"loss": 0.5646,
"step": 665
},
{
"epoch": 0.3606027987082885,
"grad_norm": 2.297702496538287,
"learning_rate": 8.071063563448341e-06,
"loss": 0.553,
"step": 670
},
{
"epoch": 0.36329386437029065,
"grad_norm": 2.302044112251165,
"learning_rate": 8.03385983226483e-06,
"loss": 0.5381,
"step": 675
},
{
"epoch": 0.36598493003229277,
"grad_norm": 2.508769537248855,
"learning_rate": 7.996388332556735e-06,
"loss": 0.5433,
"step": 680
},
{
"epoch": 0.36867599569429493,
"grad_norm": 2.4009515047198193,
"learning_rate": 7.958652371559313e-06,
"loss": 0.5524,
"step": 685
},
{
"epoch": 0.3713670613562971,
"grad_norm": 2.3402259477581198,
"learning_rate": 7.920655279849173e-06,
"loss": 0.5609,
"step": 690
},
{
"epoch": 0.37405812701829927,
"grad_norm": 2.71945405647194,
"learning_rate": 7.882400411050328e-06,
"loss": 0.5414,
"step": 695
},
{
"epoch": 0.3767491926803014,
"grad_norm": 2.325373282991122,
"learning_rate": 7.843891141538201e-06,
"loss": 0.5352,
"step": 700
},
{
"epoch": 0.37944025834230355,
"grad_norm": 2.243347153689357,
"learning_rate": 7.80513087014163e-06,
"loss": 0.5503,
"step": 705
},
{
"epoch": 0.3821313240043057,
"grad_norm": 2.2504693819243506,
"learning_rate": 7.766123017842877e-06,
"loss": 0.5408,
"step": 710
},
{
"epoch": 0.3848223896663079,
"grad_norm": 2.24546531050267,
"learning_rate": 7.726871027475709e-06,
"loss": 0.5384,
"step": 715
},
{
"epoch": 0.38751345532831,
"grad_norm": 2.452293080065367,
"learning_rate": 7.687378363421512e-06,
"loss": 0.5435,
"step": 720
},
{
"epoch": 0.39020452099031216,
"grad_norm": 2.4702348319235625,
"learning_rate": 7.647648511303545e-06,
"loss": 0.5366,
"step": 725
},
{
"epoch": 0.39289558665231433,
"grad_norm": 2.6594839428477646,
"learning_rate": 7.607684977679284e-06,
"loss": 0.5311,
"step": 730
},
{
"epoch": 0.39558665231431644,
"grad_norm": 2.3853401333086097,
"learning_rate": 7.567491289730944e-06,
"loss": 0.5347,
"step": 735
},
{
"epoch": 0.3982777179763186,
"grad_norm": 2.305805010232876,
"learning_rate": 7.52707099495416e-06,
"loss": 0.5198,
"step": 740
},
{
"epoch": 0.4009687836383208,
"grad_norm": 2.32709407366988,
"learning_rate": 7.4864276608448925e-06,
"loss": 0.5162,
"step": 745
},
{
"epoch": 0.40365984930032295,
"grad_norm": 2.2558330926103993,
"learning_rate": 7.44556487458456e-06,
"loss": 0.5157,
"step": 750
},
{
"epoch": 0.40635091496232506,
"grad_norm": 2.358890714175164,
"learning_rate": 7.404486242723428e-06,
"loss": 0.5223,
"step": 755
},
{
"epoch": 0.40904198062432723,
"grad_norm": 2.4688588460570036,
"learning_rate": 7.363195390862298e-06,
"loss": 0.5306,
"step": 760
},
{
"epoch": 0.4117330462863294,
"grad_norm": 2.342696497908346,
"learning_rate": 7.321695963332516e-06,
"loss": 0.5331,
"step": 765
},
{
"epoch": 0.41442411194833156,
"grad_norm": 2.5064305240289144,
"learning_rate": 7.279991622874319e-06,
"loss": 0.5397,
"step": 770
},
{
"epoch": 0.4171151776103337,
"grad_norm": 2.3898169886899296,
"learning_rate": 7.238086050313563e-06,
"loss": 0.5197,
"step": 775
},
{
"epoch": 0.41980624327233584,
"grad_norm": 2.429842329992259,
"learning_rate": 7.195982944236853e-06,
"loss": 0.516,
"step": 780
},
{
"epoch": 0.422497308934338,
"grad_norm": 2.3477023797859813,
"learning_rate": 7.1536860206651025e-06,
"loss": 0.5215,
"step": 785
},
{
"epoch": 0.4251883745963401,
"grad_norm": 2.3798189210672587,
"learning_rate": 7.1111990127255684e-06,
"loss": 0.5033,
"step": 790
},
{
"epoch": 0.4278794402583423,
"grad_norm": 2.400485115987879,
"learning_rate": 7.068525670322349e-06,
"loss": 0.5301,
"step": 795
},
{
"epoch": 0.43057050592034446,
"grad_norm": 2.830405312883021,
"learning_rate": 7.025669759805431e-06,
"loss": 0.5226,
"step": 800
},
{
"epoch": 0.4332615715823466,
"grad_norm": 2.267489328220382,
"learning_rate": 6.982635063638265e-06,
"loss": 0.5065,
"step": 805
},
{
"epoch": 0.43595263724434874,
"grad_norm": 2.452358924939666,
"learning_rate": 6.939425380063924e-06,
"loss": 0.5037,
"step": 810
},
{
"epoch": 0.4386437029063509,
"grad_norm": 2.4030600337466588,
"learning_rate": 6.896044522769879e-06,
"loss": 0.5181,
"step": 815
},
{
"epoch": 0.4413347685683531,
"grad_norm": 2.385954958028487,
"learning_rate": 6.852496320551387e-06,
"loss": 0.513,
"step": 820
},
{
"epoch": 0.44402583423035524,
"grad_norm": 2.3598226733949064,
"learning_rate": 6.808784616973581e-06,
"loss": 0.5126,
"step": 825
},
{
"epoch": 0.44671689989235736,
"grad_norm": 2.3565314879299053,
"learning_rate": 6.76491327003222e-06,
"loss": 0.5029,
"step": 830
},
{
"epoch": 0.4494079655543595,
"grad_norm": 2.5160272506828756,
"learning_rate": 6.720886151813194e-06,
"loss": 0.4934,
"step": 835
},
{
"epoch": 0.4520990312163617,
"grad_norm": 2.386495536841602,
"learning_rate": 6.676707148150763e-06,
"loss": 0.5032,
"step": 840
},
{
"epoch": 0.45479009687836386,
"grad_norm": 2.436343528178474,
"learning_rate": 6.632380158284607e-06,
"loss": 0.4946,
"step": 845
},
{
"epoch": 0.45748116254036597,
"grad_norm": 2.4789859597196426,
"learning_rate": 6.587909094515663e-06,
"loss": 0.5066,
"step": 850
},
{
"epoch": 0.46017222820236814,
"grad_norm": 2.417011825533269,
"learning_rate": 6.5432978818608395e-06,
"loss": 0.5112,
"step": 855
},
{
"epoch": 0.4628632938643703,
"grad_norm": 2.3434112329726275,
"learning_rate": 6.498550457706584e-06,
"loss": 0.5045,
"step": 860
},
{
"epoch": 0.4655543595263724,
"grad_norm": 2.517810982631902,
"learning_rate": 6.453670771461377e-06,
"loss": 0.4905,
"step": 865
},
{
"epoch": 0.4682454251883746,
"grad_norm": 2.3345178470528385,
"learning_rate": 6.408662784207149e-06,
"loss": 0.5141,
"step": 870
},
{
"epoch": 0.47093649085037675,
"grad_norm": 2.39523867329611,
"learning_rate": 6.363530468349686e-06,
"loss": 0.4879,
"step": 875
},
{
"epoch": 0.4736275565123789,
"grad_norm": 2.308366752180236,
"learning_rate": 6.318277807268013e-06,
"loss": 0.4956,
"step": 880
},
{
"epoch": 0.47631862217438103,
"grad_norm": 2.2009448335762127,
"learning_rate": 6.27290879496283e-06,
"loss": 0.4946,
"step": 885
},
{
"epoch": 0.4790096878363832,
"grad_norm": 2.506472513608515,
"learning_rate": 6.227427435703997e-06,
"loss": 0.4747,
"step": 890
},
{
"epoch": 0.48170075349838537,
"grad_norm": 2.3312661677886157,
"learning_rate": 6.181837743677118e-06,
"loss": 0.4781,
"step": 895
},
{
"epoch": 0.48439181916038754,
"grad_norm": 2.415462314886686,
"learning_rate": 6.136143742629252e-06,
"loss": 0.4846,
"step": 900
},
{
"epoch": 0.48708288482238965,
"grad_norm": 2.5220206493746997,
"learning_rate": 6.09034946551377e-06,
"loss": 0.4891,
"step": 905
},
{
"epoch": 0.4897739504843918,
"grad_norm": 2.4607305732947853,
"learning_rate": 6.044458954134411e-06,
"loss": 0.5057,
"step": 910
},
{
"epoch": 0.492465016146394,
"grad_norm": 2.306884124252388,
"learning_rate": 5.998476258788555e-06,
"loss": 0.477,
"step": 915
},
{
"epoch": 0.4951560818083961,
"grad_norm": 2.385382261424978,
"learning_rate": 5.952405437909738e-06,
"loss": 0.4812,
"step": 920
},
{
"epoch": 0.49784714747039827,
"grad_norm": 2.2862418710775847,
"learning_rate": 5.90625055770946e-06,
"loss": 0.4784,
"step": 925
},
{
"epoch": 0.5005382131324004,
"grad_norm": 2.3522872723971093,
"learning_rate": 5.860015691818292e-06,
"loss": 0.4849,
"step": 930
},
{
"epoch": 0.5032292787944026,
"grad_norm": 2.257405887575007,
"learning_rate": 5.813704920926352e-06,
"loss": 0.4799,
"step": 935
},
{
"epoch": 0.5059203444564048,
"grad_norm": 2.5409180122003225,
"learning_rate": 5.767322332423128e-06,
"loss": 0.4753,
"step": 940
},
{
"epoch": 0.5086114101184069,
"grad_norm": 2.2912568364325137,
"learning_rate": 5.720872020036734e-06,
"loss": 0.4836,
"step": 945
},
{
"epoch": 0.511302475780409,
"grad_norm": 2.3404279089465256,
"learning_rate": 5.674358083472598e-06,
"loss": 0.4709,
"step": 950
},
{
"epoch": 0.5139935414424112,
"grad_norm": 2.3138319616276504,
"learning_rate": 5.6277846280516125e-06,
"loss": 0.4775,
"step": 955
},
{
"epoch": 0.5166846071044133,
"grad_norm": 2.319930769748735,
"learning_rate": 5.581155764347812e-06,
"loss": 0.4737,
"step": 960
},
{
"epoch": 0.5193756727664155,
"grad_norm": 2.2631450594519573,
"learning_rate": 5.534475607825566e-06,
"loss": 0.4681,
"step": 965
},
{
"epoch": 0.5220667384284177,
"grad_norm": 2.2993752726651877,
"learning_rate": 5.487748278476342e-06,
"loss": 0.4744,
"step": 970
},
{
"epoch": 0.5247578040904198,
"grad_norm": 2.6084393355683853,
"learning_rate": 5.440977900455093e-06,
"loss": 0.474,
"step": 975
},
{
"epoch": 0.527448869752422,
"grad_norm": 2.2812262068529767,
"learning_rate": 5.39416860171624e-06,
"loss": 0.4603,
"step": 980
},
{
"epoch": 0.5301399354144241,
"grad_norm": 2.366439732267079,
"learning_rate": 5.347324513649352e-06,
"loss": 0.4554,
"step": 985
},
{
"epoch": 0.5328310010764262,
"grad_norm": 2.282327173713366,
"learning_rate": 5.300449770714502e-06,
"loss": 0.4484,
"step": 990
},
{
"epoch": 0.5355220667384284,
"grad_norm": 2.388092616206348,
"learning_rate": 5.253548510077366e-06,
"loss": 0.4565,
"step": 995
},
{
"epoch": 0.5382131324004306,
"grad_norm": 2.347808334356873,
"learning_rate": 5.206624871244066e-06,
"loss": 0.4581,
"step": 1000
},
{
"epoch": 0.5409041980624327,
"grad_norm": 2.2379521568960206,
"learning_rate": 5.159682995695833e-06,
"loss": 0.4477,
"step": 1005
},
{
"epoch": 0.5435952637244349,
"grad_norm": 2.2336687017970296,
"learning_rate": 5.112727026523461e-06,
"loss": 0.4559,
"step": 1010
},
{
"epoch": 0.5462863293864371,
"grad_norm": 2.3526152961096223,
"learning_rate": 5.065761108061658e-06,
"loss": 0.4517,
"step": 1015
},
{
"epoch": 0.5489773950484392,
"grad_norm": 2.3430650960275305,
"learning_rate": 5.018789385523245e-06,
"loss": 0.4651,
"step": 1020
},
{
"epoch": 0.5516684607104413,
"grad_norm": 2.238164378282626,
"learning_rate": 4.971816004633323e-06,
"loss": 0.4461,
"step": 1025
},
{
"epoch": 0.5543595263724435,
"grad_norm": 2.357723214292236,
"learning_rate": 4.924845111263349e-06,
"loss": 0.4475,
"step": 1030
},
{
"epoch": 0.5570505920344456,
"grad_norm": 2.416110748499095,
"learning_rate": 4.877880851065238e-06,
"loss": 0.4621,
"step": 1035
},
{
"epoch": 0.5597416576964478,
"grad_norm": 2.446048600948981,
"learning_rate": 4.830927369105457e-06,
"loss": 0.4585,
"step": 1040
},
{
"epoch": 0.56243272335845,
"grad_norm": 2.381977000693451,
"learning_rate": 4.783988809499187e-06,
"loss": 0.4544,
"step": 1045
},
{
"epoch": 0.5651237890204521,
"grad_norm": 2.633241388056459,
"learning_rate": 4.737069315044562e-06,
"loss": 0.44,
"step": 1050
},
{
"epoch": 0.5678148546824543,
"grad_norm": 2.2816953591294937,
"learning_rate": 4.690173026857028e-06,
"loss": 0.4501,
"step": 1055
},
{
"epoch": 0.5705059203444564,
"grad_norm": 2.446396815822862,
"learning_rate": 4.643304084003839e-06,
"loss": 0.4506,
"step": 1060
},
{
"epoch": 0.5731969860064585,
"grad_norm": 2.184428941729192,
"learning_rate": 4.596466623138756e-06,
"loss": 0.4308,
"step": 1065
},
{
"epoch": 0.5758880516684607,
"grad_norm": 2.3003382421283844,
"learning_rate": 4.549664778136933e-06,
"loss": 0.4416,
"step": 1070
},
{
"epoch": 0.5785791173304629,
"grad_norm": 2.2767749774728783,
"learning_rate": 4.502902679730074e-06,
"loss": 0.4315,
"step": 1075
},
{
"epoch": 0.581270182992465,
"grad_norm": 2.3411661207469674,
"learning_rate": 4.456184455141843e-06,
"loss": 0.447,
"step": 1080
},
{
"epoch": 0.5839612486544672,
"grad_norm": 2.381303755147278,
"learning_rate": 4.4095142277236015e-06,
"loss": 0.4397,
"step": 1085
},
{
"epoch": 0.5866523143164694,
"grad_norm": 2.332530420483932,
"learning_rate": 4.362896116590475e-06,
"loss": 0.4392,
"step": 1090
},
{
"epoch": 0.5893433799784715,
"grad_norm": 2.3579190260561704,
"learning_rate": 4.316334236257818e-06,
"loss": 0.4328,
"step": 1095
},
{
"epoch": 0.5920344456404736,
"grad_norm": 2.2783721282271685,
"learning_rate": 4.269832696278038e-06,
"loss": 0.4336,
"step": 1100
},
{
"epoch": 0.5947255113024758,
"grad_norm": 2.300306639814476,
"learning_rate": 4.223395600877912e-06,
"loss": 0.4242,
"step": 1105
},
{
"epoch": 0.5974165769644779,
"grad_norm": 2.351752301438986,
"learning_rate": 4.17702704859633e-06,
"loss": 0.4442,
"step": 1110
},
{
"epoch": 0.6001076426264801,
"grad_norm": 2.4962239067091847,
"learning_rate": 4.130731131922574e-06,
"loss": 0.4378,
"step": 1115
},
{
"epoch": 0.6027987082884823,
"grad_norm": 2.361239331500982,
"learning_rate": 4.0845119369350995e-06,
"loss": 0.4262,
"step": 1120
},
{
"epoch": 0.6054897739504844,
"grad_norm": 2.1310276875997936,
"learning_rate": 4.038373542940905e-06,
"loss": 0.4183,
"step": 1125
},
{
"epoch": 0.6081808396124866,
"grad_norm": 2.3017752163117744,
"learning_rate": 3.992320022115492e-06,
"loss": 0.4064,
"step": 1130
},
{
"epoch": 0.6108719052744886,
"grad_norm": 2.288740574605373,
"learning_rate": 3.946355439143455e-06,
"loss": 0.4133,
"step": 1135
},
{
"epoch": 0.6135629709364908,
"grad_norm": 2.2674805407755194,
"learning_rate": 3.900483850859735e-06,
"loss": 0.4275,
"step": 1140
},
{
"epoch": 0.616254036598493,
"grad_norm": 2.3917723784081706,
"learning_rate": 3.854709305891557e-06,
"loss": 0.4311,
"step": 1145
},
{
"epoch": 0.6189451022604952,
"grad_norm": 2.24620365098204,
"learning_rate": 3.8090358443010993e-06,
"loss": 0.4111,
"step": 1150
},
{
"epoch": 0.6216361679224973,
"grad_norm": 2.297246451401738,
"learning_rate": 3.7634674972289227e-06,
"loss": 0.422,
"step": 1155
},
{
"epoch": 0.6243272335844995,
"grad_norm": 2.1634920709192462,
"learning_rate": 3.718008286538179e-06,
"loss": 0.4322,
"step": 1160
},
{
"epoch": 0.6270182992465017,
"grad_norm": 2.2583945433268857,
"learning_rate": 3.67266222445964e-06,
"loss": 0.4235,
"step": 1165
},
{
"epoch": 0.6297093649085038,
"grad_norm": 2.194753709445314,
"learning_rate": 3.627433313237576e-06,
"loss": 0.4219,
"step": 1170
},
{
"epoch": 0.6324004305705059,
"grad_norm": 2.1815873696945323,
"learning_rate": 3.5823255447765233e-06,
"loss": 0.4185,
"step": 1175
},
{
"epoch": 0.635091496232508,
"grad_norm": 2.115828204783862,
"learning_rate": 3.5373429002889583e-06,
"loss": 0.4015,
"step": 1180
},
{
"epoch": 0.6377825618945102,
"grad_norm": 2.250398497407886,
"learning_rate": 3.4924893499439096e-06,
"loss": 0.4164,
"step": 1185
},
{
"epoch": 0.6404736275565124,
"grad_norm": 2.3734751418562405,
"learning_rate": 3.447768852516554e-06,
"loss": 0.4031,
"step": 1190
},
{
"epoch": 0.6431646932185145,
"grad_norm": 2.2971665886143056,
"learning_rate": 3.4031853550388176e-06,
"loss": 0.4204,
"step": 1195
},
{
"epoch": 0.6458557588805167,
"grad_norm": 2.148355296872698,
"learning_rate": 3.3587427924510086e-06,
"loss": 0.4088,
"step": 1200
},
{
"epoch": 0.6485468245425189,
"grad_norm": 2.2063900099924636,
"learning_rate": 3.314445087254518e-06,
"loss": 0.4038,
"step": 1205
},
{
"epoch": 0.6512378902045209,
"grad_norm": 2.396941304529023,
"learning_rate": 3.2702961491656197e-06,
"loss": 0.4095,
"step": 1210
},
{
"epoch": 0.6539289558665231,
"grad_norm": 2.4764518437414558,
"learning_rate": 3.226299874770402e-06,
"loss": 0.4006,
"step": 1215
},
{
"epoch": 0.6566200215285253,
"grad_norm": 2.169456697824407,
"learning_rate": 3.1824601471808504e-06,
"loss": 0.4102,
"step": 1220
},
{
"epoch": 0.6593110871905274,
"grad_norm": 2.2877199164126223,
"learning_rate": 3.138780835692132e-06,
"loss": 0.4044,
"step": 1225
},
{
"epoch": 0.6620021528525296,
"grad_norm": 2.411168783471376,
"learning_rate": 3.0952657954410792e-06,
"loss": 0.3998,
"step": 1230
},
{
"epoch": 0.6646932185145318,
"grad_norm": 2.1915427873990687,
"learning_rate": 3.051918867065944e-06,
"loss": 0.3935,
"step": 1235
},
{
"epoch": 0.667384284176534,
"grad_norm": 2.236321241200045,
"learning_rate": 3.0087438763674226e-06,
"loss": 0.3968,
"step": 1240
},
{
"epoch": 0.670075349838536,
"grad_norm": 2.153491088223765,
"learning_rate": 2.9657446339709906e-06,
"loss": 0.4108,
"step": 1245
},
{
"epoch": 0.6727664155005382,
"grad_norm": 2.1667604177883066,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.4067,
"step": 1250
},
{
"epoch": 0.6754574811625403,
"grad_norm": 2.3480884088972216,
"learning_rate": 2.8802885586935794e-06,
"loss": 0.3973,
"step": 1255
},
{
"epoch": 0.6781485468245425,
"grad_norm": 2.363888356448008,
"learning_rate": 2.837839268167373e-06,
"loss": 0.3997,
"step": 1260
},
{
"epoch": 0.6808396124865447,
"grad_norm": 2.1759737776060906,
"learning_rate": 2.7955808099871196e-06,
"loss": 0.41,
"step": 1265
},
{
"epoch": 0.6835306781485468,
"grad_norm": 2.145769279313241,
"learning_rate": 2.7535169138851124e-06,
"loss": 0.3968,
"step": 1270
},
{
"epoch": 0.686221743810549,
"grad_norm": 2.3938641882258738,
"learning_rate": 2.711651292421593e-06,
"loss": 0.3943,
"step": 1275
},
{
"epoch": 0.6889128094725512,
"grad_norm": 2.2985974347283435,
"learning_rate": 2.6699876406570823e-06,
"loss": 0.4057,
"step": 1280
},
{
"epoch": 0.6916038751345532,
"grad_norm": 2.212164701308494,
"learning_rate": 2.62852963582625e-06,
"loss": 0.4069,
"step": 1285
},
{
"epoch": 0.6942949407965554,
"grad_norm": 2.2332995585806783,
"learning_rate": 2.5872809370133704e-06,
"loss": 0.3929,
"step": 1290
},
{
"epoch": 0.6969860064585576,
"grad_norm": 2.154755198337086,
"learning_rate": 2.5462451848293535e-06,
"loss": 0.395,
"step": 1295
},
{
"epoch": 0.6996770721205597,
"grad_norm": 2.35147288064046,
"learning_rate": 2.5054260010904423e-06,
"loss": 0.4131,
"step": 1300
},
{
"epoch": 0.7023681377825619,
"grad_norm": 2.307361805622348,
"learning_rate": 2.464826988498544e-06,
"loss": 0.3889,
"step": 1305
},
{
"epoch": 0.7050592034445641,
"grad_norm": 2.2374159276691294,
"learning_rate": 2.424451730323261e-06,
"loss": 0.3911,
"step": 1310
},
{
"epoch": 0.7077502691065662,
"grad_norm": 2.0218101351051714,
"learning_rate": 2.3843037900856174e-06,
"loss": 0.3744,
"step": 1315
},
{
"epoch": 0.7104413347685683,
"grad_norm": 2.117430212547741,
"learning_rate": 2.3443867112435585e-06,
"loss": 0.3735,
"step": 1320
},
{
"epoch": 0.7131324004305705,
"grad_norm": 2.4379881536686625,
"learning_rate": 2.304704016879195e-06,
"loss": 0.3808,
"step": 1325
},
{
"epoch": 0.7158234660925726,
"grad_norm": 2.186063282716259,
"learning_rate": 2.265259209387867e-06,
"loss": 0.401,
"step": 1330
},
{
"epoch": 0.7185145317545748,
"grad_norm": 2.1034516865579316,
"learning_rate": 2.226055770169002e-06,
"loss": 0.3867,
"step": 1335
},
{
"epoch": 0.721205597416577,
"grad_norm": 2.257291682766681,
"learning_rate": 2.1870971593188704e-06,
"loss": 0.3827,
"step": 1340
},
{
"epoch": 0.7238966630785791,
"grad_norm": 2.2641761303433983,
"learning_rate": 2.148386815325179e-06,
"loss": 0.3795,
"step": 1345
},
{
"epoch": 0.7265877287405813,
"grad_norm": 2.485493712722777,
"learning_rate": 2.109928154763606e-06,
"loss": 0.3878,
"step": 1350
},
{
"epoch": 0.7292787944025835,
"grad_norm": 2.1251911128194285,
"learning_rate": 2.0717245719962347e-06,
"loss": 0.3664,
"step": 1355
},
{
"epoch": 0.7319698600645855,
"grad_norm": 2.276033486452104,
"learning_rate": 2.0337794388719845e-06,
"loss": 0.3813,
"step": 1360
},
{
"epoch": 0.7346609257265877,
"grad_norm": 2.2995322739078747,
"learning_rate": 1.9960961044290015e-06,
"loss": 0.3744,
"step": 1365
},
{
"epoch": 0.7373519913885899,
"grad_norm": 2.0899009167827343,
"learning_rate": 1.9586778945990785e-06,
"loss": 0.3721,
"step": 1370
},
{
"epoch": 0.740043057050592,
"grad_norm": 2.2617171211103395,
"learning_rate": 1.921528111914102e-06,
"loss": 0.3785,
"step": 1375
},
{
"epoch": 0.7427341227125942,
"grad_norm": 2.154826412069349,
"learning_rate": 1.8846500352145753e-06,
"loss": 0.3762,
"step": 1380
},
{
"epoch": 0.7454251883745964,
"grad_norm": 2.099913553338307,
"learning_rate": 1.848046919360225e-06,
"loss": 0.3707,
"step": 1385
},
{
"epoch": 0.7481162540365985,
"grad_norm": 2.0516253842538767,
"learning_rate": 1.811721994942731e-06,
"loss": 0.3744,
"step": 1390
},
{
"epoch": 0.7508073196986006,
"grad_norm": 2.1300015889778425,
"learning_rate": 1.775678468000589e-06,
"loss": 0.3762,
"step": 1395
},
{
"epoch": 0.7534983853606028,
"grad_norm": 2.09371262898784,
"learning_rate": 1.7399195197361507e-06,
"loss": 0.3767,
"step": 1400
},
{
"epoch": 0.7561894510226049,
"grad_norm": 2.0414347937365194,
"learning_rate": 1.7044483062348465e-06,
"loss": 0.3769,
"step": 1405
},
{
"epoch": 0.7588805166846071,
"grad_norm": 2.245862892723086,
"learning_rate": 1.6692679581866334e-06,
"loss": 0.3699,
"step": 1410
},
{
"epoch": 0.7615715823466093,
"grad_norm": 2.0019206442455535,
"learning_rate": 1.6343815806096764e-06,
"loss": 0.3718,
"step": 1415
},
{
"epoch": 0.7642626480086114,
"grad_norm": 2.0667622426417975,
"learning_rate": 1.5997922525763015e-06,
"loss": 0.3709,
"step": 1420
},
{
"epoch": 0.7669537136706136,
"grad_norm": 2.2454001706534252,
"learning_rate": 1.5655030269412375e-06,
"loss": 0.378,
"step": 1425
},
{
"epoch": 0.7696447793326158,
"grad_norm": 2.0453161314199035,
"learning_rate": 1.5315169300721694e-06,
"loss": 0.3798,
"step": 1430
},
{
"epoch": 0.7723358449946178,
"grad_norm": 2.122627263333417,
"learning_rate": 1.4978369615826316e-06,
"loss": 0.3642,
"step": 1435
},
{
"epoch": 0.77502691065662,
"grad_norm": 2.253665103900189,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.3649,
"step": 1440
},
{
"epoch": 0.7777179763186222,
"grad_norm": 2.223866039627806,
"learning_rate": 1.431407272839443e-06,
"loss": 0.3648,
"step": 1445
},
{
"epoch": 0.7804090419806243,
"grad_norm": 2.124642169885474,
"learning_rate": 1.3986634156713418e-06,
"loss": 0.3686,
"step": 1450
},
{
"epoch": 0.7831001076426265,
"grad_norm": 2.039777090168692,
"learning_rate": 1.3662374125363954e-06,
"loss": 0.365,
"step": 1455
},
{
"epoch": 0.7857911733046287,
"grad_norm": 2.1116486617070405,
"learning_rate": 1.334132125354236e-06,
"loss": 0.3636,
"step": 1460
},
{
"epoch": 0.7884822389666308,
"grad_norm": 2.1955820453419217,
"learning_rate": 1.302350387738101e-06,
"loss": 0.3623,
"step": 1465
},
{
"epoch": 0.7911733046286329,
"grad_norm": 2.1223258884983074,
"learning_rate": 1.270895004744737e-06,
"loss": 0.3631,
"step": 1470
},
{
"epoch": 0.7938643702906351,
"grad_norm": 2.272320371876702,
"learning_rate": 1.2397687526268248e-06,
"loss": 0.3714,
"step": 1475
},
{
"epoch": 0.7965554359526372,
"grad_norm": 2.234677152395205,
"learning_rate": 1.2089743785879493e-06,
"loss": 0.3613,
"step": 1480
},
{
"epoch": 0.7992465016146394,
"grad_norm": 2.112632819570097,
"learning_rate": 1.1785146005401292e-06,
"loss": 0.3676,
"step": 1485
},
{
"epoch": 0.8019375672766416,
"grad_norm": 2.0426642476776857,
"learning_rate": 1.1483921068639353e-06,
"loss": 0.3589,
"step": 1490
},
{
"epoch": 0.8046286329386437,
"grad_norm": 2.2682633423265006,
"learning_rate": 1.118609556171213e-06,
"loss": 0.3702,
"step": 1495
},
{
"epoch": 0.8073196986006459,
"grad_norm": 2.206478926731362,
"learning_rate": 1.0891695770704341e-06,
"loss": 0.3669,
"step": 1500
},
{
"epoch": 0.810010764262648,
"grad_norm": 2.0838005667062425,
"learning_rate": 1.0600747679346956e-06,
"loss": 0.3649,
"step": 1505
},
{
"epoch": 0.8127018299246501,
"grad_norm": 2.0526302864143515,
"learning_rate": 1.0313276966723867e-06,
"loss": 0.3607,
"step": 1510
},
{
"epoch": 0.8153928955866523,
"grad_norm": 2.2411019803506704,
"learning_rate": 1.002930900500546e-06,
"loss": 0.3608,
"step": 1515
},
{
"epoch": 0.8180839612486545,
"grad_norm": 2.2183253148113877,
"learning_rate": 9.74886885720925e-07,
"loss": 0.3489,
"step": 1520
},
{
"epoch": 0.8207750269106566,
"grad_norm": 2.2046741515838555,
"learning_rate": 9.471981274987846e-07,
"loss": 0.3582,
"step": 1525
},
{
"epoch": 0.8234660925726588,
"grad_norm": 2.0776101459474936,
"learning_rate": 9.198670696444339e-07,
"loss": 0.3684,
"step": 1530
},
{
"epoch": 0.826157158234661,
"grad_norm": 2.1260490300416053,
"learning_rate": 8.928961243975437e-07,
"loss": 0.3602,
"step": 1535
},
{
"epoch": 0.8288482238966631,
"grad_norm": 2.203007778671738,
"learning_rate": 8.662876722142327e-07,
"loss": 0.3676,
"step": 1540
},
{
"epoch": 0.8315392895586652,
"grad_norm": 2.0515304374779415,
"learning_rate": 8.400440615569849e-07,
"loss": 0.3666,
"step": 1545
},
{
"epoch": 0.8342303552206674,
"grad_norm": 2.235903012764375,
"learning_rate": 8.141676086873574e-07,
"loss": 0.35,
"step": 1550
},
{
"epoch": 0.8369214208826695,
"grad_norm": 1.9898897228797456,
"learning_rate": 7.886605974615574e-07,
"loss": 0.3619,
"step": 1555
},
{
"epoch": 0.8396124865446717,
"grad_norm": 2.0672348209333102,
"learning_rate": 7.635252791288611e-07,
"loss": 0.346,
"step": 1560
},
{
"epoch": 0.8423035522066739,
"grad_norm": 2.140977921399784,
"learning_rate": 7.38763872132931e-07,
"loss": 0.3579,
"step": 1565
},
{
"epoch": 0.844994617868676,
"grad_norm": 2.0050132762403137,
"learning_rate": 7.143785619160026e-07,
"loss": 0.3561,
"step": 1570
},
{
"epoch": 0.8476856835306782,
"grad_norm": 2.0184002747173104,
"learning_rate": 6.903715007260043e-07,
"loss": 0.3555,
"step": 1575
},
{
"epoch": 0.8503767491926802,
"grad_norm": 1.8477534752755447,
"learning_rate": 6.667448074265954e-07,
"loss": 0.3517,
"step": 1580
},
{
"epoch": 0.8530678148546824,
"grad_norm": 2.0049189975542965,
"learning_rate": 6.435005673101646e-07,
"loss": 0.3388,
"step": 1585
},
{
"epoch": 0.8557588805166846,
"grad_norm": 2.0424760670973345,
"learning_rate": 6.206408319137703e-07,
"loss": 0.371,
"step": 1590
},
{
"epoch": 0.8584499461786868,
"grad_norm": 2.0546063213646577,
"learning_rate": 5.981676188380802e-07,
"loss": 0.3467,
"step": 1595
},
{
"epoch": 0.8611410118406889,
"grad_norm": 2.0825364915777085,
"learning_rate": 5.760829115692907e-07,
"loss": 0.3593,
"step": 1600
},
{
"epoch": 0.8638320775026911,
"grad_norm": 2.276064828333872,
"learning_rate": 5.543886593040737e-07,
"loss": 0.3527,
"step": 1605
},
{
"epoch": 0.8665231431646933,
"grad_norm": 2.0176514994976524,
"learning_rate": 5.330867767775333e-07,
"loss": 0.3532,
"step": 1610
},
{
"epoch": 0.8692142088266954,
"grad_norm": 2.0708726570249922,
"learning_rate": 5.121791440942131e-07,
"loss": 0.3498,
"step": 1615
},
{
"epoch": 0.8719052744886975,
"grad_norm": 1.9377547369040535,
"learning_rate": 4.916676065621562e-07,
"loss": 0.3485,
"step": 1620
},
{
"epoch": 0.8745963401506996,
"grad_norm": 1.9361489483911087,
"learning_rate": 4.715539745300429e-07,
"loss": 0.3433,
"step": 1625
},
{
"epoch": 0.8772874058127018,
"grad_norm": 2.362723902179469,
"learning_rate": 4.5184002322740784e-07,
"loss": 0.3529,
"step": 1630
},
{
"epoch": 0.879978471474704,
"grad_norm": 2.0061855571363765,
"learning_rate": 4.3252749260795533e-07,
"loss": 0.3405,
"step": 1635
},
{
"epoch": 0.8826695371367062,
"grad_norm": 2.332323171973721,
"learning_rate": 4.1361808719599163e-07,
"loss": 0.3522,
"step": 1640
},
{
"epoch": 0.8853606027987083,
"grad_norm": 2.1057140470170155,
"learning_rate": 3.951134759359854e-07,
"loss": 0.346,
"step": 1645
},
{
"epoch": 0.8880516684607105,
"grad_norm": 1.927661281496956,
"learning_rate": 3.7701529204526856e-07,
"loss": 0.3592,
"step": 1650
},
{
"epoch": 0.8907427341227125,
"grad_norm": 2.118098677778136,
"learning_rate": 3.5932513286988436e-07,
"loss": 0.3533,
"step": 1655
},
{
"epoch": 0.8934337997847147,
"grad_norm": 1.971012885096697,
"learning_rate": 3.420445597436056e-07,
"loss": 0.3485,
"step": 1660
},
{
"epoch": 0.8961248654467169,
"grad_norm": 2.0143344650309265,
"learning_rate": 3.251750978501339e-07,
"loss": 0.3503,
"step": 1665
},
{
"epoch": 0.898815931108719,
"grad_norm": 2.0292306602899193,
"learning_rate": 3.087182360884872e-07,
"loss": 0.3416,
"step": 1670
},
{
"epoch": 0.9015069967707212,
"grad_norm": 1.9564839527078925,
"learning_rate": 2.926754269415877e-07,
"loss": 0.3507,
"step": 1675
},
{
"epoch": 0.9041980624327234,
"grad_norm": 2.1080877292592373,
"learning_rate": 2.77048086348064e-07,
"loss": 0.3544,
"step": 1680
},
{
"epoch": 0.9068891280947255,
"grad_norm": 2.0632107292197617,
"learning_rate": 2.6183759357728543e-07,
"loss": 0.3502,
"step": 1685
},
{
"epoch": 0.9095801937567277,
"grad_norm": 1.937198592937774,
"learning_rate": 2.470452911076227e-07,
"loss": 0.3485,
"step": 1690
},
{
"epoch": 0.9122712594187298,
"grad_norm": 2.013603885615683,
"learning_rate": 2.326724845079653e-07,
"loss": 0.3572,
"step": 1695
},
{
"epoch": 0.9149623250807319,
"grad_norm": 2.1281726376758066,
"learning_rate": 2.1872044232248646e-07,
"loss": 0.3521,
"step": 1700
},
{
"epoch": 0.9176533907427341,
"grad_norm": 2.1720549170018897,
"learning_rate": 2.0519039595868706e-07,
"loss": 0.355,
"step": 1705
},
{
"epoch": 0.9203444564047363,
"grad_norm": 2.058083501208556,
"learning_rate": 1.9208353957870684e-07,
"loss": 0.3606,
"step": 1710
},
{
"epoch": 0.9230355220667384,
"grad_norm": 2.083719674982006,
"learning_rate": 1.7940102999393194e-07,
"loss": 0.3548,
"step": 1715
},
{
"epoch": 0.9257265877287406,
"grad_norm": 2.130647708015,
"learning_rate": 1.6714398656289154e-07,
"loss": 0.3497,
"step": 1720
},
{
"epoch": 0.9284176533907428,
"grad_norm": 1.9132420017221308,
"learning_rate": 1.5531349109246364e-07,
"loss": 0.3356,
"step": 1725
},
{
"epoch": 0.9311087190527448,
"grad_norm": 2.0946347399647687,
"learning_rate": 1.439105877423963e-07,
"loss": 0.3495,
"step": 1730
},
{
"epoch": 0.933799784714747,
"grad_norm": 2.08738192237864,
"learning_rate": 1.3293628293314876e-07,
"loss": 0.3517,
"step": 1735
},
{
"epoch": 0.9364908503767492,
"grad_norm": 1.9669616900562368,
"learning_rate": 1.223915452570651e-07,
"loss": 0.3624,
"step": 1740
},
{
"epoch": 0.9391819160387513,
"grad_norm": 2.0421605949353694,
"learning_rate": 1.1227730539288717e-07,
"loss": 0.3452,
"step": 1745
},
{
"epoch": 0.9418729817007535,
"grad_norm": 2.0355903627837098,
"learning_rate": 1.0259445602361084e-07,
"loss": 0.3492,
"step": 1750
},
{
"epoch": 0.9445640473627557,
"grad_norm": 2.0896912495997255,
"learning_rate": 9.334385175769955e-08,
"loss": 0.3572,
"step": 1755
},
{
"epoch": 0.9472551130247578,
"grad_norm": 2.002959531267022,
"learning_rate": 8.452630905365633e-08,
"loss": 0.3481,
"step": 1760
},
{
"epoch": 0.94994617868676,
"grad_norm": 2.136820278680856,
"learning_rate": 7.614260614796143e-08,
"loss": 0.3435,
"step": 1765
},
{
"epoch": 0.9526372443487621,
"grad_norm": 1.9052167785885679,
"learning_rate": 6.819348298638839e-08,
"loss": 0.3381,
"step": 1770
},
{
"epoch": 0.9553283100107642,
"grad_norm": 2.1788907550411403,
"learning_rate": 6.067964115869297e-08,
"loss": 0.3577,
"step": 1775
},
{
"epoch": 0.9580193756727664,
"grad_norm": 2.085202442833548,
"learning_rate": 5.36017438366937e-08,
"loss": 0.3443,
"step": 1780
},
{
"epoch": 0.9607104413347686,
"grad_norm": 1.9066530306055132,
"learning_rate": 4.696041571573773e-08,
"loss": 0.3514,
"step": 1785
},
{
"epoch": 0.9634015069967707,
"grad_norm": 2.030414548308391,
"learning_rate": 4.0756242959567596e-08,
"loss": 0.3458,
"step": 1790
},
{
"epoch": 0.9660925726587729,
"grad_norm": 2.3356985672893034,
"learning_rate": 3.498977314858487e-08,
"loss": 0.3532,
"step": 1795
},
{
"epoch": 0.9687836383207751,
"grad_norm": 2.008020036717182,
"learning_rate": 2.96615152315205e-08,
"loss": 0.3371,
"step": 1800
},
{
"epoch": 0.9714747039827771,
"grad_norm": 1.996053166126216,
"learning_rate": 2.4771939480516817e-08,
"loss": 0.345,
"step": 1805
},
{
"epoch": 0.9741657696447793,
"grad_norm": 1.9817659128174747,
"learning_rate": 2.0321477449619098e-08,
"loss": 0.3428,
"step": 1810
},
{
"epoch": 0.9768568353067815,
"grad_norm": 1.968843928974531,
"learning_rate": 1.6310521936688806e-08,
"loss": 0.3475,
"step": 1815
},
{
"epoch": 0.9795479009687836,
"grad_norm": 2.008709148033631,
"learning_rate": 1.2739426948732426e-08,
"loss": 0.3579,
"step": 1820
},
{
"epoch": 0.9822389666307858,
"grad_norm": 2.0860175169319852,
"learning_rate": 9.608507670659239e-09,
"loss": 0.3269,
"step": 1825
},
{
"epoch": 0.984930032292788,
"grad_norm": 2.1919123246360415,
"learning_rate": 6.918040437463025e-09,
"loss": 0.3546,
"step": 1830
},
{
"epoch": 0.9876210979547901,
"grad_norm": 2.197666814347832,
"learning_rate": 4.668262709830451e-09,
"loss": 0.3476,
"step": 1835
},
{
"epoch": 0.9903121636167922,
"grad_norm": 2.0435447242057716,
"learning_rate": 2.8593730531861764e-09,
"loss": 0.3309,
"step": 1840
},
{
"epoch": 0.9930032292787944,
"grad_norm": 2.1036408486178244,
"learning_rate": 1.4915311201635362e-09,
"loss": 0.3428,
"step": 1845
},
{
"epoch": 0.9956942949407965,
"grad_norm": 1.9217810613809283,
"learning_rate": 5.648576365169245e-10,
"loss": 0.3392,
"step": 1850
},
{
"epoch": 0.9983853606027987,
"grad_norm": 2.1168830996512846,
"learning_rate": 7.943439046531609e-11,
"loss": 0.3501,
"step": 1855
},
{
"epoch": 1.0,
"eval_runtime": 3.853,
"eval_samples_per_second": 2.595,
"eval_steps_per_second": 0.779,
"step": 1858
},
{
"epoch": 1.0,
"step": 1858,
"total_flos": 194513700126720.0,
"train_loss": 0.5110224842126967,
"train_runtime": 16738.0887,
"train_samples_per_second": 1.776,
"train_steps_per_second": 0.111
}
],
"logging_steps": 5,
"max_steps": 1858,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 194513700126720.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}