{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.500419559081547, "eval_steps": 500, "global_step": 205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024410710199099855, "grad_norm": 8.533896446228027, "learning_rate": 2.0000000000000003e-06, "loss": 1.1655, "step": 1 }, { "epoch": 0.004882142039819971, "grad_norm": 8.988560676574707, "learning_rate": 4.000000000000001e-06, "loss": 1.2516, "step": 2 }, { "epoch": 0.0073232130597299565, "grad_norm": 7.550627708435059, "learning_rate": 6e-06, "loss": 1.1895, "step": 3 }, { "epoch": 0.009764284079639942, "grad_norm": 3.6377415657043457, "learning_rate": 8.000000000000001e-06, "loss": 1.0982, "step": 4 }, { "epoch": 0.012205355099549928, "grad_norm": 3.964740753173828, "learning_rate": 1e-05, "loss": 1.0622, "step": 5 }, { "epoch": 0.014646426119459913, "grad_norm": 4.8016157150268555, "learning_rate": 9.999962669988608e-06, "loss": 1.0653, "step": 6 }, { "epoch": 0.017087497139369898, "grad_norm": 2.9538488388061523, "learning_rate": 9.999850680511844e-06, "loss": 1.026, "step": 7 }, { "epoch": 0.019528568159279884, "grad_norm": 2.869965076446533, "learning_rate": 9.999664033241933e-06, "loss": 1.0349, "step": 8 }, { "epoch": 0.02196963917918987, "grad_norm": 1.8026058673858643, "learning_rate": 9.999402730965894e-06, "loss": 1.0421, "step": 9 }, { "epoch": 0.024410710199099857, "grad_norm": 1.075210452079773, "learning_rate": 9.999066777585496e-06, "loss": 1.0008, "step": 10 }, { "epoch": 0.02685178121900984, "grad_norm": 1.4493818283081055, "learning_rate": 9.998656178117193e-06, "loss": 0.9347, "step": 11 }, { "epoch": 0.029292852238919826, "grad_norm": 1.2218502759933472, "learning_rate": 9.99817093869206e-06, "loss": 0.9537, "step": 12 }, { "epoch": 0.03173392325882981, "grad_norm": 1.0389800071716309, "learning_rate": 9.997611066555694e-06, "loss": 0.9458, "step": 13 }, { "epoch": 0.034174994278739795, "grad_norm": 0.959168016910553, "learning_rate": 9.99697657006811e-06, "loss": 0.9622, "step": 14 }, { "epoch": 0.03661606529864978, "grad_norm": 1.0173426866531372, "learning_rate": 9.99626745870361e-06, "loss": 0.9594, "step": 15 }, { "epoch": 0.03905713631855977, "grad_norm": 0.9893942475318909, "learning_rate": 9.995483743050649e-06, "loss": 0.9128, "step": 16 }, { "epoch": 0.041498207338469754, "grad_norm": 0.9174278974533081, "learning_rate": 9.99462543481167e-06, "loss": 0.9108, "step": 17 }, { "epoch": 0.04393927835837974, "grad_norm": 0.8355342745780945, "learning_rate": 9.993692546802943e-06, "loss": 0.9341, "step": 18 }, { "epoch": 0.04638034937828973, "grad_norm": 0.9482454061508179, "learning_rate": 9.992685092954347e-06, "loss": 0.8488, "step": 19 }, { "epoch": 0.048821420398199714, "grad_norm": 0.8152992129325867, "learning_rate": 9.991603088309195e-06, "loss": 0.9388, "step": 20 }, { "epoch": 0.05126249141810969, "grad_norm": 0.7824520468711853, "learning_rate": 9.990446549023977e-06, "loss": 0.917, "step": 21 }, { "epoch": 0.05370356243801968, "grad_norm": 0.8396065831184387, "learning_rate": 9.989215492368152e-06, "loss": 0.9043, "step": 22 }, { "epoch": 0.056144633457929666, "grad_norm": 0.7503563761711121, "learning_rate": 9.98790993672386e-06, "loss": 0.9368, "step": 23 }, { "epoch": 0.05858570447783965, "grad_norm": 0.846466600894928, "learning_rate": 9.98652990158566e-06, "loss": 0.8641, "step": 24 }, { "epoch": 0.06102677549774964, "grad_norm": 0.8216990232467651, "learning_rate": 9.985075407560247e-06, "loss": 0.8744, "step": 25 }, { "epoch": 0.06346784651765962, "grad_norm": 0.7758781313896179, "learning_rate": 9.983546476366133e-06, "loss": 0.8722, "step": 26 }, { "epoch": 0.06590891753756961, "grad_norm": 0.8065202236175537, "learning_rate": 9.981943130833323e-06, "loss": 0.8582, "step": 27 }, { "epoch": 0.06834998855747959, "grad_norm": 0.79361891746521, "learning_rate": 9.980265394902982e-06, "loss": 0.8549, "step": 28 }, { "epoch": 0.07079105957738958, "grad_norm": 0.7769683003425598, "learning_rate": 9.978513293627068e-06, "loss": 0.8801, "step": 29 }, { "epoch": 0.07323213059729956, "grad_norm": 0.7662413120269775, "learning_rate": 9.976686853167967e-06, "loss": 0.849, "step": 30 }, { "epoch": 0.07567320161720956, "grad_norm": 0.7053027153015137, "learning_rate": 9.974786100798098e-06, "loss": 0.8925, "step": 31 }, { "epoch": 0.07811427263711954, "grad_norm": 0.7407605051994324, "learning_rate": 9.9728110648995e-06, "loss": 0.8623, "step": 32 }, { "epoch": 0.08055534365702952, "grad_norm": 0.7798149585723877, "learning_rate": 9.970761774963421e-06, "loss": 0.8711, "step": 33 }, { "epoch": 0.08299641467693951, "grad_norm": 0.7310554385185242, "learning_rate": 9.968638261589866e-06, "loss": 0.9071, "step": 34 }, { "epoch": 0.08543748569684949, "grad_norm": 0.8006892204284668, "learning_rate": 9.966440556487149e-06, "loss": 0.9026, "step": 35 }, { "epoch": 0.08787855671675948, "grad_norm": 0.7774298787117004, "learning_rate": 9.96416869247141e-06, "loss": 0.8512, "step": 36 }, { "epoch": 0.09031962773666946, "grad_norm": 0.7737051844596863, "learning_rate": 9.961822703466131e-06, "loss": 0.8629, "step": 37 }, { "epoch": 0.09276069875657945, "grad_norm": 0.8388147950172424, "learning_rate": 9.959402624501636e-06, "loss": 0.803, "step": 38 }, { "epoch": 0.09520176977648943, "grad_norm": 0.7394818067550659, "learning_rate": 9.956908491714552e-06, "loss": 0.8768, "step": 39 }, { "epoch": 0.09764284079639943, "grad_norm": 0.8373251557350159, "learning_rate": 9.95434034234728e-06, "loss": 0.8604, "step": 40 }, { "epoch": 0.1000839118163094, "grad_norm": 0.7941448092460632, "learning_rate": 9.951698214747441e-06, "loss": 0.8397, "step": 41 }, { "epoch": 0.10252498283621939, "grad_norm": 0.7676767706871033, "learning_rate": 9.948982148367294e-06, "loss": 0.8434, "step": 42 }, { "epoch": 0.10496605385612938, "grad_norm": 0.7958892583847046, "learning_rate": 9.946192183763155e-06, "loss": 0.8503, "step": 43 }, { "epoch": 0.10740712487603936, "grad_norm": 0.793487012386322, "learning_rate": 9.943328362594788e-06, "loss": 0.8566, "step": 44 }, { "epoch": 0.10984819589594935, "grad_norm": 0.716295599937439, "learning_rate": 9.940390727624785e-06, "loss": 0.8128, "step": 45 }, { "epoch": 0.11228926691585933, "grad_norm": 0.7760279178619385, "learning_rate": 9.937379322717923e-06, "loss": 0.8409, "step": 46 }, { "epoch": 0.11473033793576932, "grad_norm": 0.8229836821556091, "learning_rate": 9.934294192840518e-06, "loss": 0.8429, "step": 47 }, { "epoch": 0.1171714089556793, "grad_norm": 0.6973395347595215, "learning_rate": 9.931135384059737e-06, "loss": 0.8542, "step": 48 }, { "epoch": 0.11961247997558928, "grad_norm": 0.7911590933799744, "learning_rate": 9.927902943542932e-06, "loss": 0.8554, "step": 49 }, { "epoch": 0.12205355099549928, "grad_norm": 0.6992570757865906, "learning_rate": 9.924596919556917e-06, "loss": 0.8706, "step": 50 }, { "epoch": 0.12449462201540926, "grad_norm": 0.7577567100524902, "learning_rate": 9.921217361467259e-06, "loss": 0.856, "step": 51 }, { "epoch": 0.12693569303531924, "grad_norm": 0.8022581934928894, "learning_rate": 9.917764319737533e-06, "loss": 0.8276, "step": 52 }, { "epoch": 0.12937676405522924, "grad_norm": 0.720230758190155, "learning_rate": 9.914237845928574e-06, "loss": 0.8613, "step": 53 }, { "epoch": 0.13181783507513922, "grad_norm": 0.7254828214645386, "learning_rate": 9.910637992697707e-06, "loss": 0.8617, "step": 54 }, { "epoch": 0.1342589060950492, "grad_norm": 0.7254623174667358, "learning_rate": 9.906964813797955e-06, "loss": 0.8543, "step": 55 }, { "epoch": 0.13669997711495918, "grad_norm": 0.7306321859359741, "learning_rate": 9.903218364077242e-06, "loss": 0.8332, "step": 56 }, { "epoch": 0.13914104813486916, "grad_norm": 0.7202122211456299, "learning_rate": 9.899398699477573e-06, "loss": 0.8663, "step": 57 }, { "epoch": 0.14158211915477917, "grad_norm": 0.7067145109176636, "learning_rate": 9.895505877034198e-06, "loss": 0.8165, "step": 58 }, { "epoch": 0.14402319017468915, "grad_norm": 0.7376930713653564, "learning_rate": 9.891539954874758e-06, "loss": 0.8267, "step": 59 }, { "epoch": 0.14646426119459913, "grad_norm": 0.7250686883926392, "learning_rate": 9.887500992218421e-06, "loss": 0.8239, "step": 60 }, { "epoch": 0.1489053322145091, "grad_norm": 0.7254573106765747, "learning_rate": 9.883389049374998e-06, "loss": 0.8452, "step": 61 }, { "epoch": 0.1513464032344191, "grad_norm": 0.7461521029472351, "learning_rate": 9.879204187744036e-06, "loss": 0.803, "step": 62 }, { "epoch": 0.1537874742543291, "grad_norm": 0.7778986096382141, "learning_rate": 9.874946469813907e-06, "loss": 0.8287, "step": 63 }, { "epoch": 0.15622854527423907, "grad_norm": 0.7395936846733093, "learning_rate": 9.870615959160876e-06, "loss": 0.8781, "step": 64 }, { "epoch": 0.15866961629414905, "grad_norm": 0.7308329343795776, "learning_rate": 9.866212720448149e-06, "loss": 0.807, "step": 65 }, { "epoch": 0.16111068731405903, "grad_norm": 0.7851212620735168, "learning_rate": 9.861736819424904e-06, "loss": 0.821, "step": 66 }, { "epoch": 0.16355175833396904, "grad_norm": 0.7638505697250366, "learning_rate": 9.857188322925317e-06, "loss": 0.8273, "step": 67 }, { "epoch": 0.16599282935387902, "grad_norm": 0.7750548720359802, "learning_rate": 9.852567298867557e-06, "loss": 0.8523, "step": 68 }, { "epoch": 0.168433900373789, "grad_norm": 0.7466771602630615, "learning_rate": 9.84787381625278e-06, "loss": 0.8415, "step": 69 }, { "epoch": 0.17087497139369898, "grad_norm": 0.6956301331520081, "learning_rate": 9.843107945164086e-06, "loss": 0.8206, "step": 70 }, { "epoch": 0.17331604241360898, "grad_norm": 0.7392652630805969, "learning_rate": 9.838269756765483e-06, "loss": 0.8098, "step": 71 }, { "epoch": 0.17575711343351896, "grad_norm": 0.7311574220657349, "learning_rate": 9.833359323300827e-06, "loss": 0.8116, "step": 72 }, { "epoch": 0.17819818445342894, "grad_norm": 0.6983757615089417, "learning_rate": 9.82837671809273e-06, "loss": 0.8436, "step": 73 }, { "epoch": 0.18063925547333892, "grad_norm": 0.7569893598556519, "learning_rate": 9.823322015541474e-06, "loss": 0.8058, "step": 74 }, { "epoch": 0.1830803264932489, "grad_norm": 0.7439902424812317, "learning_rate": 9.818195291123903e-06, "loss": 0.8424, "step": 75 }, { "epoch": 0.1855213975131589, "grad_norm": 0.7790477275848389, "learning_rate": 9.81299662139229e-06, "loss": 0.8483, "step": 76 }, { "epoch": 0.1879624685330689, "grad_norm": 0.7717331051826477, "learning_rate": 9.807726083973192e-06, "loss": 0.8214, "step": 77 }, { "epoch": 0.19040353955297887, "grad_norm": 0.7872374057769775, "learning_rate": 9.8023837575663e-06, "loss": 0.7938, "step": 78 }, { "epoch": 0.19284461057288885, "grad_norm": 0.8018149137496948, "learning_rate": 9.796969721943257e-06, "loss": 0.802, "step": 79 }, { "epoch": 0.19528568159279885, "grad_norm": 0.709600031375885, "learning_rate": 9.791484057946465e-06, "loss": 0.7944, "step": 80 }, { "epoch": 0.19772675261270883, "grad_norm": 0.8216169476509094, "learning_rate": 9.785926847487885e-06, "loss": 0.8181, "step": 81 }, { "epoch": 0.2001678236326188, "grad_norm": 0.7138919830322266, "learning_rate": 9.780298173547811e-06, "loss": 0.8043, "step": 82 }, { "epoch": 0.2026088946525288, "grad_norm": 0.7637642621994019, "learning_rate": 9.774598120173625e-06, "loss": 0.8034, "step": 83 }, { "epoch": 0.20504996567243877, "grad_norm": 0.7272418141365051, "learning_rate": 9.76882677247855e-06, "loss": 0.8271, "step": 84 }, { "epoch": 0.20749103669234878, "grad_norm": 0.7340764999389648, "learning_rate": 9.762984216640378e-06, "loss": 0.8508, "step": 85 }, { "epoch": 0.20993210771225876, "grad_norm": 0.7231638431549072, "learning_rate": 9.75707053990018e-06, "loss": 0.823, "step": 86 }, { "epoch": 0.21237317873216874, "grad_norm": 0.7670260071754456, "learning_rate": 9.751085830561e-06, "loss": 0.8595, "step": 87 }, { "epoch": 0.21481424975207872, "grad_norm": 0.7142215371131897, "learning_rate": 9.74503017798655e-06, "loss": 0.8325, "step": 88 }, { "epoch": 0.2172553207719887, "grad_norm": 0.7884289026260376, "learning_rate": 9.738903672599858e-06, "loss": 0.7751, "step": 89 }, { "epoch": 0.2196963917918987, "grad_norm": 0.7771654725074768, "learning_rate": 9.732706405881931e-06, "loss": 0.7827, "step": 90 }, { "epoch": 0.22213746281180868, "grad_norm": 0.7293388247489929, "learning_rate": 9.726438470370385e-06, "loss": 0.8724, "step": 91 }, { "epoch": 0.22457853383171866, "grad_norm": 0.7578020095825195, "learning_rate": 9.720099959658062e-06, "loss": 0.8277, "step": 92 }, { "epoch": 0.22701960485162864, "grad_norm": 0.7896732091903687, "learning_rate": 9.713690968391634e-06, "loss": 0.7769, "step": 93 }, { "epoch": 0.22946067587153865, "grad_norm": 0.6877868175506592, "learning_rate": 9.707211592270183e-06, "loss": 0.7938, "step": 94 }, { "epoch": 0.23190174689144863, "grad_norm": 0.8047687411308289, "learning_rate": 9.700661928043787e-06, "loss": 0.7735, "step": 95 }, { "epoch": 0.2343428179113586, "grad_norm": 0.7561459541320801, "learning_rate": 9.69404207351206e-06, "loss": 0.8079, "step": 96 }, { "epoch": 0.2367838889312686, "grad_norm": 0.7163955569267273, "learning_rate": 9.687352127522703e-06, "loss": 0.8042, "step": 97 }, { "epoch": 0.23922495995117857, "grad_norm": 0.7289466857910156, "learning_rate": 9.680592189970015e-06, "loss": 0.8449, "step": 98 }, { "epoch": 0.24166603097108857, "grad_norm": 0.6951574087142944, "learning_rate": 9.673762361793418e-06, "loss": 0.7988, "step": 99 }, { "epoch": 0.24410710199099855, "grad_norm": 0.7552266716957092, "learning_rate": 9.666862744975938e-06, "loss": 0.8323, "step": 100 }, { "epoch": 0.24654817301090853, "grad_norm": 0.7086972594261169, "learning_rate": 9.659893442542683e-06, "loss": 0.8567, "step": 101 }, { "epoch": 0.2489892440308185, "grad_norm": 0.7231544852256775, "learning_rate": 9.652854558559309e-06, "loss": 0.8265, "step": 102 }, { "epoch": 0.2514303150507285, "grad_norm": 0.7094722986221313, "learning_rate": 9.645746198130462e-06, "loss": 0.7803, "step": 103 }, { "epoch": 0.25387138607063847, "grad_norm": 0.6969436407089233, "learning_rate": 9.638568467398215e-06, "loss": 0.804, "step": 104 }, { "epoch": 0.25631245709054845, "grad_norm": 0.7204388380050659, "learning_rate": 9.631321473540476e-06, "loss": 0.787, "step": 105 }, { "epoch": 0.2587535281104585, "grad_norm": 0.6980841159820557, "learning_rate": 9.62400532476939e-06, "loss": 0.8294, "step": 106 }, { "epoch": 0.26119459913036847, "grad_norm": 0.6793758273124695, "learning_rate": 9.61662013032972e-06, "loss": 0.7739, "step": 107 }, { "epoch": 0.26363567015027844, "grad_norm": 0.7096854448318481, "learning_rate": 9.60916600049723e-06, "loss": 0.8035, "step": 108 }, { "epoch": 0.2660767411701884, "grad_norm": 0.6875160932540894, "learning_rate": 9.601643046577014e-06, "loss": 0.8567, "step": 109 }, { "epoch": 0.2685178121900984, "grad_norm": 0.7122709155082703, "learning_rate": 9.59405138090186e-06, "loss": 0.8153, "step": 110 }, { "epoch": 0.2709588832100084, "grad_norm": 0.695655882358551, "learning_rate": 9.586391116830549e-06, "loss": 0.7813, "step": 111 }, { "epoch": 0.27339995422991836, "grad_norm": 0.674659788608551, "learning_rate": 9.578662368746183e-06, "loss": 0.8802, "step": 112 }, { "epoch": 0.27584102524982834, "grad_norm": 0.7121911644935608, "learning_rate": 9.570865252054462e-06, "loss": 0.8017, "step": 113 }, { "epoch": 0.2782820962697383, "grad_norm": 0.7068195939064026, "learning_rate": 9.562999883181968e-06, "loss": 0.7817, "step": 114 }, { "epoch": 0.28072316728964836, "grad_norm": 0.6847429275512695, "learning_rate": 9.555066379574423e-06, "loss": 0.801, "step": 115 }, { "epoch": 0.28316423830955834, "grad_norm": 0.743248462677002, "learning_rate": 9.547064859694943e-06, "loss": 0.7978, "step": 116 }, { "epoch": 0.2856053093294683, "grad_norm": 0.7640885710716248, "learning_rate": 9.538995443022256e-06, "loss": 0.7913, "step": 117 }, { "epoch": 0.2880463803493783, "grad_norm": 0.7139798402786255, "learning_rate": 9.530858250048933e-06, "loss": 0.7994, "step": 118 }, { "epoch": 0.2904874513692883, "grad_norm": 0.7640753388404846, "learning_rate": 9.52265340227957e-06, "loss": 0.7946, "step": 119 }, { "epoch": 0.29292852238919825, "grad_norm": 0.7454321980476379, "learning_rate": 9.514381022228997e-06, "loss": 0.809, "step": 120 }, { "epoch": 0.29536959340910823, "grad_norm": 0.6853974461555481, "learning_rate": 9.506041233420427e-06, "loss": 0.8013, "step": 121 }, { "epoch": 0.2978106644290182, "grad_norm": 0.723430335521698, "learning_rate": 9.497634160383627e-06, "loss": 0.7923, "step": 122 }, { "epoch": 0.3002517354489282, "grad_norm": 0.7062557935714722, "learning_rate": 9.489159928653047e-06, "loss": 0.7702, "step": 123 }, { "epoch": 0.3026928064688382, "grad_norm": 0.6789696216583252, "learning_rate": 9.480618664765956e-06, "loss": 0.7748, "step": 124 }, { "epoch": 0.3051338774887482, "grad_norm": 0.7581243515014648, "learning_rate": 9.472010496260545e-06, "loss": 0.771, "step": 125 }, { "epoch": 0.3075749485086582, "grad_norm": 0.7822269201278687, "learning_rate": 9.463335551674024e-06, "loss": 0.8, "step": 126 }, { "epoch": 0.31001601952856817, "grad_norm": 0.7157217264175415, "learning_rate": 9.454593960540709e-06, "loss": 0.7883, "step": 127 }, { "epoch": 0.31245709054847814, "grad_norm": 0.7614567875862122, "learning_rate": 9.445785853390074e-06, "loss": 0.7929, "step": 128 }, { "epoch": 0.3148981615683881, "grad_norm": 0.7470414042472839, "learning_rate": 9.436911361744817e-06, "loss": 0.7826, "step": 129 }, { "epoch": 0.3173392325882981, "grad_norm": 0.7033482193946838, "learning_rate": 9.427970618118888e-06, "loss": 0.8359, "step": 130 }, { "epoch": 0.3197803036082081, "grad_norm": 0.7030816674232483, "learning_rate": 9.418963756015511e-06, "loss": 0.7966, "step": 131 }, { "epoch": 0.32222137462811806, "grad_norm": 0.7050835490226746, "learning_rate": 9.409890909925191e-06, "loss": 0.7852, "step": 132 }, { "epoch": 0.3246624456480281, "grad_norm": 0.7047673463821411, "learning_rate": 9.400752215323712e-06, "loss": 0.8134, "step": 133 }, { "epoch": 0.3271035166679381, "grad_norm": 0.6739450693130493, "learning_rate": 9.391547808670097e-06, "loss": 0.8186, "step": 134 }, { "epoch": 0.32954458768784806, "grad_norm": 0.7166461944580078, "learning_rate": 9.38227782740459e-06, "loss": 0.8118, "step": 135 }, { "epoch": 0.33198565870775804, "grad_norm": 0.6905531287193298, "learning_rate": 9.372942409946597e-06, "loss": 0.8092, "step": 136 }, { "epoch": 0.334426729727668, "grad_norm": 0.7552813291549683, "learning_rate": 9.36354169569261e-06, "loss": 0.7405, "step": 137 }, { "epoch": 0.336867800747578, "grad_norm": 0.6745990514755249, "learning_rate": 9.35407582501414e-06, "loss": 0.8397, "step": 138 }, { "epoch": 0.339308871767488, "grad_norm": 0.7749987840652466, "learning_rate": 9.344544939255608e-06, "loss": 0.7979, "step": 139 }, { "epoch": 0.34174994278739795, "grad_norm": 0.7859154939651489, "learning_rate": 9.334949180732245e-06, "loss": 0.8217, "step": 140 }, { "epoch": 0.34419101380730793, "grad_norm": 0.7111227512359619, "learning_rate": 9.325288692727963e-06, "loss": 0.7692, "step": 141 }, { "epoch": 0.34663208482721797, "grad_norm": 0.824995219707489, "learning_rate": 9.315563619493209e-06, "loss": 0.7989, "step": 142 }, { "epoch": 0.34907315584712795, "grad_norm": 0.7707095742225647, "learning_rate": 9.305774106242825e-06, "loss": 0.8115, "step": 143 }, { "epoch": 0.3515142268670379, "grad_norm": 0.7036089301109314, "learning_rate": 9.295920299153863e-06, "loss": 0.8119, "step": 144 }, { "epoch": 0.3539552978869479, "grad_norm": 0.7585278153419495, "learning_rate": 9.286002345363418e-06, "loss": 0.7853, "step": 145 }, { "epoch": 0.3563963689068579, "grad_norm": 0.7351112961769104, "learning_rate": 9.276020392966423e-06, "loss": 0.7974, "step": 146 }, { "epoch": 0.35883743992676786, "grad_norm": 0.7286148071289062, "learning_rate": 9.265974591013434e-06, "loss": 0.8044, "step": 147 }, { "epoch": 0.36127851094667784, "grad_norm": 0.6930050253868103, "learning_rate": 9.25586508950841e-06, "loss": 0.8117, "step": 148 }, { "epoch": 0.3637195819665878, "grad_norm": 0.8765610456466675, "learning_rate": 9.24569203940648e-06, "loss": 0.7551, "step": 149 }, { "epoch": 0.3661606529864978, "grad_norm": 0.7214458584785461, "learning_rate": 9.235455592611667e-06, "loss": 0.7984, "step": 150 }, { "epoch": 0.36860172400640784, "grad_norm": 0.7065439820289612, "learning_rate": 9.225155901974645e-06, "loss": 0.8106, "step": 151 }, { "epoch": 0.3710427950263178, "grad_norm": 0.7775700092315674, "learning_rate": 9.214793121290442e-06, "loss": 0.8211, "step": 152 }, { "epoch": 0.3734838660462278, "grad_norm": 0.7118616700172424, "learning_rate": 9.204367405296144e-06, "loss": 0.82, "step": 153 }, { "epoch": 0.3759249370661378, "grad_norm": 0.7476733326911926, "learning_rate": 9.193878909668591e-06, "loss": 0.7584, "step": 154 }, { "epoch": 0.37836600808604776, "grad_norm": 0.7488994002342224, "learning_rate": 9.183327791022048e-06, "loss": 0.7552, "step": 155 }, { "epoch": 0.38080707910595774, "grad_norm": 0.7086935043334961, "learning_rate": 9.172714206905866e-06, "loss": 0.7993, "step": 156 }, { "epoch": 0.3832481501258677, "grad_norm": 0.7513390183448792, "learning_rate": 9.162038315802132e-06, "loss": 0.7684, "step": 157 }, { "epoch": 0.3856892211457777, "grad_norm": 0.6983102560043335, "learning_rate": 9.1513002771233e-06, "loss": 0.7904, "step": 158 }, { "epoch": 0.3881302921656877, "grad_norm": 0.6591006517410278, "learning_rate": 9.140500251209813e-06, "loss": 0.7357, "step": 159 }, { "epoch": 0.3905713631855977, "grad_norm": 0.7491998672485352, "learning_rate": 9.129638399327707e-06, "loss": 0.7964, "step": 160 }, { "epoch": 0.3930124342055077, "grad_norm": 0.7312127947807312, "learning_rate": 9.118714883666204e-06, "loss": 0.7706, "step": 161 }, { "epoch": 0.39545350522541767, "grad_norm": 0.7120770215988159, "learning_rate": 9.107729867335287e-06, "loss": 0.8367, "step": 162 }, { "epoch": 0.39789457624532765, "grad_norm": 0.735023021697998, "learning_rate": 9.096683514363275e-06, "loss": 0.7832, "step": 163 }, { "epoch": 0.4003356472652376, "grad_norm": 0.7334295511245728, "learning_rate": 9.085575989694358e-06, "loss": 0.7977, "step": 164 }, { "epoch": 0.4027767182851476, "grad_norm": 0.7482827305793762, "learning_rate": 9.074407459186144e-06, "loss": 0.868, "step": 165 }, { "epoch": 0.4052177893050576, "grad_norm": 0.7395485043525696, "learning_rate": 9.063178089607183e-06, "loss": 0.7676, "step": 166 }, { "epoch": 0.40765886032496756, "grad_norm": 0.6970906257629395, "learning_rate": 9.051888048634471e-06, "loss": 0.762, "step": 167 }, { "epoch": 0.41009993134487754, "grad_norm": 0.7200821042060852, "learning_rate": 9.040537504850954e-06, "loss": 0.8067, "step": 168 }, { "epoch": 0.4125410023647875, "grad_norm": 0.7742771506309509, "learning_rate": 9.029126627743003e-06, "loss": 0.7767, "step": 169 }, { "epoch": 0.41498207338469756, "grad_norm": 0.7340243458747864, "learning_rate": 9.017655587697885e-06, "loss": 0.7816, "step": 170 }, { "epoch": 0.41742314440460754, "grad_norm": 0.7570080161094666, "learning_rate": 9.006124556001223e-06, "loss": 0.8374, "step": 171 }, { "epoch": 0.4198642154245175, "grad_norm": 0.7807502150535583, "learning_rate": 8.994533704834435e-06, "loss": 0.7749, "step": 172 }, { "epoch": 0.4223052864444275, "grad_norm": 0.7137355208396912, "learning_rate": 8.982883207272164e-06, "loss": 0.7397, "step": 173 }, { "epoch": 0.4247463574643375, "grad_norm": 0.7511448860168457, "learning_rate": 8.971173237279693e-06, "loss": 0.8006, "step": 174 }, { "epoch": 0.42718742848424746, "grad_norm": 0.7791663408279419, "learning_rate": 8.959403969710346e-06, "loss": 0.7684, "step": 175 }, { "epoch": 0.42962849950415744, "grad_norm": 0.7711341381072998, "learning_rate": 8.947575580302879e-06, "loss": 0.7905, "step": 176 }, { "epoch": 0.4320695705240674, "grad_norm": 0.7793801426887512, "learning_rate": 8.935688245678859e-06, "loss": 0.8121, "step": 177 }, { "epoch": 0.4345106415439774, "grad_norm": 0.7082055807113647, "learning_rate": 8.92374214334002e-06, "loss": 0.7657, "step": 178 }, { "epoch": 0.43695171256388743, "grad_norm": 0.735462486743927, "learning_rate": 8.911737451665616e-06, "loss": 0.7833, "step": 179 }, { "epoch": 0.4393927835837974, "grad_norm": 0.7432037591934204, "learning_rate": 8.899674349909759e-06, "loss": 0.7645, "step": 180 }, { "epoch": 0.4418338546037074, "grad_norm": 0.7552315592765808, "learning_rate": 8.887553018198738e-06, "loss": 0.8018, "step": 181 }, { "epoch": 0.44427492562361737, "grad_norm": 0.677143931388855, "learning_rate": 8.875373637528336e-06, "loss": 0.8029, "step": 182 }, { "epoch": 0.44671599664352735, "grad_norm": 0.7790682911872864, "learning_rate": 8.863136389761115e-06, "loss": 0.792, "step": 183 }, { "epoch": 0.4491570676634373, "grad_norm": 0.735373854637146, "learning_rate": 8.85084145762372e-06, "loss": 0.78, "step": 184 }, { "epoch": 0.4515981386833473, "grad_norm": 0.7221420407295227, "learning_rate": 8.838489024704131e-06, "loss": 0.807, "step": 185 }, { "epoch": 0.4540392097032573, "grad_norm": 0.7021591067314148, "learning_rate": 8.826079275448934e-06, "loss": 0.7828, "step": 186 }, { "epoch": 0.45648028072316726, "grad_norm": 0.7104141712188721, "learning_rate": 8.81361239516056e-06, "loss": 0.8051, "step": 187 }, { "epoch": 0.4589213517430773, "grad_norm": 0.749536395072937, "learning_rate": 8.801088569994523e-06, "loss": 0.7811, "step": 188 }, { "epoch": 0.4613624227629873, "grad_norm": 0.7570759654045105, "learning_rate": 8.788507986956639e-06, "loss": 0.8015, "step": 189 }, { "epoch": 0.46380349378289726, "grad_norm": 0.6997769474983215, "learning_rate": 8.775870833900226e-06, "loss": 0.7816, "step": 190 }, { "epoch": 0.46624456480280724, "grad_norm": 0.6764109134674072, "learning_rate": 8.763177299523318e-06, "loss": 0.7577, "step": 191 }, { "epoch": 0.4686856358227172, "grad_norm": 0.7811216115951538, "learning_rate": 8.750427573365825e-06, "loss": 0.7324, "step": 192 }, { "epoch": 0.4711267068426272, "grad_norm": 0.7098534107208252, "learning_rate": 8.737621845806715e-06, "loss": 0.7321, "step": 193 }, { "epoch": 0.4735677778625372, "grad_norm": 0.7705920934677124, "learning_rate": 8.724760308061172e-06, "loss": 0.7501, "step": 194 }, { "epoch": 0.47600884888244716, "grad_norm": 0.7170778512954712, "learning_rate": 8.711843152177735e-06, "loss": 0.767, "step": 195 }, { "epoch": 0.47844991990235713, "grad_norm": 0.7175964713096619, "learning_rate": 8.698870571035436e-06, "loss": 0.7592, "step": 196 }, { "epoch": 0.48089099092226717, "grad_norm": 0.7901434898376465, "learning_rate": 8.685842758340912e-06, "loss": 0.7921, "step": 197 }, { "epoch": 0.48333206194217715, "grad_norm": 0.7608402371406555, "learning_rate": 8.672759908625528e-06, "loss": 0.8617, "step": 198 }, { "epoch": 0.48577313296208713, "grad_norm": 0.7593024373054504, "learning_rate": 8.65962221724245e-06, "loss": 0.7674, "step": 199 }, { "epoch": 0.4882142039819971, "grad_norm": 0.7110275626182556, "learning_rate": 8.646429880363746e-06, "loss": 0.7521, "step": 200 }, { "epoch": 0.4906552750019071, "grad_norm": 0.7535459399223328, "learning_rate": 8.633183094977453e-06, "loss": 0.7296, "step": 201 }, { "epoch": 0.49309634602181707, "grad_norm": 0.7531000971794128, "learning_rate": 8.61988205888463e-06, "loss": 0.7863, "step": 202 }, { "epoch": 0.49553741704172705, "grad_norm": 0.7889319658279419, "learning_rate": 8.60652697069641e-06, "loss": 0.7784, "step": 203 }, { "epoch": 0.497978488061637, "grad_norm": 0.6903645396232605, "learning_rate": 8.593118029831025e-06, "loss": 0.7954, "step": 204 }, { "epoch": 0.500419559081547, "grad_norm": 0.7375295758247375, "learning_rate": 8.579655436510847e-06, "loss": 0.7764, "step": 205 } ], "logging_steps": 1, "max_steps": 818, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 205, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.5569037728439337e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }