diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9981460882461994, + "eval_steps": 506, + "global_step": 8088, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004943764676801385, + "grad_norm": 1.590954292543756, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.1575, + "step": 1 + }, + { + "epoch": 0.0004943764676801385, + "eval_loss": 1.1355711221694946, + "eval_runtime": 81.3575, + "eval_samples_per_second": 373.094, + "eval_steps_per_second": 46.646, + "step": 1 + }, + { + "epoch": 0.000988752935360277, + "grad_norm": 1.7326218530038808, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1291, + "step": 2 + }, + { + "epoch": 0.0014831294030404152, + "grad_norm": 1.5133990373740318, + "learning_rate": 3e-06, + "loss": 1.1362, + "step": 3 + }, + { + "epoch": 0.001977505870720554, + "grad_norm": 1.239830386259594, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1653, + "step": 4 + }, + { + "epoch": 0.002471882338400692, + "grad_norm": 0.8676096601071056, + "learning_rate": 5e-06, + "loss": 1.0758, + "step": 5 + }, + { + "epoch": 0.0029662588060808304, + "grad_norm": 0.701095723114537, + "learning_rate": 6e-06, + "loss": 1.0674, + "step": 6 + }, + { + "epoch": 0.003460635273760969, + "grad_norm": 0.5401551315443412, + "learning_rate": 7e-06, + "loss": 1.0889, + "step": 7 + }, + { + "epoch": 0.003955011741441108, + "grad_norm": 0.8576290358405926, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0602, + "step": 8 + }, + { + "epoch": 0.004449388209121246, + "grad_norm": 2.041353940370225, + "learning_rate": 9e-06, + "loss": 1.0502, + "step": 9 + }, + { + "epoch": 0.004943764676801384, + "grad_norm": 0.6058971239941059, + "learning_rate": 1e-05, + "loss": 1.0178, + "step": 10 + }, + { + "epoch": 0.005438141144481523, + "grad_norm": 0.5126712306876138, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.9905, + "step": 11 + }, + { + "epoch": 0.005932517612161661, + "grad_norm": 0.4040841723792844, + "learning_rate": 1.2e-05, + "loss": 1.0194, + "step": 12 + }, + { + "epoch": 0.0064268940798418, + "grad_norm": 0.33228859658090587, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.9632, + "step": 13 + }, + { + "epoch": 0.006921270547521938, + "grad_norm": 0.4560857156252839, + "learning_rate": 1.4e-05, + "loss": 1.0381, + "step": 14 + }, + { + "epoch": 0.007415647015202077, + "grad_norm": 0.3346544566012203, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.9516, + "step": 15 + }, + { + "epoch": 0.007910023482882216, + "grad_norm": 0.27693990693748854, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.9612, + "step": 16 + }, + { + "epoch": 0.008404399950562353, + "grad_norm": 0.2938663309185276, + "learning_rate": 1.7e-05, + "loss": 0.9701, + "step": 17 + }, + { + "epoch": 0.008898776418242492, + "grad_norm": 0.26968211399101333, + "learning_rate": 1.8e-05, + "loss": 0.9495, + "step": 18 + }, + { + "epoch": 0.00939315288592263, + "grad_norm": 0.28519974440122803, + "learning_rate": 1.9e-05, + "loss": 0.964, + "step": 19 + }, + { + "epoch": 0.009887529353602768, + "grad_norm": 0.2374025965426359, + "learning_rate": 2e-05, + "loss": 0.9261, + "step": 20 + }, + { + "epoch": 0.010381905821282907, + "grad_norm": 0.2414412743678375, + "learning_rate": 1.999999924187998e-05, + "loss": 0.9155, + "step": 21 + }, + { + "epoch": 0.010876282288963045, + "grad_norm": 0.20754352150178124, + "learning_rate": 1.9999996967520037e-05, + "loss": 0.8948, + "step": 22 + }, + { + "epoch": 0.011370658756643184, + "grad_norm": 0.3134326787045934, + "learning_rate": 1.9999993176920513e-05, + "loss": 0.9746, + "step": 23 + }, + { + "epoch": 0.011865035224323322, + "grad_norm": 0.20124285434057398, + "learning_rate": 1.9999987870081987e-05, + "loss": 0.903, + "step": 24 + }, + { + "epoch": 0.01235941169200346, + "grad_norm": 0.19225786521128577, + "learning_rate": 1.999998104700526e-05, + "loss": 0.9404, + "step": 25 + }, + { + "epoch": 0.0128537881596836, + "grad_norm": 0.18984883688572615, + "learning_rate": 1.9999972707691367e-05, + "loss": 0.9344, + "step": 26 + }, + { + "epoch": 0.013348164627363738, + "grad_norm": 0.20019723748638912, + "learning_rate": 1.9999962852141573e-05, + "loss": 0.9166, + "step": 27 + }, + { + "epoch": 0.013842541095043875, + "grad_norm": 0.18526471300642144, + "learning_rate": 1.9999951480357373e-05, + "loss": 0.9218, + "step": 28 + }, + { + "epoch": 0.014336917562724014, + "grad_norm": 0.17911171371444115, + "learning_rate": 1.999993859234049e-05, + "loss": 0.8901, + "step": 29 + }, + { + "epoch": 0.014831294030404153, + "grad_norm": 0.16207568973155415, + "learning_rate": 1.9999924188092875e-05, + "loss": 0.8845, + "step": 30 + }, + { + "epoch": 0.01532567049808429, + "grad_norm": 0.17618323952728912, + "learning_rate": 1.9999908267616722e-05, + "loss": 0.8614, + "step": 31 + }, + { + "epoch": 0.01582004696576443, + "grad_norm": 0.16725432939060128, + "learning_rate": 1.9999890830914436e-05, + "loss": 0.8644, + "step": 32 + }, + { + "epoch": 0.016314423433444566, + "grad_norm": 0.1612410078287994, + "learning_rate": 1.9999871877988663e-05, + "loss": 0.8848, + "step": 33 + }, + { + "epoch": 0.016808799901124705, + "grad_norm": 0.16563124615972358, + "learning_rate": 1.9999851408842276e-05, + "loss": 0.8681, + "step": 34 + }, + { + "epoch": 0.017303176368804844, + "grad_norm": 0.15182841865559302, + "learning_rate": 1.999982942347838e-05, + "loss": 0.8905, + "step": 35 + }, + { + "epoch": 0.017797552836484983, + "grad_norm": 0.15229064825498453, + "learning_rate": 1.999980592190031e-05, + "loss": 0.8822, + "step": 36 + }, + { + "epoch": 0.018291929304165122, + "grad_norm": 0.1524067100434279, + "learning_rate": 1.9999780904111628e-05, + "loss": 0.8485, + "step": 37 + }, + { + "epoch": 0.01878630577184526, + "grad_norm": 0.1563675721705987, + "learning_rate": 1.9999754370116124e-05, + "loss": 0.8749, + "step": 38 + }, + { + "epoch": 0.0192806822395254, + "grad_norm": 0.15160657964257052, + "learning_rate": 1.9999726319917828e-05, + "loss": 0.8406, + "step": 39 + }, + { + "epoch": 0.019775058707205535, + "grad_norm": 0.14577027839919324, + "learning_rate": 1.9999696753520988e-05, + "loss": 0.8524, + "step": 40 + }, + { + "epoch": 0.020269435174885674, + "grad_norm": 0.16378444407708173, + "learning_rate": 1.999966567093009e-05, + "loss": 0.8782, + "step": 41 + }, + { + "epoch": 0.020763811642565813, + "grad_norm": 0.15056562527748352, + "learning_rate": 1.999963307214984e-05, + "loss": 0.8596, + "step": 42 + }, + { + "epoch": 0.021258188110245952, + "grad_norm": 0.1522851064860354, + "learning_rate": 1.999959895718519e-05, + "loss": 0.8601, + "step": 43 + }, + { + "epoch": 0.02175256457792609, + "grad_norm": 0.15345861920362566, + "learning_rate": 1.9999563326041307e-05, + "loss": 0.8551, + "step": 44 + }, + { + "epoch": 0.02224694104560623, + "grad_norm": 0.1568835218252288, + "learning_rate": 1.9999526178723598e-05, + "loss": 0.8779, + "step": 45 + }, + { + "epoch": 0.02274131751328637, + "grad_norm": 0.17124087628827941, + "learning_rate": 1.999948751523769e-05, + "loss": 0.9327, + "step": 46 + }, + { + "epoch": 0.023235693980966508, + "grad_norm": 0.1551946897941387, + "learning_rate": 1.9999447335589445e-05, + "loss": 0.8754, + "step": 47 + }, + { + "epoch": 0.023730070448646643, + "grad_norm": 0.16633672438710623, + "learning_rate": 1.999940563978496e-05, + "loss": 0.8408, + "step": 48 + }, + { + "epoch": 0.024224446916326782, + "grad_norm": 0.164964315541417, + "learning_rate": 1.999936242783056e-05, + "loss": 0.8719, + "step": 49 + }, + { + "epoch": 0.02471882338400692, + "grad_norm": 0.16137388222540153, + "learning_rate": 1.9999317699732786e-05, + "loss": 0.88, + "step": 50 + }, + { + "epoch": 0.02521319985168706, + "grad_norm": 0.17814356018657757, + "learning_rate": 1.999927145549843e-05, + "loss": 0.8737, + "step": 51 + }, + { + "epoch": 0.0257075763193672, + "grad_norm": 0.1522716835215964, + "learning_rate": 1.9999223695134494e-05, + "loss": 0.85, + "step": 52 + }, + { + "epoch": 0.026201952787047338, + "grad_norm": 0.14409007249833472, + "learning_rate": 1.9999174418648232e-05, + "loss": 0.8588, + "step": 53 + }, + { + "epoch": 0.026696329254727477, + "grad_norm": 0.18012993031307106, + "learning_rate": 1.9999123626047106e-05, + "loss": 0.8711, + "step": 54 + }, + { + "epoch": 0.027190705722407612, + "grad_norm": 0.1519245776781811, + "learning_rate": 1.999907131733882e-05, + "loss": 0.846, + "step": 55 + }, + { + "epoch": 0.02768508219008775, + "grad_norm": 0.16177235586986433, + "learning_rate": 1.9999017492531305e-05, + "loss": 0.833, + "step": 56 + }, + { + "epoch": 0.02817945865776789, + "grad_norm": 0.15396174943126179, + "learning_rate": 1.9998962151632723e-05, + "loss": 0.8586, + "step": 57 + }, + { + "epoch": 0.02867383512544803, + "grad_norm": 0.1605297699588116, + "learning_rate": 1.9998905294651462e-05, + "loss": 0.8898, + "step": 58 + }, + { + "epoch": 0.029168211593128168, + "grad_norm": 0.1561667064230911, + "learning_rate": 1.9998846921596148e-05, + "loss": 0.8373, + "step": 59 + }, + { + "epoch": 0.029662588060808306, + "grad_norm": 0.1415852873986437, + "learning_rate": 1.999878703247563e-05, + "loss": 0.8365, + "step": 60 + }, + { + "epoch": 0.030156964528488445, + "grad_norm": 0.1522375631665932, + "learning_rate": 1.9998725627298988e-05, + "loss": 0.8868, + "step": 61 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 0.155407313933617, + "learning_rate": 1.999866270607553e-05, + "loss": 0.7907, + "step": 62 + }, + { + "epoch": 0.03114571746384872, + "grad_norm": 0.1553640757070083, + "learning_rate": 1.9998598268814803e-05, + "loss": 0.837, + "step": 63 + }, + { + "epoch": 0.03164009393152886, + "grad_norm": 0.15227486101794055, + "learning_rate": 1.9998532315526565e-05, + "loss": 0.8406, + "step": 64 + }, + { + "epoch": 0.032134470399209, + "grad_norm": 0.14816595532474708, + "learning_rate": 1.9998464846220832e-05, + "loss": 0.8225, + "step": 65 + }, + { + "epoch": 0.03262884686688913, + "grad_norm": 0.1570026471883785, + "learning_rate": 1.9998395860907822e-05, + "loss": 0.8674, + "step": 66 + }, + { + "epoch": 0.033123223334569275, + "grad_norm": 0.15720428984029938, + "learning_rate": 1.9998325359597998e-05, + "loss": 0.8118, + "step": 67 + }, + { + "epoch": 0.03361759980224941, + "grad_norm": 0.15655943903392028, + "learning_rate": 1.9998253342302053e-05, + "loss": 0.8295, + "step": 68 + }, + { + "epoch": 0.03411197626992955, + "grad_norm": 0.1562027144542974, + "learning_rate": 1.9998179809030906e-05, + "loss": 0.8455, + "step": 69 + }, + { + "epoch": 0.03460635273760969, + "grad_norm": 0.1447339238992723, + "learning_rate": 1.99981047597957e-05, + "loss": 0.8334, + "step": 70 + }, + { + "epoch": 0.03510072920528983, + "grad_norm": 0.1582453724108778, + "learning_rate": 1.999802819460782e-05, + "loss": 0.8635, + "step": 71 + }, + { + "epoch": 0.035595105672969966, + "grad_norm": 0.15862431424376414, + "learning_rate": 1.9997950113478875e-05, + "loss": 0.8194, + "step": 72 + }, + { + "epoch": 0.0360894821406501, + "grad_norm": 0.1551156105636101, + "learning_rate": 1.9997870516420702e-05, + "loss": 0.8688, + "step": 73 + }, + { + "epoch": 0.036583858608330244, + "grad_norm": 0.16554387299570203, + "learning_rate": 1.999778940344537e-05, + "loss": 0.8393, + "step": 74 + }, + { + "epoch": 0.03707823507601038, + "grad_norm": 0.15296942829649676, + "learning_rate": 1.999770677456518e-05, + "loss": 0.8425, + "step": 75 + }, + { + "epoch": 0.03757261154369052, + "grad_norm": 0.15860035783062054, + "learning_rate": 1.9997622629792656e-05, + "loss": 0.8024, + "step": 76 + }, + { + "epoch": 0.03806698801137066, + "grad_norm": 0.14597988021073433, + "learning_rate": 1.9997536969140564e-05, + "loss": 0.7969, + "step": 77 + }, + { + "epoch": 0.0385613644790508, + "grad_norm": 0.15502738677511627, + "learning_rate": 1.9997449792621885e-05, + "loss": 0.8042, + "step": 78 + }, + { + "epoch": 0.039055740946730935, + "grad_norm": 0.1546272533149546, + "learning_rate": 1.999736110024984e-05, + "loss": 0.8157, + "step": 79 + }, + { + "epoch": 0.03955011741441107, + "grad_norm": 0.16009989696629534, + "learning_rate": 1.999727089203787e-05, + "loss": 0.8437, + "step": 80 + }, + { + "epoch": 0.04004449388209121, + "grad_norm": 0.1559123172520392, + "learning_rate": 1.9997179167999666e-05, + "loss": 0.8505, + "step": 81 + }, + { + "epoch": 0.04053887034977135, + "grad_norm": 0.1438838620650671, + "learning_rate": 1.999708592814913e-05, + "loss": 0.8334, + "step": 82 + }, + { + "epoch": 0.04103324681745149, + "grad_norm": 0.14745088545658777, + "learning_rate": 1.999699117250039e-05, + "loss": 0.8039, + "step": 83 + }, + { + "epoch": 0.041527623285131626, + "grad_norm": 0.15820905070975788, + "learning_rate": 1.999689490106783e-05, + "loss": 0.8635, + "step": 84 + }, + { + "epoch": 0.04202199975281177, + "grad_norm": 0.15585838317751174, + "learning_rate": 1.9996797113866036e-05, + "loss": 0.8174, + "step": 85 + }, + { + "epoch": 0.042516376220491904, + "grad_norm": 0.16556175403869522, + "learning_rate": 1.9996697810909834e-05, + "loss": 0.8549, + "step": 86 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 0.15004839387184385, + "learning_rate": 1.999659699221429e-05, + "loss": 0.8376, + "step": 87 + }, + { + "epoch": 0.04350512915585218, + "grad_norm": 0.22827288442919677, + "learning_rate": 1.9996494657794678e-05, + "loss": 0.8609, + "step": 88 + }, + { + "epoch": 0.04399950562353232, + "grad_norm": 0.15718663547952635, + "learning_rate": 1.9996390807666525e-05, + "loss": 0.8446, + "step": 89 + }, + { + "epoch": 0.04449388209121246, + "grad_norm": 0.14603717420461557, + "learning_rate": 1.9996285441845568e-05, + "loss": 0.8402, + "step": 90 + }, + { + "epoch": 0.044988258558892595, + "grad_norm": 0.15651263465254053, + "learning_rate": 1.9996178560347795e-05, + "loss": 0.857, + "step": 91 + }, + { + "epoch": 0.04548263502657274, + "grad_norm": 0.1598313420501459, + "learning_rate": 1.99960701631894e-05, + "loss": 0.8374, + "step": 92 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 0.1492918209123173, + "learning_rate": 1.9995960250386822e-05, + "loss": 0.8276, + "step": 93 + }, + { + "epoch": 0.046471387961933015, + "grad_norm": 0.15165712052402164, + "learning_rate": 1.999584882195673e-05, + "loss": 0.8253, + "step": 94 + }, + { + "epoch": 0.04696576442961315, + "grad_norm": 0.1460596406344608, + "learning_rate": 1.999573587791602e-05, + "loss": 0.7685, + "step": 95 + }, + { + "epoch": 0.047460140897293286, + "grad_norm": 0.1459317842071744, + "learning_rate": 1.999562141828181e-05, + "loss": 0.8248, + "step": 96 + }, + { + "epoch": 0.04795451736497343, + "grad_norm": 0.15706325520541814, + "learning_rate": 1.999550544307146e-05, + "loss": 0.8334, + "step": 97 + }, + { + "epoch": 0.048448893832653564, + "grad_norm": 0.15492046242763668, + "learning_rate": 1.9995387952302557e-05, + "loss": 0.8269, + "step": 98 + }, + { + "epoch": 0.048943270300333706, + "grad_norm": 0.15280817204285538, + "learning_rate": 1.9995268945992908e-05, + "loss": 0.8422, + "step": 99 + }, + { + "epoch": 0.04943764676801384, + "grad_norm": 0.14984472392166376, + "learning_rate": 1.9995148424160563e-05, + "loss": 0.8158, + "step": 100 + }, + { + "epoch": 0.049932023235693984, + "grad_norm": 0.1424785776535798, + "learning_rate": 1.999502638682379e-05, + "loss": 0.7718, + "step": 101 + }, + { + "epoch": 0.05042639970337412, + "grad_norm": 0.15143852153062987, + "learning_rate": 1.9994902834001104e-05, + "loss": 0.8211, + "step": 102 + }, + { + "epoch": 0.050920776171054255, + "grad_norm": 0.191276216809184, + "learning_rate": 1.9994777765711226e-05, + "loss": 0.831, + "step": 103 + }, + { + "epoch": 0.0514151526387344, + "grad_norm": 0.14413166736020525, + "learning_rate": 1.999465118197313e-05, + "loss": 0.8267, + "step": 104 + }, + { + "epoch": 0.05190952910641453, + "grad_norm": 0.15673289417773714, + "learning_rate": 1.9994523082805998e-05, + "loss": 0.8372, + "step": 105 + }, + { + "epoch": 0.052403905574094675, + "grad_norm": 0.15572447800095732, + "learning_rate": 1.9994393468229263e-05, + "loss": 0.8048, + "step": 106 + }, + { + "epoch": 0.05289828204177481, + "grad_norm": 0.16067059956825708, + "learning_rate": 1.9994262338262572e-05, + "loss": 0.8194, + "step": 107 + }, + { + "epoch": 0.05339265850945495, + "grad_norm": 0.21794960331365335, + "learning_rate": 1.999412969292581e-05, + "loss": 0.8641, + "step": 108 + }, + { + "epoch": 0.05388703497713509, + "grad_norm": 0.16629984555736496, + "learning_rate": 1.9993995532239087e-05, + "loss": 0.8232, + "step": 109 + }, + { + "epoch": 0.054381411444815224, + "grad_norm": 0.15080377287082616, + "learning_rate": 1.999385985622275e-05, + "loss": 0.8722, + "step": 110 + }, + { + "epoch": 0.054875787912495366, + "grad_norm": 0.15933906074884263, + "learning_rate": 1.9993722664897358e-05, + "loss": 0.8218, + "step": 111 + }, + { + "epoch": 0.0553701643801755, + "grad_norm": 0.15949935492293382, + "learning_rate": 1.999358395828373e-05, + "loss": 0.8217, + "step": 112 + }, + { + "epoch": 0.055864540847855644, + "grad_norm": 0.14996563705258584, + "learning_rate": 1.9993443736402887e-05, + "loss": 0.8107, + "step": 113 + }, + { + "epoch": 0.05635891731553578, + "grad_norm": 0.15244447108360898, + "learning_rate": 1.9993301999276088e-05, + "loss": 0.7912, + "step": 114 + }, + { + "epoch": 0.05685329378321592, + "grad_norm": 0.2381018150630388, + "learning_rate": 1.9993158746924832e-05, + "loss": 0.815, + "step": 115 + }, + { + "epoch": 0.05734767025089606, + "grad_norm": 0.14877004018628825, + "learning_rate": 1.9993013979370836e-05, + "loss": 0.8079, + "step": 116 + }, + { + "epoch": 0.05784204671857619, + "grad_norm": 0.15560482997213645, + "learning_rate": 1.9992867696636047e-05, + "loss": 0.7956, + "step": 117 + }, + { + "epoch": 0.058336423186256335, + "grad_norm": 0.14659416211119433, + "learning_rate": 1.9992719898742646e-05, + "loss": 0.7951, + "step": 118 + }, + { + "epoch": 0.05883079965393647, + "grad_norm": 0.14940227872881504, + "learning_rate": 1.9992570585713044e-05, + "loss": 0.8125, + "step": 119 + }, + { + "epoch": 0.05932517612161661, + "grad_norm": 0.1631090295410188, + "learning_rate": 1.9992419757569884e-05, + "loss": 0.8511, + "step": 120 + }, + { + "epoch": 0.05981955258929675, + "grad_norm": 0.15035666245799748, + "learning_rate": 1.9992267414336027e-05, + "loss": 0.8016, + "step": 121 + }, + { + "epoch": 0.06031392905697689, + "grad_norm": 0.17332153967637515, + "learning_rate": 1.999211355603458e-05, + "loss": 0.7933, + "step": 122 + }, + { + "epoch": 0.060808305524657026, + "grad_norm": 0.14623176242136854, + "learning_rate": 1.9991958182688865e-05, + "loss": 0.8078, + "step": 123 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 0.21345905847546617, + "learning_rate": 1.9991801294322445e-05, + "loss": 0.8668, + "step": 124 + }, + { + "epoch": 0.061797058460017304, + "grad_norm": 0.15060171173411613, + "learning_rate": 1.9991642890959105e-05, + "loss": 0.8182, + "step": 125 + }, + { + "epoch": 0.06229143492769744, + "grad_norm": 0.16063860319016804, + "learning_rate": 1.9991482972622865e-05, + "loss": 0.8243, + "step": 126 + }, + { + "epoch": 0.06278581139537757, + "grad_norm": 0.15565215354619288, + "learning_rate": 1.9991321539337974e-05, + "loss": 0.8034, + "step": 127 + }, + { + "epoch": 0.06328018786305772, + "grad_norm": 0.15031929577075526, + "learning_rate": 1.9991158591128903e-05, + "loss": 0.8296, + "step": 128 + }, + { + "epoch": 0.06377456433073786, + "grad_norm": 0.16428218021377675, + "learning_rate": 1.9990994128020366e-05, + "loss": 0.8169, + "step": 129 + }, + { + "epoch": 0.064268940798418, + "grad_norm": 0.15095026275459245, + "learning_rate": 1.9990828150037292e-05, + "loss": 0.805, + "step": 130 + }, + { + "epoch": 0.06476331726609813, + "grad_norm": 0.15429511907288754, + "learning_rate": 1.9990660657204853e-05, + "loss": 0.8144, + "step": 131 + }, + { + "epoch": 0.06525769373377827, + "grad_norm": 0.15235600529776383, + "learning_rate": 1.9990491649548445e-05, + "loss": 0.8384, + "step": 132 + }, + { + "epoch": 0.06575207020145842, + "grad_norm": 0.1522405591667572, + "learning_rate": 1.9990321127093694e-05, + "loss": 0.8035, + "step": 133 + }, + { + "epoch": 0.06624644666913855, + "grad_norm": 0.16050249677668593, + "learning_rate": 1.999014908986645e-05, + "loss": 0.8294, + "step": 134 + }, + { + "epoch": 0.06674082313681869, + "grad_norm": 0.15277408584102947, + "learning_rate": 1.99899755378928e-05, + "loss": 0.807, + "step": 135 + }, + { + "epoch": 0.06723519960449882, + "grad_norm": 0.15433938915298703, + "learning_rate": 1.998980047119906e-05, + "loss": 0.8478, + "step": 136 + }, + { + "epoch": 0.06772957607217897, + "grad_norm": 0.19204677057513578, + "learning_rate": 1.998962388981178e-05, + "loss": 0.8096, + "step": 137 + }, + { + "epoch": 0.0682239525398591, + "grad_norm": 0.1604019055960415, + "learning_rate": 1.998944579375772e-05, + "loss": 0.8561, + "step": 138 + }, + { + "epoch": 0.06871832900753924, + "grad_norm": 0.15352883663697808, + "learning_rate": 1.9989266183063897e-05, + "loss": 0.7981, + "step": 139 + }, + { + "epoch": 0.06921270547521938, + "grad_norm": 0.16369769452011582, + "learning_rate": 1.998908505775754e-05, + "loss": 0.7984, + "step": 140 + }, + { + "epoch": 0.06970708194289951, + "grad_norm": 0.14836631328567645, + "learning_rate": 1.9988902417866106e-05, + "loss": 0.8131, + "step": 141 + }, + { + "epoch": 0.07020145841057966, + "grad_norm": 0.15214705198342263, + "learning_rate": 1.99887182634173e-05, + "loss": 0.8231, + "step": 142 + }, + { + "epoch": 0.0706958348782598, + "grad_norm": 0.16083571532456153, + "learning_rate": 1.998853259443903e-05, + "loss": 0.8426, + "step": 143 + }, + { + "epoch": 0.07119021134593993, + "grad_norm": 0.14687006294212476, + "learning_rate": 1.9988345410959457e-05, + "loss": 0.7909, + "step": 144 + }, + { + "epoch": 0.07168458781362007, + "grad_norm": 0.15550615690169492, + "learning_rate": 1.998815671300696e-05, + "loss": 0.787, + "step": 145 + }, + { + "epoch": 0.0721789642813002, + "grad_norm": 0.15464745012497672, + "learning_rate": 1.9987966500610156e-05, + "loss": 0.7888, + "step": 146 + }, + { + "epoch": 0.07267334074898035, + "grad_norm": 0.15918208741258796, + "learning_rate": 1.9987774773797873e-05, + "loss": 0.8392, + "step": 147 + }, + { + "epoch": 0.07316771721666049, + "grad_norm": 0.15149778691159055, + "learning_rate": 1.998758153259919e-05, + "loss": 0.7696, + "step": 148 + }, + { + "epoch": 0.07366209368434062, + "grad_norm": 0.15560378566411742, + "learning_rate": 1.9987386777043407e-05, + "loss": 0.8299, + "step": 149 + }, + { + "epoch": 0.07415647015202076, + "grad_norm": 0.18543015861229437, + "learning_rate": 1.9987190507160052e-05, + "loss": 0.8084, + "step": 150 + }, + { + "epoch": 0.07465084661970091, + "grad_norm": 0.15103373293623032, + "learning_rate": 1.9986992722978882e-05, + "loss": 0.8133, + "step": 151 + }, + { + "epoch": 0.07514522308738104, + "grad_norm": 0.15052888805571182, + "learning_rate": 1.9986793424529895e-05, + "loss": 0.8044, + "step": 152 + }, + { + "epoch": 0.07563959955506118, + "grad_norm": 0.16777076644851557, + "learning_rate": 1.9986592611843293e-05, + "loss": 0.8231, + "step": 153 + }, + { + "epoch": 0.07613397602274131, + "grad_norm": 0.1503300201133093, + "learning_rate": 1.998639028494954e-05, + "loss": 0.8254, + "step": 154 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 0.15417595657938266, + "learning_rate": 1.998618644387931e-05, + "loss": 0.7963, + "step": 155 + }, + { + "epoch": 0.0771227289581016, + "grad_norm": 0.15340802281352123, + "learning_rate": 1.99859810886635e-05, + "loss": 0.816, + "step": 156 + }, + { + "epoch": 0.07761710542578173, + "grad_norm": 0.1609699438348385, + "learning_rate": 1.998577421933326e-05, + "loss": 0.7996, + "step": 157 + }, + { + "epoch": 0.07811148189346187, + "grad_norm": 0.1523894437458277, + "learning_rate": 1.9985565835919948e-05, + "loss": 0.7719, + "step": 158 + }, + { + "epoch": 0.078605858361142, + "grad_norm": 0.1570870983382468, + "learning_rate": 1.998535593845516e-05, + "loss": 0.8527, + "step": 159 + }, + { + "epoch": 0.07910023482882214, + "grad_norm": 0.15219399661586103, + "learning_rate": 1.998514452697073e-05, + "loss": 0.7846, + "step": 160 + }, + { + "epoch": 0.07959461129650229, + "grad_norm": 0.18226648705518994, + "learning_rate": 1.9984931601498703e-05, + "loss": 0.7988, + "step": 161 + }, + { + "epoch": 0.08008898776418243, + "grad_norm": 0.16240358125174703, + "learning_rate": 1.9984717162071367e-05, + "loss": 0.8093, + "step": 162 + }, + { + "epoch": 0.08058336423186256, + "grad_norm": 0.17799477581606468, + "learning_rate": 1.9984501208721242e-05, + "loss": 0.808, + "step": 163 + }, + { + "epoch": 0.0810777406995427, + "grad_norm": 0.14732472203225083, + "learning_rate": 1.998428374148106e-05, + "loss": 0.7412, + "step": 164 + }, + { + "epoch": 0.08157211716722285, + "grad_norm": 0.1836806067833032, + "learning_rate": 1.9984064760383807e-05, + "loss": 0.8164, + "step": 165 + }, + { + "epoch": 0.08206649363490298, + "grad_norm": 0.15351837524448347, + "learning_rate": 1.9983844265462674e-05, + "loss": 0.8045, + "step": 166 + }, + { + "epoch": 0.08256087010258312, + "grad_norm": 0.17838831101071348, + "learning_rate": 1.9983622256751105e-05, + "loss": 0.7964, + "step": 167 + }, + { + "epoch": 0.08305524657026325, + "grad_norm": 0.16163178484728888, + "learning_rate": 1.9983398734282752e-05, + "loss": 0.7932, + "step": 168 + }, + { + "epoch": 0.08354962303794339, + "grad_norm": 0.1741568226080338, + "learning_rate": 1.9983173698091512e-05, + "loss": 0.8235, + "step": 169 + }, + { + "epoch": 0.08404399950562354, + "grad_norm": 0.16682452689507382, + "learning_rate": 1.99829471482115e-05, + "loss": 0.8032, + "step": 170 + }, + { + "epoch": 0.08453837597330367, + "grad_norm": 0.17189915082178667, + "learning_rate": 1.9982719084677077e-05, + "loss": 0.8151, + "step": 171 + }, + { + "epoch": 0.08503275244098381, + "grad_norm": 0.16146928439937805, + "learning_rate": 1.9982489507522813e-05, + "loss": 0.7815, + "step": 172 + }, + { + "epoch": 0.08552712890866394, + "grad_norm": 0.17270871270812269, + "learning_rate": 1.998225841678352e-05, + "loss": 0.7951, + "step": 173 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 0.15796520375436468, + "learning_rate": 1.9982025812494238e-05, + "loss": 0.8046, + "step": 174 + }, + { + "epoch": 0.08651588184402423, + "grad_norm": 0.1491753998012419, + "learning_rate": 1.9981791694690237e-05, + "loss": 0.775, + "step": 175 + }, + { + "epoch": 0.08701025831170436, + "grad_norm": 0.16232276760005576, + "learning_rate": 1.998155606340701e-05, + "loss": 0.8223, + "step": 176 + }, + { + "epoch": 0.0875046347793845, + "grad_norm": 0.15739879973523, + "learning_rate": 1.998131891868029e-05, + "loss": 0.8377, + "step": 177 + }, + { + "epoch": 0.08799901124706463, + "grad_norm": 0.19419733654611354, + "learning_rate": 1.998108026054603e-05, + "loss": 0.8144, + "step": 178 + }, + { + "epoch": 0.08849338771474478, + "grad_norm": 0.14736939157090728, + "learning_rate": 1.9980840089040415e-05, + "loss": 0.7808, + "step": 179 + }, + { + "epoch": 0.08898776418242492, + "grad_norm": 0.14933163464935909, + "learning_rate": 1.9980598404199868e-05, + "loss": 0.7901, + "step": 180 + }, + { + "epoch": 0.08948214065010505, + "grad_norm": 0.15342052574772796, + "learning_rate": 1.9980355206061025e-05, + "loss": 0.8126, + "step": 181 + }, + { + "epoch": 0.08997651711778519, + "grad_norm": 0.15408274258664406, + "learning_rate": 1.9980110494660773e-05, + "loss": 0.7967, + "step": 182 + }, + { + "epoch": 0.09047089358546533, + "grad_norm": 0.15854305719145265, + "learning_rate": 1.99798642700362e-05, + "loss": 0.7881, + "step": 183 + }, + { + "epoch": 0.09096527005314547, + "grad_norm": 0.16008840431102875, + "learning_rate": 1.997961653222465e-05, + "loss": 0.8029, + "step": 184 + }, + { + "epoch": 0.09145964652082561, + "grad_norm": 0.1530160945791843, + "learning_rate": 1.9979367281263684e-05, + "loss": 0.7859, + "step": 185 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 0.15964760493463895, + "learning_rate": 1.9979116517191094e-05, + "loss": 0.821, + "step": 186 + }, + { + "epoch": 0.09244839945618588, + "grad_norm": 0.15091606021369647, + "learning_rate": 1.9978864240044903e-05, + "loss": 0.7858, + "step": 187 + }, + { + "epoch": 0.09294277592386603, + "grad_norm": 0.1497532527772963, + "learning_rate": 1.9978610449863362e-05, + "loss": 0.7964, + "step": 188 + }, + { + "epoch": 0.09343715239154617, + "grad_norm": 0.1472649842479327, + "learning_rate": 1.997835514668495e-05, + "loss": 0.7753, + "step": 189 + }, + { + "epoch": 0.0939315288592263, + "grad_norm": 0.16413251634246817, + "learning_rate": 1.997809833054838e-05, + "loss": 0.812, + "step": 190 + }, + { + "epoch": 0.09442590532690644, + "grad_norm": 0.16845897309575897, + "learning_rate": 1.9977840001492587e-05, + "loss": 0.8317, + "step": 191 + }, + { + "epoch": 0.09492028179458657, + "grad_norm": 0.15383914845252747, + "learning_rate": 1.9977580159556743e-05, + "loss": 0.7862, + "step": 192 + }, + { + "epoch": 0.09541465826226672, + "grad_norm": 0.1669860286810135, + "learning_rate": 1.9977318804780245e-05, + "loss": 0.7825, + "step": 193 + }, + { + "epoch": 0.09590903472994686, + "grad_norm": 0.17261000936572626, + "learning_rate": 1.9977055937202724e-05, + "loss": 0.803, + "step": 194 + }, + { + "epoch": 0.09640341119762699, + "grad_norm": 0.1627112287951544, + "learning_rate": 1.9976791556864034e-05, + "loss": 0.7585, + "step": 195 + }, + { + "epoch": 0.09689778766530713, + "grad_norm": 0.17522569970154203, + "learning_rate": 1.9976525663804257e-05, + "loss": 0.8206, + "step": 196 + }, + { + "epoch": 0.09739216413298726, + "grad_norm": 0.16190617613082328, + "learning_rate": 1.997625825806372e-05, + "loss": 0.7585, + "step": 197 + }, + { + "epoch": 0.09788654060066741, + "grad_norm": 0.16188066150385447, + "learning_rate": 1.997598933968296e-05, + "loss": 0.7866, + "step": 198 + }, + { + "epoch": 0.09838091706834755, + "grad_norm": 0.1780625352627313, + "learning_rate": 1.997571890870275e-05, + "loss": 0.8066, + "step": 199 + }, + { + "epoch": 0.09887529353602768, + "grad_norm": 0.1503712984510588, + "learning_rate": 1.9975446965164104e-05, + "loss": 0.7944, + "step": 200 + }, + { + "epoch": 0.09936967000370782, + "grad_norm": 0.16332698168299012, + "learning_rate": 1.9975173509108242e-05, + "loss": 0.7757, + "step": 201 + }, + { + "epoch": 0.09986404647138797, + "grad_norm": 0.1612316106245206, + "learning_rate": 1.9974898540576636e-05, + "loss": 0.7846, + "step": 202 + }, + { + "epoch": 0.1003584229390681, + "grad_norm": 0.16066258717535623, + "learning_rate": 1.9974622059610974e-05, + "loss": 0.7805, + "step": 203 + }, + { + "epoch": 0.10085279940674824, + "grad_norm": 0.15275477221664785, + "learning_rate": 1.997434406625318e-05, + "loss": 0.8054, + "step": 204 + }, + { + "epoch": 0.10134717587442837, + "grad_norm": 0.15715180413519678, + "learning_rate": 1.9974064560545395e-05, + "loss": 0.8322, + "step": 205 + }, + { + "epoch": 0.10184155234210851, + "grad_norm": 0.1421100691377523, + "learning_rate": 1.9973783542530012e-05, + "loss": 0.7487, + "step": 206 + }, + { + "epoch": 0.10233592880978866, + "grad_norm": 0.1472032250808568, + "learning_rate": 1.9973501012249632e-05, + "loss": 0.799, + "step": 207 + }, + { + "epoch": 0.1028303052774688, + "grad_norm": 0.14913668274539543, + "learning_rate": 1.9973216969747097e-05, + "loss": 0.7706, + "step": 208 + }, + { + "epoch": 0.10332468174514893, + "grad_norm": 0.16049856547179883, + "learning_rate": 1.997293141506547e-05, + "loss": 0.7862, + "step": 209 + }, + { + "epoch": 0.10381905821282907, + "grad_norm": 0.1573600137085301, + "learning_rate": 1.9972644348248055e-05, + "loss": 0.7863, + "step": 210 + }, + { + "epoch": 0.1043134346805092, + "grad_norm": 0.14975781898080223, + "learning_rate": 1.997235576933837e-05, + "loss": 0.7887, + "step": 211 + }, + { + "epoch": 0.10480781114818935, + "grad_norm": 0.17308606257781525, + "learning_rate": 1.997206567838018e-05, + "loss": 0.7942, + "step": 212 + }, + { + "epoch": 0.10530218761586949, + "grad_norm": 0.16715777213186844, + "learning_rate": 1.9971774075417462e-05, + "loss": 0.7993, + "step": 213 + }, + { + "epoch": 0.10579656408354962, + "grad_norm": 0.16223209963091273, + "learning_rate": 1.9971480960494432e-05, + "loss": 0.7831, + "step": 214 + }, + { + "epoch": 0.10629094055122976, + "grad_norm": 0.16256714449855927, + "learning_rate": 1.9971186333655536e-05, + "loss": 0.792, + "step": 215 + }, + { + "epoch": 0.1067853170189099, + "grad_norm": 0.17046749298637948, + "learning_rate": 1.997089019494544e-05, + "loss": 0.7988, + "step": 216 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 0.18004825577680264, + "learning_rate": 1.9970592544409054e-05, + "loss": 0.7978, + "step": 217 + }, + { + "epoch": 0.10777406995427018, + "grad_norm": 0.16469346761429285, + "learning_rate": 1.9970293382091502e-05, + "loss": 0.8323, + "step": 218 + }, + { + "epoch": 0.10826844642195031, + "grad_norm": 0.1644107304434145, + "learning_rate": 1.996999270803815e-05, + "loss": 0.8046, + "step": 219 + }, + { + "epoch": 0.10876282288963045, + "grad_norm": 0.1553534840190301, + "learning_rate": 1.996969052229458e-05, + "loss": 0.7437, + "step": 220 + }, + { + "epoch": 0.1092571993573106, + "grad_norm": 0.16299530563920187, + "learning_rate": 1.996938682490662e-05, + "loss": 0.8054, + "step": 221 + }, + { + "epoch": 0.10975157582499073, + "grad_norm": 0.1648388361042669, + "learning_rate": 1.9969081615920312e-05, + "loss": 0.8197, + "step": 222 + }, + { + "epoch": 0.11024595229267087, + "grad_norm": 0.1716493603565508, + "learning_rate": 1.9968774895381933e-05, + "loss": 0.8033, + "step": 223 + }, + { + "epoch": 0.110740328760351, + "grad_norm": 0.18175450022203402, + "learning_rate": 1.996846666333799e-05, + "loss": 0.819, + "step": 224 + }, + { + "epoch": 0.11123470522803114, + "grad_norm": 0.1876331265141738, + "learning_rate": 1.996815691983522e-05, + "loss": 0.823, + "step": 225 + }, + { + "epoch": 0.11172908169571129, + "grad_norm": 0.167942384487726, + "learning_rate": 1.9967845664920584e-05, + "loss": 0.762, + "step": 226 + }, + { + "epoch": 0.11222345816339142, + "grad_norm": 0.18192411214381216, + "learning_rate": 1.996753289864128e-05, + "loss": 0.7965, + "step": 227 + }, + { + "epoch": 0.11271783463107156, + "grad_norm": 0.15772418301673358, + "learning_rate": 1.996721862104473e-05, + "loss": 0.7908, + "step": 228 + }, + { + "epoch": 0.1132122110987517, + "grad_norm": 0.17826101898144822, + "learning_rate": 1.996690283217858e-05, + "loss": 0.7645, + "step": 229 + }, + { + "epoch": 0.11370658756643184, + "grad_norm": 0.161592025401347, + "learning_rate": 1.9966585532090717e-05, + "loss": 0.7918, + "step": 230 + }, + { + "epoch": 0.11420096403411198, + "grad_norm": 0.1717307799708942, + "learning_rate": 1.9966266720829256e-05, + "loss": 0.7519, + "step": 231 + }, + { + "epoch": 0.11469534050179211, + "grad_norm": 0.15586842821895552, + "learning_rate": 1.9965946398442524e-05, + "loss": 0.7763, + "step": 232 + }, + { + "epoch": 0.11518971696947225, + "grad_norm": 0.17008573987444034, + "learning_rate": 1.9965624564979097e-05, + "loss": 0.8125, + "step": 233 + }, + { + "epoch": 0.11568409343715239, + "grad_norm": 0.16909729779091207, + "learning_rate": 1.9965301220487775e-05, + "loss": 0.7861, + "step": 234 + }, + { + "epoch": 0.11617846990483253, + "grad_norm": 0.16131200671116694, + "learning_rate": 1.996497636501758e-05, + "loss": 0.7581, + "step": 235 + }, + { + "epoch": 0.11667284637251267, + "grad_norm": 0.16184395935947965, + "learning_rate": 1.996464999861777e-05, + "loss": 0.7834, + "step": 236 + }, + { + "epoch": 0.1171672228401928, + "grad_norm": 0.15625101002653255, + "learning_rate": 1.996432212133783e-05, + "loss": 0.7691, + "step": 237 + }, + { + "epoch": 0.11766159930787294, + "grad_norm": 0.15778164108438722, + "learning_rate": 1.9963992733227473e-05, + "loss": 0.7787, + "step": 238 + }, + { + "epoch": 0.11815597577555309, + "grad_norm": 0.15763111734876747, + "learning_rate": 1.996366183433664e-05, + "loss": 0.7859, + "step": 239 + }, + { + "epoch": 0.11865035224323323, + "grad_norm": 0.15902162152773353, + "learning_rate": 1.996332942471551e-05, + "loss": 0.8426, + "step": 240 + }, + { + "epoch": 0.11914472871091336, + "grad_norm": 0.15765682754731522, + "learning_rate": 1.996299550441448e-05, + "loss": 0.7939, + "step": 241 + }, + { + "epoch": 0.1196391051785935, + "grad_norm": 0.16240674007893507, + "learning_rate": 1.996266007348418e-05, + "loss": 0.7501, + "step": 242 + }, + { + "epoch": 0.12013348164627363, + "grad_norm": 0.1543080688838884, + "learning_rate": 1.996232313197547e-05, + "loss": 0.7833, + "step": 243 + }, + { + "epoch": 0.12062785811395378, + "grad_norm": 0.15477014712589826, + "learning_rate": 1.9961984679939438e-05, + "loss": 0.7537, + "step": 244 + }, + { + "epoch": 0.12112223458163392, + "grad_norm": 0.14569955936629844, + "learning_rate": 1.9961644717427405e-05, + "loss": 0.7604, + "step": 245 + }, + { + "epoch": 0.12161661104931405, + "grad_norm": 0.15263126519352369, + "learning_rate": 1.996130324449091e-05, + "loss": 0.7687, + "step": 246 + }, + { + "epoch": 0.12211098751699419, + "grad_norm": 0.1515694650116483, + "learning_rate": 1.9960960261181733e-05, + "loss": 0.7953, + "step": 247 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 0.14575108928090383, + "learning_rate": 1.996061576755188e-05, + "loss": 0.7585, + "step": 248 + }, + { + "epoch": 0.12309974045235447, + "grad_norm": 0.1652983667519365, + "learning_rate": 1.996026976365358e-05, + "loss": 0.7413, + "step": 249 + }, + { + "epoch": 0.12359411692003461, + "grad_norm": 0.15765902822427819, + "learning_rate": 1.9959922249539303e-05, + "loss": 0.8357, + "step": 250 + }, + { + "epoch": 0.12408849338771474, + "grad_norm": 0.16001519835998795, + "learning_rate": 1.995957322526173e-05, + "loss": 0.7685, + "step": 251 + }, + { + "epoch": 0.12458286985539488, + "grad_norm": 0.16118624074916482, + "learning_rate": 1.9959222690873794e-05, + "loss": 0.7629, + "step": 252 + }, + { + "epoch": 0.12507724632307501, + "grad_norm": 0.18530626485056942, + "learning_rate": 1.9958870646428634e-05, + "loss": 0.7497, + "step": 253 + }, + { + "epoch": 0.12557162279075515, + "grad_norm": 0.1590349259816098, + "learning_rate": 1.995851709197963e-05, + "loss": 0.7395, + "step": 254 + }, + { + "epoch": 0.12606599925843529, + "grad_norm": 0.17794492658723735, + "learning_rate": 1.9958162027580396e-05, + "loss": 0.7548, + "step": 255 + }, + { + "epoch": 0.12656037572611545, + "grad_norm": 0.15875077400741042, + "learning_rate": 1.9957805453284763e-05, + "loss": 0.7658, + "step": 256 + }, + { + "epoch": 0.12705475219379558, + "grad_norm": 0.15932563701029331, + "learning_rate": 1.9957447369146792e-05, + "loss": 0.8255, + "step": 257 + }, + { + "epoch": 0.12754912866147572, + "grad_norm": 0.16074231944417772, + "learning_rate": 1.995708777522079e-05, + "loss": 0.7729, + "step": 258 + }, + { + "epoch": 0.12804350512915585, + "grad_norm": 0.1498075698570396, + "learning_rate": 1.995672667156127e-05, + "loss": 0.7824, + "step": 259 + }, + { + "epoch": 0.128537881596836, + "grad_norm": 0.1630845633768196, + "learning_rate": 1.995636405822298e-05, + "loss": 0.8005, + "step": 260 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.16038704125017547, + "learning_rate": 1.9955999935260913e-05, + "loss": 0.7848, + "step": 261 + }, + { + "epoch": 0.12952663453219626, + "grad_norm": 0.15207010305037064, + "learning_rate": 1.995563430273027e-05, + "loss": 0.7345, + "step": 262 + }, + { + "epoch": 0.1300210109998764, + "grad_norm": 0.1586569965771398, + "learning_rate": 1.9955267160686492e-05, + "loss": 0.7784, + "step": 263 + }, + { + "epoch": 0.13051538746755653, + "grad_norm": 0.16599063577652856, + "learning_rate": 1.995489850918525e-05, + "loss": 0.7745, + "step": 264 + }, + { + "epoch": 0.1310097639352367, + "grad_norm": 0.1613956693747982, + "learning_rate": 1.9954528348282435e-05, + "loss": 0.7796, + "step": 265 + }, + { + "epoch": 0.13150414040291683, + "grad_norm": 0.17580482181746948, + "learning_rate": 1.9954156678034176e-05, + "loss": 0.761, + "step": 266 + }, + { + "epoch": 0.13199851687059697, + "grad_norm": 0.16749867961948134, + "learning_rate": 1.9953783498496825e-05, + "loss": 0.7689, + "step": 267 + }, + { + "epoch": 0.1324928933382771, + "grad_norm": 0.1680041467529898, + "learning_rate": 1.995340880972697e-05, + "loss": 0.7456, + "step": 268 + }, + { + "epoch": 0.13298726980595724, + "grad_norm": 0.15959609967004795, + "learning_rate": 1.9953032611781412e-05, + "loss": 0.8048, + "step": 269 + }, + { + "epoch": 0.13348164627363737, + "grad_norm": 0.17914713895333348, + "learning_rate": 1.9952654904717203e-05, + "loss": 0.8649, + "step": 270 + }, + { + "epoch": 0.1339760227413175, + "grad_norm": 0.16320974032334315, + "learning_rate": 1.9952275688591606e-05, + "loss": 0.7979, + "step": 271 + }, + { + "epoch": 0.13447039920899764, + "grad_norm": 0.15276981913361473, + "learning_rate": 1.995189496346212e-05, + "loss": 0.7626, + "step": 272 + }, + { + "epoch": 0.13496477567667778, + "grad_norm": 0.1752515647259904, + "learning_rate": 1.9951512729386474e-05, + "loss": 0.7827, + "step": 273 + }, + { + "epoch": 0.13545915214435794, + "grad_norm": 0.16596011324511997, + "learning_rate": 1.9951128986422623e-05, + "loss": 0.7795, + "step": 274 + }, + { + "epoch": 0.13595352861203808, + "grad_norm": 0.1668372922520809, + "learning_rate": 1.9950743734628754e-05, + "loss": 0.7723, + "step": 275 + }, + { + "epoch": 0.1364479050797182, + "grad_norm": 0.1580226743865861, + "learning_rate": 1.9950356974063272e-05, + "loss": 0.7609, + "step": 276 + }, + { + "epoch": 0.13694228154739835, + "grad_norm": 0.16136089371627765, + "learning_rate": 1.994996870478483e-05, + "loss": 0.7582, + "step": 277 + }, + { + "epoch": 0.13743665801507848, + "grad_norm": 0.17762193774422708, + "learning_rate": 1.9949578926852293e-05, + "loss": 0.7776, + "step": 278 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.1500051114730356, + "learning_rate": 1.994918764032476e-05, + "loss": 0.7712, + "step": 279 + }, + { + "epoch": 0.13842541095043875, + "grad_norm": 0.18891236221762034, + "learning_rate": 1.9948794845261562e-05, + "loss": 0.8013, + "step": 280 + }, + { + "epoch": 0.1389197874181189, + "grad_norm": 0.15030182590448393, + "learning_rate": 1.994840054172226e-05, + "loss": 0.7483, + "step": 281 + }, + { + "epoch": 0.13941416388579903, + "grad_norm": 0.16682849482576528, + "learning_rate": 1.994800472976663e-05, + "loss": 0.7778, + "step": 282 + }, + { + "epoch": 0.1399085403534792, + "grad_norm": 0.16418781459500154, + "learning_rate": 1.994760740945469e-05, + "loss": 0.7747, + "step": 283 + }, + { + "epoch": 0.14040291682115932, + "grad_norm": 0.15846017733531312, + "learning_rate": 1.9947208580846694e-05, + "loss": 0.7947, + "step": 284 + }, + { + "epoch": 0.14089729328883946, + "grad_norm": 0.17690921717406974, + "learning_rate": 1.9946808244003096e-05, + "loss": 0.7589, + "step": 285 + }, + { + "epoch": 0.1413916697565196, + "grad_norm": 0.15652709235162063, + "learning_rate": 1.994640639898461e-05, + "loss": 0.7849, + "step": 286 + }, + { + "epoch": 0.14188604622419973, + "grad_norm": 0.1639249605883931, + "learning_rate": 1.994600304585216e-05, + "loss": 0.7834, + "step": 287 + }, + { + "epoch": 0.14238042269187987, + "grad_norm": 0.1632101671331181, + "learning_rate": 1.9945598184666907e-05, + "loss": 0.7889, + "step": 288 + }, + { + "epoch": 0.14287479915956, + "grad_norm": 0.17007366446524963, + "learning_rate": 1.9945191815490235e-05, + "loss": 0.8109, + "step": 289 + }, + { + "epoch": 0.14336917562724014, + "grad_norm": 0.16859712869086568, + "learning_rate": 1.994478393838376e-05, + "loss": 0.7601, + "step": 290 + }, + { + "epoch": 0.14386355209492027, + "grad_norm": 0.1654209115489427, + "learning_rate": 1.9944374553409326e-05, + "loss": 0.7587, + "step": 291 + }, + { + "epoch": 0.1443579285626004, + "grad_norm": 0.17134997650156245, + "learning_rate": 1.9943963660629008e-05, + "loss": 0.8139, + "step": 292 + }, + { + "epoch": 0.14485230503028057, + "grad_norm": 0.17776806957317903, + "learning_rate": 1.99435512601051e-05, + "loss": 0.7558, + "step": 293 + }, + { + "epoch": 0.1453466814979607, + "grad_norm": 0.16003196441272421, + "learning_rate": 1.9943137351900143e-05, + "loss": 0.7757, + "step": 294 + }, + { + "epoch": 0.14584105796564084, + "grad_norm": 0.1724218295245866, + "learning_rate": 1.9942721936076885e-05, + "loss": 0.7596, + "step": 295 + }, + { + "epoch": 0.14633543443332098, + "grad_norm": 0.19786386420361912, + "learning_rate": 1.994230501269832e-05, + "loss": 0.7622, + "step": 296 + }, + { + "epoch": 0.1468298109010011, + "grad_norm": 0.1508955562202643, + "learning_rate": 1.9941886581827658e-05, + "loss": 0.7746, + "step": 297 + }, + { + "epoch": 0.14732418736868125, + "grad_norm": 0.17393972449384473, + "learning_rate": 1.9941466643528348e-05, + "loss": 0.7681, + "step": 298 + }, + { + "epoch": 0.14781856383636138, + "grad_norm": 0.1607948329148593, + "learning_rate": 1.9941045197864058e-05, + "loss": 0.7697, + "step": 299 + }, + { + "epoch": 0.14831294030404152, + "grad_norm": 0.18264199041354331, + "learning_rate": 1.9940622244898696e-05, + "loss": 0.8343, + "step": 300 + }, + { + "epoch": 0.14880731677172165, + "grad_norm": 0.15762914036314776, + "learning_rate": 1.9940197784696385e-05, + "loss": 0.7894, + "step": 301 + }, + { + "epoch": 0.14930169323940182, + "grad_norm": 0.15764550665571, + "learning_rate": 1.9939771817321484e-05, + "loss": 0.7568, + "step": 302 + }, + { + "epoch": 0.14979606970708195, + "grad_norm": 0.1582593975347385, + "learning_rate": 1.9939344342838585e-05, + "loss": 0.7526, + "step": 303 + }, + { + "epoch": 0.1502904461747621, + "grad_norm": 0.14990069735599407, + "learning_rate": 1.99389153613125e-05, + "loss": 0.7947, + "step": 304 + }, + { + "epoch": 0.15078482264244222, + "grad_norm": 0.16016208199507498, + "learning_rate": 1.9938484872808274e-05, + "loss": 0.784, + "step": 305 + }, + { + "epoch": 0.15127919911012236, + "grad_norm": 0.15615502745151089, + "learning_rate": 1.9938052877391177e-05, + "loss": 0.7651, + "step": 306 + }, + { + "epoch": 0.1517735755778025, + "grad_norm": 0.1691127756493626, + "learning_rate": 1.9937619375126714e-05, + "loss": 0.7751, + "step": 307 + }, + { + "epoch": 0.15226795204548263, + "grad_norm": 0.16023483934476349, + "learning_rate": 1.9937184366080612e-05, + "loss": 0.7697, + "step": 308 + }, + { + "epoch": 0.15276232851316277, + "grad_norm": 0.14497829358579675, + "learning_rate": 1.9936747850318826e-05, + "loss": 0.7228, + "step": 309 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 0.16844992425714525, + "learning_rate": 1.993630982790755e-05, + "loss": 0.7609, + "step": 310 + }, + { + "epoch": 0.15375108144852306, + "grad_norm": 0.1573567972447207, + "learning_rate": 1.993587029891319e-05, + "loss": 0.7848, + "step": 311 + }, + { + "epoch": 0.1542454579162032, + "grad_norm": 0.15953994057342247, + "learning_rate": 1.993542926340239e-05, + "loss": 0.7255, + "step": 312 + }, + { + "epoch": 0.15473983438388333, + "grad_norm": 0.1598014737117235, + "learning_rate": 1.9934986721442027e-05, + "loss": 0.8046, + "step": 313 + }, + { + "epoch": 0.15523421085156347, + "grad_norm": 0.15596156180608833, + "learning_rate": 1.99345426730992e-05, + "loss": 0.8017, + "step": 314 + }, + { + "epoch": 0.1557285873192436, + "grad_norm": 0.1553095799316784, + "learning_rate": 1.9934097118441235e-05, + "loss": 0.7767, + "step": 315 + }, + { + "epoch": 0.15622296378692374, + "grad_norm": 0.16579220721861485, + "learning_rate": 1.9933650057535687e-05, + "loss": 0.7707, + "step": 316 + }, + { + "epoch": 0.15671734025460388, + "grad_norm": 0.16600880020073885, + "learning_rate": 1.9933201490450346e-05, + "loss": 0.7801, + "step": 317 + }, + { + "epoch": 0.157211716722284, + "grad_norm": 0.17549488507684916, + "learning_rate": 1.9932751417253223e-05, + "loss": 0.7988, + "step": 318 + }, + { + "epoch": 0.15770609318996415, + "grad_norm": 0.1509795703752368, + "learning_rate": 1.993229983801256e-05, + "loss": 0.7589, + "step": 319 + }, + { + "epoch": 0.15820046965764428, + "grad_norm": 0.14112252301821995, + "learning_rate": 1.993184675279683e-05, + "loss": 0.7554, + "step": 320 + }, + { + "epoch": 0.15869484612532445, + "grad_norm": 0.15752808412474137, + "learning_rate": 1.993139216167473e-05, + "loss": 0.771, + "step": 321 + }, + { + "epoch": 0.15918922259300458, + "grad_norm": 0.14818013105021138, + "learning_rate": 1.993093606471518e-05, + "loss": 0.7581, + "step": 322 + }, + { + "epoch": 0.15968359906068472, + "grad_norm": 0.15884894769107274, + "learning_rate": 1.9930478461987343e-05, + "loss": 0.7694, + "step": 323 + }, + { + "epoch": 0.16017797552836485, + "grad_norm": 0.14693045480339825, + "learning_rate": 1.9930019353560605e-05, + "loss": 0.787, + "step": 324 + }, + { + "epoch": 0.160672351996045, + "grad_norm": 0.1629191803658917, + "learning_rate": 1.992955873950457e-05, + "loss": 0.7252, + "step": 325 + }, + { + "epoch": 0.16116672846372512, + "grad_norm": 0.1463102550783214, + "learning_rate": 1.992909661988908e-05, + "loss": 0.7681, + "step": 326 + }, + { + "epoch": 0.16166110493140526, + "grad_norm": 0.16903240980901774, + "learning_rate": 1.9928632994784206e-05, + "loss": 0.7554, + "step": 327 + }, + { + "epoch": 0.1621554813990854, + "grad_norm": 0.15671449082922356, + "learning_rate": 1.992816786426025e-05, + "loss": 0.7991, + "step": 328 + }, + { + "epoch": 0.16264985786676553, + "grad_norm": 0.17575665107283295, + "learning_rate": 1.9927701228387725e-05, + "loss": 0.7926, + "step": 329 + }, + { + "epoch": 0.1631442343344457, + "grad_norm": 0.15280489756785276, + "learning_rate": 1.992723308723739e-05, + "loss": 0.7554, + "step": 330 + }, + { + "epoch": 0.16363861080212583, + "grad_norm": 0.16633655042386056, + "learning_rate": 1.9926763440880228e-05, + "loss": 0.7427, + "step": 331 + }, + { + "epoch": 0.16413298726980596, + "grad_norm": 0.17056746127418182, + "learning_rate": 1.992629228938745e-05, + "loss": 0.8064, + "step": 332 + }, + { + "epoch": 0.1646273637374861, + "grad_norm": 0.1632652448207258, + "learning_rate": 1.9925819632830485e-05, + "loss": 0.7673, + "step": 333 + }, + { + "epoch": 0.16512174020516623, + "grad_norm": 0.17325934214591981, + "learning_rate": 1.9925345471281007e-05, + "loss": 0.7708, + "step": 334 + }, + { + "epoch": 0.16561611667284637, + "grad_norm": 0.15570008261645732, + "learning_rate": 1.992486980481091e-05, + "loss": 0.7955, + "step": 335 + }, + { + "epoch": 0.1661104931405265, + "grad_norm": 0.16521564631744493, + "learning_rate": 1.9924392633492316e-05, + "loss": 0.7503, + "step": 336 + }, + { + "epoch": 0.16660486960820664, + "grad_norm": 0.16450627346716434, + "learning_rate": 1.9923913957397572e-05, + "loss": 0.7509, + "step": 337 + }, + { + "epoch": 0.16709924607588678, + "grad_norm": 0.1665768995167728, + "learning_rate": 1.992343377659926e-05, + "loss": 0.8178, + "step": 338 + }, + { + "epoch": 0.16759362254356694, + "grad_norm": 0.17986372617613455, + "learning_rate": 1.9922952091170185e-05, + "loss": 0.8084, + "step": 339 + }, + { + "epoch": 0.16808799901124707, + "grad_norm": 0.15772359024790927, + "learning_rate": 1.9922468901183384e-05, + "loss": 0.7653, + "step": 340 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 0.18840555378362545, + "learning_rate": 1.9921984206712122e-05, + "loss": 0.768, + "step": 341 + }, + { + "epoch": 0.16907675194660735, + "grad_norm": 0.15393110540245067, + "learning_rate": 1.9921498007829885e-05, + "loss": 0.7688, + "step": 342 + }, + { + "epoch": 0.16957112841428748, + "grad_norm": 0.16977368739471244, + "learning_rate": 1.9921010304610397e-05, + "loss": 0.7509, + "step": 343 + }, + { + "epoch": 0.17006550488196762, + "grad_norm": 0.14429062940644777, + "learning_rate": 1.9920521097127602e-05, + "loss": 0.718, + "step": 344 + }, + { + "epoch": 0.17055988134964775, + "grad_norm": 0.1845838059291274, + "learning_rate": 1.9920030385455676e-05, + "loss": 0.7633, + "step": 345 + }, + { + "epoch": 0.1710542578173279, + "grad_norm": 0.1590107402138662, + "learning_rate": 1.991953816966903e-05, + "loss": 0.747, + "step": 346 + }, + { + "epoch": 0.17154863428500802, + "grad_norm": 0.1662983636909797, + "learning_rate": 1.9919044449842285e-05, + "loss": 0.8286, + "step": 347 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 0.16319560180967213, + "learning_rate": 1.9918549226050305e-05, + "loss": 0.7764, + "step": 348 + }, + { + "epoch": 0.17253738722036832, + "grad_norm": 0.1632492023072476, + "learning_rate": 1.991805249836818e-05, + "loss": 0.7493, + "step": 349 + }, + { + "epoch": 0.17303176368804846, + "grad_norm": 0.16017363035648916, + "learning_rate": 1.9917554266871223e-05, + "loss": 0.7741, + "step": 350 + }, + { + "epoch": 0.1735261401557286, + "grad_norm": 0.16417709548781576, + "learning_rate": 1.991705453163498e-05, + "loss": 0.7563, + "step": 351 + }, + { + "epoch": 0.17402051662340873, + "grad_norm": 0.1653559516744988, + "learning_rate": 1.991655329273522e-05, + "loss": 0.7696, + "step": 352 + }, + { + "epoch": 0.17451489309108886, + "grad_norm": 0.15692484678151353, + "learning_rate": 1.9916050550247948e-05, + "loss": 0.7318, + "step": 353 + }, + { + "epoch": 0.175009269558769, + "grad_norm": 0.1865754866080479, + "learning_rate": 1.9915546304249385e-05, + "loss": 0.7887, + "step": 354 + }, + { + "epoch": 0.17550364602644913, + "grad_norm": 0.15121795179861897, + "learning_rate": 1.9915040554815994e-05, + "loss": 0.7731, + "step": 355 + }, + { + "epoch": 0.17599802249412927, + "grad_norm": 0.1694547437728942, + "learning_rate": 1.9914533302024452e-05, + "loss": 0.7656, + "step": 356 + }, + { + "epoch": 0.1764923989618094, + "grad_norm": 0.14977050483995552, + "learning_rate": 1.9914024545951673e-05, + "loss": 0.7338, + "step": 357 + }, + { + "epoch": 0.17698677542948957, + "grad_norm": 0.16277723603113248, + "learning_rate": 1.99135142866748e-05, + "loss": 0.7562, + "step": 358 + }, + { + "epoch": 0.1774811518971697, + "grad_norm": 0.15530495113988851, + "learning_rate": 1.9913002524271198e-05, + "loss": 0.767, + "step": 359 + }, + { + "epoch": 0.17797552836484984, + "grad_norm": 0.1532519208124734, + "learning_rate": 1.9912489258818462e-05, + "loss": 0.7339, + "step": 360 + }, + { + "epoch": 0.17846990483252997, + "grad_norm": 0.16344340890681727, + "learning_rate": 1.9911974490394415e-05, + "loss": 0.7923, + "step": 361 + }, + { + "epoch": 0.1789642813002101, + "grad_norm": 0.15667006127908445, + "learning_rate": 1.991145821907711e-05, + "loss": 0.7852, + "step": 362 + }, + { + "epoch": 0.17945865776789025, + "grad_norm": 0.1548799199935257, + "learning_rate": 1.9910940444944824e-05, + "loss": 0.7734, + "step": 363 + }, + { + "epoch": 0.17995303423557038, + "grad_norm": 0.1495225713811733, + "learning_rate": 1.9910421168076066e-05, + "loss": 0.7641, + "step": 364 + }, + { + "epoch": 0.18044741070325052, + "grad_norm": 0.16450229153271412, + "learning_rate": 1.990990038854957e-05, + "loss": 0.7521, + "step": 365 + }, + { + "epoch": 0.18094178717093065, + "grad_norm": 0.15044005869879076, + "learning_rate": 1.9909378106444295e-05, + "loss": 0.724, + "step": 366 + }, + { + "epoch": 0.18143616363861081, + "grad_norm": 0.1614792933855538, + "learning_rate": 1.990885432183944e-05, + "loss": 0.7577, + "step": 367 + }, + { + "epoch": 0.18193054010629095, + "grad_norm": 0.15853550384147724, + "learning_rate": 1.9908329034814416e-05, + "loss": 0.7712, + "step": 368 + }, + { + "epoch": 0.18242491657397109, + "grad_norm": 0.1582671318913434, + "learning_rate": 1.9907802245448876e-05, + "loss": 0.7375, + "step": 369 + }, + { + "epoch": 0.18291929304165122, + "grad_norm": 0.15094208111817753, + "learning_rate": 1.9907273953822685e-05, + "loss": 0.7789, + "step": 370 + }, + { + "epoch": 0.18341366950933136, + "grad_norm": 0.1680069862293419, + "learning_rate": 1.990674416001595e-05, + "loss": 0.7845, + "step": 371 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 0.15342066146698044, + "learning_rate": 1.9906212864109e-05, + "loss": 0.7349, + "step": 372 + }, + { + "epoch": 0.18440242244469163, + "grad_norm": 0.15656525630487692, + "learning_rate": 1.990568006618239e-05, + "loss": 0.7584, + "step": 373 + }, + { + "epoch": 0.18489679891237176, + "grad_norm": 0.15713508545978713, + "learning_rate": 1.990514576631691e-05, + "loss": 0.7513, + "step": 374 + }, + { + "epoch": 0.1853911753800519, + "grad_norm": 0.16001510165472016, + "learning_rate": 1.990460996459357e-05, + "loss": 0.8025, + "step": 375 + }, + { + "epoch": 0.18588555184773206, + "grad_norm": 0.1528099495517522, + "learning_rate": 1.9904072661093608e-05, + "loss": 0.7899, + "step": 376 + }, + { + "epoch": 0.1863799283154122, + "grad_norm": 0.1672612201216499, + "learning_rate": 1.9903533855898493e-05, + "loss": 0.774, + "step": 377 + }, + { + "epoch": 0.18687430478309233, + "grad_norm": 0.16592515057803653, + "learning_rate": 1.9902993549089924e-05, + "loss": 0.7603, + "step": 378 + }, + { + "epoch": 0.18736868125077247, + "grad_norm": 0.15465022097422024, + "learning_rate": 1.990245174074982e-05, + "loss": 0.8009, + "step": 379 + }, + { + "epoch": 0.1878630577184526, + "grad_norm": 0.15202732797761875, + "learning_rate": 1.9901908430960337e-05, + "loss": 0.7623, + "step": 380 + }, + { + "epoch": 0.18835743418613274, + "grad_norm": 0.16306772500848446, + "learning_rate": 1.990136361980385e-05, + "loss": 0.766, + "step": 381 + }, + { + "epoch": 0.18885181065381287, + "grad_norm": 0.15686037406632403, + "learning_rate": 1.9900817307362965e-05, + "loss": 0.7341, + "step": 382 + }, + { + "epoch": 0.189346187121493, + "grad_norm": 0.1594134255628027, + "learning_rate": 1.990026949372052e-05, + "loss": 0.7399, + "step": 383 + }, + { + "epoch": 0.18984056358917314, + "grad_norm": 0.1590748632670886, + "learning_rate": 1.9899720178959576e-05, + "loss": 0.7601, + "step": 384 + }, + { + "epoch": 0.19033494005685328, + "grad_norm": 0.14860582385424892, + "learning_rate": 1.989916936316342e-05, + "loss": 0.7312, + "step": 385 + }, + { + "epoch": 0.19082931652453344, + "grad_norm": 0.1540454454421375, + "learning_rate": 1.989861704641557e-05, + "loss": 0.7464, + "step": 386 + }, + { + "epoch": 0.19132369299221358, + "grad_norm": 0.17013415497806023, + "learning_rate": 1.9898063228799764e-05, + "loss": 0.7893, + "step": 387 + }, + { + "epoch": 0.19181806945989371, + "grad_norm": 0.1528596262579037, + "learning_rate": 1.9897507910399987e-05, + "loss": 0.7306, + "step": 388 + }, + { + "epoch": 0.19231244592757385, + "grad_norm": 0.15878116624187036, + "learning_rate": 1.989695109130043e-05, + "loss": 0.8099, + "step": 389 + }, + { + "epoch": 0.19280682239525399, + "grad_norm": 0.14771955632782138, + "learning_rate": 1.9896392771585523e-05, + "loss": 0.7837, + "step": 390 + }, + { + "epoch": 0.19330119886293412, + "grad_norm": 0.17543954428316116, + "learning_rate": 1.9895832951339916e-05, + "loss": 0.7654, + "step": 391 + }, + { + "epoch": 0.19379557533061426, + "grad_norm": 0.14822041938819253, + "learning_rate": 1.9895271630648497e-05, + "loss": 0.7292, + "step": 392 + }, + { + "epoch": 0.1942899517982944, + "grad_norm": 0.14894336893305488, + "learning_rate": 1.9894708809596374e-05, + "loss": 0.7517, + "step": 393 + }, + { + "epoch": 0.19478432826597453, + "grad_norm": 0.1557974319841423, + "learning_rate": 1.9894144488268883e-05, + "loss": 0.7676, + "step": 394 + }, + { + "epoch": 0.1952787047336547, + "grad_norm": 0.17858362662223468, + "learning_rate": 1.989357866675159e-05, + "loss": 0.8084, + "step": 395 + }, + { + "epoch": 0.19577308120133483, + "grad_norm": 0.1618716729927282, + "learning_rate": 1.9893011345130287e-05, + "loss": 0.7249, + "step": 396 + }, + { + "epoch": 0.19626745766901496, + "grad_norm": 0.15578368948717486, + "learning_rate": 1.9892442523490994e-05, + "loss": 0.7849, + "step": 397 + }, + { + "epoch": 0.1967618341366951, + "grad_norm": 0.15679157053844428, + "learning_rate": 1.9891872201919954e-05, + "loss": 0.7509, + "step": 398 + }, + { + "epoch": 0.19725621060437523, + "grad_norm": 0.15898977385427362, + "learning_rate": 1.9891300380503646e-05, + "loss": 0.732, + "step": 399 + }, + { + "epoch": 0.19775058707205537, + "grad_norm": 0.16088388769307882, + "learning_rate": 1.989072705932877e-05, + "loss": 0.8112, + "step": 400 + }, + { + "epoch": 0.1982449635397355, + "grad_norm": 0.15155523260613368, + "learning_rate": 1.9890152238482255e-05, + "loss": 0.7707, + "step": 401 + }, + { + "epoch": 0.19873934000741564, + "grad_norm": 0.16789065279741447, + "learning_rate": 1.988957591805126e-05, + "loss": 0.7364, + "step": 402 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 0.15335851320951321, + "learning_rate": 1.9888998098123166e-05, + "loss": 0.7537, + "step": 403 + }, + { + "epoch": 0.19972809294277594, + "grad_norm": 0.16107185517504202, + "learning_rate": 1.988841877878559e-05, + "loss": 0.7767, + "step": 404 + }, + { + "epoch": 0.20022246941045607, + "grad_norm": 0.17231291757481734, + "learning_rate": 1.9887837960126358e-05, + "loss": 0.738, + "step": 405 + }, + { + "epoch": 0.2007168458781362, + "grad_norm": 0.16120132228819362, + "learning_rate": 1.988725564223355e-05, + "loss": 0.7703, + "step": 406 + }, + { + "epoch": 0.20121122234581634, + "grad_norm": 0.1652667385657612, + "learning_rate": 1.9886671825195453e-05, + "loss": 0.713, + "step": 407 + }, + { + "epoch": 0.20170559881349648, + "grad_norm": 0.14445138397218468, + "learning_rate": 1.9886086509100585e-05, + "loss": 0.7225, + "step": 408 + }, + { + "epoch": 0.2021999752811766, + "grad_norm": 0.156621504920077, + "learning_rate": 1.98854996940377e-05, + "loss": 0.7731, + "step": 409 + }, + { + "epoch": 0.20269435174885675, + "grad_norm": 0.16739171360470884, + "learning_rate": 1.9884911380095772e-05, + "loss": 0.7399, + "step": 410 + }, + { + "epoch": 0.20318872821653688, + "grad_norm": 0.1574095634427135, + "learning_rate": 1.9884321567364003e-05, + "loss": 0.7376, + "step": 411 + }, + { + "epoch": 0.20368310468421702, + "grad_norm": 0.16195793355274377, + "learning_rate": 1.9883730255931818e-05, + "loss": 0.7012, + "step": 412 + }, + { + "epoch": 0.20417748115189718, + "grad_norm": 0.1727668763074567, + "learning_rate": 1.988313744588888e-05, + "loss": 0.7638, + "step": 413 + }, + { + "epoch": 0.20467185761957732, + "grad_norm": 0.15579832376809857, + "learning_rate": 1.9882543137325073e-05, + "loss": 0.7592, + "step": 414 + }, + { + "epoch": 0.20516623408725745, + "grad_norm": 0.19300886121945698, + "learning_rate": 1.9881947330330505e-05, + "loss": 0.7888, + "step": 415 + }, + { + "epoch": 0.2056606105549376, + "grad_norm": 0.22389546138212932, + "learning_rate": 1.9881350024995514e-05, + "loss": 0.7525, + "step": 416 + }, + { + "epoch": 0.20615498702261773, + "grad_norm": 0.16199425462341413, + "learning_rate": 1.9880751221410672e-05, + "loss": 0.7749, + "step": 417 + }, + { + "epoch": 0.20664936349029786, + "grad_norm": 0.1681829076053104, + "learning_rate": 1.9880150919666764e-05, + "loss": 0.7813, + "step": 418 + }, + { + "epoch": 0.207143739957978, + "grad_norm": 0.17000642505133612, + "learning_rate": 1.987954911985482e-05, + "loss": 0.7658, + "step": 419 + }, + { + "epoch": 0.20763811642565813, + "grad_norm": 0.1592014105826203, + "learning_rate": 1.987894582206608e-05, + "loss": 0.7122, + "step": 420 + }, + { + "epoch": 0.20813249289333827, + "grad_norm": 0.16740267394053293, + "learning_rate": 1.9878341026392016e-05, + "loss": 0.7749, + "step": 421 + }, + { + "epoch": 0.2086268693610184, + "grad_norm": 0.15489270518741574, + "learning_rate": 1.9877734732924335e-05, + "loss": 0.7462, + "step": 422 + }, + { + "epoch": 0.20912124582869857, + "grad_norm": 0.14760335792359697, + "learning_rate": 1.9877126941754966e-05, + "loss": 0.7687, + "step": 423 + }, + { + "epoch": 0.2096156222963787, + "grad_norm": 0.2271361501655256, + "learning_rate": 1.987651765297606e-05, + "loss": 0.8042, + "step": 424 + }, + { + "epoch": 0.21010999876405884, + "grad_norm": 0.16515229120879366, + "learning_rate": 1.9875906866680002e-05, + "loss": 0.8078, + "step": 425 + }, + { + "epoch": 0.21060437523173897, + "grad_norm": 0.17098101958163198, + "learning_rate": 1.9875294582959407e-05, + "loss": 0.7471, + "step": 426 + }, + { + "epoch": 0.2110987516994191, + "grad_norm": 0.15443294773580707, + "learning_rate": 1.9874680801907108e-05, + "loss": 0.7824, + "step": 427 + }, + { + "epoch": 0.21159312816709924, + "grad_norm": 0.1641571275335793, + "learning_rate": 1.9874065523616165e-05, + "loss": 0.7461, + "step": 428 + }, + { + "epoch": 0.21208750463477938, + "grad_norm": 0.16296439308382232, + "learning_rate": 1.9873448748179872e-05, + "loss": 0.7615, + "step": 429 + }, + { + "epoch": 0.2125818811024595, + "grad_norm": 0.16280219336129217, + "learning_rate": 1.9872830475691747e-05, + "loss": 0.7806, + "step": 430 + }, + { + "epoch": 0.21307625757013965, + "grad_norm": 0.17397592154535496, + "learning_rate": 1.9872210706245538e-05, + "loss": 0.7895, + "step": 431 + }, + { + "epoch": 0.2135706340378198, + "grad_norm": 0.15481512185377816, + "learning_rate": 1.9871589439935212e-05, + "loss": 0.7699, + "step": 432 + }, + { + "epoch": 0.21406501050549995, + "grad_norm": 0.1937263734172744, + "learning_rate": 1.9870966676854972e-05, + "loss": 0.7408, + "step": 433 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 0.1672233491775638, + "learning_rate": 1.9870342417099244e-05, + "loss": 0.7645, + "step": 434 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 0.39100991180510636, + "learning_rate": 1.986971666076267e-05, + "loss": 0.7685, + "step": 435 + }, + { + "epoch": 0.21554813990854035, + "grad_norm": 0.18760852590247817, + "learning_rate": 1.9869089407940147e-05, + "loss": 0.7481, + "step": 436 + }, + { + "epoch": 0.2160425163762205, + "grad_norm": 0.16396450513940547, + "learning_rate": 1.986846065872677e-05, + "loss": 0.7626, + "step": 437 + }, + { + "epoch": 0.21653689284390062, + "grad_norm": 0.18676362918593256, + "learning_rate": 1.9867830413217876e-05, + "loss": 0.7795, + "step": 438 + }, + { + "epoch": 0.21703126931158076, + "grad_norm": 0.17428138210378466, + "learning_rate": 1.986719867150902e-05, + "loss": 0.7568, + "step": 439 + }, + { + "epoch": 0.2175256457792609, + "grad_norm": 0.15646082453784152, + "learning_rate": 1.9866565433696002e-05, + "loss": 0.7428, + "step": 440 + }, + { + "epoch": 0.21802002224694106, + "grad_norm": 0.1852279377536713, + "learning_rate": 1.9865930699874824e-05, + "loss": 0.7315, + "step": 441 + }, + { + "epoch": 0.2185143987146212, + "grad_norm": 0.16170054021859556, + "learning_rate": 1.9865294470141732e-05, + "loss": 0.7199, + "step": 442 + }, + { + "epoch": 0.21900877518230133, + "grad_norm": 0.16935438756459387, + "learning_rate": 1.9864656744593192e-05, + "loss": 0.7481, + "step": 443 + }, + { + "epoch": 0.21950315164998146, + "grad_norm": 0.17566011635976778, + "learning_rate": 1.9864017523325898e-05, + "loss": 0.7665, + "step": 444 + }, + { + "epoch": 0.2199975281176616, + "grad_norm": 0.16916745208700498, + "learning_rate": 1.9863376806436774e-05, + "loss": 0.7875, + "step": 445 + }, + { + "epoch": 0.22049190458534174, + "grad_norm": 0.1801079431562792, + "learning_rate": 1.9862734594022964e-05, + "loss": 0.78, + "step": 446 + }, + { + "epoch": 0.22098628105302187, + "grad_norm": 0.15800120533413017, + "learning_rate": 1.9862090886181845e-05, + "loss": 0.7523, + "step": 447 + }, + { + "epoch": 0.221480657520702, + "grad_norm": 0.16166648515258064, + "learning_rate": 1.9861445683011023e-05, + "loss": 0.753, + "step": 448 + }, + { + "epoch": 0.22197503398838214, + "grad_norm": 0.17553882747833197, + "learning_rate": 1.986079898460832e-05, + "loss": 0.7368, + "step": 449 + }, + { + "epoch": 0.22246941045606228, + "grad_norm": 0.16322576585144302, + "learning_rate": 1.9860150791071794e-05, + "loss": 0.7316, + "step": 450 + }, + { + "epoch": 0.22296378692374244, + "grad_norm": 0.15379750237539247, + "learning_rate": 1.9859501102499722e-05, + "loss": 0.726, + "step": 451 + }, + { + "epoch": 0.22345816339142258, + "grad_norm": 0.16433878868977853, + "learning_rate": 1.985884991899062e-05, + "loss": 0.7276, + "step": 452 + }, + { + "epoch": 0.2239525398591027, + "grad_norm": 0.16904302940196136, + "learning_rate": 1.985819724064322e-05, + "loss": 0.7545, + "step": 453 + }, + { + "epoch": 0.22444691632678285, + "grad_norm": 0.15997168019678742, + "learning_rate": 1.9857543067556483e-05, + "loss": 0.7397, + "step": 454 + }, + { + "epoch": 0.22494129279446298, + "grad_norm": 0.1692685185002491, + "learning_rate": 1.9856887399829594e-05, + "loss": 0.7499, + "step": 455 + }, + { + "epoch": 0.22543566926214312, + "grad_norm": 0.1546984519117549, + "learning_rate": 1.9856230237561974e-05, + "loss": 0.7924, + "step": 456 + }, + { + "epoch": 0.22593004572982325, + "grad_norm": 0.15733417612250106, + "learning_rate": 1.9855571580853258e-05, + "loss": 0.7486, + "step": 457 + }, + { + "epoch": 0.2264244221975034, + "grad_norm": 0.15692318019944637, + "learning_rate": 1.9854911429803324e-05, + "loss": 0.7693, + "step": 458 + }, + { + "epoch": 0.22691879866518352, + "grad_norm": 0.15592431898518358, + "learning_rate": 1.9854249784512257e-05, + "loss": 0.802, + "step": 459 + }, + { + "epoch": 0.2274131751328637, + "grad_norm": 0.15707924363508294, + "learning_rate": 1.985358664508038e-05, + "loss": 0.7213, + "step": 460 + }, + { + "epoch": 0.22790755160054382, + "grad_norm": 0.15483808610088584, + "learning_rate": 1.9852922011608245e-05, + "loss": 0.7176, + "step": 461 + }, + { + "epoch": 0.22840192806822396, + "grad_norm": 0.15900273550674024, + "learning_rate": 1.9852255884196626e-05, + "loss": 0.7462, + "step": 462 + }, + { + "epoch": 0.2288963045359041, + "grad_norm": 0.15995162168614985, + "learning_rate": 1.985158826294652e-05, + "loss": 0.7667, + "step": 463 + }, + { + "epoch": 0.22939068100358423, + "grad_norm": 0.15531582336848132, + "learning_rate": 1.9850919147959158e-05, + "loss": 0.7703, + "step": 464 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 0.15616519054631905, + "learning_rate": 1.985024853933599e-05, + "loss": 0.7886, + "step": 465 + }, + { + "epoch": 0.2303794339389445, + "grad_norm": 0.15506647661665723, + "learning_rate": 1.9849576437178703e-05, + "loss": 0.7491, + "step": 466 + }, + { + "epoch": 0.23087381040662464, + "grad_norm": 0.15458463065616482, + "learning_rate": 1.98489028415892e-05, + "loss": 0.7813, + "step": 467 + }, + { + "epoch": 0.23136818687430477, + "grad_norm": 0.16490507181700448, + "learning_rate": 1.9848227752669612e-05, + "loss": 0.7581, + "step": 468 + }, + { + "epoch": 0.23186256334198493, + "grad_norm": 0.15739543452056817, + "learning_rate": 1.98475511705223e-05, + "loss": 0.7663, + "step": 469 + }, + { + "epoch": 0.23235693980966507, + "grad_norm": 0.15061278567775893, + "learning_rate": 1.984687309524985e-05, + "loss": 0.7307, + "step": 470 + }, + { + "epoch": 0.2328513162773452, + "grad_norm": 0.148585499052303, + "learning_rate": 1.9846193526955074e-05, + "loss": 0.7425, + "step": 471 + }, + { + "epoch": 0.23334569274502534, + "grad_norm": 0.17439306507664734, + "learning_rate": 1.9845512465741016e-05, + "loss": 0.7515, + "step": 472 + }, + { + "epoch": 0.23384006921270548, + "grad_norm": 0.157820765961761, + "learning_rate": 1.9844829911710936e-05, + "loss": 0.7546, + "step": 473 + }, + { + "epoch": 0.2343344456803856, + "grad_norm": 0.17613722388997496, + "learning_rate": 1.9844145864968326e-05, + "loss": 0.7168, + "step": 474 + }, + { + "epoch": 0.23482882214806575, + "grad_norm": 0.16588033515748565, + "learning_rate": 1.9843460325616908e-05, + "loss": 0.7762, + "step": 475 + }, + { + "epoch": 0.23532319861574588, + "grad_norm": 0.15915699448445758, + "learning_rate": 1.984277329376062e-05, + "loss": 0.7535, + "step": 476 + }, + { + "epoch": 0.23581757508342602, + "grad_norm": 0.16695139208290058, + "learning_rate": 1.9842084769503636e-05, + "loss": 0.7674, + "step": 477 + }, + { + "epoch": 0.23631195155110618, + "grad_norm": 0.15083246765801872, + "learning_rate": 1.984139475295035e-05, + "loss": 0.7528, + "step": 478 + }, + { + "epoch": 0.23680632801878632, + "grad_norm": 0.1572598942190533, + "learning_rate": 1.9840703244205392e-05, + "loss": 0.7263, + "step": 479 + }, + { + "epoch": 0.23730070448646645, + "grad_norm": 0.16286656664202295, + "learning_rate": 1.9840010243373603e-05, + "loss": 0.7945, + "step": 480 + }, + { + "epoch": 0.2377950809541466, + "grad_norm": 0.14771636903962843, + "learning_rate": 1.9839315750560068e-05, + "loss": 0.7462, + "step": 481 + }, + { + "epoch": 0.23828945742182672, + "grad_norm": 0.17136611244331051, + "learning_rate": 1.9838619765870076e-05, + "loss": 0.7647, + "step": 482 + }, + { + "epoch": 0.23878383388950686, + "grad_norm": 0.17439186580666652, + "learning_rate": 1.9837922289409164e-05, + "loss": 0.7719, + "step": 483 + }, + { + "epoch": 0.239278210357187, + "grad_norm": 0.15230780143904768, + "learning_rate": 1.9837223321283087e-05, + "loss": 0.731, + "step": 484 + }, + { + "epoch": 0.23977258682486713, + "grad_norm": 0.1597195773733112, + "learning_rate": 1.983652286159782e-05, + "loss": 0.7357, + "step": 485 + }, + { + "epoch": 0.24026696329254726, + "grad_norm": 0.15772895341194218, + "learning_rate": 1.9835820910459573e-05, + "loss": 0.7232, + "step": 486 + }, + { + "epoch": 0.2407613397602274, + "grad_norm": 0.15704892962266292, + "learning_rate": 1.983511746797478e-05, + "loss": 0.7392, + "step": 487 + }, + { + "epoch": 0.24125571622790756, + "grad_norm": 0.15114854574649253, + "learning_rate": 1.983441253425009e-05, + "loss": 0.753, + "step": 488 + }, + { + "epoch": 0.2417500926955877, + "grad_norm": 0.1612994471131408, + "learning_rate": 1.9833706109392404e-05, + "loss": 0.7158, + "step": 489 + }, + { + "epoch": 0.24224446916326783, + "grad_norm": 0.15257457657027904, + "learning_rate": 1.983299819350882e-05, + "loss": 0.7795, + "step": 490 + }, + { + "epoch": 0.24273884563094797, + "grad_norm": 0.1524513701561887, + "learning_rate": 1.983228878670668e-05, + "loss": 0.7283, + "step": 491 + }, + { + "epoch": 0.2432332220986281, + "grad_norm": 0.16476343050195588, + "learning_rate": 1.9831577889093546e-05, + "loss": 0.7543, + "step": 492 + }, + { + "epoch": 0.24372759856630824, + "grad_norm": 0.16145584518897285, + "learning_rate": 1.983086550077721e-05, + "loss": 0.7723, + "step": 493 + }, + { + "epoch": 0.24422197503398838, + "grad_norm": 0.15721959698876836, + "learning_rate": 1.9830151621865682e-05, + "loss": 0.7364, + "step": 494 + }, + { + "epoch": 0.2447163515016685, + "grad_norm": 0.16437850117112732, + "learning_rate": 1.9829436252467208e-05, + "loss": 0.7602, + "step": 495 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 0.15503474540401552, + "learning_rate": 1.9828719392690252e-05, + "loss": 0.7246, + "step": 496 + }, + { + "epoch": 0.2457051044370288, + "grad_norm": 0.14974532260398285, + "learning_rate": 1.982800104264351e-05, + "loss": 0.7359, + "step": 497 + }, + { + "epoch": 0.24619948090470894, + "grad_norm": 0.15447979020823988, + "learning_rate": 1.9827281202435898e-05, + "loss": 0.7476, + "step": 498 + }, + { + "epoch": 0.24669385737238908, + "grad_norm": 0.1502279498524954, + "learning_rate": 1.982655987217656e-05, + "loss": 0.7509, + "step": 499 + }, + { + "epoch": 0.24718823384006922, + "grad_norm": 0.15672893225963178, + "learning_rate": 1.9825837051974874e-05, + "loss": 0.7491, + "step": 500 + }, + { + "epoch": 0.24768261030774935, + "grad_norm": 0.1494085870070571, + "learning_rate": 1.982511274194043e-05, + "loss": 0.7229, + "step": 501 + }, + { + "epoch": 0.2481769867754295, + "grad_norm": 0.1669440874032149, + "learning_rate": 1.9824386942183053e-05, + "loss": 0.7138, + "step": 502 + }, + { + "epoch": 0.24867136324310962, + "grad_norm": 0.16868837689993962, + "learning_rate": 1.982365965281279e-05, + "loss": 0.7522, + "step": 503 + }, + { + "epoch": 0.24916573971078976, + "grad_norm": 0.157411187771111, + "learning_rate": 1.9822930873939923e-05, + "loss": 0.7426, + "step": 504 + }, + { + "epoch": 0.2496601161784699, + "grad_norm": 0.1568770133833736, + "learning_rate": 1.9822200605674942e-05, + "loss": 0.7564, + "step": 505 + }, + { + "epoch": 0.25015449264615003, + "grad_norm": 0.1638246709652877, + "learning_rate": 1.982146884812858e-05, + "loss": 0.7579, + "step": 506 + }, + { + "epoch": 0.25015449264615003, + "eval_loss": 0.7476502656936646, + "eval_runtime": 81.8383, + "eval_samples_per_second": 370.902, + "eval_steps_per_second": 46.372, + "step": 506 + }, + { + "epoch": 0.25064886911383016, + "grad_norm": 0.1615893413283426, + "learning_rate": 1.9820735601411787e-05, + "loss": 0.7249, + "step": 507 + }, + { + "epoch": 0.2511432455815103, + "grad_norm": 0.16314119344554448, + "learning_rate": 1.982000086563574e-05, + "loss": 0.7472, + "step": 508 + }, + { + "epoch": 0.25163762204919043, + "grad_norm": 0.16153945138060458, + "learning_rate": 1.981926464091184e-05, + "loss": 0.7077, + "step": 509 + }, + { + "epoch": 0.25213199851687057, + "grad_norm": 0.15585018715100976, + "learning_rate": 1.9818526927351723e-05, + "loss": 0.7446, + "step": 510 + }, + { + "epoch": 0.25262637498455076, + "grad_norm": 0.1645045096720242, + "learning_rate": 1.981778772506724e-05, + "loss": 0.7486, + "step": 511 + }, + { + "epoch": 0.2531207514522309, + "grad_norm": 0.1585640216016322, + "learning_rate": 1.9817047034170477e-05, + "loss": 0.7603, + "step": 512 + }, + { + "epoch": 0.25361512791991103, + "grad_norm": 0.15732272315123227, + "learning_rate": 1.981630485477373e-05, + "loss": 0.7422, + "step": 513 + }, + { + "epoch": 0.25410950438759117, + "grad_norm": 0.16585932701145212, + "learning_rate": 1.9815561186989537e-05, + "loss": 0.7766, + "step": 514 + }, + { + "epoch": 0.2546038808552713, + "grad_norm": 0.1481352585253545, + "learning_rate": 1.981481603093066e-05, + "loss": 0.7174, + "step": 515 + }, + { + "epoch": 0.25509825732295144, + "grad_norm": 0.4024908735382987, + "learning_rate": 1.9814069386710076e-05, + "loss": 0.7813, + "step": 516 + }, + { + "epoch": 0.2555926337906316, + "grad_norm": 0.1722561503580983, + "learning_rate": 1.9813321254441e-05, + "loss": 0.7397, + "step": 517 + }, + { + "epoch": 0.2560870102583117, + "grad_norm": 0.17626939005009176, + "learning_rate": 1.9812571634236863e-05, + "loss": 0.762, + "step": 518 + }, + { + "epoch": 0.25658138672599184, + "grad_norm": 0.7628933972239496, + "learning_rate": 1.981182052621132e-05, + "loss": 0.8403, + "step": 519 + }, + { + "epoch": 0.257075763193672, + "grad_norm": 0.17154948209309154, + "learning_rate": 1.9811067930478266e-05, + "loss": 0.7318, + "step": 520 + }, + { + "epoch": 0.2575701396613521, + "grad_norm": 0.16170475999588163, + "learning_rate": 1.9810313847151814e-05, + "loss": 0.7406, + "step": 521 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.17699240340205943, + "learning_rate": 1.9809558276346294e-05, + "loss": 0.7412, + "step": 522 + }, + { + "epoch": 0.2585588925967124, + "grad_norm": 0.15852743327379967, + "learning_rate": 1.9808801218176272e-05, + "loss": 0.7237, + "step": 523 + }, + { + "epoch": 0.2590532690643925, + "grad_norm": 0.17592852933762576, + "learning_rate": 1.9808042672756534e-05, + "loss": 0.7033, + "step": 524 + }, + { + "epoch": 0.25954764553207266, + "grad_norm": 0.17854164951854123, + "learning_rate": 1.9807282640202098e-05, + "loss": 0.776, + "step": 525 + }, + { + "epoch": 0.2600420219997528, + "grad_norm": 0.1685587009519555, + "learning_rate": 1.9806521120628196e-05, + "loss": 0.7923, + "step": 526 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 0.1653341403107767, + "learning_rate": 1.9805758114150305e-05, + "loss": 0.7747, + "step": 527 + }, + { + "epoch": 0.26103077493511306, + "grad_norm": 0.16756698518640073, + "learning_rate": 1.98049936208841e-05, + "loss": 0.7409, + "step": 528 + }, + { + "epoch": 0.26152515140279325, + "grad_norm": 0.44202902535289557, + "learning_rate": 1.9804227640945508e-05, + "loss": 0.7409, + "step": 529 + }, + { + "epoch": 0.2620195278704734, + "grad_norm": 0.18559986532248332, + "learning_rate": 1.9803460174450662e-05, + "loss": 0.7196, + "step": 530 + }, + { + "epoch": 0.2625139043381535, + "grad_norm": 0.171852498642609, + "learning_rate": 1.9802691221515936e-05, + "loss": 0.7721, + "step": 531 + }, + { + "epoch": 0.26300828080583366, + "grad_norm": 0.1738125887993007, + "learning_rate": 1.9801920782257914e-05, + "loss": 0.7203, + "step": 532 + }, + { + "epoch": 0.2635026572735138, + "grad_norm": 0.16733373975596014, + "learning_rate": 1.980114885679342e-05, + "loss": 0.745, + "step": 533 + }, + { + "epoch": 0.26399703374119393, + "grad_norm": 0.16822840510967285, + "learning_rate": 1.9800375445239493e-05, + "loss": 0.7415, + "step": 534 + }, + { + "epoch": 0.26449141020887407, + "grad_norm": 0.16384390371791702, + "learning_rate": 1.97996005477134e-05, + "loss": 0.75, + "step": 535 + }, + { + "epoch": 0.2649857866765542, + "grad_norm": 0.17187365406756672, + "learning_rate": 1.9798824164332635e-05, + "loss": 0.7189, + "step": 536 + }, + { + "epoch": 0.26548016314423434, + "grad_norm": 0.16498072651875792, + "learning_rate": 1.9798046295214918e-05, + "loss": 0.7447, + "step": 537 + }, + { + "epoch": 0.2659745396119145, + "grad_norm": 0.16412569354984857, + "learning_rate": 1.979726694047819e-05, + "loss": 0.7695, + "step": 538 + }, + { + "epoch": 0.2664689160795946, + "grad_norm": 0.17871035954191036, + "learning_rate": 1.979648610024062e-05, + "loss": 0.7472, + "step": 539 + }, + { + "epoch": 0.26696329254727474, + "grad_norm": 0.1683838581099352, + "learning_rate": 1.9795703774620608e-05, + "loss": 0.7597, + "step": 540 + }, + { + "epoch": 0.2674576690149549, + "grad_norm": 0.1766936148241555, + "learning_rate": 1.979491996373676e-05, + "loss": 0.7663, + "step": 541 + }, + { + "epoch": 0.267952045482635, + "grad_norm": 0.15736954855136628, + "learning_rate": 1.9794134667707938e-05, + "loss": 0.7602, + "step": 542 + }, + { + "epoch": 0.26844642195031515, + "grad_norm": 0.18405941428617587, + "learning_rate": 1.97933478866532e-05, + "loss": 0.73, + "step": 543 + }, + { + "epoch": 0.2689407984179953, + "grad_norm": 0.1549222910734999, + "learning_rate": 1.979255962069184e-05, + "loss": 0.7387, + "step": 544 + }, + { + "epoch": 0.2694351748856754, + "grad_norm": 0.17400174809214786, + "learning_rate": 1.9791769869943384e-05, + "loss": 0.7532, + "step": 545 + }, + { + "epoch": 0.26992955135335556, + "grad_norm": 0.14950577892093572, + "learning_rate": 1.9790978634527577e-05, + "loss": 0.7606, + "step": 546 + }, + { + "epoch": 0.2704239278210357, + "grad_norm": 0.1659443527457246, + "learning_rate": 1.9790185914564385e-05, + "loss": 0.7247, + "step": 547 + }, + { + "epoch": 0.2709183042887159, + "grad_norm": 0.15950227789114113, + "learning_rate": 1.9789391710174005e-05, + "loss": 0.7199, + "step": 548 + }, + { + "epoch": 0.271412680756396, + "grad_norm": 0.1720804566233888, + "learning_rate": 1.978859602147686e-05, + "loss": 0.7369, + "step": 549 + }, + { + "epoch": 0.27190705722407615, + "grad_norm": 0.15599811033676275, + "learning_rate": 1.978779884859359e-05, + "loss": 0.739, + "step": 550 + }, + { + "epoch": 0.2724014336917563, + "grad_norm": 0.25263744456705745, + "learning_rate": 1.9787000191645072e-05, + "loss": 0.757, + "step": 551 + }, + { + "epoch": 0.2728958101594364, + "grad_norm": 0.16683971352097118, + "learning_rate": 1.97862000507524e-05, + "loss": 0.7569, + "step": 552 + }, + { + "epoch": 0.27339018662711656, + "grad_norm": 0.17332556404148955, + "learning_rate": 1.9785398426036888e-05, + "loss": 0.7562, + "step": 553 + }, + { + "epoch": 0.2738845630947967, + "grad_norm": 0.18478665297830618, + "learning_rate": 1.9784595317620093e-05, + "loss": 0.7558, + "step": 554 + }, + { + "epoch": 0.27437893956247683, + "grad_norm": 0.15740892913637705, + "learning_rate": 1.9783790725623776e-05, + "loss": 0.7502, + "step": 555 + }, + { + "epoch": 0.27487331603015697, + "grad_norm": 0.17435450183863413, + "learning_rate": 1.9782984650169934e-05, + "loss": 0.7675, + "step": 556 + }, + { + "epoch": 0.2753676924978371, + "grad_norm": 0.1545725526713993, + "learning_rate": 1.978217709138079e-05, + "loss": 0.7777, + "step": 557 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.16507736486218708, + "learning_rate": 1.9781368049378788e-05, + "loss": 0.7433, + "step": 558 + }, + { + "epoch": 0.2763564454331974, + "grad_norm": 0.18227409190185526, + "learning_rate": 1.9780557524286602e-05, + "loss": 0.7428, + "step": 559 + }, + { + "epoch": 0.2768508219008775, + "grad_norm": 0.16866122357752245, + "learning_rate": 1.977974551622712e-05, + "loss": 0.7167, + "step": 560 + }, + { + "epoch": 0.27734519836855764, + "grad_norm": 1.066621083038637, + "learning_rate": 1.977893202532347e-05, + "loss": 0.7716, + "step": 561 + }, + { + "epoch": 0.2778395748362378, + "grad_norm": 0.16588323089034596, + "learning_rate": 1.9778117051698987e-05, + "loss": 0.7401, + "step": 562 + }, + { + "epoch": 0.2783339513039179, + "grad_norm": 0.15613401219300313, + "learning_rate": 1.9777300595477248e-05, + "loss": 0.7362, + "step": 563 + }, + { + "epoch": 0.27882832777159805, + "grad_norm": 0.17153213212596644, + "learning_rate": 1.9776482656782043e-05, + "loss": 0.7083, + "step": 564 + }, + { + "epoch": 0.2793227042392782, + "grad_norm": 0.17789566859174488, + "learning_rate": 1.9775663235737397e-05, + "loss": 0.7584, + "step": 565 + }, + { + "epoch": 0.2798170807069584, + "grad_norm": 0.17762883549229516, + "learning_rate": 1.977484233246755e-05, + "loss": 0.7219, + "step": 566 + }, + { + "epoch": 0.2803114571746385, + "grad_norm": 1.3400549223563012, + "learning_rate": 1.977401994709697e-05, + "loss": 0.8473, + "step": 567 + }, + { + "epoch": 0.28080583364231865, + "grad_norm": 0.1694375880859545, + "learning_rate": 1.977319607975035e-05, + "loss": 0.8019, + "step": 568 + }, + { + "epoch": 0.2813002101099988, + "grad_norm": 0.17225531463804059, + "learning_rate": 1.977237073055261e-05, + "loss": 0.7215, + "step": 569 + }, + { + "epoch": 0.2817945865776789, + "grad_norm": 0.18775203154480133, + "learning_rate": 1.9771543899628892e-05, + "loss": 0.7731, + "step": 570 + }, + { + "epoch": 0.28228896304535905, + "grad_norm": 0.18350178756279056, + "learning_rate": 1.9770715587104565e-05, + "loss": 0.7233, + "step": 571 + }, + { + "epoch": 0.2827833395130392, + "grad_norm": 0.16613176946531813, + "learning_rate": 1.9769885793105217e-05, + "loss": 0.7467, + "step": 572 + }, + { + "epoch": 0.2832777159807193, + "grad_norm": 0.1874936146494366, + "learning_rate": 1.9769054517756666e-05, + "loss": 0.7339, + "step": 573 + }, + { + "epoch": 0.28377209244839946, + "grad_norm": 0.1642588806469533, + "learning_rate": 1.9768221761184958e-05, + "loss": 0.7312, + "step": 574 + }, + { + "epoch": 0.2842664689160796, + "grad_norm": 0.19507609409066026, + "learning_rate": 1.9767387523516354e-05, + "loss": 0.7592, + "step": 575 + }, + { + "epoch": 0.28476084538375973, + "grad_norm": 0.15740184295510637, + "learning_rate": 1.9766551804877348e-05, + "loss": 0.7599, + "step": 576 + }, + { + "epoch": 0.28525522185143987, + "grad_norm": 0.17553875378739422, + "learning_rate": 1.9765714605394652e-05, + "loss": 0.7249, + "step": 577 + }, + { + "epoch": 0.28574959831912, + "grad_norm": 0.16524095677346673, + "learning_rate": 1.9764875925195202e-05, + "loss": 0.7293, + "step": 578 + }, + { + "epoch": 0.28624397478680014, + "grad_norm": 0.16905190750884155, + "learning_rate": 1.9764035764406172e-05, + "loss": 0.7355, + "step": 579 + }, + { + "epoch": 0.2867383512544803, + "grad_norm": 0.1709237771216906, + "learning_rate": 1.9763194123154946e-05, + "loss": 0.7362, + "step": 580 + }, + { + "epoch": 0.2872327277221604, + "grad_norm": 0.15308862581429816, + "learning_rate": 1.976235100156913e-05, + "loss": 0.746, + "step": 581 + }, + { + "epoch": 0.28772710418984054, + "grad_norm": 0.16681220304522656, + "learning_rate": 1.9761506399776573e-05, + "loss": 0.7697, + "step": 582 + }, + { + "epoch": 0.2882214806575207, + "grad_norm": 0.14688253155593256, + "learning_rate": 1.976066031790533e-05, + "loss": 0.7705, + "step": 583 + }, + { + "epoch": 0.2887158571252008, + "grad_norm": 0.18621683139149672, + "learning_rate": 1.975981275608369e-05, + "loss": 0.7808, + "step": 584 + }, + { + "epoch": 0.289210233592881, + "grad_norm": 0.16434933675949648, + "learning_rate": 1.975896371444016e-05, + "loss": 0.7472, + "step": 585 + }, + { + "epoch": 0.28970461006056114, + "grad_norm": 0.16942762555441307, + "learning_rate": 1.9758113193103473e-05, + "loss": 0.7295, + "step": 586 + }, + { + "epoch": 0.2901989865282413, + "grad_norm": 0.1660034812310077, + "learning_rate": 1.97572611922026e-05, + "loss": 0.724, + "step": 587 + }, + { + "epoch": 0.2906933629959214, + "grad_norm": 0.15401599218011122, + "learning_rate": 1.9756407711866715e-05, + "loss": 0.7628, + "step": 588 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 0.157708954365431, + "learning_rate": 1.975555275222523e-05, + "loss": 0.7186, + "step": 589 + }, + { + "epoch": 0.2916821159312817, + "grad_norm": 0.7283793757898056, + "learning_rate": 1.9754696313407776e-05, + "loss": 0.7559, + "step": 590 + }, + { + "epoch": 0.2921764923989618, + "grad_norm": 0.1637298740672838, + "learning_rate": 1.9753838395544208e-05, + "loss": 0.7043, + "step": 591 + }, + { + "epoch": 0.29267086886664195, + "grad_norm": 0.15070805660302067, + "learning_rate": 1.975297899876461e-05, + "loss": 0.7036, + "step": 592 + }, + { + "epoch": 0.2931652453343221, + "grad_norm": 0.15982799278379398, + "learning_rate": 1.9752118123199285e-05, + "loss": 0.7846, + "step": 593 + }, + { + "epoch": 0.2936596218020022, + "grad_norm": 0.1639626154839856, + "learning_rate": 1.9751255768978765e-05, + "loss": 0.7421, + "step": 594 + }, + { + "epoch": 0.29415399826968236, + "grad_norm": 0.14993968990162176, + "learning_rate": 1.9750391936233802e-05, + "loss": 0.7536, + "step": 595 + }, + { + "epoch": 0.2946483747373625, + "grad_norm": 0.14997849689295945, + "learning_rate": 1.9749526625095376e-05, + "loss": 0.7224, + "step": 596 + }, + { + "epoch": 0.29514275120504263, + "grad_norm": 0.16362084248156644, + "learning_rate": 1.9748659835694687e-05, + "loss": 0.7222, + "step": 597 + }, + { + "epoch": 0.29563712767272277, + "grad_norm": 0.17594140603071814, + "learning_rate": 1.9747791568163158e-05, + "loss": 0.7697, + "step": 598 + }, + { + "epoch": 0.2961315041404029, + "grad_norm": 0.15620755071410708, + "learning_rate": 1.9746921822632442e-05, + "loss": 0.81, + "step": 599 + }, + { + "epoch": 0.29662588060808304, + "grad_norm": 0.15852276613899116, + "learning_rate": 1.9746050599234414e-05, + "loss": 0.782, + "step": 600 + }, + { + "epoch": 0.29712025707576317, + "grad_norm": 0.1546520584300148, + "learning_rate": 1.9745177898101173e-05, + "loss": 0.7695, + "step": 601 + }, + { + "epoch": 0.2976146335434433, + "grad_norm": 0.1683355382515037, + "learning_rate": 1.974430371936504e-05, + "loss": 0.7425, + "step": 602 + }, + { + "epoch": 0.29810901001112344, + "grad_norm": 0.15457722754822528, + "learning_rate": 1.974342806315856e-05, + "loss": 0.7494, + "step": 603 + }, + { + "epoch": 0.29860338647880363, + "grad_norm": 0.15927897561467785, + "learning_rate": 1.9742550929614505e-05, + "loss": 0.7455, + "step": 604 + }, + { + "epoch": 0.29909776294648377, + "grad_norm": 0.1510776621113619, + "learning_rate": 1.974167231886587e-05, + "loss": 0.6821, + "step": 605 + }, + { + "epoch": 0.2995921394141639, + "grad_norm": 0.149045816698374, + "learning_rate": 1.9740792231045872e-05, + "loss": 0.7546, + "step": 606 + }, + { + "epoch": 0.30008651588184404, + "grad_norm": 0.15286002303115548, + "learning_rate": 1.973991066628796e-05, + "loss": 0.739, + "step": 607 + }, + { + "epoch": 0.3005808923495242, + "grad_norm": 0.15389185949021744, + "learning_rate": 1.9739027624725788e-05, + "loss": 0.7473, + "step": 608 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 0.15834532629372675, + "learning_rate": 1.973814310649326e-05, + "loss": 0.749, + "step": 609 + }, + { + "epoch": 0.30156964528488445, + "grad_norm": 0.1478553564674062, + "learning_rate": 1.9737257111724476e-05, + "loss": 0.7195, + "step": 610 + }, + { + "epoch": 0.3020640217525646, + "grad_norm": 0.1747717887518076, + "learning_rate": 1.9736369640553787e-05, + "loss": 0.7642, + "step": 611 + }, + { + "epoch": 0.3025583982202447, + "grad_norm": 0.1506717642358965, + "learning_rate": 1.973548069311575e-05, + "loss": 0.7441, + "step": 612 + }, + { + "epoch": 0.30305277468792485, + "grad_norm": 0.15631998311813342, + "learning_rate": 1.9734590269545147e-05, + "loss": 0.7702, + "step": 613 + }, + { + "epoch": 0.303547151155605, + "grad_norm": 0.1490228493576656, + "learning_rate": 1.9733698369976993e-05, + "loss": 0.7594, + "step": 614 + }, + { + "epoch": 0.3040415276232851, + "grad_norm": 0.15412781580358761, + "learning_rate": 1.973280499454652e-05, + "loss": 0.7238, + "step": 615 + }, + { + "epoch": 0.30453590409096526, + "grad_norm": 0.15910961978292312, + "learning_rate": 1.973191014338918e-05, + "loss": 0.7643, + "step": 616 + }, + { + "epoch": 0.3050302805586454, + "grad_norm": 0.17523401461755808, + "learning_rate": 1.973101381664066e-05, + "loss": 0.7125, + "step": 617 + }, + { + "epoch": 0.30552465702632553, + "grad_norm": 0.17107042624959548, + "learning_rate": 1.9730116014436867e-05, + "loss": 0.7846, + "step": 618 + }, + { + "epoch": 0.30601903349400567, + "grad_norm": 0.16518968466100517, + "learning_rate": 1.9729216736913922e-05, + "loss": 0.7284, + "step": 619 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 0.4492438247573917, + "learning_rate": 1.972831598420818e-05, + "loss": 0.7739, + "step": 620 + }, + { + "epoch": 0.30700778642936594, + "grad_norm": 0.18248680745661533, + "learning_rate": 1.972741375645622e-05, + "loss": 0.7904, + "step": 621 + }, + { + "epoch": 0.3075021628970461, + "grad_norm": 0.16331187653074533, + "learning_rate": 1.9726510053794834e-05, + "loss": 0.7341, + "step": 622 + }, + { + "epoch": 0.30799653936472626, + "grad_norm": 0.1626783657392858, + "learning_rate": 1.972560487636105e-05, + "loss": 0.7534, + "step": 623 + }, + { + "epoch": 0.3084909158324064, + "grad_norm": 0.16468009499535172, + "learning_rate": 1.9724698224292118e-05, + "loss": 0.7504, + "step": 624 + }, + { + "epoch": 0.30898529230008653, + "grad_norm": 0.15492300861547156, + "learning_rate": 1.9723790097725503e-05, + "loss": 0.768, + "step": 625 + }, + { + "epoch": 0.30947966876776667, + "grad_norm": 0.1533626180287336, + "learning_rate": 1.97228804967989e-05, + "loss": 0.713, + "step": 626 + }, + { + "epoch": 0.3099740452354468, + "grad_norm": 0.16363387709367413, + "learning_rate": 1.9721969421650223e-05, + "loss": 0.7417, + "step": 627 + }, + { + "epoch": 0.31046842170312694, + "grad_norm": 0.16103791431915704, + "learning_rate": 1.972105687241762e-05, + "loss": 0.7408, + "step": 628 + }, + { + "epoch": 0.3109627981708071, + "grad_norm": 0.1574622913945536, + "learning_rate": 1.972014284923945e-05, + "loss": 0.7512, + "step": 629 + }, + { + "epoch": 0.3114571746384872, + "grad_norm": 0.1644994612992582, + "learning_rate": 1.9719227352254307e-05, + "loss": 0.7445, + "step": 630 + }, + { + "epoch": 0.31195155110616735, + "grad_norm": 0.1839811997191771, + "learning_rate": 1.9718310381600992e-05, + "loss": 0.806, + "step": 631 + }, + { + "epoch": 0.3124459275738475, + "grad_norm": 0.16394486226518437, + "learning_rate": 1.971739193741855e-05, + "loss": 0.7832, + "step": 632 + }, + { + "epoch": 0.3129403040415276, + "grad_norm": 0.15635764748033137, + "learning_rate": 1.9716472019846233e-05, + "loss": 0.7318, + "step": 633 + }, + { + "epoch": 0.31343468050920775, + "grad_norm": 0.14892978308158497, + "learning_rate": 1.9715550629023524e-05, + "loss": 0.7505, + "step": 634 + }, + { + "epoch": 0.3139290569768879, + "grad_norm": 0.15896112096853654, + "learning_rate": 1.9714627765090126e-05, + "loss": 0.7395, + "step": 635 + }, + { + "epoch": 0.314423433444568, + "grad_norm": 0.15435283628875057, + "learning_rate": 1.9713703428185972e-05, + "loss": 0.707, + "step": 636 + }, + { + "epoch": 0.31491780991224816, + "grad_norm": 0.15166085558375503, + "learning_rate": 1.9712777618451212e-05, + "loss": 0.7259, + "step": 637 + }, + { + "epoch": 0.3154121863799283, + "grad_norm": 0.14510380217138294, + "learning_rate": 1.971185033602622e-05, + "loss": 0.7916, + "step": 638 + }, + { + "epoch": 0.31590656284760843, + "grad_norm": 0.1528959732060769, + "learning_rate": 1.9710921581051593e-05, + "loss": 0.7343, + "step": 639 + }, + { + "epoch": 0.31640093931528857, + "grad_norm": 0.14354139587638673, + "learning_rate": 1.9709991353668156e-05, + "loss": 0.7517, + "step": 640 + }, + { + "epoch": 0.31689531578296876, + "grad_norm": 0.15917772243884631, + "learning_rate": 1.9709059654016953e-05, + "loss": 0.7389, + "step": 641 + }, + { + "epoch": 0.3173896922506489, + "grad_norm": 0.15559759097840012, + "learning_rate": 1.9708126482239248e-05, + "loss": 0.7071, + "step": 642 + }, + { + "epoch": 0.317884068718329, + "grad_norm": 0.15640027979603233, + "learning_rate": 1.9707191838476538e-05, + "loss": 0.7518, + "step": 643 + }, + { + "epoch": 0.31837844518600916, + "grad_norm": 0.15151956752699214, + "learning_rate": 1.9706255722870536e-05, + "loss": 0.695, + "step": 644 + }, + { + "epoch": 0.3188728216536893, + "grad_norm": 0.16191269460529367, + "learning_rate": 1.9705318135563173e-05, + "loss": 0.7011, + "step": 645 + }, + { + "epoch": 0.31936719812136943, + "grad_norm": 0.1506777718632025, + "learning_rate": 1.9704379076696617e-05, + "loss": 0.7119, + "step": 646 + }, + { + "epoch": 0.31986157458904957, + "grad_norm": 0.16027075405994823, + "learning_rate": 1.9703438546413252e-05, + "loss": 0.7063, + "step": 647 + }, + { + "epoch": 0.3203559510567297, + "grad_norm": 0.15899503981928637, + "learning_rate": 1.970249654485568e-05, + "loss": 0.7473, + "step": 648 + }, + { + "epoch": 0.32085032752440984, + "grad_norm": 0.15375916051923522, + "learning_rate": 1.9701553072166735e-05, + "loss": 0.746, + "step": 649 + }, + { + "epoch": 0.32134470399209, + "grad_norm": 0.15220360962951862, + "learning_rate": 1.970060812848947e-05, + "loss": 0.7462, + "step": 650 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 0.1720917558743003, + "learning_rate": 1.9699661713967158e-05, + "loss": 0.7499, + "step": 651 + }, + { + "epoch": 0.32233345692745025, + "grad_norm": 0.15097347886342913, + "learning_rate": 1.9698713828743304e-05, + "loss": 0.6938, + "step": 652 + }, + { + "epoch": 0.3228278333951304, + "grad_norm": 0.1607916959238962, + "learning_rate": 1.9697764472961623e-05, + "loss": 0.7142, + "step": 653 + }, + { + "epoch": 0.3233222098628105, + "grad_norm": 0.1566952290833476, + "learning_rate": 1.9696813646766064e-05, + "loss": 0.7091, + "step": 654 + }, + { + "epoch": 0.32381658633049065, + "grad_norm": 0.1604614213514875, + "learning_rate": 1.9695861350300798e-05, + "loss": 0.7588, + "step": 655 + }, + { + "epoch": 0.3243109627981708, + "grad_norm": 2.401917900694655, + "learning_rate": 1.9694907583710207e-05, + "loss": 0.7851, + "step": 656 + }, + { + "epoch": 0.3248053392658509, + "grad_norm": 0.17353896714506442, + "learning_rate": 1.9693952347138917e-05, + "loss": 0.7176, + "step": 657 + }, + { + "epoch": 0.32529971573353106, + "grad_norm": 0.15376697732907532, + "learning_rate": 1.9692995640731753e-05, + "loss": 0.7325, + "step": 658 + }, + { + "epoch": 0.32579409220121125, + "grad_norm": 0.15623597765330027, + "learning_rate": 1.9692037464633782e-05, + "loss": 0.7226, + "step": 659 + }, + { + "epoch": 0.3262884686688914, + "grad_norm": 0.1622513740428389, + "learning_rate": 1.9691077818990284e-05, + "loss": 0.7596, + "step": 660 + }, + { + "epoch": 0.3267828451365715, + "grad_norm": 0.15412654433391118, + "learning_rate": 1.9690116703946765e-05, + "loss": 0.7463, + "step": 661 + }, + { + "epoch": 0.32727722160425166, + "grad_norm": 0.16895486839839263, + "learning_rate": 1.9689154119648952e-05, + "loss": 0.7544, + "step": 662 + }, + { + "epoch": 0.3277715980719318, + "grad_norm": 0.1518943393376738, + "learning_rate": 1.96881900662428e-05, + "loss": 0.7373, + "step": 663 + }, + { + "epoch": 0.3282659745396119, + "grad_norm": 0.1532803628052374, + "learning_rate": 1.9687224543874474e-05, + "loss": 0.7599, + "step": 664 + }, + { + "epoch": 0.32876035100729206, + "grad_norm": 0.15159062972186793, + "learning_rate": 1.968625755269038e-05, + "loss": 0.6977, + "step": 665 + }, + { + "epoch": 0.3292547274749722, + "grad_norm": 0.15663745233327095, + "learning_rate": 1.9685289092837135e-05, + "loss": 0.7141, + "step": 666 + }, + { + "epoch": 0.32974910394265233, + "grad_norm": 1.1988586032032988, + "learning_rate": 1.9684319164461573e-05, + "loss": 0.7867, + "step": 667 + }, + { + "epoch": 0.33024348041033247, + "grad_norm": 0.1624344075898955, + "learning_rate": 1.9683347767710765e-05, + "loss": 0.7666, + "step": 668 + }, + { + "epoch": 0.3307378568780126, + "grad_norm": 0.16498972015420657, + "learning_rate": 1.9682374902732003e-05, + "loss": 0.6963, + "step": 669 + }, + { + "epoch": 0.33123223334569274, + "grad_norm": 0.16935743363864528, + "learning_rate": 1.9681400569672786e-05, + "loss": 0.7318, + "step": 670 + }, + { + "epoch": 0.3317266098133729, + "grad_norm": 0.17409108702508322, + "learning_rate": 1.968042476868085e-05, + "loss": 0.703, + "step": 671 + }, + { + "epoch": 0.332220986281053, + "grad_norm": 0.15473509128837087, + "learning_rate": 1.9679447499904153e-05, + "loss": 0.7377, + "step": 672 + }, + { + "epoch": 0.33271536274873315, + "grad_norm": 0.17171994033868315, + "learning_rate": 1.967846876349087e-05, + "loss": 0.7357, + "step": 673 + }, + { + "epoch": 0.3332097392164133, + "grad_norm": 0.15856973948623623, + "learning_rate": 1.9677488559589403e-05, + "loss": 0.697, + "step": 674 + }, + { + "epoch": 0.3337041156840934, + "grad_norm": 0.1593362923166687, + "learning_rate": 1.967650688834837e-05, + "loss": 0.7467, + "step": 675 + }, + { + "epoch": 0.33419849215177355, + "grad_norm": 0.15894336876771487, + "learning_rate": 1.967552374991662e-05, + "loss": 0.7173, + "step": 676 + }, + { + "epoch": 0.3346928686194537, + "grad_norm": 0.15803795430764828, + "learning_rate": 1.9674539144443217e-05, + "loss": 0.7199, + "step": 677 + }, + { + "epoch": 0.3351872450871339, + "grad_norm": 0.15457477833062816, + "learning_rate": 1.9673553072077454e-05, + "loss": 0.7521, + "step": 678 + }, + { + "epoch": 0.335681621554814, + "grad_norm": 0.17144957093447, + "learning_rate": 1.9672565532968844e-05, + "loss": 0.7398, + "step": 679 + }, + { + "epoch": 0.33617599802249415, + "grad_norm": 0.15812990791462347, + "learning_rate": 1.9671576527267118e-05, + "loss": 0.6894, + "step": 680 + }, + { + "epoch": 0.3366703744901743, + "grad_norm": 0.163384404994177, + "learning_rate": 1.9670586055122234e-05, + "loss": 0.7044, + "step": 681 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 0.1691468801307716, + "learning_rate": 1.9669594116684375e-05, + "loss": 0.7301, + "step": 682 + }, + { + "epoch": 0.33765912742553456, + "grad_norm": 0.15383237131648106, + "learning_rate": 1.966860071210394e-05, + "loss": 0.7185, + "step": 683 + }, + { + "epoch": 0.3381535038932147, + "grad_norm": 0.1661685194329167, + "learning_rate": 1.9667605841531548e-05, + "loss": 0.7397, + "step": 684 + }, + { + "epoch": 0.3386478803608948, + "grad_norm": 0.14392375982672104, + "learning_rate": 1.9666609505118053e-05, + "loss": 0.7173, + "step": 685 + }, + { + "epoch": 0.33914225682857496, + "grad_norm": 0.15020903303375513, + "learning_rate": 1.966561170301452e-05, + "loss": 0.74, + "step": 686 + }, + { + "epoch": 0.3396366332962551, + "grad_norm": 0.1603126942140956, + "learning_rate": 1.9664612435372242e-05, + "loss": 0.7345, + "step": 687 + }, + { + "epoch": 0.34013100976393523, + "grad_norm": 0.151882421392461, + "learning_rate": 1.9663611702342728e-05, + "loss": 0.7688, + "step": 688 + }, + { + "epoch": 0.34062538623161537, + "grad_norm": 0.15871216894033766, + "learning_rate": 1.9662609504077715e-05, + "loss": 0.7309, + "step": 689 + }, + { + "epoch": 0.3411197626992955, + "grad_norm": 0.1649839683800111, + "learning_rate": 1.9661605840729164e-05, + "loss": 0.7207, + "step": 690 + }, + { + "epoch": 0.34161413916697564, + "grad_norm": 0.15279786673072837, + "learning_rate": 1.9660600712449247e-05, + "loss": 0.7681, + "step": 691 + }, + { + "epoch": 0.3421085156346558, + "grad_norm": 0.170641927968915, + "learning_rate": 1.9659594119390372e-05, + "loss": 0.7553, + "step": 692 + }, + { + "epoch": 0.3426028921023359, + "grad_norm": 0.15818812749540956, + "learning_rate": 1.965858606170516e-05, + "loss": 0.7231, + "step": 693 + }, + { + "epoch": 0.34309726857001605, + "grad_norm": 0.16634074329634344, + "learning_rate": 1.9657576539546456e-05, + "loss": 0.769, + "step": 694 + }, + { + "epoch": 0.3435916450376962, + "grad_norm": 0.1652627157880426, + "learning_rate": 1.9656565553067332e-05, + "loss": 0.7874, + "step": 695 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 0.17445065968377244, + "learning_rate": 1.9655553102421076e-05, + "loss": 0.7166, + "step": 696 + }, + { + "epoch": 0.3445803979730565, + "grad_norm": 0.15874480461464155, + "learning_rate": 1.9654539187761193e-05, + "loss": 0.7012, + "step": 697 + }, + { + "epoch": 0.34507477444073664, + "grad_norm": 0.1624767722197373, + "learning_rate": 1.9653523809241424e-05, + "loss": 0.7457, + "step": 698 + }, + { + "epoch": 0.3455691509084168, + "grad_norm": 0.15666212747429975, + "learning_rate": 1.965250696701572e-05, + "loss": 0.7019, + "step": 699 + }, + { + "epoch": 0.3460635273760969, + "grad_norm": 0.15910720233239792, + "learning_rate": 1.9651488661238273e-05, + "loss": 0.7213, + "step": 700 + }, + { + "epoch": 0.34655790384377705, + "grad_norm": 0.15500061159559253, + "learning_rate": 1.9650468892063462e-05, + "loss": 0.7025, + "step": 701 + }, + { + "epoch": 0.3470522803114572, + "grad_norm": 0.14849988845060627, + "learning_rate": 1.964944765964592e-05, + "loss": 0.6971, + "step": 702 + }, + { + "epoch": 0.3475466567791373, + "grad_norm": 0.1633781401799674, + "learning_rate": 1.9648424964140486e-05, + "loss": 0.7364, + "step": 703 + }, + { + "epoch": 0.34804103324681746, + "grad_norm": 0.14988873881010192, + "learning_rate": 1.9647400805702233e-05, + "loss": 0.6942, + "step": 704 + }, + { + "epoch": 0.3485354097144976, + "grad_norm": 0.16118394803778605, + "learning_rate": 1.964637518448644e-05, + "loss": 0.7034, + "step": 705 + }, + { + "epoch": 0.3490297861821777, + "grad_norm": 0.1522002654979852, + "learning_rate": 1.9645348100648617e-05, + "loss": 0.703, + "step": 706 + }, + { + "epoch": 0.34952416264985786, + "grad_norm": 0.1445038795270341, + "learning_rate": 1.9644319554344496e-05, + "loss": 0.7249, + "step": 707 + }, + { + "epoch": 0.350018539117538, + "grad_norm": 0.16233023750945577, + "learning_rate": 1.9643289545730028e-05, + "loss": 0.7001, + "step": 708 + }, + { + "epoch": 0.35051291558521813, + "grad_norm": 0.1550603264375147, + "learning_rate": 1.9642258074961388e-05, + "loss": 0.7565, + "step": 709 + }, + { + "epoch": 0.35100729205289827, + "grad_norm": 0.14995984059522152, + "learning_rate": 1.9641225142194974e-05, + "loss": 0.7104, + "step": 710 + }, + { + "epoch": 0.3515016685205784, + "grad_norm": 0.14780836668488417, + "learning_rate": 1.96401907475874e-05, + "loss": 0.7145, + "step": 711 + }, + { + "epoch": 0.35199604498825854, + "grad_norm": 0.14706656720863703, + "learning_rate": 1.963915489129551e-05, + "loss": 0.7133, + "step": 712 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 0.14795556168610147, + "learning_rate": 1.9638117573476356e-05, + "loss": 0.7083, + "step": 713 + }, + { + "epoch": 0.3529847979236188, + "grad_norm": 0.15097410155223984, + "learning_rate": 1.963707879428723e-05, + "loss": 0.7282, + "step": 714 + }, + { + "epoch": 0.353479174391299, + "grad_norm": 0.14707549935144434, + "learning_rate": 1.963603855388563e-05, + "loss": 0.7082, + "step": 715 + }, + { + "epoch": 0.35397355085897914, + "grad_norm": 0.1458358397255251, + "learning_rate": 1.963499685242928e-05, + "loss": 0.7515, + "step": 716 + }, + { + "epoch": 0.35446792732665927, + "grad_norm": 0.22532404283419288, + "learning_rate": 1.963395369007613e-05, + "loss": 0.7579, + "step": 717 + }, + { + "epoch": 0.3549623037943394, + "grad_norm": 0.14835500049054198, + "learning_rate": 1.963290906698435e-05, + "loss": 0.6939, + "step": 718 + }, + { + "epoch": 0.35545668026201954, + "grad_norm": 0.1520227932538647, + "learning_rate": 1.9631862983312326e-05, + "loss": 0.6936, + "step": 719 + }, + { + "epoch": 0.3559510567296997, + "grad_norm": 0.1530671067258804, + "learning_rate": 1.963081543921867e-05, + "loss": 0.772, + "step": 720 + }, + { + "epoch": 0.3564454331973798, + "grad_norm": 0.16317121802603515, + "learning_rate": 1.9629766434862216e-05, + "loss": 0.6935, + "step": 721 + }, + { + "epoch": 0.35693980966505995, + "grad_norm": 0.15164081436035556, + "learning_rate": 1.962871597040202e-05, + "loss": 0.732, + "step": 722 + }, + { + "epoch": 0.3574341861327401, + "grad_norm": 0.15952606841569253, + "learning_rate": 1.962766404599736e-05, + "loss": 0.7008, + "step": 723 + }, + { + "epoch": 0.3579285626004202, + "grad_norm": 0.1654587309915103, + "learning_rate": 1.9626610661807723e-05, + "loss": 0.7578, + "step": 724 + }, + { + "epoch": 0.35842293906810035, + "grad_norm": 0.1647526195582362, + "learning_rate": 1.9625555817992837e-05, + "loss": 0.7802, + "step": 725 + }, + { + "epoch": 0.3589173155357805, + "grad_norm": 0.16996360539369224, + "learning_rate": 1.9624499514712637e-05, + "loss": 0.7642, + "step": 726 + }, + { + "epoch": 0.3594116920034606, + "grad_norm": 0.1470317271617239, + "learning_rate": 1.9623441752127284e-05, + "loss": 0.7046, + "step": 727 + }, + { + "epoch": 0.35990606847114076, + "grad_norm": 0.15403460980316377, + "learning_rate": 1.962238253039716e-05, + "loss": 0.7496, + "step": 728 + }, + { + "epoch": 0.3604004449388209, + "grad_norm": 0.14370392028354131, + "learning_rate": 1.962132184968287e-05, + "loss": 0.7162, + "step": 729 + }, + { + "epoch": 0.36089482140650103, + "grad_norm": 0.16056267210303105, + "learning_rate": 1.962025971014524e-05, + "loss": 0.7477, + "step": 730 + }, + { + "epoch": 0.36138919787418117, + "grad_norm": 0.14765457531728426, + "learning_rate": 1.961919611194531e-05, + "loss": 0.702, + "step": 731 + }, + { + "epoch": 0.3618835743418613, + "grad_norm": 0.16092649965259662, + "learning_rate": 1.9618131055244355e-05, + "loss": 0.7402, + "step": 732 + }, + { + "epoch": 0.36237795080954144, + "grad_norm": 0.21230672004834142, + "learning_rate": 1.9617064540203858e-05, + "loss": 0.7148, + "step": 733 + }, + { + "epoch": 0.36287232727722163, + "grad_norm": 0.16038358859524185, + "learning_rate": 1.961599656698553e-05, + "loss": 0.7065, + "step": 734 + }, + { + "epoch": 0.36336670374490176, + "grad_norm": 0.14512706415537074, + "learning_rate": 1.9614927135751302e-05, + "loss": 0.7189, + "step": 735 + }, + { + "epoch": 0.3638610802125819, + "grad_norm": 0.14976646958003353, + "learning_rate": 1.9613856246663324e-05, + "loss": 0.722, + "step": 736 + }, + { + "epoch": 0.36435545668026204, + "grad_norm": 0.16337987590538378, + "learning_rate": 1.9612783899883964e-05, + "loss": 0.7558, + "step": 737 + }, + { + "epoch": 0.36484983314794217, + "grad_norm": 0.1417669888792424, + "learning_rate": 1.9611710095575828e-05, + "loss": 0.7276, + "step": 738 + }, + { + "epoch": 0.3653442096156223, + "grad_norm": 0.1656416746550021, + "learning_rate": 1.961063483390172e-05, + "loss": 0.7408, + "step": 739 + }, + { + "epoch": 0.36583858608330244, + "grad_norm": 0.5479250955085542, + "learning_rate": 1.9609558115024673e-05, + "loss": 0.7146, + "step": 740 + }, + { + "epoch": 0.3663329625509826, + "grad_norm": 0.15736211205358464, + "learning_rate": 1.9608479939107952e-05, + "loss": 0.7215, + "step": 741 + }, + { + "epoch": 0.3668273390186627, + "grad_norm": 0.14948539145940878, + "learning_rate": 1.9607400306315033e-05, + "loss": 0.695, + "step": 742 + }, + { + "epoch": 0.36732171548634285, + "grad_norm": 0.17139043388872705, + "learning_rate": 1.9606319216809614e-05, + "loss": 0.7653, + "step": 743 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 0.15424665708818905, + "learning_rate": 1.9605236670755608e-05, + "loss": 0.7389, + "step": 744 + }, + { + "epoch": 0.3683104684217031, + "grad_norm": 0.1731148219396046, + "learning_rate": 1.9604152668317164e-05, + "loss": 0.7536, + "step": 745 + }, + { + "epoch": 0.36880484488938325, + "grad_norm": 0.1584813793667149, + "learning_rate": 1.9603067209658634e-05, + "loss": 0.7588, + "step": 746 + }, + { + "epoch": 0.3692992213570634, + "grad_norm": 0.16247425065329726, + "learning_rate": 1.9601980294944602e-05, + "loss": 0.7159, + "step": 747 + }, + { + "epoch": 0.3697935978247435, + "grad_norm": 0.1531127017707098, + "learning_rate": 1.9600891924339875e-05, + "loss": 0.7165, + "step": 748 + }, + { + "epoch": 0.37028797429242366, + "grad_norm": 0.14994241690728313, + "learning_rate": 1.9599802098009475e-05, + "loss": 0.7223, + "step": 749 + }, + { + "epoch": 0.3707823507601038, + "grad_norm": 0.16867147154799964, + "learning_rate": 1.9598710816118643e-05, + "loss": 0.7269, + "step": 750 + }, + { + "epoch": 0.37127672722778393, + "grad_norm": 0.17116652362612056, + "learning_rate": 1.9597618078832844e-05, + "loss": 0.7097, + "step": 751 + }, + { + "epoch": 0.3717711036954641, + "grad_norm": 0.16369825082235204, + "learning_rate": 1.9596523886317764e-05, + "loss": 0.7435, + "step": 752 + }, + { + "epoch": 0.37226548016314426, + "grad_norm": 0.15884423441057247, + "learning_rate": 1.9595428238739308e-05, + "loss": 0.7154, + "step": 753 + }, + { + "epoch": 0.3727598566308244, + "grad_norm": 0.16881272637938965, + "learning_rate": 1.95943311362636e-05, + "loss": 0.7016, + "step": 754 + }, + { + "epoch": 0.37325423309850453, + "grad_norm": 0.1510374100554291, + "learning_rate": 1.9593232579056996e-05, + "loss": 0.7445, + "step": 755 + }, + { + "epoch": 0.37374860956618466, + "grad_norm": 0.15116557821002802, + "learning_rate": 1.959213256728606e-05, + "loss": 0.6977, + "step": 756 + }, + { + "epoch": 0.3742429860338648, + "grad_norm": 0.15117913004985037, + "learning_rate": 1.959103110111757e-05, + "loss": 0.7653, + "step": 757 + }, + { + "epoch": 0.37473736250154493, + "grad_norm": 0.18278512287900522, + "learning_rate": 1.958992818071855e-05, + "loss": 0.7683, + "step": 758 + }, + { + "epoch": 0.37523173896922507, + "grad_norm": 0.160178322433072, + "learning_rate": 1.9588823806256213e-05, + "loss": 0.7088, + "step": 759 + }, + { + "epoch": 0.3757261154369052, + "grad_norm": 0.14950110738290523, + "learning_rate": 1.9587717977898025e-05, + "loss": 0.689, + "step": 760 + }, + { + "epoch": 0.37622049190458534, + "grad_norm": 0.1574352344503863, + "learning_rate": 1.9586610695811647e-05, + "loss": 0.7227, + "step": 761 + }, + { + "epoch": 0.3767148683722655, + "grad_norm": 0.16229363070626576, + "learning_rate": 1.9585501960164972e-05, + "loss": 0.7232, + "step": 762 + }, + { + "epoch": 0.3772092448399456, + "grad_norm": 0.16273801626073361, + "learning_rate": 1.958439177112611e-05, + "loss": 0.7253, + "step": 763 + }, + { + "epoch": 0.37770362130762575, + "grad_norm": 0.15016533264806245, + "learning_rate": 1.9583280128863393e-05, + "loss": 0.6757, + "step": 764 + }, + { + "epoch": 0.3781979977753059, + "grad_norm": 0.16312890367593802, + "learning_rate": 1.958216703354537e-05, + "loss": 0.718, + "step": 765 + }, + { + "epoch": 0.378692374242986, + "grad_norm": 0.17312968302379572, + "learning_rate": 1.9581052485340815e-05, + "loss": 0.7309, + "step": 766 + }, + { + "epoch": 0.37918675071066615, + "grad_norm": 0.14547402999260856, + "learning_rate": 1.9579936484418726e-05, + "loss": 0.7198, + "step": 767 + }, + { + "epoch": 0.3796811271783463, + "grad_norm": 0.15650396173238312, + "learning_rate": 1.9578819030948302e-05, + "loss": 0.703, + "step": 768 + }, + { + "epoch": 0.3801755036460264, + "grad_norm": 0.1584055087491127, + "learning_rate": 1.9577700125098988e-05, + "loss": 0.7762, + "step": 769 + }, + { + "epoch": 0.38066988011370656, + "grad_norm": 0.1454248070653287, + "learning_rate": 1.9576579767040434e-05, + "loss": 0.7128, + "step": 770 + }, + { + "epoch": 0.38116425658138675, + "grad_norm": 0.15802079747628375, + "learning_rate": 1.9575457956942508e-05, + "loss": 0.6775, + "step": 771 + }, + { + "epoch": 0.3816586330490669, + "grad_norm": 0.1427578625071399, + "learning_rate": 1.957433469497531e-05, + "loss": 0.7116, + "step": 772 + }, + { + "epoch": 0.382153009516747, + "grad_norm": 0.16068035777162093, + "learning_rate": 1.9573209981309152e-05, + "loss": 0.6872, + "step": 773 + }, + { + "epoch": 0.38264738598442716, + "grad_norm": 0.1591770508810748, + "learning_rate": 1.9572083816114563e-05, + "loss": 0.7406, + "step": 774 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 0.17716739962348127, + "learning_rate": 1.95709561995623e-05, + "loss": 0.717, + "step": 775 + }, + { + "epoch": 0.38363613891978743, + "grad_norm": 0.1603380134133733, + "learning_rate": 1.956982713182334e-05, + "loss": 0.7483, + "step": 776 + }, + { + "epoch": 0.38413051538746756, + "grad_norm": 0.1493192721483825, + "learning_rate": 1.9568696613068868e-05, + "loss": 0.7358, + "step": 777 + }, + { + "epoch": 0.3846248918551477, + "grad_norm": 0.1506271923855618, + "learning_rate": 1.9567564643470307e-05, + "loss": 0.7324, + "step": 778 + }, + { + "epoch": 0.38511926832282783, + "grad_norm": 0.1517402725027683, + "learning_rate": 1.9566431223199288e-05, + "loss": 0.7057, + "step": 779 + }, + { + "epoch": 0.38561364479050797, + "grad_norm": 0.1622769461870765, + "learning_rate": 1.9565296352427664e-05, + "loss": 0.7203, + "step": 780 + }, + { + "epoch": 0.3861080212581881, + "grad_norm": 0.1456312451673674, + "learning_rate": 1.9564160031327505e-05, + "loss": 0.7051, + "step": 781 + }, + { + "epoch": 0.38660239772586824, + "grad_norm": 0.1522381328722986, + "learning_rate": 1.9563022260071108e-05, + "loss": 0.7209, + "step": 782 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.16257675001925373, + "learning_rate": 1.956188303883099e-05, + "loss": 0.7106, + "step": 783 + }, + { + "epoch": 0.3875911506612285, + "grad_norm": 0.14284808703137514, + "learning_rate": 1.9560742367779878e-05, + "loss": 0.7091, + "step": 784 + }, + { + "epoch": 0.38808552712890865, + "grad_norm": 0.15313518317187827, + "learning_rate": 1.9559600247090727e-05, + "loss": 0.7035, + "step": 785 + }, + { + "epoch": 0.3885799035965888, + "grad_norm": 0.15629629290616087, + "learning_rate": 1.9558456676936716e-05, + "loss": 0.7214, + "step": 786 + }, + { + "epoch": 0.3890742800642689, + "grad_norm": 0.15813378359664612, + "learning_rate": 1.9557311657491226e-05, + "loss": 0.7222, + "step": 787 + }, + { + "epoch": 0.38956865653194905, + "grad_norm": 0.14917988616324424, + "learning_rate": 1.9556165188927882e-05, + "loss": 0.6889, + "step": 788 + }, + { + "epoch": 0.39006303299962924, + "grad_norm": 0.15082466589540547, + "learning_rate": 1.9555017271420505e-05, + "loss": 0.7352, + "step": 789 + }, + { + "epoch": 0.3905574094673094, + "grad_norm": 0.15934475269582898, + "learning_rate": 1.9553867905143154e-05, + "loss": 0.7244, + "step": 790 + }, + { + "epoch": 0.3910517859349895, + "grad_norm": 0.1647088781486535, + "learning_rate": 1.9552717090270093e-05, + "loss": 0.7229, + "step": 791 + }, + { + "epoch": 0.39154616240266965, + "grad_norm": 0.15837469913535884, + "learning_rate": 1.9551564826975826e-05, + "loss": 0.7197, + "step": 792 + }, + { + "epoch": 0.3920405388703498, + "grad_norm": 0.1529259935287758, + "learning_rate": 1.9550411115435055e-05, + "loss": 0.689, + "step": 793 + }, + { + "epoch": 0.3925349153380299, + "grad_norm": 0.15492199866002648, + "learning_rate": 1.9549255955822708e-05, + "loss": 0.7471, + "step": 794 + }, + { + "epoch": 0.39302929180571006, + "grad_norm": 0.17201513970517682, + "learning_rate": 1.954809934831394e-05, + "loss": 0.7205, + "step": 795 + }, + { + "epoch": 0.3935236682733902, + "grad_norm": 0.15019240253070162, + "learning_rate": 1.954694129308412e-05, + "loss": 0.7037, + "step": 796 + }, + { + "epoch": 0.39401804474107033, + "grad_norm": 0.16699426265868436, + "learning_rate": 1.9545781790308834e-05, + "loss": 0.6831, + "step": 797 + }, + { + "epoch": 0.39451242120875046, + "grad_norm": 0.15559553915264776, + "learning_rate": 1.9544620840163893e-05, + "loss": 0.7225, + "step": 798 + }, + { + "epoch": 0.3950067976764306, + "grad_norm": 0.1544634853409169, + "learning_rate": 1.9543458442825327e-05, + "loss": 0.7503, + "step": 799 + }, + { + "epoch": 0.39550117414411073, + "grad_norm": 0.15686263191173036, + "learning_rate": 1.954229459846938e-05, + "loss": 0.7135, + "step": 800 + }, + { + "epoch": 0.39599555061179087, + "grad_norm": 0.14595769316082483, + "learning_rate": 1.9541129307272516e-05, + "loss": 0.7385, + "step": 801 + }, + { + "epoch": 0.396489927079471, + "grad_norm": 0.15100258718784665, + "learning_rate": 1.953996256941143e-05, + "loss": 0.7549, + "step": 802 + }, + { + "epoch": 0.39698430354715114, + "grad_norm": 0.1438624139807672, + "learning_rate": 1.9538794385063018e-05, + "loss": 0.7588, + "step": 803 + }, + { + "epoch": 0.3974786800148313, + "grad_norm": 0.16635171761733414, + "learning_rate": 1.953762475440441e-05, + "loss": 0.7616, + "step": 804 + }, + { + "epoch": 0.3979730564825114, + "grad_norm": 0.14701032057515884, + "learning_rate": 1.9536453677612947e-05, + "loss": 0.7611, + "step": 805 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 0.1534330004006759, + "learning_rate": 1.9535281154866195e-05, + "loss": 0.7262, + "step": 806 + }, + { + "epoch": 0.3989618094178717, + "grad_norm": 0.16174381355137307, + "learning_rate": 1.9534107186341938e-05, + "loss": 0.7023, + "step": 807 + }, + { + "epoch": 0.3994561858855519, + "grad_norm": 0.1512395600404716, + "learning_rate": 1.9532931772218175e-05, + "loss": 0.7449, + "step": 808 + }, + { + "epoch": 0.399950562353232, + "grad_norm": 0.16261398898779542, + "learning_rate": 1.9531754912673128e-05, + "loss": 0.695, + "step": 809 + }, + { + "epoch": 0.40044493882091214, + "grad_norm": 0.14696052616443023, + "learning_rate": 1.9530576607885233e-05, + "loss": 0.7237, + "step": 810 + }, + { + "epoch": 0.4009393152885923, + "grad_norm": 0.14505231416627223, + "learning_rate": 1.9529396858033153e-05, + "loss": 0.6967, + "step": 811 + }, + { + "epoch": 0.4014336917562724, + "grad_norm": 0.19573561105025336, + "learning_rate": 1.952821566329577e-05, + "loss": 0.735, + "step": 812 + }, + { + "epoch": 0.40192806822395255, + "grad_norm": 0.1490842948168577, + "learning_rate": 1.9527033023852178e-05, + "loss": 0.7109, + "step": 813 + }, + { + "epoch": 0.4024224446916327, + "grad_norm": 0.16127941551945899, + "learning_rate": 1.9525848939881694e-05, + "loss": 0.753, + "step": 814 + }, + { + "epoch": 0.4029168211593128, + "grad_norm": 0.14579057470111845, + "learning_rate": 1.9524663411563848e-05, + "loss": 0.7148, + "step": 815 + }, + { + "epoch": 0.40341119762699296, + "grad_norm": 0.16467656555563442, + "learning_rate": 1.9523476439078405e-05, + "loss": 0.7037, + "step": 816 + }, + { + "epoch": 0.4039055740946731, + "grad_norm": 0.1497919123002876, + "learning_rate": 1.9522288022605332e-05, + "loss": 0.7186, + "step": 817 + }, + { + "epoch": 0.4043999505623532, + "grad_norm": 0.15483496301513458, + "learning_rate": 1.952109816232482e-05, + "loss": 0.7419, + "step": 818 + }, + { + "epoch": 0.40489432703003336, + "grad_norm": 0.16663736861066672, + "learning_rate": 1.9519906858417286e-05, + "loss": 0.8045, + "step": 819 + }, + { + "epoch": 0.4053887034977135, + "grad_norm": 0.1439035257434949, + "learning_rate": 1.9518714111063355e-05, + "loss": 0.7053, + "step": 820 + }, + { + "epoch": 0.40588307996539363, + "grad_norm": 0.16090565238596582, + "learning_rate": 1.951751992044388e-05, + "loss": 0.7693, + "step": 821 + }, + { + "epoch": 0.40637745643307377, + "grad_norm": 0.15742749163471317, + "learning_rate": 1.9516324286739925e-05, + "loss": 0.6847, + "step": 822 + }, + { + "epoch": 0.4068718329007539, + "grad_norm": 0.15487526761313783, + "learning_rate": 1.9515127210132783e-05, + "loss": 0.6944, + "step": 823 + }, + { + "epoch": 0.40736620936843404, + "grad_norm": 0.15387791201505535, + "learning_rate": 1.951392869080395e-05, + "loss": 0.7017, + "step": 824 + }, + { + "epoch": 0.4078605858361142, + "grad_norm": 0.1494762916909954, + "learning_rate": 1.9512728728935162e-05, + "loss": 0.7005, + "step": 825 + }, + { + "epoch": 0.40835496230379437, + "grad_norm": 0.15138499066885533, + "learning_rate": 1.9511527324708354e-05, + "loss": 0.69, + "step": 826 + }, + { + "epoch": 0.4088493387714745, + "grad_norm": 0.14645162599771056, + "learning_rate": 1.9510324478305686e-05, + "loss": 0.6983, + "step": 827 + }, + { + "epoch": 0.40934371523915464, + "grad_norm": 0.14843754239440582, + "learning_rate": 1.9509120189909544e-05, + "loss": 0.7419, + "step": 828 + }, + { + "epoch": 0.4098380917068348, + "grad_norm": 0.15039311290922797, + "learning_rate": 1.9507914459702526e-05, + "loss": 0.7457, + "step": 829 + }, + { + "epoch": 0.4103324681745149, + "grad_norm": 0.15454692015746282, + "learning_rate": 1.950670728786745e-05, + "loss": 0.7032, + "step": 830 + }, + { + "epoch": 0.41082684464219504, + "grad_norm": 0.15969311478124418, + "learning_rate": 1.950549867458735e-05, + "loss": 0.7079, + "step": 831 + }, + { + "epoch": 0.4113212211098752, + "grad_norm": 0.16676944180635173, + "learning_rate": 1.950428862004548e-05, + "loss": 0.6832, + "step": 832 + }, + { + "epoch": 0.4118155975775553, + "grad_norm": 0.14520544000174182, + "learning_rate": 1.9503077124425318e-05, + "loss": 0.757, + "step": 833 + }, + { + "epoch": 0.41230997404523545, + "grad_norm": 0.1582594590809799, + "learning_rate": 1.9501864187910548e-05, + "loss": 0.711, + "step": 834 + }, + { + "epoch": 0.4128043505129156, + "grad_norm": 0.15542420542909732, + "learning_rate": 1.950064981068509e-05, + "loss": 0.7121, + "step": 835 + }, + { + "epoch": 0.4132987269805957, + "grad_norm": 0.14831118595198295, + "learning_rate": 1.9499433992933067e-05, + "loss": 0.7006, + "step": 836 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.16156826190067777, + "learning_rate": 1.9498216734838825e-05, + "loss": 0.7311, + "step": 837 + }, + { + "epoch": 0.414287479915956, + "grad_norm": 0.1517539722645047, + "learning_rate": 1.9496998036586935e-05, + "loss": 0.6739, + "step": 838 + }, + { + "epoch": 0.4147818563836361, + "grad_norm": 0.14995228270815908, + "learning_rate": 1.9495777898362172e-05, + "loss": 0.6844, + "step": 839 + }, + { + "epoch": 0.41527623285131626, + "grad_norm": 0.15007220143165412, + "learning_rate": 1.9494556320349546e-05, + "loss": 0.7286, + "step": 840 + }, + { + "epoch": 0.4157706093189964, + "grad_norm": 0.16464063436571583, + "learning_rate": 1.949333330273428e-05, + "loss": 0.6986, + "step": 841 + }, + { + "epoch": 0.41626498578667653, + "grad_norm": 0.15394216996221224, + "learning_rate": 1.9492108845701802e-05, + "loss": 0.7101, + "step": 842 + }, + { + "epoch": 0.41675936225435667, + "grad_norm": 0.15976545489658042, + "learning_rate": 1.9490882949437778e-05, + "loss": 0.71, + "step": 843 + }, + { + "epoch": 0.4172537387220368, + "grad_norm": 0.16929417508333894, + "learning_rate": 1.948965561412808e-05, + "loss": 0.7202, + "step": 844 + }, + { + "epoch": 0.417748115189717, + "grad_norm": 0.15922006582915013, + "learning_rate": 1.94884268399588e-05, + "loss": 0.7357, + "step": 845 + }, + { + "epoch": 0.41824249165739713, + "grad_norm": 0.16131834424798094, + "learning_rate": 1.9487196627116256e-05, + "loss": 0.7437, + "step": 846 + }, + { + "epoch": 0.41873686812507727, + "grad_norm": 0.16135724432759227, + "learning_rate": 1.9485964975786974e-05, + "loss": 0.7285, + "step": 847 + }, + { + "epoch": 0.4192312445927574, + "grad_norm": 0.1861949701572606, + "learning_rate": 1.9484731886157695e-05, + "loss": 0.7479, + "step": 848 + }, + { + "epoch": 0.41972562106043754, + "grad_norm": 0.15661606387386656, + "learning_rate": 1.9483497358415394e-05, + "loss": 0.7029, + "step": 849 + }, + { + "epoch": 0.4202199975281177, + "grad_norm": 0.16669224133669708, + "learning_rate": 1.9482261392747255e-05, + "loss": 0.7158, + "step": 850 + }, + { + "epoch": 0.4207143739957978, + "grad_norm": 0.14767509895235548, + "learning_rate": 1.9481023989340674e-05, + "loss": 0.7381, + "step": 851 + }, + { + "epoch": 0.42120875046347794, + "grad_norm": 0.1619645288179699, + "learning_rate": 1.9479785148383277e-05, + "loss": 0.7181, + "step": 852 + }, + { + "epoch": 0.4217031269311581, + "grad_norm": 0.1573022919354013, + "learning_rate": 1.94785448700629e-05, + "loss": 0.7828, + "step": 853 + }, + { + "epoch": 0.4221975033988382, + "grad_norm": 0.165028655379019, + "learning_rate": 1.9477303154567594e-05, + "loss": 0.7312, + "step": 854 + }, + { + "epoch": 0.42269187986651835, + "grad_norm": 0.1446103951873869, + "learning_rate": 1.9476060002085644e-05, + "loss": 0.707, + "step": 855 + }, + { + "epoch": 0.4231862563341985, + "grad_norm": 0.1558305900389545, + "learning_rate": 1.947481541280553e-05, + "loss": 0.6733, + "step": 856 + }, + { + "epoch": 0.4236806328018786, + "grad_norm": 0.16097317966401783, + "learning_rate": 1.9473569386915968e-05, + "loss": 0.7403, + "step": 857 + }, + { + "epoch": 0.42417500926955876, + "grad_norm": 0.16016717083405646, + "learning_rate": 1.9472321924605885e-05, + "loss": 0.737, + "step": 858 + }, + { + "epoch": 0.4246693857372389, + "grad_norm": 0.1532040917561234, + "learning_rate": 1.9471073026064427e-05, + "loss": 0.6862, + "step": 859 + }, + { + "epoch": 0.425163762204919, + "grad_norm": 0.1487458548547252, + "learning_rate": 1.9469822691480952e-05, + "loss": 0.682, + "step": 860 + }, + { + "epoch": 0.42565813867259916, + "grad_norm": 0.154829568791271, + "learning_rate": 1.9468570921045046e-05, + "loss": 0.7225, + "step": 861 + }, + { + "epoch": 0.4261525151402793, + "grad_norm": 0.1456453869952886, + "learning_rate": 1.9467317714946503e-05, + "loss": 0.689, + "step": 862 + }, + { + "epoch": 0.4266468916079595, + "grad_norm": 0.1577347765265502, + "learning_rate": 1.9466063073375342e-05, + "loss": 0.6522, + "step": 863 + }, + { + "epoch": 0.4271412680756396, + "grad_norm": 0.14487451131067533, + "learning_rate": 1.94648069965218e-05, + "loss": 0.7014, + "step": 864 + }, + { + "epoch": 0.42763564454331976, + "grad_norm": 0.16020378462944335, + "learning_rate": 1.9463549484576326e-05, + "loss": 0.7052, + "step": 865 + }, + { + "epoch": 0.4281300210109999, + "grad_norm": 0.15206719354293297, + "learning_rate": 1.946229053772958e-05, + "loss": 0.6888, + "step": 866 + }, + { + "epoch": 0.42862439747868003, + "grad_norm": 0.1399622186933564, + "learning_rate": 1.9461030156172463e-05, + "loss": 0.6844, + "step": 867 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 0.16712783476296758, + "learning_rate": 1.9459768340096073e-05, + "loss": 0.68, + "step": 868 + }, + { + "epoch": 0.4296131504140403, + "grad_norm": 0.144950820852366, + "learning_rate": 1.945850508969173e-05, + "loss": 0.7178, + "step": 869 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 0.23017423371185994, + "learning_rate": 1.945724040515097e-05, + "loss": 0.7404, + "step": 870 + }, + { + "epoch": 0.43060190334940057, + "grad_norm": 0.15306764457441277, + "learning_rate": 1.945597428666556e-05, + "loss": 0.6667, + "step": 871 + }, + { + "epoch": 0.4310962798170807, + "grad_norm": 0.16058381132063532, + "learning_rate": 1.9454706734427464e-05, + "loss": 0.7083, + "step": 872 + }, + { + "epoch": 0.43159065628476084, + "grad_norm": 0.14971950237757675, + "learning_rate": 1.9453437748628875e-05, + "loss": 0.6985, + "step": 873 + }, + { + "epoch": 0.432085032752441, + "grad_norm": 0.16820458656406365, + "learning_rate": 1.945216732946221e-05, + "loss": 0.8044, + "step": 874 + }, + { + "epoch": 0.4325794092201211, + "grad_norm": 0.15723935523313978, + "learning_rate": 1.9450895477120083e-05, + "loss": 0.7248, + "step": 875 + }, + { + "epoch": 0.43307378568780125, + "grad_norm": 0.17487010717329093, + "learning_rate": 1.9449622191795345e-05, + "loss": 0.6881, + "step": 876 + }, + { + "epoch": 0.4335681621554814, + "grad_norm": 0.15693489099643348, + "learning_rate": 1.9448347473681055e-05, + "loss": 0.7447, + "step": 877 + }, + { + "epoch": 0.4340625386231615, + "grad_norm": 0.15052684811999256, + "learning_rate": 1.944707132297049e-05, + "loss": 0.7147, + "step": 878 + }, + { + "epoch": 0.43455691509084166, + "grad_norm": 0.1831589645718345, + "learning_rate": 1.944579373985715e-05, + "loss": 0.7304, + "step": 879 + }, + { + "epoch": 0.4350512915585218, + "grad_norm": 0.14365368996614278, + "learning_rate": 1.944451472453474e-05, + "loss": 0.7154, + "step": 880 + }, + { + "epoch": 0.4355456680262019, + "grad_norm": 0.1790643085263659, + "learning_rate": 1.944323427719719e-05, + "loss": 0.721, + "step": 881 + }, + { + "epoch": 0.4360400444938821, + "grad_norm": 0.15316157412704204, + "learning_rate": 1.944195239803865e-05, + "loss": 0.7071, + "step": 882 + }, + { + "epoch": 0.43653442096156225, + "grad_norm": 0.16508047321166444, + "learning_rate": 1.9440669087253484e-05, + "loss": 0.744, + "step": 883 + }, + { + "epoch": 0.4370287974292424, + "grad_norm": 0.1640244054098838, + "learning_rate": 1.943938434503627e-05, + "loss": 0.6827, + "step": 884 + }, + { + "epoch": 0.4375231738969225, + "grad_norm": 0.154475807445737, + "learning_rate": 1.943809817158181e-05, + "loss": 0.724, + "step": 885 + }, + { + "epoch": 0.43801755036460266, + "grad_norm": 0.1558298846709404, + "learning_rate": 1.9436810567085113e-05, + "loss": 0.6967, + "step": 886 + }, + { + "epoch": 0.4385119268322828, + "grad_norm": 0.15407513834182873, + "learning_rate": 1.9435521531741414e-05, + "loss": 0.7438, + "step": 887 + }, + { + "epoch": 0.43900630329996293, + "grad_norm": 0.14592773945607293, + "learning_rate": 1.9434231065746165e-05, + "loss": 0.6904, + "step": 888 + }, + { + "epoch": 0.43950067976764307, + "grad_norm": 0.1667550186966124, + "learning_rate": 1.9432939169295023e-05, + "loss": 0.7267, + "step": 889 + }, + { + "epoch": 0.4399950562353232, + "grad_norm": 0.1431438071550419, + "learning_rate": 1.9431645842583878e-05, + "loss": 0.686, + "step": 890 + }, + { + "epoch": 0.44048943270300334, + "grad_norm": 0.1569203909097227, + "learning_rate": 1.9430351085808824e-05, + "loss": 0.6805, + "step": 891 + }, + { + "epoch": 0.44098380917068347, + "grad_norm": 0.1568347940950226, + "learning_rate": 1.9429054899166183e-05, + "loss": 0.7077, + "step": 892 + }, + { + "epoch": 0.4414781856383636, + "grad_norm": 0.15406841310598002, + "learning_rate": 1.9427757282852483e-05, + "loss": 0.697, + "step": 893 + }, + { + "epoch": 0.44197256210604374, + "grad_norm": 0.1558657881331361, + "learning_rate": 1.9426458237064477e-05, + "loss": 0.6875, + "step": 894 + }, + { + "epoch": 0.4424669385737239, + "grad_norm": 0.1487499330847824, + "learning_rate": 1.942515776199913e-05, + "loss": 0.6733, + "step": 895 + }, + { + "epoch": 0.442961315041404, + "grad_norm": 0.1513818380762374, + "learning_rate": 1.942385585785363e-05, + "loss": 0.7233, + "step": 896 + }, + { + "epoch": 0.44345569150908415, + "grad_norm": 0.15493714008023352, + "learning_rate": 1.9422552524825366e-05, + "loss": 0.7069, + "step": 897 + }, + { + "epoch": 0.4439500679767643, + "grad_norm": 0.15091356110611356, + "learning_rate": 1.942124776311196e-05, + "loss": 0.6986, + "step": 898 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.1423415556601199, + "learning_rate": 1.941994157291125e-05, + "loss": 0.7103, + "step": 899 + }, + { + "epoch": 0.44493882091212456, + "grad_norm": 0.14710179346495134, + "learning_rate": 1.9418633954421283e-05, + "loss": 0.7441, + "step": 900 + }, + { + "epoch": 0.44543319737980475, + "grad_norm": 0.16083937911363136, + "learning_rate": 1.941732490784032e-05, + "loss": 0.7266, + "step": 901 + }, + { + "epoch": 0.4459275738474849, + "grad_norm": 0.14794415500526806, + "learning_rate": 1.9416014433366857e-05, + "loss": 0.7061, + "step": 902 + }, + { + "epoch": 0.446421950315165, + "grad_norm": 0.141056288925905, + "learning_rate": 1.9414702531199577e-05, + "loss": 0.6737, + "step": 903 + }, + { + "epoch": 0.44691632678284515, + "grad_norm": 0.15187844961619623, + "learning_rate": 1.9413389201537405e-05, + "loss": 0.702, + "step": 904 + }, + { + "epoch": 0.4474107032505253, + "grad_norm": 0.15698440723934884, + "learning_rate": 1.9412074444579475e-05, + "loss": 0.7917, + "step": 905 + }, + { + "epoch": 0.4479050797182054, + "grad_norm": 0.14415544700743643, + "learning_rate": 1.9410758260525128e-05, + "loss": 0.6515, + "step": 906 + }, + { + "epoch": 0.44839945618588556, + "grad_norm": 0.14588744344657648, + "learning_rate": 1.9409440649573935e-05, + "loss": 0.7052, + "step": 907 + }, + { + "epoch": 0.4488938326535657, + "grad_norm": 0.14503674815203532, + "learning_rate": 1.9408121611925677e-05, + "loss": 0.7163, + "step": 908 + }, + { + "epoch": 0.44938820912124583, + "grad_norm": 0.15117436866852685, + "learning_rate": 1.940680114778035e-05, + "loss": 0.7064, + "step": 909 + }, + { + "epoch": 0.44988258558892597, + "grad_norm": 0.14531945005584745, + "learning_rate": 1.940547925733817e-05, + "loss": 0.7238, + "step": 910 + }, + { + "epoch": 0.4503769620566061, + "grad_norm": 0.15627159139724958, + "learning_rate": 1.9404155940799566e-05, + "loss": 0.7447, + "step": 911 + }, + { + "epoch": 0.45087133852428624, + "grad_norm": 0.14858153358746282, + "learning_rate": 1.940283119836518e-05, + "loss": 0.7064, + "step": 912 + }, + { + "epoch": 0.45136571499196637, + "grad_norm": 0.1494230065924164, + "learning_rate": 1.940150503023589e-05, + "loss": 0.6897, + "step": 913 + }, + { + "epoch": 0.4518600914596465, + "grad_norm": 0.15587865414239377, + "learning_rate": 1.9400177436612756e-05, + "loss": 0.7087, + "step": 914 + }, + { + "epoch": 0.45235446792732664, + "grad_norm": 0.14851155545884767, + "learning_rate": 1.9398848417697086e-05, + "loss": 0.7114, + "step": 915 + }, + { + "epoch": 0.4528488443950068, + "grad_norm": 0.14530683885642917, + "learning_rate": 1.9397517973690382e-05, + "loss": 0.6957, + "step": 916 + }, + { + "epoch": 0.4533432208626869, + "grad_norm": 0.15649694182994056, + "learning_rate": 1.9396186104794378e-05, + "loss": 0.753, + "step": 917 + }, + { + "epoch": 0.45383759733036705, + "grad_norm": 0.15357811517230768, + "learning_rate": 1.9394852811211014e-05, + "loss": 0.7297, + "step": 918 + }, + { + "epoch": 0.45433197379804724, + "grad_norm": 0.14714941598875922, + "learning_rate": 1.9393518093142453e-05, + "loss": 0.6998, + "step": 919 + }, + { + "epoch": 0.4548263502657274, + "grad_norm": 0.15057955086913455, + "learning_rate": 1.939218195079107e-05, + "loss": 0.7171, + "step": 920 + }, + { + "epoch": 0.4553207267334075, + "grad_norm": 0.156505739812787, + "learning_rate": 1.939084438435945e-05, + "loss": 0.7353, + "step": 921 + }, + { + "epoch": 0.45581510320108765, + "grad_norm": 0.15692566753867357, + "learning_rate": 1.9389505394050405e-05, + "loss": 0.7495, + "step": 922 + }, + { + "epoch": 0.4563094796687678, + "grad_norm": 0.15987927454659798, + "learning_rate": 1.9388164980066956e-05, + "loss": 0.7293, + "step": 923 + }, + { + "epoch": 0.4568038561364479, + "grad_norm": 0.15600558439552775, + "learning_rate": 1.9386823142612347e-05, + "loss": 0.7533, + "step": 924 + }, + { + "epoch": 0.45729823260412805, + "grad_norm": 0.14325922418271034, + "learning_rate": 1.938547988189003e-05, + "loss": 0.7326, + "step": 925 + }, + { + "epoch": 0.4577926090718082, + "grad_norm": 0.14895844650780446, + "learning_rate": 1.938413519810367e-05, + "loss": 0.7099, + "step": 926 + }, + { + "epoch": 0.4582869855394883, + "grad_norm": 0.14850172828633607, + "learning_rate": 1.938278909145716e-05, + "loss": 0.7398, + "step": 927 + }, + { + "epoch": 0.45878136200716846, + "grad_norm": 0.1391097533796557, + "learning_rate": 1.93814415621546e-05, + "loss": 0.7214, + "step": 928 + }, + { + "epoch": 0.4592757384748486, + "grad_norm": 0.16261846648558867, + "learning_rate": 1.9380092610400306e-05, + "loss": 0.6986, + "step": 929 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.15045741006746002, + "learning_rate": 1.9378742236398818e-05, + "loss": 0.7129, + "step": 930 + }, + { + "epoch": 0.46026449141020886, + "grad_norm": 0.15115486506252473, + "learning_rate": 1.9377390440354877e-05, + "loss": 0.7868, + "step": 931 + }, + { + "epoch": 0.460758867877889, + "grad_norm": 0.14619021884090408, + "learning_rate": 1.937603722247345e-05, + "loss": 0.7105, + "step": 932 + }, + { + "epoch": 0.46125324434556914, + "grad_norm": 0.3072182194636593, + "learning_rate": 1.937468258295972e-05, + "loss": 0.7609, + "step": 933 + }, + { + "epoch": 0.46174762081324927, + "grad_norm": 0.15099154677661858, + "learning_rate": 1.937332652201908e-05, + "loss": 0.702, + "step": 934 + }, + { + "epoch": 0.4622419972809294, + "grad_norm": 0.1430581554083585, + "learning_rate": 1.9371969039857144e-05, + "loss": 0.7025, + "step": 935 + }, + { + "epoch": 0.46273637374860954, + "grad_norm": 0.15327674030642247, + "learning_rate": 1.9370610136679738e-05, + "loss": 0.7273, + "step": 936 + }, + { + "epoch": 0.4632307502162897, + "grad_norm": 0.144904544693621, + "learning_rate": 1.93692498126929e-05, + "loss": 0.7214, + "step": 937 + }, + { + "epoch": 0.46372512668396987, + "grad_norm": 0.15901336342442182, + "learning_rate": 1.9367888068102898e-05, + "loss": 0.6965, + "step": 938 + }, + { + "epoch": 0.46421950315165, + "grad_norm": 0.1431182786162562, + "learning_rate": 1.93665249031162e-05, + "loss": 0.7075, + "step": 939 + }, + { + "epoch": 0.46471387961933014, + "grad_norm": 0.14865049711486608, + "learning_rate": 1.9365160317939488e-05, + "loss": 0.7096, + "step": 940 + }, + { + "epoch": 0.4652082560870103, + "grad_norm": 0.15409340046244216, + "learning_rate": 1.936379431277967e-05, + "loss": 0.7172, + "step": 941 + }, + { + "epoch": 0.4657026325546904, + "grad_norm": 0.14352774567592996, + "learning_rate": 1.936242688784387e-05, + "loss": 0.6578, + "step": 942 + }, + { + "epoch": 0.46619700902237055, + "grad_norm": 0.15001118924580997, + "learning_rate": 1.936105804333942e-05, + "loss": 0.7173, + "step": 943 + }, + { + "epoch": 0.4666913854900507, + "grad_norm": 0.1518864671657012, + "learning_rate": 1.9359687779473865e-05, + "loss": 0.723, + "step": 944 + }, + { + "epoch": 0.4671857619577308, + "grad_norm": 0.1571076748611444, + "learning_rate": 1.9358316096454977e-05, + "loss": 0.7602, + "step": 945 + }, + { + "epoch": 0.46768013842541095, + "grad_norm": 0.15853970487467764, + "learning_rate": 1.9356942994490727e-05, + "loss": 0.6922, + "step": 946 + }, + { + "epoch": 0.4681745148930911, + "grad_norm": 0.14158121718670455, + "learning_rate": 1.935556847378932e-05, + "loss": 0.7202, + "step": 947 + }, + { + "epoch": 0.4686688913607712, + "grad_norm": 0.16397850021280663, + "learning_rate": 1.9354192534559162e-05, + "loss": 0.7143, + "step": 948 + }, + { + "epoch": 0.46916326782845136, + "grad_norm": 0.1436684585194354, + "learning_rate": 1.935281517700888e-05, + "loss": 0.6909, + "step": 949 + }, + { + "epoch": 0.4696576442961315, + "grad_norm": 0.16994328454036273, + "learning_rate": 1.9351436401347308e-05, + "loss": 0.7516, + "step": 950 + }, + { + "epoch": 0.47015202076381163, + "grad_norm": 0.14903252238015627, + "learning_rate": 1.935005620778351e-05, + "loss": 0.7081, + "step": 951 + }, + { + "epoch": 0.47064639723149176, + "grad_norm": 0.15658129800392281, + "learning_rate": 1.9348674596526753e-05, + "loss": 0.7076, + "step": 952 + }, + { + "epoch": 0.4711407736991719, + "grad_norm": 0.16268998690817102, + "learning_rate": 1.9347291567786522e-05, + "loss": 0.7184, + "step": 953 + }, + { + "epoch": 0.47163515016685204, + "grad_norm": 0.1506718491590771, + "learning_rate": 1.9345907121772516e-05, + "loss": 0.7161, + "step": 954 + }, + { + "epoch": 0.47212952663453217, + "grad_norm": 0.15721009970776537, + "learning_rate": 1.9344521258694655e-05, + "loss": 0.707, + "step": 955 + }, + { + "epoch": 0.47262390310221236, + "grad_norm": 0.1505152355543545, + "learning_rate": 1.9343133978763062e-05, + "loss": 0.6921, + "step": 956 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 0.1609133761275411, + "learning_rate": 1.934174528218809e-05, + "loss": 0.6813, + "step": 957 + }, + { + "epoch": 0.47361265603757263, + "grad_norm": 0.1367821872229749, + "learning_rate": 1.934035516918029e-05, + "loss": 0.6994, + "step": 958 + }, + { + "epoch": 0.47410703250525277, + "grad_norm": 0.1599436222964233, + "learning_rate": 1.933896363995045e-05, + "loss": 0.7516, + "step": 959 + }, + { + "epoch": 0.4746014089729329, + "grad_norm": 0.1437864926090351, + "learning_rate": 1.933757069470954e-05, + "loss": 0.712, + "step": 960 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 0.1465132970349147, + "learning_rate": 1.9336176333668783e-05, + "loss": 0.7642, + "step": 961 + }, + { + "epoch": 0.4755901619082932, + "grad_norm": 0.15041402819440294, + "learning_rate": 1.933478055703958e-05, + "loss": 0.7437, + "step": 962 + }, + { + "epoch": 0.4760845383759733, + "grad_norm": 1.6285411753547836, + "learning_rate": 1.9333383365033582e-05, + "loss": 0.7287, + "step": 963 + }, + { + "epoch": 0.47657891484365345, + "grad_norm": 0.15758843655112065, + "learning_rate": 1.9331984757862625e-05, + "loss": 0.7275, + "step": 964 + }, + { + "epoch": 0.4770732913113336, + "grad_norm": 0.20428186762336542, + "learning_rate": 1.933058473573877e-05, + "loss": 0.7003, + "step": 965 + }, + { + "epoch": 0.4775676677790137, + "grad_norm": 0.16159015622795067, + "learning_rate": 1.9329183298874303e-05, + "loss": 0.6747, + "step": 966 + }, + { + "epoch": 0.47806204424669385, + "grad_norm": 0.16456264352483121, + "learning_rate": 1.9327780447481712e-05, + "loss": 0.7353, + "step": 967 + }, + { + "epoch": 0.478556420714374, + "grad_norm": 0.15937314094579783, + "learning_rate": 1.93263761817737e-05, + "loss": 0.7353, + "step": 968 + }, + { + "epoch": 0.4790507971820541, + "grad_norm": 0.16362188781757858, + "learning_rate": 1.932497050196319e-05, + "loss": 0.7109, + "step": 969 + }, + { + "epoch": 0.47954517364973426, + "grad_norm": 0.15960779423899688, + "learning_rate": 1.9323563408263316e-05, + "loss": 0.6979, + "step": 970 + }, + { + "epoch": 0.4800395501174144, + "grad_norm": 0.4120251285496098, + "learning_rate": 1.9322154900887428e-05, + "loss": 0.7273, + "step": 971 + }, + { + "epoch": 0.48053392658509453, + "grad_norm": 0.15815596259144765, + "learning_rate": 1.9320744980049087e-05, + "loss": 0.7384, + "step": 972 + }, + { + "epoch": 0.48102830305277466, + "grad_norm": 0.1534272409278291, + "learning_rate": 1.9319333645962074e-05, + "loss": 0.7171, + "step": 973 + }, + { + "epoch": 0.4815226795204548, + "grad_norm": 0.23241494671917182, + "learning_rate": 1.9317920898840377e-05, + "loss": 0.7341, + "step": 974 + }, + { + "epoch": 0.482017055988135, + "grad_norm": 0.15623558250725436, + "learning_rate": 1.9316506738898207e-05, + "loss": 0.6624, + "step": 975 + }, + { + "epoch": 0.4825114324558151, + "grad_norm": 0.1423521937169915, + "learning_rate": 1.9315091166349982e-05, + "loss": 0.6971, + "step": 976 + }, + { + "epoch": 0.48300580892349526, + "grad_norm": 0.1419744308670263, + "learning_rate": 1.931367418141034e-05, + "loss": 0.7089, + "step": 977 + }, + { + "epoch": 0.4835001853911754, + "grad_norm": 0.14785701877970409, + "learning_rate": 1.9312255784294127e-05, + "loss": 0.6893, + "step": 978 + }, + { + "epoch": 0.48399456185885553, + "grad_norm": 0.14219665008772206, + "learning_rate": 1.9310835975216405e-05, + "loss": 0.6977, + "step": 979 + }, + { + "epoch": 0.48448893832653567, + "grad_norm": 0.14374794997947582, + "learning_rate": 1.9309414754392452e-05, + "loss": 0.7262, + "step": 980 + }, + { + "epoch": 0.4849833147942158, + "grad_norm": 0.1490774610706772, + "learning_rate": 1.930799212203776e-05, + "loss": 0.7446, + "step": 981 + }, + { + "epoch": 0.48547769126189594, + "grad_norm": 0.1526885212758215, + "learning_rate": 1.930656807836804e-05, + "loss": 0.742, + "step": 982 + }, + { + "epoch": 0.4859720677295761, + "grad_norm": 0.14576144464560423, + "learning_rate": 1.9305142623599196e-05, + "loss": 0.7358, + "step": 983 + }, + { + "epoch": 0.4864664441972562, + "grad_norm": 0.14819296140720295, + "learning_rate": 1.9303715757947376e-05, + "loss": 0.7138, + "step": 984 + }, + { + "epoch": 0.48696082066493634, + "grad_norm": 0.14151324920404665, + "learning_rate": 1.9302287481628918e-05, + "loss": 0.7031, + "step": 985 + }, + { + "epoch": 0.4874551971326165, + "grad_norm": 0.143786412449448, + "learning_rate": 1.930085779486039e-05, + "loss": 0.7268, + "step": 986 + }, + { + "epoch": 0.4879495736002966, + "grad_norm": 0.15063924934390213, + "learning_rate": 1.9299426697858558e-05, + "loss": 0.7014, + "step": 987 + }, + { + "epoch": 0.48844395006797675, + "grad_norm": 0.14830097560437855, + "learning_rate": 1.9297994190840424e-05, + "loss": 0.6877, + "step": 988 + }, + { + "epoch": 0.4889383265356569, + "grad_norm": 0.13749478340170793, + "learning_rate": 1.9296560274023176e-05, + "loss": 0.6719, + "step": 989 + }, + { + "epoch": 0.489432703003337, + "grad_norm": 0.1414520577091615, + "learning_rate": 1.929512494762424e-05, + "loss": 0.7297, + "step": 990 + }, + { + "epoch": 0.48992707947101716, + "grad_norm": 0.14835115893869755, + "learning_rate": 1.9293688211861238e-05, + "loss": 0.7323, + "step": 991 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 0.15151382510323289, + "learning_rate": 1.9292250066952023e-05, + "loss": 0.7093, + "step": 992 + }, + { + "epoch": 0.4909158324063775, + "grad_norm": 0.1527360973465737, + "learning_rate": 1.9290810513114645e-05, + "loss": 0.7217, + "step": 993 + }, + { + "epoch": 0.4914102088740576, + "grad_norm": 0.1456699474553989, + "learning_rate": 1.9289369550567378e-05, + "loss": 0.7096, + "step": 994 + }, + { + "epoch": 0.49190458534173775, + "grad_norm": 0.15014967264094645, + "learning_rate": 1.9287927179528707e-05, + "loss": 0.7195, + "step": 995 + }, + { + "epoch": 0.4923989618094179, + "grad_norm": 0.15322810745287332, + "learning_rate": 1.9286483400217327e-05, + "loss": 0.7241, + "step": 996 + }, + { + "epoch": 0.492893338277098, + "grad_norm": 0.15002795067904215, + "learning_rate": 1.9285038212852153e-05, + "loss": 0.7047, + "step": 997 + }, + { + "epoch": 0.49338771474477816, + "grad_norm": 0.18294155846609528, + "learning_rate": 1.9283591617652307e-05, + "loss": 0.7085, + "step": 998 + }, + { + "epoch": 0.4938820912124583, + "grad_norm": 0.15159148731935193, + "learning_rate": 1.928214361483713e-05, + "loss": 0.7164, + "step": 999 + }, + { + "epoch": 0.49437646768013843, + "grad_norm": 0.1513225021764707, + "learning_rate": 1.9280694204626172e-05, + "loss": 0.7357, + "step": 1000 + }, + { + "epoch": 0.49487084414781857, + "grad_norm": 0.1545919459352924, + "learning_rate": 1.9279243387239202e-05, + "loss": 0.7085, + "step": 1001 + }, + { + "epoch": 0.4953652206154987, + "grad_norm": 0.14881491701469227, + "learning_rate": 1.9277791162896195e-05, + "loss": 0.7357, + "step": 1002 + }, + { + "epoch": 0.49585959708317884, + "grad_norm": 0.14452034252474208, + "learning_rate": 1.9276337531817346e-05, + "loss": 0.7035, + "step": 1003 + }, + { + "epoch": 0.496353973550859, + "grad_norm": 0.16487817496498444, + "learning_rate": 1.927488249422306e-05, + "loss": 0.6987, + "step": 1004 + }, + { + "epoch": 0.4968483500185391, + "grad_norm": 0.2573270981408878, + "learning_rate": 1.927342605033395e-05, + "loss": 0.7111, + "step": 1005 + }, + { + "epoch": 0.49734272648621924, + "grad_norm": 0.1508927412322339, + "learning_rate": 1.9271968200370855e-05, + "loss": 0.6991, + "step": 1006 + }, + { + "epoch": 0.4978371029538994, + "grad_norm": 0.16398977031684608, + "learning_rate": 1.9270508944554815e-05, + "loss": 0.701, + "step": 1007 + }, + { + "epoch": 0.4983314794215795, + "grad_norm": 0.14611850780949937, + "learning_rate": 1.926904828310709e-05, + "loss": 0.7171, + "step": 1008 + }, + { + "epoch": 0.49882585588925965, + "grad_norm": 0.16011655637148547, + "learning_rate": 1.926758621624915e-05, + "loss": 0.6785, + "step": 1009 + }, + { + "epoch": 0.4993202323569398, + "grad_norm": 0.1572885139618686, + "learning_rate": 1.926612274420269e-05, + "loss": 0.7152, + "step": 1010 + }, + { + "epoch": 0.4998146088246199, + "grad_norm": 0.152318921930535, + "learning_rate": 1.9264657867189595e-05, + "loss": 0.6991, + "step": 1011 + }, + { + "epoch": 0.5003089852923001, + "grad_norm": 0.1443417241945977, + "learning_rate": 1.9263191585431972e-05, + "loss": 0.6931, + "step": 1012 + }, + { + "epoch": 0.5003089852923001, + "eval_loss": 0.7131645679473877, + "eval_runtime": 81.8841, + "eval_samples_per_second": 370.695, + "eval_steps_per_second": 46.346, + "step": 1012 + }, + { + "epoch": 0.5008033617599802, + "grad_norm": 0.14232633073508563, + "learning_rate": 1.926172389915216e-05, + "loss": 0.6767, + "step": 1013 + }, + { + "epoch": 0.5012977382276603, + "grad_norm": 0.13896905062778325, + "learning_rate": 1.9260254808572685e-05, + "loss": 0.7132, + "step": 1014 + }, + { + "epoch": 0.5017921146953405, + "grad_norm": 0.14936622601941235, + "learning_rate": 1.9258784313916298e-05, + "loss": 0.6986, + "step": 1015 + }, + { + "epoch": 0.5022864911630206, + "grad_norm": 0.14625245545976318, + "learning_rate": 1.9257312415405963e-05, + "loss": 0.7017, + "step": 1016 + }, + { + "epoch": 0.5027808676307007, + "grad_norm": 0.14729552506171695, + "learning_rate": 1.9255839113264852e-05, + "loss": 0.7095, + "step": 1017 + }, + { + "epoch": 0.5032752440983809, + "grad_norm": 0.15527101313896943, + "learning_rate": 1.9254364407716356e-05, + "loss": 0.699, + "step": 1018 + }, + { + "epoch": 0.503769620566061, + "grad_norm": 0.16226603681845622, + "learning_rate": 1.9252888298984077e-05, + "loss": 0.7314, + "step": 1019 + }, + { + "epoch": 0.5042639970337411, + "grad_norm": 0.15392921364637596, + "learning_rate": 1.9251410787291826e-05, + "loss": 0.7021, + "step": 1020 + }, + { + "epoch": 0.5047583735014214, + "grad_norm": 0.1496097832644486, + "learning_rate": 1.9249931872863625e-05, + "loss": 0.699, + "step": 1021 + }, + { + "epoch": 0.5052527499691015, + "grad_norm": 0.19358389121786626, + "learning_rate": 1.924845155592372e-05, + "loss": 0.6638, + "step": 1022 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 0.14420387470186713, + "learning_rate": 1.924696983669656e-05, + "loss": 0.7099, + "step": 1023 + }, + { + "epoch": 0.5062415029044618, + "grad_norm": 0.14656891233069164, + "learning_rate": 1.924548671540681e-05, + "loss": 0.7053, + "step": 1024 + }, + { + "epoch": 0.5067358793721419, + "grad_norm": 0.14664505005500275, + "learning_rate": 1.9244002192279345e-05, + "loss": 0.713, + "step": 1025 + }, + { + "epoch": 0.5072302558398221, + "grad_norm": 0.19653231542961444, + "learning_rate": 1.9242516267539257e-05, + "loss": 0.7029, + "step": 1026 + }, + { + "epoch": 0.5077246323075022, + "grad_norm": 0.14436077006665818, + "learning_rate": 1.9241028941411846e-05, + "loss": 0.6996, + "step": 1027 + }, + { + "epoch": 0.5082190087751823, + "grad_norm": 0.14103268195946086, + "learning_rate": 1.9239540214122625e-05, + "loss": 0.6545, + "step": 1028 + }, + { + "epoch": 0.5087133852428625, + "grad_norm": 0.14356532193526764, + "learning_rate": 1.9238050085897324e-05, + "loss": 0.7527, + "step": 1029 + }, + { + "epoch": 0.5092077617105426, + "grad_norm": 0.14394746771297529, + "learning_rate": 1.923655855696188e-05, + "loss": 0.6993, + "step": 1030 + }, + { + "epoch": 0.5097021381782227, + "grad_norm": 0.15411059412056302, + "learning_rate": 1.9235065627542444e-05, + "loss": 0.6936, + "step": 1031 + }, + { + "epoch": 0.5101965146459029, + "grad_norm": 0.14292569956662665, + "learning_rate": 1.9233571297865383e-05, + "loss": 0.7463, + "step": 1032 + }, + { + "epoch": 0.510690891113583, + "grad_norm": 0.14604557500884927, + "learning_rate": 1.923207556815727e-05, + "loss": 0.7049, + "step": 1033 + }, + { + "epoch": 0.5111852675812631, + "grad_norm": 0.14311879097759964, + "learning_rate": 1.9230578438644897e-05, + "loss": 0.7249, + "step": 1034 + }, + { + "epoch": 0.5116796440489433, + "grad_norm": 0.15500985429555886, + "learning_rate": 1.9229079909555262e-05, + "loss": 0.703, + "step": 1035 + }, + { + "epoch": 0.5121740205166234, + "grad_norm": 0.1470124904838046, + "learning_rate": 1.9227579981115577e-05, + "loss": 0.7622, + "step": 1036 + }, + { + "epoch": 0.5126683969843036, + "grad_norm": 0.14660119068776123, + "learning_rate": 1.922607865355327e-05, + "loss": 0.7027, + "step": 1037 + }, + { + "epoch": 0.5131627734519837, + "grad_norm": 0.1543702108075682, + "learning_rate": 1.9224575927095976e-05, + "loss": 0.7249, + "step": 1038 + }, + { + "epoch": 0.5136571499196638, + "grad_norm": 0.21600367205314783, + "learning_rate": 1.9223071801971546e-05, + "loss": 0.7344, + "step": 1039 + }, + { + "epoch": 0.514151526387344, + "grad_norm": 0.13593589658323382, + "learning_rate": 1.922156627840804e-05, + "loss": 0.6668, + "step": 1040 + }, + { + "epoch": 0.5146459028550241, + "grad_norm": 0.15242315166513778, + "learning_rate": 1.9220059356633736e-05, + "loss": 0.7293, + "step": 1041 + }, + { + "epoch": 0.5151402793227042, + "grad_norm": 0.15495929822980747, + "learning_rate": 1.9218551036877113e-05, + "loss": 0.696, + "step": 1042 + }, + { + "epoch": 0.5156346557903844, + "grad_norm": 0.14303683098413958, + "learning_rate": 1.9217041319366872e-05, + "loss": 0.695, + "step": 1043 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.16496185115222772, + "learning_rate": 1.921553020433192e-05, + "loss": 0.7085, + "step": 1044 + }, + { + "epoch": 0.5166234087257446, + "grad_norm": 0.14497967730423772, + "learning_rate": 1.9214017692001384e-05, + "loss": 0.7233, + "step": 1045 + }, + { + "epoch": 0.5171177851934248, + "grad_norm": 0.15756211942948475, + "learning_rate": 1.921250378260459e-05, + "loss": 0.7547, + "step": 1046 + }, + { + "epoch": 0.5176121616611049, + "grad_norm": 0.1551013468601176, + "learning_rate": 1.921098847637109e-05, + "loss": 0.7443, + "step": 1047 + }, + { + "epoch": 0.518106538128785, + "grad_norm": 0.1401931361143721, + "learning_rate": 1.9209471773530634e-05, + "loss": 0.6724, + "step": 1048 + }, + { + "epoch": 0.5186009145964652, + "grad_norm": 0.16240969908756345, + "learning_rate": 1.9207953674313193e-05, + "loss": 0.7408, + "step": 1049 + }, + { + "epoch": 0.5190952910641453, + "grad_norm": 0.15870409918505887, + "learning_rate": 1.920643417894895e-05, + "loss": 0.6929, + "step": 1050 + }, + { + "epoch": 0.5195896675318254, + "grad_norm": 0.14750855374832555, + "learning_rate": 1.9204913287668295e-05, + "loss": 0.7486, + "step": 1051 + }, + { + "epoch": 0.5200840439995056, + "grad_norm": 0.14300679044438563, + "learning_rate": 1.9203391000701833e-05, + "loss": 0.6834, + "step": 1052 + }, + { + "epoch": 0.5205784204671857, + "grad_norm": 0.1571689077690893, + "learning_rate": 1.9201867318280375e-05, + "loss": 0.7025, + "step": 1053 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 0.14558138162722847, + "learning_rate": 1.9200342240634953e-05, + "loss": 0.6852, + "step": 1054 + }, + { + "epoch": 0.521567173402546, + "grad_norm": 0.15297021832729857, + "learning_rate": 1.9198815767996802e-05, + "loss": 0.7137, + "step": 1055 + }, + { + "epoch": 0.5220615498702261, + "grad_norm": 0.1488373088843154, + "learning_rate": 1.919728790059737e-05, + "loss": 0.7412, + "step": 1056 + }, + { + "epoch": 0.5225559263379063, + "grad_norm": 0.14891209541341033, + "learning_rate": 1.9195758638668326e-05, + "loss": 0.7423, + "step": 1057 + }, + { + "epoch": 0.5230503028055865, + "grad_norm": 0.15020322982316534, + "learning_rate": 1.9194227982441535e-05, + "loss": 0.7015, + "step": 1058 + }, + { + "epoch": 0.5235446792732666, + "grad_norm": 0.14129692280756834, + "learning_rate": 1.919269593214909e-05, + "loss": 0.7329, + "step": 1059 + }, + { + "epoch": 0.5240390557409468, + "grad_norm": 0.23417850734808754, + "learning_rate": 1.9191162488023277e-05, + "loss": 0.7566, + "step": 1060 + }, + { + "epoch": 0.5245334322086269, + "grad_norm": 0.14521594728108062, + "learning_rate": 1.9189627650296603e-05, + "loss": 0.7109, + "step": 1061 + }, + { + "epoch": 0.525027808676307, + "grad_norm": 0.16478725501302946, + "learning_rate": 1.9188091419201795e-05, + "loss": 0.7013, + "step": 1062 + }, + { + "epoch": 0.5255221851439872, + "grad_norm": 0.16610765334442631, + "learning_rate": 1.9186553794971776e-05, + "loss": 0.7157, + "step": 1063 + }, + { + "epoch": 0.5260165616116673, + "grad_norm": 0.15735957426516808, + "learning_rate": 1.918501477783969e-05, + "loss": 0.7052, + "step": 1064 + }, + { + "epoch": 0.5265109380793475, + "grad_norm": 0.15450749647487286, + "learning_rate": 1.9183474368038884e-05, + "loss": 0.7771, + "step": 1065 + }, + { + "epoch": 0.5270053145470276, + "grad_norm": 0.14891172848221126, + "learning_rate": 1.918193256580293e-05, + "loss": 0.729, + "step": 1066 + }, + { + "epoch": 0.5274996910147077, + "grad_norm": 0.16227205640443187, + "learning_rate": 1.9180389371365594e-05, + "loss": 0.7053, + "step": 1067 + }, + { + "epoch": 0.5279940674823879, + "grad_norm": 0.1471120302820779, + "learning_rate": 1.917884478496086e-05, + "loss": 0.7081, + "step": 1068 + }, + { + "epoch": 0.528488443950068, + "grad_norm": 0.16332770423736193, + "learning_rate": 1.9177298806822933e-05, + "loss": 0.6991, + "step": 1069 + }, + { + "epoch": 0.5289828204177481, + "grad_norm": 0.14497329805859818, + "learning_rate": 1.9175751437186213e-05, + "loss": 0.7046, + "step": 1070 + }, + { + "epoch": 0.5294771968854283, + "grad_norm": 0.14392662997782898, + "learning_rate": 1.9174202676285324e-05, + "loss": 0.7251, + "step": 1071 + }, + { + "epoch": 0.5299715733531084, + "grad_norm": 0.14948995650308472, + "learning_rate": 1.917265252435509e-05, + "loss": 0.6941, + "step": 1072 + }, + { + "epoch": 0.5304659498207885, + "grad_norm": 0.14937277983713662, + "learning_rate": 1.9171100981630555e-05, + "loss": 0.715, + "step": 1073 + }, + { + "epoch": 0.5309603262884687, + "grad_norm": 0.1505491295646815, + "learning_rate": 1.9169548048346968e-05, + "loss": 0.7113, + "step": 1074 + }, + { + "epoch": 0.5314547027561488, + "grad_norm": 0.14861663621077076, + "learning_rate": 1.916799372473979e-05, + "loss": 0.7309, + "step": 1075 + }, + { + "epoch": 0.531949079223829, + "grad_norm": 0.1614311197779999, + "learning_rate": 1.91664380110447e-05, + "loss": 0.6762, + "step": 1076 + }, + { + "epoch": 0.5324434556915091, + "grad_norm": 0.1553490231925976, + "learning_rate": 1.9164880907497576e-05, + "loss": 0.7201, + "step": 1077 + }, + { + "epoch": 0.5329378321591892, + "grad_norm": 0.16328753854118677, + "learning_rate": 1.9163322414334515e-05, + "loss": 0.7638, + "step": 1078 + }, + { + "epoch": 0.5334322086268694, + "grad_norm": 0.14374299961804915, + "learning_rate": 1.9161762531791814e-05, + "loss": 0.7146, + "step": 1079 + }, + { + "epoch": 0.5339265850945495, + "grad_norm": 0.1565169685615777, + "learning_rate": 1.9160201260106e-05, + "loss": 0.6932, + "step": 1080 + }, + { + "epoch": 0.5344209615622296, + "grad_norm": 0.15105748120087173, + "learning_rate": 1.9158638599513793e-05, + "loss": 0.7337, + "step": 1081 + }, + { + "epoch": 0.5349153380299098, + "grad_norm": 0.15807502071554957, + "learning_rate": 1.915707455025213e-05, + "loss": 0.7088, + "step": 1082 + }, + { + "epoch": 0.5354097144975899, + "grad_norm": 0.14696836012801506, + "learning_rate": 1.915550911255816e-05, + "loss": 0.6648, + "step": 1083 + }, + { + "epoch": 0.53590409096527, + "grad_norm": 0.14897293426776106, + "learning_rate": 1.9153942286669242e-05, + "loss": 0.7197, + "step": 1084 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 0.163890804850269, + "learning_rate": 1.9152374072822945e-05, + "loss": 0.7057, + "step": 1085 + }, + { + "epoch": 0.5368928439006303, + "grad_norm": 0.14455320078823453, + "learning_rate": 1.915080447125704e-05, + "loss": 0.6772, + "step": 1086 + }, + { + "epoch": 0.5373872203683104, + "grad_norm": 0.1539200622731823, + "learning_rate": 1.9149233482209528e-05, + "loss": 0.7307, + "step": 1087 + }, + { + "epoch": 0.5378815968359906, + "grad_norm": 0.14474723063262754, + "learning_rate": 1.9147661105918597e-05, + "loss": 0.6887, + "step": 1088 + }, + { + "epoch": 0.5383759733036707, + "grad_norm": 0.14798651157763634, + "learning_rate": 1.9146087342622666e-05, + "loss": 0.7325, + "step": 1089 + }, + { + "epoch": 0.5388703497713508, + "grad_norm": 0.14662312922217188, + "learning_rate": 1.914451219256035e-05, + "loss": 0.7126, + "step": 1090 + }, + { + "epoch": 0.539364726239031, + "grad_norm": 0.1504610801024387, + "learning_rate": 1.914293565597048e-05, + "loss": 0.7491, + "step": 1091 + }, + { + "epoch": 0.5398591027067111, + "grad_norm": 0.14821391099410935, + "learning_rate": 1.9141357733092103e-05, + "loss": 0.7092, + "step": 1092 + }, + { + "epoch": 0.5403534791743912, + "grad_norm": 0.1522156428958628, + "learning_rate": 1.913977842416446e-05, + "loss": 0.7073, + "step": 1093 + }, + { + "epoch": 0.5408478556420714, + "grad_norm": 0.15081697031581798, + "learning_rate": 1.913819772942702e-05, + "loss": 0.7273, + "step": 1094 + }, + { + "epoch": 0.5413422321097516, + "grad_norm": 0.15594240293555503, + "learning_rate": 1.9136615649119457e-05, + "loss": 0.6684, + "step": 1095 + }, + { + "epoch": 0.5418366085774318, + "grad_norm": 0.14729119327575693, + "learning_rate": 1.913503218348164e-05, + "loss": 0.725, + "step": 1096 + }, + { + "epoch": 0.5423309850451119, + "grad_norm": 0.14160326130292644, + "learning_rate": 1.913344733275367e-05, + "loss": 0.6993, + "step": 1097 + }, + { + "epoch": 0.542825361512792, + "grad_norm": 0.1587649747241466, + "learning_rate": 1.9131861097175847e-05, + "loss": 0.7102, + "step": 1098 + }, + { + "epoch": 0.5433197379804722, + "grad_norm": 0.15546820362541536, + "learning_rate": 1.9130273476988676e-05, + "loss": 0.6846, + "step": 1099 + }, + { + "epoch": 0.5438141144481523, + "grad_norm": 0.14852241877210168, + "learning_rate": 1.912868447243289e-05, + "loss": 0.7247, + "step": 1100 + }, + { + "epoch": 0.5443084909158324, + "grad_norm": 0.16701501792955495, + "learning_rate": 1.912709408374941e-05, + "loss": 0.7052, + "step": 1101 + }, + { + "epoch": 0.5448028673835126, + "grad_norm": 0.1414427475637627, + "learning_rate": 1.9125502311179383e-05, + "loss": 0.6754, + "step": 1102 + }, + { + "epoch": 0.5452972438511927, + "grad_norm": 0.1588552025801632, + "learning_rate": 1.9123909154964156e-05, + "loss": 0.7074, + "step": 1103 + }, + { + "epoch": 0.5457916203188728, + "grad_norm": 0.15414219338774016, + "learning_rate": 1.9122314615345292e-05, + "loss": 0.7186, + "step": 1104 + }, + { + "epoch": 0.546285996786553, + "grad_norm": 0.16616293795198825, + "learning_rate": 1.912071869256456e-05, + "loss": 0.7316, + "step": 1105 + }, + { + "epoch": 0.5467803732542331, + "grad_norm": 0.15430811503740535, + "learning_rate": 1.911912138686394e-05, + "loss": 0.7067, + "step": 1106 + }, + { + "epoch": 0.5472747497219133, + "grad_norm": 0.1496676121112595, + "learning_rate": 1.911752269848563e-05, + "loss": 0.7034, + "step": 1107 + }, + { + "epoch": 0.5477691261895934, + "grad_norm": 0.16284405605153313, + "learning_rate": 1.9115922627672015e-05, + "loss": 0.7511, + "step": 1108 + }, + { + "epoch": 0.5482635026572735, + "grad_norm": 0.1610829233296775, + "learning_rate": 1.9114321174665717e-05, + "loss": 0.6671, + "step": 1109 + }, + { + "epoch": 0.5487578791249537, + "grad_norm": 0.16485881085672005, + "learning_rate": 1.9112718339709546e-05, + "loss": 0.7238, + "step": 1110 + }, + { + "epoch": 0.5492522555926338, + "grad_norm": 0.1516726857176166, + "learning_rate": 1.9111114123046537e-05, + "loss": 0.7348, + "step": 1111 + }, + { + "epoch": 0.5497466320603139, + "grad_norm": 0.14898362997232828, + "learning_rate": 1.9109508524919923e-05, + "loss": 0.663, + "step": 1112 + }, + { + "epoch": 0.5502410085279941, + "grad_norm": 0.1864080967215774, + "learning_rate": 1.9107901545573152e-05, + "loss": 0.7145, + "step": 1113 + }, + { + "epoch": 0.5507353849956742, + "grad_norm": 0.1495103838604709, + "learning_rate": 1.910629318524988e-05, + "loss": 0.6955, + "step": 1114 + }, + { + "epoch": 0.5512297614633543, + "grad_norm": 0.16260832321379898, + "learning_rate": 1.9104683444193978e-05, + "loss": 0.7273, + "step": 1115 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.14779333570016012, + "learning_rate": 1.9103072322649514e-05, + "loss": 0.7119, + "step": 1116 + }, + { + "epoch": 0.5522185143987146, + "grad_norm": 0.16525076489234627, + "learning_rate": 1.910145982086078e-05, + "loss": 0.7403, + "step": 1117 + }, + { + "epoch": 0.5527128908663947, + "grad_norm": 0.1622821706479494, + "learning_rate": 1.9099845939072265e-05, + "loss": 0.7321, + "step": 1118 + }, + { + "epoch": 0.5532072673340749, + "grad_norm": 0.14097317026197764, + "learning_rate": 1.9098230677528673e-05, + "loss": 0.6823, + "step": 1119 + }, + { + "epoch": 0.553701643801755, + "grad_norm": 0.1590702754309319, + "learning_rate": 1.909661403647492e-05, + "loss": 0.6783, + "step": 1120 + }, + { + "epoch": 0.5541960202694352, + "grad_norm": 0.15171107306676515, + "learning_rate": 1.909499601615612e-05, + "loss": 0.681, + "step": 1121 + }, + { + "epoch": 0.5546903967371153, + "grad_norm": 0.14764568710515458, + "learning_rate": 1.9093376616817612e-05, + "loss": 0.6597, + "step": 1122 + }, + { + "epoch": 0.5551847732047954, + "grad_norm": 0.14991491328685738, + "learning_rate": 1.9091755838704932e-05, + "loss": 0.7221, + "step": 1123 + }, + { + "epoch": 0.5556791496724756, + "grad_norm": 0.15210550482127097, + "learning_rate": 1.9090133682063827e-05, + "loss": 0.6696, + "step": 1124 + }, + { + "epoch": 0.5561735261401557, + "grad_norm": 0.1596369490783544, + "learning_rate": 1.9088510147140258e-05, + "loss": 0.7016, + "step": 1125 + }, + { + "epoch": 0.5566679026078358, + "grad_norm": 0.15897053734023317, + "learning_rate": 1.908688523418039e-05, + "loss": 0.698, + "step": 1126 + }, + { + "epoch": 0.557162279075516, + "grad_norm": 0.16710147725381863, + "learning_rate": 1.9085258943430603e-05, + "loss": 0.696, + "step": 1127 + }, + { + "epoch": 0.5576566555431961, + "grad_norm": 0.15048580539039197, + "learning_rate": 1.9083631275137473e-05, + "loss": 0.7153, + "step": 1128 + }, + { + "epoch": 0.5581510320108762, + "grad_norm": 0.1596968561271395, + "learning_rate": 1.9082002229547806e-05, + "loss": 0.6827, + "step": 1129 + }, + { + "epoch": 0.5586454084785564, + "grad_norm": 0.1505906704423023, + "learning_rate": 1.9080371806908592e-05, + "loss": 0.6976, + "step": 1130 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 0.15332042609455762, + "learning_rate": 1.9078740007467046e-05, + "loss": 0.696, + "step": 1131 + }, + { + "epoch": 0.5596341614139168, + "grad_norm": 0.147649004639, + "learning_rate": 1.9077106831470594e-05, + "loss": 0.6944, + "step": 1132 + }, + { + "epoch": 0.5601285378815969, + "grad_norm": 0.15368644924171632, + "learning_rate": 1.9075472279166858e-05, + "loss": 0.779, + "step": 1133 + }, + { + "epoch": 0.560622914349277, + "grad_norm": 0.15216227240028807, + "learning_rate": 1.9073836350803678e-05, + "loss": 0.7072, + "step": 1134 + }, + { + "epoch": 0.5611172908169572, + "grad_norm": 0.15062944884776627, + "learning_rate": 1.90721990466291e-05, + "loss": 0.6907, + "step": 1135 + }, + { + "epoch": 0.5616116672846373, + "grad_norm": 0.16052321702939135, + "learning_rate": 1.907056036689138e-05, + "loss": 0.7171, + "step": 1136 + }, + { + "epoch": 0.5621060437523174, + "grad_norm": 0.15540754745392985, + "learning_rate": 1.9068920311838975e-05, + "loss": 0.7407, + "step": 1137 + }, + { + "epoch": 0.5626004202199976, + "grad_norm": 0.19659765983118427, + "learning_rate": 1.9067278881720565e-05, + "loss": 0.6995, + "step": 1138 + }, + { + "epoch": 0.5630947966876777, + "grad_norm": 0.15133270811163935, + "learning_rate": 1.9065636076785025e-05, + "loss": 0.727, + "step": 1139 + }, + { + "epoch": 0.5635891731553578, + "grad_norm": 0.20149832941252674, + "learning_rate": 1.9063991897281443e-05, + "loss": 0.7028, + "step": 1140 + }, + { + "epoch": 0.564083549623038, + "grad_norm": 0.16163955820916115, + "learning_rate": 1.9062346343459122e-05, + "loss": 0.7404, + "step": 1141 + }, + { + "epoch": 0.5645779260907181, + "grad_norm": 0.15003391870662094, + "learning_rate": 1.9060699415567563e-05, + "loss": 0.6758, + "step": 1142 + }, + { + "epoch": 0.5650723025583982, + "grad_norm": 0.1578784580335935, + "learning_rate": 1.9059051113856476e-05, + "loss": 0.7052, + "step": 1143 + }, + { + "epoch": 0.5655666790260784, + "grad_norm": 0.15230633719007233, + "learning_rate": 1.9057401438575792e-05, + "loss": 0.6696, + "step": 1144 + }, + { + "epoch": 0.5660610554937585, + "grad_norm": 0.1581969573217512, + "learning_rate": 1.9055750389975634e-05, + "loss": 0.7054, + "step": 1145 + }, + { + "epoch": 0.5665554319614386, + "grad_norm": 0.14913349203440593, + "learning_rate": 1.9054097968306347e-05, + "loss": 0.7143, + "step": 1146 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 0.16289562268884544, + "learning_rate": 1.905244417381847e-05, + "loss": 0.7295, + "step": 1147 + }, + { + "epoch": 0.5675441848967989, + "grad_norm": 0.15579135964268526, + "learning_rate": 1.9050789006762766e-05, + "loss": 0.7295, + "step": 1148 + }, + { + "epoch": 0.568038561364479, + "grad_norm": 0.1648155147608089, + "learning_rate": 1.9049132467390186e-05, + "loss": 0.6897, + "step": 1149 + }, + { + "epoch": 0.5685329378321592, + "grad_norm": 0.14974602871018472, + "learning_rate": 1.904747455595192e-05, + "loss": 0.6777, + "step": 1150 + }, + { + "epoch": 0.5690273142998393, + "grad_norm": 0.161795258534095, + "learning_rate": 1.904581527269933e-05, + "loss": 0.7186, + "step": 1151 + }, + { + "epoch": 0.5695216907675195, + "grad_norm": 0.14533866981623988, + "learning_rate": 1.9044154617884013e-05, + "loss": 0.7011, + "step": 1152 + }, + { + "epoch": 0.5700160672351996, + "grad_norm": 0.1421932971889725, + "learning_rate": 1.9042492591757757e-05, + "loss": 0.6805, + "step": 1153 + }, + { + "epoch": 0.5705104437028797, + "grad_norm": 0.16873958566653227, + "learning_rate": 1.904082919457257e-05, + "loss": 0.7001, + "step": 1154 + }, + { + "epoch": 0.5710048201705599, + "grad_norm": 0.14563410128378737, + "learning_rate": 1.9039164426580667e-05, + "loss": 0.6757, + "step": 1155 + }, + { + "epoch": 0.57149919663824, + "grad_norm": 0.21882232972545032, + "learning_rate": 1.9037498288034455e-05, + "loss": 0.717, + "step": 1156 + }, + { + "epoch": 0.5719935731059201, + "grad_norm": 0.15011912377626907, + "learning_rate": 1.9035830779186567e-05, + "loss": 0.7086, + "step": 1157 + }, + { + "epoch": 0.5724879495736003, + "grad_norm": 0.1545955522405525, + "learning_rate": 1.9034161900289844e-05, + "loss": 0.6944, + "step": 1158 + }, + { + "epoch": 0.5729823260412804, + "grad_norm": 0.15432856797120795, + "learning_rate": 1.9032491651597316e-05, + "loss": 0.7175, + "step": 1159 + }, + { + "epoch": 0.5734767025089605, + "grad_norm": 0.14657424965019364, + "learning_rate": 1.9030820033362238e-05, + "loss": 0.6655, + "step": 1160 + }, + { + "epoch": 0.5739710789766407, + "grad_norm": 0.1949191843722662, + "learning_rate": 1.902914704583807e-05, + "loss": 0.7122, + "step": 1161 + }, + { + "epoch": 0.5744654554443208, + "grad_norm": 0.14850159393029633, + "learning_rate": 1.9027472689278475e-05, + "loss": 0.7135, + "step": 1162 + }, + { + "epoch": 0.574959831912001, + "grad_norm": 0.1444952526086056, + "learning_rate": 1.902579696393733e-05, + "loss": 0.684, + "step": 1163 + }, + { + "epoch": 0.5754542083796811, + "grad_norm": 0.1419748849587667, + "learning_rate": 1.9024119870068705e-05, + "loss": 0.6975, + "step": 1164 + }, + { + "epoch": 0.5759485848473612, + "grad_norm": 0.1425968859430955, + "learning_rate": 1.902244140792689e-05, + "loss": 0.6969, + "step": 1165 + }, + { + "epoch": 0.5764429613150414, + "grad_norm": 0.17121096649794953, + "learning_rate": 1.902076157776639e-05, + "loss": 0.7416, + "step": 1166 + }, + { + "epoch": 0.5769373377827215, + "grad_norm": 0.24904395373980942, + "learning_rate": 1.90190803798419e-05, + "loss": 0.7528, + "step": 1167 + }, + { + "epoch": 0.5774317142504016, + "grad_norm": 0.17356837371044814, + "learning_rate": 1.9017397814408332e-05, + "loss": 0.7208, + "step": 1168 + }, + { + "epoch": 0.5779260907180818, + "grad_norm": 0.15177599391353247, + "learning_rate": 1.90157138817208e-05, + "loss": 0.6783, + "step": 1169 + }, + { + "epoch": 0.578420467185762, + "grad_norm": 0.1536046463654241, + "learning_rate": 1.9014028582034635e-05, + "loss": 0.6785, + "step": 1170 + }, + { + "epoch": 0.5789148436534421, + "grad_norm": 0.1540949099046599, + "learning_rate": 1.901234191560536e-05, + "loss": 0.6844, + "step": 1171 + }, + { + "epoch": 0.5794092201211223, + "grad_norm": 0.14576187206860225, + "learning_rate": 1.9010653882688723e-05, + "loss": 0.676, + "step": 1172 + }, + { + "epoch": 0.5799035965888024, + "grad_norm": 0.15049364535062662, + "learning_rate": 1.9008964483540662e-05, + "loss": 0.7151, + "step": 1173 + }, + { + "epoch": 0.5803979730564826, + "grad_norm": 0.7411230195474802, + "learning_rate": 1.900727371841734e-05, + "loss": 0.6999, + "step": 1174 + }, + { + "epoch": 0.5808923495241627, + "grad_norm": 0.15885264939402002, + "learning_rate": 1.900558158757511e-05, + "loss": 0.7252, + "step": 1175 + }, + { + "epoch": 0.5813867259918428, + "grad_norm": 0.14916669324621157, + "learning_rate": 1.900388809127054e-05, + "loss": 0.6916, + "step": 1176 + }, + { + "epoch": 0.581881102459523, + "grad_norm": 0.15368910262038413, + "learning_rate": 1.900219322976041e-05, + "loss": 0.6961, + "step": 1177 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 0.1563869676472337, + "learning_rate": 1.9000497003301698e-05, + "loss": 0.6994, + "step": 1178 + }, + { + "epoch": 0.5828698553948832, + "grad_norm": 0.14630141880746228, + "learning_rate": 1.899879941215159e-05, + "loss": 0.6992, + "step": 1179 + }, + { + "epoch": 0.5833642318625634, + "grad_norm": 0.15116208468901926, + "learning_rate": 1.899710045656749e-05, + "loss": 0.6985, + "step": 1180 + }, + { + "epoch": 0.5838586083302435, + "grad_norm": 0.14499928642781532, + "learning_rate": 1.8995400136806993e-05, + "loss": 0.6873, + "step": 1181 + }, + { + "epoch": 0.5843529847979236, + "grad_norm": 0.14755026898185206, + "learning_rate": 1.8993698453127907e-05, + "loss": 0.6969, + "step": 1182 + }, + { + "epoch": 0.5848473612656038, + "grad_norm": 0.17549169522938585, + "learning_rate": 1.8991995405788254e-05, + "loss": 0.7157, + "step": 1183 + }, + { + "epoch": 0.5853417377332839, + "grad_norm": 0.14243603268065128, + "learning_rate": 1.8990290995046255e-05, + "loss": 0.7294, + "step": 1184 + }, + { + "epoch": 0.585836114200964, + "grad_norm": 0.15507295326334805, + "learning_rate": 1.898858522116034e-05, + "loss": 0.6821, + "step": 1185 + }, + { + "epoch": 0.5863304906686442, + "grad_norm": 0.1443396202852229, + "learning_rate": 1.8986878084389143e-05, + "loss": 0.6862, + "step": 1186 + }, + { + "epoch": 0.5868248671363243, + "grad_norm": 0.15307605157091345, + "learning_rate": 1.898516958499151e-05, + "loss": 0.6918, + "step": 1187 + }, + { + "epoch": 0.5873192436040044, + "grad_norm": 0.15431550710602435, + "learning_rate": 1.898345972322648e-05, + "loss": 0.6591, + "step": 1188 + }, + { + "epoch": 0.5878136200716846, + "grad_norm": 0.1522248973768696, + "learning_rate": 1.898174849935333e-05, + "loss": 0.6953, + "step": 1189 + }, + { + "epoch": 0.5883079965393647, + "grad_norm": 3.9451857999902575, + "learning_rate": 1.8980035913631503e-05, + "loss": 0.7075, + "step": 1190 + }, + { + "epoch": 0.5888023730070449, + "grad_norm": 0.1859439631517137, + "learning_rate": 1.8978321966320677e-05, + "loss": 0.7313, + "step": 1191 + }, + { + "epoch": 0.589296749474725, + "grad_norm": 0.15065948926649575, + "learning_rate": 1.8976606657680724e-05, + "loss": 0.6716, + "step": 1192 + }, + { + "epoch": 0.5897911259424051, + "grad_norm": 0.15431474273975415, + "learning_rate": 1.8974889987971732e-05, + "loss": 0.7193, + "step": 1193 + }, + { + "epoch": 0.5902855024100853, + "grad_norm": 0.15732364799828052, + "learning_rate": 1.8973171957453986e-05, + "loss": 0.6965, + "step": 1194 + }, + { + "epoch": 0.5907798788777654, + "grad_norm": 5.7300654030100775, + "learning_rate": 1.8971452566387972e-05, + "loss": 0.965, + "step": 1195 + }, + { + "epoch": 0.5912742553454455, + "grad_norm": 0.21443211348560948, + "learning_rate": 1.8969731815034405e-05, + "loss": 0.7068, + "step": 1196 + }, + { + "epoch": 0.5917686318131257, + "grad_norm": 0.16399228898032497, + "learning_rate": 1.8968009703654186e-05, + "loss": 0.6596, + "step": 1197 + }, + { + "epoch": 0.5922630082808058, + "grad_norm": 0.18319856247483485, + "learning_rate": 1.896628623250843e-05, + "loss": 0.7016, + "step": 1198 + }, + { + "epoch": 0.5927573847484859, + "grad_norm": 0.1583456706755328, + "learning_rate": 1.896456140185845e-05, + "loss": 0.6951, + "step": 1199 + }, + { + "epoch": 0.5932517612161661, + "grad_norm": 0.17042593881933754, + "learning_rate": 1.896283521196578e-05, + "loss": 0.7549, + "step": 1200 + }, + { + "epoch": 0.5937461376838462, + "grad_norm": 0.15657395509128008, + "learning_rate": 1.896110766309215e-05, + "loss": 0.6975, + "step": 1201 + }, + { + "epoch": 0.5942405141515263, + "grad_norm": 0.16993603394114548, + "learning_rate": 1.8959378755499497e-05, + "loss": 0.7414, + "step": 1202 + }, + { + "epoch": 0.5947348906192065, + "grad_norm": 0.16430177746529062, + "learning_rate": 1.895764848944996e-05, + "loss": 0.731, + "step": 1203 + }, + { + "epoch": 0.5952292670868866, + "grad_norm": 0.17765317783363976, + "learning_rate": 1.8955916865205896e-05, + "loss": 0.7207, + "step": 1204 + }, + { + "epoch": 0.5957236435545668, + "grad_norm": 0.16384697778373508, + "learning_rate": 1.8954183883029858e-05, + "loss": 0.6846, + "step": 1205 + }, + { + "epoch": 0.5962180200222469, + "grad_norm": 0.16661731453992853, + "learning_rate": 1.8952449543184606e-05, + "loss": 0.7258, + "step": 1206 + }, + { + "epoch": 0.5967123964899271, + "grad_norm": 0.15417368432563872, + "learning_rate": 1.8950713845933112e-05, + "loss": 0.6965, + "step": 1207 + }, + { + "epoch": 0.5972067729576073, + "grad_norm": 0.17185924799694866, + "learning_rate": 1.894897679153855e-05, + "loss": 0.7623, + "step": 1208 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 0.15766861462488546, + "learning_rate": 1.894723838026429e-05, + "loss": 0.6945, + "step": 1209 + }, + { + "epoch": 0.5981955258929675, + "grad_norm": 0.14485525888236075, + "learning_rate": 1.8945498612373926e-05, + "loss": 0.7155, + "step": 1210 + }, + { + "epoch": 0.5986899023606477, + "grad_norm": 0.16215757833890687, + "learning_rate": 1.8943757488131242e-05, + "loss": 0.6878, + "step": 1211 + }, + { + "epoch": 0.5991842788283278, + "grad_norm": 0.1520516722346122, + "learning_rate": 1.8942015007800242e-05, + "loss": 0.6465, + "step": 1212 + }, + { + "epoch": 0.599678655296008, + "grad_norm": 0.14847959095729793, + "learning_rate": 1.894027117164512e-05, + "loss": 0.6806, + "step": 1213 + }, + { + "epoch": 0.6001730317636881, + "grad_norm": 0.14906065062070312, + "learning_rate": 1.893852597993029e-05, + "loss": 0.7177, + "step": 1214 + }, + { + "epoch": 0.6006674082313682, + "grad_norm": 0.1489017564417107, + "learning_rate": 1.893677943292036e-05, + "loss": 0.7086, + "step": 1215 + }, + { + "epoch": 0.6011617846990484, + "grad_norm": 0.16537477689340366, + "learning_rate": 1.893503153088015e-05, + "loss": 0.7229, + "step": 1216 + }, + { + "epoch": 0.6016561611667285, + "grad_norm": 0.1447635532998319, + "learning_rate": 1.8933282274074682e-05, + "loss": 0.7223, + "step": 1217 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 0.14463659923950056, + "learning_rate": 1.8931531662769188e-05, + "loss": 0.7167, + "step": 1218 + }, + { + "epoch": 0.6026449141020888, + "grad_norm": 0.1549912610619457, + "learning_rate": 1.8929779697229108e-05, + "loss": 0.7087, + "step": 1219 + }, + { + "epoch": 0.6031392905697689, + "grad_norm": 0.15416720228599715, + "learning_rate": 1.892802637772007e-05, + "loss": 0.6705, + "step": 1220 + }, + { + "epoch": 0.603633667037449, + "grad_norm": 0.1430153878511002, + "learning_rate": 1.8926271704507927e-05, + "loss": 0.6974, + "step": 1221 + }, + { + "epoch": 0.6041280435051292, + "grad_norm": 0.29487684547536763, + "learning_rate": 1.892451567785873e-05, + "loss": 0.6989, + "step": 1222 + }, + { + "epoch": 0.6046224199728093, + "grad_norm": 0.15019154669696808, + "learning_rate": 1.892275829803873e-05, + "loss": 0.6629, + "step": 1223 + }, + { + "epoch": 0.6051167964404894, + "grad_norm": 0.15418148914623028, + "learning_rate": 1.8920999565314395e-05, + "loss": 0.7091, + "step": 1224 + }, + { + "epoch": 0.6056111729081696, + "grad_norm": 0.14983959843278696, + "learning_rate": 1.891923947995238e-05, + "loss": 0.7192, + "step": 1225 + }, + { + "epoch": 0.6061055493758497, + "grad_norm": 0.1605824910707386, + "learning_rate": 1.891747804221957e-05, + "loss": 0.6963, + "step": 1226 + }, + { + "epoch": 0.6065999258435298, + "grad_norm": 0.14992281573019228, + "learning_rate": 1.8915715252383035e-05, + "loss": 0.6954, + "step": 1227 + }, + { + "epoch": 0.60709430231121, + "grad_norm": 0.1506379772934638, + "learning_rate": 1.891395111071005e-05, + "loss": 0.7143, + "step": 1228 + }, + { + "epoch": 0.6075886787788901, + "grad_norm": 0.1555829121165381, + "learning_rate": 1.891218561746811e-05, + "loss": 0.6598, + "step": 1229 + }, + { + "epoch": 0.6080830552465702, + "grad_norm": 0.14018421431647543, + "learning_rate": 1.8910418772924903e-05, + "loss": 0.638, + "step": 1230 + }, + { + "epoch": 0.6085774317142504, + "grad_norm": 0.1590195507425543, + "learning_rate": 1.8908650577348323e-05, + "loss": 0.7278, + "step": 1231 + }, + { + "epoch": 0.6090718081819305, + "grad_norm": 0.15918798874172643, + "learning_rate": 1.8906881031006476e-05, + "loss": 0.7255, + "step": 1232 + }, + { + "epoch": 0.6095661846496107, + "grad_norm": 0.1568449935591741, + "learning_rate": 1.890511013416766e-05, + "loss": 0.7064, + "step": 1233 + }, + { + "epoch": 0.6100605611172908, + "grad_norm": 0.14059897528388018, + "learning_rate": 1.8903337887100398e-05, + "loss": 0.6964, + "step": 1234 + }, + { + "epoch": 0.6105549375849709, + "grad_norm": 0.16098770097173518, + "learning_rate": 1.8901564290073392e-05, + "loss": 0.7162, + "step": 1235 + }, + { + "epoch": 0.6110493140526511, + "grad_norm": 0.15234319743535982, + "learning_rate": 1.8899789343355567e-05, + "loss": 0.6796, + "step": 1236 + }, + { + "epoch": 0.6115436905203312, + "grad_norm": 0.15366922355422077, + "learning_rate": 1.889801304721605e-05, + "loss": 0.6682, + "step": 1237 + }, + { + "epoch": 0.6120380669880113, + "grad_norm": 0.15494057116402646, + "learning_rate": 1.8896235401924167e-05, + "loss": 0.6795, + "step": 1238 + }, + { + "epoch": 0.6125324434556915, + "grad_norm": 0.14779163885380237, + "learning_rate": 1.889445640774945e-05, + "loss": 0.7185, + "step": 1239 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 0.14617434625575265, + "learning_rate": 1.889267606496164e-05, + "loss": 0.7137, + "step": 1240 + }, + { + "epoch": 0.6135211963910517, + "grad_norm": 0.14699960165239384, + "learning_rate": 1.8890894373830682e-05, + "loss": 0.664, + "step": 1241 + }, + { + "epoch": 0.6140155728587319, + "grad_norm": 0.14948234194621196, + "learning_rate": 1.888911133462672e-05, + "loss": 0.6777, + "step": 1242 + }, + { + "epoch": 0.614509949326412, + "grad_norm": 0.15255361162847417, + "learning_rate": 1.8887326947620108e-05, + "loss": 0.6689, + "step": 1243 + }, + { + "epoch": 0.6150043257940923, + "grad_norm": 0.1458736605423569, + "learning_rate": 1.8885541213081397e-05, + "loss": 0.7036, + "step": 1244 + }, + { + "epoch": 0.6154987022617724, + "grad_norm": 0.1402665027021155, + "learning_rate": 1.8883754131281353e-05, + "loss": 0.646, + "step": 1245 + }, + { + "epoch": 0.6159930787294525, + "grad_norm": 0.15470619728114457, + "learning_rate": 1.8881965702490936e-05, + "loss": 0.6824, + "step": 1246 + }, + { + "epoch": 0.6164874551971327, + "grad_norm": 0.1487168258703435, + "learning_rate": 1.888017592698132e-05, + "loss": 0.7531, + "step": 1247 + }, + { + "epoch": 0.6169818316648128, + "grad_norm": 0.16425401295510153, + "learning_rate": 1.887838480502387e-05, + "loss": 0.6942, + "step": 1248 + }, + { + "epoch": 0.6174762081324929, + "grad_norm": 0.15265597241114137, + "learning_rate": 1.8876592336890166e-05, + "loss": 0.6561, + "step": 1249 + }, + { + "epoch": 0.6179705846001731, + "grad_norm": 0.1516544067434483, + "learning_rate": 1.8874798522851994e-05, + "loss": 0.6962, + "step": 1250 + }, + { + "epoch": 0.6184649610678532, + "grad_norm": 0.160855182619402, + "learning_rate": 1.8873003363181336e-05, + "loss": 0.7339, + "step": 1251 + }, + { + "epoch": 0.6189593375355333, + "grad_norm": 0.14979249457363525, + "learning_rate": 1.8871206858150383e-05, + "loss": 0.7221, + "step": 1252 + }, + { + "epoch": 0.6194537140032135, + "grad_norm": 0.15969401482253334, + "learning_rate": 1.8869409008031523e-05, + "loss": 0.7564, + "step": 1253 + }, + { + "epoch": 0.6199480904708936, + "grad_norm": 0.1469483179635192, + "learning_rate": 1.8867609813097355e-05, + "loss": 0.7098, + "step": 1254 + }, + { + "epoch": 0.6204424669385737, + "grad_norm": 0.15341497149888003, + "learning_rate": 1.8865809273620688e-05, + "loss": 0.6953, + "step": 1255 + }, + { + "epoch": 0.6209368434062539, + "grad_norm": 0.15075103277710053, + "learning_rate": 1.886400738987452e-05, + "loss": 0.7114, + "step": 1256 + }, + { + "epoch": 0.621431219873934, + "grad_norm": 0.155623283875116, + "learning_rate": 1.8862204162132055e-05, + "loss": 0.6566, + "step": 1257 + }, + { + "epoch": 0.6219255963416142, + "grad_norm": 0.14982823399595008, + "learning_rate": 1.8860399590666717e-05, + "loss": 0.6558, + "step": 1258 + }, + { + "epoch": 0.6224199728092943, + "grad_norm": 0.1450527952499424, + "learning_rate": 1.8858593675752115e-05, + "loss": 0.739, + "step": 1259 + }, + { + "epoch": 0.6229143492769744, + "grad_norm": 0.16864738765286919, + "learning_rate": 1.885678641766207e-05, + "loss": 0.7229, + "step": 1260 + }, + { + "epoch": 0.6234087257446546, + "grad_norm": 0.24141257990628515, + "learning_rate": 1.885497781667061e-05, + "loss": 0.7182, + "step": 1261 + }, + { + "epoch": 0.6239031022123347, + "grad_norm": 0.1458676402913462, + "learning_rate": 1.8853167873051954e-05, + "loss": 0.7124, + "step": 1262 + }, + { + "epoch": 0.6243974786800148, + "grad_norm": 0.14353438801941626, + "learning_rate": 1.885135658708054e-05, + "loss": 0.7311, + "step": 1263 + }, + { + "epoch": 0.624891855147695, + "grad_norm": 0.1510876880592538, + "learning_rate": 1.8849543959031002e-05, + "loss": 0.7226, + "step": 1264 + }, + { + "epoch": 0.6253862316153751, + "grad_norm": 0.1874364305381507, + "learning_rate": 1.8847729989178173e-05, + "loss": 0.7302, + "step": 1265 + }, + { + "epoch": 0.6258806080830552, + "grad_norm": 0.1496063219923551, + "learning_rate": 1.88459146777971e-05, + "loss": 0.7105, + "step": 1266 + }, + { + "epoch": 0.6263749845507354, + "grad_norm": 0.1442697390012075, + "learning_rate": 1.8844098025163024e-05, + "loss": 0.6686, + "step": 1267 + }, + { + "epoch": 0.6268693610184155, + "grad_norm": 0.15562078591701115, + "learning_rate": 1.8842280031551394e-05, + "loss": 0.7134, + "step": 1268 + }, + { + "epoch": 0.6273637374860956, + "grad_norm": 0.16552132962692553, + "learning_rate": 1.884046069723786e-05, + "loss": 0.7351, + "step": 1269 + }, + { + "epoch": 0.6278581139537758, + "grad_norm": 0.14840237247868546, + "learning_rate": 1.883864002249828e-05, + "loss": 0.6648, + "step": 1270 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 0.15182234443513168, + "learning_rate": 1.8836818007608716e-05, + "loss": 0.7154, + "step": 1271 + }, + { + "epoch": 0.628846866889136, + "grad_norm": 0.14832161637462826, + "learning_rate": 1.8834994652845418e-05, + "loss": 0.7142, + "step": 1272 + }, + { + "epoch": 0.6293412433568162, + "grad_norm": 0.1508290006705283, + "learning_rate": 1.8833169958484858e-05, + "loss": 0.7463, + "step": 1273 + }, + { + "epoch": 0.6298356198244963, + "grad_norm": 0.1487110411925406, + "learning_rate": 1.88313439248037e-05, + "loss": 0.7255, + "step": 1274 + }, + { + "epoch": 0.6303299962921765, + "grad_norm": 0.13741640864186408, + "learning_rate": 1.8829516552078816e-05, + "loss": 0.6901, + "step": 1275 + }, + { + "epoch": 0.6308243727598566, + "grad_norm": 0.1487901967851983, + "learning_rate": 1.8827687840587284e-05, + "loss": 0.7023, + "step": 1276 + }, + { + "epoch": 0.6313187492275367, + "grad_norm": 0.13983157243579628, + "learning_rate": 1.882585779060637e-05, + "loss": 0.7011, + "step": 1277 + }, + { + "epoch": 0.6318131256952169, + "grad_norm": 0.15624612094610227, + "learning_rate": 1.8824026402413565e-05, + "loss": 0.7852, + "step": 1278 + }, + { + "epoch": 0.632307502162897, + "grad_norm": 0.14285757664301163, + "learning_rate": 1.8822193676286543e-05, + "loss": 0.6954, + "step": 1279 + }, + { + "epoch": 0.6328018786305771, + "grad_norm": 0.14005001972804385, + "learning_rate": 1.8820359612503193e-05, + "loss": 0.6631, + "step": 1280 + }, + { + "epoch": 0.6332962550982574, + "grad_norm": 0.14906389542829634, + "learning_rate": 1.8818524211341603e-05, + "loss": 0.7286, + "step": 1281 + }, + { + "epoch": 0.6337906315659375, + "grad_norm": 0.14360474111417273, + "learning_rate": 1.8816687473080064e-05, + "loss": 0.6865, + "step": 1282 + }, + { + "epoch": 0.6342850080336176, + "grad_norm": 0.14372536151330284, + "learning_rate": 1.881484939799707e-05, + "loss": 0.6668, + "step": 1283 + }, + { + "epoch": 0.6347793845012978, + "grad_norm": 0.1501146877435797, + "learning_rate": 1.8813009986371313e-05, + "loss": 0.698, + "step": 1284 + }, + { + "epoch": 0.6352737609689779, + "grad_norm": 0.1515664761537325, + "learning_rate": 1.88111692384817e-05, + "loss": 0.7288, + "step": 1285 + }, + { + "epoch": 0.635768137436658, + "grad_norm": 0.14539406473542993, + "learning_rate": 1.880932715460732e-05, + "loss": 0.7092, + "step": 1286 + }, + { + "epoch": 0.6362625139043382, + "grad_norm": 0.14543792780925927, + "learning_rate": 1.8807483735027493e-05, + "loss": 0.7082, + "step": 1287 + }, + { + "epoch": 0.6367568903720183, + "grad_norm": 0.1474804841255758, + "learning_rate": 1.8805638980021713e-05, + "loss": 0.6805, + "step": 1288 + }, + { + "epoch": 0.6372512668396985, + "grad_norm": 0.141684512428458, + "learning_rate": 1.8803792889869696e-05, + "loss": 0.7187, + "step": 1289 + }, + { + "epoch": 0.6377456433073786, + "grad_norm": 0.13998713026889517, + "learning_rate": 1.8801945464851353e-05, + "loss": 0.7094, + "step": 1290 + }, + { + "epoch": 0.6382400197750587, + "grad_norm": 0.14238821300028004, + "learning_rate": 1.8800096705246793e-05, + "loss": 0.6479, + "step": 1291 + }, + { + "epoch": 0.6387343962427389, + "grad_norm": 0.14870628649287024, + "learning_rate": 1.8798246611336338e-05, + "loss": 0.7099, + "step": 1292 + }, + { + "epoch": 0.639228772710419, + "grad_norm": 0.1386808181097908, + "learning_rate": 1.8796395183400504e-05, + "loss": 0.7037, + "step": 1293 + }, + { + "epoch": 0.6397231491780991, + "grad_norm": 0.16153188945869187, + "learning_rate": 1.879454242172001e-05, + "loss": 0.741, + "step": 1294 + }, + { + "epoch": 0.6402175256457793, + "grad_norm": 0.146187553414011, + "learning_rate": 1.8792688326575783e-05, + "loss": 0.7298, + "step": 1295 + }, + { + "epoch": 0.6407119021134594, + "grad_norm": 0.14508073550981046, + "learning_rate": 1.8790832898248947e-05, + "loss": 0.7244, + "step": 1296 + }, + { + "epoch": 0.6412062785811395, + "grad_norm": 0.14505480384634725, + "learning_rate": 1.878897613702083e-05, + "loss": 0.6507, + "step": 1297 + }, + { + "epoch": 0.6417006550488197, + "grad_norm": 0.15293546404686686, + "learning_rate": 1.8787118043172962e-05, + "loss": 0.7013, + "step": 1298 + }, + { + "epoch": 0.6421950315164998, + "grad_norm": 0.1529715989756162, + "learning_rate": 1.878525861698707e-05, + "loss": 0.7219, + "step": 1299 + }, + { + "epoch": 0.64268940798418, + "grad_norm": 0.15106875197660277, + "learning_rate": 1.878339785874509e-05, + "loss": 0.7075, + "step": 1300 + }, + { + "epoch": 0.6431837844518601, + "grad_norm": 0.14167545731461909, + "learning_rate": 1.878153576872916e-05, + "loss": 0.6433, + "step": 1301 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 0.14608871774115234, + "learning_rate": 1.8779672347221617e-05, + "loss": 0.687, + "step": 1302 + }, + { + "epoch": 0.6441725373872204, + "grad_norm": 0.13830672295621044, + "learning_rate": 1.8777807594505e-05, + "loss": 0.6768, + "step": 1303 + }, + { + "epoch": 0.6446669138549005, + "grad_norm": 0.134496553098127, + "learning_rate": 1.8775941510862047e-05, + "loss": 0.6424, + "step": 1304 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.1534695364368932, + "learning_rate": 1.877407409657571e-05, + "loss": 0.703, + "step": 1305 + }, + { + "epoch": 0.6456556667902608, + "grad_norm": 0.1454049967009503, + "learning_rate": 1.877220535192912e-05, + "loss": 0.6989, + "step": 1306 + }, + { + "epoch": 0.6461500432579409, + "grad_norm": 0.16422930021695123, + "learning_rate": 1.8770335277205638e-05, + "loss": 0.7167, + "step": 1307 + }, + { + "epoch": 0.646644419725621, + "grad_norm": 0.16262248363613982, + "learning_rate": 1.8768463872688803e-05, + "loss": 0.6847, + "step": 1308 + }, + { + "epoch": 0.6471387961933012, + "grad_norm": 0.1693024603509323, + "learning_rate": 1.876659113866237e-05, + "loss": 0.7014, + "step": 1309 + }, + { + "epoch": 0.6476331726609813, + "grad_norm": 0.15727094493550245, + "learning_rate": 1.8764717075410286e-05, + "loss": 0.7073, + "step": 1310 + }, + { + "epoch": 0.6481275491286614, + "grad_norm": 0.14336230656633814, + "learning_rate": 1.8762841683216702e-05, + "loss": 0.6589, + "step": 1311 + }, + { + "epoch": 0.6486219255963416, + "grad_norm": 0.162994613625415, + "learning_rate": 1.876096496236598e-05, + "loss": 0.6637, + "step": 1312 + }, + { + "epoch": 0.6491163020640217, + "grad_norm": 0.15198879115232458, + "learning_rate": 1.8759086913142672e-05, + "loss": 0.7205, + "step": 1313 + }, + { + "epoch": 0.6496106785317018, + "grad_norm": 0.1459866662438882, + "learning_rate": 1.8757207535831538e-05, + "loss": 0.6814, + "step": 1314 + }, + { + "epoch": 0.650105054999382, + "grad_norm": 0.13864236660356624, + "learning_rate": 1.875532683071753e-05, + "loss": 0.6877, + "step": 1315 + }, + { + "epoch": 0.6505994314670621, + "grad_norm": 0.15242512785231477, + "learning_rate": 1.8753444798085813e-05, + "loss": 0.6959, + "step": 1316 + }, + { + "epoch": 0.6510938079347423, + "grad_norm": 0.1445489369085129, + "learning_rate": 1.8751561438221747e-05, + "loss": 0.6593, + "step": 1317 + }, + { + "epoch": 0.6515881844024225, + "grad_norm": 0.1508292735840228, + "learning_rate": 1.87496767514109e-05, + "loss": 0.6882, + "step": 1318 + }, + { + "epoch": 0.6520825608701026, + "grad_norm": 0.15099462550174442, + "learning_rate": 1.8747790737939027e-05, + "loss": 0.7227, + "step": 1319 + }, + { + "epoch": 0.6525769373377828, + "grad_norm": 0.1440924463309976, + "learning_rate": 1.8745903398092096e-05, + "loss": 0.6812, + "step": 1320 + }, + { + "epoch": 0.6530713138054629, + "grad_norm": 0.13958284499327708, + "learning_rate": 1.8744014732156276e-05, + "loss": 0.6924, + "step": 1321 + }, + { + "epoch": 0.653565690273143, + "grad_norm": 0.16480225135452348, + "learning_rate": 1.8742124740417934e-05, + "loss": 0.7111, + "step": 1322 + }, + { + "epoch": 0.6540600667408232, + "grad_norm": 0.14824324133514122, + "learning_rate": 1.874023342316363e-05, + "loss": 0.6954, + "step": 1323 + }, + { + "epoch": 0.6545544432085033, + "grad_norm": 0.1616535616025004, + "learning_rate": 1.8738340780680143e-05, + "loss": 0.7218, + "step": 1324 + }, + { + "epoch": 0.6550488196761834, + "grad_norm": 0.13869683936039634, + "learning_rate": 1.8736446813254444e-05, + "loss": 0.701, + "step": 1325 + }, + { + "epoch": 0.6555431961438636, + "grad_norm": 0.16088472797652673, + "learning_rate": 1.873455152117369e-05, + "loss": 0.6861, + "step": 1326 + }, + { + "epoch": 0.6560375726115437, + "grad_norm": 0.15078006084529622, + "learning_rate": 1.8732654904725268e-05, + "loss": 0.742, + "step": 1327 + }, + { + "epoch": 0.6565319490792239, + "grad_norm": 0.14029400565884267, + "learning_rate": 1.8730756964196743e-05, + "loss": 0.654, + "step": 1328 + }, + { + "epoch": 0.657026325546904, + "grad_norm": 0.16242450447069715, + "learning_rate": 1.872885769987589e-05, + "loss": 0.7144, + "step": 1329 + }, + { + "epoch": 0.6575207020145841, + "grad_norm": 0.1552256835250385, + "learning_rate": 1.872695711205068e-05, + "loss": 0.7243, + "step": 1330 + }, + { + "epoch": 0.6580150784822643, + "grad_norm": 0.15063782467393758, + "learning_rate": 1.8725055201009295e-05, + "loss": 0.6924, + "step": 1331 + }, + { + "epoch": 0.6585094549499444, + "grad_norm": 0.1587511919407657, + "learning_rate": 1.8723151967040104e-05, + "loss": 0.6925, + "step": 1332 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 0.15075430874217446, + "learning_rate": 1.8721247410431686e-05, + "loss": 0.7671, + "step": 1333 + }, + { + "epoch": 0.6594982078853047, + "grad_norm": 0.15939976386034288, + "learning_rate": 1.8719341531472816e-05, + "loss": 0.7044, + "step": 1334 + }, + { + "epoch": 0.6599925843529848, + "grad_norm": 0.1416311034851915, + "learning_rate": 1.871743433045247e-05, + "loss": 0.6817, + "step": 1335 + }, + { + "epoch": 0.6604869608206649, + "grad_norm": 0.20404453969500094, + "learning_rate": 1.871552580765983e-05, + "loss": 0.6928, + "step": 1336 + }, + { + "epoch": 0.6609813372883451, + "grad_norm": 0.13514881737130124, + "learning_rate": 1.8713615963384267e-05, + "loss": 0.6576, + "step": 1337 + }, + { + "epoch": 0.6614757137560252, + "grad_norm": 0.14379084456605065, + "learning_rate": 1.8711704797915367e-05, + "loss": 0.6703, + "step": 1338 + }, + { + "epoch": 0.6619700902237053, + "grad_norm": 0.14850680368699365, + "learning_rate": 1.87097923115429e-05, + "loss": 0.7038, + "step": 1339 + }, + { + "epoch": 0.6624644666913855, + "grad_norm": 0.17173444054168754, + "learning_rate": 1.870787850455685e-05, + "loss": 0.683, + "step": 1340 + }, + { + "epoch": 0.6629588431590656, + "grad_norm": 0.14373342415339807, + "learning_rate": 1.87059633772474e-05, + "loss": 0.6635, + "step": 1341 + }, + { + "epoch": 0.6634532196267457, + "grad_norm": 0.1478987855481457, + "learning_rate": 1.870404692990492e-05, + "loss": 0.6647, + "step": 1342 + }, + { + "epoch": 0.6639475960944259, + "grad_norm": 0.14468016926585975, + "learning_rate": 1.8702129162819998e-05, + "loss": 0.6972, + "step": 1343 + }, + { + "epoch": 0.664441972562106, + "grad_norm": 0.15362336704526333, + "learning_rate": 1.8700210076283406e-05, + "loss": 0.6856, + "step": 1344 + }, + { + "epoch": 0.6649363490297862, + "grad_norm": 0.14170736967641553, + "learning_rate": 1.869828967058613e-05, + "loss": 0.6624, + "step": 1345 + }, + { + "epoch": 0.6654307254974663, + "grad_norm": 0.148835014918556, + "learning_rate": 1.8696367946019348e-05, + "loss": 0.7224, + "step": 1346 + }, + { + "epoch": 0.6659251019651464, + "grad_norm": 0.14987016192067756, + "learning_rate": 1.8694444902874437e-05, + "loss": 0.7456, + "step": 1347 + }, + { + "epoch": 0.6664194784328266, + "grad_norm": 0.1399064409802383, + "learning_rate": 1.8692520541442975e-05, + "loss": 0.6852, + "step": 1348 + }, + { + "epoch": 0.6669138549005067, + "grad_norm": 0.14888506184977424, + "learning_rate": 1.869059486201675e-05, + "loss": 0.6715, + "step": 1349 + }, + { + "epoch": 0.6674082313681868, + "grad_norm": 0.14688873150300108, + "learning_rate": 1.868866786488773e-05, + "loss": 0.7056, + "step": 1350 + }, + { + "epoch": 0.667902607835867, + "grad_norm": 0.13669888488648094, + "learning_rate": 1.8686739550348102e-05, + "loss": 0.665, + "step": 1351 + }, + { + "epoch": 0.6683969843035471, + "grad_norm": 0.1520759150929574, + "learning_rate": 1.8684809918690246e-05, + "loss": 0.7068, + "step": 1352 + }, + { + "epoch": 0.6688913607712272, + "grad_norm": 0.1375334438532712, + "learning_rate": 1.8682878970206734e-05, + "loss": 0.6791, + "step": 1353 + }, + { + "epoch": 0.6693857372389074, + "grad_norm": 0.14366755586259322, + "learning_rate": 1.8680946705190343e-05, + "loss": 0.7147, + "step": 1354 + }, + { + "epoch": 0.6698801137065876, + "grad_norm": 0.14814775553020054, + "learning_rate": 1.8679013123934064e-05, + "loss": 0.6957, + "step": 1355 + }, + { + "epoch": 0.6703744901742678, + "grad_norm": 0.14992816868628647, + "learning_rate": 1.8677078226731056e-05, + "loss": 0.6845, + "step": 1356 + }, + { + "epoch": 0.6708688666419479, + "grad_norm": 0.14892283667928577, + "learning_rate": 1.8675142013874706e-05, + "loss": 0.7, + "step": 1357 + }, + { + "epoch": 0.671363243109628, + "grad_norm": 0.14557364734452405, + "learning_rate": 1.8673204485658596e-05, + "loss": 0.674, + "step": 1358 + }, + { + "epoch": 0.6718576195773082, + "grad_norm": 0.1475040891485436, + "learning_rate": 1.867126564237649e-05, + "loss": 0.6857, + "step": 1359 + }, + { + "epoch": 0.6723519960449883, + "grad_norm": 0.15385554156951411, + "learning_rate": 1.866932548432237e-05, + "loss": 0.7312, + "step": 1360 + }, + { + "epoch": 0.6728463725126684, + "grad_norm": 0.14552890365951923, + "learning_rate": 1.8667384011790407e-05, + "loss": 0.7025, + "step": 1361 + }, + { + "epoch": 0.6733407489803486, + "grad_norm": 0.14549986723426403, + "learning_rate": 1.8665441225074975e-05, + "loss": 0.7024, + "step": 1362 + }, + { + "epoch": 0.6738351254480287, + "grad_norm": 0.14798856448524397, + "learning_rate": 1.866349712447065e-05, + "loss": 0.6585, + "step": 1363 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 0.14339052455702925, + "learning_rate": 1.8661551710272207e-05, + "loss": 0.6901, + "step": 1364 + }, + { + "epoch": 0.674823878383389, + "grad_norm": 0.1565482515980434, + "learning_rate": 1.865960498277461e-05, + "loss": 0.7243, + "step": 1365 + }, + { + "epoch": 0.6753182548510691, + "grad_norm": 0.14774548546175564, + "learning_rate": 1.8657656942273036e-05, + "loss": 0.6876, + "step": 1366 + }, + { + "epoch": 0.6758126313187492, + "grad_norm": 0.13646122800549432, + "learning_rate": 1.865570758906285e-05, + "loss": 0.6489, + "step": 1367 + }, + { + "epoch": 0.6763070077864294, + "grad_norm": 0.14413852397506738, + "learning_rate": 1.8653756923439623e-05, + "loss": 0.7175, + "step": 1368 + }, + { + "epoch": 0.6768013842541095, + "grad_norm": 0.14754970419960575, + "learning_rate": 1.865180494569912e-05, + "loss": 0.69, + "step": 1369 + }, + { + "epoch": 0.6772957607217897, + "grad_norm": 0.1435970900307486, + "learning_rate": 1.8649851656137313e-05, + "loss": 0.6546, + "step": 1370 + }, + { + "epoch": 0.6777901371894698, + "grad_norm": 0.15155545132046463, + "learning_rate": 1.8647897055050362e-05, + "loss": 0.6987, + "step": 1371 + }, + { + "epoch": 0.6782845136571499, + "grad_norm": 0.13712897414260888, + "learning_rate": 1.8645941142734636e-05, + "loss": 0.6525, + "step": 1372 + }, + { + "epoch": 0.6787788901248301, + "grad_norm": 0.15389450430262977, + "learning_rate": 1.8643983919486695e-05, + "loss": 0.6949, + "step": 1373 + }, + { + "epoch": 0.6792732665925102, + "grad_norm": 0.15729276430488442, + "learning_rate": 1.8642025385603303e-05, + "loss": 0.6341, + "step": 1374 + }, + { + "epoch": 0.6797676430601903, + "grad_norm": 0.15137779517813227, + "learning_rate": 1.864006554138142e-05, + "loss": 0.7045, + "step": 1375 + }, + { + "epoch": 0.6802620195278705, + "grad_norm": 0.149028894824447, + "learning_rate": 1.863810438711821e-05, + "loss": 0.6964, + "step": 1376 + }, + { + "epoch": 0.6807563959955506, + "grad_norm": 0.13816987860973398, + "learning_rate": 1.863614192311102e-05, + "loss": 0.6359, + "step": 1377 + }, + { + "epoch": 0.6812507724632307, + "grad_norm": 0.15259276299607577, + "learning_rate": 1.8634178149657415e-05, + "loss": 0.7171, + "step": 1378 + }, + { + "epoch": 0.6817451489309109, + "grad_norm": 0.14616973259842136, + "learning_rate": 1.863221306705515e-05, + "loss": 0.72, + "step": 1379 + }, + { + "epoch": 0.682239525398591, + "grad_norm": 0.1466230080139381, + "learning_rate": 1.8630246675602175e-05, + "loss": 0.6973, + "step": 1380 + }, + { + "epoch": 0.6827339018662711, + "grad_norm": 0.15106941168084814, + "learning_rate": 1.8628278975596644e-05, + "loss": 0.6448, + "step": 1381 + }, + { + "epoch": 0.6832282783339513, + "grad_norm": 0.15467270115025858, + "learning_rate": 1.862630996733691e-05, + "loss": 0.6908, + "step": 1382 + }, + { + "epoch": 0.6837226548016314, + "grad_norm": 0.14115994243684155, + "learning_rate": 1.862433965112152e-05, + "loss": 0.6839, + "step": 1383 + }, + { + "epoch": 0.6842170312693115, + "grad_norm": 0.13866266229651306, + "learning_rate": 1.862236802724922e-05, + "loss": 0.6723, + "step": 1384 + }, + { + "epoch": 0.6847114077369917, + "grad_norm": 0.13881286244058288, + "learning_rate": 1.8620395096018955e-05, + "loss": 0.7199, + "step": 1385 + }, + { + "epoch": 0.6852057842046718, + "grad_norm": 0.15412007857436846, + "learning_rate": 1.861842085772987e-05, + "loss": 0.729, + "step": 1386 + }, + { + "epoch": 0.685700160672352, + "grad_norm": 0.14883615918386814, + "learning_rate": 1.861644531268131e-05, + "loss": 0.7281, + "step": 1387 + }, + { + "epoch": 0.6861945371400321, + "grad_norm": 0.13551187272328694, + "learning_rate": 1.8614468461172813e-05, + "loss": 0.6408, + "step": 1388 + }, + { + "epoch": 0.6866889136077122, + "grad_norm": 0.1554411516083997, + "learning_rate": 1.861249030350411e-05, + "loss": 0.6852, + "step": 1389 + }, + { + "epoch": 0.6871832900753924, + "grad_norm": 0.13895768680877474, + "learning_rate": 1.8610510839975152e-05, + "loss": 0.6617, + "step": 1390 + }, + { + "epoch": 0.6876776665430725, + "grad_norm": 0.16303608076328902, + "learning_rate": 1.8608530070886058e-05, + "loss": 0.7011, + "step": 1391 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 0.14418750242789924, + "learning_rate": 1.860654799653717e-05, + "loss": 0.7135, + "step": 1392 + }, + { + "epoch": 0.6886664194784329, + "grad_norm": 0.15385607912219829, + "learning_rate": 1.8604564617229012e-05, + "loss": 0.6952, + "step": 1393 + }, + { + "epoch": 0.689160795946113, + "grad_norm": 0.14002364232020073, + "learning_rate": 1.8602579933262317e-05, + "loss": 0.6478, + "step": 1394 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.14466935106301415, + "learning_rate": 1.8600593944938006e-05, + "loss": 0.706, + "step": 1395 + }, + { + "epoch": 0.6901495488814733, + "grad_norm": 0.1408552251632568, + "learning_rate": 1.8598606652557206e-05, + "loss": 0.7065, + "step": 1396 + }, + { + "epoch": 0.6906439253491534, + "grad_norm": 0.14290081910674027, + "learning_rate": 1.859661805642124e-05, + "loss": 0.7054, + "step": 1397 + }, + { + "epoch": 0.6911383018168336, + "grad_norm": 0.13912216400579894, + "learning_rate": 1.8594628156831623e-05, + "loss": 0.6846, + "step": 1398 + }, + { + "epoch": 0.6916326782845137, + "grad_norm": 0.14524464639106702, + "learning_rate": 1.8592636954090072e-05, + "loss": 0.6657, + "step": 1399 + }, + { + "epoch": 0.6921270547521938, + "grad_norm": 0.1388206072419739, + "learning_rate": 1.8590644448498502e-05, + "loss": 0.6928, + "step": 1400 + }, + { + "epoch": 0.692621431219874, + "grad_norm": 0.14817235448600644, + "learning_rate": 1.8588650640359023e-05, + "loss": 0.6927, + "step": 1401 + }, + { + "epoch": 0.6931158076875541, + "grad_norm": 0.14526926313369284, + "learning_rate": 1.858665552997395e-05, + "loss": 0.7141, + "step": 1402 + }, + { + "epoch": 0.6936101841552342, + "grad_norm": 0.14756854070018055, + "learning_rate": 1.858465911764578e-05, + "loss": 0.6601, + "step": 1403 + }, + { + "epoch": 0.6941045606229144, + "grad_norm": 0.14060629352982687, + "learning_rate": 1.8582661403677225e-05, + "loss": 0.7237, + "step": 1404 + }, + { + "epoch": 0.6945989370905945, + "grad_norm": 0.13925816532230084, + "learning_rate": 1.8580662388371185e-05, + "loss": 0.6675, + "step": 1405 + }, + { + "epoch": 0.6950933135582746, + "grad_norm": 0.1446632923489717, + "learning_rate": 1.8578662072030755e-05, + "loss": 0.6978, + "step": 1406 + }, + { + "epoch": 0.6955876900259548, + "grad_norm": 0.14225770268238278, + "learning_rate": 1.8576660454959233e-05, + "loss": 0.6972, + "step": 1407 + }, + { + "epoch": 0.6960820664936349, + "grad_norm": 0.14640072091786913, + "learning_rate": 1.8574657537460114e-05, + "loss": 0.6662, + "step": 1408 + }, + { + "epoch": 0.696576442961315, + "grad_norm": 0.13903658119774534, + "learning_rate": 1.8572653319837087e-05, + "loss": 0.6864, + "step": 1409 + }, + { + "epoch": 0.6970708194289952, + "grad_norm": 0.14846272258718574, + "learning_rate": 1.857064780239404e-05, + "loss": 0.7267, + "step": 1410 + }, + { + "epoch": 0.6975651958966753, + "grad_norm": 0.14280057464397886, + "learning_rate": 1.8568640985435054e-05, + "loss": 0.6873, + "step": 1411 + }, + { + "epoch": 0.6980595723643555, + "grad_norm": 0.1342302430587476, + "learning_rate": 1.8566632869264415e-05, + "loss": 0.7039, + "step": 1412 + }, + { + "epoch": 0.6985539488320356, + "grad_norm": 0.14604331438355106, + "learning_rate": 1.8564623454186603e-05, + "loss": 0.7064, + "step": 1413 + }, + { + "epoch": 0.6990483252997157, + "grad_norm": 0.14136099461660345, + "learning_rate": 1.856261274050629e-05, + "loss": 0.7004, + "step": 1414 + }, + { + "epoch": 0.6995427017673959, + "grad_norm": 0.16396995608246126, + "learning_rate": 1.856060072852835e-05, + "loss": 0.6873, + "step": 1415 + }, + { + "epoch": 0.700037078235076, + "grad_norm": 0.15432839033179294, + "learning_rate": 1.8558587418557844e-05, + "loss": 0.7097, + "step": 1416 + }, + { + "epoch": 0.7005314547027561, + "grad_norm": 0.16241200728563066, + "learning_rate": 1.8556572810900054e-05, + "loss": 0.6743, + "step": 1417 + }, + { + "epoch": 0.7010258311704363, + "grad_norm": 0.149192086502018, + "learning_rate": 1.8554556905860432e-05, + "loss": 0.6467, + "step": 1418 + }, + { + "epoch": 0.7015202076381164, + "grad_norm": 0.14735342980302665, + "learning_rate": 1.855253970374464e-05, + "loss": 0.684, + "step": 1419 + }, + { + "epoch": 0.7020145841057965, + "grad_norm": 0.15032868315334771, + "learning_rate": 1.8550521204858536e-05, + "loss": 0.721, + "step": 1420 + }, + { + "epoch": 0.7025089605734767, + "grad_norm": 0.14869749267501584, + "learning_rate": 1.8548501409508168e-05, + "loss": 0.6643, + "step": 1421 + }, + { + "epoch": 0.7030033370411568, + "grad_norm": 0.14074132626846814, + "learning_rate": 1.8546480317999792e-05, + "loss": 0.6934, + "step": 1422 + }, + { + "epoch": 0.7034977135088369, + "grad_norm": 0.1441742401491287, + "learning_rate": 1.854445793063985e-05, + "loss": 0.6573, + "step": 1423 + }, + { + "epoch": 0.7039920899765171, + "grad_norm": 0.15976842438391745, + "learning_rate": 1.8542434247734986e-05, + "loss": 0.6529, + "step": 1424 + }, + { + "epoch": 0.7044864664441972, + "grad_norm": 0.1573892719854375, + "learning_rate": 1.8540409269592038e-05, + "loss": 0.7221, + "step": 1425 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 0.14192526414152756, + "learning_rate": 1.853838299651804e-05, + "loss": 0.7194, + "step": 1426 + }, + { + "epoch": 0.7054752193795575, + "grad_norm": 0.1602121525964088, + "learning_rate": 1.8536355428820222e-05, + "loss": 0.6781, + "step": 1427 + }, + { + "epoch": 0.7059695958472376, + "grad_norm": 0.14074142296066833, + "learning_rate": 1.8534326566806023e-05, + "loss": 0.6748, + "step": 1428 + }, + { + "epoch": 0.7064639723149179, + "grad_norm": 0.15115665444265539, + "learning_rate": 1.8532296410783052e-05, + "loss": 0.6891, + "step": 1429 + }, + { + "epoch": 0.706958348782598, + "grad_norm": 0.15548764893897496, + "learning_rate": 1.853026496105914e-05, + "loss": 0.6695, + "step": 1430 + }, + { + "epoch": 0.7074527252502781, + "grad_norm": 0.15043900513225575, + "learning_rate": 1.85282322179423e-05, + "loss": 0.6971, + "step": 1431 + }, + { + "epoch": 0.7079471017179583, + "grad_norm": 0.1787505447683915, + "learning_rate": 1.8526198181740745e-05, + "loss": 0.6909, + "step": 1432 + }, + { + "epoch": 0.7084414781856384, + "grad_norm": 0.14411145597290548, + "learning_rate": 1.8524162852762885e-05, + "loss": 0.7116, + "step": 1433 + }, + { + "epoch": 0.7089358546533185, + "grad_norm": 0.13974958188299186, + "learning_rate": 1.852212623131732e-05, + "loss": 0.6623, + "step": 1434 + }, + { + "epoch": 0.7094302311209987, + "grad_norm": 0.14647737548831763, + "learning_rate": 1.8520088317712856e-05, + "loss": 0.6981, + "step": 1435 + }, + { + "epoch": 0.7099246075886788, + "grad_norm": 0.1407021892161048, + "learning_rate": 1.851804911225848e-05, + "loss": 0.6779, + "step": 1436 + }, + { + "epoch": 0.710418984056359, + "grad_norm": 0.14083707139274823, + "learning_rate": 1.85160086152634e-05, + "loss": 0.686, + "step": 1437 + }, + { + "epoch": 0.7109133605240391, + "grad_norm": 0.14413204405775362, + "learning_rate": 1.8513966827036996e-05, + "loss": 0.6792, + "step": 1438 + }, + { + "epoch": 0.7114077369917192, + "grad_norm": 0.15054812090470066, + "learning_rate": 1.851192374788885e-05, + "loss": 0.6904, + "step": 1439 + }, + { + "epoch": 0.7119021134593994, + "grad_norm": 0.1407980285589203, + "learning_rate": 1.8509879378128748e-05, + "loss": 0.6815, + "step": 1440 + }, + { + "epoch": 0.7123964899270795, + "grad_norm": 0.1493373503228038, + "learning_rate": 1.8507833718066658e-05, + "loss": 0.6763, + "step": 1441 + }, + { + "epoch": 0.7128908663947596, + "grad_norm": 0.1329993453482154, + "learning_rate": 1.8505786768012756e-05, + "loss": 0.6608, + "step": 1442 + }, + { + "epoch": 0.7133852428624398, + "grad_norm": 0.1402190994065376, + "learning_rate": 1.850373852827741e-05, + "loss": 0.7249, + "step": 1443 + }, + { + "epoch": 0.7138796193301199, + "grad_norm": 0.14550650663437112, + "learning_rate": 1.8501688999171178e-05, + "loss": 0.7177, + "step": 1444 + }, + { + "epoch": 0.7143739957978, + "grad_norm": 0.15073399299961424, + "learning_rate": 1.849963818100482e-05, + "loss": 0.6718, + "step": 1445 + }, + { + "epoch": 0.7148683722654802, + "grad_norm": 0.14299142322345962, + "learning_rate": 1.849758607408929e-05, + "loss": 0.6918, + "step": 1446 + }, + { + "epoch": 0.7153627487331603, + "grad_norm": 0.13752231788549266, + "learning_rate": 1.8495532678735734e-05, + "loss": 0.6726, + "step": 1447 + }, + { + "epoch": 0.7158571252008404, + "grad_norm": 0.1501962376631211, + "learning_rate": 1.84934779952555e-05, + "loss": 0.6821, + "step": 1448 + }, + { + "epoch": 0.7163515016685206, + "grad_norm": 0.15095016846478584, + "learning_rate": 1.8491422023960123e-05, + "loss": 0.7581, + "step": 1449 + }, + { + "epoch": 0.7168458781362007, + "grad_norm": 0.1395004524359586, + "learning_rate": 1.8489364765161342e-05, + "loss": 0.6829, + "step": 1450 + }, + { + "epoch": 0.7173402546038808, + "grad_norm": 0.14934905416279226, + "learning_rate": 1.8487306219171084e-05, + "loss": 0.6901, + "step": 1451 + }, + { + "epoch": 0.717834631071561, + "grad_norm": 0.14256441698059052, + "learning_rate": 1.8485246386301474e-05, + "loss": 0.7272, + "step": 1452 + }, + { + "epoch": 0.7183290075392411, + "grad_norm": 0.14427050625153842, + "learning_rate": 1.848318526686483e-05, + "loss": 0.6967, + "step": 1453 + }, + { + "epoch": 0.7188233840069213, + "grad_norm": 0.1448133392367129, + "learning_rate": 1.8481122861173676e-05, + "loss": 0.7068, + "step": 1454 + }, + { + "epoch": 0.7193177604746014, + "grad_norm": 0.15445062410652133, + "learning_rate": 1.847905916954071e-05, + "loss": 0.6462, + "step": 1455 + }, + { + "epoch": 0.7198121369422815, + "grad_norm": 0.1467879456920742, + "learning_rate": 1.8476994192278847e-05, + "loss": 0.727, + "step": 1456 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 0.14425464517276032, + "learning_rate": 1.847492792970118e-05, + "loss": 0.6788, + "step": 1457 + }, + { + "epoch": 0.7208008898776418, + "grad_norm": 0.15660210932614752, + "learning_rate": 1.8472860382121012e-05, + "loss": 0.6873, + "step": 1458 + }, + { + "epoch": 0.7212952663453219, + "grad_norm": 0.15480881944519248, + "learning_rate": 1.8470791549851825e-05, + "loss": 0.6995, + "step": 1459 + }, + { + "epoch": 0.7217896428130021, + "grad_norm": 0.14805892603515264, + "learning_rate": 1.846872143320731e-05, + "loss": 0.7064, + "step": 1460 + }, + { + "epoch": 0.7222840192806822, + "grad_norm": 0.15978841117255718, + "learning_rate": 1.8466650032501342e-05, + "loss": 0.7031, + "step": 1461 + }, + { + "epoch": 0.7227783957483623, + "grad_norm": 0.15228596033941422, + "learning_rate": 1.8464577348047993e-05, + "loss": 0.7091, + "step": 1462 + }, + { + "epoch": 0.7232727722160425, + "grad_norm": 0.1512110299380274, + "learning_rate": 1.846250338016154e-05, + "loss": 0.6817, + "step": 1463 + }, + { + "epoch": 0.7237671486837226, + "grad_norm": 0.16386508843044803, + "learning_rate": 1.8460428129156434e-05, + "loss": 0.7101, + "step": 1464 + }, + { + "epoch": 0.7242615251514027, + "grad_norm": 0.1606906400718589, + "learning_rate": 1.8458351595347348e-05, + "loss": 0.6686, + "step": 1465 + }, + { + "epoch": 0.7247559016190829, + "grad_norm": 0.14932673655509407, + "learning_rate": 1.845627377904912e-05, + "loss": 0.6874, + "step": 1466 + }, + { + "epoch": 0.7252502780867631, + "grad_norm": 0.14183724101543324, + "learning_rate": 1.8454194680576808e-05, + "loss": 0.6894, + "step": 1467 + }, + { + "epoch": 0.7257446545544433, + "grad_norm": 0.14906550328285983, + "learning_rate": 1.845211430024565e-05, + "loss": 0.6843, + "step": 1468 + }, + { + "epoch": 0.7262390310221234, + "grad_norm": 0.1510689851923094, + "learning_rate": 1.8450032638371075e-05, + "loss": 0.7288, + "step": 1469 + }, + { + "epoch": 0.7267334074898035, + "grad_norm": 0.1427538970569437, + "learning_rate": 1.8447949695268723e-05, + "loss": 0.6327, + "step": 1470 + }, + { + "epoch": 0.7272277839574837, + "grad_norm": 0.1538720888906722, + "learning_rate": 1.844586547125441e-05, + "loss": 0.662, + "step": 1471 + }, + { + "epoch": 0.7277221604251638, + "grad_norm": 0.15369851336410245, + "learning_rate": 1.844377996664416e-05, + "loss": 0.6913, + "step": 1472 + }, + { + "epoch": 0.7282165368928439, + "grad_norm": 0.1406947908534473, + "learning_rate": 1.8441693181754183e-05, + "loss": 0.6781, + "step": 1473 + }, + { + "epoch": 0.7287109133605241, + "grad_norm": 0.14807846076393447, + "learning_rate": 1.8439605116900886e-05, + "loss": 0.7249, + "step": 1474 + }, + { + "epoch": 0.7292052898282042, + "grad_norm": 0.1685182938113322, + "learning_rate": 1.8437515772400866e-05, + "loss": 0.6886, + "step": 1475 + }, + { + "epoch": 0.7296996662958843, + "grad_norm": 0.13592843719763417, + "learning_rate": 1.8435425148570925e-05, + "loss": 0.7324, + "step": 1476 + }, + { + "epoch": 0.7301940427635645, + "grad_norm": 0.1453332813970503, + "learning_rate": 1.8433333245728048e-05, + "loss": 0.6929, + "step": 1477 + }, + { + "epoch": 0.7306884192312446, + "grad_norm": 0.1714021629523762, + "learning_rate": 1.8431240064189417e-05, + "loss": 0.7009, + "step": 1478 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 0.13209173674525185, + "learning_rate": 1.8429145604272413e-05, + "loss": 0.6644, + "step": 1479 + }, + { + "epoch": 0.7316771721666049, + "grad_norm": 0.16111110379215896, + "learning_rate": 1.8427049866294594e-05, + "loss": 0.71, + "step": 1480 + }, + { + "epoch": 0.732171548634285, + "grad_norm": 0.15381803067697994, + "learning_rate": 1.8424952850573744e-05, + "loss": 0.704, + "step": 1481 + }, + { + "epoch": 0.7326659251019652, + "grad_norm": 0.14425900311172934, + "learning_rate": 1.8422854557427802e-05, + "loss": 0.6659, + "step": 1482 + }, + { + "epoch": 0.7331603015696453, + "grad_norm": 0.14075530543949552, + "learning_rate": 1.842075498717493e-05, + "loss": 0.6988, + "step": 1483 + }, + { + "epoch": 0.7336546780373254, + "grad_norm": 0.1458441522832367, + "learning_rate": 1.841865414013347e-05, + "loss": 0.6694, + "step": 1484 + }, + { + "epoch": 0.7341490545050056, + "grad_norm": 0.1504664668043467, + "learning_rate": 1.8416552016621966e-05, + "loss": 0.6781, + "step": 1485 + }, + { + "epoch": 0.7346434309726857, + "grad_norm": 0.14399554949861967, + "learning_rate": 1.8414448616959143e-05, + "loss": 0.661, + "step": 1486 + }, + { + "epoch": 0.7351378074403658, + "grad_norm": 0.14264366065156767, + "learning_rate": 1.841234394146393e-05, + "loss": 0.7148, + "step": 1487 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 0.14659182909203022, + "learning_rate": 1.8410237990455446e-05, + "loss": 0.7053, + "step": 1488 + }, + { + "epoch": 0.7361265603757261, + "grad_norm": 0.14573337893495208, + "learning_rate": 1.8408130764253003e-05, + "loss": 0.7241, + "step": 1489 + }, + { + "epoch": 0.7366209368434062, + "grad_norm": 0.17611027327980247, + "learning_rate": 1.8406022263176108e-05, + "loss": 0.6928, + "step": 1490 + }, + { + "epoch": 0.7371153133110864, + "grad_norm": 0.1464877405630352, + "learning_rate": 1.8403912487544464e-05, + "loss": 0.7105, + "step": 1491 + }, + { + "epoch": 0.7376096897787665, + "grad_norm": 0.14644568863681226, + "learning_rate": 1.8401801437677956e-05, + "loss": 0.7098, + "step": 1492 + }, + { + "epoch": 0.7381040662464466, + "grad_norm": 0.153954585162942, + "learning_rate": 1.8399689113896674e-05, + "loss": 0.7012, + "step": 1493 + }, + { + "epoch": 0.7385984427141268, + "grad_norm": 0.15438975284250092, + "learning_rate": 1.83975755165209e-05, + "loss": 0.6898, + "step": 1494 + }, + { + "epoch": 0.7390928191818069, + "grad_norm": 0.1421868774091319, + "learning_rate": 1.83954606458711e-05, + "loss": 0.7211, + "step": 1495 + }, + { + "epoch": 0.739587195649487, + "grad_norm": 0.19141620260366646, + "learning_rate": 1.8393344502267945e-05, + "loss": 0.7143, + "step": 1496 + }, + { + "epoch": 0.7400815721171672, + "grad_norm": 0.14834243937297242, + "learning_rate": 1.8391227086032288e-05, + "loss": 0.6875, + "step": 1497 + }, + { + "epoch": 0.7405759485848473, + "grad_norm": 0.14499717222714037, + "learning_rate": 1.838910839748518e-05, + "loss": 0.6779, + "step": 1498 + }, + { + "epoch": 0.7410703250525275, + "grad_norm": 0.17637525470531804, + "learning_rate": 1.8386988436947874e-05, + "loss": 0.7115, + "step": 1499 + }, + { + "epoch": 0.7415647015202076, + "grad_norm": 0.1378134827138001, + "learning_rate": 1.83848672047418e-05, + "loss": 0.7012, + "step": 1500 + }, + { + "epoch": 0.7420590779878877, + "grad_norm": 0.16051197331627998, + "learning_rate": 1.8382744701188585e-05, + "loss": 0.7671, + "step": 1501 + }, + { + "epoch": 0.7425534544555679, + "grad_norm": 0.14641219781059767, + "learning_rate": 1.8380620926610052e-05, + "loss": 0.7078, + "step": 1502 + }, + { + "epoch": 0.743047830923248, + "grad_norm": 0.16027043377444455, + "learning_rate": 1.8378495881328224e-05, + "loss": 0.7429, + "step": 1503 + }, + { + "epoch": 0.7435422073909282, + "grad_norm": 0.16061451892624193, + "learning_rate": 1.83763695656653e-05, + "loss": 0.6832, + "step": 1504 + }, + { + "epoch": 0.7440365838586084, + "grad_norm": 0.24159201058948734, + "learning_rate": 1.8374241979943685e-05, + "loss": 0.6887, + "step": 1505 + }, + { + "epoch": 0.7445309603262885, + "grad_norm": 0.16596274287462687, + "learning_rate": 1.8372113124485975e-05, + "loss": 0.7215, + "step": 1506 + }, + { + "epoch": 0.7450253367939687, + "grad_norm": 0.15357056032886615, + "learning_rate": 1.8369982999614944e-05, + "loss": 0.6841, + "step": 1507 + }, + { + "epoch": 0.7455197132616488, + "grad_norm": 0.15161860041952194, + "learning_rate": 1.8367851605653585e-05, + "loss": 0.655, + "step": 1508 + }, + { + "epoch": 0.7460140897293289, + "grad_norm": 0.15578082249854755, + "learning_rate": 1.8365718942925058e-05, + "loss": 0.6824, + "step": 1509 + }, + { + "epoch": 0.7465084661970091, + "grad_norm": 0.15863191955346115, + "learning_rate": 1.836358501175273e-05, + "loss": 0.6864, + "step": 1510 + }, + { + "epoch": 0.7470028426646892, + "grad_norm": 0.15035430832084598, + "learning_rate": 1.8361449812460157e-05, + "loss": 0.7056, + "step": 1511 + }, + { + "epoch": 0.7474972191323693, + "grad_norm": 0.15037041899733805, + "learning_rate": 1.8359313345371082e-05, + "loss": 0.6893, + "step": 1512 + }, + { + "epoch": 0.7479915956000495, + "grad_norm": 0.1553349865328783, + "learning_rate": 1.8357175610809447e-05, + "loss": 0.7015, + "step": 1513 + }, + { + "epoch": 0.7484859720677296, + "grad_norm": 0.14079625133408796, + "learning_rate": 1.8355036609099388e-05, + "loss": 0.7126, + "step": 1514 + }, + { + "epoch": 0.7489803485354097, + "grad_norm": 0.15243144310508408, + "learning_rate": 1.8352896340565223e-05, + "loss": 0.7159, + "step": 1515 + }, + { + "epoch": 0.7494747250030899, + "grad_norm": 0.14350911646229603, + "learning_rate": 1.8350754805531468e-05, + "loss": 0.6674, + "step": 1516 + }, + { + "epoch": 0.74996910147077, + "grad_norm": 0.1488832450931529, + "learning_rate": 1.834861200432284e-05, + "loss": 0.676, + "step": 1517 + }, + { + "epoch": 0.7504634779384501, + "grad_norm": 0.15252879039458736, + "learning_rate": 1.834646793726423e-05, + "loss": 0.6759, + "step": 1518 + }, + { + "epoch": 0.7504634779384501, + "eval_loss": 0.6912127137184143, + "eval_runtime": 81.858, + "eval_samples_per_second": 370.813, + "eval_steps_per_second": 46.361, + "step": 1518 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 0.14043665599518776, + "learning_rate": 1.8344322604680734e-05, + "loss": 0.6564, + "step": 1519 + }, + { + "epoch": 0.7514522308738104, + "grad_norm": 0.5790586388280445, + "learning_rate": 1.8342176006897633e-05, + "loss": 0.7031, + "step": 1520 + }, + { + "epoch": 0.7519466073414905, + "grad_norm": 0.16191825450496125, + "learning_rate": 1.8340028144240404e-05, + "loss": 0.6742, + "step": 1521 + }, + { + "epoch": 0.7524409838091707, + "grad_norm": 0.1577335976300695, + "learning_rate": 1.8337879017034715e-05, + "loss": 0.7256, + "step": 1522 + }, + { + "epoch": 0.7529353602768508, + "grad_norm": 0.179202320366807, + "learning_rate": 1.8335728625606427e-05, + "loss": 0.6668, + "step": 1523 + }, + { + "epoch": 0.753429736744531, + "grad_norm": 0.2604777926942097, + "learning_rate": 1.833357697028159e-05, + "loss": 0.7195, + "step": 1524 + }, + { + "epoch": 0.7539241132122111, + "grad_norm": 0.18434483804172697, + "learning_rate": 1.833142405138644e-05, + "loss": 0.719, + "step": 1525 + }, + { + "epoch": 0.7544184896798912, + "grad_norm": 0.16205867382720093, + "learning_rate": 1.8329269869247422e-05, + "loss": 0.7074, + "step": 1526 + }, + { + "epoch": 0.7549128661475714, + "grad_norm": 0.15590280147720859, + "learning_rate": 1.8327114424191153e-05, + "loss": 0.6691, + "step": 1527 + }, + { + "epoch": 0.7554072426152515, + "grad_norm": 0.1700768780110684, + "learning_rate": 1.832495771654446e-05, + "loss": 0.6677, + "step": 1528 + }, + { + "epoch": 0.7559016190829316, + "grad_norm": 0.1535607319641214, + "learning_rate": 1.832279974663434e-05, + "loss": 0.684, + "step": 1529 + }, + { + "epoch": 0.7563959955506118, + "grad_norm": 0.1712085952814901, + "learning_rate": 1.8320640514788002e-05, + "loss": 0.7068, + "step": 1530 + }, + { + "epoch": 0.7568903720182919, + "grad_norm": 0.15348866713306608, + "learning_rate": 1.8318480021332833e-05, + "loss": 0.6857, + "step": 1531 + }, + { + "epoch": 0.757384748485972, + "grad_norm": 0.16687136563553215, + "learning_rate": 1.8316318266596416e-05, + "loss": 0.6797, + "step": 1532 + }, + { + "epoch": 0.7578791249536522, + "grad_norm": 0.15071633526707473, + "learning_rate": 1.8314155250906526e-05, + "loss": 0.6906, + "step": 1533 + }, + { + "epoch": 0.7583735014213323, + "grad_norm": 0.16446800009919016, + "learning_rate": 1.8311990974591128e-05, + "loss": 0.6845, + "step": 1534 + }, + { + "epoch": 0.7588678778890124, + "grad_norm": 0.15308478401755754, + "learning_rate": 1.8309825437978376e-05, + "loss": 0.7443, + "step": 1535 + }, + { + "epoch": 0.7593622543566926, + "grad_norm": 0.15122185584113285, + "learning_rate": 1.830765864139662e-05, + "loss": 0.668, + "step": 1536 + }, + { + "epoch": 0.7598566308243727, + "grad_norm": 0.15445432044466728, + "learning_rate": 1.8305490585174398e-05, + "loss": 0.6819, + "step": 1537 + }, + { + "epoch": 0.7603510072920528, + "grad_norm": 0.14910436017556164, + "learning_rate": 1.8303321269640442e-05, + "loss": 0.7069, + "step": 1538 + }, + { + "epoch": 0.760845383759733, + "grad_norm": 0.14635806946237923, + "learning_rate": 1.8301150695123663e-05, + "loss": 0.7209, + "step": 1539 + }, + { + "epoch": 0.7613397602274131, + "grad_norm": 0.14661406106106556, + "learning_rate": 1.8298978861953184e-05, + "loss": 0.6914, + "step": 1540 + }, + { + "epoch": 0.7618341366950934, + "grad_norm": 0.27933854353473214, + "learning_rate": 1.82968057704583e-05, + "loss": 0.7784, + "step": 1541 + }, + { + "epoch": 0.7623285131627735, + "grad_norm": 0.15224542179687817, + "learning_rate": 1.8294631420968504e-05, + "loss": 0.6521, + "step": 1542 + }, + { + "epoch": 0.7628228896304536, + "grad_norm": 0.15791963204626552, + "learning_rate": 1.8292455813813482e-05, + "loss": 0.6832, + "step": 1543 + }, + { + "epoch": 0.7633172660981338, + "grad_norm": 0.15082440766562527, + "learning_rate": 1.829027894932311e-05, + "loss": 0.6889, + "step": 1544 + }, + { + "epoch": 0.7638116425658139, + "grad_norm": 0.16455364726715757, + "learning_rate": 1.8288100827827446e-05, + "loss": 0.6737, + "step": 1545 + }, + { + "epoch": 0.764306019033494, + "grad_norm": 0.16120194158603576, + "learning_rate": 1.8285921449656752e-05, + "loss": 0.7102, + "step": 1546 + }, + { + "epoch": 0.7648003955011742, + "grad_norm": 0.15296510395913607, + "learning_rate": 1.8283740815141468e-05, + "loss": 0.7018, + "step": 1547 + }, + { + "epoch": 0.7652947719688543, + "grad_norm": 0.15462408258773416, + "learning_rate": 1.8281558924612237e-05, + "loss": 0.706, + "step": 1548 + }, + { + "epoch": 0.7657891484365345, + "grad_norm": 0.14936999235457163, + "learning_rate": 1.8279375778399885e-05, + "loss": 0.7112, + "step": 1549 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 0.2093632768020963, + "learning_rate": 1.827719137683542e-05, + "loss": 0.6575, + "step": 1550 + }, + { + "epoch": 0.7667779013718947, + "grad_norm": 0.16082837807886968, + "learning_rate": 1.8275005720250066e-05, + "loss": 0.6932, + "step": 1551 + }, + { + "epoch": 0.7672722778395749, + "grad_norm": 0.13930660351678245, + "learning_rate": 1.827281880897521e-05, + "loss": 0.6552, + "step": 1552 + }, + { + "epoch": 0.767766654307255, + "grad_norm": 0.14799309838128882, + "learning_rate": 1.8270630643342438e-05, + "loss": 0.7079, + "step": 1553 + }, + { + "epoch": 0.7682610307749351, + "grad_norm": 0.16284114127198598, + "learning_rate": 1.8268441223683537e-05, + "loss": 0.6813, + "step": 1554 + }, + { + "epoch": 0.7687554072426153, + "grad_norm": 0.18555671812623328, + "learning_rate": 1.826625055033047e-05, + "loss": 0.6883, + "step": 1555 + }, + { + "epoch": 0.7692497837102954, + "grad_norm": 0.1426601213629659, + "learning_rate": 1.82640586236154e-05, + "loss": 0.6999, + "step": 1556 + }, + { + "epoch": 0.7697441601779755, + "grad_norm": 0.13645074814976524, + "learning_rate": 1.8261865443870668e-05, + "loss": 0.7068, + "step": 1557 + }, + { + "epoch": 0.7702385366456557, + "grad_norm": 0.1603232425800828, + "learning_rate": 1.8259671011428824e-05, + "loss": 0.6571, + "step": 1558 + }, + { + "epoch": 0.7707329131133358, + "grad_norm": 0.14376595227364958, + "learning_rate": 1.8257475326622587e-05, + "loss": 0.6972, + "step": 1559 + }, + { + "epoch": 0.7712272895810159, + "grad_norm": 0.16151474010467853, + "learning_rate": 1.825527838978488e-05, + "loss": 0.7427, + "step": 1560 + }, + { + "epoch": 0.7717216660486961, + "grad_norm": 0.13443798117517292, + "learning_rate": 1.8253080201248806e-05, + "loss": 0.6764, + "step": 1561 + }, + { + "epoch": 0.7722160425163762, + "grad_norm": 0.14643493838429888, + "learning_rate": 1.825088076134767e-05, + "loss": 0.6673, + "step": 1562 + }, + { + "epoch": 0.7727104189840563, + "grad_norm": 0.1472944829799071, + "learning_rate": 1.8248680070414956e-05, + "loss": 0.6475, + "step": 1563 + }, + { + "epoch": 0.7732047954517365, + "grad_norm": 0.13384039747891707, + "learning_rate": 1.8246478128784345e-05, + "loss": 0.6632, + "step": 1564 + }, + { + "epoch": 0.7736991719194166, + "grad_norm": 0.14976920089557283, + "learning_rate": 1.8244274936789698e-05, + "loss": 0.7357, + "step": 1565 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.13703897993674236, + "learning_rate": 1.8242070494765078e-05, + "loss": 0.6859, + "step": 1566 + }, + { + "epoch": 0.7746879248547769, + "grad_norm": 0.14098570813959616, + "learning_rate": 1.823986480304473e-05, + "loss": 0.6839, + "step": 1567 + }, + { + "epoch": 0.775182301322457, + "grad_norm": 0.15207534957278482, + "learning_rate": 1.823765786196309e-05, + "loss": 0.6722, + "step": 1568 + }, + { + "epoch": 0.7756766777901372, + "grad_norm": 0.14452480076882035, + "learning_rate": 1.8235449671854776e-05, + "loss": 0.7019, + "step": 1569 + }, + { + "epoch": 0.7761710542578173, + "grad_norm": 0.1527979017028005, + "learning_rate": 1.8233240233054613e-05, + "loss": 0.7178, + "step": 1570 + }, + { + "epoch": 0.7766654307254974, + "grad_norm": 0.15981282921089807, + "learning_rate": 1.82310295458976e-05, + "loss": 0.7159, + "step": 1571 + }, + { + "epoch": 0.7771598071931776, + "grad_norm": 1.9001606656005254, + "learning_rate": 1.8228817610718934e-05, + "loss": 0.7355, + "step": 1572 + }, + { + "epoch": 0.7776541836608577, + "grad_norm": 0.14610448297675604, + "learning_rate": 1.822660442785399e-05, + "loss": 0.6662, + "step": 1573 + }, + { + "epoch": 0.7781485601285378, + "grad_norm": 0.1459964039802617, + "learning_rate": 1.8224389997638344e-05, + "loss": 0.6994, + "step": 1574 + }, + { + "epoch": 0.778642936596218, + "grad_norm": 0.14684629037642125, + "learning_rate": 1.8222174320407758e-05, + "loss": 0.6727, + "step": 1575 + }, + { + "epoch": 0.7791373130638981, + "grad_norm": 0.15417030173172477, + "learning_rate": 1.8219957396498183e-05, + "loss": 0.6851, + "step": 1576 + }, + { + "epoch": 0.7796316895315782, + "grad_norm": 0.13526007514827618, + "learning_rate": 1.8217739226245753e-05, + "loss": 0.6503, + "step": 1577 + }, + { + "epoch": 0.7801260659992585, + "grad_norm": 0.16152255492868423, + "learning_rate": 1.82155198099868e-05, + "loss": 0.6964, + "step": 1578 + }, + { + "epoch": 0.7806204424669386, + "grad_norm": 0.14887684602672765, + "learning_rate": 1.8213299148057837e-05, + "loss": 0.6896, + "step": 1579 + }, + { + "epoch": 0.7811148189346188, + "grad_norm": 0.15034950888544518, + "learning_rate": 1.8211077240795573e-05, + "loss": 0.7116, + "step": 1580 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 0.17084195695943624, + "learning_rate": 1.8208854088536903e-05, + "loss": 0.7042, + "step": 1581 + }, + { + "epoch": 0.782103571869979, + "grad_norm": 0.1494169934562388, + "learning_rate": 1.8206629691618904e-05, + "loss": 0.729, + "step": 1582 + }, + { + "epoch": 0.7825979483376592, + "grad_norm": 0.1500383146276052, + "learning_rate": 1.8204404050378856e-05, + "loss": 0.6877, + "step": 1583 + }, + { + "epoch": 0.7830923248053393, + "grad_norm": 0.14673317771863667, + "learning_rate": 1.8202177165154217e-05, + "loss": 0.708, + "step": 1584 + }, + { + "epoch": 0.7835867012730194, + "grad_norm": 0.144840849538174, + "learning_rate": 1.819994903628263e-05, + "loss": 0.6904, + "step": 1585 + }, + { + "epoch": 0.7840810777406996, + "grad_norm": 0.14941469778344915, + "learning_rate": 1.8197719664101944e-05, + "loss": 0.6728, + "step": 1586 + }, + { + "epoch": 0.7845754542083797, + "grad_norm": 0.14181774855963494, + "learning_rate": 1.8195489048950175e-05, + "loss": 0.6572, + "step": 1587 + }, + { + "epoch": 0.7850698306760598, + "grad_norm": 0.15341940960256262, + "learning_rate": 1.8193257191165544e-05, + "loss": 0.7217, + "step": 1588 + }, + { + "epoch": 0.78556420714374, + "grad_norm": 0.1526710240497697, + "learning_rate": 1.8191024091086455e-05, + "loss": 0.7096, + "step": 1589 + }, + { + "epoch": 0.7860585836114201, + "grad_norm": 0.1390532771094414, + "learning_rate": 1.8188789749051494e-05, + "loss": 0.7113, + "step": 1590 + }, + { + "epoch": 0.7865529600791002, + "grad_norm": 0.15575725414287228, + "learning_rate": 1.8186554165399446e-05, + "loss": 0.6882, + "step": 1591 + }, + { + "epoch": 0.7870473365467804, + "grad_norm": 0.14566531641262534, + "learning_rate": 1.818431734046928e-05, + "loss": 0.7062, + "step": 1592 + }, + { + "epoch": 0.7875417130144605, + "grad_norm": 0.14761732803958766, + "learning_rate": 1.8182079274600146e-05, + "loss": 0.69, + "step": 1593 + }, + { + "epoch": 0.7880360894821407, + "grad_norm": 0.14907158675224524, + "learning_rate": 1.817983996813139e-05, + "loss": 0.6497, + "step": 1594 + }, + { + "epoch": 0.7885304659498208, + "grad_norm": 0.1424285532243911, + "learning_rate": 1.817759942140255e-05, + "loss": 0.682, + "step": 1595 + }, + { + "epoch": 0.7890248424175009, + "grad_norm": 0.15234654799729128, + "learning_rate": 1.8175357634753343e-05, + "loss": 0.7063, + "step": 1596 + }, + { + "epoch": 0.7895192188851811, + "grad_norm": 0.1670221935649849, + "learning_rate": 1.8173114608523674e-05, + "loss": 0.7288, + "step": 1597 + }, + { + "epoch": 0.7900135953528612, + "grad_norm": 0.15097584267793876, + "learning_rate": 1.8170870343053646e-05, + "loss": 0.6781, + "step": 1598 + }, + { + "epoch": 0.7905079718205413, + "grad_norm": 0.16402125971009615, + "learning_rate": 1.8168624838683543e-05, + "loss": 0.7026, + "step": 1599 + }, + { + "epoch": 0.7910023482882215, + "grad_norm": 0.14344080811088308, + "learning_rate": 1.8166378095753835e-05, + "loss": 0.6795, + "step": 1600 + }, + { + "epoch": 0.7914967247559016, + "grad_norm": 0.16306369638325907, + "learning_rate": 1.8164130114605177e-05, + "loss": 0.6857, + "step": 1601 + }, + { + "epoch": 0.7919911012235817, + "grad_norm": 0.39016432380927435, + "learning_rate": 1.816188089557843e-05, + "loss": 0.7182, + "step": 1602 + }, + { + "epoch": 0.7924854776912619, + "grad_norm": 0.15288846561701888, + "learning_rate": 1.815963043901462e-05, + "loss": 0.7071, + "step": 1603 + }, + { + "epoch": 0.792979854158942, + "grad_norm": 0.15659042622445377, + "learning_rate": 1.815737874525497e-05, + "loss": 0.6505, + "step": 1604 + }, + { + "epoch": 0.7934742306266221, + "grad_norm": 0.14953889351904756, + "learning_rate": 1.8155125814640896e-05, + "loss": 0.6932, + "step": 1605 + }, + { + "epoch": 0.7939686070943023, + "grad_norm": 0.14080982214818716, + "learning_rate": 1.815287164751399e-05, + "loss": 0.724, + "step": 1606 + }, + { + "epoch": 0.7944629835619824, + "grad_norm": 0.1430907156619964, + "learning_rate": 1.8150616244216047e-05, + "loss": 0.6625, + "step": 1607 + }, + { + "epoch": 0.7949573600296626, + "grad_norm": 0.1641560709536928, + "learning_rate": 1.814835960508903e-05, + "loss": 0.7338, + "step": 1608 + }, + { + "epoch": 0.7954517364973427, + "grad_norm": 0.14777500479954542, + "learning_rate": 1.8146101730475107e-05, + "loss": 0.6338, + "step": 1609 + }, + { + "epoch": 0.7959461129650228, + "grad_norm": 0.15749826458725133, + "learning_rate": 1.814384262071662e-05, + "loss": 0.6938, + "step": 1610 + }, + { + "epoch": 0.796440489432703, + "grad_norm": 0.1583698295369262, + "learning_rate": 1.814158227615611e-05, + "loss": 0.6903, + "step": 1611 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 0.14063377427981744, + "learning_rate": 1.8139320697136297e-05, + "loss": 0.6905, + "step": 1612 + }, + { + "epoch": 0.7974292423680632, + "grad_norm": 0.1469381220261469, + "learning_rate": 1.813705788400009e-05, + "loss": 0.7041, + "step": 1613 + }, + { + "epoch": 0.7979236188357434, + "grad_norm": 0.14546700548396196, + "learning_rate": 1.8134793837090585e-05, + "loss": 0.6739, + "step": 1614 + }, + { + "epoch": 0.7984179953034236, + "grad_norm": 0.14226240663601714, + "learning_rate": 1.8132528556751073e-05, + "loss": 0.68, + "step": 1615 + }, + { + "epoch": 0.7989123717711037, + "grad_norm": 0.1525150262870643, + "learning_rate": 1.8130262043325015e-05, + "loss": 0.6969, + "step": 1616 + }, + { + "epoch": 0.7994067482387839, + "grad_norm": 0.4488777999287883, + "learning_rate": 1.812799429715607e-05, + "loss": 0.7062, + "step": 1617 + }, + { + "epoch": 0.799901124706464, + "grad_norm": 0.1420380649373736, + "learning_rate": 1.812572531858809e-05, + "loss": 0.675, + "step": 1618 + }, + { + "epoch": 0.8003955011741442, + "grad_norm": 0.14386710011954593, + "learning_rate": 1.8123455107965104e-05, + "loss": 0.6926, + "step": 1619 + }, + { + "epoch": 0.8008898776418243, + "grad_norm": 0.14468436740486956, + "learning_rate": 1.8121183665631326e-05, + "loss": 0.7003, + "step": 1620 + }, + { + "epoch": 0.8013842541095044, + "grad_norm": 0.1399770569202926, + "learning_rate": 1.811891099193116e-05, + "loss": 0.7033, + "step": 1621 + }, + { + "epoch": 0.8018786305771846, + "grad_norm": 0.14010655669772537, + "learning_rate": 1.811663708720921e-05, + "loss": 0.7061, + "step": 1622 + }, + { + "epoch": 0.8023730070448647, + "grad_norm": 0.1455600021069299, + "learning_rate": 1.8114361951810246e-05, + "loss": 0.6532, + "step": 1623 + }, + { + "epoch": 0.8028673835125448, + "grad_norm": 0.1426937683841831, + "learning_rate": 1.8112085586079228e-05, + "loss": 0.6668, + "step": 1624 + }, + { + "epoch": 0.803361759980225, + "grad_norm": 0.14107875531930267, + "learning_rate": 1.810980799036132e-05, + "loss": 0.699, + "step": 1625 + }, + { + "epoch": 0.8038561364479051, + "grad_norm": 0.14036320159481083, + "learning_rate": 1.8107529165001847e-05, + "loss": 0.7303, + "step": 1626 + }, + { + "epoch": 0.8043505129155852, + "grad_norm": 0.13565708260147868, + "learning_rate": 1.8105249110346345e-05, + "loss": 0.6684, + "step": 1627 + }, + { + "epoch": 0.8048448893832654, + "grad_norm": 0.13797802418236838, + "learning_rate": 1.8102967826740517e-05, + "loss": 0.6863, + "step": 1628 + }, + { + "epoch": 0.8053392658509455, + "grad_norm": 0.13655455656184162, + "learning_rate": 1.8100685314530266e-05, + "loss": 0.6686, + "step": 1629 + }, + { + "epoch": 0.8058336423186256, + "grad_norm": 0.13927259321525007, + "learning_rate": 1.8098401574061668e-05, + "loss": 0.673, + "step": 1630 + }, + { + "epoch": 0.8063280187863058, + "grad_norm": 0.13621609413061006, + "learning_rate": 1.8096116605681004e-05, + "loss": 0.6781, + "step": 1631 + }, + { + "epoch": 0.8068223952539859, + "grad_norm": 0.1455416726229564, + "learning_rate": 1.8093830409734717e-05, + "loss": 0.7167, + "step": 1632 + }, + { + "epoch": 0.807316771721666, + "grad_norm": 0.4446431114949388, + "learning_rate": 1.8091542986569465e-05, + "loss": 0.7531, + "step": 1633 + }, + { + "epoch": 0.8078111481893462, + "grad_norm": 0.15738314022897174, + "learning_rate": 1.8089254336532062e-05, + "loss": 0.668, + "step": 1634 + }, + { + "epoch": 0.8083055246570263, + "grad_norm": 0.1345381921426775, + "learning_rate": 1.808696445996953e-05, + "loss": 0.6441, + "step": 1635 + }, + { + "epoch": 0.8087999011247065, + "grad_norm": 0.15039869054897814, + "learning_rate": 1.8084673357229067e-05, + "loss": 0.711, + "step": 1636 + }, + { + "epoch": 0.8092942775923866, + "grad_norm": 0.14203968726684377, + "learning_rate": 1.8082381028658055e-05, + "loss": 0.6858, + "step": 1637 + }, + { + "epoch": 0.8097886540600667, + "grad_norm": 0.1545932733088698, + "learning_rate": 1.8080087474604074e-05, + "loss": 0.7056, + "step": 1638 + }, + { + "epoch": 0.8102830305277469, + "grad_norm": 0.15903760190925467, + "learning_rate": 1.807779269541488e-05, + "loss": 0.7304, + "step": 1639 + }, + { + "epoch": 0.810777406995427, + "grad_norm": 0.14692944061048388, + "learning_rate": 1.807549669143841e-05, + "loss": 0.7581, + "step": 1640 + }, + { + "epoch": 0.8112717834631071, + "grad_norm": 0.14554890576717994, + "learning_rate": 1.8073199463022804e-05, + "loss": 0.6825, + "step": 1641 + }, + { + "epoch": 0.8117661599307873, + "grad_norm": 0.16227188533003253, + "learning_rate": 1.8070901010516368e-05, + "loss": 0.7415, + "step": 1642 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 0.14237686901132124, + "learning_rate": 1.8068601334267605e-05, + "loss": 0.6725, + "step": 1643 + }, + { + "epoch": 0.8127549128661475, + "grad_norm": 0.15274795525787446, + "learning_rate": 1.8066300434625202e-05, + "loss": 0.7348, + "step": 1644 + }, + { + "epoch": 0.8132492893338277, + "grad_norm": 0.14444834239482687, + "learning_rate": 1.8063998311938026e-05, + "loss": 0.6707, + "step": 1645 + }, + { + "epoch": 0.8137436658015078, + "grad_norm": 0.15348246178510513, + "learning_rate": 1.8061694966555145e-05, + "loss": 0.6782, + "step": 1646 + }, + { + "epoch": 0.814238042269188, + "grad_norm": 0.1376212833793054, + "learning_rate": 1.805939039882579e-05, + "loss": 0.6514, + "step": 1647 + }, + { + "epoch": 0.8147324187368681, + "grad_norm": 0.1669803826970805, + "learning_rate": 1.8057084609099397e-05, + "loss": 0.673, + "step": 1648 + }, + { + "epoch": 0.8152267952045482, + "grad_norm": 0.14667137606021627, + "learning_rate": 1.8054777597725573e-05, + "loss": 0.6617, + "step": 1649 + }, + { + "epoch": 0.8157211716722284, + "grad_norm": 0.1602424568888369, + "learning_rate": 1.8052469365054123e-05, + "loss": 0.6823, + "step": 1650 + }, + { + "epoch": 0.8162155481399085, + "grad_norm": 0.15733470699141058, + "learning_rate": 1.8050159911435024e-05, + "loss": 0.6968, + "step": 1651 + }, + { + "epoch": 0.8167099246075887, + "grad_norm": 0.14568795688192654, + "learning_rate": 1.8047849237218446e-05, + "loss": 0.6809, + "step": 1652 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 0.1783202041066825, + "learning_rate": 1.8045537342754745e-05, + "loss": 0.7137, + "step": 1653 + }, + { + "epoch": 0.817698677542949, + "grad_norm": 0.14635020652529485, + "learning_rate": 1.8043224228394458e-05, + "loss": 0.657, + "step": 1654 + }, + { + "epoch": 0.8181930540106291, + "grad_norm": 0.16477532850591786, + "learning_rate": 1.804090989448831e-05, + "loss": 0.7032, + "step": 1655 + }, + { + "epoch": 0.8186874304783093, + "grad_norm": 0.13851890841325623, + "learning_rate": 1.8038594341387208e-05, + "loss": 0.6763, + "step": 1656 + }, + { + "epoch": 0.8191818069459894, + "grad_norm": 0.14541578058189877, + "learning_rate": 1.8036277569442245e-05, + "loss": 0.6682, + "step": 1657 + }, + { + "epoch": 0.8196761834136695, + "grad_norm": 0.147882365581909, + "learning_rate": 1.8033959579004704e-05, + "loss": 0.6798, + "step": 1658 + }, + { + "epoch": 0.8201705598813497, + "grad_norm": 0.1428388097543586, + "learning_rate": 1.803164037042604e-05, + "loss": 0.7069, + "step": 1659 + }, + { + "epoch": 0.8206649363490298, + "grad_norm": 0.15641335854183971, + "learning_rate": 1.8029319944057907e-05, + "loss": 0.6833, + "step": 1660 + }, + { + "epoch": 0.82115931281671, + "grad_norm": 0.14595010684511808, + "learning_rate": 1.8026998300252133e-05, + "loss": 0.7179, + "step": 1661 + }, + { + "epoch": 0.8216536892843901, + "grad_norm": 0.13804997801212426, + "learning_rate": 1.802467543936074e-05, + "loss": 0.703, + "step": 1662 + }, + { + "epoch": 0.8221480657520702, + "grad_norm": 0.14427251262915378, + "learning_rate": 1.8022351361735925e-05, + "loss": 0.6409, + "step": 1663 + }, + { + "epoch": 0.8226424422197504, + "grad_norm": 0.15212447128573303, + "learning_rate": 1.8020026067730077e-05, + "loss": 0.6981, + "step": 1664 + }, + { + "epoch": 0.8231368186874305, + "grad_norm": 0.13854302533687943, + "learning_rate": 1.8017699557695765e-05, + "loss": 0.6704, + "step": 1665 + }, + { + "epoch": 0.8236311951551106, + "grad_norm": 0.15258595748497022, + "learning_rate": 1.8015371831985743e-05, + "loss": 0.6526, + "step": 1666 + }, + { + "epoch": 0.8241255716227908, + "grad_norm": 0.1379031874452665, + "learning_rate": 1.801304289095295e-05, + "loss": 0.6869, + "step": 1667 + }, + { + "epoch": 0.8246199480904709, + "grad_norm": 0.1543732777799854, + "learning_rate": 1.8010712734950515e-05, + "loss": 0.6788, + "step": 1668 + }, + { + "epoch": 0.825114324558151, + "grad_norm": 0.1358161652673297, + "learning_rate": 1.8008381364331737e-05, + "loss": 0.6905, + "step": 1669 + }, + { + "epoch": 0.8256087010258312, + "grad_norm": 0.13456280251271033, + "learning_rate": 1.8006048779450114e-05, + "loss": 0.6813, + "step": 1670 + }, + { + "epoch": 0.8261030774935113, + "grad_norm": 0.13271417808413793, + "learning_rate": 1.8003714980659313e-05, + "loss": 0.666, + "step": 1671 + }, + { + "epoch": 0.8265974539611914, + "grad_norm": 0.14556743796983504, + "learning_rate": 1.8001379968313208e-05, + "loss": 0.7151, + "step": 1672 + }, + { + "epoch": 0.8270918304288716, + "grad_norm": 0.13468698606908128, + "learning_rate": 1.7999043742765833e-05, + "loss": 0.6765, + "step": 1673 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.1370747573734279, + "learning_rate": 1.799670630437142e-05, + "loss": 0.6684, + "step": 1674 + }, + { + "epoch": 0.8280805833642318, + "grad_norm": 0.14460692469913236, + "learning_rate": 1.7994367653484375e-05, + "loss": 0.701, + "step": 1675 + }, + { + "epoch": 0.828574959831912, + "grad_norm": 0.14641410477177524, + "learning_rate": 1.79920277904593e-05, + "loss": 0.7183, + "step": 1676 + }, + { + "epoch": 0.8290693362995921, + "grad_norm": 0.140171618485181, + "learning_rate": 1.7989686715650968e-05, + "loss": 0.7175, + "step": 1677 + }, + { + "epoch": 0.8295637127672723, + "grad_norm": 0.14120660087467118, + "learning_rate": 1.7987344429414354e-05, + "loss": 0.6963, + "step": 1678 + }, + { + "epoch": 0.8300580892349524, + "grad_norm": 0.7366253048361538, + "learning_rate": 1.798500093210459e-05, + "loss": 0.7882, + "step": 1679 + }, + { + "epoch": 0.8305524657026325, + "grad_norm": 0.1489433047823356, + "learning_rate": 1.7982656224077016e-05, + "loss": 0.6719, + "step": 1680 + }, + { + "epoch": 0.8310468421703127, + "grad_norm": 0.484022280002144, + "learning_rate": 1.7980310305687142e-05, + "loss": 0.6689, + "step": 1681 + }, + { + "epoch": 0.8315412186379928, + "grad_norm": 0.1497024462354198, + "learning_rate": 1.797796317729067e-05, + "loss": 0.6927, + "step": 1682 + }, + { + "epoch": 0.8320355951056729, + "grad_norm": 0.1528333613090505, + "learning_rate": 1.7975614839243476e-05, + "loss": 0.7562, + "step": 1683 + }, + { + "epoch": 0.8325299715733531, + "grad_norm": 0.1444932305737038, + "learning_rate": 1.7973265291901625e-05, + "loss": 0.6737, + "step": 1684 + }, + { + "epoch": 0.8330243480410332, + "grad_norm": 0.14352130067963062, + "learning_rate": 1.7970914535621368e-05, + "loss": 0.6462, + "step": 1685 + }, + { + "epoch": 0.8335187245087133, + "grad_norm": 0.1450685220700883, + "learning_rate": 1.7968562570759137e-05, + "loss": 0.7189, + "step": 1686 + }, + { + "epoch": 0.8340131009763935, + "grad_norm": 0.1538734242540474, + "learning_rate": 1.796620939767154e-05, + "loss": 0.6594, + "step": 1687 + }, + { + "epoch": 0.8345074774440736, + "grad_norm": 0.14277614291658544, + "learning_rate": 1.7963855016715378e-05, + "loss": 0.7007, + "step": 1688 + }, + { + "epoch": 0.8350018539117539, + "grad_norm": 0.14318323576910547, + "learning_rate": 1.7961499428247632e-05, + "loss": 0.6854, + "step": 1689 + }, + { + "epoch": 0.835496230379434, + "grad_norm": 0.15810141415405504, + "learning_rate": 1.7959142632625463e-05, + "loss": 0.6935, + "step": 1690 + }, + { + "epoch": 0.8359906068471141, + "grad_norm": 0.14214662966294045, + "learning_rate": 1.7956784630206225e-05, + "loss": 0.7154, + "step": 1691 + }, + { + "epoch": 0.8364849833147943, + "grad_norm": 0.14664670948336658, + "learning_rate": 1.795442542134744e-05, + "loss": 0.6675, + "step": 1692 + }, + { + "epoch": 0.8369793597824744, + "grad_norm": 0.14156607222218548, + "learning_rate": 1.7952065006406826e-05, + "loss": 0.6946, + "step": 1693 + }, + { + "epoch": 0.8374737362501545, + "grad_norm": 0.1426385850427147, + "learning_rate": 1.7949703385742277e-05, + "loss": 0.6914, + "step": 1694 + }, + { + "epoch": 0.8379681127178347, + "grad_norm": 0.14078164583050018, + "learning_rate": 1.7947340559711866e-05, + "loss": 0.6574, + "step": 1695 + }, + { + "epoch": 0.8384624891855148, + "grad_norm": 0.1403947210377861, + "learning_rate": 1.7944976528673862e-05, + "loss": 0.6851, + "step": 1696 + }, + { + "epoch": 0.8389568656531949, + "grad_norm": 0.13891405784569694, + "learning_rate": 1.7942611292986708e-05, + "loss": 0.6536, + "step": 1697 + }, + { + "epoch": 0.8394512421208751, + "grad_norm": 0.14325520438399345, + "learning_rate": 1.7940244853009024e-05, + "loss": 0.6975, + "step": 1698 + }, + { + "epoch": 0.8399456185885552, + "grad_norm": 0.14667598941332843, + "learning_rate": 1.7937877209099624e-05, + "loss": 0.6578, + "step": 1699 + }, + { + "epoch": 0.8404399950562353, + "grad_norm": 0.14367243148100425, + "learning_rate": 1.79355083616175e-05, + "loss": 0.6976, + "step": 1700 + }, + { + "epoch": 0.8409343715239155, + "grad_norm": 0.13898883157934422, + "learning_rate": 1.7933138310921827e-05, + "loss": 0.6721, + "step": 1701 + }, + { + "epoch": 0.8414287479915956, + "grad_norm": 0.15022551298900133, + "learning_rate": 1.7930767057371955e-05, + "loss": 0.6841, + "step": 1702 + }, + { + "epoch": 0.8419231244592758, + "grad_norm": 0.14188763904553506, + "learning_rate": 1.792839460132743e-05, + "loss": 0.6817, + "step": 1703 + }, + { + "epoch": 0.8424175009269559, + "grad_norm": 0.15023834723140406, + "learning_rate": 1.7926020943147974e-05, + "loss": 0.6726, + "step": 1704 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 0.14244503885155982, + "learning_rate": 1.7923646083193484e-05, + "loss": 0.6797, + "step": 1705 + }, + { + "epoch": 0.8434062538623162, + "grad_norm": 0.15093051092979007, + "learning_rate": 1.792127002182405e-05, + "loss": 0.6754, + "step": 1706 + }, + { + "epoch": 0.8439006303299963, + "grad_norm": 0.14101649574883368, + "learning_rate": 1.791889275939994e-05, + "loss": 0.6811, + "step": 1707 + }, + { + "epoch": 0.8443950067976764, + "grad_norm": 0.14324925381576673, + "learning_rate": 1.7916514296281603e-05, + "loss": 0.6756, + "step": 1708 + }, + { + "epoch": 0.8448893832653566, + "grad_norm": 0.1489706748519599, + "learning_rate": 1.7914134632829667e-05, + "loss": 0.7084, + "step": 1709 + }, + { + "epoch": 0.8453837597330367, + "grad_norm": 0.14011563468422966, + "learning_rate": 1.7911753769404954e-05, + "loss": 0.6791, + "step": 1710 + }, + { + "epoch": 0.8458781362007168, + "grad_norm": 1.5211076079436083, + "learning_rate": 1.7909371706368458e-05, + "loss": 0.7055, + "step": 1711 + }, + { + "epoch": 0.846372512668397, + "grad_norm": 0.14531802641985986, + "learning_rate": 1.7906988444081353e-05, + "loss": 0.6653, + "step": 1712 + }, + { + "epoch": 0.8468668891360771, + "grad_norm": 0.14888440470328873, + "learning_rate": 1.7904603982905004e-05, + "loss": 0.6677, + "step": 1713 + }, + { + "epoch": 0.8473612656037572, + "grad_norm": 0.14138863119690653, + "learning_rate": 1.7902218323200948e-05, + "loss": 0.6818, + "step": 1714 + }, + { + "epoch": 0.8478556420714374, + "grad_norm": 0.1475465376026652, + "learning_rate": 1.789983146533091e-05, + "loss": 0.6909, + "step": 1715 + }, + { + "epoch": 0.8483500185391175, + "grad_norm": 0.1483612209316308, + "learning_rate": 1.7897443409656792e-05, + "loss": 0.6714, + "step": 1716 + }, + { + "epoch": 0.8488443950067976, + "grad_norm": 0.15606708283696613, + "learning_rate": 1.789505415654069e-05, + "loss": 0.6825, + "step": 1717 + }, + { + "epoch": 0.8493387714744778, + "grad_norm": 0.15141363584968784, + "learning_rate": 1.789266370634486e-05, + "loss": 0.6981, + "step": 1718 + }, + { + "epoch": 0.8498331479421579, + "grad_norm": 0.20980154621359487, + "learning_rate": 1.789027205943176e-05, + "loss": 0.6534, + "step": 1719 + }, + { + "epoch": 0.850327524409838, + "grad_norm": 0.1504944523588419, + "learning_rate": 1.7887879216164016e-05, + "loss": 0.6654, + "step": 1720 + }, + { + "epoch": 0.8508219008775182, + "grad_norm": 0.31431129155103227, + "learning_rate": 1.7885485176904446e-05, + "loss": 0.7187, + "step": 1721 + }, + { + "epoch": 0.8513162773451983, + "grad_norm": 0.1496111897206991, + "learning_rate": 1.7883089942016035e-05, + "loss": 0.6404, + "step": 1722 + }, + { + "epoch": 0.8518106538128785, + "grad_norm": 0.1451051307428492, + "learning_rate": 1.788069351186197e-05, + "loss": 0.6824, + "step": 1723 + }, + { + "epoch": 0.8523050302805586, + "grad_norm": 0.15423381057450064, + "learning_rate": 1.78782958868056e-05, + "loss": 0.6929, + "step": 1724 + }, + { + "epoch": 0.8527994067482387, + "grad_norm": 0.14352233162055464, + "learning_rate": 1.7875897067210463e-05, + "loss": 0.6704, + "step": 1725 + }, + { + "epoch": 0.853293783215919, + "grad_norm": 0.14630427722358555, + "learning_rate": 1.7873497053440277e-05, + "loss": 0.7178, + "step": 1726 + }, + { + "epoch": 0.8537881596835991, + "grad_norm": 0.14213085302205095, + "learning_rate": 1.787109584585894e-05, + "loss": 0.7031, + "step": 1727 + }, + { + "epoch": 0.8542825361512792, + "grad_norm": 0.1582881713427826, + "learning_rate": 1.786869344483054e-05, + "loss": 0.7169, + "step": 1728 + }, + { + "epoch": 0.8547769126189594, + "grad_norm": 0.14297649444814595, + "learning_rate": 1.7866289850719335e-05, + "loss": 0.6787, + "step": 1729 + }, + { + "epoch": 0.8552712890866395, + "grad_norm": 0.15287853510060612, + "learning_rate": 1.7863885063889766e-05, + "loss": 0.6942, + "step": 1730 + }, + { + "epoch": 0.8557656655543197, + "grad_norm": 0.14661167117736038, + "learning_rate": 1.7861479084706457e-05, + "loss": 0.6871, + "step": 1731 + }, + { + "epoch": 0.8562600420219998, + "grad_norm": 1.1949357138278918, + "learning_rate": 1.7859071913534213e-05, + "loss": 0.6982, + "step": 1732 + }, + { + "epoch": 0.8567544184896799, + "grad_norm": 0.16714031105130076, + "learning_rate": 1.7856663550738017e-05, + "loss": 0.726, + "step": 1733 + }, + { + "epoch": 0.8572487949573601, + "grad_norm": 0.24500016539240044, + "learning_rate": 1.7854253996683036e-05, + "loss": 0.7148, + "step": 1734 + }, + { + "epoch": 0.8577431714250402, + "grad_norm": 0.19136134244697442, + "learning_rate": 1.7851843251734616e-05, + "loss": 0.7191, + "step": 1735 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 1.474434970800943, + "learning_rate": 1.7849431316258284e-05, + "loss": 0.672, + "step": 1736 + }, + { + "epoch": 0.8587319243604005, + "grad_norm": 0.20641076110279866, + "learning_rate": 1.784701819061975e-05, + "loss": 0.7408, + "step": 1737 + }, + { + "epoch": 0.8592263008280806, + "grad_norm": 0.18027304790604012, + "learning_rate": 1.7844603875184897e-05, + "loss": 0.6532, + "step": 1738 + }, + { + "epoch": 0.8597206772957607, + "grad_norm": 0.5104718851078444, + "learning_rate": 1.7842188370319796e-05, + "loss": 0.7264, + "step": 1739 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 0.18645462278393402, + "learning_rate": 1.783977167639069e-05, + "loss": 0.7068, + "step": 1740 + }, + { + "epoch": 0.860709430231121, + "grad_norm": 0.17345694769756084, + "learning_rate": 1.7837353793764022e-05, + "loss": 0.6573, + "step": 1741 + }, + { + "epoch": 0.8612038066988011, + "grad_norm": 0.20442256031129663, + "learning_rate": 1.7834934722806384e-05, + "loss": 0.7688, + "step": 1742 + }, + { + "epoch": 0.8616981831664813, + "grad_norm": 0.21646377422411253, + "learning_rate": 1.7832514463884577e-05, + "loss": 0.7329, + "step": 1743 + }, + { + "epoch": 0.8621925596341614, + "grad_norm": 0.16985020501583215, + "learning_rate": 1.7830093017365563e-05, + "loss": 0.6604, + "step": 1744 + }, + { + "epoch": 0.8626869361018416, + "grad_norm": 0.2128087334670451, + "learning_rate": 1.78276703836165e-05, + "loss": 0.6917, + "step": 1745 + }, + { + "epoch": 0.8631813125695217, + "grad_norm": 0.17789431614524293, + "learning_rate": 1.7825246563004707e-05, + "loss": 0.6984, + "step": 1746 + }, + { + "epoch": 0.8636756890372018, + "grad_norm": 0.9034075430180688, + "learning_rate": 1.78228215558977e-05, + "loss": 0.705, + "step": 1747 + }, + { + "epoch": 0.864170065504882, + "grad_norm": 0.1613957738200823, + "learning_rate": 1.7820395362663166e-05, + "loss": 0.6539, + "step": 1748 + }, + { + "epoch": 0.8646644419725621, + "grad_norm": 0.1745459481993726, + "learning_rate": 1.7817967983668975e-05, + "loss": 0.714, + "step": 1749 + }, + { + "epoch": 0.8651588184402422, + "grad_norm": 0.15906179202416765, + "learning_rate": 1.7815539419283178e-05, + "loss": 0.6731, + "step": 1750 + }, + { + "epoch": 0.8656531949079224, + "grad_norm": 0.16987047851831652, + "learning_rate": 1.7813109669874e-05, + "loss": 0.6792, + "step": 1751 + }, + { + "epoch": 0.8661475713756025, + "grad_norm": 0.16018849236972635, + "learning_rate": 1.781067873580985e-05, + "loss": 0.6327, + "step": 1752 + }, + { + "epoch": 0.8666419478432826, + "grad_norm": 0.27221384699615675, + "learning_rate": 1.7808246617459316e-05, + "loss": 0.702, + "step": 1753 + }, + { + "epoch": 0.8671363243109628, + "grad_norm": 0.46757470694101, + "learning_rate": 1.780581331519117e-05, + "loss": 0.7319, + "step": 1754 + }, + { + "epoch": 0.8676307007786429, + "grad_norm": 0.1566592696393691, + "learning_rate": 1.7803378829374353e-05, + "loss": 0.6899, + "step": 1755 + }, + { + "epoch": 0.868125077246323, + "grad_norm": 0.15718506768364726, + "learning_rate": 1.7800943160377993e-05, + "loss": 0.7093, + "step": 1756 + }, + { + "epoch": 0.8686194537140032, + "grad_norm": 0.6679197250203217, + "learning_rate": 1.7798506308571398e-05, + "loss": 0.6886, + "step": 1757 + }, + { + "epoch": 0.8691138301816833, + "grad_norm": 0.18886173918442087, + "learning_rate": 1.779606827432405e-05, + "loss": 0.6578, + "step": 1758 + }, + { + "epoch": 0.8696082066493634, + "grad_norm": 0.31265694365949437, + "learning_rate": 1.7793629058005617e-05, + "loss": 0.738, + "step": 1759 + }, + { + "epoch": 0.8701025831170436, + "grad_norm": 0.9833995126301568, + "learning_rate": 1.7791188659985942e-05, + "loss": 0.7382, + "step": 1760 + }, + { + "epoch": 0.8705969595847237, + "grad_norm": 0.2642100214381445, + "learning_rate": 1.7788747080635046e-05, + "loss": 0.7073, + "step": 1761 + }, + { + "epoch": 0.8710913360524039, + "grad_norm": 0.1903634003838162, + "learning_rate": 1.7786304320323134e-05, + "loss": 0.6884, + "step": 1762 + }, + { + "epoch": 0.8715857125200841, + "grad_norm": 0.1704170747734664, + "learning_rate": 1.7783860379420584e-05, + "loss": 0.6678, + "step": 1763 + }, + { + "epoch": 0.8720800889877642, + "grad_norm": 0.1726739110799715, + "learning_rate": 1.7781415258297957e-05, + "loss": 0.7321, + "step": 1764 + }, + { + "epoch": 0.8725744654554444, + "grad_norm": 0.22929356305934723, + "learning_rate": 1.777896895732599e-05, + "loss": 0.7044, + "step": 1765 + }, + { + "epoch": 0.8730688419231245, + "grad_norm": 0.16977975741876977, + "learning_rate": 1.7776521476875608e-05, + "loss": 0.6776, + "step": 1766 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 0.16501659651897221, + "learning_rate": 1.77740728173179e-05, + "loss": 0.698, + "step": 1767 + }, + { + "epoch": 0.8740575948584848, + "grad_norm": 0.23152321209835133, + "learning_rate": 1.7771622979024145e-05, + "loss": 0.7121, + "step": 1768 + }, + { + "epoch": 0.8745519713261649, + "grad_norm": 0.14997786595011092, + "learning_rate": 1.7769171962365797e-05, + "loss": 0.6958, + "step": 1769 + }, + { + "epoch": 0.875046347793845, + "grad_norm": 0.16820863586843973, + "learning_rate": 1.776671976771449e-05, + "loss": 0.6486, + "step": 1770 + }, + { + "epoch": 0.8755407242615252, + "grad_norm": 0.15016956512277996, + "learning_rate": 1.7764266395442033e-05, + "loss": 0.6966, + "step": 1771 + }, + { + "epoch": 0.8760351007292053, + "grad_norm": 0.15217540177073785, + "learning_rate": 1.776181184592042e-05, + "loss": 0.722, + "step": 1772 + }, + { + "epoch": 0.8765294771968855, + "grad_norm": 0.14422305832664672, + "learning_rate": 1.7759356119521815e-05, + "loss": 0.6769, + "step": 1773 + }, + { + "epoch": 0.8770238536645656, + "grad_norm": 0.15094231812406086, + "learning_rate": 1.775689921661857e-05, + "loss": 0.7279, + "step": 1774 + }, + { + "epoch": 0.8775182301322457, + "grad_norm": 0.1422541107776089, + "learning_rate": 1.7754441137583205e-05, + "loss": 0.6741, + "step": 1775 + }, + { + "epoch": 0.8780126065999259, + "grad_norm": 0.1406120462889535, + "learning_rate": 1.7751981882788427e-05, + "loss": 0.6854, + "step": 1776 + }, + { + "epoch": 0.878506983067606, + "grad_norm": 0.1528536845100775, + "learning_rate": 1.774952145260712e-05, + "loss": 0.7123, + "step": 1777 + }, + { + "epoch": 0.8790013595352861, + "grad_norm": 0.14058364927881772, + "learning_rate": 1.774705984741234e-05, + "loss": 0.6495, + "step": 1778 + }, + { + "epoch": 0.8794957360029663, + "grad_norm": 0.15562125362081453, + "learning_rate": 1.7744597067577327e-05, + "loss": 0.7119, + "step": 1779 + }, + { + "epoch": 0.8799901124706464, + "grad_norm": 0.16918475935198105, + "learning_rate": 1.7742133113475497e-05, + "loss": 0.6865, + "step": 1780 + }, + { + "epoch": 0.8804844889383265, + "grad_norm": 0.1416583960475054, + "learning_rate": 1.7739667985480447e-05, + "loss": 0.6949, + "step": 1781 + }, + { + "epoch": 0.8809788654060067, + "grad_norm": 0.1444812635405514, + "learning_rate": 1.773720168396595e-05, + "loss": 0.7223, + "step": 1782 + }, + { + "epoch": 0.8814732418736868, + "grad_norm": 0.13770232789341627, + "learning_rate": 1.773473420930595e-05, + "loss": 0.6802, + "step": 1783 + }, + { + "epoch": 0.8819676183413669, + "grad_norm": 0.14767136028310698, + "learning_rate": 1.7732265561874583e-05, + "loss": 0.7235, + "step": 1784 + }, + { + "epoch": 0.8824619948090471, + "grad_norm": 0.14207888628095705, + "learning_rate": 1.7729795742046148e-05, + "loss": 0.689, + "step": 1785 + }, + { + "epoch": 0.8829563712767272, + "grad_norm": 0.14708100308927624, + "learning_rate": 1.772732475019514e-05, + "loss": 0.7485, + "step": 1786 + }, + { + "epoch": 0.8834507477444073, + "grad_norm": 0.1466920132706352, + "learning_rate": 1.772485258669621e-05, + "loss": 0.7559, + "step": 1787 + }, + { + "epoch": 0.8839451242120875, + "grad_norm": 0.14965156052428835, + "learning_rate": 1.77223792519242e-05, + "loss": 0.6751, + "step": 1788 + }, + { + "epoch": 0.8844395006797676, + "grad_norm": 0.13709202564408013, + "learning_rate": 1.771990474625413e-05, + "loss": 0.6877, + "step": 1789 + }, + { + "epoch": 0.8849338771474478, + "grad_norm": 0.13550862365024596, + "learning_rate": 1.7717429070061195e-05, + "loss": 0.6571, + "step": 1790 + }, + { + "epoch": 0.8854282536151279, + "grad_norm": 0.14723513602561034, + "learning_rate": 1.771495222372076e-05, + "loss": 0.6934, + "step": 1791 + }, + { + "epoch": 0.885922630082808, + "grad_norm": 0.14737803153435236, + "learning_rate": 1.771247420760838e-05, + "loss": 0.7195, + "step": 1792 + }, + { + "epoch": 0.8864170065504882, + "grad_norm": 0.14855027407066865, + "learning_rate": 1.770999502209978e-05, + "loss": 0.6913, + "step": 1793 + }, + { + "epoch": 0.8869113830181683, + "grad_norm": 0.131697256977941, + "learning_rate": 1.7707514667570865e-05, + "loss": 0.6895, + "step": 1794 + }, + { + "epoch": 0.8874057594858484, + "grad_norm": 0.14144341449344255, + "learning_rate": 1.770503314439772e-05, + "loss": 0.6873, + "step": 1795 + }, + { + "epoch": 0.8879001359535286, + "grad_norm": 0.14288602860359548, + "learning_rate": 1.7702550452956593e-05, + "loss": 0.6996, + "step": 1796 + }, + { + "epoch": 0.8883945124212087, + "grad_norm": 1.00429345568483, + "learning_rate": 1.770006659362393e-05, + "loss": 0.7187, + "step": 1797 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.14935768039834935, + "learning_rate": 1.7697581566776338e-05, + "loss": 0.6774, + "step": 1798 + }, + { + "epoch": 0.889383265356569, + "grad_norm": 0.14012681852537295, + "learning_rate": 1.7695095372790607e-05, + "loss": 0.6666, + "step": 1799 + }, + { + "epoch": 0.8898776418242491, + "grad_norm": 0.14208434069639703, + "learning_rate": 1.7692608012043707e-05, + "loss": 0.6919, + "step": 1800 + }, + { + "epoch": 0.8903720182919294, + "grad_norm": 0.15309069417860555, + "learning_rate": 1.769011948491278e-05, + "loss": 0.6827, + "step": 1801 + }, + { + "epoch": 0.8908663947596095, + "grad_norm": 0.15335762209366957, + "learning_rate": 1.7687629791775146e-05, + "loss": 0.6574, + "step": 1802 + }, + { + "epoch": 0.8913607712272896, + "grad_norm": 0.4184076352564979, + "learning_rate": 1.76851389330083e-05, + "loss": 0.6609, + "step": 1803 + }, + { + "epoch": 0.8918551476949698, + "grad_norm": 0.15134850040583603, + "learning_rate": 1.7682646908989923e-05, + "loss": 0.694, + "step": 1804 + }, + { + "epoch": 0.8923495241626499, + "grad_norm": 0.32518378773454454, + "learning_rate": 1.7680153720097856e-05, + "loss": 0.6982, + "step": 1805 + }, + { + "epoch": 0.89284390063033, + "grad_norm": 0.15061080972228338, + "learning_rate": 1.7677659366710134e-05, + "loss": 0.6837, + "step": 1806 + }, + { + "epoch": 0.8933382770980102, + "grad_norm": 0.1505326342656472, + "learning_rate": 1.767516384920496e-05, + "loss": 0.6791, + "step": 1807 + }, + { + "epoch": 0.8938326535656903, + "grad_norm": 0.15209132578951146, + "learning_rate": 1.7672667167960705e-05, + "loss": 0.6664, + "step": 1808 + }, + { + "epoch": 0.8943270300333704, + "grad_norm": 0.15422003793713454, + "learning_rate": 1.767016932335594e-05, + "loss": 0.7172, + "step": 1809 + }, + { + "epoch": 0.8948214065010506, + "grad_norm": 0.14987779643904459, + "learning_rate": 1.7667670315769388e-05, + "loss": 0.6914, + "step": 1810 + }, + { + "epoch": 0.8953157829687307, + "grad_norm": 0.20418134891449433, + "learning_rate": 1.7665170145579965e-05, + "loss": 0.677, + "step": 1811 + }, + { + "epoch": 0.8958101594364108, + "grad_norm": 0.1450237664991632, + "learning_rate": 1.7662668813166753e-05, + "loss": 0.6974, + "step": 1812 + }, + { + "epoch": 0.896304535904091, + "grad_norm": 0.1419705729958329, + "learning_rate": 1.7660166318909014e-05, + "loss": 0.7027, + "step": 1813 + }, + { + "epoch": 0.8967989123717711, + "grad_norm": 0.13848452570644953, + "learning_rate": 1.7657662663186186e-05, + "loss": 0.652, + "step": 1814 + }, + { + "epoch": 0.8972932888394513, + "grad_norm": 0.15208758581671736, + "learning_rate": 1.7655157846377885e-05, + "loss": 0.6356, + "step": 1815 + }, + { + "epoch": 0.8977876653071314, + "grad_norm": 0.1451401622961971, + "learning_rate": 1.7652651868863904e-05, + "loss": 0.6544, + "step": 1816 + }, + { + "epoch": 0.8982820417748115, + "grad_norm": 0.13717695313033149, + "learning_rate": 1.7650144731024205e-05, + "loss": 0.6651, + "step": 1817 + }, + { + "epoch": 0.8987764182424917, + "grad_norm": 0.14994828099557359, + "learning_rate": 1.764763643323893e-05, + "loss": 0.6789, + "step": 1818 + }, + { + "epoch": 0.8992707947101718, + "grad_norm": 0.1350800357820652, + "learning_rate": 1.7645126975888396e-05, + "loss": 0.6707, + "step": 1819 + }, + { + "epoch": 0.8997651711778519, + "grad_norm": 0.2889304112443799, + "learning_rate": 1.76426163593531e-05, + "loss": 0.6911, + "step": 1820 + }, + { + "epoch": 0.9002595476455321, + "grad_norm": 0.22913734948929287, + "learning_rate": 1.7640104584013715e-05, + "loss": 0.7059, + "step": 1821 + }, + { + "epoch": 0.9007539241132122, + "grad_norm": 0.17054398494044662, + "learning_rate": 1.7637591650251077e-05, + "loss": 0.7148, + "step": 1822 + }, + { + "epoch": 0.9012483005808923, + "grad_norm": 0.19337170479651214, + "learning_rate": 1.7635077558446217e-05, + "loss": 0.6718, + "step": 1823 + }, + { + "epoch": 0.9017426770485725, + "grad_norm": 0.14968739353219937, + "learning_rate": 1.7632562308980327e-05, + "loss": 0.6785, + "step": 1824 + }, + { + "epoch": 0.9022370535162526, + "grad_norm": 0.16535639243228495, + "learning_rate": 1.763004590223478e-05, + "loss": 0.7078, + "step": 1825 + }, + { + "epoch": 0.9027314299839327, + "grad_norm": 0.15557121602808263, + "learning_rate": 1.762752833859112e-05, + "loss": 0.7078, + "step": 1826 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.15352201403453472, + "learning_rate": 1.7625009618431077e-05, + "loss": 0.7025, + "step": 1827 + }, + { + "epoch": 0.903720182919293, + "grad_norm": 0.16593838696926053, + "learning_rate": 1.7622489742136546e-05, + "loss": 0.6425, + "step": 1828 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 0.13863068546937327, + "learning_rate": 1.7619968710089597e-05, + "loss": 0.6619, + "step": 1829 + }, + { + "epoch": 0.9047089358546533, + "grad_norm": 0.14149439502439987, + "learning_rate": 1.7617446522672486e-05, + "loss": 0.6665, + "step": 1830 + }, + { + "epoch": 0.9052033123223334, + "grad_norm": 0.16934297519369243, + "learning_rate": 1.761492318026763e-05, + "loss": 0.703, + "step": 1831 + }, + { + "epoch": 0.9056976887900136, + "grad_norm": 0.13142149319593321, + "learning_rate": 1.7612398683257635e-05, + "loss": 0.6655, + "step": 1832 + }, + { + "epoch": 0.9061920652576937, + "grad_norm": 0.13956177201468895, + "learning_rate": 1.7609873032025274e-05, + "loss": 0.6404, + "step": 1833 + }, + { + "epoch": 0.9066864417253738, + "grad_norm": 0.24389333634220467, + "learning_rate": 1.760734622695349e-05, + "loss": 0.6905, + "step": 1834 + }, + { + "epoch": 0.907180818193054, + "grad_norm": 0.13910363454148866, + "learning_rate": 1.7604818268425412e-05, + "loss": 0.6793, + "step": 1835 + }, + { + "epoch": 0.9076751946607341, + "grad_norm": 0.14728313313617863, + "learning_rate": 1.760228915682434e-05, + "loss": 0.6921, + "step": 1836 + }, + { + "epoch": 0.9081695711284142, + "grad_norm": 0.13891749153046556, + "learning_rate": 1.759975889253375e-05, + "loss": 0.6621, + "step": 1837 + }, + { + "epoch": 0.9086639475960945, + "grad_norm": 0.5207143815499395, + "learning_rate": 1.7597227475937285e-05, + "loss": 0.6652, + "step": 1838 + }, + { + "epoch": 0.9091583240637746, + "grad_norm": 0.13692606346709169, + "learning_rate": 1.7594694907418773e-05, + "loss": 0.6643, + "step": 1839 + }, + { + "epoch": 0.9096527005314547, + "grad_norm": 0.16526578839417833, + "learning_rate": 1.7592161187362208e-05, + "loss": 0.6399, + "step": 1840 + }, + { + "epoch": 0.9101470769991349, + "grad_norm": 0.1514574606011257, + "learning_rate": 1.7589626316151767e-05, + "loss": 0.6851, + "step": 1841 + }, + { + "epoch": 0.910641453466815, + "grad_norm": 0.14091967955094123, + "learning_rate": 1.7587090294171797e-05, + "loss": 0.6585, + "step": 1842 + }, + { + "epoch": 0.9111358299344952, + "grad_norm": 0.14956766964794063, + "learning_rate": 1.7584553121806817e-05, + "loss": 0.7154, + "step": 1843 + }, + { + "epoch": 0.9116302064021753, + "grad_norm": 0.14337204524773223, + "learning_rate": 1.7582014799441524e-05, + "loss": 0.7211, + "step": 1844 + }, + { + "epoch": 0.9121245828698554, + "grad_norm": 0.13857540288138684, + "learning_rate": 1.757947532746079e-05, + "loss": 0.6471, + "step": 1845 + }, + { + "epoch": 0.9126189593375356, + "grad_norm": 0.14488461150786228, + "learning_rate": 1.757693470624966e-05, + "loss": 0.6738, + "step": 1846 + }, + { + "epoch": 0.9131133358052157, + "grad_norm": 0.22329732748225953, + "learning_rate": 1.7574392936193354e-05, + "loss": 0.7305, + "step": 1847 + }, + { + "epoch": 0.9136077122728958, + "grad_norm": 0.13869700269196608, + "learning_rate": 1.757185001767726e-05, + "loss": 0.6493, + "step": 1848 + }, + { + "epoch": 0.914102088740576, + "grad_norm": 0.14714520195785835, + "learning_rate": 1.756930595108695e-05, + "loss": 0.7034, + "step": 1849 + }, + { + "epoch": 0.9145964652082561, + "grad_norm": 0.13345706364953341, + "learning_rate": 1.7566760736808167e-05, + "loss": 0.6568, + "step": 1850 + }, + { + "epoch": 0.9150908416759362, + "grad_norm": 0.13771428380231332, + "learning_rate": 1.7564214375226822e-05, + "loss": 0.7098, + "step": 1851 + }, + { + "epoch": 0.9155852181436164, + "grad_norm": 0.15363096522433742, + "learning_rate": 1.7561666866729006e-05, + "loss": 0.6788, + "step": 1852 + }, + { + "epoch": 0.9160795946112965, + "grad_norm": 0.16656962822369384, + "learning_rate": 1.755911821170099e-05, + "loss": 0.6723, + "step": 1853 + }, + { + "epoch": 0.9165739710789766, + "grad_norm": 0.15802090734973992, + "learning_rate": 1.75565684105292e-05, + "loss": 0.7354, + "step": 1854 + }, + { + "epoch": 0.9170683475466568, + "grad_norm": 0.14704468521887665, + "learning_rate": 1.755401746360025e-05, + "loss": 0.6569, + "step": 1855 + }, + { + "epoch": 0.9175627240143369, + "grad_norm": 0.1426873442119077, + "learning_rate": 1.7551465371300928e-05, + "loss": 0.7144, + "step": 1856 + }, + { + "epoch": 0.918057100482017, + "grad_norm": 0.14041385864771153, + "learning_rate": 1.7548912134018193e-05, + "loss": 0.6903, + "step": 1857 + }, + { + "epoch": 0.9185514769496972, + "grad_norm": 0.14277011976085824, + "learning_rate": 1.7546357752139173e-05, + "loss": 0.6935, + "step": 1858 + }, + { + "epoch": 0.9190458534173773, + "grad_norm": 0.16707658010445897, + "learning_rate": 1.7543802226051178e-05, + "loss": 0.6665, + "step": 1859 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.13201334141788437, + "learning_rate": 1.754124555614168e-05, + "loss": 0.6464, + "step": 1860 + }, + { + "epoch": 0.9200346063527376, + "grad_norm": 0.14858372761342206, + "learning_rate": 1.753868774279834e-05, + "loss": 0.6804, + "step": 1861 + }, + { + "epoch": 0.9205289828204177, + "grad_norm": 0.143016285513508, + "learning_rate": 1.753612878640898e-05, + "loss": 0.6706, + "step": 1862 + }, + { + "epoch": 0.9210233592880979, + "grad_norm": 0.14072589066205946, + "learning_rate": 1.75335686873616e-05, + "loss": 0.6912, + "step": 1863 + }, + { + "epoch": 0.921517735755778, + "grad_norm": 0.13878646111395215, + "learning_rate": 1.7531007446044366e-05, + "loss": 0.6823, + "step": 1864 + }, + { + "epoch": 0.9220121122234581, + "grad_norm": 0.14052058128267778, + "learning_rate": 1.7528445062845636e-05, + "loss": 0.6702, + "step": 1865 + }, + { + "epoch": 0.9225064886911383, + "grad_norm": 0.13527856270236127, + "learning_rate": 1.752588153815392e-05, + "loss": 0.688, + "step": 1866 + }, + { + "epoch": 0.9230008651588184, + "grad_norm": 0.14396283554025668, + "learning_rate": 1.752331687235791e-05, + "loss": 0.713, + "step": 1867 + }, + { + "epoch": 0.9234952416264985, + "grad_norm": 0.17489307857047193, + "learning_rate": 1.7520751065846477e-05, + "loss": 0.6656, + "step": 1868 + }, + { + "epoch": 0.9239896180941787, + "grad_norm": 0.14846621682012212, + "learning_rate": 1.7518184119008655e-05, + "loss": 0.7157, + "step": 1869 + }, + { + "epoch": 0.9244839945618588, + "grad_norm": 0.17076353823830037, + "learning_rate": 1.7515616032233652e-05, + "loss": 0.6729, + "step": 1870 + }, + { + "epoch": 0.924978371029539, + "grad_norm": 0.13704825438353585, + "learning_rate": 1.7513046805910855e-05, + "loss": 0.6818, + "step": 1871 + }, + { + "epoch": 0.9254727474972191, + "grad_norm": 0.1336122695015489, + "learning_rate": 1.751047644042982e-05, + "loss": 0.6424, + "step": 1872 + }, + { + "epoch": 0.9259671239648992, + "grad_norm": 0.14208623863832576, + "learning_rate": 1.7507904936180275e-05, + "loss": 0.6952, + "step": 1873 + }, + { + "epoch": 0.9264615004325794, + "grad_norm": 0.1398900058103471, + "learning_rate": 1.7505332293552123e-05, + "loss": 0.7217, + "step": 1874 + }, + { + "epoch": 0.9269558769002596, + "grad_norm": 0.14273435236396723, + "learning_rate": 1.750275851293544e-05, + "loss": 0.675, + "step": 1875 + }, + { + "epoch": 0.9274502533679397, + "grad_norm": 0.15023069453965965, + "learning_rate": 1.750018359472047e-05, + "loss": 0.7441, + "step": 1876 + }, + { + "epoch": 0.9279446298356199, + "grad_norm": 0.13438944816826992, + "learning_rate": 1.749760753929763e-05, + "loss": 0.6559, + "step": 1877 + }, + { + "epoch": 0.9284390063033, + "grad_norm": 0.14667875501567385, + "learning_rate": 1.7495030347057516e-05, + "loss": 0.7138, + "step": 1878 + }, + { + "epoch": 0.9289333827709801, + "grad_norm": 0.13980631431923707, + "learning_rate": 1.7492452018390896e-05, + "loss": 0.6914, + "step": 1879 + }, + { + "epoch": 0.9294277592386603, + "grad_norm": 0.14094073055143136, + "learning_rate": 1.7489872553688697e-05, + "loss": 0.7095, + "step": 1880 + }, + { + "epoch": 0.9299221357063404, + "grad_norm": 0.13785975128035052, + "learning_rate": 1.7487291953342033e-05, + "loss": 0.6569, + "step": 1881 + }, + { + "epoch": 0.9304165121740205, + "grad_norm": 0.14359836467383957, + "learning_rate": 1.7484710217742185e-05, + "loss": 0.6976, + "step": 1882 + }, + { + "epoch": 0.9309108886417007, + "grad_norm": 0.13223816393054338, + "learning_rate": 1.748212734728061e-05, + "loss": 0.6685, + "step": 1883 + }, + { + "epoch": 0.9314052651093808, + "grad_norm": 0.1370307271179948, + "learning_rate": 1.7479543342348923e-05, + "loss": 0.6301, + "step": 1884 + }, + { + "epoch": 0.931899641577061, + "grad_norm": 0.1479020515603185, + "learning_rate": 1.7476958203338926e-05, + "loss": 0.6861, + "step": 1885 + }, + { + "epoch": 0.9323940180447411, + "grad_norm": 0.14137655305565788, + "learning_rate": 1.7474371930642594e-05, + "loss": 0.6805, + "step": 1886 + }, + { + "epoch": 0.9328883945124212, + "grad_norm": 0.14196265861738894, + "learning_rate": 1.7471784524652062e-05, + "loss": 0.7094, + "step": 1887 + }, + { + "epoch": 0.9333827709801014, + "grad_norm": 0.13589944152216218, + "learning_rate": 1.7469195985759643e-05, + "loss": 0.7112, + "step": 1888 + }, + { + "epoch": 0.9338771474477815, + "grad_norm": 0.15553113303013397, + "learning_rate": 1.7466606314357823e-05, + "loss": 0.6707, + "step": 1889 + }, + { + "epoch": 0.9343715239154616, + "grad_norm": 0.136672187913397, + "learning_rate": 1.7464015510839257e-05, + "loss": 0.7156, + "step": 1890 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 0.14895134784359462, + "learning_rate": 1.7461423575596775e-05, + "loss": 0.6912, + "step": 1891 + }, + { + "epoch": 0.9353602768508219, + "grad_norm": 0.13598441035421097, + "learning_rate": 1.7458830509023377e-05, + "loss": 0.6867, + "step": 1892 + }, + { + "epoch": 0.935854653318502, + "grad_norm": 0.1463392354018799, + "learning_rate": 1.745623631151223e-05, + "loss": 0.6807, + "step": 1893 + }, + { + "epoch": 0.9363490297861822, + "grad_norm": 0.13675366427150176, + "learning_rate": 1.745364098345668e-05, + "loss": 0.6553, + "step": 1894 + }, + { + "epoch": 0.9368434062538623, + "grad_norm": 0.16396407582593778, + "learning_rate": 1.745104452525024e-05, + "loss": 0.644, + "step": 1895 + }, + { + "epoch": 0.9373377827215424, + "grad_norm": 0.14006877839243062, + "learning_rate": 1.7448446937286594e-05, + "loss": 0.7029, + "step": 1896 + }, + { + "epoch": 0.9378321591892226, + "grad_norm": 0.13240970241568717, + "learning_rate": 1.7445848219959606e-05, + "loss": 0.7004, + "step": 1897 + }, + { + "epoch": 0.9383265356569027, + "grad_norm": 0.13182669599368904, + "learning_rate": 1.7443248373663293e-05, + "loss": 0.6803, + "step": 1898 + }, + { + "epoch": 0.9388209121245829, + "grad_norm": 0.13996860997631125, + "learning_rate": 1.7440647398791862e-05, + "loss": 0.7139, + "step": 1899 + }, + { + "epoch": 0.939315288592263, + "grad_norm": 0.1334897079634593, + "learning_rate": 1.7438045295739678e-05, + "loss": 0.6981, + "step": 1900 + }, + { + "epoch": 0.9398096650599431, + "grad_norm": 0.1486645051994342, + "learning_rate": 1.7435442064901288e-05, + "loss": 0.6855, + "step": 1901 + }, + { + "epoch": 0.9403040415276233, + "grad_norm": 0.14478773003492262, + "learning_rate": 1.74328377066714e-05, + "loss": 0.6861, + "step": 1902 + }, + { + "epoch": 0.9407984179953034, + "grad_norm": 0.13280504136025584, + "learning_rate": 1.74302322214449e-05, + "loss": 0.6546, + "step": 1903 + }, + { + "epoch": 0.9412927944629835, + "grad_norm": 0.1438344342530702, + "learning_rate": 1.742762560961684e-05, + "loss": 0.7012, + "step": 1904 + }, + { + "epoch": 0.9417871709306637, + "grad_norm": 0.13265663314210227, + "learning_rate": 1.7425017871582442e-05, + "loss": 0.6506, + "step": 1905 + }, + { + "epoch": 0.9422815473983438, + "grad_norm": 0.14059466511172297, + "learning_rate": 1.742240900773711e-05, + "loss": 0.6894, + "step": 1906 + }, + { + "epoch": 0.9427759238660239, + "grad_norm": 0.14344657737287667, + "learning_rate": 1.7419799018476404e-05, + "loss": 0.69, + "step": 1907 + }, + { + "epoch": 0.9432703003337041, + "grad_norm": 0.13832887950482223, + "learning_rate": 1.741718790419606e-05, + "loss": 0.7086, + "step": 1908 + }, + { + "epoch": 0.9437646768013842, + "grad_norm": 0.13992200166559302, + "learning_rate": 1.741457566529199e-05, + "loss": 0.7087, + "step": 1909 + }, + { + "epoch": 0.9442590532690643, + "grad_norm": 0.12983597076056805, + "learning_rate": 1.741196230216027e-05, + "loss": 0.6685, + "step": 1910 + }, + { + "epoch": 0.9447534297367445, + "grad_norm": 0.1428036615494095, + "learning_rate": 1.7409347815197148e-05, + "loss": 0.6789, + "step": 1911 + }, + { + "epoch": 0.9452478062044247, + "grad_norm": 0.1437331005531198, + "learning_rate": 1.7406732204799046e-05, + "loss": 0.6576, + "step": 1912 + }, + { + "epoch": 0.9457421826721049, + "grad_norm": 0.13736035155819978, + "learning_rate": 1.7404115471362552e-05, + "loss": 0.653, + "step": 1913 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 0.13876891405809927, + "learning_rate": 1.740149761528442e-05, + "loss": 0.6653, + "step": 1914 + }, + { + "epoch": 0.9467309356074651, + "grad_norm": 0.16271275815120392, + "learning_rate": 1.739887863696159e-05, + "loss": 0.6863, + "step": 1915 + }, + { + "epoch": 0.9472253120751453, + "grad_norm": 0.1375912454842742, + "learning_rate": 1.7396258536791152e-05, + "loss": 0.6991, + "step": 1916 + }, + { + "epoch": 0.9477196885428254, + "grad_norm": 0.1374479468297318, + "learning_rate": 1.7393637315170385e-05, + "loss": 0.6196, + "step": 1917 + }, + { + "epoch": 0.9482140650105055, + "grad_norm": 0.13622392702263209, + "learning_rate": 1.739101497249672e-05, + "loss": 0.6786, + "step": 1918 + }, + { + "epoch": 0.9487084414781857, + "grad_norm": 0.1362392762619839, + "learning_rate": 1.7388391509167774e-05, + "loss": 0.7207, + "step": 1919 + }, + { + "epoch": 0.9492028179458658, + "grad_norm": 0.16976231277159512, + "learning_rate": 1.7385766925581327e-05, + "loss": 0.7271, + "step": 1920 + }, + { + "epoch": 0.9496971944135459, + "grad_norm": 0.1390933929599052, + "learning_rate": 1.7383141222135324e-05, + "loss": 0.6881, + "step": 1921 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 0.1338502563883833, + "learning_rate": 1.7380514399227888e-05, + "loss": 0.6891, + "step": 1922 + }, + { + "epoch": 0.9506859473489062, + "grad_norm": 0.1379582938328988, + "learning_rate": 1.737788645725731e-05, + "loss": 0.691, + "step": 1923 + }, + { + "epoch": 0.9511803238165863, + "grad_norm": 0.13572860538678136, + "learning_rate": 1.7375257396622044e-05, + "loss": 0.6411, + "step": 1924 + }, + { + "epoch": 0.9516747002842665, + "grad_norm": 0.13790555286181785, + "learning_rate": 1.7372627217720723e-05, + "loss": 0.6769, + "step": 1925 + }, + { + "epoch": 0.9521690767519466, + "grad_norm": 0.14446993399999022, + "learning_rate": 1.736999592095214e-05, + "loss": 0.6879, + "step": 1926 + }, + { + "epoch": 0.9526634532196268, + "grad_norm": 0.13666996665298053, + "learning_rate": 1.736736350671527e-05, + "loss": 0.7081, + "step": 1927 + }, + { + "epoch": 0.9531578296873069, + "grad_norm": 0.14895807429916266, + "learning_rate": 1.736472997540925e-05, + "loss": 0.679, + "step": 1928 + }, + { + "epoch": 0.953652206154987, + "grad_norm": 0.14335677960941431, + "learning_rate": 1.736209532743338e-05, + "loss": 0.6538, + "step": 1929 + }, + { + "epoch": 0.9541465826226672, + "grad_norm": 0.1907776568921772, + "learning_rate": 1.735945956318714e-05, + "loss": 0.6798, + "step": 1930 + }, + { + "epoch": 0.9546409590903473, + "grad_norm": 0.13898763613899556, + "learning_rate": 1.7356822683070174e-05, + "loss": 0.7033, + "step": 1931 + }, + { + "epoch": 0.9551353355580274, + "grad_norm": 0.1455695572105377, + "learning_rate": 1.7354184687482294e-05, + "loss": 0.6783, + "step": 1932 + }, + { + "epoch": 0.9556297120257076, + "grad_norm": 0.14004043821549447, + "learning_rate": 1.735154557682349e-05, + "loss": 0.6805, + "step": 1933 + }, + { + "epoch": 0.9561240884933877, + "grad_norm": 0.1443915630449077, + "learning_rate": 1.734890535149391e-05, + "loss": 0.6606, + "step": 1934 + }, + { + "epoch": 0.9566184649610678, + "grad_norm": 0.15589752093941014, + "learning_rate": 1.7346264011893876e-05, + "loss": 0.7304, + "step": 1935 + }, + { + "epoch": 0.957112841428748, + "grad_norm": 0.14061638326345685, + "learning_rate": 1.7343621558423877e-05, + "loss": 0.6733, + "step": 1936 + }, + { + "epoch": 0.9576072178964281, + "grad_norm": 0.1476148892329012, + "learning_rate": 1.7340977991484577e-05, + "loss": 0.6851, + "step": 1937 + }, + { + "epoch": 0.9581015943641082, + "grad_norm": 0.14614126057383164, + "learning_rate": 1.73383333114768e-05, + "loss": 0.6357, + "step": 1938 + }, + { + "epoch": 0.9585959708317884, + "grad_norm": 0.1338914034448092, + "learning_rate": 1.7335687518801543e-05, + "loss": 0.6645, + "step": 1939 + }, + { + "epoch": 0.9590903472994685, + "grad_norm": 0.13732733899650593, + "learning_rate": 1.7333040613859974e-05, + "loss": 0.6851, + "step": 1940 + }, + { + "epoch": 0.9595847237671487, + "grad_norm": 0.14157264939552466, + "learning_rate": 1.7330392597053423e-05, + "loss": 0.6538, + "step": 1941 + }, + { + "epoch": 0.9600791002348288, + "grad_norm": 0.13628729116767543, + "learning_rate": 1.73277434687834e-05, + "loss": 0.6415, + "step": 1942 + }, + { + "epoch": 0.9605734767025089, + "grad_norm": 0.1363819511806482, + "learning_rate": 1.732509322945157e-05, + "loss": 0.6896, + "step": 1943 + }, + { + "epoch": 0.9610678531701891, + "grad_norm": 0.1390978501079898, + "learning_rate": 1.7322441879459778e-05, + "loss": 0.7069, + "step": 1944 + }, + { + "epoch": 0.9615622296378692, + "grad_norm": 0.1385979721457932, + "learning_rate": 1.731978941921003e-05, + "loss": 0.7017, + "step": 1945 + }, + { + "epoch": 0.9620566061055493, + "grad_norm": 0.14535453781769242, + "learning_rate": 1.73171358491045e-05, + "loss": 0.6756, + "step": 1946 + }, + { + "epoch": 0.9625509825732295, + "grad_norm": 0.13749986885329793, + "learning_rate": 1.731448116954554e-05, + "loss": 0.6344, + "step": 1947 + }, + { + "epoch": 0.9630453590409096, + "grad_norm": 0.1434402839441481, + "learning_rate": 1.7311825380935654e-05, + "loss": 0.6727, + "step": 1948 + }, + { + "epoch": 0.9635397355085898, + "grad_norm": 0.14167189653604417, + "learning_rate": 1.7309168483677527e-05, + "loss": 0.705, + "step": 1949 + }, + { + "epoch": 0.96403411197627, + "grad_norm": 0.13295465346700625, + "learning_rate": 1.7306510478174014e-05, + "loss": 0.6397, + "step": 1950 + }, + { + "epoch": 0.9645284884439501, + "grad_norm": 0.14434814479930264, + "learning_rate": 1.7303851364828124e-05, + "loss": 0.6605, + "step": 1951 + }, + { + "epoch": 0.9650228649116303, + "grad_norm": 0.13244794671052865, + "learning_rate": 1.7301191144043047e-05, + "loss": 0.6644, + "step": 1952 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.25476713238639803, + "learning_rate": 1.7298529816222134e-05, + "loss": 0.6775, + "step": 1953 + }, + { + "epoch": 0.9660116178469905, + "grad_norm": 0.1468472063285335, + "learning_rate": 1.7295867381768908e-05, + "loss": 0.6648, + "step": 1954 + }, + { + "epoch": 0.9665059943146707, + "grad_norm": 0.1508267571808235, + "learning_rate": 1.7293203841087058e-05, + "loss": 0.6663, + "step": 1955 + }, + { + "epoch": 0.9670003707823508, + "grad_norm": 0.1464405111500648, + "learning_rate": 1.729053919458044e-05, + "loss": 0.702, + "step": 1956 + }, + { + "epoch": 0.9674947472500309, + "grad_norm": 0.13918463381989898, + "learning_rate": 1.728787344265308e-05, + "loss": 0.6885, + "step": 1957 + }, + { + "epoch": 0.9679891237177111, + "grad_norm": 0.1496089239809398, + "learning_rate": 1.728520658570917e-05, + "loss": 0.6545, + "step": 1958 + }, + { + "epoch": 0.9684835001853912, + "grad_norm": 0.13590877478370772, + "learning_rate": 1.7282538624153066e-05, + "loss": 0.6263, + "step": 1959 + }, + { + "epoch": 0.9689778766530713, + "grad_norm": 0.15173719016243267, + "learning_rate": 1.7279869558389295e-05, + "loss": 0.7037, + "step": 1960 + }, + { + "epoch": 0.9694722531207515, + "grad_norm": 0.15552358688448115, + "learning_rate": 1.7277199388822555e-05, + "loss": 0.6684, + "step": 1961 + }, + { + "epoch": 0.9699666295884316, + "grad_norm": 0.14317836688064053, + "learning_rate": 1.7274528115857707e-05, + "loss": 0.6915, + "step": 1962 + }, + { + "epoch": 0.9704610060561117, + "grad_norm": 0.1618316646114628, + "learning_rate": 1.727185573989978e-05, + "loss": 0.628, + "step": 1963 + }, + { + "epoch": 0.9709553825237919, + "grad_norm": 0.13824687531389435, + "learning_rate": 1.726918226135397e-05, + "loss": 0.6926, + "step": 1964 + }, + { + "epoch": 0.971449758991472, + "grad_norm": 0.14076834245539138, + "learning_rate": 1.7266507680625638e-05, + "loss": 0.6694, + "step": 1965 + }, + { + "epoch": 0.9719441354591521, + "grad_norm": 0.14415567401109305, + "learning_rate": 1.726383199812032e-05, + "loss": 0.6697, + "step": 1966 + }, + { + "epoch": 0.9724385119268323, + "grad_norm": 0.14453149213250374, + "learning_rate": 1.7261155214243706e-05, + "loss": 0.6854, + "step": 1967 + }, + { + "epoch": 0.9729328883945124, + "grad_norm": 0.14052632000936732, + "learning_rate": 1.725847732940167e-05, + "loss": 0.664, + "step": 1968 + }, + { + "epoch": 0.9734272648621926, + "grad_norm": 0.13874037266920894, + "learning_rate": 1.7255798344000235e-05, + "loss": 0.6952, + "step": 1969 + }, + { + "epoch": 0.9739216413298727, + "grad_norm": 0.14742962420450037, + "learning_rate": 1.725311825844561e-05, + "loss": 0.667, + "step": 1970 + }, + { + "epoch": 0.9744160177975528, + "grad_norm": 0.13969423254009256, + "learning_rate": 1.7250437073144146e-05, + "loss": 0.6411, + "step": 1971 + }, + { + "epoch": 0.974910394265233, + "grad_norm": 0.14692246118137253, + "learning_rate": 1.724775478850239e-05, + "loss": 0.7027, + "step": 1972 + }, + { + "epoch": 0.9754047707329131, + "grad_norm": 0.1388971865840868, + "learning_rate": 1.724507140492703e-05, + "loss": 0.6732, + "step": 1973 + }, + { + "epoch": 0.9758991472005932, + "grad_norm": 0.1437097530514817, + "learning_rate": 1.7242386922824935e-05, + "loss": 0.6685, + "step": 1974 + }, + { + "epoch": 0.9763935236682734, + "grad_norm": 0.3579006920573024, + "learning_rate": 1.7239701342603136e-05, + "loss": 0.7534, + "step": 1975 + }, + { + "epoch": 0.9768879001359535, + "grad_norm": 0.14164911320977194, + "learning_rate": 1.7237014664668833e-05, + "loss": 0.7246, + "step": 1976 + }, + { + "epoch": 0.9773822766036336, + "grad_norm": 0.14195014259143635, + "learning_rate": 1.7234326889429385e-05, + "loss": 0.7164, + "step": 1977 + }, + { + "epoch": 0.9778766530713138, + "grad_norm": 0.15251277424805362, + "learning_rate": 1.7231638017292337e-05, + "loss": 0.6535, + "step": 1978 + }, + { + "epoch": 0.9783710295389939, + "grad_norm": 0.14355967452136184, + "learning_rate": 1.7228948048665375e-05, + "loss": 0.6743, + "step": 1979 + }, + { + "epoch": 0.978865406006674, + "grad_norm": 0.15948508643065223, + "learning_rate": 1.7226256983956363e-05, + "loss": 0.6634, + "step": 1980 + }, + { + "epoch": 0.9793597824743542, + "grad_norm": 0.14014831492307925, + "learning_rate": 1.7223564823573337e-05, + "loss": 0.6793, + "step": 1981 + }, + { + "epoch": 0.9798541589420343, + "grad_norm": 0.13887377088450145, + "learning_rate": 1.7220871567924492e-05, + "loss": 0.6659, + "step": 1982 + }, + { + "epoch": 0.9803485354097144, + "grad_norm": 0.15196331203496766, + "learning_rate": 1.7218177217418183e-05, + "loss": 0.6788, + "step": 1983 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 0.13719845113997411, + "learning_rate": 1.7215481772462944e-05, + "loss": 0.7136, + "step": 1984 + }, + { + "epoch": 0.9813372883450747, + "grad_norm": 0.13986190102114243, + "learning_rate": 1.721278523346747e-05, + "loss": 0.6743, + "step": 1985 + }, + { + "epoch": 0.981831664812755, + "grad_norm": 0.14545356674509882, + "learning_rate": 1.721008760084062e-05, + "loss": 0.6441, + "step": 1986 + }, + { + "epoch": 0.9823260412804351, + "grad_norm": 0.14417809563987954, + "learning_rate": 1.720738887499142e-05, + "loss": 0.7026, + "step": 1987 + }, + { + "epoch": 0.9828204177481152, + "grad_norm": 0.14854820498656426, + "learning_rate": 1.7204689056329058e-05, + "loss": 0.6739, + "step": 1988 + }, + { + "epoch": 0.9833147942157954, + "grad_norm": 0.14340214620131106, + "learning_rate": 1.7201988145262897e-05, + "loss": 0.6727, + "step": 1989 + }, + { + "epoch": 0.9838091706834755, + "grad_norm": 0.14596571267976646, + "learning_rate": 1.7199286142202454e-05, + "loss": 0.6918, + "step": 1990 + }, + { + "epoch": 0.9843035471511556, + "grad_norm": 0.13943230912830337, + "learning_rate": 1.7196583047557425e-05, + "loss": 0.6817, + "step": 1991 + }, + { + "epoch": 0.9847979236188358, + "grad_norm": 0.13891263008871266, + "learning_rate": 1.7193878861737657e-05, + "loss": 0.6613, + "step": 1992 + }, + { + "epoch": 0.9852923000865159, + "grad_norm": 0.13942538907662405, + "learning_rate": 1.7191173585153174e-05, + "loss": 0.6509, + "step": 1993 + }, + { + "epoch": 0.985786676554196, + "grad_norm": 0.13693443747072204, + "learning_rate": 1.718846721821416e-05, + "loss": 0.6843, + "step": 1994 + }, + { + "epoch": 0.9862810530218762, + "grad_norm": 0.14138910721795822, + "learning_rate": 1.718575976133096e-05, + "loss": 0.6472, + "step": 1995 + }, + { + "epoch": 0.9867754294895563, + "grad_norm": 0.15303914837400656, + "learning_rate": 1.7183051214914096e-05, + "loss": 0.743, + "step": 1996 + }, + { + "epoch": 0.9872698059572365, + "grad_norm": 0.1351747411754208, + "learning_rate": 1.7180341579374244e-05, + "loss": 0.6719, + "step": 1997 + }, + { + "epoch": 0.9877641824249166, + "grad_norm": 0.15162811180685087, + "learning_rate": 1.7177630855122256e-05, + "loss": 0.683, + "step": 1998 + }, + { + "epoch": 0.9882585588925967, + "grad_norm": 0.1363642393483218, + "learning_rate": 1.7174919042569137e-05, + "loss": 0.6747, + "step": 1999 + }, + { + "epoch": 0.9887529353602769, + "grad_norm": 0.1477252681994416, + "learning_rate": 1.7172206142126068e-05, + "loss": 0.7036, + "step": 2000 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 0.13627488139108843, + "learning_rate": 1.7169492154204385e-05, + "loss": 0.5953, + "step": 2001 + }, + { + "epoch": 0.9897416882956371, + "grad_norm": 0.14089892162576437, + "learning_rate": 1.7166777079215595e-05, + "loss": 0.6755, + "step": 2002 + }, + { + "epoch": 0.9902360647633173, + "grad_norm": 0.1412495421144976, + "learning_rate": 1.716406091757137e-05, + "loss": 0.695, + "step": 2003 + }, + { + "epoch": 0.9907304412309974, + "grad_norm": 0.14163443153293265, + "learning_rate": 1.7161343669683542e-05, + "loss": 0.6585, + "step": 2004 + }, + { + "epoch": 0.9912248176986775, + "grad_norm": 0.14041421574091667, + "learning_rate": 1.7158625335964116e-05, + "loss": 0.6953, + "step": 2005 + }, + { + "epoch": 0.9917191941663577, + "grad_norm": 0.14370794738287693, + "learning_rate": 1.7155905916825253e-05, + "loss": 0.6881, + "step": 2006 + }, + { + "epoch": 0.9922135706340378, + "grad_norm": 0.13664254289956032, + "learning_rate": 1.7153185412679283e-05, + "loss": 0.6824, + "step": 2007 + }, + { + "epoch": 0.992707947101718, + "grad_norm": 0.14703169859900345, + "learning_rate": 1.7150463823938702e-05, + "loss": 0.6458, + "step": 2008 + }, + { + "epoch": 0.9932023235693981, + "grad_norm": 0.13820475866958015, + "learning_rate": 1.7147741151016163e-05, + "loss": 0.6866, + "step": 2009 + }, + { + "epoch": 0.9936967000370782, + "grad_norm": 0.14378303954039304, + "learning_rate": 1.7145017394324495e-05, + "loss": 0.6934, + "step": 2010 + }, + { + "epoch": 0.9941910765047584, + "grad_norm": 0.15318168019474904, + "learning_rate": 1.7142292554276678e-05, + "loss": 0.6646, + "step": 2011 + }, + { + "epoch": 0.9946854529724385, + "grad_norm": 0.13583421310589702, + "learning_rate": 1.7139566631285868e-05, + "loss": 0.6476, + "step": 2012 + }, + { + "epoch": 0.9951798294401186, + "grad_norm": 0.13656232723411818, + "learning_rate": 1.713683962576538e-05, + "loss": 0.6726, + "step": 2013 + }, + { + "epoch": 0.9956742059077988, + "grad_norm": 4.125180296027735, + "learning_rate": 1.7134111538128694e-05, + "loss": 0.7322, + "step": 2014 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 0.15524827766823218, + "learning_rate": 1.713138236878945e-05, + "loss": 0.664, + "step": 2015 + }, + { + "epoch": 0.996662958843159, + "grad_norm": 0.1509447030917703, + "learning_rate": 1.7128652118161458e-05, + "loss": 0.6829, + "step": 2016 + }, + { + "epoch": 0.9971573353108392, + "grad_norm": 0.15246297934920644, + "learning_rate": 1.7125920786658688e-05, + "loss": 0.7138, + "step": 2017 + }, + { + "epoch": 0.9976517117785193, + "grad_norm": 0.1527345018118034, + "learning_rate": 1.712318837469528e-05, + "loss": 0.6244, + "step": 2018 + }, + { + "epoch": 0.9981460882461994, + "grad_norm": 0.16253452819327988, + "learning_rate": 1.712045488268553e-05, + "loss": 0.6663, + "step": 2019 + }, + { + "epoch": 0.9986404647138796, + "grad_norm": 0.17189986506613708, + "learning_rate": 1.7117720311043897e-05, + "loss": 0.7003, + "step": 2020 + }, + { + "epoch": 0.9991348411815597, + "grad_norm": 0.14649032042579488, + "learning_rate": 1.7114984660185015e-05, + "loss": 0.6774, + "step": 2021 + }, + { + "epoch": 0.9996292176492398, + "grad_norm": 0.14929872208676423, + "learning_rate": 1.711224793052367e-05, + "loss": 0.6886, + "step": 2022 + }, + { + "epoch": 1.0, + "grad_norm": 0.14929872208676423, + "learning_rate": 1.7109510122474818e-05, + "loss": 0.6656, + "step": 2023 + }, + { + "epoch": 1.0004943764676801, + "grad_norm": 0.18178389290822858, + "learning_rate": 1.710677123645357e-05, + "loss": 0.6007, + "step": 2024 + }, + { + "epoch": 1.0004943764676801, + "eval_loss": 0.6826846599578857, + "eval_runtime": 81.7115, + "eval_samples_per_second": 371.478, + "eval_steps_per_second": 46.444, + "step": 2024 + }, + { + "epoch": 1.0009887529353603, + "grad_norm": 0.14846450285715862, + "learning_rate": 1.710403127287522e-05, + "loss": 0.6413, + "step": 2025 + }, + { + "epoch": 1.0014831294030404, + "grad_norm": 0.15637083658584916, + "learning_rate": 1.71012902321552e-05, + "loss": 0.65, + "step": 2026 + }, + { + "epoch": 1.0019775058707205, + "grad_norm": 0.16700718254849864, + "learning_rate": 1.7098548114709122e-05, + "loss": 0.5984, + "step": 2027 + }, + { + "epoch": 1.0024718823384007, + "grad_norm": 0.15601939053167585, + "learning_rate": 1.7095804920952758e-05, + "loss": 0.6356, + "step": 2028 + }, + { + "epoch": 1.0029662588060808, + "grad_norm": 0.15774047402831795, + "learning_rate": 1.7093060651302042e-05, + "loss": 0.6429, + "step": 2029 + }, + { + "epoch": 1.003460635273761, + "grad_norm": 0.1486088545851433, + "learning_rate": 1.7090315306173068e-05, + "loss": 0.6108, + "step": 2030 + }, + { + "epoch": 1.003955011741441, + "grad_norm": 0.15438900885712412, + "learning_rate": 1.70875688859821e-05, + "loss": 0.6464, + "step": 2031 + }, + { + "epoch": 1.0044493882091212, + "grad_norm": 0.14987464202437, + "learning_rate": 1.7084821391145558e-05, + "loss": 0.6255, + "step": 2032 + }, + { + "epoch": 1.0049437646768014, + "grad_norm": 0.17907578816698233, + "learning_rate": 1.708207282208003e-05, + "loss": 0.6395, + "step": 2033 + }, + { + "epoch": 1.0054381411444815, + "grad_norm": 0.1574066241076069, + "learning_rate": 1.7079323179202262e-05, + "loss": 0.6475, + "step": 2034 + }, + { + "epoch": 1.0059325176121616, + "grad_norm": 0.14126597172891686, + "learning_rate": 1.7076572462929173e-05, + "loss": 0.6227, + "step": 2035 + }, + { + "epoch": 1.0064268940798418, + "grad_norm": 0.14480676837314913, + "learning_rate": 1.7073820673677833e-05, + "loss": 0.6503, + "step": 2036 + }, + { + "epoch": 1.006921270547522, + "grad_norm": 0.14423929484158318, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.5651, + "step": 2037 + }, + { + "epoch": 1.007415647015202, + "grad_norm": 0.14313290777546908, + "learning_rate": 1.7068313877909507e-05, + "loss": 0.632, + "step": 2038 + }, + { + "epoch": 1.0079100234828822, + "grad_norm": 0.1474819888954858, + "learning_rate": 1.7065558872227488e-05, + "loss": 0.6357, + "step": 2039 + }, + { + "epoch": 1.0084043999505623, + "grad_norm": 0.14722137614244588, + "learning_rate": 1.706280279523714e-05, + "loss": 0.6627, + "step": 2040 + }, + { + "epoch": 1.0088987764182424, + "grad_norm": 0.141858920342838, + "learning_rate": 1.7060045647356357e-05, + "loss": 0.6082, + "step": 2041 + }, + { + "epoch": 1.0093931528859226, + "grad_norm": 0.4720202514351848, + "learning_rate": 1.7057287429003185e-05, + "loss": 0.6396, + "step": 2042 + }, + { + "epoch": 1.0098875293536027, + "grad_norm": 0.14843371119185922, + "learning_rate": 1.7054528140595835e-05, + "loss": 0.5925, + "step": 2043 + }, + { + "epoch": 1.0103819058212828, + "grad_norm": 0.15469140531771597, + "learning_rate": 1.705176778255268e-05, + "loss": 0.6904, + "step": 2044 + }, + { + "epoch": 1.010876282288963, + "grad_norm": 0.21972075693884788, + "learning_rate": 1.704900635529226e-05, + "loss": 0.6125, + "step": 2045 + }, + { + "epoch": 1.0113706587566431, + "grad_norm": 0.14274896656074304, + "learning_rate": 1.7046243859233275e-05, + "loss": 0.6625, + "step": 2046 + }, + { + "epoch": 1.0118650352243233, + "grad_norm": 0.15117931814759553, + "learning_rate": 1.7043480294794583e-05, + "loss": 0.6284, + "step": 2047 + }, + { + "epoch": 1.0123594116920034, + "grad_norm": 0.14384527860603671, + "learning_rate": 1.7040715662395207e-05, + "loss": 0.6289, + "step": 2048 + }, + { + "epoch": 1.0128537881596835, + "grad_norm": 0.1437617290676505, + "learning_rate": 1.703794996245433e-05, + "loss": 0.6302, + "step": 2049 + }, + { + "epoch": 1.0133481646273637, + "grad_norm": 0.14646054229392208, + "learning_rate": 1.7035183195391303e-05, + "loss": 0.6423, + "step": 2050 + }, + { + "epoch": 1.0138425410950438, + "grad_norm": 0.15338492758538472, + "learning_rate": 1.703241536162563e-05, + "loss": 0.6315, + "step": 2051 + }, + { + "epoch": 1.014336917562724, + "grad_norm": 0.13932610330602582, + "learning_rate": 1.7029646461576984e-05, + "loss": 0.6289, + "step": 2052 + }, + { + "epoch": 1.014831294030404, + "grad_norm": 0.1446643105932519, + "learning_rate": 1.7026876495665194e-05, + "loss": 0.602, + "step": 2053 + }, + { + "epoch": 1.0153256704980842, + "grad_norm": 0.14657326254671982, + "learning_rate": 1.7024105464310257e-05, + "loss": 0.5758, + "step": 2054 + }, + { + "epoch": 1.0158200469657643, + "grad_norm": 0.1303147933316635, + "learning_rate": 1.702133336793232e-05, + "loss": 0.627, + "step": 2055 + }, + { + "epoch": 1.0163144234334445, + "grad_norm": 0.14742701002279676, + "learning_rate": 1.701856020695171e-05, + "loss": 0.6191, + "step": 2056 + }, + { + "epoch": 1.0168087999011246, + "grad_norm": 0.14542363784250498, + "learning_rate": 1.7015785981788902e-05, + "loss": 0.6454, + "step": 2057 + }, + { + "epoch": 1.0173031763688047, + "grad_norm": 0.13414313638977474, + "learning_rate": 1.7013010692864527e-05, + "loss": 0.6513, + "step": 2058 + }, + { + "epoch": 1.0177975528364849, + "grad_norm": 0.15259487611752887, + "learning_rate": 1.7010234340599393e-05, + "loss": 0.6407, + "step": 2059 + }, + { + "epoch": 1.0182919293041652, + "grad_norm": 0.1504390655306874, + "learning_rate": 1.700745692541446e-05, + "loss": 0.6052, + "step": 2060 + }, + { + "epoch": 1.0187863057718454, + "grad_norm": 0.13932221853918075, + "learning_rate": 1.700467844773085e-05, + "loss": 0.6176, + "step": 2061 + }, + { + "epoch": 1.0192806822395255, + "grad_norm": 0.1438833606145575, + "learning_rate": 1.700189890796985e-05, + "loss": 0.6513, + "step": 2062 + }, + { + "epoch": 1.0197750587072056, + "grad_norm": 0.14406354128351442, + "learning_rate": 1.6999118306552903e-05, + "loss": 0.62, + "step": 2063 + }, + { + "epoch": 1.0202694351748858, + "grad_norm": 0.14818728894918318, + "learning_rate": 1.6996336643901613e-05, + "loss": 0.6486, + "step": 2064 + }, + { + "epoch": 1.020763811642566, + "grad_norm": 0.15742967981920486, + "learning_rate": 1.6993553920437745e-05, + "loss": 0.6877, + "step": 2065 + }, + { + "epoch": 1.021258188110246, + "grad_norm": 0.15570308134175215, + "learning_rate": 1.6990770136583234e-05, + "loss": 0.6457, + "step": 2066 + }, + { + "epoch": 1.0217525645779262, + "grad_norm": 0.141347362737822, + "learning_rate": 1.6987985292760163e-05, + "loss": 0.6439, + "step": 2067 + }, + { + "epoch": 1.0222469410456063, + "grad_norm": 0.1464516061793762, + "learning_rate": 1.6985199389390782e-05, + "loss": 0.6084, + "step": 2068 + }, + { + "epoch": 1.0227413175132865, + "grad_norm": 0.15126012405026856, + "learning_rate": 1.6982412426897505e-05, + "loss": 0.666, + "step": 2069 + }, + { + "epoch": 1.0232356939809666, + "grad_norm": 0.15127405200253896, + "learning_rate": 1.6979624405702895e-05, + "loss": 0.5997, + "step": 2070 + }, + { + "epoch": 1.0237300704486467, + "grad_norm": 0.154611048358368, + "learning_rate": 1.697683532622969e-05, + "loss": 0.6217, + "step": 2071 + }, + { + "epoch": 1.0242244469163269, + "grad_norm": 0.14500564642976985, + "learning_rate": 1.6974045188900775e-05, + "loss": 0.6105, + "step": 2072 + }, + { + "epoch": 1.024718823384007, + "grad_norm": 0.14859297071778094, + "learning_rate": 1.6971253994139205e-05, + "loss": 0.6385, + "step": 2073 + }, + { + "epoch": 1.0252131998516871, + "grad_norm": 0.14082158642230705, + "learning_rate": 1.696846174236819e-05, + "loss": 0.6381, + "step": 2074 + }, + { + "epoch": 1.0257075763193673, + "grad_norm": 0.1437357177041898, + "learning_rate": 1.6965668434011112e-05, + "loss": 0.6208, + "step": 2075 + }, + { + "epoch": 1.0262019527870474, + "grad_norm": 0.15700000830455182, + "learning_rate": 1.696287406949149e-05, + "loss": 0.6128, + "step": 2076 + }, + { + "epoch": 1.0266963292547275, + "grad_norm": 0.13314343049410926, + "learning_rate": 1.6960078649233024e-05, + "loss": 0.6174, + "step": 2077 + }, + { + "epoch": 1.0271907057224077, + "grad_norm": 0.15317895660189831, + "learning_rate": 1.6957282173659567e-05, + "loss": 0.6299, + "step": 2078 + }, + { + "epoch": 1.0276850821900878, + "grad_norm": 0.13986322073828114, + "learning_rate": 1.695448464319513e-05, + "loss": 0.6232, + "step": 2079 + }, + { + "epoch": 1.028179458657768, + "grad_norm": 0.15783234775852203, + "learning_rate": 1.6951686058263884e-05, + "loss": 0.6424, + "step": 2080 + }, + { + "epoch": 1.028673835125448, + "grad_norm": 0.1338606139523347, + "learning_rate": 1.6948886419290167e-05, + "loss": 0.5943, + "step": 2081 + }, + { + "epoch": 1.0291682115931282, + "grad_norm": 0.13681098452238408, + "learning_rate": 1.6946085726698465e-05, + "loss": 0.6214, + "step": 2082 + }, + { + "epoch": 1.0296625880608083, + "grad_norm": 0.14699359299629985, + "learning_rate": 1.6943283980913438e-05, + "loss": 0.6248, + "step": 2083 + }, + { + "epoch": 1.0301569645284885, + "grad_norm": 0.14034791266047772, + "learning_rate": 1.694048118235989e-05, + "loss": 0.5894, + "step": 2084 + }, + { + "epoch": 1.0306513409961686, + "grad_norm": 0.13830714670501834, + "learning_rate": 1.6937677331462796e-05, + "loss": 0.6231, + "step": 2085 + }, + { + "epoch": 1.0311457174638488, + "grad_norm": 0.13677520647900088, + "learning_rate": 1.693487242864729e-05, + "loss": 0.6047, + "step": 2086 + }, + { + "epoch": 1.031640093931529, + "grad_norm": 0.14144359148078606, + "learning_rate": 1.6932066474338658e-05, + "loss": 0.641, + "step": 2087 + }, + { + "epoch": 1.032134470399209, + "grad_norm": 0.1335122055109684, + "learning_rate": 1.692925946896235e-05, + "loss": 0.6486, + "step": 2088 + }, + { + "epoch": 1.0326288468668892, + "grad_norm": 0.1354196155606789, + "learning_rate": 1.6926451412943982e-05, + "loss": 0.6354, + "step": 2089 + }, + { + "epoch": 1.0331232233345693, + "grad_norm": 0.1455966755648798, + "learning_rate": 1.6923642306709312e-05, + "loss": 0.6273, + "step": 2090 + }, + { + "epoch": 1.0336175998022494, + "grad_norm": 0.14594970177701325, + "learning_rate": 1.6920832150684278e-05, + "loss": 0.6058, + "step": 2091 + }, + { + "epoch": 1.0341119762699296, + "grad_norm": 0.15339306441049158, + "learning_rate": 1.691802094529496e-05, + "loss": 0.6693, + "step": 2092 + }, + { + "epoch": 1.0346063527376097, + "grad_norm": 0.1432680468112502, + "learning_rate": 1.6915208690967607e-05, + "loss": 0.6614, + "step": 2093 + }, + { + "epoch": 1.0351007292052898, + "grad_norm": 0.1411338674699603, + "learning_rate": 1.6912395388128627e-05, + "loss": 0.6348, + "step": 2094 + }, + { + "epoch": 1.03559510567297, + "grad_norm": 0.14335754317302693, + "learning_rate": 1.690958103720458e-05, + "loss": 0.6507, + "step": 2095 + }, + { + "epoch": 1.03608948214065, + "grad_norm": 0.14682556995434728, + "learning_rate": 1.690676563862219e-05, + "loss": 0.6364, + "step": 2096 + }, + { + "epoch": 1.0365838586083302, + "grad_norm": 0.15122251173592147, + "learning_rate": 1.690394919280834e-05, + "loss": 0.7268, + "step": 2097 + }, + { + "epoch": 1.0370782350760104, + "grad_norm": 0.14751842540901094, + "learning_rate": 1.6901131700190073e-05, + "loss": 0.6387, + "step": 2098 + }, + { + "epoch": 1.0375726115436905, + "grad_norm": 0.17091256016060977, + "learning_rate": 1.6898313161194584e-05, + "loss": 0.6103, + "step": 2099 + }, + { + "epoch": 1.0380669880113707, + "grad_norm": 0.14356184114605985, + "learning_rate": 1.6895493576249235e-05, + "loss": 0.639, + "step": 2100 + }, + { + "epoch": 1.0385613644790508, + "grad_norm": 0.1454639489779529, + "learning_rate": 1.6892672945781537e-05, + "loss": 0.6433, + "step": 2101 + }, + { + "epoch": 1.039055740946731, + "grad_norm": 0.1510976077902503, + "learning_rate": 1.6889851270219172e-05, + "loss": 0.6337, + "step": 2102 + }, + { + "epoch": 1.039550117414411, + "grad_norm": 0.14245323173024674, + "learning_rate": 1.688702854998997e-05, + "loss": 0.6487, + "step": 2103 + }, + { + "epoch": 1.0400444938820912, + "grad_norm": 0.15283095993478668, + "learning_rate": 1.6884204785521924e-05, + "loss": 0.6249, + "step": 2104 + }, + { + "epoch": 1.0405388703497713, + "grad_norm": 0.1557344790245446, + "learning_rate": 1.688137997724319e-05, + "loss": 0.6223, + "step": 2105 + }, + { + "epoch": 1.0410332468174515, + "grad_norm": 0.13572576067755832, + "learning_rate": 1.6878554125582066e-05, + "loss": 0.6341, + "step": 2106 + }, + { + "epoch": 1.0415276232851316, + "grad_norm": 0.14801862568083812, + "learning_rate": 1.6875727230967025e-05, + "loss": 0.5999, + "step": 2107 + }, + { + "epoch": 1.0420219997528117, + "grad_norm": 0.14875103475740223, + "learning_rate": 1.687289929382669e-05, + "loss": 0.6699, + "step": 2108 + }, + { + "epoch": 1.0425163762204919, + "grad_norm": 0.13762061932681685, + "learning_rate": 1.6870070314589847e-05, + "loss": 0.6537, + "step": 2109 + }, + { + "epoch": 1.043010752688172, + "grad_norm": 0.1464411208334356, + "learning_rate": 1.6867240293685435e-05, + "loss": 0.606, + "step": 2110 + }, + { + "epoch": 1.0435051291558521, + "grad_norm": 0.14253364813778927, + "learning_rate": 1.6864409231542557e-05, + "loss": 0.6306, + "step": 2111 + }, + { + "epoch": 1.0439995056235323, + "grad_norm": 0.13863411778369947, + "learning_rate": 1.6861577128590465e-05, + "loss": 0.5894, + "step": 2112 + }, + { + "epoch": 1.0444938820912124, + "grad_norm": 0.1357013113648618, + "learning_rate": 1.6858743985258573e-05, + "loss": 0.6269, + "step": 2113 + }, + { + "epoch": 1.0449882585588925, + "grad_norm": 0.14690863238793925, + "learning_rate": 1.685590980197646e-05, + "loss": 0.6476, + "step": 2114 + }, + { + "epoch": 1.0454826350265727, + "grad_norm": 0.14497631388012228, + "learning_rate": 1.685307457917385e-05, + "loss": 0.6176, + "step": 2115 + }, + { + "epoch": 1.0459770114942528, + "grad_norm": 0.1344232081835438, + "learning_rate": 1.6850238317280633e-05, + "loss": 0.6497, + "step": 2116 + }, + { + "epoch": 1.046471387961933, + "grad_norm": 0.1426572910738216, + "learning_rate": 1.6847401016726858e-05, + "loss": 0.6329, + "step": 2117 + }, + { + "epoch": 1.046965764429613, + "grad_norm": 0.14299103850287956, + "learning_rate": 1.684456267794272e-05, + "loss": 0.6795, + "step": 2118 + }, + { + "epoch": 1.0474601408972932, + "grad_norm": 0.1443991554567319, + "learning_rate": 1.6841723301358586e-05, + "loss": 0.6366, + "step": 2119 + }, + { + "epoch": 1.0479545173649734, + "grad_norm": 0.13940648407265072, + "learning_rate": 1.683888288740497e-05, + "loss": 0.6155, + "step": 2120 + }, + { + "epoch": 1.0484488938326535, + "grad_norm": 0.13641821487741498, + "learning_rate": 1.683604143651255e-05, + "loss": 0.6326, + "step": 2121 + }, + { + "epoch": 1.0489432703003336, + "grad_norm": 0.13754791078584383, + "learning_rate": 1.6833198949112155e-05, + "loss": 0.6305, + "step": 2122 + }, + { + "epoch": 1.0494376467680138, + "grad_norm": 0.13552049011531464, + "learning_rate": 1.6830355425634775e-05, + "loss": 0.5833, + "step": 2123 + }, + { + "epoch": 1.049932023235694, + "grad_norm": 0.13416863915416766, + "learning_rate": 1.682751086651156e-05, + "loss": 0.6338, + "step": 2124 + }, + { + "epoch": 1.050426399703374, + "grad_norm": 0.13877945459434413, + "learning_rate": 1.6824665272173806e-05, + "loss": 0.6539, + "step": 2125 + }, + { + "epoch": 1.0509207761710542, + "grad_norm": 0.1471396210294033, + "learning_rate": 1.682181864305298e-05, + "loss": 0.6567, + "step": 2126 + }, + { + "epoch": 1.0514151526387343, + "grad_norm": 0.13994252942623173, + "learning_rate": 1.68189709795807e-05, + "loss": 0.6334, + "step": 2127 + }, + { + "epoch": 1.0519095291064144, + "grad_norm": 0.13997519969018743, + "learning_rate": 1.6816122282188735e-05, + "loss": 0.6254, + "step": 2128 + }, + { + "epoch": 1.0524039055740946, + "grad_norm": 0.14580072858945883, + "learning_rate": 1.681327255130902e-05, + "loss": 0.6316, + "step": 2129 + }, + { + "epoch": 1.0528982820417747, + "grad_norm": 0.14544641650435086, + "learning_rate": 1.6810421787373644e-05, + "loss": 0.6409, + "step": 2130 + }, + { + "epoch": 1.0533926585094548, + "grad_norm": 0.14453074956947962, + "learning_rate": 1.6807569990814842e-05, + "loss": 0.6441, + "step": 2131 + }, + { + "epoch": 1.053887034977135, + "grad_norm": 0.13823313572552595, + "learning_rate": 1.680471716206502e-05, + "loss": 0.5889, + "step": 2132 + }, + { + "epoch": 1.0543814114448151, + "grad_norm": 0.1436361818985018, + "learning_rate": 1.680186330155674e-05, + "loss": 0.625, + "step": 2133 + }, + { + "epoch": 1.0548757879124953, + "grad_norm": 0.13994651758176796, + "learning_rate": 1.6799008409722713e-05, + "loss": 0.6237, + "step": 2134 + }, + { + "epoch": 1.0553701643801756, + "grad_norm": 0.14408782519125585, + "learning_rate": 1.679615248699581e-05, + "loss": 0.6374, + "step": 2135 + }, + { + "epoch": 1.0558645408478557, + "grad_norm": 0.17391790166065335, + "learning_rate": 1.6793295533809054e-05, + "loss": 0.6358, + "step": 2136 + }, + { + "epoch": 1.0563589173155359, + "grad_norm": 0.14138736619073092, + "learning_rate": 1.6790437550595626e-05, + "loss": 0.6178, + "step": 2137 + }, + { + "epoch": 1.056853293783216, + "grad_norm": 0.14853917208619466, + "learning_rate": 1.678757853778887e-05, + "loss": 0.673, + "step": 2138 + }, + { + "epoch": 1.0573476702508962, + "grad_norm": 0.14165474730912692, + "learning_rate": 1.678471849582228e-05, + "loss": 0.6478, + "step": 2139 + }, + { + "epoch": 1.0578420467185763, + "grad_norm": 0.13693653370211833, + "learning_rate": 1.6781857425129507e-05, + "loss": 0.6096, + "step": 2140 + }, + { + "epoch": 1.0583364231862564, + "grad_norm": 0.1310989799182569, + "learning_rate": 1.6778995326144354e-05, + "loss": 0.6295, + "step": 2141 + }, + { + "epoch": 1.0588307996539366, + "grad_norm": 0.14837531050740607, + "learning_rate": 1.6776132199300792e-05, + "loss": 0.6302, + "step": 2142 + }, + { + "epoch": 1.0593251761216167, + "grad_norm": 0.14039758417755263, + "learning_rate": 1.677326804503293e-05, + "loss": 0.6351, + "step": 2143 + }, + { + "epoch": 1.0598195525892968, + "grad_norm": 0.1397551331483713, + "learning_rate": 1.677040286377505e-05, + "loss": 0.6283, + "step": 2144 + }, + { + "epoch": 1.060313929056977, + "grad_norm": 0.13779797130903837, + "learning_rate": 1.676753665596158e-05, + "loss": 0.6382, + "step": 2145 + }, + { + "epoch": 1.060808305524657, + "grad_norm": 0.1424055395244823, + "learning_rate": 1.6764669422027106e-05, + "loss": 0.6151, + "step": 2146 + }, + { + "epoch": 1.0613026819923372, + "grad_norm": 0.13692219153757618, + "learning_rate": 1.676180116240637e-05, + "loss": 0.661, + "step": 2147 + }, + { + "epoch": 1.0617970584600174, + "grad_norm": 0.13849773242017313, + "learning_rate": 1.6758931877534263e-05, + "loss": 0.6555, + "step": 2148 + }, + { + "epoch": 1.0622914349276975, + "grad_norm": 0.13693520807312165, + "learning_rate": 1.6756061567845843e-05, + "loss": 0.598, + "step": 2149 + }, + { + "epoch": 1.0627858113953776, + "grad_norm": 0.14434880749471832, + "learning_rate": 1.6753190233776323e-05, + "loss": 0.6247, + "step": 2150 + }, + { + "epoch": 1.0632801878630578, + "grad_norm": 0.1376692189182417, + "learning_rate": 1.6750317875761057e-05, + "loss": 0.6373, + "step": 2151 + }, + { + "epoch": 1.063774564330738, + "grad_norm": 0.13898087750800736, + "learning_rate": 1.6747444494235565e-05, + "loss": 0.6015, + "step": 2152 + }, + { + "epoch": 1.064268940798418, + "grad_norm": 0.13452295434189337, + "learning_rate": 1.6744570089635527e-05, + "loss": 0.6344, + "step": 2153 + }, + { + "epoch": 1.0647633172660982, + "grad_norm": 0.1591669098505732, + "learning_rate": 1.6741694662396763e-05, + "loss": 0.6168, + "step": 2154 + }, + { + "epoch": 1.0652576937337783, + "grad_norm": 0.13824932082386387, + "learning_rate": 1.6738818212955263e-05, + "loss": 0.6377, + "step": 2155 + }, + { + "epoch": 1.0657520702014585, + "grad_norm": 0.1410359282443109, + "learning_rate": 1.673594074174716e-05, + "loss": 0.6325, + "step": 2156 + }, + { + "epoch": 1.0662464466691386, + "grad_norm": 0.1497847241595414, + "learning_rate": 1.673306224920876e-05, + "loss": 0.6057, + "step": 2157 + }, + { + "epoch": 1.0667408231368187, + "grad_norm": 0.1382748559737145, + "learning_rate": 1.67301827357765e-05, + "loss": 0.6103, + "step": 2158 + }, + { + "epoch": 1.0672351996044989, + "grad_norm": 0.15236413597685797, + "learning_rate": 1.6727302201886986e-05, + "loss": 0.6487, + "step": 2159 + }, + { + "epoch": 1.067729576072179, + "grad_norm": 0.15035319688358026, + "learning_rate": 1.6724420647976972e-05, + "loss": 0.6108, + "step": 2160 + }, + { + "epoch": 1.0682239525398591, + "grad_norm": 0.14830146453788134, + "learning_rate": 1.6721538074483385e-05, + "loss": 0.6074, + "step": 2161 + }, + { + "epoch": 1.0687183290075393, + "grad_norm": 0.15154232258292055, + "learning_rate": 1.671865448184327e-05, + "loss": 0.6427, + "step": 2162 + }, + { + "epoch": 1.0692127054752194, + "grad_norm": 0.13275790643754765, + "learning_rate": 1.671576987049387e-05, + "loss": 0.6075, + "step": 2163 + }, + { + "epoch": 1.0697070819428995, + "grad_norm": 0.1430653787591778, + "learning_rate": 1.6712884240872555e-05, + "loss": 0.6077, + "step": 2164 + }, + { + "epoch": 1.0702014584105797, + "grad_norm": 0.1487059286541024, + "learning_rate": 1.670999759341685e-05, + "loss": 0.6204, + "step": 2165 + }, + { + "epoch": 1.0706958348782598, + "grad_norm": 0.1331919856233242, + "learning_rate": 1.670710992856444e-05, + "loss": 0.6153, + "step": 2166 + }, + { + "epoch": 1.07119021134594, + "grad_norm": 0.14431365903822319, + "learning_rate": 1.6704221246753172e-05, + "loss": 0.612, + "step": 2167 + }, + { + "epoch": 1.07168458781362, + "grad_norm": 0.14022911842201688, + "learning_rate": 1.6701331548421037e-05, + "loss": 0.6566, + "step": 2168 + }, + { + "epoch": 1.0721789642813002, + "grad_norm": 0.14483693851935758, + "learning_rate": 1.669844083400618e-05, + "loss": 0.6325, + "step": 2169 + }, + { + "epoch": 1.0726733407489804, + "grad_norm": 0.14157588843918725, + "learning_rate": 1.66955491039469e-05, + "loss": 0.6066, + "step": 2170 + }, + { + "epoch": 1.0731677172166605, + "grad_norm": 0.14088408483161177, + "learning_rate": 1.669265635868166e-05, + "loss": 0.6568, + "step": 2171 + }, + { + "epoch": 1.0736620936843406, + "grad_norm": 0.14714675677270148, + "learning_rate": 1.6689762598649063e-05, + "loss": 0.6327, + "step": 2172 + }, + { + "epoch": 1.0741564701520208, + "grad_norm": 0.1484344060611113, + "learning_rate": 1.6686867824287877e-05, + "loss": 0.6448, + "step": 2173 + }, + { + "epoch": 1.074650846619701, + "grad_norm": 0.14269795243264385, + "learning_rate": 1.668397203603702e-05, + "loss": 0.6319, + "step": 2174 + }, + { + "epoch": 1.075145223087381, + "grad_norm": 0.1434634664064872, + "learning_rate": 1.668107523433556e-05, + "loss": 0.6562, + "step": 2175 + }, + { + "epoch": 1.0756395995550612, + "grad_norm": 0.1492771692524354, + "learning_rate": 1.667817741962272e-05, + "loss": 0.6469, + "step": 2176 + }, + { + "epoch": 1.0761339760227413, + "grad_norm": 0.15090990786026823, + "learning_rate": 1.6675278592337885e-05, + "loss": 0.6492, + "step": 2177 + }, + { + "epoch": 1.0766283524904214, + "grad_norm": 0.14565380871993866, + "learning_rate": 1.6672378752920576e-05, + "loss": 0.6765, + "step": 2178 + }, + { + "epoch": 1.0771227289581016, + "grad_norm": 0.14735096159570607, + "learning_rate": 1.666947790181049e-05, + "loss": 0.6233, + "step": 2179 + }, + { + "epoch": 1.0776171054257817, + "grad_norm": 0.15177609620040602, + "learning_rate": 1.666657603944746e-05, + "loss": 0.6544, + "step": 2180 + }, + { + "epoch": 1.0781114818934618, + "grad_norm": 0.13938966512084866, + "learning_rate": 1.666367316627148e-05, + "loss": 0.5875, + "step": 2181 + }, + { + "epoch": 1.078605858361142, + "grad_norm": 0.1407445562834444, + "learning_rate": 1.6660769282722688e-05, + "loss": 0.6257, + "step": 2182 + }, + { + "epoch": 1.0791002348288221, + "grad_norm": 0.14003310868424065, + "learning_rate": 1.6657864389241397e-05, + "loss": 0.6301, + "step": 2183 + }, + { + "epoch": 1.0795946112965022, + "grad_norm": 0.136500739016167, + "learning_rate": 1.665495848626804e-05, + "loss": 0.6227, + "step": 2184 + }, + { + "epoch": 1.0800889877641824, + "grad_norm": 0.13793947530875172, + "learning_rate": 1.6652051574243237e-05, + "loss": 0.6092, + "step": 2185 + }, + { + "epoch": 1.0805833642318625, + "grad_norm": 0.14463622847216498, + "learning_rate": 1.6649143653607736e-05, + "loss": 0.6636, + "step": 2186 + }, + { + "epoch": 1.0810777406995427, + "grad_norm": 0.15254032650202773, + "learning_rate": 1.664623472480246e-05, + "loss": 0.6108, + "step": 2187 + }, + { + "epoch": 1.0815721171672228, + "grad_norm": 0.14532846846227537, + "learning_rate": 1.6643324788268457e-05, + "loss": 0.6587, + "step": 2188 + }, + { + "epoch": 1.082066493634903, + "grad_norm": 0.1551054928100989, + "learning_rate": 1.664041384444695e-05, + "loss": 0.6514, + "step": 2189 + }, + { + "epoch": 1.082560870102583, + "grad_norm": 0.14892058235149438, + "learning_rate": 1.663750189377931e-05, + "loss": 0.6196, + "step": 2190 + }, + { + "epoch": 1.0830552465702632, + "grad_norm": 0.15777570026594936, + "learning_rate": 1.663458893670706e-05, + "loss": 0.6398, + "step": 2191 + }, + { + "epoch": 1.0835496230379433, + "grad_norm": 0.1446175581127031, + "learning_rate": 1.663167497367187e-05, + "loss": 0.6529, + "step": 2192 + }, + { + "epoch": 1.0840439995056235, + "grad_norm": 0.1458031896611157, + "learning_rate": 1.6628760005115564e-05, + "loss": 0.6288, + "step": 2193 + }, + { + "epoch": 1.0845383759733036, + "grad_norm": 0.14617928565330227, + "learning_rate": 1.6625844031480128e-05, + "loss": 0.6283, + "step": 2194 + }, + { + "epoch": 1.0850327524409837, + "grad_norm": 0.1413806961200778, + "learning_rate": 1.6622927053207686e-05, + "loss": 0.653, + "step": 2195 + }, + { + "epoch": 1.0855271289086639, + "grad_norm": 0.14009993169477614, + "learning_rate": 1.6620009070740534e-05, + "loss": 0.6453, + "step": 2196 + }, + { + "epoch": 1.086021505376344, + "grad_norm": 0.14639033895359088, + "learning_rate": 1.6617090084521094e-05, + "loss": 0.6197, + "step": 2197 + }, + { + "epoch": 1.0865158818440241, + "grad_norm": 0.14450019493124425, + "learning_rate": 1.6614170094991962e-05, + "loss": 0.592, + "step": 2198 + }, + { + "epoch": 1.0870102583117043, + "grad_norm": 0.13993869309382984, + "learning_rate": 1.661124910259588e-05, + "loss": 0.5935, + "step": 2199 + }, + { + "epoch": 1.0875046347793844, + "grad_norm": 0.14774040985099923, + "learning_rate": 1.660832710777574e-05, + "loss": 0.6078, + "step": 2200 + }, + { + "epoch": 1.0879990112470646, + "grad_norm": 0.14297464248978478, + "learning_rate": 1.660540411097458e-05, + "loss": 0.6112, + "step": 2201 + }, + { + "epoch": 1.0884933877147447, + "grad_norm": 0.15163978408520784, + "learning_rate": 1.6602480112635606e-05, + "loss": 0.624, + "step": 2202 + }, + { + "epoch": 1.0889877641824248, + "grad_norm": 0.13653084016305925, + "learning_rate": 1.659955511320216e-05, + "loss": 0.6007, + "step": 2203 + }, + { + "epoch": 1.089482140650105, + "grad_norm": 0.1419449555824918, + "learning_rate": 1.6596629113117742e-05, + "loss": 0.6412, + "step": 2204 + }, + { + "epoch": 1.089976517117785, + "grad_norm": 0.1586466445748469, + "learning_rate": 1.6593702112826004e-05, + "loss": 0.6283, + "step": 2205 + }, + { + "epoch": 1.0904708935854652, + "grad_norm": 0.1370282361308216, + "learning_rate": 1.6590774112770754e-05, + "loss": 0.6156, + "step": 2206 + }, + { + "epoch": 1.0909652700531454, + "grad_norm": 0.15408977831989112, + "learning_rate": 1.6587845113395943e-05, + "loss": 0.6067, + "step": 2207 + }, + { + "epoch": 1.0914596465208257, + "grad_norm": 0.14662309829551604, + "learning_rate": 1.6584915115145678e-05, + "loss": 0.6539, + "step": 2208 + }, + { + "epoch": 1.0919540229885056, + "grad_norm": 0.1367666481726735, + "learning_rate": 1.658198411846422e-05, + "loss": 0.6108, + "step": 2209 + }, + { + "epoch": 1.092448399456186, + "grad_norm": 0.1393448971688964, + "learning_rate": 1.6579052123795977e-05, + "loss": 0.6598, + "step": 2210 + }, + { + "epoch": 1.0929427759238661, + "grad_norm": 0.14711950600842152, + "learning_rate": 1.6576119131585505e-05, + "loss": 0.6178, + "step": 2211 + }, + { + "epoch": 1.0934371523915463, + "grad_norm": 0.1346917134551066, + "learning_rate": 1.6573185142277525e-05, + "loss": 0.6058, + "step": 2212 + }, + { + "epoch": 1.0939315288592264, + "grad_norm": 0.1379082219397486, + "learning_rate": 1.657025015631689e-05, + "loss": 0.6156, + "step": 2213 + }, + { + "epoch": 1.0944259053269065, + "grad_norm": 0.13860457680248273, + "learning_rate": 1.656731417414862e-05, + "loss": 0.5983, + "step": 2214 + }, + { + "epoch": 1.0949202817945867, + "grad_norm": 0.14504275197101413, + "learning_rate": 1.6564377196217883e-05, + "loss": 0.6546, + "step": 2215 + }, + { + "epoch": 1.0954146582622668, + "grad_norm": 0.13440005171614894, + "learning_rate": 1.6561439222969992e-05, + "loss": 0.6117, + "step": 2216 + }, + { + "epoch": 1.095909034729947, + "grad_norm": 0.13924619390667228, + "learning_rate": 1.6558500254850412e-05, + "loss": 0.6346, + "step": 2217 + }, + { + "epoch": 1.096403411197627, + "grad_norm": 0.13927101860250424, + "learning_rate": 1.6555560292304767e-05, + "loss": 0.6375, + "step": 2218 + }, + { + "epoch": 1.0968977876653072, + "grad_norm": 0.1496450155184432, + "learning_rate": 1.6552619335778822e-05, + "loss": 0.6242, + "step": 2219 + }, + { + "epoch": 1.0973921641329873, + "grad_norm": 0.13989644680733396, + "learning_rate": 1.6549677385718498e-05, + "loss": 0.6251, + "step": 2220 + }, + { + "epoch": 1.0978865406006675, + "grad_norm": 0.1562524910193376, + "learning_rate": 1.654673444256986e-05, + "loss": 0.6213, + "step": 2221 + }, + { + "epoch": 1.0983809170683476, + "grad_norm": 0.13888562525592735, + "learning_rate": 1.6543790506779136e-05, + "loss": 0.6552, + "step": 2222 + }, + { + "epoch": 1.0988752935360278, + "grad_norm": 0.14531965089038848, + "learning_rate": 1.6540845578792692e-05, + "loss": 0.6424, + "step": 2223 + }, + { + "epoch": 1.0993696700037079, + "grad_norm": 0.15710806482148384, + "learning_rate": 1.6537899659057058e-05, + "loss": 0.6413, + "step": 2224 + }, + { + "epoch": 1.099864046471388, + "grad_norm": 0.1411604132192269, + "learning_rate": 1.6534952748018894e-05, + "loss": 0.6366, + "step": 2225 + }, + { + "epoch": 1.1003584229390682, + "grad_norm": 0.14770835991256284, + "learning_rate": 1.6532004846125034e-05, + "loss": 0.6291, + "step": 2226 + }, + { + "epoch": 1.1008527994067483, + "grad_norm": 0.13905582926394663, + "learning_rate": 1.652905595382244e-05, + "loss": 0.6075, + "step": 2227 + }, + { + "epoch": 1.1013471758744284, + "grad_norm": 0.13791147516259802, + "learning_rate": 1.652610607155825e-05, + "loss": 0.6046, + "step": 2228 + }, + { + "epoch": 1.1018415523421086, + "grad_norm": 0.15229713232089898, + "learning_rate": 1.6523155199779722e-05, + "loss": 0.6272, + "step": 2229 + }, + { + "epoch": 1.1023359288097887, + "grad_norm": 0.14151378195580094, + "learning_rate": 1.652020333893428e-05, + "loss": 0.6963, + "step": 2230 + }, + { + "epoch": 1.1028303052774688, + "grad_norm": 0.15670953267387364, + "learning_rate": 1.6517250489469507e-05, + "loss": 0.6651, + "step": 2231 + }, + { + "epoch": 1.103324681745149, + "grad_norm": 0.15377004549796422, + "learning_rate": 1.6514296651833118e-05, + "loss": 0.6211, + "step": 2232 + }, + { + "epoch": 1.103819058212829, + "grad_norm": 0.14326133367344585, + "learning_rate": 1.651134182647299e-05, + "loss": 0.6026, + "step": 2233 + }, + { + "epoch": 1.1043134346805092, + "grad_norm": 0.13600090435781761, + "learning_rate": 1.650838601383714e-05, + "loss": 0.5913, + "step": 2234 + }, + { + "epoch": 1.1048078111481894, + "grad_norm": 0.14899602040437454, + "learning_rate": 1.6505429214373748e-05, + "loss": 0.6237, + "step": 2235 + }, + { + "epoch": 1.1053021876158695, + "grad_norm": 0.14427213074628684, + "learning_rate": 1.650247142853113e-05, + "loss": 0.6522, + "step": 2236 + }, + { + "epoch": 1.1057965640835496, + "grad_norm": 0.1536601445889569, + "learning_rate": 1.6499512656757756e-05, + "loss": 0.6053, + "step": 2237 + }, + { + "epoch": 1.1062909405512298, + "grad_norm": 0.14639188121357294, + "learning_rate": 1.649655289950225e-05, + "loss": 0.6687, + "step": 2238 + }, + { + "epoch": 1.10678531701891, + "grad_norm": 0.14231095592196838, + "learning_rate": 1.6493592157213383e-05, + "loss": 0.6235, + "step": 2239 + }, + { + "epoch": 1.10727969348659, + "grad_norm": 0.16401219239074327, + "learning_rate": 1.6490630430340072e-05, + "loss": 0.6605, + "step": 2240 + }, + { + "epoch": 1.1077740699542702, + "grad_norm": 0.1427502590756843, + "learning_rate": 1.648766771933139e-05, + "loss": 0.6081, + "step": 2241 + }, + { + "epoch": 1.1082684464219503, + "grad_norm": 0.14342948304026756, + "learning_rate": 1.6484704024636552e-05, + "loss": 0.6989, + "step": 2242 + }, + { + "epoch": 1.1087628228896305, + "grad_norm": 0.13618990385233193, + "learning_rate": 1.6481739346704922e-05, + "loss": 0.5982, + "step": 2243 + }, + { + "epoch": 1.1092571993573106, + "grad_norm": 0.14292316043729003, + "learning_rate": 1.6478773685986022e-05, + "loss": 0.6362, + "step": 2244 + }, + { + "epoch": 1.1097515758249907, + "grad_norm": 0.15566574667029168, + "learning_rate": 1.6475807042929515e-05, + "loss": 0.6294, + "step": 2245 + }, + { + "epoch": 1.1102459522926709, + "grad_norm": 0.14944123337336349, + "learning_rate": 1.6472839417985216e-05, + "loss": 0.6082, + "step": 2246 + }, + { + "epoch": 1.110740328760351, + "grad_norm": 0.15140053870755468, + "learning_rate": 1.6469870811603085e-05, + "loss": 0.6158, + "step": 2247 + }, + { + "epoch": 1.1112347052280311, + "grad_norm": 0.15106377471822197, + "learning_rate": 1.646690122423324e-05, + "loss": 0.6269, + "step": 2248 + }, + { + "epoch": 1.1117290816957113, + "grad_norm": 0.14052868224181705, + "learning_rate": 1.6463930656325938e-05, + "loss": 0.6192, + "step": 2249 + }, + { + "epoch": 1.1122234581633914, + "grad_norm": 0.14521545835921063, + "learning_rate": 1.6460959108331592e-05, + "loss": 0.6557, + "step": 2250 + }, + { + "epoch": 1.1127178346310715, + "grad_norm": 0.14846143805240844, + "learning_rate": 1.6457986580700753e-05, + "loss": 0.6452, + "step": 2251 + }, + { + "epoch": 1.1132122110987517, + "grad_norm": 0.15503836395119247, + "learning_rate": 1.645501307388413e-05, + "loss": 0.6372, + "step": 2252 + }, + { + "epoch": 1.1137065875664318, + "grad_norm": 0.1419915658178751, + "learning_rate": 1.6452038588332583e-05, + "loss": 0.6398, + "step": 2253 + }, + { + "epoch": 1.114200964034112, + "grad_norm": 0.15290547881609487, + "learning_rate": 1.644906312449711e-05, + "loss": 0.6535, + "step": 2254 + }, + { + "epoch": 1.114695340501792, + "grad_norm": 0.14231679069122843, + "learning_rate": 1.6446086682828865e-05, + "loss": 0.5745, + "step": 2255 + }, + { + "epoch": 1.1151897169694722, + "grad_norm": 0.1440314941561043, + "learning_rate": 1.6443109263779145e-05, + "loss": 0.6262, + "step": 2256 + }, + { + "epoch": 1.1156840934371524, + "grad_norm": 0.1492764164210716, + "learning_rate": 1.6440130867799404e-05, + "loss": 0.6289, + "step": 2257 + }, + { + "epoch": 1.1161784699048325, + "grad_norm": 0.15356360770896885, + "learning_rate": 1.6437151495341234e-05, + "loss": 0.6031, + "step": 2258 + }, + { + "epoch": 1.1166728463725126, + "grad_norm": 0.13192219988442117, + "learning_rate": 1.643417114685638e-05, + "loss": 0.6257, + "step": 2259 + }, + { + "epoch": 1.1171672228401928, + "grad_norm": 0.1487995997088337, + "learning_rate": 1.6431189822796732e-05, + "loss": 0.6167, + "step": 2260 + }, + { + "epoch": 1.117661599307873, + "grad_norm": 0.1495954402141495, + "learning_rate": 1.6428207523614337e-05, + "loss": 0.635, + "step": 2261 + }, + { + "epoch": 1.118155975775553, + "grad_norm": 0.14133879591579013, + "learning_rate": 1.642522424976138e-05, + "loss": 0.61, + "step": 2262 + }, + { + "epoch": 1.1186503522432332, + "grad_norm": 0.1474076119615893, + "learning_rate": 1.6422240001690193e-05, + "loss": 0.6483, + "step": 2263 + }, + { + "epoch": 1.1191447287109133, + "grad_norm": 0.15447985830217395, + "learning_rate": 1.6419254779853268e-05, + "loss": 0.6722, + "step": 2264 + }, + { + "epoch": 1.1196391051785934, + "grad_norm": 0.1450095615186012, + "learning_rate": 1.6416268584703225e-05, + "loss": 0.6279, + "step": 2265 + }, + { + "epoch": 1.1201334816462736, + "grad_norm": 0.14636454691880224, + "learning_rate": 1.6413281416692853e-05, + "loss": 0.6245, + "step": 2266 + }, + { + "epoch": 1.1206278581139537, + "grad_norm": 0.14252325382672876, + "learning_rate": 1.641029327627507e-05, + "loss": 0.6528, + "step": 2267 + }, + { + "epoch": 1.1211222345816338, + "grad_norm": 0.13531368571761485, + "learning_rate": 1.6407304163902958e-05, + "loss": 0.6595, + "step": 2268 + }, + { + "epoch": 1.121616611049314, + "grad_norm": 0.14096257756065247, + "learning_rate": 1.6404314080029736e-05, + "loss": 0.5834, + "step": 2269 + }, + { + "epoch": 1.1221109875169941, + "grad_norm": 0.13830681272338416, + "learning_rate": 1.640132302510877e-05, + "loss": 0.6315, + "step": 2270 + }, + { + "epoch": 1.1226053639846743, + "grad_norm": 0.13780924636377195, + "learning_rate": 1.6398330999593573e-05, + "loss": 0.6059, + "step": 2271 + }, + { + "epoch": 1.1230997404523544, + "grad_norm": 0.14016659388203828, + "learning_rate": 1.639533800393781e-05, + "loss": 0.6307, + "step": 2272 + }, + { + "epoch": 1.1235941169200345, + "grad_norm": 0.14310521553921413, + "learning_rate": 1.63923440385953e-05, + "loss": 0.6447, + "step": 2273 + }, + { + "epoch": 1.1240884933877147, + "grad_norm": 0.1419959457580901, + "learning_rate": 1.6389349104019986e-05, + "loss": 0.6066, + "step": 2274 + }, + { + "epoch": 1.1245828698553948, + "grad_norm": 0.14186978843762774, + "learning_rate": 1.6386353200665982e-05, + "loss": 0.6257, + "step": 2275 + }, + { + "epoch": 1.125077246323075, + "grad_norm": 0.14391461504963698, + "learning_rate": 1.6383356328987535e-05, + "loss": 0.6316, + "step": 2276 + }, + { + "epoch": 1.125571622790755, + "grad_norm": 0.1437406739245742, + "learning_rate": 1.638035848943904e-05, + "loss": 0.6412, + "step": 2277 + }, + { + "epoch": 1.1260659992584352, + "grad_norm": 0.1381735171027813, + "learning_rate": 1.6377359682475047e-05, + "loss": 0.6819, + "step": 2278 + }, + { + "epoch": 1.1265603757261156, + "grad_norm": 0.14873889412243033, + "learning_rate": 1.6374359908550245e-05, + "loss": 0.6351, + "step": 2279 + }, + { + "epoch": 1.1270547521937955, + "grad_norm": 0.14964172987302954, + "learning_rate": 1.6371359168119467e-05, + "loss": 0.629, + "step": 2280 + }, + { + "epoch": 1.1275491286614758, + "grad_norm": 0.14395145547157695, + "learning_rate": 1.6368357461637706e-05, + "loss": 0.6426, + "step": 2281 + }, + { + "epoch": 1.1280435051291557, + "grad_norm": 0.14963367276155404, + "learning_rate": 1.6365354789560086e-05, + "loss": 0.6292, + "step": 2282 + }, + { + "epoch": 1.128537881596836, + "grad_norm": 0.1315170007116335, + "learning_rate": 1.6362351152341888e-05, + "loss": 0.6066, + "step": 2283 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.14182249093432675, + "learning_rate": 1.6359346550438533e-05, + "loss": 0.5995, + "step": 2284 + }, + { + "epoch": 1.1295266345321964, + "grad_norm": 0.14112103176977364, + "learning_rate": 1.635634098430559e-05, + "loss": 0.6543, + "step": 2285 + }, + { + "epoch": 1.1300210109998763, + "grad_norm": 0.1421015768692741, + "learning_rate": 1.635333445439878e-05, + "loss": 0.6262, + "step": 2286 + }, + { + "epoch": 1.1305153874675566, + "grad_norm": 0.1394065871235515, + "learning_rate": 1.635032696117396e-05, + "loss": 0.6329, + "step": 2287 + }, + { + "epoch": 1.1310097639352368, + "grad_norm": 0.13949195533853626, + "learning_rate": 1.6347318505087143e-05, + "loss": 0.6167, + "step": 2288 + }, + { + "epoch": 1.131504140402917, + "grad_norm": 0.14491022055664224, + "learning_rate": 1.634430908659448e-05, + "loss": 0.6256, + "step": 2289 + }, + { + "epoch": 1.131998516870597, + "grad_norm": 0.14254602681546097, + "learning_rate": 1.6341298706152266e-05, + "loss": 0.5954, + "step": 2290 + }, + { + "epoch": 1.1324928933382772, + "grad_norm": 0.13006305950562777, + "learning_rate": 1.6338287364216954e-05, + "loss": 0.6247, + "step": 2291 + }, + { + "epoch": 1.1329872698059573, + "grad_norm": 0.14786608815863236, + "learning_rate": 1.6335275061245135e-05, + "loss": 0.6285, + "step": 2292 + }, + { + "epoch": 1.1334816462736375, + "grad_norm": 0.14031249862332013, + "learning_rate": 1.6332261797693545e-05, + "loss": 0.6813, + "step": 2293 + }, + { + "epoch": 1.1339760227413176, + "grad_norm": 0.14089168737606123, + "learning_rate": 1.6329247574019068e-05, + "loss": 0.6534, + "step": 2294 + }, + { + "epoch": 1.1344703992089977, + "grad_norm": 0.1447285335120903, + "learning_rate": 1.632623239067873e-05, + "loss": 0.6263, + "step": 2295 + }, + { + "epoch": 1.1349647756766779, + "grad_norm": 0.1418952177837647, + "learning_rate": 1.632321624812971e-05, + "loss": 0.6097, + "step": 2296 + }, + { + "epoch": 1.135459152144358, + "grad_norm": 0.1389473861563929, + "learning_rate": 1.6320199146829323e-05, + "loss": 0.6385, + "step": 2297 + }, + { + "epoch": 1.1359535286120381, + "grad_norm": 0.14962935039271108, + "learning_rate": 1.631718108723504e-05, + "loss": 0.6601, + "step": 2298 + }, + { + "epoch": 1.1364479050797183, + "grad_norm": 0.13892135170761297, + "learning_rate": 1.631416206980446e-05, + "loss": 0.6086, + "step": 2299 + }, + { + "epoch": 1.1369422815473984, + "grad_norm": 0.16108097360643583, + "learning_rate": 1.631114209499535e-05, + "loss": 0.6473, + "step": 2300 + }, + { + "epoch": 1.1374366580150785, + "grad_norm": 0.15524361650442298, + "learning_rate": 1.6308121163265602e-05, + "loss": 0.6481, + "step": 2301 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 0.31645567406899366, + "learning_rate": 1.630509927507327e-05, + "loss": 0.6498, + "step": 2302 + }, + { + "epoch": 1.1384254109504388, + "grad_norm": 0.15687081122708665, + "learning_rate": 1.6302076430876545e-05, + "loss": 0.6035, + "step": 2303 + }, + { + "epoch": 1.138919787418119, + "grad_norm": 0.14430865176176425, + "learning_rate": 1.6299052631133753e-05, + "loss": 0.6356, + "step": 2304 + }, + { + "epoch": 1.139414163885799, + "grad_norm": 0.1390314873352687, + "learning_rate": 1.629602787630338e-05, + "loss": 0.6351, + "step": 2305 + }, + { + "epoch": 1.1399085403534792, + "grad_norm": 0.15356198201674476, + "learning_rate": 1.629300216684405e-05, + "loss": 0.606, + "step": 2306 + }, + { + "epoch": 1.1404029168211594, + "grad_norm": 0.1448614254602839, + "learning_rate": 1.628997550321454e-05, + "loss": 0.5979, + "step": 2307 + }, + { + "epoch": 1.1408972932888395, + "grad_norm": 0.15949596338386518, + "learning_rate": 1.6286947885873755e-05, + "loss": 0.6245, + "step": 2308 + }, + { + "epoch": 1.1413916697565196, + "grad_norm": 0.1485225020180573, + "learning_rate": 1.628391931528076e-05, + "loss": 0.6306, + "step": 2309 + }, + { + "epoch": 1.1418860462241998, + "grad_norm": 0.14738394801262258, + "learning_rate": 1.628088979189476e-05, + "loss": 0.6362, + "step": 2310 + }, + { + "epoch": 1.14238042269188, + "grad_norm": 0.14716610556986642, + "learning_rate": 1.6277859316175102e-05, + "loss": 0.6296, + "step": 2311 + }, + { + "epoch": 1.14287479915956, + "grad_norm": 0.14334149319873762, + "learning_rate": 1.6274827888581275e-05, + "loss": 0.608, + "step": 2312 + }, + { + "epoch": 1.1433691756272402, + "grad_norm": 0.1550138897600089, + "learning_rate": 1.6271795509572922e-05, + "loss": 0.6139, + "step": 2313 + }, + { + "epoch": 1.1438635520949203, + "grad_norm": 0.1489986289962828, + "learning_rate": 1.6268762179609825e-05, + "loss": 0.6419, + "step": 2314 + }, + { + "epoch": 1.1443579285626004, + "grad_norm": 0.154993595261941, + "learning_rate": 1.62657278991519e-05, + "loss": 0.6343, + "step": 2315 + }, + { + "epoch": 1.1448523050302806, + "grad_norm": 0.14286324098362005, + "learning_rate": 1.626269266865923e-05, + "loss": 0.6714, + "step": 2316 + }, + { + "epoch": 1.1453466814979607, + "grad_norm": 0.15333473564620803, + "learning_rate": 1.625965648859202e-05, + "loss": 0.6308, + "step": 2317 + }, + { + "epoch": 1.1458410579656408, + "grad_norm": 0.14954363386680097, + "learning_rate": 1.6256619359410626e-05, + "loss": 0.6299, + "step": 2318 + }, + { + "epoch": 1.146335434433321, + "grad_norm": 0.14962536055591866, + "learning_rate": 1.625358128157556e-05, + "loss": 0.6246, + "step": 2319 + }, + { + "epoch": 1.1468298109010011, + "grad_norm": 0.1437453325743569, + "learning_rate": 1.6250542255547456e-05, + "loss": 0.6344, + "step": 2320 + }, + { + "epoch": 1.1473241873686812, + "grad_norm": 0.15302652710720813, + "learning_rate": 1.6247502281787115e-05, + "loss": 0.6263, + "step": 2321 + }, + { + "epoch": 1.1478185638363614, + "grad_norm": 0.14270176812068075, + "learning_rate": 1.624446136075546e-05, + "loss": 0.5944, + "step": 2322 + }, + { + "epoch": 1.1483129403040415, + "grad_norm": 0.14675813553913597, + "learning_rate": 1.6241419492913567e-05, + "loss": 0.6287, + "step": 2323 + }, + { + "epoch": 1.1488073167717217, + "grad_norm": 0.14955605113466008, + "learning_rate": 1.6238376678722664e-05, + "loss": 0.6148, + "step": 2324 + }, + { + "epoch": 1.1493016932394018, + "grad_norm": 0.15028036883067772, + "learning_rate": 1.6235332918644112e-05, + "loss": 0.6545, + "step": 2325 + }, + { + "epoch": 1.149796069707082, + "grad_norm": 0.1482292399208351, + "learning_rate": 1.6232288213139416e-05, + "loss": 0.6035, + "step": 2326 + }, + { + "epoch": 1.150290446174762, + "grad_norm": 0.15472510436605014, + "learning_rate": 1.6229242562670226e-05, + "loss": 0.6368, + "step": 2327 + }, + { + "epoch": 1.1507848226424422, + "grad_norm": 0.1407105695883157, + "learning_rate": 1.622619596769834e-05, + "loss": 0.6299, + "step": 2328 + }, + { + "epoch": 1.1512791991101223, + "grad_norm": 0.15229874228279391, + "learning_rate": 1.622314842868569e-05, + "loss": 0.636, + "step": 2329 + }, + { + "epoch": 1.1517735755778025, + "grad_norm": 0.1536595644457373, + "learning_rate": 1.622009994609436e-05, + "loss": 0.601, + "step": 2330 + }, + { + "epoch": 1.1522679520454826, + "grad_norm": 3.748461325179999, + "learning_rate": 1.621705052038657e-05, + "loss": 0.744, + "step": 2331 + }, + { + "epoch": 1.1527623285131627, + "grad_norm": 0.1659005974345259, + "learning_rate": 1.621400015202469e-05, + "loss": 0.6115, + "step": 2332 + }, + { + "epoch": 1.1532567049808429, + "grad_norm": 0.15647768632123246, + "learning_rate": 1.6210948841471226e-05, + "loss": 0.6341, + "step": 2333 + }, + { + "epoch": 1.153751081448523, + "grad_norm": 0.1469305930929379, + "learning_rate": 1.620789658918883e-05, + "loss": 0.5822, + "step": 2334 + }, + { + "epoch": 1.1542454579162031, + "grad_norm": 0.15318973432612829, + "learning_rate": 1.6204843395640296e-05, + "loss": 0.6525, + "step": 2335 + }, + { + "epoch": 1.1547398343838833, + "grad_norm": 0.15052452222945123, + "learning_rate": 1.6201789261288564e-05, + "loss": 0.6198, + "step": 2336 + }, + { + "epoch": 1.1552342108515634, + "grad_norm": 0.15005481174986124, + "learning_rate": 1.619873418659671e-05, + "loss": 0.6407, + "step": 2337 + }, + { + "epoch": 1.1557285873192435, + "grad_norm": 0.15122417637038157, + "learning_rate": 1.6195678172027965e-05, + "loss": 0.6403, + "step": 2338 + }, + { + "epoch": 1.1562229637869237, + "grad_norm": 0.14796125492442483, + "learning_rate": 1.6192621218045687e-05, + "loss": 0.6553, + "step": 2339 + }, + { + "epoch": 1.1567173402546038, + "grad_norm": 0.15766059772146437, + "learning_rate": 1.618956332511338e-05, + "loss": 0.6075, + "step": 2340 + }, + { + "epoch": 1.157211716722284, + "grad_norm": 0.15471796923687695, + "learning_rate": 1.6186504493694704e-05, + "loss": 0.6344, + "step": 2341 + }, + { + "epoch": 1.157706093189964, + "grad_norm": 0.14548128305609204, + "learning_rate": 1.6183444724253443e-05, + "loss": 0.645, + "step": 2342 + }, + { + "epoch": 1.1582004696576442, + "grad_norm": 0.1581914547393869, + "learning_rate": 1.6180384017253537e-05, + "loss": 0.6378, + "step": 2343 + }, + { + "epoch": 1.1586948461253244, + "grad_norm": 0.14954510810526406, + "learning_rate": 1.6177322373159062e-05, + "loss": 0.6805, + "step": 2344 + }, + { + "epoch": 1.1591892225930045, + "grad_norm": 0.1864557940069778, + "learning_rate": 1.6174259792434233e-05, + "loss": 0.6326, + "step": 2345 + }, + { + "epoch": 1.1596835990606846, + "grad_norm": 0.14902647374561828, + "learning_rate": 1.6171196275543414e-05, + "loss": 0.6253, + "step": 2346 + }, + { + "epoch": 1.1601779755283648, + "grad_norm": 0.14556550539887633, + "learning_rate": 1.6168131822951106e-05, + "loss": 0.5929, + "step": 2347 + }, + { + "epoch": 1.160672351996045, + "grad_norm": 0.14103318642178644, + "learning_rate": 1.6165066435121956e-05, + "loss": 0.5954, + "step": 2348 + }, + { + "epoch": 1.161166728463725, + "grad_norm": 0.16141024362400294, + "learning_rate": 1.6162000112520747e-05, + "loss": 0.6476, + "step": 2349 + }, + { + "epoch": 1.1616611049314052, + "grad_norm": 0.1360643682027393, + "learning_rate": 1.6158932855612408e-05, + "loss": 0.6216, + "step": 2350 + }, + { + "epoch": 1.1621554813990853, + "grad_norm": 0.1418906461595427, + "learning_rate": 1.6155864664862012e-05, + "loss": 0.6194, + "step": 2351 + }, + { + "epoch": 1.1626498578667654, + "grad_norm": 0.14788672782230758, + "learning_rate": 1.6152795540734766e-05, + "loss": 0.6578, + "step": 2352 + }, + { + "epoch": 1.1631442343344456, + "grad_norm": 0.14753522657754806, + "learning_rate": 1.6149725483696027e-05, + "loss": 0.6559, + "step": 2353 + }, + { + "epoch": 1.163638610802126, + "grad_norm": 0.14605134862051308, + "learning_rate": 1.6146654494211283e-05, + "loss": 0.6788, + "step": 2354 + }, + { + "epoch": 1.1641329872698059, + "grad_norm": 0.15845077604592045, + "learning_rate": 1.614358257274618e-05, + "loss": 0.6268, + "step": 2355 + }, + { + "epoch": 1.1646273637374862, + "grad_norm": 0.1428300957721085, + "learning_rate": 1.6140509719766484e-05, + "loss": 0.6451, + "step": 2356 + }, + { + "epoch": 1.1651217402051661, + "grad_norm": 0.14441264897979553, + "learning_rate": 1.613743593573812e-05, + "loss": 0.6266, + "step": 2357 + }, + { + "epoch": 1.1656161166728465, + "grad_norm": 0.14737329609177563, + "learning_rate": 1.613436122112715e-05, + "loss": 0.6608, + "step": 2358 + }, + { + "epoch": 1.1661104931405264, + "grad_norm": 0.14030640996246074, + "learning_rate": 1.6131285576399763e-05, + "loss": 0.6529, + "step": 2359 + }, + { + "epoch": 1.1666048696082068, + "grad_norm": 0.14141089647275654, + "learning_rate": 1.612820900202231e-05, + "loss": 0.5866, + "step": 2360 + }, + { + "epoch": 1.1670992460758867, + "grad_norm": 0.13303222896002082, + "learning_rate": 1.6125131498461272e-05, + "loss": 0.5799, + "step": 2361 + }, + { + "epoch": 1.167593622543567, + "grad_norm": 0.14481816852231685, + "learning_rate": 1.612205306618327e-05, + "loss": 0.6179, + "step": 2362 + }, + { + "epoch": 1.1680879990112472, + "grad_norm": 0.14362862993435796, + "learning_rate": 1.6118973705655073e-05, + "loss": 0.6111, + "step": 2363 + }, + { + "epoch": 1.1685823754789273, + "grad_norm": 0.13573163865710466, + "learning_rate": 1.611589341734358e-05, + "loss": 0.6143, + "step": 2364 + }, + { + "epoch": 1.1690767519466074, + "grad_norm": 0.13709132086207088, + "learning_rate": 1.611281220171584e-05, + "loss": 0.6167, + "step": 2365 + }, + { + "epoch": 1.1695711284142876, + "grad_norm": 0.1394417637919276, + "learning_rate": 1.610973005923904e-05, + "loss": 0.6207, + "step": 2366 + }, + { + "epoch": 1.1700655048819677, + "grad_norm": 0.1451008713529761, + "learning_rate": 1.6106646990380505e-05, + "loss": 0.587, + "step": 2367 + }, + { + "epoch": 1.1705598813496478, + "grad_norm": 0.13627659174918358, + "learning_rate": 1.6103562995607705e-05, + "loss": 0.6426, + "step": 2368 + }, + { + "epoch": 1.171054257817328, + "grad_norm": 0.1384740848206736, + "learning_rate": 1.6100478075388242e-05, + "loss": 0.6473, + "step": 2369 + }, + { + "epoch": 1.171548634285008, + "grad_norm": 0.14384403384513753, + "learning_rate": 1.6097392230189868e-05, + "loss": 0.6461, + "step": 2370 + }, + { + "epoch": 1.1720430107526882, + "grad_norm": 0.1420984665773144, + "learning_rate": 1.609430546048047e-05, + "loss": 0.6042, + "step": 2371 + }, + { + "epoch": 1.1725373872203684, + "grad_norm": 0.13411617740463408, + "learning_rate": 1.6091217766728077e-05, + "loss": 0.6183, + "step": 2372 + }, + { + "epoch": 1.1730317636880485, + "grad_norm": 0.14103152744636407, + "learning_rate": 1.608812914940086e-05, + "loss": 0.6294, + "step": 2373 + }, + { + "epoch": 1.1735261401557286, + "grad_norm": 0.13583520800292412, + "learning_rate": 1.6085039608967123e-05, + "loss": 0.6394, + "step": 2374 + }, + { + "epoch": 1.1740205166234088, + "grad_norm": 0.13343606683083972, + "learning_rate": 1.608194914589532e-05, + "loss": 0.6197, + "step": 2375 + }, + { + "epoch": 1.174514893091089, + "grad_norm": 0.13720255563215072, + "learning_rate": 1.6078857760654034e-05, + "loss": 0.6438, + "step": 2376 + }, + { + "epoch": 1.175009269558769, + "grad_norm": 0.13783393117844803, + "learning_rate": 1.6075765453711992e-05, + "loss": 0.6424, + "step": 2377 + }, + { + "epoch": 1.1755036460264492, + "grad_norm": 0.13439240792326718, + "learning_rate": 1.6072672225538066e-05, + "loss": 0.6002, + "step": 2378 + }, + { + "epoch": 1.1759980224941293, + "grad_norm": 0.1355798237685145, + "learning_rate": 1.6069578076601265e-05, + "loss": 0.6557, + "step": 2379 + }, + { + "epoch": 1.1764923989618095, + "grad_norm": 0.14031807599737367, + "learning_rate": 1.606648300737073e-05, + "loss": 0.5999, + "step": 2380 + }, + { + "epoch": 1.1769867754294896, + "grad_norm": 0.13426776425851775, + "learning_rate": 1.6063387018315756e-05, + "loss": 0.6, + "step": 2381 + }, + { + "epoch": 1.1774811518971697, + "grad_norm": 0.1445200064352133, + "learning_rate": 1.6060290109905766e-05, + "loss": 0.5982, + "step": 2382 + }, + { + "epoch": 1.1779755283648499, + "grad_norm": 0.1400226616543515, + "learning_rate": 1.605719228261032e-05, + "loss": 0.6336, + "step": 2383 + }, + { + "epoch": 1.17846990483253, + "grad_norm": 0.13758750273100845, + "learning_rate": 1.6054093536899132e-05, + "loss": 0.6425, + "step": 2384 + }, + { + "epoch": 1.1789642813002101, + "grad_norm": 0.14011406002037471, + "learning_rate": 1.605099387324204e-05, + "loss": 0.6373, + "step": 2385 + }, + { + "epoch": 1.1794586577678903, + "grad_norm": 0.14705358817963413, + "learning_rate": 1.6047893292109026e-05, + "loss": 0.608, + "step": 2386 + }, + { + "epoch": 1.1799530342355704, + "grad_norm": 0.1367933086062921, + "learning_rate": 1.6044791793970217e-05, + "loss": 0.6238, + "step": 2387 + }, + { + "epoch": 1.1804474107032505, + "grad_norm": 0.13874643815565157, + "learning_rate": 1.604168937929588e-05, + "loss": 0.6205, + "step": 2388 + }, + { + "epoch": 1.1809417871709307, + "grad_norm": 0.14387521520332966, + "learning_rate": 1.6038586048556402e-05, + "loss": 0.6082, + "step": 2389 + }, + { + "epoch": 1.1814361636386108, + "grad_norm": 0.13592164167637288, + "learning_rate": 1.6035481802222333e-05, + "loss": 0.6814, + "step": 2390 + }, + { + "epoch": 1.181930540106291, + "grad_norm": 0.14271707380898455, + "learning_rate": 1.6032376640764345e-05, + "loss": 0.5959, + "step": 2391 + }, + { + "epoch": 1.182424916573971, + "grad_norm": 0.14117801301068011, + "learning_rate": 1.6029270564653258e-05, + "loss": 0.6742, + "step": 2392 + }, + { + "epoch": 1.1829192930416512, + "grad_norm": 0.14115289029556877, + "learning_rate": 1.602616357436003e-05, + "loss": 0.6227, + "step": 2393 + }, + { + "epoch": 1.1834136695093314, + "grad_norm": 0.14269363185352044, + "learning_rate": 1.6023055670355748e-05, + "loss": 0.6154, + "step": 2394 + }, + { + "epoch": 1.1839080459770115, + "grad_norm": 0.15138267025190788, + "learning_rate": 1.6019946853111654e-05, + "loss": 0.6238, + "step": 2395 + }, + { + "epoch": 1.1844024224446916, + "grad_norm": 0.13124425790519215, + "learning_rate": 1.6016837123099112e-05, + "loss": 0.6322, + "step": 2396 + }, + { + "epoch": 1.1848967989123718, + "grad_norm": 0.14934375856460938, + "learning_rate": 1.601372648078963e-05, + "loss": 0.6174, + "step": 2397 + }, + { + "epoch": 1.185391175380052, + "grad_norm": 0.14543052811506557, + "learning_rate": 1.6010614926654868e-05, + "loss": 0.6131, + "step": 2398 + }, + { + "epoch": 1.185885551847732, + "grad_norm": 0.13849287078775355, + "learning_rate": 1.60075024611666e-05, + "loss": 0.6645, + "step": 2399 + }, + { + "epoch": 1.1863799283154122, + "grad_norm": 0.15019983286757674, + "learning_rate": 1.600438908479676e-05, + "loss": 0.6935, + "step": 2400 + }, + { + "epoch": 1.1868743047830923, + "grad_norm": 0.14040655505786565, + "learning_rate": 1.6001274798017405e-05, + "loss": 0.6519, + "step": 2401 + }, + { + "epoch": 1.1873686812507724, + "grad_norm": 0.14636999297918768, + "learning_rate": 1.5998159601300734e-05, + "loss": 0.6241, + "step": 2402 + }, + { + "epoch": 1.1878630577184526, + "grad_norm": 0.14336394324650625, + "learning_rate": 1.599504349511909e-05, + "loss": 0.6527, + "step": 2403 + }, + { + "epoch": 1.1883574341861327, + "grad_norm": 0.15828959728007416, + "learning_rate": 1.5991926479944944e-05, + "loss": 0.5743, + "step": 2404 + }, + { + "epoch": 1.1888518106538128, + "grad_norm": 0.13396040915592972, + "learning_rate": 1.5988808556250918e-05, + "loss": 0.6495, + "step": 2405 + }, + { + "epoch": 1.189346187121493, + "grad_norm": 0.16065117517924404, + "learning_rate": 1.5985689724509755e-05, + "loss": 0.6093, + "step": 2406 + }, + { + "epoch": 1.1898405635891731, + "grad_norm": 0.13612351825121946, + "learning_rate": 1.5982569985194355e-05, + "loss": 0.5938, + "step": 2407 + }, + { + "epoch": 1.1903349400568533, + "grad_norm": 0.18088476713896556, + "learning_rate": 1.5979449338777738e-05, + "loss": 0.6167, + "step": 2408 + }, + { + "epoch": 1.1908293165245334, + "grad_norm": 0.14540727427964997, + "learning_rate": 1.5976327785733073e-05, + "loss": 0.6479, + "step": 2409 + }, + { + "epoch": 1.1913236929922135, + "grad_norm": 0.13995815748469975, + "learning_rate": 1.597320532653366e-05, + "loss": 0.6014, + "step": 2410 + }, + { + "epoch": 1.1918180694598937, + "grad_norm": 0.1384548283780686, + "learning_rate": 1.5970081961652937e-05, + "loss": 0.6403, + "step": 2411 + }, + { + "epoch": 1.1923124459275738, + "grad_norm": 0.1622239783984809, + "learning_rate": 1.5966957691564485e-05, + "loss": 0.6165, + "step": 2412 + }, + { + "epoch": 1.192806822395254, + "grad_norm": 0.1476740185778487, + "learning_rate": 1.5963832516742016e-05, + "loss": 0.6628, + "step": 2413 + }, + { + "epoch": 1.193301198862934, + "grad_norm": 0.14063031270056645, + "learning_rate": 1.596070643765938e-05, + "loss": 0.6128, + "step": 2414 + }, + { + "epoch": 1.1937955753306142, + "grad_norm": 0.1451207934168669, + "learning_rate": 1.5957579454790574e-05, + "loss": 0.604, + "step": 2415 + }, + { + "epoch": 1.1942899517982943, + "grad_norm": 0.14986725756580746, + "learning_rate": 1.595445156860971e-05, + "loss": 0.6067, + "step": 2416 + }, + { + "epoch": 1.1947843282659745, + "grad_norm": 0.14344393177030076, + "learning_rate": 1.595132277959106e-05, + "loss": 0.6543, + "step": 2417 + }, + { + "epoch": 1.1952787047336546, + "grad_norm": 0.15285421445679673, + "learning_rate": 1.5948193088209024e-05, + "loss": 0.6169, + "step": 2418 + }, + { + "epoch": 1.1957730812013347, + "grad_norm": 0.16573286375137905, + "learning_rate": 1.5945062494938136e-05, + "loss": 0.615, + "step": 2419 + }, + { + "epoch": 1.1962674576690149, + "grad_norm": 0.14379140587685335, + "learning_rate": 1.594193100025307e-05, + "loss": 0.5835, + "step": 2420 + }, + { + "epoch": 1.196761834136695, + "grad_norm": 0.14764266636785084, + "learning_rate": 1.593879860462863e-05, + "loss": 0.6144, + "step": 2421 + }, + { + "epoch": 1.1972562106043751, + "grad_norm": 0.14304254239157613, + "learning_rate": 1.593566530853977e-05, + "loss": 0.6176, + "step": 2422 + }, + { + "epoch": 1.1977505870720553, + "grad_norm": 0.1354968750127945, + "learning_rate": 1.593253111246157e-05, + "loss": 0.6256, + "step": 2423 + }, + { + "epoch": 1.1982449635397354, + "grad_norm": 0.15355887724278489, + "learning_rate": 1.5929396016869247e-05, + "loss": 0.5881, + "step": 2424 + }, + { + "epoch": 1.1987393400074156, + "grad_norm": 0.1389382095694867, + "learning_rate": 1.5926260022238163e-05, + "loss": 0.6598, + "step": 2425 + }, + { + "epoch": 1.1992337164750957, + "grad_norm": 0.1568859130247589, + "learning_rate": 1.5923123129043806e-05, + "loss": 0.6341, + "step": 2426 + }, + { + "epoch": 1.199728092942776, + "grad_norm": 0.14429511728023722, + "learning_rate": 1.59199853377618e-05, + "loss": 0.6083, + "step": 2427 + }, + { + "epoch": 1.200222469410456, + "grad_norm": 0.1416843819295269, + "learning_rate": 1.5916846648867918e-05, + "loss": 0.6041, + "step": 2428 + }, + { + "epoch": 1.2007168458781363, + "grad_norm": 0.14707243744935544, + "learning_rate": 1.5913707062838053e-05, + "loss": 0.6288, + "step": 2429 + }, + { + "epoch": 1.2012112223458162, + "grad_norm": 0.1419857213343906, + "learning_rate": 1.5910566580148248e-05, + "loss": 0.6412, + "step": 2430 + }, + { + "epoch": 1.2017055988134966, + "grad_norm": 0.15095676564878427, + "learning_rate": 1.590742520127467e-05, + "loss": 0.6242, + "step": 2431 + }, + { + "epoch": 1.2021999752811765, + "grad_norm": 0.14015689701339043, + "learning_rate": 1.590428292669363e-05, + "loss": 0.6426, + "step": 2432 + }, + { + "epoch": 1.2026943517488569, + "grad_norm": 0.14823002887239492, + "learning_rate": 1.590113975688158e-05, + "loss": 0.5952, + "step": 2433 + }, + { + "epoch": 1.2031887282165368, + "grad_norm": 0.1542068334961151, + "learning_rate": 1.5897995692315084e-05, + "loss": 0.6216, + "step": 2434 + }, + { + "epoch": 1.2036831046842171, + "grad_norm": 0.14252717597193754, + "learning_rate": 1.589485073347087e-05, + "loss": 0.6233, + "step": 2435 + }, + { + "epoch": 1.2041774811518973, + "grad_norm": 0.14717235546143057, + "learning_rate": 1.5891704880825784e-05, + "loss": 0.6192, + "step": 2436 + }, + { + "epoch": 1.2046718576195774, + "grad_norm": 0.1428245247165134, + "learning_rate": 1.5888558134856814e-05, + "loss": 0.6285, + "step": 2437 + }, + { + "epoch": 1.2051662340872575, + "grad_norm": 0.14992683157706926, + "learning_rate": 1.5885410496041084e-05, + "loss": 0.6496, + "step": 2438 + }, + { + "epoch": 1.2056606105549377, + "grad_norm": 0.15312500395727507, + "learning_rate": 1.588226196485585e-05, + "loss": 0.6486, + "step": 2439 + }, + { + "epoch": 1.2061549870226178, + "grad_norm": 0.14885777600249694, + "learning_rate": 1.58791125417785e-05, + "loss": 0.6291, + "step": 2440 + }, + { + "epoch": 1.206649363490298, + "grad_norm": 0.14060763166475831, + "learning_rate": 1.587596222728657e-05, + "loss": 0.6111, + "step": 2441 + }, + { + "epoch": 1.207143739957978, + "grad_norm": 0.14820405809735657, + "learning_rate": 1.5872811021857724e-05, + "loss": 0.645, + "step": 2442 + }, + { + "epoch": 1.2076381164256582, + "grad_norm": 0.14394301527297343, + "learning_rate": 1.586965892596975e-05, + "loss": 0.6289, + "step": 2443 + }, + { + "epoch": 1.2081324928933384, + "grad_norm": 0.1428271702552451, + "learning_rate": 1.5866505940100592e-05, + "loss": 0.6213, + "step": 2444 + }, + { + "epoch": 1.2086268693610185, + "grad_norm": 0.1567800539975478, + "learning_rate": 1.5863352064728313e-05, + "loss": 0.6217, + "step": 2445 + }, + { + "epoch": 1.2091212458286986, + "grad_norm": 0.1428865006487669, + "learning_rate": 1.5860197300331116e-05, + "loss": 0.5988, + "step": 2446 + }, + { + "epoch": 1.2096156222963788, + "grad_norm": 0.1505525500748668, + "learning_rate": 1.5857041647387346e-05, + "loss": 0.6147, + "step": 2447 + }, + { + "epoch": 1.210109998764059, + "grad_norm": 0.14602355715841603, + "learning_rate": 1.5853885106375466e-05, + "loss": 0.678, + "step": 2448 + }, + { + "epoch": 1.210604375231739, + "grad_norm": 0.13717336728881221, + "learning_rate": 1.5850727677774088e-05, + "loss": 0.6161, + "step": 2449 + }, + { + "epoch": 1.2110987516994192, + "grad_norm": 0.14457361076594216, + "learning_rate": 1.5847569362061956e-05, + "loss": 0.6157, + "step": 2450 + }, + { + "epoch": 1.2115931281670993, + "grad_norm": 0.14716418792407823, + "learning_rate": 1.5844410159717943e-05, + "loss": 0.664, + "step": 2451 + }, + { + "epoch": 1.2120875046347794, + "grad_norm": 0.15192924029893806, + "learning_rate": 1.5841250071221058e-05, + "loss": 0.6377, + "step": 2452 + }, + { + "epoch": 1.2125818811024596, + "grad_norm": 0.1389320062161231, + "learning_rate": 1.5838089097050453e-05, + "loss": 0.5946, + "step": 2453 + }, + { + "epoch": 1.2130762575701397, + "grad_norm": 0.1537498064190025, + "learning_rate": 1.58349272376854e-05, + "loss": 0.6362, + "step": 2454 + }, + { + "epoch": 1.2135706340378198, + "grad_norm": 0.1456108157947581, + "learning_rate": 1.583176449360532e-05, + "loss": 0.6503, + "step": 2455 + }, + { + "epoch": 1.2140650105055, + "grad_norm": 0.1467899084192721, + "learning_rate": 1.582860086528976e-05, + "loss": 0.5902, + "step": 2456 + }, + { + "epoch": 1.21455938697318, + "grad_norm": 0.14100151348828494, + "learning_rate": 1.582543635321839e-05, + "loss": 0.6197, + "step": 2457 + }, + { + "epoch": 1.2150537634408602, + "grad_norm": 0.141041020802, + "learning_rate": 1.5822270957871048e-05, + "loss": 0.6223, + "step": 2458 + }, + { + "epoch": 1.2155481399085404, + "grad_norm": 0.13966509767176427, + "learning_rate": 1.5819104679727664e-05, + "loss": 0.5954, + "step": 2459 + }, + { + "epoch": 1.2160425163762205, + "grad_norm": 0.1374656841746688, + "learning_rate": 1.581593751926833e-05, + "loss": 0.6627, + "step": 2460 + }, + { + "epoch": 1.2165368928439007, + "grad_norm": 0.13991130494290466, + "learning_rate": 1.5812769476973266e-05, + "loss": 0.6353, + "step": 2461 + }, + { + "epoch": 1.2170312693115808, + "grad_norm": 0.14153296454460393, + "learning_rate": 1.5809600553322814e-05, + "loss": 0.6674, + "step": 2462 + }, + { + "epoch": 1.217525645779261, + "grad_norm": 0.14113654951845095, + "learning_rate": 1.580643074879747e-05, + "loss": 0.6245, + "step": 2463 + }, + { + "epoch": 1.218020022246941, + "grad_norm": 0.13918013122053666, + "learning_rate": 1.5803260063877847e-05, + "loss": 0.6436, + "step": 2464 + }, + { + "epoch": 1.2185143987146212, + "grad_norm": 0.1427027298751174, + "learning_rate": 1.5800088499044696e-05, + "loss": 0.6618, + "step": 2465 + }, + { + "epoch": 1.2190087751823013, + "grad_norm": 0.14091981075478785, + "learning_rate": 1.5796916054778903e-05, + "loss": 0.6409, + "step": 2466 + }, + { + "epoch": 1.2195031516499815, + "grad_norm": 0.14891029164743275, + "learning_rate": 1.579374273156149e-05, + "loss": 0.6175, + "step": 2467 + }, + { + "epoch": 1.2199975281176616, + "grad_norm": 0.1440131898249815, + "learning_rate": 1.5790568529873603e-05, + "loss": 0.6404, + "step": 2468 + }, + { + "epoch": 1.2204919045853417, + "grad_norm": 0.14703270356848594, + "learning_rate": 1.5787393450196532e-05, + "loss": 0.689, + "step": 2469 + }, + { + "epoch": 1.2209862810530219, + "grad_norm": 0.1419487142495089, + "learning_rate": 1.5784217493011695e-05, + "loss": 0.6205, + "step": 2470 + }, + { + "epoch": 1.221480657520702, + "grad_norm": 0.14134320821704388, + "learning_rate": 1.578104065880064e-05, + "loss": 0.6351, + "step": 2471 + }, + { + "epoch": 1.2219750339883821, + "grad_norm": 0.1439191523812546, + "learning_rate": 1.5777862948045055e-05, + "loss": 0.6407, + "step": 2472 + }, + { + "epoch": 1.2224694104560623, + "grad_norm": 0.13835610206104412, + "learning_rate": 1.5774684361226754e-05, + "loss": 0.5809, + "step": 2473 + }, + { + "epoch": 1.2229637869237424, + "grad_norm": 0.13998254517501083, + "learning_rate": 1.577150489882769e-05, + "loss": 0.6167, + "step": 2474 + }, + { + "epoch": 1.2234581633914225, + "grad_norm": 0.14106206019850792, + "learning_rate": 1.5768324561329946e-05, + "loss": 0.6144, + "step": 2475 + }, + { + "epoch": 1.2239525398591027, + "grad_norm": 0.13874062182095925, + "learning_rate": 1.5765143349215736e-05, + "loss": 0.6161, + "step": 2476 + }, + { + "epoch": 1.2244469163267828, + "grad_norm": 0.1371678490545807, + "learning_rate": 1.5761961262967405e-05, + "loss": 0.6377, + "step": 2477 + }, + { + "epoch": 1.224941292794463, + "grad_norm": 0.1434949332197458, + "learning_rate": 1.5758778303067442e-05, + "loss": 0.5809, + "step": 2478 + }, + { + "epoch": 1.225435669262143, + "grad_norm": 0.14070965943416075, + "learning_rate": 1.575559446999845e-05, + "loss": 0.6326, + "step": 2479 + }, + { + "epoch": 1.2259300457298232, + "grad_norm": 0.14556452457947797, + "learning_rate": 1.5752409764243184e-05, + "loss": 0.6388, + "step": 2480 + }, + { + "epoch": 1.2264244221975034, + "grad_norm": 0.15060648135302523, + "learning_rate": 1.5749224186284514e-05, + "loss": 0.6182, + "step": 2481 + }, + { + "epoch": 1.2269187986651835, + "grad_norm": 0.14703097056769063, + "learning_rate": 1.5746037736605454e-05, + "loss": 0.6494, + "step": 2482 + }, + { + "epoch": 1.2274131751328636, + "grad_norm": 0.15082110054177802, + "learning_rate": 1.574285041568915e-05, + "loss": 0.583, + "step": 2483 + }, + { + "epoch": 1.2279075516005438, + "grad_norm": 0.13102295070885348, + "learning_rate": 1.5739662224018863e-05, + "loss": 0.6163, + "step": 2484 + }, + { + "epoch": 1.228401928068224, + "grad_norm": 0.15311993611247512, + "learning_rate": 1.5736473162078017e-05, + "loss": 0.6002, + "step": 2485 + }, + { + "epoch": 1.228896304535904, + "grad_norm": 0.14683550498474482, + "learning_rate": 1.573328323035014e-05, + "loss": 0.669, + "step": 2486 + }, + { + "epoch": 1.2293906810035842, + "grad_norm": 0.136054338079254, + "learning_rate": 1.57300924293189e-05, + "loss": 0.614, + "step": 2487 + }, + { + "epoch": 1.2298850574712643, + "grad_norm": 0.14441727667619858, + "learning_rate": 1.5726900759468104e-05, + "loss": 0.6123, + "step": 2488 + }, + { + "epoch": 1.2303794339389444, + "grad_norm": 0.14056112638478835, + "learning_rate": 1.5723708221281688e-05, + "loss": 0.6075, + "step": 2489 + }, + { + "epoch": 1.2308738104066246, + "grad_norm": 0.14582999665120977, + "learning_rate": 1.5720514815243714e-05, + "loss": 0.6211, + "step": 2490 + }, + { + "epoch": 1.2313681868743047, + "grad_norm": 0.14624786229973238, + "learning_rate": 1.5717320541838378e-05, + "loss": 0.6236, + "step": 2491 + }, + { + "epoch": 1.2318625633419849, + "grad_norm": 0.1477039834758759, + "learning_rate": 1.571412540155001e-05, + "loss": 0.6782, + "step": 2492 + }, + { + "epoch": 1.232356939809665, + "grad_norm": 0.14191633534147954, + "learning_rate": 1.571092939486307e-05, + "loss": 0.5775, + "step": 2493 + }, + { + "epoch": 1.2328513162773451, + "grad_norm": 0.13695343005109267, + "learning_rate": 1.5707732522262148e-05, + "loss": 0.6459, + "step": 2494 + }, + { + "epoch": 1.2333456927450253, + "grad_norm": 0.151164483807481, + "learning_rate": 1.5704534784231964e-05, + "loss": 0.6237, + "step": 2495 + }, + { + "epoch": 1.2338400692127054, + "grad_norm": 0.14878380824132698, + "learning_rate": 1.570133618125738e-05, + "loss": 0.6277, + "step": 2496 + }, + { + "epoch": 1.2343344456803855, + "grad_norm": 0.13822224425357524, + "learning_rate": 1.569813671382338e-05, + "loss": 0.5997, + "step": 2497 + }, + { + "epoch": 1.2348288221480657, + "grad_norm": 0.13932443441840772, + "learning_rate": 1.569493638241507e-05, + "loss": 0.6087, + "step": 2498 + }, + { + "epoch": 1.2353231986157458, + "grad_norm": 0.14545465689727527, + "learning_rate": 1.5691735187517706e-05, + "loss": 0.6043, + "step": 2499 + }, + { + "epoch": 1.235817575083426, + "grad_norm": 0.13984246968894812, + "learning_rate": 1.5688533129616665e-05, + "loss": 0.6297, + "step": 2500 + }, + { + "epoch": 1.236311951551106, + "grad_norm": 0.14683590078572462, + "learning_rate": 1.5685330209197452e-05, + "loss": 0.6289, + "step": 2501 + }, + { + "epoch": 1.2368063280187864, + "grad_norm": 0.14065270084004208, + "learning_rate": 1.5682126426745714e-05, + "loss": 0.6228, + "step": 2502 + }, + { + "epoch": 1.2373007044864663, + "grad_norm": 0.15384481399507285, + "learning_rate": 1.567892178274721e-05, + "loss": 0.6635, + "step": 2503 + }, + { + "epoch": 1.2377950809541467, + "grad_norm": 0.1701964230989054, + "learning_rate": 1.5675716277687853e-05, + "loss": 0.5895, + "step": 2504 + }, + { + "epoch": 1.2382894574218266, + "grad_norm": 0.13569192285453066, + "learning_rate": 1.5672509912053664e-05, + "loss": 0.6022, + "step": 2505 + }, + { + "epoch": 1.238783833889507, + "grad_norm": 0.13735871583528728, + "learning_rate": 1.5669302686330812e-05, + "loss": 0.658, + "step": 2506 + }, + { + "epoch": 1.2392782103571869, + "grad_norm": 0.14467576383364536, + "learning_rate": 1.566609460100559e-05, + "loss": 0.6094, + "step": 2507 + }, + { + "epoch": 1.2397725868248672, + "grad_norm": 0.14504283097752776, + "learning_rate": 1.5662885656564414e-05, + "loss": 0.6136, + "step": 2508 + }, + { + "epoch": 1.2402669632925472, + "grad_norm": 0.14683563189782062, + "learning_rate": 1.5659675853493844e-05, + "loss": 0.6451, + "step": 2509 + }, + { + "epoch": 1.2407613397602275, + "grad_norm": 0.14734802928395577, + "learning_rate": 1.5656465192280558e-05, + "loss": 0.6385, + "step": 2510 + }, + { + "epoch": 1.2412557162279076, + "grad_norm": 0.14499492578796355, + "learning_rate": 1.5653253673411372e-05, + "loss": 0.6282, + "step": 2511 + }, + { + "epoch": 1.2417500926955878, + "grad_norm": 0.14994057301065658, + "learning_rate": 1.565004129737323e-05, + "loss": 0.6156, + "step": 2512 + }, + { + "epoch": 1.242244469163268, + "grad_norm": 0.13445759727435608, + "learning_rate": 1.5646828064653202e-05, + "loss": 0.6891, + "step": 2513 + }, + { + "epoch": 1.242738845630948, + "grad_norm": 0.14186159591997188, + "learning_rate": 1.5643613975738495e-05, + "loss": 0.635, + "step": 2514 + }, + { + "epoch": 1.2432332220986282, + "grad_norm": 0.14350753337521133, + "learning_rate": 1.564039903111644e-05, + "loss": 0.661, + "step": 2515 + }, + { + "epoch": 1.2437275985663083, + "grad_norm": 0.15638561403200035, + "learning_rate": 1.56371832312745e-05, + "loss": 0.6075, + "step": 2516 + }, + { + "epoch": 1.2442219750339885, + "grad_norm": 0.13787390520475196, + "learning_rate": 1.5633966576700265e-05, + "loss": 0.6443, + "step": 2517 + }, + { + "epoch": 1.2447163515016686, + "grad_norm": 0.14639521997411764, + "learning_rate": 1.5630749067881464e-05, + "loss": 0.6365, + "step": 2518 + }, + { + "epoch": 1.2452107279693487, + "grad_norm": 0.1386709065361384, + "learning_rate": 1.5627530705305946e-05, + "loss": 0.6183, + "step": 2519 + }, + { + "epoch": 1.2457051044370289, + "grad_norm": 0.1476255999656282, + "learning_rate": 1.5624311489461684e-05, + "loss": 0.6658, + "step": 2520 + }, + { + "epoch": 1.246199480904709, + "grad_norm": 0.14466703874156606, + "learning_rate": 1.56210914208368e-05, + "loss": 0.6391, + "step": 2521 + }, + { + "epoch": 1.2466938573723891, + "grad_norm": 0.13923107234743248, + "learning_rate": 1.5617870499919526e-05, + "loss": 0.6282, + "step": 2522 + }, + { + "epoch": 1.2471882338400693, + "grad_norm": 0.13331677278771353, + "learning_rate": 1.5614648727198232e-05, + "loss": 0.6334, + "step": 2523 + }, + { + "epoch": 1.2476826103077494, + "grad_norm": 0.14267919969906634, + "learning_rate": 1.561142610316142e-05, + "loss": 0.63, + "step": 2524 + }, + { + "epoch": 1.2481769867754295, + "grad_norm": 0.14049653734789688, + "learning_rate": 1.5608202628297713e-05, + "loss": 0.636, + "step": 2525 + }, + { + "epoch": 1.2486713632431097, + "grad_norm": 0.14510593616841141, + "learning_rate": 1.5604978303095867e-05, + "loss": 0.6385, + "step": 2526 + }, + { + "epoch": 1.2491657397107898, + "grad_norm": 0.14239488311847684, + "learning_rate": 1.5601753128044773e-05, + "loss": 0.6602, + "step": 2527 + }, + { + "epoch": 1.24966011617847, + "grad_norm": 0.1352365365646187, + "learning_rate": 1.559852710363344e-05, + "loss": 0.6132, + "step": 2528 + }, + { + "epoch": 1.25015449264615, + "grad_norm": 0.140510217948912, + "learning_rate": 1.559530023035101e-05, + "loss": 0.6295, + "step": 2529 + }, + { + "epoch": 1.2506488691138302, + "grad_norm": 0.13649397880422728, + "learning_rate": 1.5592072508686754e-05, + "loss": 0.6385, + "step": 2530 + }, + { + "epoch": 1.2506488691138302, + "eval_loss": 0.6708462238311768, + "eval_runtime": 81.6997, + "eval_samples_per_second": 371.531, + "eval_steps_per_second": 46.451, + "step": 2530 + }, + { + "epoch": 1.2511432455815104, + "grad_norm": 0.1386021064931651, + "learning_rate": 1.5588843939130077e-05, + "loss": 0.6002, + "step": 2531 + }, + { + "epoch": 1.2516376220491905, + "grad_norm": 0.13524382462127765, + "learning_rate": 1.5585614522170506e-05, + "loss": 0.6237, + "step": 2532 + }, + { + "epoch": 1.2521319985168706, + "grad_norm": 0.14654364403518044, + "learning_rate": 1.5582384258297694e-05, + "loss": 0.6335, + "step": 2533 + }, + { + "epoch": 1.2526263749845508, + "grad_norm": 0.138031757762939, + "learning_rate": 1.557915314800143e-05, + "loss": 0.5848, + "step": 2534 + }, + { + "epoch": 1.253120751452231, + "grad_norm": 0.13904920942136653, + "learning_rate": 1.557592119177163e-05, + "loss": 0.5951, + "step": 2535 + }, + { + "epoch": 1.253615127919911, + "grad_norm": 0.13517785324744525, + "learning_rate": 1.5572688390098328e-05, + "loss": 0.6614, + "step": 2536 + }, + { + "epoch": 1.2541095043875912, + "grad_norm": 0.13916628408820372, + "learning_rate": 1.5569454743471702e-05, + "loss": 0.6067, + "step": 2537 + }, + { + "epoch": 1.2546038808552713, + "grad_norm": 0.13898396146692957, + "learning_rate": 1.556622025238205e-05, + "loss": 0.6337, + "step": 2538 + }, + { + "epoch": 1.2550982573229514, + "grad_norm": 0.13551315489290536, + "learning_rate": 1.5562984917319795e-05, + "loss": 0.6335, + "step": 2539 + }, + { + "epoch": 1.2555926337906316, + "grad_norm": 0.22744605857166902, + "learning_rate": 1.5559748738775493e-05, + "loss": 0.6026, + "step": 2540 + }, + { + "epoch": 1.2560870102583117, + "grad_norm": 0.25868193461949335, + "learning_rate": 1.5556511717239828e-05, + "loss": 0.6095, + "step": 2541 + }, + { + "epoch": 1.2565813867259918, + "grad_norm": 0.14308075934895292, + "learning_rate": 1.5553273853203608e-05, + "loss": 0.5829, + "step": 2542 + }, + { + "epoch": 1.257075763193672, + "grad_norm": 0.14596513831975186, + "learning_rate": 1.555003514715777e-05, + "loss": 0.6437, + "step": 2543 + }, + { + "epoch": 1.2575701396613521, + "grad_norm": 0.14252224775056563, + "learning_rate": 1.554679559959338e-05, + "loss": 0.6407, + "step": 2544 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.14158736084162638, + "learning_rate": 1.5543555211001638e-05, + "loss": 0.6108, + "step": 2545 + }, + { + "epoch": 1.2585588925967124, + "grad_norm": 0.14039755720826214, + "learning_rate": 1.5540313981873853e-05, + "loss": 0.5885, + "step": 2546 + }, + { + "epoch": 1.2590532690643925, + "grad_norm": 0.14043503307422267, + "learning_rate": 1.5537071912701482e-05, + "loss": 0.6375, + "step": 2547 + }, + { + "epoch": 1.2595476455320727, + "grad_norm": 0.14383970588758224, + "learning_rate": 1.5533829003976098e-05, + "loss": 0.6378, + "step": 2548 + }, + { + "epoch": 1.2600420219997528, + "grad_norm": 0.14106744259919662, + "learning_rate": 1.55305852561894e-05, + "loss": 0.6546, + "step": 2549 + }, + { + "epoch": 1.260536398467433, + "grad_norm": 0.1535948002767365, + "learning_rate": 1.5527340669833227e-05, + "loss": 0.6157, + "step": 2550 + }, + { + "epoch": 1.261030774935113, + "grad_norm": 0.14369507466416345, + "learning_rate": 1.5524095245399525e-05, + "loss": 0.6487, + "step": 2551 + }, + { + "epoch": 1.2615251514027932, + "grad_norm": 0.14332927081886265, + "learning_rate": 1.5520848983380386e-05, + "loss": 0.6073, + "step": 2552 + }, + { + "epoch": 1.2620195278704733, + "grad_norm": 0.16549042218543994, + "learning_rate": 1.5517601884268022e-05, + "loss": 0.6357, + "step": 2553 + }, + { + "epoch": 1.2625139043381535, + "grad_norm": 0.14076008566632517, + "learning_rate": 1.5514353948554765e-05, + "loss": 0.5975, + "step": 2554 + }, + { + "epoch": 1.2630082808058336, + "grad_norm": 0.14494896298047008, + "learning_rate": 1.5511105176733084e-05, + "loss": 0.6627, + "step": 2555 + }, + { + "epoch": 1.2635026572735137, + "grad_norm": 0.1399758914735637, + "learning_rate": 1.550785556929557e-05, + "loss": 0.639, + "step": 2556 + }, + { + "epoch": 1.2639970337411939, + "grad_norm": 0.14930664744509461, + "learning_rate": 1.550460512673494e-05, + "loss": 0.6255, + "step": 2557 + }, + { + "epoch": 1.264491410208874, + "grad_norm": 0.18573599945970423, + "learning_rate": 1.5501353849544046e-05, + "loss": 0.6208, + "step": 2558 + }, + { + "epoch": 1.2649857866765541, + "grad_norm": 0.13911633261337106, + "learning_rate": 1.5498101738215847e-05, + "loss": 0.6413, + "step": 2559 + }, + { + "epoch": 1.2654801631442343, + "grad_norm": 0.14670574629325023, + "learning_rate": 1.5494848793243456e-05, + "loss": 0.6548, + "step": 2560 + }, + { + "epoch": 1.2659745396119144, + "grad_norm": 0.1690704459390366, + "learning_rate": 1.5491595015120086e-05, + "loss": 0.6176, + "step": 2561 + }, + { + "epoch": 1.2664689160795946, + "grad_norm": 0.13647408926373167, + "learning_rate": 1.548834040433909e-05, + "loss": 0.5948, + "step": 2562 + }, + { + "epoch": 1.2669632925472747, + "grad_norm": 0.14002211956842767, + "learning_rate": 1.548508496139395e-05, + "loss": 0.6236, + "step": 2563 + }, + { + "epoch": 1.2674576690149548, + "grad_norm": 0.15481633453796592, + "learning_rate": 1.5481828686778266e-05, + "loss": 0.642, + "step": 2564 + }, + { + "epoch": 1.267952045482635, + "grad_norm": 0.13870775712798988, + "learning_rate": 1.547857158098577e-05, + "loss": 0.6365, + "step": 2565 + }, + { + "epoch": 1.268446421950315, + "grad_norm": 0.13626849964116833, + "learning_rate": 1.547531364451031e-05, + "loss": 0.6261, + "step": 2566 + }, + { + "epoch": 1.2689407984179952, + "grad_norm": 0.14139986890098083, + "learning_rate": 1.5472054877845876e-05, + "loss": 0.6718, + "step": 2567 + }, + { + "epoch": 1.2694351748856754, + "grad_norm": 0.14631833130801897, + "learning_rate": 1.546879528148657e-05, + "loss": 0.638, + "step": 2568 + }, + { + "epoch": 1.2699295513533555, + "grad_norm": 0.14795192487863046, + "learning_rate": 1.5465534855926626e-05, + "loss": 0.5996, + "step": 2569 + }, + { + "epoch": 1.2704239278210356, + "grad_norm": 0.14647766131417952, + "learning_rate": 1.5462273601660407e-05, + "loss": 0.6746, + "step": 2570 + }, + { + "epoch": 1.270918304288716, + "grad_norm": 0.14024745123001472, + "learning_rate": 1.5459011519182393e-05, + "loss": 0.62, + "step": 2571 + }, + { + "epoch": 1.271412680756396, + "grad_norm": 0.1398531949150052, + "learning_rate": 1.5455748608987192e-05, + "loss": 0.575, + "step": 2572 + }, + { + "epoch": 1.2719070572240763, + "grad_norm": 0.1368214731653041, + "learning_rate": 1.5452484871569545e-05, + "loss": 0.6066, + "step": 2573 + }, + { + "epoch": 1.2724014336917562, + "grad_norm": 0.13540208038533658, + "learning_rate": 1.5449220307424312e-05, + "loss": 0.6177, + "step": 2574 + }, + { + "epoch": 1.2728958101594365, + "grad_norm": 0.14898568382240043, + "learning_rate": 1.5445954917046477e-05, + "loss": 0.6308, + "step": 2575 + }, + { + "epoch": 1.2733901866271164, + "grad_norm": 0.13874229470785393, + "learning_rate": 1.5442688700931152e-05, + "loss": 0.6357, + "step": 2576 + }, + { + "epoch": 1.2738845630947968, + "grad_norm": 0.14346419197933352, + "learning_rate": 1.543942165957357e-05, + "loss": 0.6051, + "step": 2577 + }, + { + "epoch": 1.2743789395624767, + "grad_norm": 0.13730250191080184, + "learning_rate": 1.5436153793469102e-05, + "loss": 0.5729, + "step": 2578 + }, + { + "epoch": 1.274873316030157, + "grad_norm": 0.13839734200676698, + "learning_rate": 1.543288510311323e-05, + "loss": 0.6405, + "step": 2579 + }, + { + "epoch": 1.275367692497837, + "grad_norm": 0.1506975166079922, + "learning_rate": 1.542961558900156e-05, + "loss": 0.6283, + "step": 2580 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 0.13762449636108967, + "learning_rate": 1.542634525162984e-05, + "loss": 0.6323, + "step": 2581 + }, + { + "epoch": 1.2763564454331973, + "grad_norm": 0.1541223045222392, + "learning_rate": 1.5423074091493928e-05, + "loss": 0.6056, + "step": 2582 + }, + { + "epoch": 1.2768508219008776, + "grad_norm": 0.13925169781468724, + "learning_rate": 1.5419802109089803e-05, + "loss": 0.6331, + "step": 2583 + }, + { + "epoch": 1.2773451983685575, + "grad_norm": 0.14241592081083546, + "learning_rate": 1.541652930491359e-05, + "loss": 0.6258, + "step": 2584 + }, + { + "epoch": 1.277839574836238, + "grad_norm": 0.1448324000137307, + "learning_rate": 1.5413255679461506e-05, + "loss": 0.6333, + "step": 2585 + }, + { + "epoch": 1.2783339513039178, + "grad_norm": 0.1384015047117146, + "learning_rate": 1.540998123322993e-05, + "loss": 0.6148, + "step": 2586 + }, + { + "epoch": 1.2788283277715982, + "grad_norm": 0.13437524813047805, + "learning_rate": 1.540670596671533e-05, + "loss": 0.6072, + "step": 2587 + }, + { + "epoch": 1.279322704239278, + "grad_norm": 0.14435601455105537, + "learning_rate": 1.540342988041433e-05, + "loss": 0.6176, + "step": 2588 + }, + { + "epoch": 1.2798170807069584, + "grad_norm": 0.13410222617125264, + "learning_rate": 1.5400152974823653e-05, + "loss": 0.6238, + "step": 2589 + }, + { + "epoch": 1.2803114571746386, + "grad_norm": 0.1525895624810332, + "learning_rate": 1.5396875250440168e-05, + "loss": 0.6291, + "step": 2590 + }, + { + "epoch": 1.2808058336423187, + "grad_norm": 0.1406106113851172, + "learning_rate": 1.539359670776084e-05, + "loss": 0.6172, + "step": 2591 + }, + { + "epoch": 1.2813002101099988, + "grad_norm": 0.4439155806888791, + "learning_rate": 1.5390317347282787e-05, + "loss": 0.6188, + "step": 2592 + }, + { + "epoch": 1.281794586577679, + "grad_norm": 0.13590904862775058, + "learning_rate": 1.5387037169503237e-05, + "loss": 0.6102, + "step": 2593 + }, + { + "epoch": 1.282288963045359, + "grad_norm": 0.13999382123944182, + "learning_rate": 1.538375617491954e-05, + "loss": 0.6419, + "step": 2594 + }, + { + "epoch": 1.2827833395130392, + "grad_norm": 0.14445728728957644, + "learning_rate": 1.5380474364029175e-05, + "loss": 0.6297, + "step": 2595 + }, + { + "epoch": 1.2832777159807194, + "grad_norm": 0.13879817806443046, + "learning_rate": 1.5377191737329744e-05, + "loss": 0.6544, + "step": 2596 + }, + { + "epoch": 1.2837720924483995, + "grad_norm": 0.13757628065774324, + "learning_rate": 1.5373908295318973e-05, + "loss": 0.6345, + "step": 2597 + }, + { + "epoch": 1.2842664689160797, + "grad_norm": 0.14240477895336262, + "learning_rate": 1.537062403849471e-05, + "loss": 0.6145, + "step": 2598 + }, + { + "epoch": 1.2847608453837598, + "grad_norm": 0.14034918866147325, + "learning_rate": 1.5367338967354924e-05, + "loss": 0.6518, + "step": 2599 + }, + { + "epoch": 1.28525522185144, + "grad_norm": 0.13707246071393556, + "learning_rate": 1.5364053082397717e-05, + "loss": 0.6637, + "step": 2600 + }, + { + "epoch": 1.28574959831912, + "grad_norm": 0.1457178965843136, + "learning_rate": 1.5360766384121304e-05, + "loss": 0.6196, + "step": 2601 + }, + { + "epoch": 1.2862439747868002, + "grad_norm": 0.13996938337573278, + "learning_rate": 1.5357478873024024e-05, + "loss": 0.5968, + "step": 2602 + }, + { + "epoch": 1.2867383512544803, + "grad_norm": 0.1403790749788397, + "learning_rate": 1.535419054960435e-05, + "loss": 0.626, + "step": 2603 + }, + { + "epoch": 1.2872327277221605, + "grad_norm": 0.13884819228639322, + "learning_rate": 1.535090141436087e-05, + "loss": 0.6144, + "step": 2604 + }, + { + "epoch": 1.2877271041898406, + "grad_norm": 0.13097516510511154, + "learning_rate": 1.5347611467792284e-05, + "loss": 0.6371, + "step": 2605 + }, + { + "epoch": 1.2882214806575207, + "grad_norm": 0.13819334442890074, + "learning_rate": 1.5344320710397442e-05, + "loss": 0.6132, + "step": 2606 + }, + { + "epoch": 1.2887158571252009, + "grad_norm": 0.14728948323349425, + "learning_rate": 1.5341029142675297e-05, + "loss": 0.6448, + "step": 2607 + }, + { + "epoch": 1.289210233592881, + "grad_norm": 0.14615600674863044, + "learning_rate": 1.5337736765124925e-05, + "loss": 0.6565, + "step": 2608 + }, + { + "epoch": 1.2897046100605611, + "grad_norm": 0.13642055187665644, + "learning_rate": 1.5334443578245535e-05, + "loss": 0.6582, + "step": 2609 + }, + { + "epoch": 1.2901989865282413, + "grad_norm": 0.13841940892066296, + "learning_rate": 1.5331149582536447e-05, + "loss": 0.6528, + "step": 2610 + }, + { + "epoch": 1.2906933629959214, + "grad_norm": 0.1374406746538801, + "learning_rate": 1.532785477849712e-05, + "loss": 0.6212, + "step": 2611 + }, + { + "epoch": 1.2911877394636015, + "grad_norm": 0.14201072569521214, + "learning_rate": 1.5324559166627115e-05, + "loss": 0.6532, + "step": 2612 + }, + { + "epoch": 1.2916821159312817, + "grad_norm": 0.14349669908977822, + "learning_rate": 1.532126274742613e-05, + "loss": 0.6471, + "step": 2613 + }, + { + "epoch": 1.2921764923989618, + "grad_norm": 0.13696380680729658, + "learning_rate": 1.5317965521393982e-05, + "loss": 0.6694, + "step": 2614 + }, + { + "epoch": 1.292670868866642, + "grad_norm": 0.15024295317352693, + "learning_rate": 1.531466748903061e-05, + "loss": 0.6058, + "step": 2615 + }, + { + "epoch": 1.293165245334322, + "grad_norm": 0.14957334306393283, + "learning_rate": 1.5311368650836077e-05, + "loss": 0.6223, + "step": 2616 + }, + { + "epoch": 1.2936596218020022, + "grad_norm": 0.14411475760993847, + "learning_rate": 1.5308069007310557e-05, + "loss": 0.6096, + "step": 2617 + }, + { + "epoch": 1.2941539982696824, + "grad_norm": 0.1466262000533132, + "learning_rate": 1.530476855895436e-05, + "loss": 0.6681, + "step": 2618 + }, + { + "epoch": 1.2946483747373625, + "grad_norm": 0.14175760074848914, + "learning_rate": 1.530146730626792e-05, + "loss": 0.639, + "step": 2619 + }, + { + "epoch": 1.2951427512050426, + "grad_norm": 0.1396696281503014, + "learning_rate": 1.5298165249751777e-05, + "loss": 0.6352, + "step": 2620 + }, + { + "epoch": 1.2956371276727228, + "grad_norm": 0.1451535229578902, + "learning_rate": 1.5294862389906607e-05, + "loss": 0.6349, + "step": 2621 + }, + { + "epoch": 1.296131504140403, + "grad_norm": 0.14473332574141615, + "learning_rate": 1.5291558727233198e-05, + "loss": 0.6178, + "step": 2622 + }, + { + "epoch": 1.296625880608083, + "grad_norm": 0.13437191679740196, + "learning_rate": 1.5288254262232474e-05, + "loss": 0.6363, + "step": 2623 + }, + { + "epoch": 1.2971202570757632, + "grad_norm": 0.17163623659831154, + "learning_rate": 1.5284948995405457e-05, + "loss": 0.6393, + "step": 2624 + }, + { + "epoch": 1.2976146335434433, + "grad_norm": 0.14268836395049697, + "learning_rate": 1.5281642927253318e-05, + "loss": 0.5923, + "step": 2625 + }, + { + "epoch": 1.2981090100111234, + "grad_norm": 0.14729540905208846, + "learning_rate": 1.527833605827733e-05, + "loss": 0.6214, + "step": 2626 + }, + { + "epoch": 1.2986033864788036, + "grad_norm": 0.1575057478633641, + "learning_rate": 1.5275028388978897e-05, + "loss": 0.6513, + "step": 2627 + }, + { + "epoch": 1.2990977629464837, + "grad_norm": 0.149193895678932, + "learning_rate": 1.5271719919859536e-05, + "loss": 0.6106, + "step": 2628 + }, + { + "epoch": 1.2995921394141638, + "grad_norm": 0.14496027268070188, + "learning_rate": 1.526841065142089e-05, + "loss": 0.6315, + "step": 2629 + }, + { + "epoch": 1.300086515881844, + "grad_norm": 0.15913776566336774, + "learning_rate": 1.5265100584164733e-05, + "loss": 0.6362, + "step": 2630 + }, + { + "epoch": 1.3005808923495241, + "grad_norm": 0.14379100223844468, + "learning_rate": 1.5261789718592944e-05, + "loss": 0.6614, + "step": 2631 + }, + { + "epoch": 1.3010752688172043, + "grad_norm": 0.14760285069278206, + "learning_rate": 1.5258478055207527e-05, + "loss": 0.5921, + "step": 2632 + }, + { + "epoch": 1.3015696452848844, + "grad_norm": 0.13766322164408665, + "learning_rate": 1.5255165594510615e-05, + "loss": 0.638, + "step": 2633 + }, + { + "epoch": 1.3020640217525645, + "grad_norm": 0.1447533532922998, + "learning_rate": 1.5251852337004454e-05, + "loss": 0.6677, + "step": 2634 + }, + { + "epoch": 1.3025583982202447, + "grad_norm": 0.14777675129461026, + "learning_rate": 1.5248538283191409e-05, + "loss": 0.6673, + "step": 2635 + }, + { + "epoch": 1.3030527746879248, + "grad_norm": 0.14274296522894311, + "learning_rate": 1.524522343357398e-05, + "loss": 0.6085, + "step": 2636 + }, + { + "epoch": 1.303547151155605, + "grad_norm": 0.1497730372200368, + "learning_rate": 1.524190778865477e-05, + "loss": 0.5671, + "step": 2637 + }, + { + "epoch": 1.304041527623285, + "grad_norm": 0.1345721859402317, + "learning_rate": 1.5238591348936516e-05, + "loss": 0.5997, + "step": 2638 + }, + { + "epoch": 1.3045359040909652, + "grad_norm": 0.14346293940028548, + "learning_rate": 1.5235274114922063e-05, + "loss": 0.6596, + "step": 2639 + }, + { + "epoch": 1.3050302805586453, + "grad_norm": 0.1550351773252653, + "learning_rate": 1.523195608711439e-05, + "loss": 0.6312, + "step": 2640 + }, + { + "epoch": 1.3055246570263255, + "grad_norm": 0.13342803024589914, + "learning_rate": 1.5228637266016585e-05, + "loss": 0.5998, + "step": 2641 + }, + { + "epoch": 1.3060190334940056, + "grad_norm": 0.13755813980028733, + "learning_rate": 1.5225317652131865e-05, + "loss": 0.6307, + "step": 2642 + }, + { + "epoch": 1.3065134099616857, + "grad_norm": 0.1437400887699555, + "learning_rate": 1.522199724596356e-05, + "loss": 0.6048, + "step": 2643 + }, + { + "epoch": 1.3070077864293659, + "grad_norm": 0.1437343032322997, + "learning_rate": 1.5218676048015125e-05, + "loss": 0.5909, + "step": 2644 + }, + { + "epoch": 1.307502162897046, + "grad_norm": 0.13477316380199056, + "learning_rate": 1.5215354058790128e-05, + "loss": 0.6192, + "step": 2645 + }, + { + "epoch": 1.3079965393647264, + "grad_norm": 0.1441386343534566, + "learning_rate": 1.5212031278792273e-05, + "loss": 0.643, + "step": 2646 + }, + { + "epoch": 1.3084909158324063, + "grad_norm": 0.15623854511045798, + "learning_rate": 1.520870770852536e-05, + "loss": 0.6506, + "step": 2647 + }, + { + "epoch": 1.3089852923000866, + "grad_norm": 0.1438962525194523, + "learning_rate": 1.5205383348493334e-05, + "loss": 0.6458, + "step": 2648 + }, + { + "epoch": 1.3094796687677666, + "grad_norm": 0.13981372011937748, + "learning_rate": 1.5202058199200243e-05, + "loss": 0.6712, + "step": 2649 + }, + { + "epoch": 1.309974045235447, + "grad_norm": 0.14795021984231305, + "learning_rate": 1.5198732261150258e-05, + "loss": 0.6392, + "step": 2650 + }, + { + "epoch": 1.3104684217031268, + "grad_norm": 0.15153911195262035, + "learning_rate": 1.519540553484767e-05, + "loss": 0.635, + "step": 2651 + }, + { + "epoch": 1.3109627981708072, + "grad_norm": 0.13668481859744097, + "learning_rate": 1.5192078020796896e-05, + "loss": 0.5953, + "step": 2652 + }, + { + "epoch": 1.311457174638487, + "grad_norm": 0.14773649113602755, + "learning_rate": 1.5188749719502462e-05, + "loss": 0.6226, + "step": 2653 + }, + { + "epoch": 1.3119515511061675, + "grad_norm": 0.13547056708617403, + "learning_rate": 1.5185420631469022e-05, + "loss": 0.6335, + "step": 2654 + }, + { + "epoch": 1.3124459275738474, + "grad_norm": 0.1502449542061567, + "learning_rate": 1.518209075720134e-05, + "loss": 0.6092, + "step": 2655 + }, + { + "epoch": 1.3129403040415277, + "grad_norm": 0.1357271487411288, + "learning_rate": 1.5178760097204315e-05, + "loss": 0.617, + "step": 2656 + }, + { + "epoch": 1.3134346805092076, + "grad_norm": 0.13245255488239638, + "learning_rate": 1.5175428651982942e-05, + "loss": 0.6371, + "step": 2657 + }, + { + "epoch": 1.313929056976888, + "grad_norm": 0.13972692991875632, + "learning_rate": 1.517209642204236e-05, + "loss": 0.6545, + "step": 2658 + }, + { + "epoch": 1.314423433444568, + "grad_norm": 0.14071962103335395, + "learning_rate": 1.5168763407887808e-05, + "loss": 0.6179, + "step": 2659 + }, + { + "epoch": 1.3149178099122483, + "grad_norm": 0.132132746485195, + "learning_rate": 1.5165429610024651e-05, + "loss": 0.6341, + "step": 2660 + }, + { + "epoch": 1.3154121863799282, + "grad_norm": 0.13949167891068043, + "learning_rate": 1.5162095028958377e-05, + "loss": 0.6093, + "step": 2661 + }, + { + "epoch": 1.3159065628476085, + "grad_norm": 0.13542114841923122, + "learning_rate": 1.5158759665194585e-05, + "loss": 0.6463, + "step": 2662 + }, + { + "epoch": 1.3164009393152885, + "grad_norm": 0.13891251571279195, + "learning_rate": 1.5155423519238998e-05, + "loss": 0.6074, + "step": 2663 + }, + { + "epoch": 1.3168953157829688, + "grad_norm": 0.13601117913518881, + "learning_rate": 1.5152086591597455e-05, + "loss": 0.6276, + "step": 2664 + }, + { + "epoch": 1.317389692250649, + "grad_norm": 0.13861139319417135, + "learning_rate": 1.5148748882775914e-05, + "loss": 0.6089, + "step": 2665 + }, + { + "epoch": 1.317884068718329, + "grad_norm": 0.1329663745685746, + "learning_rate": 1.5145410393280453e-05, + "loss": 0.6192, + "step": 2666 + }, + { + "epoch": 1.3183784451860092, + "grad_norm": 0.14294731750628678, + "learning_rate": 1.5142071123617262e-05, + "loss": 0.6542, + "step": 2667 + }, + { + "epoch": 1.3188728216536894, + "grad_norm": 0.1543362118928087, + "learning_rate": 1.5138731074292663e-05, + "loss": 0.6607, + "step": 2668 + }, + { + "epoch": 1.3193671981213695, + "grad_norm": 0.15870614949684428, + "learning_rate": 1.5135390245813085e-05, + "loss": 0.6728, + "step": 2669 + }, + { + "epoch": 1.3198615745890496, + "grad_norm": 0.1451259022555469, + "learning_rate": 1.5132048638685073e-05, + "loss": 0.6255, + "step": 2670 + }, + { + "epoch": 1.3203559510567298, + "grad_norm": 0.1410894642461328, + "learning_rate": 1.51287062534153e-05, + "loss": 0.634, + "step": 2671 + }, + { + "epoch": 1.32085032752441, + "grad_norm": 0.13775805024790364, + "learning_rate": 1.5125363090510549e-05, + "loss": 0.6358, + "step": 2672 + }, + { + "epoch": 1.32134470399209, + "grad_norm": 0.1438973502300624, + "learning_rate": 1.5122019150477724e-05, + "loss": 0.6071, + "step": 2673 + }, + { + "epoch": 1.3218390804597702, + "grad_norm": 0.1448952973320633, + "learning_rate": 1.5118674433823848e-05, + "loss": 0.6246, + "step": 2674 + }, + { + "epoch": 1.3223334569274503, + "grad_norm": 0.13983662290680998, + "learning_rate": 1.511532894105606e-05, + "loss": 0.5841, + "step": 2675 + }, + { + "epoch": 1.3228278333951304, + "grad_norm": 0.13568524267338533, + "learning_rate": 1.5111982672681618e-05, + "loss": 0.6234, + "step": 2676 + }, + { + "epoch": 1.3233222098628106, + "grad_norm": 0.1403828259720639, + "learning_rate": 1.5108635629207893e-05, + "loss": 0.6555, + "step": 2677 + }, + { + "epoch": 1.3238165863304907, + "grad_norm": 0.14200417687189748, + "learning_rate": 1.5105287811142381e-05, + "loss": 0.6208, + "step": 2678 + }, + { + "epoch": 1.3243109627981708, + "grad_norm": 0.13837422620465614, + "learning_rate": 1.5101939218992688e-05, + "loss": 0.6519, + "step": 2679 + }, + { + "epoch": 1.324805339265851, + "grad_norm": 0.14683804119147292, + "learning_rate": 1.5098589853266545e-05, + "loss": 0.6571, + "step": 2680 + }, + { + "epoch": 1.3252997157335311, + "grad_norm": 0.13937246858788288, + "learning_rate": 1.509523971447179e-05, + "loss": 0.6477, + "step": 2681 + }, + { + "epoch": 1.3257940922012112, + "grad_norm": 0.14689527155355003, + "learning_rate": 1.5091888803116392e-05, + "loss": 0.6272, + "step": 2682 + }, + { + "epoch": 1.3262884686688914, + "grad_norm": 0.1361827989150905, + "learning_rate": 1.5088537119708426e-05, + "loss": 0.6144, + "step": 2683 + }, + { + "epoch": 1.3267828451365715, + "grad_norm": 0.13962500734050787, + "learning_rate": 1.5085184664756087e-05, + "loss": 0.6517, + "step": 2684 + }, + { + "epoch": 1.3272772216042517, + "grad_norm": 0.14694808343322927, + "learning_rate": 1.5081831438767691e-05, + "loss": 0.5867, + "step": 2685 + }, + { + "epoch": 1.3277715980719318, + "grad_norm": 0.14244608190317284, + "learning_rate": 1.5078477442251665e-05, + "loss": 0.6054, + "step": 2686 + }, + { + "epoch": 1.328265974539612, + "grad_norm": 0.14380009362841584, + "learning_rate": 1.5075122675716548e-05, + "loss": 0.6461, + "step": 2687 + }, + { + "epoch": 1.328760351007292, + "grad_norm": 0.13811352325058984, + "learning_rate": 1.5071767139671018e-05, + "loss": 0.6526, + "step": 2688 + }, + { + "epoch": 1.3292547274749722, + "grad_norm": 0.14811996096886684, + "learning_rate": 1.5068410834623845e-05, + "loss": 0.64, + "step": 2689 + }, + { + "epoch": 1.3297491039426523, + "grad_norm": 0.15820923698761746, + "learning_rate": 1.5065053761083927e-05, + "loss": 0.5841, + "step": 2690 + }, + { + "epoch": 1.3302434804103325, + "grad_norm": 0.14570561396865048, + "learning_rate": 1.5061695919560282e-05, + "loss": 0.6292, + "step": 2691 + }, + { + "epoch": 1.3307378568780126, + "grad_norm": 0.13759090526818946, + "learning_rate": 1.505833731056203e-05, + "loss": 0.6779, + "step": 2692 + }, + { + "epoch": 1.3312322333456927, + "grad_norm": 0.15570882233131952, + "learning_rate": 1.5054977934598425e-05, + "loss": 0.6374, + "step": 2693 + }, + { + "epoch": 1.3317266098133729, + "grad_norm": 0.14591012248517188, + "learning_rate": 1.5051617792178822e-05, + "loss": 0.6224, + "step": 2694 + }, + { + "epoch": 1.332220986281053, + "grad_norm": 0.13906418223341888, + "learning_rate": 1.5048256883812706e-05, + "loss": 0.6349, + "step": 2695 + }, + { + "epoch": 1.3327153627487331, + "grad_norm": 0.14429093123628167, + "learning_rate": 1.504489521000967e-05, + "loss": 0.5926, + "step": 2696 + }, + { + "epoch": 1.3332097392164133, + "grad_norm": 0.13372861301154154, + "learning_rate": 1.5041532771279422e-05, + "loss": 0.6295, + "step": 2697 + }, + { + "epoch": 1.3337041156840934, + "grad_norm": 0.1395709952843268, + "learning_rate": 1.5038169568131786e-05, + "loss": 0.6085, + "step": 2698 + }, + { + "epoch": 1.3341984921517736, + "grad_norm": 0.16256507870553238, + "learning_rate": 1.503480560107671e-05, + "loss": 0.6439, + "step": 2699 + }, + { + "epoch": 1.3346928686194537, + "grad_norm": 0.14339340468895861, + "learning_rate": 1.5031440870624247e-05, + "loss": 0.6379, + "step": 2700 + }, + { + "epoch": 1.3351872450871338, + "grad_norm": 0.14905522773558946, + "learning_rate": 1.5028075377284576e-05, + "loss": 0.6219, + "step": 2701 + }, + { + "epoch": 1.335681621554814, + "grad_norm": 0.13509692046794955, + "learning_rate": 1.5024709121567988e-05, + "loss": 0.6088, + "step": 2702 + }, + { + "epoch": 1.336175998022494, + "grad_norm": 0.13772626882267658, + "learning_rate": 1.502134210398488e-05, + "loss": 0.66, + "step": 2703 + }, + { + "epoch": 1.3366703744901742, + "grad_norm": 0.15945068074905935, + "learning_rate": 1.501797432504578e-05, + "loss": 0.6246, + "step": 2704 + }, + { + "epoch": 1.3371647509578544, + "grad_norm": 0.13534580403979848, + "learning_rate": 1.5014605785261318e-05, + "loss": 0.6269, + "step": 2705 + }, + { + "epoch": 1.3376591274255345, + "grad_norm": 0.14904506794187952, + "learning_rate": 1.5011236485142249e-05, + "loss": 0.5736, + "step": 2706 + }, + { + "epoch": 1.3381535038932146, + "grad_norm": 0.14014290712357316, + "learning_rate": 1.5007866425199443e-05, + "loss": 0.6562, + "step": 2707 + }, + { + "epoch": 1.3386478803608948, + "grad_norm": 0.13362345187536182, + "learning_rate": 1.5004495605943877e-05, + "loss": 0.6248, + "step": 2708 + }, + { + "epoch": 1.339142256828575, + "grad_norm": 0.14029592400972019, + "learning_rate": 1.5001124027886649e-05, + "loss": 0.6475, + "step": 2709 + }, + { + "epoch": 1.339636633296255, + "grad_norm": 0.13873829946660748, + "learning_rate": 1.499775169153897e-05, + "loss": 0.6198, + "step": 2710 + }, + { + "epoch": 1.3401310097639352, + "grad_norm": 0.32877344201943653, + "learning_rate": 1.4994378597412171e-05, + "loss": 0.638, + "step": 2711 + }, + { + "epoch": 1.3406253862316153, + "grad_norm": 0.13892103589552743, + "learning_rate": 1.4991004746017692e-05, + "loss": 0.6419, + "step": 2712 + }, + { + "epoch": 1.3411197626992954, + "grad_norm": 0.13861538854997954, + "learning_rate": 1.4987630137867091e-05, + "loss": 0.6146, + "step": 2713 + }, + { + "epoch": 1.3416141391669756, + "grad_norm": 0.13902366771199812, + "learning_rate": 1.4984254773472033e-05, + "loss": 0.6517, + "step": 2714 + }, + { + "epoch": 1.3421085156346557, + "grad_norm": 0.1339472424719849, + "learning_rate": 1.4980878653344318e-05, + "loss": 0.5922, + "step": 2715 + }, + { + "epoch": 1.3426028921023359, + "grad_norm": 0.14561427067621677, + "learning_rate": 1.4977501777995835e-05, + "loss": 0.6647, + "step": 2716 + }, + { + "epoch": 1.343097268570016, + "grad_norm": 0.14541040620096393, + "learning_rate": 1.49741241479386e-05, + "loss": 0.6315, + "step": 2717 + }, + { + "epoch": 1.3435916450376961, + "grad_norm": 0.13313735650993042, + "learning_rate": 1.4970745763684748e-05, + "loss": 0.6218, + "step": 2718 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 0.14362216016796983, + "learning_rate": 1.496736662574652e-05, + "loss": 0.6463, + "step": 2719 + }, + { + "epoch": 1.3445803979730564, + "grad_norm": 0.1379012971634843, + "learning_rate": 1.4963986734636277e-05, + "loss": 0.6429, + "step": 2720 + }, + { + "epoch": 1.3450747744407368, + "grad_norm": 0.13568270808830568, + "learning_rate": 1.4960606090866488e-05, + "loss": 0.6511, + "step": 2721 + }, + { + "epoch": 1.3455691509084167, + "grad_norm": 0.15202229460533465, + "learning_rate": 1.4957224694949744e-05, + "loss": 0.6175, + "step": 2722 + }, + { + "epoch": 1.346063527376097, + "grad_norm": 0.13906031149659906, + "learning_rate": 1.4953842547398743e-05, + "loss": 0.6433, + "step": 2723 + }, + { + "epoch": 1.346557903843777, + "grad_norm": 0.14590638900244643, + "learning_rate": 1.4950459648726298e-05, + "loss": 0.6003, + "step": 2724 + }, + { + "epoch": 1.3470522803114573, + "grad_norm": 0.14250630068459824, + "learning_rate": 1.4947075999445341e-05, + "loss": 0.6692, + "step": 2725 + }, + { + "epoch": 1.3475466567791372, + "grad_norm": 0.15947756528775092, + "learning_rate": 1.4943691600068912e-05, + "loss": 0.5857, + "step": 2726 + }, + { + "epoch": 1.3480410332468176, + "grad_norm": 0.13897308742651254, + "learning_rate": 1.494030645111017e-05, + "loss": 0.6211, + "step": 2727 + }, + { + "epoch": 1.3485354097144975, + "grad_norm": 0.1413256486626413, + "learning_rate": 1.4936920553082383e-05, + "loss": 0.6438, + "step": 2728 + }, + { + "epoch": 1.3490297861821778, + "grad_norm": 0.14653191857970735, + "learning_rate": 1.4933533906498937e-05, + "loss": 0.6364, + "step": 2729 + }, + { + "epoch": 1.3495241626498578, + "grad_norm": 0.1387994480332369, + "learning_rate": 1.4930146511873322e-05, + "loss": 0.5966, + "step": 2730 + }, + { + "epoch": 1.350018539117538, + "grad_norm": 0.13390035181858378, + "learning_rate": 1.4926758369719157e-05, + "loss": 0.6486, + "step": 2731 + }, + { + "epoch": 1.350512915585218, + "grad_norm": 0.14027760751605398, + "learning_rate": 1.492336948055016e-05, + "loss": 0.6003, + "step": 2732 + }, + { + "epoch": 1.3510072920528984, + "grad_norm": 0.14411749026883106, + "learning_rate": 1.4919979844880171e-05, + "loss": 0.6103, + "step": 2733 + }, + { + "epoch": 1.3515016685205783, + "grad_norm": 0.14233170400227485, + "learning_rate": 1.4916589463223137e-05, + "loss": 0.6568, + "step": 2734 + }, + { + "epoch": 1.3519960449882586, + "grad_norm": 0.15227758977029734, + "learning_rate": 1.4913198336093125e-05, + "loss": 0.5934, + "step": 2735 + }, + { + "epoch": 1.3524904214559386, + "grad_norm": 0.13922717734092444, + "learning_rate": 1.4909806464004303e-05, + "loss": 0.6167, + "step": 2736 + }, + { + "epoch": 1.352984797923619, + "grad_norm": 0.14138675613039287, + "learning_rate": 1.4906413847470972e-05, + "loss": 0.6248, + "step": 2737 + }, + { + "epoch": 1.353479174391299, + "grad_norm": 0.1535537969512427, + "learning_rate": 1.4903020487007532e-05, + "loss": 0.6348, + "step": 2738 + }, + { + "epoch": 1.3539735508589792, + "grad_norm": 0.15386537794716112, + "learning_rate": 1.4899626383128487e-05, + "loss": 0.6297, + "step": 2739 + }, + { + "epoch": 1.3544679273266593, + "grad_norm": 0.14197996932945406, + "learning_rate": 1.4896231536348475e-05, + "loss": 0.6579, + "step": 2740 + }, + { + "epoch": 1.3549623037943395, + "grad_norm": 0.13927311589909291, + "learning_rate": 1.4892835947182233e-05, + "loss": 0.6573, + "step": 2741 + }, + { + "epoch": 1.3554566802620196, + "grad_norm": 0.2553623016717157, + "learning_rate": 1.4889439616144617e-05, + "loss": 0.6165, + "step": 2742 + }, + { + "epoch": 1.3559510567296997, + "grad_norm": 0.14113365366044653, + "learning_rate": 1.4886042543750586e-05, + "loss": 0.602, + "step": 2743 + }, + { + "epoch": 1.3564454331973799, + "grad_norm": 0.13884840147496594, + "learning_rate": 1.4882644730515223e-05, + "loss": 0.6793, + "step": 2744 + }, + { + "epoch": 1.35693980966506, + "grad_norm": 0.15029956781159254, + "learning_rate": 1.4879246176953715e-05, + "loss": 0.6123, + "step": 2745 + }, + { + "epoch": 1.3574341861327401, + "grad_norm": 0.15194077261592434, + "learning_rate": 1.4875846883581367e-05, + "loss": 0.6172, + "step": 2746 + }, + { + "epoch": 1.3579285626004203, + "grad_norm": 0.13891816185819647, + "learning_rate": 1.487244685091359e-05, + "loss": 0.5962, + "step": 2747 + }, + { + "epoch": 1.3584229390681004, + "grad_norm": 0.1385855732480872, + "learning_rate": 1.4869046079465914e-05, + "loss": 0.6234, + "step": 2748 + }, + { + "epoch": 1.3589173155357805, + "grad_norm": 0.13852464603164524, + "learning_rate": 1.4865644569753977e-05, + "loss": 0.6595, + "step": 2749 + }, + { + "epoch": 1.3594116920034607, + "grad_norm": 0.15226323878274442, + "learning_rate": 1.4862242322293525e-05, + "loss": 0.6132, + "step": 2750 + }, + { + "epoch": 1.3599060684711408, + "grad_norm": 0.14370620639611492, + "learning_rate": 1.485883933760043e-05, + "loss": 0.5612, + "step": 2751 + }, + { + "epoch": 1.360400444938821, + "grad_norm": 0.1388628728491942, + "learning_rate": 1.4855435616190654e-05, + "loss": 0.6114, + "step": 2752 + }, + { + "epoch": 1.360894821406501, + "grad_norm": 0.14600273073514553, + "learning_rate": 1.4852031158580293e-05, + "loss": 0.6573, + "step": 2753 + }, + { + "epoch": 1.3613891978741812, + "grad_norm": 0.14089430495970823, + "learning_rate": 1.4848625965285542e-05, + "loss": 0.6002, + "step": 2754 + }, + { + "epoch": 1.3618835743418614, + "grad_norm": 0.14154608822415837, + "learning_rate": 1.4845220036822705e-05, + "loss": 0.622, + "step": 2755 + }, + { + "epoch": 1.3623779508095415, + "grad_norm": 0.14843876949283155, + "learning_rate": 1.4841813373708207e-05, + "loss": 0.6281, + "step": 2756 + }, + { + "epoch": 1.3628723272772216, + "grad_norm": 0.13391092986284264, + "learning_rate": 1.4838405976458581e-05, + "loss": 0.5957, + "step": 2757 + }, + { + "epoch": 1.3633667037449018, + "grad_norm": 0.14257490939737102, + "learning_rate": 1.4834997845590467e-05, + "loss": 0.6513, + "step": 2758 + }, + { + "epoch": 1.363861080212582, + "grad_norm": 0.18458923336333943, + "learning_rate": 1.4831588981620619e-05, + "loss": 0.5864, + "step": 2759 + }, + { + "epoch": 1.364355456680262, + "grad_norm": 0.14032735882851022, + "learning_rate": 1.4828179385065907e-05, + "loss": 0.6518, + "step": 2760 + }, + { + "epoch": 1.3648498331479422, + "grad_norm": 0.13984310461557364, + "learning_rate": 1.4824769056443305e-05, + "loss": 0.623, + "step": 2761 + }, + { + "epoch": 1.3653442096156223, + "grad_norm": 0.138854076519694, + "learning_rate": 1.48213579962699e-05, + "loss": 0.6117, + "step": 2762 + }, + { + "epoch": 1.3658385860833024, + "grad_norm": 0.1436427492879292, + "learning_rate": 1.4817946205062887e-05, + "loss": 0.6559, + "step": 2763 + }, + { + "epoch": 1.3663329625509826, + "grad_norm": 0.14553417239847058, + "learning_rate": 1.4814533683339587e-05, + "loss": 0.61, + "step": 2764 + }, + { + "epoch": 1.3668273390186627, + "grad_norm": 0.13623802117556005, + "learning_rate": 1.481112043161741e-05, + "loss": 0.6011, + "step": 2765 + }, + { + "epoch": 1.3673217154863428, + "grad_norm": 0.15086132287561813, + "learning_rate": 1.4807706450413885e-05, + "loss": 0.6156, + "step": 2766 + }, + { + "epoch": 1.367816091954023, + "grad_norm": 0.1430795521901572, + "learning_rate": 1.4804291740246665e-05, + "loss": 0.607, + "step": 2767 + }, + { + "epoch": 1.3683104684217031, + "grad_norm": 0.13148133387331845, + "learning_rate": 1.4800876301633493e-05, + "loss": 0.6811, + "step": 2768 + }, + { + "epoch": 1.3688048448893833, + "grad_norm": 0.14399917890617384, + "learning_rate": 1.4797460135092232e-05, + "loss": 0.6538, + "step": 2769 + }, + { + "epoch": 1.3692992213570634, + "grad_norm": 0.15487057297531323, + "learning_rate": 1.4794043241140861e-05, + "loss": 0.6328, + "step": 2770 + }, + { + "epoch": 1.3697935978247435, + "grad_norm": 0.1379148167243696, + "learning_rate": 1.4790625620297454e-05, + "loss": 0.6525, + "step": 2771 + }, + { + "epoch": 1.3702879742924237, + "grad_norm": 0.1434052008713569, + "learning_rate": 1.4787207273080212e-05, + "loss": 0.6419, + "step": 2772 + }, + { + "epoch": 1.3707823507601038, + "grad_norm": 0.15042998863074322, + "learning_rate": 1.4783788200007436e-05, + "loss": 0.634, + "step": 2773 + }, + { + "epoch": 1.371276727227784, + "grad_norm": 0.1370105058296191, + "learning_rate": 1.4780368401597539e-05, + "loss": 0.6552, + "step": 2774 + }, + { + "epoch": 1.371771103695464, + "grad_norm": 0.1573050301024773, + "learning_rate": 1.4776947878369044e-05, + "loss": 0.6262, + "step": 2775 + }, + { + "epoch": 1.3722654801631442, + "grad_norm": 0.20223434022975448, + "learning_rate": 1.4773526630840587e-05, + "loss": 0.62, + "step": 2776 + }, + { + "epoch": 1.3727598566308243, + "grad_norm": 0.13672695129490645, + "learning_rate": 1.4770104659530905e-05, + "loss": 0.6248, + "step": 2777 + }, + { + "epoch": 1.3732542330985045, + "grad_norm": 0.14643788995252696, + "learning_rate": 1.476668196495886e-05, + "loss": 0.6336, + "step": 2778 + }, + { + "epoch": 1.3737486095661846, + "grad_norm": 0.15136836218009744, + "learning_rate": 1.4763258547643409e-05, + "loss": 0.6155, + "step": 2779 + }, + { + "epoch": 1.3742429860338647, + "grad_norm": 0.14244646421601218, + "learning_rate": 1.4759834408103623e-05, + "loss": 0.6875, + "step": 2780 + }, + { + "epoch": 1.3747373625015449, + "grad_norm": 0.1468928405829278, + "learning_rate": 1.475640954685869e-05, + "loss": 0.6277, + "step": 2781 + }, + { + "epoch": 1.375231738969225, + "grad_norm": 0.1486141794180901, + "learning_rate": 1.4752983964427891e-05, + "loss": 0.6389, + "step": 2782 + }, + { + "epoch": 1.3757261154369052, + "grad_norm": 0.14366247857776476, + "learning_rate": 1.4749557661330637e-05, + "loss": 0.6096, + "step": 2783 + }, + { + "epoch": 1.3762204919045853, + "grad_norm": 0.14159744770061852, + "learning_rate": 1.4746130638086436e-05, + "loss": 0.6144, + "step": 2784 + }, + { + "epoch": 1.3767148683722654, + "grad_norm": 0.1349754122395386, + "learning_rate": 1.47427028952149e-05, + "loss": 0.5818, + "step": 2785 + }, + { + "epoch": 1.3772092448399456, + "grad_norm": 0.13537672247371912, + "learning_rate": 1.4739274433235764e-05, + "loss": 0.6614, + "step": 2786 + }, + { + "epoch": 1.3777036213076257, + "grad_norm": 0.1415343589252597, + "learning_rate": 1.4735845252668863e-05, + "loss": 0.6226, + "step": 2787 + }, + { + "epoch": 1.3781979977753058, + "grad_norm": 0.14347201239249116, + "learning_rate": 1.473241535403414e-05, + "loss": 0.6127, + "step": 2788 + }, + { + "epoch": 1.378692374242986, + "grad_norm": 0.13794260421560647, + "learning_rate": 1.4728984737851658e-05, + "loss": 0.6252, + "step": 2789 + }, + { + "epoch": 1.379186750710666, + "grad_norm": 0.1491755241670813, + "learning_rate": 1.472555340464157e-05, + "loss": 0.6506, + "step": 2790 + }, + { + "epoch": 1.3796811271783462, + "grad_norm": 0.14118006625571386, + "learning_rate": 1.4722121354924157e-05, + "loss": 0.6104, + "step": 2791 + }, + { + "epoch": 1.3801755036460264, + "grad_norm": 0.14455251545566733, + "learning_rate": 1.4718688589219797e-05, + "loss": 0.6069, + "step": 2792 + }, + { + "epoch": 1.3806698801137065, + "grad_norm": 0.1324383314833896, + "learning_rate": 1.4715255108048978e-05, + "loss": 0.6216, + "step": 2793 + }, + { + "epoch": 1.3811642565813869, + "grad_norm": 0.14247969341104472, + "learning_rate": 1.4711820911932302e-05, + "loss": 0.6207, + "step": 2794 + }, + { + "epoch": 1.3816586330490668, + "grad_norm": 0.14476757847217278, + "learning_rate": 1.4708386001390475e-05, + "loss": 0.6087, + "step": 2795 + }, + { + "epoch": 1.3821530095167471, + "grad_norm": 0.14672513734096532, + "learning_rate": 1.4704950376944304e-05, + "loss": 0.6056, + "step": 2796 + }, + { + "epoch": 1.382647385984427, + "grad_norm": 0.13354354544279629, + "learning_rate": 1.4701514039114728e-05, + "loss": 0.6175, + "step": 2797 + }, + { + "epoch": 1.3831417624521074, + "grad_norm": 0.13911187439773978, + "learning_rate": 1.4698076988422765e-05, + "loss": 0.6061, + "step": 2798 + }, + { + "epoch": 1.3836361389197873, + "grad_norm": 0.15032117138738701, + "learning_rate": 1.4694639225389553e-05, + "loss": 0.6255, + "step": 2799 + }, + { + "epoch": 1.3841305153874677, + "grad_norm": 0.14279307883467277, + "learning_rate": 1.4691200750536351e-05, + "loss": 0.6267, + "step": 2800 + }, + { + "epoch": 1.3846248918551476, + "grad_norm": 0.15010854063418552, + "learning_rate": 1.4687761564384506e-05, + "loss": 0.6348, + "step": 2801 + }, + { + "epoch": 1.385119268322828, + "grad_norm": 0.144818514006616, + "learning_rate": 1.4684321667455483e-05, + "loss": 0.6239, + "step": 2802 + }, + { + "epoch": 1.3856136447905079, + "grad_norm": 0.15018784381117667, + "learning_rate": 1.4680881060270855e-05, + "loss": 0.6137, + "step": 2803 + }, + { + "epoch": 1.3861080212581882, + "grad_norm": 0.14932598523188267, + "learning_rate": 1.4677439743352296e-05, + "loss": 0.6486, + "step": 2804 + }, + { + "epoch": 1.3866023977258681, + "grad_norm": 0.13473810992998436, + "learning_rate": 1.4673997717221595e-05, + "loss": 0.5919, + "step": 2805 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.13897412835780867, + "learning_rate": 1.467055498240065e-05, + "loss": 0.6132, + "step": 2806 + }, + { + "epoch": 1.3875911506612284, + "grad_norm": 0.14327543297412268, + "learning_rate": 1.4667111539411454e-05, + "loss": 0.6287, + "step": 2807 + }, + { + "epoch": 1.3880855271289088, + "grad_norm": 0.1387038177122625, + "learning_rate": 1.4663667388776122e-05, + "loss": 0.6731, + "step": 2808 + }, + { + "epoch": 1.3885799035965887, + "grad_norm": 0.14888134448692286, + "learning_rate": 1.4660222531016865e-05, + "loss": 0.5927, + "step": 2809 + }, + { + "epoch": 1.389074280064269, + "grad_norm": 0.14036525690658908, + "learning_rate": 1.465677696665601e-05, + "loss": 0.5993, + "step": 2810 + }, + { + "epoch": 1.389568656531949, + "grad_norm": 0.13633848301471344, + "learning_rate": 1.4653330696215986e-05, + "loss": 0.6638, + "step": 2811 + }, + { + "epoch": 1.3900630329996293, + "grad_norm": 0.14455537196588136, + "learning_rate": 1.4649883720219329e-05, + "loss": 0.618, + "step": 2812 + }, + { + "epoch": 1.3905574094673094, + "grad_norm": 0.1376698380090889, + "learning_rate": 1.464643603918869e-05, + "loss": 0.6347, + "step": 2813 + }, + { + "epoch": 1.3910517859349896, + "grad_norm": 0.1370720518822891, + "learning_rate": 1.4642987653646809e-05, + "loss": 0.6411, + "step": 2814 + }, + { + "epoch": 1.3915461624026697, + "grad_norm": 0.137195938255852, + "learning_rate": 1.4639538564116552e-05, + "loss": 0.5988, + "step": 2815 + }, + { + "epoch": 1.3920405388703498, + "grad_norm": 0.13891939080111085, + "learning_rate": 1.4636088771120881e-05, + "loss": 0.6379, + "step": 2816 + }, + { + "epoch": 1.39253491533803, + "grad_norm": 0.13556945449765764, + "learning_rate": 1.463263827518287e-05, + "loss": 0.643, + "step": 2817 + }, + { + "epoch": 1.3930292918057101, + "grad_norm": 0.15511919328764642, + "learning_rate": 1.462918707682569e-05, + "loss": 0.6361, + "step": 2818 + }, + { + "epoch": 1.3935236682733902, + "grad_norm": 0.1350015835696895, + "learning_rate": 1.4625735176572633e-05, + "loss": 0.6111, + "step": 2819 + }, + { + "epoch": 1.3940180447410704, + "grad_norm": 0.13873870020644138, + "learning_rate": 1.462228257494709e-05, + "loss": 0.6147, + "step": 2820 + }, + { + "epoch": 1.3945124212087505, + "grad_norm": 0.1431813805062969, + "learning_rate": 1.4618829272472553e-05, + "loss": 0.6258, + "step": 2821 + }, + { + "epoch": 1.3950067976764307, + "grad_norm": 0.15060964053383474, + "learning_rate": 1.461537526967263e-05, + "loss": 0.64, + "step": 2822 + }, + { + "epoch": 1.3955011741441108, + "grad_norm": 0.1357833440325525, + "learning_rate": 1.4611920567071028e-05, + "loss": 0.6, + "step": 2823 + }, + { + "epoch": 1.395995550611791, + "grad_norm": 0.13683109503734872, + "learning_rate": 1.4608465165191564e-05, + "loss": 0.62, + "step": 2824 + }, + { + "epoch": 1.396489927079471, + "grad_norm": 0.14763058903932777, + "learning_rate": 1.460500906455816e-05, + "loss": 0.5967, + "step": 2825 + }, + { + "epoch": 1.3969843035471512, + "grad_norm": 0.15543761285416685, + "learning_rate": 1.4601552265694843e-05, + "loss": 0.6439, + "step": 2826 + }, + { + "epoch": 1.3974786800148313, + "grad_norm": 0.13860036681776905, + "learning_rate": 1.4598094769125747e-05, + "loss": 0.6175, + "step": 2827 + }, + { + "epoch": 1.3979730564825115, + "grad_norm": 0.14491101660014524, + "learning_rate": 1.4594636575375115e-05, + "loss": 0.6376, + "step": 2828 + }, + { + "epoch": 1.3984674329501916, + "grad_norm": 0.1344537484105292, + "learning_rate": 1.4591177684967286e-05, + "loss": 0.6222, + "step": 2829 + }, + { + "epoch": 1.3989618094178717, + "grad_norm": 0.13320593982511078, + "learning_rate": 1.4587718098426713e-05, + "loss": 0.6202, + "step": 2830 + }, + { + "epoch": 1.3994561858855519, + "grad_norm": 0.1445730195170749, + "learning_rate": 1.4584257816277951e-05, + "loss": 0.5917, + "step": 2831 + }, + { + "epoch": 1.399950562353232, + "grad_norm": 0.15553620627553158, + "learning_rate": 1.4580796839045667e-05, + "loss": 0.6515, + "step": 2832 + }, + { + "epoch": 1.4004449388209121, + "grad_norm": 0.1352364486645284, + "learning_rate": 1.4577335167254627e-05, + "loss": 0.6479, + "step": 2833 + }, + { + "epoch": 1.4009393152885923, + "grad_norm": 0.14376857942800897, + "learning_rate": 1.4573872801429701e-05, + "loss": 0.6256, + "step": 2834 + }, + { + "epoch": 1.4014336917562724, + "grad_norm": 0.14670759084113283, + "learning_rate": 1.4570409742095865e-05, + "loss": 0.6006, + "step": 2835 + }, + { + "epoch": 1.4019280682239526, + "grad_norm": 0.15029336418247424, + "learning_rate": 1.4566945989778207e-05, + "loss": 0.6426, + "step": 2836 + }, + { + "epoch": 1.4024224446916327, + "grad_norm": 0.14417033505211585, + "learning_rate": 1.456348154500191e-05, + "loss": 0.5992, + "step": 2837 + }, + { + "epoch": 1.4029168211593128, + "grad_norm": 0.14119710436380087, + "learning_rate": 1.456001640829227e-05, + "loss": 0.6026, + "step": 2838 + }, + { + "epoch": 1.403411197626993, + "grad_norm": 0.1402822433061976, + "learning_rate": 1.4556550580174684e-05, + "loss": 0.6151, + "step": 2839 + }, + { + "epoch": 1.403905574094673, + "grad_norm": 0.1477961933193636, + "learning_rate": 1.4553084061174657e-05, + "loss": 0.6726, + "step": 2840 + }, + { + "epoch": 1.4043999505623532, + "grad_norm": 0.13960494971727605, + "learning_rate": 1.4549616851817791e-05, + "loss": 0.6335, + "step": 2841 + }, + { + "epoch": 1.4048943270300334, + "grad_norm": 0.13908460959436192, + "learning_rate": 1.4546148952629805e-05, + "loss": 0.624, + "step": 2842 + }, + { + "epoch": 1.4053887034977135, + "grad_norm": 0.1420697892674259, + "learning_rate": 1.454268036413651e-05, + "loss": 0.6118, + "step": 2843 + }, + { + "epoch": 1.4058830799653936, + "grad_norm": 0.14797733767691337, + "learning_rate": 1.4539211086863832e-05, + "loss": 0.6269, + "step": 2844 + }, + { + "epoch": 1.4063774564330738, + "grad_norm": 0.15090261142671363, + "learning_rate": 1.4535741121337789e-05, + "loss": 0.6374, + "step": 2845 + }, + { + "epoch": 1.406871832900754, + "grad_norm": 0.14142558121363688, + "learning_rate": 1.4532270468084524e-05, + "loss": 0.6245, + "step": 2846 + }, + { + "epoch": 1.407366209368434, + "grad_norm": 0.14565814694458953, + "learning_rate": 1.452879912763026e-05, + "loss": 0.6589, + "step": 2847 + }, + { + "epoch": 1.4078605858361142, + "grad_norm": 0.1570071962773354, + "learning_rate": 1.4525327100501337e-05, + "loss": 0.6409, + "step": 2848 + }, + { + "epoch": 1.4083549623037943, + "grad_norm": 0.15271558771013077, + "learning_rate": 1.4521854387224205e-05, + "loss": 0.6209, + "step": 2849 + }, + { + "epoch": 1.4088493387714744, + "grad_norm": 0.15362225002099375, + "learning_rate": 1.4518380988325405e-05, + "loss": 0.6553, + "step": 2850 + }, + { + "epoch": 1.4093437152391546, + "grad_norm": 0.149844359959524, + "learning_rate": 1.4514906904331582e-05, + "loss": 0.5989, + "step": 2851 + }, + { + "epoch": 1.4098380917068347, + "grad_norm": 0.13917980851282444, + "learning_rate": 1.4511432135769504e-05, + "loss": 0.636, + "step": 2852 + }, + { + "epoch": 1.4103324681745149, + "grad_norm": 0.15318434353400012, + "learning_rate": 1.4507956683166018e-05, + "loss": 0.6192, + "step": 2853 + }, + { + "epoch": 1.410826844642195, + "grad_norm": 0.14860454391094116, + "learning_rate": 1.4504480547048092e-05, + "loss": 0.5784, + "step": 2854 + }, + { + "epoch": 1.4113212211098751, + "grad_norm": 0.14572207816316385, + "learning_rate": 1.450100372794279e-05, + "loss": 0.6594, + "step": 2855 + }, + { + "epoch": 1.4118155975775553, + "grad_norm": 0.14622011694027773, + "learning_rate": 1.4497526226377281e-05, + "loss": 0.6435, + "step": 2856 + }, + { + "epoch": 1.4123099740452354, + "grad_norm": 0.14205218750456278, + "learning_rate": 1.4494048042878839e-05, + "loss": 0.6345, + "step": 2857 + }, + { + "epoch": 1.4128043505129155, + "grad_norm": 0.13915091727345222, + "learning_rate": 1.4490569177974836e-05, + "loss": 0.6352, + "step": 2858 + }, + { + "epoch": 1.4132987269805957, + "grad_norm": 0.15395674540415022, + "learning_rate": 1.4487089632192756e-05, + "loss": 0.6265, + "step": 2859 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 0.13384843201133045, + "learning_rate": 1.4483609406060181e-05, + "loss": 0.6265, + "step": 2860 + }, + { + "epoch": 1.414287479915956, + "grad_norm": 0.13558991023763392, + "learning_rate": 1.4480128500104795e-05, + "loss": 0.5827, + "step": 2861 + }, + { + "epoch": 1.414781856383636, + "grad_norm": 0.16002871995536785, + "learning_rate": 1.4476646914854388e-05, + "loss": 0.6492, + "step": 2862 + }, + { + "epoch": 1.4152762328513162, + "grad_norm": 0.13792379381710476, + "learning_rate": 1.447316465083685e-05, + "loss": 0.6023, + "step": 2863 + }, + { + "epoch": 1.4157706093189963, + "grad_norm": 0.15824578825432634, + "learning_rate": 1.4469681708580177e-05, + "loss": 0.6099, + "step": 2864 + }, + { + "epoch": 1.4162649857866765, + "grad_norm": 0.1421774311010707, + "learning_rate": 1.4466198088612469e-05, + "loss": 0.6487, + "step": 2865 + }, + { + "epoch": 1.4167593622543566, + "grad_norm": 0.14547007208332008, + "learning_rate": 1.4462713791461926e-05, + "loss": 0.6923, + "step": 2866 + }, + { + "epoch": 1.4172537387220367, + "grad_norm": 0.1526554888720157, + "learning_rate": 1.4459228817656847e-05, + "loss": 0.6061, + "step": 2867 + }, + { + "epoch": 1.4177481151897169, + "grad_norm": 0.13079124473949683, + "learning_rate": 1.445574316772564e-05, + "loss": 0.6316, + "step": 2868 + }, + { + "epoch": 1.4182424916573972, + "grad_norm": 0.14088909496406044, + "learning_rate": 1.4452256842196816e-05, + "loss": 0.6149, + "step": 2869 + }, + { + "epoch": 1.4187368681250772, + "grad_norm": 0.15567181877240455, + "learning_rate": 1.4448769841598982e-05, + "loss": 0.6343, + "step": 2870 + }, + { + "epoch": 1.4192312445927575, + "grad_norm": 0.1356204535138772, + "learning_rate": 1.4445282166460852e-05, + "loss": 0.6052, + "step": 2871 + }, + { + "epoch": 1.4197256210604374, + "grad_norm": 0.14239556307913617, + "learning_rate": 1.444179381731124e-05, + "loss": 0.6138, + "step": 2872 + }, + { + "epoch": 1.4202199975281178, + "grad_norm": 0.15219028297096646, + "learning_rate": 1.4438304794679069e-05, + "loss": 0.619, + "step": 2873 + }, + { + "epoch": 1.4207143739957977, + "grad_norm": 0.136115268947613, + "learning_rate": 1.4434815099093352e-05, + "loss": 0.5756, + "step": 2874 + }, + { + "epoch": 1.421208750463478, + "grad_norm": 0.14149686478403983, + "learning_rate": 1.443132473108321e-05, + "loss": 0.659, + "step": 2875 + }, + { + "epoch": 1.421703126931158, + "grad_norm": 0.13793971376354078, + "learning_rate": 1.4427833691177873e-05, + "loss": 0.6246, + "step": 2876 + }, + { + "epoch": 1.4221975033988383, + "grad_norm": 0.14088463603579018, + "learning_rate": 1.4424341979906662e-05, + "loss": 0.5773, + "step": 2877 + }, + { + "epoch": 1.4226918798665182, + "grad_norm": 0.13493373062564257, + "learning_rate": 1.4420849597799005e-05, + "loss": 0.6153, + "step": 2878 + }, + { + "epoch": 1.4231862563341986, + "grad_norm": 0.1318996542893176, + "learning_rate": 1.4417356545384433e-05, + "loss": 0.6369, + "step": 2879 + }, + { + "epoch": 1.4236806328018785, + "grad_norm": 0.1486966067764712, + "learning_rate": 1.4413862823192575e-05, + "loss": 0.5913, + "step": 2880 + }, + { + "epoch": 1.4241750092695589, + "grad_norm": 0.1493430935950028, + "learning_rate": 1.4410368431753158e-05, + "loss": 0.6254, + "step": 2881 + }, + { + "epoch": 1.4246693857372388, + "grad_norm": 0.14069970118918707, + "learning_rate": 1.4406873371596026e-05, + "loss": 0.6097, + "step": 2882 + }, + { + "epoch": 1.4251637622049191, + "grad_norm": 0.13961422481532518, + "learning_rate": 1.4403377643251107e-05, + "loss": 0.6123, + "step": 2883 + }, + { + "epoch": 1.425658138672599, + "grad_norm": 0.13727484295861195, + "learning_rate": 1.4399881247248437e-05, + "loss": 0.6478, + "step": 2884 + }, + { + "epoch": 1.4261525151402794, + "grad_norm": 0.14578237085811, + "learning_rate": 1.4396384184118159e-05, + "loss": 0.6241, + "step": 2885 + }, + { + "epoch": 1.4266468916079595, + "grad_norm": 0.13930946327814325, + "learning_rate": 1.4392886454390507e-05, + "loss": 0.6345, + "step": 2886 + }, + { + "epoch": 1.4271412680756397, + "grad_norm": 0.1396047734562252, + "learning_rate": 1.4389388058595822e-05, + "loss": 0.6401, + "step": 2887 + }, + { + "epoch": 1.4276356445433198, + "grad_norm": 0.13842234544414278, + "learning_rate": 1.4385888997264543e-05, + "loss": 0.6088, + "step": 2888 + }, + { + "epoch": 1.428130021011, + "grad_norm": 0.13074891968687705, + "learning_rate": 1.4382389270927215e-05, + "loss": 0.5936, + "step": 2889 + }, + { + "epoch": 1.42862439747868, + "grad_norm": 0.13811225498834645, + "learning_rate": 1.437888888011448e-05, + "loss": 0.5966, + "step": 2890 + }, + { + "epoch": 1.4291187739463602, + "grad_norm": 0.14127848565952736, + "learning_rate": 1.4375387825357078e-05, + "loss": 0.6387, + "step": 2891 + }, + { + "epoch": 1.4296131504140404, + "grad_norm": 0.14240390594711996, + "learning_rate": 1.4371886107185857e-05, + "loss": 0.5862, + "step": 2892 + }, + { + "epoch": 1.4301075268817205, + "grad_norm": 0.1326402314294208, + "learning_rate": 1.436838372613176e-05, + "loss": 0.6286, + "step": 2893 + }, + { + "epoch": 1.4306019033494006, + "grad_norm": 0.13902765986349266, + "learning_rate": 1.436488068272583e-05, + "loss": 0.6116, + "step": 2894 + }, + { + "epoch": 1.4310962798170808, + "grad_norm": 0.13861788352533463, + "learning_rate": 1.4361376977499217e-05, + "loss": 0.6295, + "step": 2895 + }, + { + "epoch": 1.431590656284761, + "grad_norm": 0.13973127056154905, + "learning_rate": 1.4357872610983163e-05, + "loss": 0.6275, + "step": 2896 + }, + { + "epoch": 1.432085032752441, + "grad_norm": 0.1318134526045063, + "learning_rate": 1.4354367583709012e-05, + "loss": 0.6383, + "step": 2897 + }, + { + "epoch": 1.4325794092201212, + "grad_norm": 0.14130479253103134, + "learning_rate": 1.435086189620822e-05, + "loss": 0.6191, + "step": 2898 + }, + { + "epoch": 1.4330737856878013, + "grad_norm": 0.1318510917584635, + "learning_rate": 1.434735554901232e-05, + "loss": 0.6466, + "step": 2899 + }, + { + "epoch": 1.4335681621554814, + "grad_norm": 0.14033214580134618, + "learning_rate": 1.4343848542652967e-05, + "loss": 0.6105, + "step": 2900 + }, + { + "epoch": 1.4340625386231616, + "grad_norm": 0.13805635400691135, + "learning_rate": 1.4340340877661908e-05, + "loss": 0.6213, + "step": 2901 + }, + { + "epoch": 1.4345569150908417, + "grad_norm": 0.13296940426139442, + "learning_rate": 1.4336832554570987e-05, + "loss": 0.6158, + "step": 2902 + }, + { + "epoch": 1.4350512915585218, + "grad_norm": 0.14152539299935163, + "learning_rate": 1.4333323573912146e-05, + "loss": 0.6058, + "step": 2903 + }, + { + "epoch": 1.435545668026202, + "grad_norm": 0.1434356866809943, + "learning_rate": 1.4329813936217436e-05, + "loss": 0.601, + "step": 2904 + }, + { + "epoch": 1.4360400444938821, + "grad_norm": 0.1455810607505509, + "learning_rate": 1.4326303642019002e-05, + "loss": 0.6425, + "step": 2905 + }, + { + "epoch": 1.4365344209615623, + "grad_norm": 0.15009969350754943, + "learning_rate": 1.4322792691849087e-05, + "loss": 0.6188, + "step": 2906 + }, + { + "epoch": 1.4370287974292424, + "grad_norm": 0.14025633869020435, + "learning_rate": 1.4319281086240036e-05, + "loss": 0.5912, + "step": 2907 + }, + { + "epoch": 1.4375231738969225, + "grad_norm": 0.14064524170914047, + "learning_rate": 1.4315768825724292e-05, + "loss": 0.6251, + "step": 2908 + }, + { + "epoch": 1.4380175503646027, + "grad_norm": 0.1451097926800141, + "learning_rate": 1.43122559108344e-05, + "loss": 0.6597, + "step": 2909 + }, + { + "epoch": 1.4385119268322828, + "grad_norm": 0.1443534370235262, + "learning_rate": 1.4308742342103001e-05, + "loss": 0.6462, + "step": 2910 + }, + { + "epoch": 1.439006303299963, + "grad_norm": 0.1350986888834082, + "learning_rate": 1.4305228120062836e-05, + "loss": 0.6381, + "step": 2911 + }, + { + "epoch": 1.439500679767643, + "grad_norm": 0.14012274364611074, + "learning_rate": 1.4301713245246744e-05, + "loss": 0.6498, + "step": 2912 + }, + { + "epoch": 1.4399950562353232, + "grad_norm": 0.13926565557790402, + "learning_rate": 1.4298197718187665e-05, + "loss": 0.5938, + "step": 2913 + }, + { + "epoch": 1.4404894327030033, + "grad_norm": 0.13511403702802505, + "learning_rate": 1.4294681539418642e-05, + "loss": 0.6186, + "step": 2914 + }, + { + "epoch": 1.4409838091706835, + "grad_norm": 0.13927346205904162, + "learning_rate": 1.4291164709472809e-05, + "loss": 0.6546, + "step": 2915 + }, + { + "epoch": 1.4414781856383636, + "grad_norm": 0.14073485974488079, + "learning_rate": 1.4287647228883397e-05, + "loss": 0.5846, + "step": 2916 + }, + { + "epoch": 1.4419725621060437, + "grad_norm": 0.12876100086826572, + "learning_rate": 1.4284129098183745e-05, + "loss": 0.6228, + "step": 2917 + }, + { + "epoch": 1.4424669385737239, + "grad_norm": 0.1402543809002387, + "learning_rate": 1.428061031790729e-05, + "loss": 0.5909, + "step": 2918 + }, + { + "epoch": 1.442961315041404, + "grad_norm": 0.1399707822814097, + "learning_rate": 1.4277090888587557e-05, + "loss": 0.6466, + "step": 2919 + }, + { + "epoch": 1.4434556915090841, + "grad_norm": 0.14698833376218212, + "learning_rate": 1.4273570810758176e-05, + "loss": 0.5985, + "step": 2920 + }, + { + "epoch": 1.4439500679767643, + "grad_norm": 0.12982088607012618, + "learning_rate": 1.4270050084952882e-05, + "loss": 0.623, + "step": 2921 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.13557858262595723, + "learning_rate": 1.4266528711705496e-05, + "loss": 0.6281, + "step": 2922 + }, + { + "epoch": 1.4449388209121246, + "grad_norm": 0.1317147006671685, + "learning_rate": 1.4263006691549943e-05, + "loss": 0.6222, + "step": 2923 + }, + { + "epoch": 1.4454331973798047, + "grad_norm": 0.1334742360657188, + "learning_rate": 1.4259484025020248e-05, + "loss": 0.6421, + "step": 2924 + }, + { + "epoch": 1.4459275738474848, + "grad_norm": 0.1393516447454601, + "learning_rate": 1.4255960712650527e-05, + "loss": 0.6038, + "step": 2925 + }, + { + "epoch": 1.446421950315165, + "grad_norm": 0.13202432274831433, + "learning_rate": 1.4252436754975005e-05, + "loss": 0.6079, + "step": 2926 + }, + { + "epoch": 1.446916326782845, + "grad_norm": 0.13585613640161534, + "learning_rate": 1.4248912152527994e-05, + "loss": 0.6252, + "step": 2927 + }, + { + "epoch": 1.4474107032505252, + "grad_norm": 0.14016989232448074, + "learning_rate": 1.4245386905843914e-05, + "loss": 0.622, + "step": 2928 + }, + { + "epoch": 1.4479050797182054, + "grad_norm": 0.1292677561688165, + "learning_rate": 1.4241861015457272e-05, + "loss": 0.6227, + "step": 2929 + }, + { + "epoch": 1.4483994561858855, + "grad_norm": 0.14161010911308433, + "learning_rate": 1.4238334481902673e-05, + "loss": 0.6158, + "step": 2930 + }, + { + "epoch": 1.4488938326535656, + "grad_norm": 0.1329987282394572, + "learning_rate": 1.423480730571484e-05, + "loss": 0.5868, + "step": 2931 + }, + { + "epoch": 1.4493882091212458, + "grad_norm": 0.14522073707614785, + "learning_rate": 1.423127948742856e-05, + "loss": 0.6076, + "step": 2932 + }, + { + "epoch": 1.449882585588926, + "grad_norm": 0.1394732550215994, + "learning_rate": 1.4227751027578743e-05, + "loss": 0.6599, + "step": 2933 + }, + { + "epoch": 1.450376962056606, + "grad_norm": 0.14527256174453546, + "learning_rate": 1.4224221926700388e-05, + "loss": 0.6283, + "step": 2934 + }, + { + "epoch": 1.4508713385242862, + "grad_norm": 0.14340947805388696, + "learning_rate": 1.4220692185328592e-05, + "loss": 0.6111, + "step": 2935 + }, + { + "epoch": 1.4513657149919663, + "grad_norm": 0.13485781310786693, + "learning_rate": 1.4217161803998549e-05, + "loss": 0.5907, + "step": 2936 + }, + { + "epoch": 1.4518600914596465, + "grad_norm": 0.1358776718716294, + "learning_rate": 1.4213630783245547e-05, + "loss": 0.635, + "step": 2937 + }, + { + "epoch": 1.4523544679273266, + "grad_norm": 0.13647107055419783, + "learning_rate": 1.4210099123604974e-05, + "loss": 0.6102, + "step": 2938 + }, + { + "epoch": 1.4528488443950067, + "grad_norm": 0.13711051617231373, + "learning_rate": 1.4206566825612315e-05, + "loss": 0.6305, + "step": 2939 + }, + { + "epoch": 1.4533432208626869, + "grad_norm": 0.14634894639835253, + "learning_rate": 1.4203033889803152e-05, + "loss": 0.6326, + "step": 2940 + }, + { + "epoch": 1.453837597330367, + "grad_norm": 0.1333988527182693, + "learning_rate": 1.4199500316713161e-05, + "loss": 0.6206, + "step": 2941 + }, + { + "epoch": 1.4543319737980474, + "grad_norm": 0.13671958400044928, + "learning_rate": 1.4195966106878119e-05, + "loss": 0.6503, + "step": 2942 + }, + { + "epoch": 1.4548263502657273, + "grad_norm": 0.14330384014686584, + "learning_rate": 1.4192431260833894e-05, + "loss": 0.6332, + "step": 2943 + }, + { + "epoch": 1.4553207267334076, + "grad_norm": 0.14456344136075472, + "learning_rate": 1.4188895779116457e-05, + "loss": 0.6677, + "step": 2944 + }, + { + "epoch": 1.4558151032010875, + "grad_norm": 0.37895219491887555, + "learning_rate": 1.4185359662261869e-05, + "loss": 0.6337, + "step": 2945 + }, + { + "epoch": 1.456309479668768, + "grad_norm": 0.14170031661415725, + "learning_rate": 1.4181822910806289e-05, + "loss": 0.6474, + "step": 2946 + }, + { + "epoch": 1.4568038561364478, + "grad_norm": 0.1517312987215549, + "learning_rate": 1.4178285525285978e-05, + "loss": 0.6459, + "step": 2947 + }, + { + "epoch": 1.4572982326041282, + "grad_norm": 0.1397317123257146, + "learning_rate": 1.4174747506237285e-05, + "loss": 0.63, + "step": 2948 + }, + { + "epoch": 1.457792609071808, + "grad_norm": 0.16389150900813565, + "learning_rate": 1.4171208854196658e-05, + "loss": 0.6641, + "step": 2949 + }, + { + "epoch": 1.4582869855394884, + "grad_norm": 0.14052242000441537, + "learning_rate": 1.4167669569700645e-05, + "loss": 0.6116, + "step": 2950 + }, + { + "epoch": 1.4587813620071683, + "grad_norm": 0.1469259197470585, + "learning_rate": 1.416412965328589e-05, + "loss": 0.611, + "step": 2951 + }, + { + "epoch": 1.4592757384748487, + "grad_norm": 0.1430482099763021, + "learning_rate": 1.4160589105489117e-05, + "loss": 0.6239, + "step": 2952 + }, + { + "epoch": 1.4597701149425286, + "grad_norm": 0.1406560527592341, + "learning_rate": 1.415704792684717e-05, + "loss": 0.6361, + "step": 2953 + }, + { + "epoch": 1.460264491410209, + "grad_norm": 0.15886479004032616, + "learning_rate": 1.4153506117896968e-05, + "loss": 0.624, + "step": 2954 + }, + { + "epoch": 1.460758867877889, + "grad_norm": 0.14072492814497106, + "learning_rate": 1.4149963679175541e-05, + "loss": 0.6413, + "step": 2955 + }, + { + "epoch": 1.4612532443455692, + "grad_norm": 0.14045260106182927, + "learning_rate": 1.4146420611220002e-05, + "loss": 0.6663, + "step": 2956 + }, + { + "epoch": 1.4617476208132492, + "grad_norm": 0.15205059385578237, + "learning_rate": 1.4142876914567571e-05, + "loss": 0.5963, + "step": 2957 + }, + { + "epoch": 1.4622419972809295, + "grad_norm": 0.1451104970980061, + "learning_rate": 1.4139332589755552e-05, + "loss": 0.6241, + "step": 2958 + }, + { + "epoch": 1.4627363737486094, + "grad_norm": 0.18136477386725033, + "learning_rate": 1.4135787637321354e-05, + "loss": 0.6025, + "step": 2959 + }, + { + "epoch": 1.4632307502162898, + "grad_norm": 0.1478955601930153, + "learning_rate": 1.4132242057802472e-05, + "loss": 0.6111, + "step": 2960 + }, + { + "epoch": 1.46372512668397, + "grad_norm": 0.13964144786545526, + "learning_rate": 1.412869585173651e-05, + "loss": 0.6174, + "step": 2961 + }, + { + "epoch": 1.46421950315165, + "grad_norm": 0.14569550515875224, + "learning_rate": 1.4125149019661146e-05, + "loss": 0.6402, + "step": 2962 + }, + { + "epoch": 1.4647138796193302, + "grad_norm": 0.13694334630077565, + "learning_rate": 1.4121601562114174e-05, + "loss": 0.6276, + "step": 2963 + }, + { + "epoch": 1.4652082560870103, + "grad_norm": 0.149627341638833, + "learning_rate": 1.4118053479633471e-05, + "loss": 0.6206, + "step": 2964 + }, + { + "epoch": 1.4657026325546905, + "grad_norm": 0.17919835121661307, + "learning_rate": 1.4114504772757007e-05, + "loss": 0.6659, + "step": 2965 + }, + { + "epoch": 1.4661970090223706, + "grad_norm": 0.14088255515093084, + "learning_rate": 1.411095544202286e-05, + "loss": 0.6396, + "step": 2966 + }, + { + "epoch": 1.4666913854900507, + "grad_norm": 0.13824414887786968, + "learning_rate": 1.4107405487969188e-05, + "loss": 0.611, + "step": 2967 + }, + { + "epoch": 1.4671857619577309, + "grad_norm": 0.14441121852779393, + "learning_rate": 1.4103854911134247e-05, + "loss": 0.6826, + "step": 2968 + }, + { + "epoch": 1.467680138425411, + "grad_norm": 0.14512861963423263, + "learning_rate": 1.4100303712056395e-05, + "loss": 0.6342, + "step": 2969 + }, + { + "epoch": 1.4681745148930911, + "grad_norm": 0.14427413902555997, + "learning_rate": 1.4096751891274079e-05, + "loss": 0.6152, + "step": 2970 + }, + { + "epoch": 1.4686688913607713, + "grad_norm": 0.14457420057972245, + "learning_rate": 1.4093199449325837e-05, + "loss": 0.6123, + "step": 2971 + }, + { + "epoch": 1.4691632678284514, + "grad_norm": 0.14085693963479254, + "learning_rate": 1.4089646386750304e-05, + "loss": 0.6266, + "step": 2972 + }, + { + "epoch": 1.4696576442961315, + "grad_norm": 0.1477334353903533, + "learning_rate": 1.4086092704086212e-05, + "loss": 0.6548, + "step": 2973 + }, + { + "epoch": 1.4701520207638117, + "grad_norm": 0.1390421184670276, + "learning_rate": 1.4082538401872383e-05, + "loss": 0.5794, + "step": 2974 + }, + { + "epoch": 1.4706463972314918, + "grad_norm": 0.13163349159083493, + "learning_rate": 1.4078983480647738e-05, + "loss": 0.6164, + "step": 2975 + }, + { + "epoch": 1.471140773699172, + "grad_norm": 0.140027229867609, + "learning_rate": 1.4075427940951282e-05, + "loss": 0.6072, + "step": 2976 + }, + { + "epoch": 1.471635150166852, + "grad_norm": 0.15153917553950716, + "learning_rate": 1.4071871783322128e-05, + "loss": 0.6609, + "step": 2977 + }, + { + "epoch": 1.4721295266345322, + "grad_norm": 0.13901486382542894, + "learning_rate": 1.406831500829947e-05, + "loss": 0.6326, + "step": 2978 + }, + { + "epoch": 1.4726239031022124, + "grad_norm": 0.13973770357074244, + "learning_rate": 1.4064757616422597e-05, + "loss": 0.6061, + "step": 2979 + }, + { + "epoch": 1.4731182795698925, + "grad_norm": 0.14040966820124565, + "learning_rate": 1.4061199608230904e-05, + "loss": 0.6271, + "step": 2980 + }, + { + "epoch": 1.4736126560375726, + "grad_norm": 0.13540444001524782, + "learning_rate": 1.4057640984263865e-05, + "loss": 0.6338, + "step": 2981 + }, + { + "epoch": 1.4741070325052528, + "grad_norm": 0.14089523570261997, + "learning_rate": 1.405408174506105e-05, + "loss": 0.6332, + "step": 2982 + }, + { + "epoch": 1.474601408972933, + "grad_norm": 1.0686737446294654, + "learning_rate": 1.405052189116213e-05, + "loss": 0.6527, + "step": 2983 + }, + { + "epoch": 1.475095785440613, + "grad_norm": 0.1534438231463447, + "learning_rate": 1.4046961423106868e-05, + "loss": 0.6234, + "step": 2984 + }, + { + "epoch": 1.4755901619082932, + "grad_norm": 0.13831457055363294, + "learning_rate": 1.4043400341435105e-05, + "loss": 0.6045, + "step": 2985 + }, + { + "epoch": 1.4760845383759733, + "grad_norm": 0.1421328085993191, + "learning_rate": 1.4039838646686793e-05, + "loss": 0.6151, + "step": 2986 + }, + { + "epoch": 1.4765789148436534, + "grad_norm": 0.1883818194126707, + "learning_rate": 1.4036276339401976e-05, + "loss": 0.6264, + "step": 2987 + }, + { + "epoch": 1.4770732913113336, + "grad_norm": 0.13809509495548872, + "learning_rate": 1.4032713420120774e-05, + "loss": 0.6043, + "step": 2988 + }, + { + "epoch": 1.4775676677790137, + "grad_norm": 0.1433305376453007, + "learning_rate": 1.402914988938342e-05, + "loss": 0.6302, + "step": 2989 + }, + { + "epoch": 1.4780620442466939, + "grad_norm": 0.1428885632690781, + "learning_rate": 1.4025585747730226e-05, + "loss": 0.6505, + "step": 2990 + }, + { + "epoch": 1.478556420714374, + "grad_norm": 0.14708044171839268, + "learning_rate": 1.4022020995701602e-05, + "loss": 0.6249, + "step": 2991 + }, + { + "epoch": 1.4790507971820541, + "grad_norm": 0.14807404719063733, + "learning_rate": 1.4018455633838051e-05, + "loss": 0.6396, + "step": 2992 + }, + { + "epoch": 1.4795451736497343, + "grad_norm": 0.1451198102911349, + "learning_rate": 1.4014889662680169e-05, + "loss": 0.5889, + "step": 2993 + }, + { + "epoch": 1.4800395501174144, + "grad_norm": 0.24908619613030686, + "learning_rate": 1.4011323082768638e-05, + "loss": 0.6005, + "step": 2994 + }, + { + "epoch": 1.4805339265850945, + "grad_norm": 0.1506205377943093, + "learning_rate": 1.4007755894644239e-05, + "loss": 0.6776, + "step": 2995 + }, + { + "epoch": 1.4810283030527747, + "grad_norm": 1.7465720915527232, + "learning_rate": 1.4004188098847848e-05, + "loss": 0.6237, + "step": 2996 + }, + { + "epoch": 1.4815226795204548, + "grad_norm": 0.15561584732371053, + "learning_rate": 1.4000619695920428e-05, + "loss": 0.6166, + "step": 2997 + }, + { + "epoch": 1.482017055988135, + "grad_norm": 0.16579790737168382, + "learning_rate": 1.3997050686403027e-05, + "loss": 0.6061, + "step": 2998 + }, + { + "epoch": 1.482511432455815, + "grad_norm": 0.1498560134706213, + "learning_rate": 1.3993481070836797e-05, + "loss": 0.6026, + "step": 2999 + }, + { + "epoch": 1.4830058089234952, + "grad_norm": 0.1539890911334075, + "learning_rate": 1.398991084976298e-05, + "loss": 0.6482, + "step": 3000 + }, + { + "epoch": 1.4835001853911753, + "grad_norm": 0.1476741808783552, + "learning_rate": 1.3986340023722902e-05, + "loss": 0.6005, + "step": 3001 + }, + { + "epoch": 1.4839945618588555, + "grad_norm": 0.1343554464370001, + "learning_rate": 1.3982768593257989e-05, + "loss": 0.6305, + "step": 3002 + }, + { + "epoch": 1.4844889383265356, + "grad_norm": 0.14773134929430015, + "learning_rate": 1.3979196558909758e-05, + "loss": 0.6276, + "step": 3003 + }, + { + "epoch": 1.4849833147942157, + "grad_norm": 0.15099510180450967, + "learning_rate": 1.397562392121981e-05, + "loss": 0.5741, + "step": 3004 + }, + { + "epoch": 1.4854776912618959, + "grad_norm": 0.14171056713157784, + "learning_rate": 1.3972050680729845e-05, + "loss": 0.6307, + "step": 3005 + }, + { + "epoch": 1.485972067729576, + "grad_norm": 0.15451405478969024, + "learning_rate": 1.3968476837981653e-05, + "loss": 0.6375, + "step": 3006 + }, + { + "epoch": 1.4864664441972562, + "grad_norm": 0.14591382560171356, + "learning_rate": 1.3964902393517112e-05, + "loss": 0.6542, + "step": 3007 + }, + { + "epoch": 1.4869608206649363, + "grad_norm": 0.15437539417624635, + "learning_rate": 1.3961327347878197e-05, + "loss": 0.6454, + "step": 3008 + }, + { + "epoch": 1.4874551971326164, + "grad_norm": 0.1435832795728058, + "learning_rate": 1.3957751701606965e-05, + "loss": 0.6313, + "step": 3009 + }, + { + "epoch": 1.4879495736002966, + "grad_norm": 0.15371751177191295, + "learning_rate": 1.3954175455245578e-05, + "loss": 0.5727, + "step": 3010 + }, + { + "epoch": 1.4884439500679767, + "grad_norm": 0.13493904918086394, + "learning_rate": 1.3950598609336274e-05, + "loss": 0.6105, + "step": 3011 + }, + { + "epoch": 1.4889383265356568, + "grad_norm": 0.137163273919017, + "learning_rate": 1.3947021164421388e-05, + "loss": 0.6612, + "step": 3012 + }, + { + "epoch": 1.489432703003337, + "grad_norm": 0.14491897826797936, + "learning_rate": 1.3943443121043356e-05, + "loss": 0.6241, + "step": 3013 + }, + { + "epoch": 1.489927079471017, + "grad_norm": 0.14230384142740554, + "learning_rate": 1.3939864479744687e-05, + "loss": 0.6591, + "step": 3014 + }, + { + "epoch": 1.4904214559386972, + "grad_norm": 0.1341229147948466, + "learning_rate": 1.3936285241067985e-05, + "loss": 0.6367, + "step": 3015 + }, + { + "epoch": 1.4909158324063774, + "grad_norm": 0.14316476748094908, + "learning_rate": 1.393270540555596e-05, + "loss": 0.5982, + "step": 3016 + }, + { + "epoch": 1.4914102088740577, + "grad_norm": 0.14449715825732778, + "learning_rate": 1.3929124973751393e-05, + "loss": 0.6338, + "step": 3017 + }, + { + "epoch": 1.4919045853417376, + "grad_norm": 0.14161078663789448, + "learning_rate": 1.3925543946197166e-05, + "loss": 0.5932, + "step": 3018 + }, + { + "epoch": 1.492398961809418, + "grad_norm": 0.1417428931074649, + "learning_rate": 1.3921962323436249e-05, + "loss": 0.6511, + "step": 3019 + }, + { + "epoch": 1.492893338277098, + "grad_norm": 0.1437242110645632, + "learning_rate": 1.39183801060117e-05, + "loss": 0.634, + "step": 3020 + }, + { + "epoch": 1.4933877147447783, + "grad_norm": 0.14319762824415483, + "learning_rate": 1.391479729446667e-05, + "loss": 0.601, + "step": 3021 + }, + { + "epoch": 1.4938820912124582, + "grad_norm": 0.14504977931592097, + "learning_rate": 1.3911213889344403e-05, + "loss": 0.5963, + "step": 3022 + }, + { + "epoch": 1.4943764676801385, + "grad_norm": 0.1413306953385926, + "learning_rate": 1.3907629891188224e-05, + "loss": 0.5885, + "step": 3023 + }, + { + "epoch": 1.4948708441478185, + "grad_norm": 0.1352271741122538, + "learning_rate": 1.3904045300541556e-05, + "loss": 0.6234, + "step": 3024 + }, + { + "epoch": 1.4953652206154988, + "grad_norm": 0.13738299745861665, + "learning_rate": 1.3900460117947905e-05, + "loss": 0.585, + "step": 3025 + }, + { + "epoch": 1.4958595970831787, + "grad_norm": 0.14853754637183203, + "learning_rate": 1.3896874343950877e-05, + "loss": 0.5923, + "step": 3026 + }, + { + "epoch": 1.496353973550859, + "grad_norm": 0.13708254477329837, + "learning_rate": 1.3893287979094156e-05, + "loss": 0.6069, + "step": 3027 + }, + { + "epoch": 1.496848350018539, + "grad_norm": 0.13450476380594106, + "learning_rate": 1.3889701023921523e-05, + "loss": 0.5904, + "step": 3028 + }, + { + "epoch": 1.4973427264862194, + "grad_norm": 0.1318562290995645, + "learning_rate": 1.3886113478976848e-05, + "loss": 0.6429, + "step": 3029 + }, + { + "epoch": 1.4978371029538993, + "grad_norm": 0.14131160357589476, + "learning_rate": 1.388252534480409e-05, + "loss": 0.6143, + "step": 3030 + }, + { + "epoch": 1.4983314794215796, + "grad_norm": 0.14269535241038583, + "learning_rate": 1.387893662194729e-05, + "loss": 0.61, + "step": 3031 + }, + { + "epoch": 1.4988258558892595, + "grad_norm": 0.14425291569998744, + "learning_rate": 1.3875347310950588e-05, + "loss": 0.6637, + "step": 3032 + }, + { + "epoch": 1.49932023235694, + "grad_norm": 0.13453712681573657, + "learning_rate": 1.3871757412358213e-05, + "loss": 0.6534, + "step": 3033 + }, + { + "epoch": 1.4998146088246198, + "grad_norm": 0.1398898771410891, + "learning_rate": 1.3868166926714474e-05, + "loss": 0.6116, + "step": 3034 + }, + { + "epoch": 1.5003089852923002, + "grad_norm": 0.13471703373561947, + "learning_rate": 1.3864575854563783e-05, + "loss": 0.6278, + "step": 3035 + }, + { + "epoch": 1.50080336175998, + "grad_norm": 0.13721534851319256, + "learning_rate": 1.3860984196450621e-05, + "loss": 0.6501, + "step": 3036 + }, + { + "epoch": 1.50080336175998, + "eval_loss": 0.6652488112449646, + "eval_runtime": 81.8493, + "eval_samples_per_second": 370.852, + "eval_steps_per_second": 46.366, + "step": 3036 + }, + { + "epoch": 1.5012977382276604, + "grad_norm": 0.8013883131282049, + "learning_rate": 1.3857391952919581e-05, + "loss": 0.6361, + "step": 3037 + }, + { + "epoch": 1.5017921146953404, + "grad_norm": 0.13785833041885487, + "learning_rate": 1.3853799124515326e-05, + "loss": 0.598, + "step": 3038 + }, + { + "epoch": 1.5022864911630207, + "grad_norm": 0.14791660944667356, + "learning_rate": 1.3850205711782618e-05, + "loss": 0.6838, + "step": 3039 + }, + { + "epoch": 1.5027808676307006, + "grad_norm": 0.1612877943276478, + "learning_rate": 1.3846611715266305e-05, + "loss": 0.6176, + "step": 3040 + }, + { + "epoch": 1.503275244098381, + "grad_norm": 0.14254324607450525, + "learning_rate": 1.384301713551132e-05, + "loss": 0.6784, + "step": 3041 + }, + { + "epoch": 1.503769620566061, + "grad_norm": 0.14499508658590204, + "learning_rate": 1.383942197306269e-05, + "loss": 0.6386, + "step": 3042 + }, + { + "epoch": 1.5042639970337413, + "grad_norm": 0.30275684370798034, + "learning_rate": 1.3835826228465531e-05, + "loss": 0.624, + "step": 3043 + }, + { + "epoch": 1.5047583735014214, + "grad_norm": 0.1364301567915292, + "learning_rate": 1.3832229902265039e-05, + "loss": 0.5689, + "step": 3044 + }, + { + "epoch": 1.5052527499691015, + "grad_norm": 0.14051284314304244, + "learning_rate": 1.3828632995006504e-05, + "loss": 0.6488, + "step": 3045 + }, + { + "epoch": 1.5057471264367817, + "grad_norm": 0.14460927480722705, + "learning_rate": 1.3825035507235307e-05, + "loss": 0.6284, + "step": 3046 + }, + { + "epoch": 1.5062415029044618, + "grad_norm": 0.14264893928445588, + "learning_rate": 1.382143743949691e-05, + "loss": 0.6233, + "step": 3047 + }, + { + "epoch": 1.506735879372142, + "grad_norm": 0.1453164319320926, + "learning_rate": 1.3817838792336866e-05, + "loss": 0.6238, + "step": 3048 + }, + { + "epoch": 1.507230255839822, + "grad_norm": 0.13280352151959962, + "learning_rate": 1.3814239566300822e-05, + "loss": 0.5981, + "step": 3049 + }, + { + "epoch": 1.5077246323075022, + "grad_norm": 0.14433520755470147, + "learning_rate": 1.38106397619345e-05, + "loss": 0.6319, + "step": 3050 + }, + { + "epoch": 1.5082190087751823, + "grad_norm": 0.1418555767019073, + "learning_rate": 1.380703937978372e-05, + "loss": 0.611, + "step": 3051 + }, + { + "epoch": 1.5087133852428625, + "grad_norm": 0.13137207785304836, + "learning_rate": 1.3803438420394386e-05, + "loss": 0.6181, + "step": 3052 + }, + { + "epoch": 1.5092077617105426, + "grad_norm": 0.14715879776512114, + "learning_rate": 1.3799836884312492e-05, + "loss": 0.6478, + "step": 3053 + }, + { + "epoch": 1.5097021381782227, + "grad_norm": 0.14365553077490012, + "learning_rate": 1.3796234772084114e-05, + "loss": 0.6362, + "step": 3054 + }, + { + "epoch": 1.5101965146459029, + "grad_norm": 0.14314326329012933, + "learning_rate": 1.3792632084255423e-05, + "loss": 0.6052, + "step": 3055 + }, + { + "epoch": 1.510690891113583, + "grad_norm": 0.14019075367143452, + "learning_rate": 1.3789028821372665e-05, + "loss": 0.5797, + "step": 3056 + }, + { + "epoch": 1.5111852675812631, + "grad_norm": 0.13557660724741738, + "learning_rate": 1.378542498398219e-05, + "loss": 0.6028, + "step": 3057 + }, + { + "epoch": 1.5116796440489433, + "grad_norm": 0.13695076081619978, + "learning_rate": 1.3781820572630417e-05, + "loss": 0.6625, + "step": 3058 + }, + { + "epoch": 1.5121740205166234, + "grad_norm": 0.14351778497468076, + "learning_rate": 1.3778215587863875e-05, + "loss": 0.6433, + "step": 3059 + }, + { + "epoch": 1.5126683969843036, + "grad_norm": 0.13874069858961313, + "learning_rate": 1.3774610030229152e-05, + "loss": 0.6005, + "step": 3060 + }, + { + "epoch": 1.5131627734519837, + "grad_norm": 0.14208790333470078, + "learning_rate": 1.3771003900272941e-05, + "loss": 0.6262, + "step": 3061 + }, + { + "epoch": 1.5136571499196638, + "grad_norm": 0.14237052545801285, + "learning_rate": 1.3767397198542027e-05, + "loss": 0.6606, + "step": 3062 + }, + { + "epoch": 1.514151526387344, + "grad_norm": 0.13427448922937438, + "learning_rate": 1.3763789925583263e-05, + "loss": 0.605, + "step": 3063 + }, + { + "epoch": 1.514645902855024, + "grad_norm": 0.1418560383099023, + "learning_rate": 1.3760182081943595e-05, + "loss": 0.655, + "step": 3064 + }, + { + "epoch": 1.5151402793227042, + "grad_norm": 0.14355689106966676, + "learning_rate": 1.375657366817007e-05, + "loss": 0.6458, + "step": 3065 + }, + { + "epoch": 1.5156346557903844, + "grad_norm": 0.1375336715684756, + "learning_rate": 1.3752964684809802e-05, + "loss": 0.6314, + "step": 3066 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 0.14592014837713776, + "learning_rate": 1.3749355132410002e-05, + "loss": 0.6507, + "step": 3067 + }, + { + "epoch": 1.5166234087257446, + "grad_norm": 0.14025710158229668, + "learning_rate": 1.3745745011517969e-05, + "loss": 0.5896, + "step": 3068 + }, + { + "epoch": 1.5171177851934248, + "grad_norm": 0.15773760235612255, + "learning_rate": 1.3742134322681074e-05, + "loss": 0.6246, + "step": 3069 + }, + { + "epoch": 1.517612161661105, + "grad_norm": 0.15265312285277557, + "learning_rate": 1.3738523066446794e-05, + "loss": 0.6383, + "step": 3070 + }, + { + "epoch": 1.518106538128785, + "grad_norm": 0.1413781744476011, + "learning_rate": 1.3734911243362674e-05, + "loss": 0.6201, + "step": 3071 + }, + { + "epoch": 1.5186009145964652, + "grad_norm": 0.1564694852407532, + "learning_rate": 1.373129885397636e-05, + "loss": 0.6126, + "step": 3072 + }, + { + "epoch": 1.5190952910641453, + "grad_norm": 0.13335819732872947, + "learning_rate": 1.3727685898835574e-05, + "loss": 0.6102, + "step": 3073 + }, + { + "epoch": 1.5195896675318254, + "grad_norm": 0.15771400112619988, + "learning_rate": 1.3724072378488124e-05, + "loss": 0.6252, + "step": 3074 + }, + { + "epoch": 1.5200840439995056, + "grad_norm": 0.14295960589455833, + "learning_rate": 1.372045829348191e-05, + "loss": 0.6688, + "step": 3075 + }, + { + "epoch": 1.5205784204671857, + "grad_norm": 0.13334427821170824, + "learning_rate": 1.3716843644364914e-05, + "loss": 0.6492, + "step": 3076 + }, + { + "epoch": 1.5210727969348659, + "grad_norm": 0.150228450268583, + "learning_rate": 1.3713228431685201e-05, + "loss": 0.6315, + "step": 3077 + }, + { + "epoch": 1.521567173402546, + "grad_norm": 0.14232428246468498, + "learning_rate": 1.3709612655990928e-05, + "loss": 0.6015, + "step": 3078 + }, + { + "epoch": 1.5220615498702261, + "grad_norm": 0.1331293041856876, + "learning_rate": 1.3705996317830333e-05, + "loss": 0.6553, + "step": 3079 + }, + { + "epoch": 1.5225559263379063, + "grad_norm": 0.15037869235156218, + "learning_rate": 1.3702379417751734e-05, + "loss": 0.6111, + "step": 3080 + }, + { + "epoch": 1.5230503028055864, + "grad_norm": 0.15223474994143452, + "learning_rate": 1.3698761956303543e-05, + "loss": 0.6249, + "step": 3081 + }, + { + "epoch": 1.5235446792732668, + "grad_norm": 0.1403111178880944, + "learning_rate": 1.369514393403426e-05, + "loss": 0.6141, + "step": 3082 + }, + { + "epoch": 1.5240390557409467, + "grad_norm": 0.14787313784196118, + "learning_rate": 1.3691525351492452e-05, + "loss": 0.6184, + "step": 3083 + }, + { + "epoch": 1.524533432208627, + "grad_norm": 0.21374640495119593, + "learning_rate": 1.3687906209226794e-05, + "loss": 0.6143, + "step": 3084 + }, + { + "epoch": 1.525027808676307, + "grad_norm": 0.15344442584975124, + "learning_rate": 1.3684286507786029e-05, + "loss": 0.6299, + "step": 3085 + }, + { + "epoch": 1.5255221851439873, + "grad_norm": 0.1405364146373388, + "learning_rate": 1.3680666247718991e-05, + "loss": 0.6196, + "step": 3086 + }, + { + "epoch": 1.5260165616116672, + "grad_norm": 0.14144425771636968, + "learning_rate": 1.3677045429574602e-05, + "loss": 0.62, + "step": 3087 + }, + { + "epoch": 1.5265109380793476, + "grad_norm": 0.16355437721978094, + "learning_rate": 1.3673424053901862e-05, + "loss": 0.6254, + "step": 3088 + }, + { + "epoch": 1.5270053145470275, + "grad_norm": 0.15782783118026678, + "learning_rate": 1.3669802121249857e-05, + "loss": 0.629, + "step": 3089 + }, + { + "epoch": 1.5274996910147078, + "grad_norm": 0.14764410473606998, + "learning_rate": 1.3666179632167764e-05, + "loss": 0.6223, + "step": 3090 + }, + { + "epoch": 1.5279940674823878, + "grad_norm": 0.1546738562389159, + "learning_rate": 1.3662556587204832e-05, + "loss": 0.6373, + "step": 3091 + }, + { + "epoch": 1.528488443950068, + "grad_norm": 0.139535171276612, + "learning_rate": 1.365893298691041e-05, + "loss": 0.6162, + "step": 3092 + }, + { + "epoch": 1.528982820417748, + "grad_norm": 0.14514446544172063, + "learning_rate": 1.3655308831833915e-05, + "loss": 0.6589, + "step": 3093 + }, + { + "epoch": 1.5294771968854284, + "grad_norm": 0.1557178235103779, + "learning_rate": 1.3651684122524857e-05, + "loss": 0.6089, + "step": 3094 + }, + { + "epoch": 1.5299715733531083, + "grad_norm": 0.13044021209749984, + "learning_rate": 1.3648058859532839e-05, + "loss": 0.6444, + "step": 3095 + }, + { + "epoch": 1.5304659498207887, + "grad_norm": 0.14348574593068117, + "learning_rate": 1.3644433043407526e-05, + "loss": 0.5938, + "step": 3096 + }, + { + "epoch": 1.5309603262884686, + "grad_norm": 0.14858762525495595, + "learning_rate": 1.3640806674698681e-05, + "loss": 0.6473, + "step": 3097 + }, + { + "epoch": 1.531454702756149, + "grad_norm": 0.16175198336075922, + "learning_rate": 1.3637179753956154e-05, + "loss": 0.612, + "step": 3098 + }, + { + "epoch": 1.5319490792238288, + "grad_norm": 0.13928810346413797, + "learning_rate": 1.3633552281729866e-05, + "loss": 0.6148, + "step": 3099 + }, + { + "epoch": 1.5324434556915092, + "grad_norm": 0.15774531360622396, + "learning_rate": 1.3629924258569835e-05, + "loss": 0.6157, + "step": 3100 + }, + { + "epoch": 1.532937832159189, + "grad_norm": 0.14109271351044586, + "learning_rate": 1.3626295685026154e-05, + "loss": 0.6157, + "step": 3101 + }, + { + "epoch": 1.5334322086268695, + "grad_norm": 0.14009129748410415, + "learning_rate": 1.3622666561649004e-05, + "loss": 0.63, + "step": 3102 + }, + { + "epoch": 1.5339265850945494, + "grad_norm": 0.15246984846574838, + "learning_rate": 1.3619036888988642e-05, + "loss": 0.5999, + "step": 3103 + }, + { + "epoch": 1.5344209615622297, + "grad_norm": 0.14120125060372465, + "learning_rate": 1.3615406667595417e-05, + "loss": 0.5824, + "step": 3104 + }, + { + "epoch": 1.5349153380299096, + "grad_norm": 0.1563486616015802, + "learning_rate": 1.3611775898019757e-05, + "loss": 0.6277, + "step": 3105 + }, + { + "epoch": 1.53540971449759, + "grad_norm": 0.14515756461623977, + "learning_rate": 1.3608144580812176e-05, + "loss": 0.5854, + "step": 3106 + }, + { + "epoch": 1.53590409096527, + "grad_norm": 0.13221752444015705, + "learning_rate": 1.3604512716523262e-05, + "loss": 0.639, + "step": 3107 + }, + { + "epoch": 1.5363984674329503, + "grad_norm": 0.14286882869375056, + "learning_rate": 1.3600880305703704e-05, + "loss": 0.6126, + "step": 3108 + }, + { + "epoch": 1.5368928439006302, + "grad_norm": 0.13644091260669677, + "learning_rate": 1.3597247348904253e-05, + "loss": 0.5943, + "step": 3109 + }, + { + "epoch": 1.5373872203683105, + "grad_norm": 0.13098493974057587, + "learning_rate": 1.3593613846675755e-05, + "loss": 0.6204, + "step": 3110 + }, + { + "epoch": 1.5378815968359905, + "grad_norm": 0.13503503795816327, + "learning_rate": 1.3589979799569137e-05, + "loss": 0.6135, + "step": 3111 + }, + { + "epoch": 1.5383759733036708, + "grad_norm": 0.13389281022833657, + "learning_rate": 1.3586345208135411e-05, + "loss": 0.6069, + "step": 3112 + }, + { + "epoch": 1.5388703497713507, + "grad_norm": 0.1403544625613614, + "learning_rate": 1.3582710072925664e-05, + "loss": 0.647, + "step": 3113 + }, + { + "epoch": 1.539364726239031, + "grad_norm": 0.14446129498645244, + "learning_rate": 1.357907439449107e-05, + "loss": 0.6527, + "step": 3114 + }, + { + "epoch": 1.539859102706711, + "grad_norm": 0.36106076137448406, + "learning_rate": 1.3575438173382888e-05, + "loss": 0.6062, + "step": 3115 + }, + { + "epoch": 1.5403534791743914, + "grad_norm": 0.13272995286331787, + "learning_rate": 1.3571801410152449e-05, + "loss": 0.596, + "step": 3116 + }, + { + "epoch": 1.5408478556420713, + "grad_norm": 0.1394873042539069, + "learning_rate": 1.3568164105351185e-05, + "loss": 0.6667, + "step": 3117 + }, + { + "epoch": 1.5413422321097516, + "grad_norm": 0.16176728663467385, + "learning_rate": 1.356452625953059e-05, + "loss": 0.6489, + "step": 3118 + }, + { + "epoch": 1.5418366085774318, + "grad_norm": 0.1411649205708121, + "learning_rate": 1.3560887873242253e-05, + "loss": 0.6524, + "step": 3119 + }, + { + "epoch": 1.542330985045112, + "grad_norm": 0.14902104212746492, + "learning_rate": 1.3557248947037837e-05, + "loss": 0.6555, + "step": 3120 + }, + { + "epoch": 1.542825361512792, + "grad_norm": 0.14781975165278738, + "learning_rate": 1.3553609481469094e-05, + "loss": 0.5742, + "step": 3121 + }, + { + "epoch": 1.5433197379804722, + "grad_norm": 0.13187763292844984, + "learning_rate": 1.3549969477087853e-05, + "loss": 0.6184, + "step": 3122 + }, + { + "epoch": 1.5438141144481523, + "grad_norm": 0.14362633998713126, + "learning_rate": 1.3546328934446027e-05, + "loss": 0.6194, + "step": 3123 + }, + { + "epoch": 1.5443084909158324, + "grad_norm": 0.16400456057921461, + "learning_rate": 1.3542687854095604e-05, + "loss": 0.6087, + "step": 3124 + }, + { + "epoch": 1.5448028673835126, + "grad_norm": 0.128282278921887, + "learning_rate": 1.3539046236588672e-05, + "loss": 0.6307, + "step": 3125 + }, + { + "epoch": 1.5452972438511927, + "grad_norm": 0.15754254980251528, + "learning_rate": 1.3535404082477375e-05, + "loss": 0.6183, + "step": 3126 + }, + { + "epoch": 1.5457916203188728, + "grad_norm": 0.14898169534294542, + "learning_rate": 1.3531761392313953e-05, + "loss": 0.6721, + "step": 3127 + }, + { + "epoch": 1.546285996786553, + "grad_norm": 0.14497968915023804, + "learning_rate": 1.3528118166650732e-05, + "loss": 0.5814, + "step": 3128 + }, + { + "epoch": 1.5467803732542331, + "grad_norm": 0.13663657481381458, + "learning_rate": 1.3524474406040105e-05, + "loss": 0.6162, + "step": 3129 + }, + { + "epoch": 1.5472747497219133, + "grad_norm": 0.14239409495974992, + "learning_rate": 1.352083011103456e-05, + "loss": 0.6312, + "step": 3130 + }, + { + "epoch": 1.5477691261895934, + "grad_norm": 0.1541262071694138, + "learning_rate": 1.3517185282186659e-05, + "loss": 0.6606, + "step": 3131 + }, + { + "epoch": 1.5482635026572735, + "grad_norm": 0.1469374300980286, + "learning_rate": 1.3513539920049034e-05, + "loss": 0.6584, + "step": 3132 + }, + { + "epoch": 1.5487578791249537, + "grad_norm": 0.14217412609510513, + "learning_rate": 1.3509894025174423e-05, + "loss": 0.6029, + "step": 3133 + }, + { + "epoch": 1.5492522555926338, + "grad_norm": 0.14791891984861727, + "learning_rate": 1.3506247598115629e-05, + "loss": 0.5994, + "step": 3134 + }, + { + "epoch": 1.549746632060314, + "grad_norm": 0.14127630215876447, + "learning_rate": 1.3502600639425535e-05, + "loss": 0.6666, + "step": 3135 + }, + { + "epoch": 1.550241008527994, + "grad_norm": 0.14864532881970954, + "learning_rate": 1.3498953149657105e-05, + "loss": 0.6082, + "step": 3136 + }, + { + "epoch": 1.5507353849956742, + "grad_norm": 0.14124239388539161, + "learning_rate": 1.349530512936339e-05, + "loss": 0.6276, + "step": 3137 + }, + { + "epoch": 1.5512297614633543, + "grad_norm": 0.13509801925692957, + "learning_rate": 1.3491656579097518e-05, + "loss": 0.6702, + "step": 3138 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.13999714397893068, + "learning_rate": 1.3488007499412694e-05, + "loss": 0.572, + "step": 3139 + }, + { + "epoch": 1.5522185143987146, + "grad_norm": 0.13458778745735156, + "learning_rate": 1.3484357890862203e-05, + "loss": 0.5872, + "step": 3140 + }, + { + "epoch": 1.5527128908663947, + "grad_norm": 0.14002025560665748, + "learning_rate": 1.3480707753999424e-05, + "loss": 0.6315, + "step": 3141 + }, + { + "epoch": 1.5532072673340749, + "grad_norm": 0.1474697947726157, + "learning_rate": 1.34770570893778e-05, + "loss": 0.621, + "step": 3142 + }, + { + "epoch": 1.553701643801755, + "grad_norm": 0.13819555113462578, + "learning_rate": 1.347340589755085e-05, + "loss": 0.587, + "step": 3143 + }, + { + "epoch": 1.5541960202694352, + "grad_norm": 0.13392051197534433, + "learning_rate": 1.3469754179072198e-05, + "loss": 0.6124, + "step": 3144 + }, + { + "epoch": 1.5546903967371153, + "grad_norm": 0.14017289935517033, + "learning_rate": 1.3466101934495522e-05, + "loss": 0.626, + "step": 3145 + }, + { + "epoch": 1.5551847732047954, + "grad_norm": 0.13871476451624526, + "learning_rate": 1.3462449164374591e-05, + "loss": 0.5891, + "step": 3146 + }, + { + "epoch": 1.5556791496724756, + "grad_norm": 0.1366678625290967, + "learning_rate": 1.3458795869263258e-05, + "loss": 0.6519, + "step": 3147 + }, + { + "epoch": 1.5561735261401557, + "grad_norm": 0.139542177618392, + "learning_rate": 1.3455142049715444e-05, + "loss": 0.6049, + "step": 3148 + }, + { + "epoch": 1.5566679026078358, + "grad_norm": 0.13584339596625358, + "learning_rate": 1.3451487706285158e-05, + "loss": 0.6439, + "step": 3149 + }, + { + "epoch": 1.557162279075516, + "grad_norm": 0.13052318837754115, + "learning_rate": 1.3447832839526488e-05, + "loss": 0.6007, + "step": 3150 + }, + { + "epoch": 1.557656655543196, + "grad_norm": 0.14942667859346004, + "learning_rate": 1.3444177449993598e-05, + "loss": 0.6818, + "step": 3151 + }, + { + "epoch": 1.5581510320108762, + "grad_norm": 0.14001170296496568, + "learning_rate": 1.3440521538240732e-05, + "loss": 0.6181, + "step": 3152 + }, + { + "epoch": 1.5586454084785564, + "grad_norm": 0.1349769671550634, + "learning_rate": 1.3436865104822217e-05, + "loss": 0.6448, + "step": 3153 + }, + { + "epoch": 1.5591397849462365, + "grad_norm": 0.14533978421355925, + "learning_rate": 1.3433208150292451e-05, + "loss": 0.632, + "step": 3154 + }, + { + "epoch": 1.5596341614139169, + "grad_norm": 0.13891127357243369, + "learning_rate": 1.342955067520592e-05, + "loss": 0.6092, + "step": 3155 + }, + { + "epoch": 1.5601285378815968, + "grad_norm": 0.1435676401189588, + "learning_rate": 1.3425892680117185e-05, + "loss": 0.6347, + "step": 3156 + }, + { + "epoch": 1.5606229143492771, + "grad_norm": 0.14191217648774643, + "learning_rate": 1.3422234165580884e-05, + "loss": 0.6577, + "step": 3157 + }, + { + "epoch": 1.561117290816957, + "grad_norm": 0.13666697314318452, + "learning_rate": 1.3418575132151736e-05, + "loss": 0.6542, + "step": 3158 + }, + { + "epoch": 1.5616116672846374, + "grad_norm": 0.14191696006079188, + "learning_rate": 1.3414915580384538e-05, + "loss": 0.6177, + "step": 3159 + }, + { + "epoch": 1.5621060437523173, + "grad_norm": 0.1317361639487236, + "learning_rate": 1.3411255510834166e-05, + "loss": 0.6418, + "step": 3160 + }, + { + "epoch": 1.5626004202199977, + "grad_norm": 0.13395957688893542, + "learning_rate": 1.340759492405558e-05, + "loss": 0.6237, + "step": 3161 + }, + { + "epoch": 1.5630947966876776, + "grad_norm": 0.13662385435948918, + "learning_rate": 1.3403933820603806e-05, + "loss": 0.6519, + "step": 3162 + }, + { + "epoch": 1.563589173155358, + "grad_norm": 0.13997228513910412, + "learning_rate": 1.3400272201033952e-05, + "loss": 0.6089, + "step": 3163 + }, + { + "epoch": 1.5640835496230379, + "grad_norm": 0.13594893664216282, + "learning_rate": 1.3396610065901219e-05, + "loss": 0.6393, + "step": 3164 + }, + { + "epoch": 1.5645779260907182, + "grad_norm": 0.13838923293035457, + "learning_rate": 1.3392947415760864e-05, + "loss": 0.6385, + "step": 3165 + }, + { + "epoch": 1.5650723025583981, + "grad_norm": 0.13197230718863584, + "learning_rate": 1.3389284251168237e-05, + "loss": 0.6125, + "step": 3166 + }, + { + "epoch": 1.5655666790260785, + "grad_norm": 0.13837261881666593, + "learning_rate": 1.3385620572678763e-05, + "loss": 0.6356, + "step": 3167 + }, + { + "epoch": 1.5660610554937584, + "grad_norm": 0.1451344582049357, + "learning_rate": 1.3381956380847942e-05, + "loss": 0.6347, + "step": 3168 + }, + { + "epoch": 1.5665554319614388, + "grad_norm": 0.13626680166382432, + "learning_rate": 1.3378291676231355e-05, + "loss": 0.6174, + "step": 3169 + }, + { + "epoch": 1.5670498084291187, + "grad_norm": 0.13471489187920455, + "learning_rate": 1.3374626459384655e-05, + "loss": 0.6023, + "step": 3170 + }, + { + "epoch": 1.567544184896799, + "grad_norm": 0.13271908673872745, + "learning_rate": 1.337096073086358e-05, + "loss": 0.6029, + "step": 3171 + }, + { + "epoch": 1.568038561364479, + "grad_norm": 0.1424595580063818, + "learning_rate": 1.3367294491223944e-05, + "loss": 0.5978, + "step": 3172 + }, + { + "epoch": 1.5685329378321593, + "grad_norm": 0.13848932761608737, + "learning_rate": 1.336362774102163e-05, + "loss": 0.6271, + "step": 3173 + }, + { + "epoch": 1.5690273142998392, + "grad_norm": 0.1494768045290518, + "learning_rate": 1.3359960480812614e-05, + "loss": 0.6305, + "step": 3174 + }, + { + "epoch": 1.5695216907675196, + "grad_norm": 0.14547374702517424, + "learning_rate": 1.3356292711152938e-05, + "loss": 0.6296, + "step": 3175 + }, + { + "epoch": 1.5700160672351995, + "grad_norm": 0.14075348017943853, + "learning_rate": 1.3352624432598717e-05, + "loss": 0.5862, + "step": 3176 + }, + { + "epoch": 1.5705104437028798, + "grad_norm": 0.1496765132759404, + "learning_rate": 1.3348955645706162e-05, + "loss": 0.6388, + "step": 3177 + }, + { + "epoch": 1.5710048201705598, + "grad_norm": 0.1446725327692207, + "learning_rate": 1.3345286351031544e-05, + "loss": 0.6127, + "step": 3178 + }, + { + "epoch": 1.5714991966382401, + "grad_norm": 0.1374789790588409, + "learning_rate": 1.3341616549131209e-05, + "loss": 0.6339, + "step": 3179 + }, + { + "epoch": 1.57199357310592, + "grad_norm": 0.1545880068248095, + "learning_rate": 1.3337946240561595e-05, + "loss": 0.5943, + "step": 3180 + }, + { + "epoch": 1.5724879495736004, + "grad_norm": 0.1359957257399134, + "learning_rate": 1.3334275425879208e-05, + "loss": 0.6712, + "step": 3181 + }, + { + "epoch": 1.5729823260412803, + "grad_norm": 0.14734166316454345, + "learning_rate": 1.3330604105640633e-05, + "loss": 0.6535, + "step": 3182 + }, + { + "epoch": 1.5734767025089607, + "grad_norm": 0.1399963134189284, + "learning_rate": 1.3326932280402524e-05, + "loss": 0.6624, + "step": 3183 + }, + { + "epoch": 1.5739710789766406, + "grad_norm": 0.15040691064773848, + "learning_rate": 1.3323259950721626e-05, + "loss": 0.6209, + "step": 3184 + }, + { + "epoch": 1.574465455444321, + "grad_norm": 0.14274443003498988, + "learning_rate": 1.3319587117154746e-05, + "loss": 0.6353, + "step": 3185 + }, + { + "epoch": 1.5749598319120008, + "grad_norm": 0.13918403876306587, + "learning_rate": 1.3315913780258778e-05, + "loss": 0.61, + "step": 3186 + }, + { + "epoch": 1.5754542083796812, + "grad_norm": 0.137672590381422, + "learning_rate": 1.3312239940590683e-05, + "loss": 0.5952, + "step": 3187 + }, + { + "epoch": 1.5759485848473611, + "grad_norm": 0.13879892402216648, + "learning_rate": 1.3308565598707508e-05, + "loss": 0.5899, + "step": 3188 + }, + { + "epoch": 1.5764429613150415, + "grad_norm": 0.1446327173734715, + "learning_rate": 1.3304890755166366e-05, + "loss": 0.6184, + "step": 3189 + }, + { + "epoch": 1.5769373377827214, + "grad_norm": 0.13314644102550002, + "learning_rate": 1.3301215410524462e-05, + "loss": 0.5929, + "step": 3190 + }, + { + "epoch": 1.5774317142504017, + "grad_norm": 0.14610427035672652, + "learning_rate": 1.3297539565339057e-05, + "loss": 0.6099, + "step": 3191 + }, + { + "epoch": 1.5779260907180817, + "grad_norm": 0.1468422981165961, + "learning_rate": 1.3293863220167497e-05, + "loss": 0.6209, + "step": 3192 + }, + { + "epoch": 1.578420467185762, + "grad_norm": 0.13568395654859466, + "learning_rate": 1.329018637556721e-05, + "loss": 0.6001, + "step": 3193 + }, + { + "epoch": 1.5789148436534421, + "grad_norm": 0.14034647520646434, + "learning_rate": 1.3286509032095691e-05, + "loss": 0.6156, + "step": 3194 + }, + { + "epoch": 1.5794092201211223, + "grad_norm": 0.13870820312463789, + "learning_rate": 1.3282831190310513e-05, + "loss": 0.6034, + "step": 3195 + }, + { + "epoch": 1.5799035965888024, + "grad_norm": 0.13707912847825426, + "learning_rate": 1.3279152850769323e-05, + "loss": 0.6368, + "step": 3196 + }, + { + "epoch": 1.5803979730564826, + "grad_norm": 0.133024057185432, + "learning_rate": 1.3275474014029855e-05, + "loss": 0.61, + "step": 3197 + }, + { + "epoch": 1.5808923495241627, + "grad_norm": 0.14069380553150157, + "learning_rate": 1.3271794680649897e-05, + "loss": 0.6198, + "step": 3198 + }, + { + "epoch": 1.5813867259918428, + "grad_norm": 0.13595418852031405, + "learning_rate": 1.326811485118733e-05, + "loss": 0.6384, + "step": 3199 + }, + { + "epoch": 1.581881102459523, + "grad_norm": 0.1374841988493327, + "learning_rate": 1.3264434526200105e-05, + "loss": 0.6275, + "step": 3200 + }, + { + "epoch": 1.582375478927203, + "grad_norm": 0.13587177495952504, + "learning_rate": 1.3260753706246247e-05, + "loss": 0.616, + "step": 3201 + }, + { + "epoch": 1.5828698553948832, + "grad_norm": 0.13107259141276476, + "learning_rate": 1.3257072391883856e-05, + "loss": 0.605, + "step": 3202 + }, + { + "epoch": 1.5833642318625634, + "grad_norm": 0.1421534420562897, + "learning_rate": 1.3253390583671109e-05, + "loss": 0.6484, + "step": 3203 + }, + { + "epoch": 1.5838586083302435, + "grad_norm": 0.13292361909975428, + "learning_rate": 1.3249708282166255e-05, + "loss": 0.6401, + "step": 3204 + }, + { + "epoch": 1.5843529847979236, + "grad_norm": 0.14113863056174822, + "learning_rate": 1.3246025487927617e-05, + "loss": 0.661, + "step": 3205 + }, + { + "epoch": 1.5848473612656038, + "grad_norm": 0.136762278425901, + "learning_rate": 1.3242342201513599e-05, + "loss": 0.5916, + "step": 3206 + }, + { + "epoch": 1.585341737733284, + "grad_norm": 0.13186886778600512, + "learning_rate": 1.3238658423482675e-05, + "loss": 0.6008, + "step": 3207 + }, + { + "epoch": 1.585836114200964, + "grad_norm": 0.13034732568133967, + "learning_rate": 1.3234974154393395e-05, + "loss": 0.652, + "step": 3208 + }, + { + "epoch": 1.5863304906686442, + "grad_norm": 0.137985634037361, + "learning_rate": 1.3231289394804376e-05, + "loss": 0.6331, + "step": 3209 + }, + { + "epoch": 1.5868248671363243, + "grad_norm": 0.13434476855767405, + "learning_rate": 1.3227604145274327e-05, + "loss": 0.6323, + "step": 3210 + }, + { + "epoch": 1.5873192436040044, + "grad_norm": 0.14456726095765857, + "learning_rate": 1.3223918406362011e-05, + "loss": 0.602, + "step": 3211 + }, + { + "epoch": 1.5878136200716846, + "grad_norm": 0.1382447003794755, + "learning_rate": 1.3220232178626277e-05, + "loss": 0.6212, + "step": 3212 + }, + { + "epoch": 1.5883079965393647, + "grad_norm": 0.13537083246689202, + "learning_rate": 1.3216545462626051e-05, + "loss": 0.6251, + "step": 3213 + }, + { + "epoch": 1.5888023730070449, + "grad_norm": 0.14332947603867238, + "learning_rate": 1.321285825892032e-05, + "loss": 0.6177, + "step": 3214 + }, + { + "epoch": 1.589296749474725, + "grad_norm": 0.13927171830827012, + "learning_rate": 1.3209170568068157e-05, + "loss": 0.6209, + "step": 3215 + }, + { + "epoch": 1.5897911259424051, + "grad_norm": 0.1426307630218252, + "learning_rate": 1.3205482390628703e-05, + "loss": 0.6079, + "step": 3216 + }, + { + "epoch": 1.5902855024100853, + "grad_norm": 0.13794792922866117, + "learning_rate": 1.3201793727161174e-05, + "loss": 0.6289, + "step": 3217 + }, + { + "epoch": 1.5907798788777654, + "grad_norm": 0.1391780697296845, + "learning_rate": 1.319810457822486e-05, + "loss": 0.6457, + "step": 3218 + }, + { + "epoch": 1.5912742553454455, + "grad_norm": 0.1369301781460764, + "learning_rate": 1.3194414944379125e-05, + "loss": 0.6203, + "step": 3219 + }, + { + "epoch": 1.5917686318131257, + "grad_norm": 0.14242907060272478, + "learning_rate": 1.3190724826183407e-05, + "loss": 0.6335, + "step": 3220 + }, + { + "epoch": 1.5922630082808058, + "grad_norm": 0.14070384726270427, + "learning_rate": 1.3187034224197214e-05, + "loss": 0.652, + "step": 3221 + }, + { + "epoch": 1.592757384748486, + "grad_norm": 0.1423789692879403, + "learning_rate": 1.3183343138980132e-05, + "loss": 0.5887, + "step": 3222 + }, + { + "epoch": 1.593251761216166, + "grad_norm": 0.13343542244928341, + "learning_rate": 1.3179651571091818e-05, + "loss": 0.607, + "step": 3223 + }, + { + "epoch": 1.5937461376838462, + "grad_norm": 0.14120775297499866, + "learning_rate": 1.3175959521092003e-05, + "loss": 0.6199, + "step": 3224 + }, + { + "epoch": 1.5942405141515263, + "grad_norm": 0.13762305203974795, + "learning_rate": 1.3172266989540485e-05, + "loss": 0.6739, + "step": 3225 + }, + { + "epoch": 1.5947348906192065, + "grad_norm": 0.14157981383786575, + "learning_rate": 1.3168573976997148e-05, + "loss": 0.615, + "step": 3226 + }, + { + "epoch": 1.5952292670868866, + "grad_norm": 0.13570484320033752, + "learning_rate": 1.3164880484021938e-05, + "loss": 0.5821, + "step": 3227 + }, + { + "epoch": 1.5957236435545668, + "grad_norm": 0.13094438338899217, + "learning_rate": 1.3161186511174875e-05, + "loss": 0.674, + "step": 3228 + }, + { + "epoch": 1.5962180200222469, + "grad_norm": 0.136595055544307, + "learning_rate": 1.3157492059016055e-05, + "loss": 0.6722, + "step": 3229 + }, + { + "epoch": 1.5967123964899272, + "grad_norm": 0.13448728031676652, + "learning_rate": 1.315379712810565e-05, + "loss": 0.5965, + "step": 3230 + }, + { + "epoch": 1.5972067729576072, + "grad_norm": 0.1411252156489022, + "learning_rate": 1.3150101719003896e-05, + "loss": 0.6125, + "step": 3231 + }, + { + "epoch": 1.5977011494252875, + "grad_norm": 0.1284860418275866, + "learning_rate": 1.3146405832271105e-05, + "loss": 0.6001, + "step": 3232 + }, + { + "epoch": 1.5981955258929674, + "grad_norm": 0.13466845205568026, + "learning_rate": 1.3142709468467665e-05, + "loss": 0.6299, + "step": 3233 + }, + { + "epoch": 1.5986899023606478, + "grad_norm": 0.13370216458343762, + "learning_rate": 1.3139012628154033e-05, + "loss": 0.6276, + "step": 3234 + }, + { + "epoch": 1.5991842788283277, + "grad_norm": 0.13745845207288956, + "learning_rate": 1.3135315311890737e-05, + "loss": 0.6056, + "step": 3235 + }, + { + "epoch": 1.599678655296008, + "grad_norm": 0.12917075168009745, + "learning_rate": 1.313161752023838e-05, + "loss": 0.6246, + "step": 3236 + }, + { + "epoch": 1.600173031763688, + "grad_norm": 0.13653130920840414, + "learning_rate": 1.3127919253757637e-05, + "loss": 0.6417, + "step": 3237 + }, + { + "epoch": 1.6006674082313683, + "grad_norm": 0.13840805167694406, + "learning_rate": 1.3124220513009252e-05, + "loss": 0.5769, + "step": 3238 + }, + { + "epoch": 1.6011617846990482, + "grad_norm": 0.13728031388668377, + "learning_rate": 1.3120521298554043e-05, + "loss": 0.6064, + "step": 3239 + }, + { + "epoch": 1.6016561611667286, + "grad_norm": 0.13907097572585994, + "learning_rate": 1.3116821610952902e-05, + "loss": 0.5923, + "step": 3240 + }, + { + "epoch": 1.6021505376344085, + "grad_norm": 0.13814903527606867, + "learning_rate": 1.3113121450766783e-05, + "loss": 0.602, + "step": 3241 + }, + { + "epoch": 1.6026449141020889, + "grad_norm": 0.1322946238302828, + "learning_rate": 1.3109420818556731e-05, + "loss": 0.6378, + "step": 3242 + }, + { + "epoch": 1.6031392905697688, + "grad_norm": 0.13826569103668201, + "learning_rate": 1.3105719714883845e-05, + "loss": 0.6424, + "step": 3243 + }, + { + "epoch": 1.6036336670374491, + "grad_norm": 0.13523797527321885, + "learning_rate": 1.3102018140309297e-05, + "loss": 0.6712, + "step": 3244 + }, + { + "epoch": 1.604128043505129, + "grad_norm": 0.13605751710003355, + "learning_rate": 1.3098316095394341e-05, + "loss": 0.6335, + "step": 3245 + }, + { + "epoch": 1.6046224199728094, + "grad_norm": 0.13737273810505993, + "learning_rate": 1.3094613580700295e-05, + "loss": 0.613, + "step": 3246 + }, + { + "epoch": 1.6051167964404893, + "grad_norm": 0.13544787362402191, + "learning_rate": 1.3090910596788541e-05, + "loss": 0.6133, + "step": 3247 + }, + { + "epoch": 1.6056111729081697, + "grad_norm": 0.14046299058143344, + "learning_rate": 1.308720714422055e-05, + "loss": 0.5854, + "step": 3248 + }, + { + "epoch": 1.6061055493758496, + "grad_norm": 0.13702796426679278, + "learning_rate": 1.3083503223557852e-05, + "loss": 0.6268, + "step": 3249 + }, + { + "epoch": 1.60659992584353, + "grad_norm": 0.1427291036094163, + "learning_rate": 1.307979883536205e-05, + "loss": 0.6286, + "step": 3250 + }, + { + "epoch": 1.6070943023112099, + "grad_norm": 0.1352766542017847, + "learning_rate": 1.3076093980194815e-05, + "loss": 0.6183, + "step": 3251 + }, + { + "epoch": 1.6075886787788902, + "grad_norm": 0.1429338853847033, + "learning_rate": 1.3072388658617896e-05, + "loss": 0.6154, + "step": 3252 + }, + { + "epoch": 1.6080830552465701, + "grad_norm": 0.1380121452172336, + "learning_rate": 1.3068682871193105e-05, + "loss": 0.7014, + "step": 3253 + }, + { + "epoch": 1.6085774317142505, + "grad_norm": 0.16759319984920187, + "learning_rate": 1.3064976618482332e-05, + "loss": 0.6555, + "step": 3254 + }, + { + "epoch": 1.6090718081819304, + "grad_norm": 0.1423913655061918, + "learning_rate": 1.3061269901047528e-05, + "loss": 0.6246, + "step": 3255 + }, + { + "epoch": 1.6095661846496108, + "grad_norm": 0.14412253784532283, + "learning_rate": 1.3057562719450732e-05, + "loss": 0.6121, + "step": 3256 + }, + { + "epoch": 1.6100605611172907, + "grad_norm": 0.13973435380562466, + "learning_rate": 1.305385507425403e-05, + "loss": 0.5924, + "step": 3257 + }, + { + "epoch": 1.610554937584971, + "grad_norm": 0.1358383712736842, + "learning_rate": 1.3050146966019592e-05, + "loss": 0.6373, + "step": 3258 + }, + { + "epoch": 1.611049314052651, + "grad_norm": 0.13398025201658742, + "learning_rate": 1.3046438395309665e-05, + "loss": 0.609, + "step": 3259 + }, + { + "epoch": 1.6115436905203313, + "grad_norm": 0.1378946792536679, + "learning_rate": 1.3042729362686546e-05, + "loss": 0.6136, + "step": 3260 + }, + { + "epoch": 1.6120380669880112, + "grad_norm": 0.14018238138724756, + "learning_rate": 1.3039019868712617e-05, + "loss": 0.652, + "step": 3261 + }, + { + "epoch": 1.6125324434556916, + "grad_norm": 0.2087087739294766, + "learning_rate": 1.3035309913950332e-05, + "loss": 0.6172, + "step": 3262 + }, + { + "epoch": 1.6130268199233715, + "grad_norm": 0.13481513965107542, + "learning_rate": 1.30315994989622e-05, + "loss": 0.6426, + "step": 3263 + }, + { + "epoch": 1.6135211963910518, + "grad_norm": 0.13858291529349068, + "learning_rate": 1.3027888624310816e-05, + "loss": 0.6092, + "step": 3264 + }, + { + "epoch": 1.6140155728587318, + "grad_norm": 0.14515479333411135, + "learning_rate": 1.3024177290558835e-05, + "loss": 0.6434, + "step": 3265 + }, + { + "epoch": 1.6145099493264121, + "grad_norm": 0.3532616848798379, + "learning_rate": 1.3020465498268986e-05, + "loss": 0.6521, + "step": 3266 + }, + { + "epoch": 1.6150043257940923, + "grad_norm": 0.14350994063820474, + "learning_rate": 1.3016753248004064e-05, + "loss": 0.6117, + "step": 3267 + }, + { + "epoch": 1.6154987022617724, + "grad_norm": 0.14417730184396127, + "learning_rate": 1.3013040540326935e-05, + "loss": 0.6759, + "step": 3268 + }, + { + "epoch": 1.6159930787294525, + "grad_norm": 0.13800613766347275, + "learning_rate": 1.3009327375800536e-05, + "loss": 0.616, + "step": 3269 + }, + { + "epoch": 1.6164874551971327, + "grad_norm": 0.14528191630977094, + "learning_rate": 1.300561375498787e-05, + "loss": 0.6126, + "step": 3270 + }, + { + "epoch": 1.6169818316648128, + "grad_norm": 0.13340647695759067, + "learning_rate": 1.300189967845201e-05, + "loss": 0.6085, + "step": 3271 + }, + { + "epoch": 1.617476208132493, + "grad_norm": 0.14314539287884773, + "learning_rate": 1.2998185146756108e-05, + "loss": 0.6692, + "step": 3272 + }, + { + "epoch": 1.617970584600173, + "grad_norm": 0.16045115847595043, + "learning_rate": 1.2994470160463367e-05, + "loss": 0.6227, + "step": 3273 + }, + { + "epoch": 1.6184649610678532, + "grad_norm": 0.14272274148304515, + "learning_rate": 1.2990754720137066e-05, + "loss": 0.63, + "step": 3274 + }, + { + "epoch": 1.6189593375355333, + "grad_norm": 0.13339020290256426, + "learning_rate": 1.2987038826340563e-05, + "loss": 0.595, + "step": 3275 + }, + { + "epoch": 1.6194537140032135, + "grad_norm": 0.13330207800583002, + "learning_rate": 1.2983322479637277e-05, + "loss": 0.6236, + "step": 3276 + }, + { + "epoch": 1.6199480904708936, + "grad_norm": 0.14691641662941754, + "learning_rate": 1.2979605680590686e-05, + "loss": 0.6107, + "step": 3277 + }, + { + "epoch": 1.6204424669385737, + "grad_norm": 0.1358829801865821, + "learning_rate": 1.2975888429764354e-05, + "loss": 0.6299, + "step": 3278 + }, + { + "epoch": 1.6209368434062539, + "grad_norm": 0.13521273612375, + "learning_rate": 1.2972170727721904e-05, + "loss": 0.6036, + "step": 3279 + }, + { + "epoch": 1.621431219873934, + "grad_norm": 0.14166976340821896, + "learning_rate": 1.2968452575027024e-05, + "loss": 0.5854, + "step": 3280 + }, + { + "epoch": 1.6219255963416142, + "grad_norm": 0.1432166905156229, + "learning_rate": 1.2964733972243484e-05, + "loss": 0.5945, + "step": 3281 + }, + { + "epoch": 1.6224199728092943, + "grad_norm": 0.13671454885242726, + "learning_rate": 1.2961014919935106e-05, + "loss": 0.6308, + "step": 3282 + }, + { + "epoch": 1.6229143492769744, + "grad_norm": 0.13895267810489836, + "learning_rate": 1.2957295418665789e-05, + "loss": 0.5975, + "step": 3283 + }, + { + "epoch": 1.6234087257446546, + "grad_norm": 0.12953339968617975, + "learning_rate": 1.2953575468999503e-05, + "loss": 0.6205, + "step": 3284 + }, + { + "epoch": 1.6239031022123347, + "grad_norm": 0.13693153428020666, + "learning_rate": 1.2949855071500277e-05, + "loss": 0.5851, + "step": 3285 + }, + { + "epoch": 1.6243974786800148, + "grad_norm": 0.14124126568936066, + "learning_rate": 1.2946134226732215e-05, + "loss": 0.693, + "step": 3286 + }, + { + "epoch": 1.624891855147695, + "grad_norm": 0.4320467988643534, + "learning_rate": 1.2942412935259483e-05, + "loss": 0.5746, + "step": 3287 + }, + { + "epoch": 1.625386231615375, + "grad_norm": 0.13271857963371514, + "learning_rate": 1.293869119764632e-05, + "loss": 0.6356, + "step": 3288 + }, + { + "epoch": 1.6258806080830552, + "grad_norm": 0.1654091215456113, + "learning_rate": 1.2934969014457037e-05, + "loss": 0.642, + "step": 3289 + }, + { + "epoch": 1.6263749845507354, + "grad_norm": 0.16489457025770488, + "learning_rate": 1.2931246386255996e-05, + "loss": 0.5898, + "step": 3290 + }, + { + "epoch": 1.6268693610184155, + "grad_norm": 0.14401958166260512, + "learning_rate": 1.2927523313607639e-05, + "loss": 0.6645, + "step": 3291 + }, + { + "epoch": 1.6273637374860956, + "grad_norm": 0.14805491747509478, + "learning_rate": 1.2923799797076484e-05, + "loss": 0.6075, + "step": 3292 + }, + { + "epoch": 1.6278581139537758, + "grad_norm": 0.13307103945040427, + "learning_rate": 1.292007583722709e-05, + "loss": 0.6326, + "step": 3293 + }, + { + "epoch": 1.628352490421456, + "grad_norm": 0.13735507237817693, + "learning_rate": 1.2916351434624108e-05, + "loss": 0.6311, + "step": 3294 + }, + { + "epoch": 1.628846866889136, + "grad_norm": 0.14553538073205718, + "learning_rate": 1.2912626589832247e-05, + "loss": 0.624, + "step": 3295 + }, + { + "epoch": 1.6293412433568162, + "grad_norm": 0.14314139601312664, + "learning_rate": 1.2908901303416274e-05, + "loss": 0.6142, + "step": 3296 + }, + { + "epoch": 1.6298356198244963, + "grad_norm": 0.14572050170017783, + "learning_rate": 1.2905175575941045e-05, + "loss": 0.6062, + "step": 3297 + }, + { + "epoch": 1.6303299962921765, + "grad_norm": 0.14326052249223714, + "learning_rate": 1.290144940797146e-05, + "loss": 0.6332, + "step": 3298 + }, + { + "epoch": 1.6308243727598566, + "grad_norm": 0.13297965510109194, + "learning_rate": 1.28977228000725e-05, + "loss": 0.6045, + "step": 3299 + }, + { + "epoch": 1.6313187492275367, + "grad_norm": 0.14286170391022884, + "learning_rate": 1.2893995752809206e-05, + "loss": 0.6244, + "step": 3300 + }, + { + "epoch": 1.6318131256952169, + "grad_norm": 0.14138850896211833, + "learning_rate": 1.2890268266746689e-05, + "loss": 0.637, + "step": 3301 + }, + { + "epoch": 1.632307502162897, + "grad_norm": 0.13412033470760393, + "learning_rate": 1.2886540342450124e-05, + "loss": 0.6272, + "step": 3302 + }, + { + "epoch": 1.6328018786305771, + "grad_norm": 0.14385001650424567, + "learning_rate": 1.2882811980484755e-05, + "loss": 0.6046, + "step": 3303 + }, + { + "epoch": 1.6332962550982573, + "grad_norm": 0.13818405546330384, + "learning_rate": 1.287908318141589e-05, + "loss": 0.6547, + "step": 3304 + }, + { + "epoch": 1.6337906315659376, + "grad_norm": 0.14288270439914372, + "learning_rate": 1.287535394580891e-05, + "loss": 0.6091, + "step": 3305 + }, + { + "epoch": 1.6342850080336175, + "grad_norm": 0.1452836057768316, + "learning_rate": 1.2871624274229249e-05, + "loss": 0.6433, + "step": 3306 + }, + { + "epoch": 1.634779384501298, + "grad_norm": 0.14041397237200878, + "learning_rate": 1.2867894167242416e-05, + "loss": 0.6039, + "step": 3307 + }, + { + "epoch": 1.6352737609689778, + "grad_norm": 0.14972096559180736, + "learning_rate": 1.286416362541399e-05, + "loss": 0.6202, + "step": 3308 + }, + { + "epoch": 1.6357681374366582, + "grad_norm": 0.14521612415031798, + "learning_rate": 1.2860432649309607e-05, + "loss": 0.6326, + "step": 3309 + }, + { + "epoch": 1.636262513904338, + "grad_norm": 0.15394192581249344, + "learning_rate": 1.2856701239494969e-05, + "loss": 0.6469, + "step": 3310 + }, + { + "epoch": 1.6367568903720184, + "grad_norm": 0.15445433427771119, + "learning_rate": 1.2852969396535852e-05, + "loss": 0.6198, + "step": 3311 + }, + { + "epoch": 1.6372512668396983, + "grad_norm": 0.14501929117793708, + "learning_rate": 1.2849237120998094e-05, + "loss": 0.6056, + "step": 3312 + }, + { + "epoch": 1.6377456433073787, + "grad_norm": 0.14247487127578087, + "learning_rate": 1.2845504413447597e-05, + "loss": 0.6273, + "step": 3313 + }, + { + "epoch": 1.6382400197750586, + "grad_norm": 0.15424551661417452, + "learning_rate": 1.2841771274450325e-05, + "loss": 0.9194, + "step": 3314 + }, + { + "epoch": 1.638734396242739, + "grad_norm": 6.038448261960974, + "learning_rate": 1.2838037704572315e-05, + "loss": 0.6043, + "step": 3315 + }, + { + "epoch": 1.639228772710419, + "grad_norm": 0.1463131088913639, + "learning_rate": 1.2834303704379665e-05, + "loss": 0.6361, + "step": 3316 + }, + { + "epoch": 1.6397231491780992, + "grad_norm": 0.1474198688451094, + "learning_rate": 1.283056927443854e-05, + "loss": 0.6299, + "step": 3317 + }, + { + "epoch": 1.6402175256457792, + "grad_norm": 0.14185778541364005, + "learning_rate": 1.2826834415315165e-05, + "loss": 0.5907, + "step": 3318 + }, + { + "epoch": 1.6407119021134595, + "grad_norm": 0.15473232216255234, + "learning_rate": 1.282309912757584e-05, + "loss": 0.6183, + "step": 3319 + }, + { + "epoch": 1.6412062785811394, + "grad_norm": 0.15742200086702357, + "learning_rate": 1.2819363411786922e-05, + "loss": 0.6192, + "step": 3320 + }, + { + "epoch": 1.6417006550488198, + "grad_norm": 0.14088549254477067, + "learning_rate": 1.2815627268514837e-05, + "loss": 0.645, + "step": 3321 + }, + { + "epoch": 1.6421950315164997, + "grad_norm": 0.1559384499992665, + "learning_rate": 1.2811890698326069e-05, + "loss": 0.6177, + "step": 3322 + }, + { + "epoch": 1.64268940798418, + "grad_norm": 0.14996115766583998, + "learning_rate": 1.2808153701787172e-05, + "loss": 0.6575, + "step": 3323 + }, + { + "epoch": 1.64318378445186, + "grad_norm": 0.15115775059752387, + "learning_rate": 1.2804416279464771e-05, + "loss": 0.625, + "step": 3324 + }, + { + "epoch": 1.6436781609195403, + "grad_norm": 0.15286103196886608, + "learning_rate": 1.2800678431925546e-05, + "loss": 0.5899, + "step": 3325 + }, + { + "epoch": 1.6441725373872202, + "grad_norm": 0.14597209351555535, + "learning_rate": 1.279694015973624e-05, + "loss": 0.6114, + "step": 3326 + }, + { + "epoch": 1.6446669138549006, + "grad_norm": 0.13504692192810133, + "learning_rate": 1.2793201463463671e-05, + "loss": 0.6215, + "step": 3327 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.1339821070659464, + "learning_rate": 1.2789462343674712e-05, + "loss": 0.6016, + "step": 3328 + }, + { + "epoch": 1.6456556667902609, + "grad_norm": 0.13962789770712478, + "learning_rate": 1.2785722800936302e-05, + "loss": 0.6255, + "step": 3329 + }, + { + "epoch": 1.6461500432579408, + "grad_norm": 0.1523230861018428, + "learning_rate": 1.2781982835815449e-05, + "loss": 0.6468, + "step": 3330 + }, + { + "epoch": 1.6466444197256211, + "grad_norm": 0.14945588119006264, + "learning_rate": 1.2778242448879219e-05, + "loss": 0.6227, + "step": 3331 + }, + { + "epoch": 1.647138796193301, + "grad_norm": 0.14704102046967654, + "learning_rate": 1.2774501640694746e-05, + "loss": 0.6364, + "step": 3332 + }, + { + "epoch": 1.6476331726609814, + "grad_norm": 0.14364902316261283, + "learning_rate": 1.2770760411829223e-05, + "loss": 0.6, + "step": 3333 + }, + { + "epoch": 1.6481275491286613, + "grad_norm": 0.1483859429013549, + "learning_rate": 1.2767018762849915e-05, + "loss": 0.6225, + "step": 3334 + }, + { + "epoch": 1.6486219255963417, + "grad_norm": 0.14278907625481232, + "learning_rate": 1.2763276694324143e-05, + "loss": 0.6361, + "step": 3335 + }, + { + "epoch": 1.6491163020640216, + "grad_norm": 0.14747419640312953, + "learning_rate": 1.2759534206819293e-05, + "loss": 0.6928, + "step": 3336 + }, + { + "epoch": 1.649610678531702, + "grad_norm": 0.6709416067338753, + "learning_rate": 1.2755791300902816e-05, + "loss": 0.6431, + "step": 3337 + }, + { + "epoch": 1.6501050549993819, + "grad_norm": 0.163892750690712, + "learning_rate": 1.2752047977142232e-05, + "loss": 0.6366, + "step": 3338 + }, + { + "epoch": 1.6505994314670622, + "grad_norm": 0.15054177222590812, + "learning_rate": 1.2748304236105114e-05, + "loss": 0.6787, + "step": 3339 + }, + { + "epoch": 1.6510938079347421, + "grad_norm": 0.1663703026141309, + "learning_rate": 1.27445600783591e-05, + "loss": 0.5831, + "step": 3340 + }, + { + "epoch": 1.6515881844024225, + "grad_norm": 0.2111590414510785, + "learning_rate": 1.2740815504471904e-05, + "loss": 0.59, + "step": 3341 + }, + { + "epoch": 1.6520825608701026, + "grad_norm": 0.18718811282075193, + "learning_rate": 1.2737070515011284e-05, + "loss": 0.682, + "step": 3342 + }, + { + "epoch": 1.6525769373377828, + "grad_norm": 0.17214942795949675, + "learning_rate": 1.2733325110545071e-05, + "loss": 0.6506, + "step": 3343 + }, + { + "epoch": 1.653071313805463, + "grad_norm": 0.1839824718984628, + "learning_rate": 1.2729579291641164e-05, + "loss": 0.6395, + "step": 3344 + }, + { + "epoch": 1.653565690273143, + "grad_norm": 0.17226468504725218, + "learning_rate": 1.2725833058867514e-05, + "loss": 0.6255, + "step": 3345 + }, + { + "epoch": 1.6540600667408232, + "grad_norm": 0.185366760271937, + "learning_rate": 1.2722086412792143e-05, + "loss": 0.6554, + "step": 3346 + }, + { + "epoch": 1.6545544432085033, + "grad_norm": 0.15557545484859847, + "learning_rate": 1.271833935398313e-05, + "loss": 0.6231, + "step": 3347 + }, + { + "epoch": 1.6550488196761834, + "grad_norm": 0.14798214900481696, + "learning_rate": 1.2714591883008622e-05, + "loss": 0.5978, + "step": 3348 + }, + { + "epoch": 1.6555431961438636, + "grad_norm": 0.16165589444719822, + "learning_rate": 1.2710844000436822e-05, + "loss": 0.5962, + "step": 3349 + }, + { + "epoch": 1.6560375726115437, + "grad_norm": 0.15442902809386735, + "learning_rate": 1.2707095706836001e-05, + "loss": 0.6266, + "step": 3350 + }, + { + "epoch": 1.6565319490792239, + "grad_norm": 0.20846046824963985, + "learning_rate": 1.2703347002774491e-05, + "loss": 0.6673, + "step": 3351 + }, + { + "epoch": 1.657026325546904, + "grad_norm": 0.14634398215320166, + "learning_rate": 1.2699597888820682e-05, + "loss": 0.5904, + "step": 3352 + }, + { + "epoch": 1.6575207020145841, + "grad_norm": 0.14097583604586456, + "learning_rate": 1.2695848365543032e-05, + "loss": 0.6524, + "step": 3353 + }, + { + "epoch": 1.6580150784822643, + "grad_norm": 0.17473802683900488, + "learning_rate": 1.2692098433510064e-05, + "loss": 0.6403, + "step": 3354 + }, + { + "epoch": 1.6585094549499444, + "grad_norm": 0.14405438640066542, + "learning_rate": 1.268834809329035e-05, + "loss": 0.6716, + "step": 3355 + }, + { + "epoch": 1.6590038314176245, + "grad_norm": 0.14113601538041637, + "learning_rate": 1.2684597345452532e-05, + "loss": 0.5921, + "step": 3356 + }, + { + "epoch": 1.6594982078853047, + "grad_norm": 0.16586504788098505, + "learning_rate": 1.2680846190565315e-05, + "loss": 0.6297, + "step": 3357 + }, + { + "epoch": 1.6599925843529848, + "grad_norm": 0.14397419260509142, + "learning_rate": 1.267709462919747e-05, + "loss": 0.661, + "step": 3358 + }, + { + "epoch": 1.660486960820665, + "grad_norm": 0.16116445974806848, + "learning_rate": 1.2673342661917811e-05, + "loss": 0.6009, + "step": 3359 + }, + { + "epoch": 1.660981337288345, + "grad_norm": 0.13831721760762328, + "learning_rate": 1.2669590289295239e-05, + "loss": 0.6226, + "step": 3360 + }, + { + "epoch": 1.6614757137560252, + "grad_norm": 0.14656309935652914, + "learning_rate": 1.26658375118987e-05, + "loss": 0.6108, + "step": 3361 + }, + { + "epoch": 1.6619700902237053, + "grad_norm": 0.14054208449672465, + "learning_rate": 1.26620843302972e-05, + "loss": 0.6395, + "step": 3362 + }, + { + "epoch": 1.6624644666913855, + "grad_norm": 0.1363884150023906, + "learning_rate": 1.2658330745059815e-05, + "loss": 0.6137, + "step": 3363 + }, + { + "epoch": 1.6629588431590656, + "grad_norm": 0.14127442415134717, + "learning_rate": 1.2654576756755681e-05, + "loss": 0.6297, + "step": 3364 + }, + { + "epoch": 1.6634532196267457, + "grad_norm": 0.18877350549910601, + "learning_rate": 1.2650822365953988e-05, + "loss": 0.6118, + "step": 3365 + }, + { + "epoch": 1.6639475960944259, + "grad_norm": 0.14220648329532076, + "learning_rate": 1.2647067573223995e-05, + "loss": 0.6413, + "step": 3366 + }, + { + "epoch": 1.664441972562106, + "grad_norm": 0.14283584103384087, + "learning_rate": 1.2643312379135018e-05, + "loss": 0.6078, + "step": 3367 + }, + { + "epoch": 1.6649363490297862, + "grad_norm": 0.15390063113065985, + "learning_rate": 1.2639556784256435e-05, + "loss": 0.6423, + "step": 3368 + }, + { + "epoch": 1.6654307254974663, + "grad_norm": 0.13769553056543674, + "learning_rate": 1.2635800789157683e-05, + "loss": 0.5968, + "step": 3369 + }, + { + "epoch": 1.6659251019651464, + "grad_norm": 0.14315255745061828, + "learning_rate": 1.2632044394408265e-05, + "loss": 0.6105, + "step": 3370 + }, + { + "epoch": 1.6664194784328266, + "grad_norm": 0.13972799666956823, + "learning_rate": 1.2628287600577734e-05, + "loss": 0.5799, + "step": 3371 + }, + { + "epoch": 1.6669138549005067, + "grad_norm": 0.14673689526492384, + "learning_rate": 1.2624530408235716e-05, + "loss": 0.6175, + "step": 3372 + }, + { + "epoch": 1.6674082313681868, + "grad_norm": 0.14692346972328404, + "learning_rate": 1.2620772817951883e-05, + "loss": 0.6463, + "step": 3373 + }, + { + "epoch": 1.667902607835867, + "grad_norm": 0.1380710929165707, + "learning_rate": 1.2617014830295991e-05, + "loss": 0.6172, + "step": 3374 + }, + { + "epoch": 1.668396984303547, + "grad_norm": 0.12891068279930795, + "learning_rate": 1.2613256445837823e-05, + "loss": 0.6262, + "step": 3375 + }, + { + "epoch": 1.6688913607712272, + "grad_norm": 0.15672498055126405, + "learning_rate": 1.2609497665147254e-05, + "loss": 0.6096, + "step": 3376 + }, + { + "epoch": 1.6693857372389074, + "grad_norm": 0.137309130896055, + "learning_rate": 1.2605738488794204e-05, + "loss": 0.6332, + "step": 3377 + }, + { + "epoch": 1.6698801137065877, + "grad_norm": 0.14357832725203493, + "learning_rate": 1.2601978917348646e-05, + "loss": 0.6003, + "step": 3378 + }, + { + "epoch": 1.6703744901742676, + "grad_norm": 0.14964163344284268, + "learning_rate": 1.259821895138063e-05, + "loss": 0.6017, + "step": 3379 + }, + { + "epoch": 1.670868866641948, + "grad_norm": 0.1350865510219292, + "learning_rate": 1.259445859146025e-05, + "loss": 0.6316, + "step": 3380 + }, + { + "epoch": 1.671363243109628, + "grad_norm": 0.1419715842465061, + "learning_rate": 1.2590697838157673e-05, + "loss": 0.6759, + "step": 3381 + }, + { + "epoch": 1.6718576195773083, + "grad_norm": 0.14683018630381428, + "learning_rate": 1.2586936692043118e-05, + "loss": 0.6366, + "step": 3382 + }, + { + "epoch": 1.6723519960449882, + "grad_norm": 0.14122456523605803, + "learning_rate": 1.2583175153686859e-05, + "loss": 0.6225, + "step": 3383 + }, + { + "epoch": 1.6728463725126685, + "grad_norm": 0.14465036426908368, + "learning_rate": 1.2579413223659245e-05, + "loss": 0.6367, + "step": 3384 + }, + { + "epoch": 1.6733407489803485, + "grad_norm": 0.14012457656241398, + "learning_rate": 1.257565090253067e-05, + "loss": 0.6038, + "step": 3385 + }, + { + "epoch": 1.6738351254480288, + "grad_norm": 0.13312236988554696, + "learning_rate": 1.2571888190871588e-05, + "loss": 0.6301, + "step": 3386 + }, + { + "epoch": 1.6743295019157087, + "grad_norm": 0.1487763123170716, + "learning_rate": 1.2568125089252525e-05, + "loss": 0.6354, + "step": 3387 + }, + { + "epoch": 1.674823878383389, + "grad_norm": 0.1482723621360688, + "learning_rate": 1.2564361598244052e-05, + "loss": 0.6044, + "step": 3388 + }, + { + "epoch": 1.675318254851069, + "grad_norm": 0.14504214217391156, + "learning_rate": 1.2560597718416805e-05, + "loss": 0.6519, + "step": 3389 + }, + { + "epoch": 1.6758126313187494, + "grad_norm": 0.3531414020285163, + "learning_rate": 1.2556833450341484e-05, + "loss": 0.6021, + "step": 3390 + }, + { + "epoch": 1.6763070077864293, + "grad_norm": 0.15106596710795975, + "learning_rate": 1.2553068794588834e-05, + "loss": 0.5936, + "step": 3391 + }, + { + "epoch": 1.6768013842541096, + "grad_norm": 0.14721545348836024, + "learning_rate": 1.2549303751729669e-05, + "loss": 0.605, + "step": 3392 + }, + { + "epoch": 1.6772957607217895, + "grad_norm": 0.13954137658341084, + "learning_rate": 1.2545538322334867e-05, + "loss": 0.6195, + "step": 3393 + }, + { + "epoch": 1.67779013718947, + "grad_norm": 0.13771995915302027, + "learning_rate": 1.2541772506975349e-05, + "loss": 0.5907, + "step": 3394 + }, + { + "epoch": 1.6782845136571498, + "grad_norm": 0.16531452327171559, + "learning_rate": 1.2538006306222108e-05, + "loss": 0.6474, + "step": 3395 + }, + { + "epoch": 1.6787788901248302, + "grad_norm": 0.13894165251991775, + "learning_rate": 1.2534239720646188e-05, + "loss": 0.644, + "step": 3396 + }, + { + "epoch": 1.67927326659251, + "grad_norm": 0.1446913759496128, + "learning_rate": 1.2530472750818696e-05, + "loss": 0.6464, + "step": 3397 + }, + { + "epoch": 1.6797676430601904, + "grad_norm": 0.1556463359165334, + "learning_rate": 1.2526705397310794e-05, + "loss": 0.6265, + "step": 3398 + }, + { + "epoch": 1.6802620195278704, + "grad_norm": 0.15044638871828703, + "learning_rate": 1.2522937660693701e-05, + "loss": 0.6568, + "step": 3399 + }, + { + "epoch": 1.6807563959955507, + "grad_norm": 0.14215192788359166, + "learning_rate": 1.2519169541538701e-05, + "loss": 0.575, + "step": 3400 + }, + { + "epoch": 1.6812507724632306, + "grad_norm": 0.1370148302397377, + "learning_rate": 1.2515401040417126e-05, + "loss": 0.6132, + "step": 3401 + }, + { + "epoch": 1.681745148930911, + "grad_norm": 0.14145865892305726, + "learning_rate": 1.2511632157900375e-05, + "loss": 0.6122, + "step": 3402 + }, + { + "epoch": 1.682239525398591, + "grad_norm": 0.14353953447751905, + "learning_rate": 1.2507862894559899e-05, + "loss": 0.5945, + "step": 3403 + }, + { + "epoch": 1.6827339018662713, + "grad_norm": 0.13651178798986136, + "learning_rate": 1.2504093250967211e-05, + "loss": 0.6785, + "step": 3404 + }, + { + "epoch": 1.6832282783339512, + "grad_norm": 0.1402473888118249, + "learning_rate": 1.2500323227693873e-05, + "loss": 0.6382, + "step": 3405 + }, + { + "epoch": 1.6837226548016315, + "grad_norm": 0.32802512789193194, + "learning_rate": 1.2496552825311521e-05, + "loss": 0.61, + "step": 3406 + }, + { + "epoch": 1.6842170312693114, + "grad_norm": 0.14654704727094545, + "learning_rate": 1.2492782044391835e-05, + "loss": 0.6232, + "step": 3407 + }, + { + "epoch": 1.6847114077369918, + "grad_norm": 0.14253128859457673, + "learning_rate": 1.2489010885506552e-05, + "loss": 0.6636, + "step": 3408 + }, + { + "epoch": 1.6852057842046717, + "grad_norm": 0.15339869843927526, + "learning_rate": 1.2485239349227471e-05, + "loss": 0.6316, + "step": 3409 + }, + { + "epoch": 1.685700160672352, + "grad_norm": 3.0007254414486972, + "learning_rate": 1.2481467436126455e-05, + "loss": 0.6739, + "step": 3410 + }, + { + "epoch": 1.686194537140032, + "grad_norm": 0.14336445761630295, + "learning_rate": 1.2477695146775406e-05, + "loss": 0.614, + "step": 3411 + }, + { + "epoch": 1.6866889136077123, + "grad_norm": 0.14742354790367027, + "learning_rate": 1.2473922481746299e-05, + "loss": 0.6011, + "step": 3412 + }, + { + "epoch": 1.6871832900753923, + "grad_norm": 0.1380896210830806, + "learning_rate": 1.2470149441611161e-05, + "loss": 0.5974, + "step": 3413 + }, + { + "epoch": 1.6876776665430726, + "grad_norm": 0.14208895118820558, + "learning_rate": 1.2466376026942072e-05, + "loss": 0.6243, + "step": 3414 + }, + { + "epoch": 1.6881720430107527, + "grad_norm": 0.14686708564904455, + "learning_rate": 1.2462602238311177e-05, + "loss": 0.6192, + "step": 3415 + }, + { + "epoch": 1.6886664194784329, + "grad_norm": 0.1492043392848075, + "learning_rate": 1.245882807629067e-05, + "loss": 0.6175, + "step": 3416 + }, + { + "epoch": 1.689160795946113, + "grad_norm": 0.13979194842181367, + "learning_rate": 1.2455053541452806e-05, + "loss": 0.6382, + "step": 3417 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 0.171920145828318, + "learning_rate": 1.2451278634369892e-05, + "loss": 0.6159, + "step": 3418 + }, + { + "epoch": 1.6901495488814733, + "grad_norm": 0.13789936367023467, + "learning_rate": 1.2447503355614296e-05, + "loss": 0.6019, + "step": 3419 + }, + { + "epoch": 1.6906439253491534, + "grad_norm": 0.14281199200693745, + "learning_rate": 1.2443727705758448e-05, + "loss": 0.6299, + "step": 3420 + }, + { + "epoch": 1.6911383018168336, + "grad_norm": 0.15064579489287158, + "learning_rate": 1.2439951685374816e-05, + "loss": 0.5929, + "step": 3421 + }, + { + "epoch": 1.6916326782845137, + "grad_norm": 0.13935793504511282, + "learning_rate": 1.2436175295035939e-05, + "loss": 0.5932, + "step": 3422 + }, + { + "epoch": 1.6921270547521938, + "grad_norm": 0.1371161362507313, + "learning_rate": 1.2432398535314412e-05, + "loss": 0.6016, + "step": 3423 + }, + { + "epoch": 1.692621431219874, + "grad_norm": 0.14820059140947095, + "learning_rate": 1.242862140678288e-05, + "loss": 0.6082, + "step": 3424 + }, + { + "epoch": 1.693115807687554, + "grad_norm": 0.1326448189395883, + "learning_rate": 1.2424843910014044e-05, + "loss": 0.621, + "step": 3425 + }, + { + "epoch": 1.6936101841552342, + "grad_norm": 0.14434751110841132, + "learning_rate": 1.2421066045580665e-05, + "loss": 0.6197, + "step": 3426 + }, + { + "epoch": 1.6941045606229144, + "grad_norm": 0.1427527159560715, + "learning_rate": 1.2417287814055561e-05, + "loss": 0.6023, + "step": 3427 + }, + { + "epoch": 1.6945989370905945, + "grad_norm": 0.14107997153511373, + "learning_rate": 1.24135092160116e-05, + "loss": 0.6524, + "step": 3428 + }, + { + "epoch": 1.6950933135582746, + "grad_norm": 0.1423531327503464, + "learning_rate": 1.2409730252021709e-05, + "loss": 0.617, + "step": 3429 + }, + { + "epoch": 1.6955876900259548, + "grad_norm": 0.14388641918807984, + "learning_rate": 1.2405950922658865e-05, + "loss": 0.6545, + "step": 3430 + }, + { + "epoch": 1.696082066493635, + "grad_norm": 0.15203545500351226, + "learning_rate": 1.2402171228496111e-05, + "loss": 0.5932, + "step": 3431 + }, + { + "epoch": 1.696576442961315, + "grad_norm": 0.1329489918539225, + "learning_rate": 1.2398391170106539e-05, + "loss": 0.6358, + "step": 3432 + }, + { + "epoch": 1.6970708194289952, + "grad_norm": 0.14170134368454476, + "learning_rate": 1.2394610748063292e-05, + "loss": 0.6551, + "step": 3433 + }, + { + "epoch": 1.6975651958966753, + "grad_norm": 0.13908858744142805, + "learning_rate": 1.2390829962939576e-05, + "loss": 0.6318, + "step": 3434 + }, + { + "epoch": 1.6980595723643555, + "grad_norm": 0.13067454047521748, + "learning_rate": 1.238704881530865e-05, + "loss": 0.6472, + "step": 3435 + }, + { + "epoch": 1.6985539488320356, + "grad_norm": 0.13812087485159877, + "learning_rate": 1.2383267305743825e-05, + "loss": 0.615, + "step": 3436 + }, + { + "epoch": 1.6990483252997157, + "grad_norm": 0.13925480424494402, + "learning_rate": 1.2379485434818468e-05, + "loss": 0.5852, + "step": 3437 + }, + { + "epoch": 1.6995427017673959, + "grad_norm": 0.13397484606137727, + "learning_rate": 1.2375703203106e-05, + "loss": 0.618, + "step": 3438 + }, + { + "epoch": 1.700037078235076, + "grad_norm": 0.13714991484744618, + "learning_rate": 1.2371920611179902e-05, + "loss": 0.6324, + "step": 3439 + }, + { + "epoch": 1.7005314547027561, + "grad_norm": 0.13696639545880743, + "learning_rate": 1.2368137659613706e-05, + "loss": 0.6106, + "step": 3440 + }, + { + "epoch": 1.7010258311704363, + "grad_norm": 0.14465526298492845, + "learning_rate": 1.2364354348980993e-05, + "loss": 0.5996, + "step": 3441 + }, + { + "epoch": 1.7015202076381164, + "grad_norm": 0.1359255804110314, + "learning_rate": 1.2360570679855407e-05, + "loss": 0.609, + "step": 3442 + }, + { + "epoch": 1.7020145841057965, + "grad_norm": 0.13480598462378574, + "learning_rate": 1.2356786652810649e-05, + "loss": 0.5694, + "step": 3443 + }, + { + "epoch": 1.7025089605734767, + "grad_norm": 0.13547579664679493, + "learning_rate": 1.2353002268420454e-05, + "loss": 0.5827, + "step": 3444 + }, + { + "epoch": 1.7030033370411568, + "grad_norm": 0.13754295882043552, + "learning_rate": 1.2349217527258638e-05, + "loss": 0.5986, + "step": 3445 + }, + { + "epoch": 1.703497713508837, + "grad_norm": 0.13678860348970825, + "learning_rate": 1.2345432429899053e-05, + "loss": 0.674, + "step": 3446 + }, + { + "epoch": 1.703992089976517, + "grad_norm": 0.14675392526096892, + "learning_rate": 1.2341646976915614e-05, + "loss": 0.571, + "step": 3447 + }, + { + "epoch": 1.7044864664441972, + "grad_norm": 0.1356612873852687, + "learning_rate": 1.2337861168882284e-05, + "loss": 0.597, + "step": 3448 + }, + { + "epoch": 1.7049808429118773, + "grad_norm": 0.13243385994603127, + "learning_rate": 1.2334075006373084e-05, + "loss": 0.6327, + "step": 3449 + }, + { + "epoch": 1.7054752193795575, + "grad_norm": 0.1432841768452584, + "learning_rate": 1.2330288489962083e-05, + "loss": 0.62, + "step": 3450 + }, + { + "epoch": 1.7059695958472376, + "grad_norm": 0.14335280900619438, + "learning_rate": 1.2326501620223412e-05, + "loss": 0.6078, + "step": 3451 + }, + { + "epoch": 1.7064639723149178, + "grad_norm": 0.1337782865902718, + "learning_rate": 1.232271439773125e-05, + "loss": 0.5702, + "step": 3452 + }, + { + "epoch": 1.7069583487825981, + "grad_norm": 0.13401642135052583, + "learning_rate": 1.2318926823059834e-05, + "loss": 0.6276, + "step": 3453 + }, + { + "epoch": 1.707452725250278, + "grad_norm": 0.14337071326488762, + "learning_rate": 1.2315138896783445e-05, + "loss": 0.6255, + "step": 3454 + }, + { + "epoch": 1.7079471017179584, + "grad_norm": 0.13568840832086748, + "learning_rate": 1.2311350619476425e-05, + "loss": 0.616, + "step": 3455 + }, + { + "epoch": 1.7084414781856383, + "grad_norm": 0.1413381730064604, + "learning_rate": 1.2307561991713175e-05, + "loss": 0.5962, + "step": 3456 + }, + { + "epoch": 1.7089358546533187, + "grad_norm": 0.1379441553574766, + "learning_rate": 1.2303773014068132e-05, + "loss": 0.6533, + "step": 3457 + }, + { + "epoch": 1.7094302311209986, + "grad_norm": 0.15394641638257173, + "learning_rate": 1.2299983687115804e-05, + "loss": 0.6164, + "step": 3458 + }, + { + "epoch": 1.709924607588679, + "grad_norm": 0.13844873841132033, + "learning_rate": 1.229619401143074e-05, + "loss": 0.6044, + "step": 3459 + }, + { + "epoch": 1.7104189840563588, + "grad_norm": 0.15410348291201575, + "learning_rate": 1.2292403987587544e-05, + "loss": 0.6067, + "step": 3460 + }, + { + "epoch": 1.7109133605240392, + "grad_norm": 0.19127288418701824, + "learning_rate": 1.2288613616160878e-05, + "loss": 0.6115, + "step": 3461 + }, + { + "epoch": 1.711407736991719, + "grad_norm": 0.14636395381110592, + "learning_rate": 1.2284822897725453e-05, + "loss": 0.5983, + "step": 3462 + }, + { + "epoch": 1.7119021134593995, + "grad_norm": 0.13637271820010247, + "learning_rate": 1.228103183285603e-05, + "loss": 0.6277, + "step": 3463 + }, + { + "epoch": 1.7123964899270794, + "grad_norm": 0.13923959740239694, + "learning_rate": 1.227724042212743e-05, + "loss": 0.5993, + "step": 3464 + }, + { + "epoch": 1.7128908663947597, + "grad_norm": 0.13876015063645628, + "learning_rate": 1.2273448666114516e-05, + "loss": 0.6077, + "step": 3465 + }, + { + "epoch": 1.7133852428624397, + "grad_norm": 0.14638452603239704, + "learning_rate": 1.2269656565392216e-05, + "loss": 0.6286, + "step": 3466 + }, + { + "epoch": 1.71387961933012, + "grad_norm": 0.14759849407708328, + "learning_rate": 1.2265864120535498e-05, + "loss": 0.6379, + "step": 3467 + }, + { + "epoch": 1.7143739957978, + "grad_norm": 0.14516197926319338, + "learning_rate": 1.2262071332119387e-05, + "loss": 0.6024, + "step": 3468 + }, + { + "epoch": 1.7148683722654803, + "grad_norm": 0.13850286542897178, + "learning_rate": 1.2258278200718969e-05, + "loss": 0.5686, + "step": 3469 + }, + { + "epoch": 1.7153627487331602, + "grad_norm": 0.12932879441431172, + "learning_rate": 1.2254484726909366e-05, + "loss": 0.5922, + "step": 3470 + }, + { + "epoch": 1.7158571252008405, + "grad_norm": 0.1423462307951182, + "learning_rate": 1.2250690911265762e-05, + "loss": 0.6697, + "step": 3471 + }, + { + "epoch": 1.7163515016685205, + "grad_norm": 0.14609513703386878, + "learning_rate": 1.2246896754363391e-05, + "loss": 0.6164, + "step": 3472 + }, + { + "epoch": 1.7168458781362008, + "grad_norm": 0.1442814480666908, + "learning_rate": 1.2243102256777537e-05, + "loss": 0.6197, + "step": 3473 + }, + { + "epoch": 1.7173402546038807, + "grad_norm": 0.14087453647423445, + "learning_rate": 1.2239307419083534e-05, + "loss": 0.6071, + "step": 3474 + }, + { + "epoch": 1.717834631071561, + "grad_norm": 0.13300983511187436, + "learning_rate": 1.223551224185678e-05, + "loss": 0.6115, + "step": 3475 + }, + { + "epoch": 1.718329007539241, + "grad_norm": 0.1421042531077882, + "learning_rate": 1.2231716725672707e-05, + "loss": 0.6121, + "step": 3476 + }, + { + "epoch": 1.7188233840069214, + "grad_norm": 0.14236609148001547, + "learning_rate": 1.2227920871106806e-05, + "loss": 0.6156, + "step": 3477 + }, + { + "epoch": 1.7193177604746013, + "grad_norm": 0.13956077139690296, + "learning_rate": 1.2224124678734625e-05, + "loss": 0.5975, + "step": 3478 + }, + { + "epoch": 1.7198121369422816, + "grad_norm": 0.14595383566509254, + "learning_rate": 1.2220328149131755e-05, + "loss": 0.581, + "step": 3479 + }, + { + "epoch": 1.7203065134099615, + "grad_norm": 0.1301026431763286, + "learning_rate": 1.221653128287384e-05, + "loss": 0.5948, + "step": 3480 + }, + { + "epoch": 1.720800889877642, + "grad_norm": 0.14209289378439588, + "learning_rate": 1.221273408053658e-05, + "loss": 0.6193, + "step": 3481 + }, + { + "epoch": 1.7212952663453218, + "grad_norm": 0.141289365468822, + "learning_rate": 1.2208936542695715e-05, + "loss": 0.5956, + "step": 3482 + }, + { + "epoch": 1.7217896428130022, + "grad_norm": 0.14509868933207876, + "learning_rate": 1.2205138669927049e-05, + "loss": 0.5859, + "step": 3483 + }, + { + "epoch": 1.722284019280682, + "grad_norm": 0.13950740851053273, + "learning_rate": 1.2201340462806428e-05, + "loss": 0.6374, + "step": 3484 + }, + { + "epoch": 1.7227783957483624, + "grad_norm": 0.1338925291846883, + "learning_rate": 1.2197541921909752e-05, + "loss": 0.629, + "step": 3485 + }, + { + "epoch": 1.7232727722160424, + "grad_norm": 0.15225662856845987, + "learning_rate": 1.2193743047812971e-05, + "loss": 0.6868, + "step": 3486 + }, + { + "epoch": 1.7237671486837227, + "grad_norm": 0.1385726830147364, + "learning_rate": 1.2189943841092084e-05, + "loss": 0.6338, + "step": 3487 + }, + { + "epoch": 1.7242615251514026, + "grad_norm": 0.1365839660269162, + "learning_rate": 1.2186144302323146e-05, + "loss": 0.5856, + "step": 3488 + }, + { + "epoch": 1.724755901619083, + "grad_norm": 0.13993652125883568, + "learning_rate": 1.2182344432082256e-05, + "loss": 0.5956, + "step": 3489 + }, + { + "epoch": 1.7252502780867631, + "grad_norm": 0.1439001969558654, + "learning_rate": 1.2178544230945563e-05, + "loss": 0.6064, + "step": 3490 + }, + { + "epoch": 1.7257446545544433, + "grad_norm": 0.1412154111116739, + "learning_rate": 1.2174743699489272e-05, + "loss": 0.7181, + "step": 3491 + }, + { + "epoch": 1.7262390310221234, + "grad_norm": 0.7149983158972704, + "learning_rate": 1.2170942838289637e-05, + "loss": 0.6329, + "step": 3492 + }, + { + "epoch": 1.7267334074898035, + "grad_norm": 0.13614799224753826, + "learning_rate": 1.2167141647922952e-05, + "loss": 0.6576, + "step": 3493 + }, + { + "epoch": 1.7272277839574837, + "grad_norm": 0.14102365285214516, + "learning_rate": 1.2163340128965574e-05, + "loss": 0.611, + "step": 3494 + }, + { + "epoch": 1.7277221604251638, + "grad_norm": 0.14389417520823328, + "learning_rate": 1.2159538281993906e-05, + "loss": 0.6365, + "step": 3495 + }, + { + "epoch": 1.728216536892844, + "grad_norm": 0.14637395651199153, + "learning_rate": 1.2155736107584395e-05, + "loss": 0.6119, + "step": 3496 + }, + { + "epoch": 1.728710913360524, + "grad_norm": 0.14802974133831528, + "learning_rate": 1.2151933606313544e-05, + "loss": 0.6042, + "step": 3497 + }, + { + "epoch": 1.7292052898282042, + "grad_norm": 0.1404947483660512, + "learning_rate": 1.2148130778757906e-05, + "loss": 0.6498, + "step": 3498 + }, + { + "epoch": 1.7296996662958843, + "grad_norm": 0.1408887958353708, + "learning_rate": 1.2144327625494077e-05, + "loss": 0.6325, + "step": 3499 + }, + { + "epoch": 1.7301940427635645, + "grad_norm": 0.15243506138876983, + "learning_rate": 1.2140524147098707e-05, + "loss": 0.6419, + "step": 3500 + }, + { + "epoch": 1.7306884192312446, + "grad_norm": 0.14408253986440542, + "learning_rate": 1.2136720344148494e-05, + "loss": 0.6244, + "step": 3501 + }, + { + "epoch": 1.7311827956989247, + "grad_norm": 0.13153819695056287, + "learning_rate": 1.2132916217220189e-05, + "loss": 0.6087, + "step": 3502 + }, + { + "epoch": 1.7316771721666049, + "grad_norm": 0.14107563864851946, + "learning_rate": 1.2129111766890588e-05, + "loss": 0.6404, + "step": 3503 + }, + { + "epoch": 1.732171548634285, + "grad_norm": 0.13737941252859456, + "learning_rate": 1.2125306993736535e-05, + "loss": 0.5951, + "step": 3504 + }, + { + "epoch": 1.7326659251019652, + "grad_norm": 0.13196827878857786, + "learning_rate": 1.2121501898334926e-05, + "loss": 0.6194, + "step": 3505 + }, + { + "epoch": 1.7331603015696453, + "grad_norm": 0.13356060871939077, + "learning_rate": 1.2117696481262706e-05, + "loss": 0.6176, + "step": 3506 + }, + { + "epoch": 1.7336546780373254, + "grad_norm": 0.1318178051917299, + "learning_rate": 1.2113890743096863e-05, + "loss": 0.6063, + "step": 3507 + }, + { + "epoch": 1.7341490545050056, + "grad_norm": 0.13497433069645146, + "learning_rate": 1.2110084684414445e-05, + "loss": 0.604, + "step": 3508 + }, + { + "epoch": 1.7346434309726857, + "grad_norm": 0.13699960216036322, + "learning_rate": 1.2106278305792536e-05, + "loss": 0.6461, + "step": 3509 + }, + { + "epoch": 1.7351378074403658, + "grad_norm": 0.1360571336184904, + "learning_rate": 1.210247160780828e-05, + "loss": 0.6438, + "step": 3510 + }, + { + "epoch": 1.735632183908046, + "grad_norm": 0.1422361965586983, + "learning_rate": 1.209866459103886e-05, + "loss": 0.6588, + "step": 3511 + }, + { + "epoch": 1.736126560375726, + "grad_norm": 0.14113833140528206, + "learning_rate": 1.209485725606151e-05, + "loss": 0.586, + "step": 3512 + }, + { + "epoch": 1.7366209368434062, + "grad_norm": 0.1298998978996687, + "learning_rate": 1.2091049603453518e-05, + "loss": 0.6246, + "step": 3513 + }, + { + "epoch": 1.7371153133110864, + "grad_norm": 0.137837526307996, + "learning_rate": 1.208724163379221e-05, + "loss": 0.6281, + "step": 3514 + }, + { + "epoch": 1.7376096897787665, + "grad_norm": 0.13227672516426153, + "learning_rate": 1.2083433347654968e-05, + "loss": 0.5957, + "step": 3515 + }, + { + "epoch": 1.7381040662464466, + "grad_norm": 0.14856336002126524, + "learning_rate": 1.2079624745619223e-05, + "loss": 0.5968, + "step": 3516 + }, + { + "epoch": 1.7385984427141268, + "grad_norm": 0.12990966892329406, + "learning_rate": 1.2075815828262443e-05, + "loss": 0.6311, + "step": 3517 + }, + { + "epoch": 1.739092819181807, + "grad_norm": 0.1376194244190739, + "learning_rate": 1.207200659616216e-05, + "loss": 0.61, + "step": 3518 + }, + { + "epoch": 1.739587195649487, + "grad_norm": 0.1427472624474213, + "learning_rate": 1.206819704989594e-05, + "loss": 0.6392, + "step": 3519 + }, + { + "epoch": 1.7400815721171672, + "grad_norm": 0.13271696844203043, + "learning_rate": 1.20643871900414e-05, + "loss": 0.5847, + "step": 3520 + }, + { + "epoch": 1.7405759485848473, + "grad_norm": 0.13541857856945266, + "learning_rate": 1.206057701717621e-05, + "loss": 0.5999, + "step": 3521 + }, + { + "epoch": 1.7410703250525275, + "grad_norm": 0.1387467137675311, + "learning_rate": 1.2056766531878083e-05, + "loss": 0.6562, + "step": 3522 + }, + { + "epoch": 1.7415647015202076, + "grad_norm": 0.1434288811379358, + "learning_rate": 1.2052955734724777e-05, + "loss": 0.5948, + "step": 3523 + }, + { + "epoch": 1.7420590779878877, + "grad_norm": 0.13009207175824233, + "learning_rate": 1.2049144626294105e-05, + "loss": 0.6267, + "step": 3524 + }, + { + "epoch": 1.7425534544555679, + "grad_norm": 0.13698221790556436, + "learning_rate": 1.2045333207163923e-05, + "loss": 0.5915, + "step": 3525 + }, + { + "epoch": 1.743047830923248, + "grad_norm": 0.13952763832691015, + "learning_rate": 1.2041521477912124e-05, + "loss": 0.6516, + "step": 3526 + }, + { + "epoch": 1.7435422073909281, + "grad_norm": 0.13733863867044813, + "learning_rate": 1.2037709439116669e-05, + "loss": 0.616, + "step": 3527 + }, + { + "epoch": 1.7440365838586085, + "grad_norm": 0.1315556383047878, + "learning_rate": 1.2033897091355548e-05, + "loss": 0.5721, + "step": 3528 + }, + { + "epoch": 1.7445309603262884, + "grad_norm": 0.13650033695526528, + "learning_rate": 1.2030084435206809e-05, + "loss": 0.5958, + "step": 3529 + }, + { + "epoch": 1.7450253367939688, + "grad_norm": 0.13523867120609023, + "learning_rate": 1.2026271471248536e-05, + "loss": 0.6358, + "step": 3530 + }, + { + "epoch": 1.7455197132616487, + "grad_norm": 0.13923717202507715, + "learning_rate": 1.2022458200058873e-05, + "loss": 0.6201, + "step": 3531 + }, + { + "epoch": 1.746014089729329, + "grad_norm": 0.15276806129583395, + "learning_rate": 1.2018644622215998e-05, + "loss": 0.5894, + "step": 3532 + }, + { + "epoch": 1.746508466197009, + "grad_norm": 0.12974439271697344, + "learning_rate": 1.2014830738298145e-05, + "loss": 0.6539, + "step": 3533 + }, + { + "epoch": 1.7470028426646893, + "grad_norm": 0.14348130078156565, + "learning_rate": 1.2011016548883585e-05, + "loss": 0.6155, + "step": 3534 + }, + { + "epoch": 1.7474972191323692, + "grad_norm": 0.14490955026545646, + "learning_rate": 1.2007202054550646e-05, + "loss": 0.6832, + "step": 3535 + }, + { + "epoch": 1.7479915956000496, + "grad_norm": 0.14275188583213289, + "learning_rate": 1.2003387255877695e-05, + "loss": 0.6254, + "step": 3536 + }, + { + "epoch": 1.7484859720677295, + "grad_norm": 0.13516329653285974, + "learning_rate": 1.1999572153443142e-05, + "loss": 0.6082, + "step": 3537 + }, + { + "epoch": 1.7489803485354098, + "grad_norm": 0.1431889657721834, + "learning_rate": 1.199575674782546e-05, + "loss": 0.5999, + "step": 3538 + }, + { + "epoch": 1.7494747250030898, + "grad_norm": 0.13456495161150436, + "learning_rate": 1.1991941039603144e-05, + "loss": 0.6493, + "step": 3539 + }, + { + "epoch": 1.7499691014707701, + "grad_norm": 0.13703038497436054, + "learning_rate": 1.1988125029354753e-05, + "loss": 0.6189, + "step": 3540 + }, + { + "epoch": 1.75046347793845, + "grad_norm": 0.14463248549232327, + "learning_rate": 1.198430871765889e-05, + "loss": 0.6511, + "step": 3541 + }, + { + "epoch": 1.7509578544061304, + "grad_norm": 0.14293876741025663, + "learning_rate": 1.1980492105094188e-05, + "loss": 0.6032, + "step": 3542 + }, + { + "epoch": 1.7509578544061304, + "eval_loss": 0.6554675698280334, + "eval_runtime": 81.7651, + "eval_samples_per_second": 371.234, + "eval_steps_per_second": 46.413, + "step": 3542 + }, + { + "epoch": 1.7514522308738103, + "grad_norm": 0.13454983368277812, + "learning_rate": 1.1976675192239345e-05, + "loss": 0.6205, + "step": 3543 + }, + { + "epoch": 1.7519466073414907, + "grad_norm": 0.14057196425372048, + "learning_rate": 1.1972857979673097e-05, + "loss": 0.6537, + "step": 3544 + }, + { + "epoch": 1.7524409838091706, + "grad_norm": 0.14181022024890735, + "learning_rate": 1.196904046797422e-05, + "loss": 0.6163, + "step": 3545 + }, + { + "epoch": 1.752935360276851, + "grad_norm": 0.1375610734419273, + "learning_rate": 1.1965222657721545e-05, + "loss": 0.6407, + "step": 3546 + }, + { + "epoch": 1.7534297367445308, + "grad_norm": 0.1421739172448462, + "learning_rate": 1.1961404549493942e-05, + "loss": 0.6347, + "step": 3547 + }, + { + "epoch": 1.7539241132122112, + "grad_norm": 0.13808528852716745, + "learning_rate": 1.1957586143870327e-05, + "loss": 0.6124, + "step": 3548 + }, + { + "epoch": 1.7544184896798911, + "grad_norm": 0.13991543007112886, + "learning_rate": 1.1953767441429664e-05, + "loss": 0.6061, + "step": 3549 + }, + { + "epoch": 1.7549128661475715, + "grad_norm": 0.14310011929203492, + "learning_rate": 1.1949948442750956e-05, + "loss": 0.6569, + "step": 3550 + }, + { + "epoch": 1.7554072426152514, + "grad_norm": 0.14199874239783417, + "learning_rate": 1.194612914841326e-05, + "loss": 0.5883, + "step": 3551 + }, + { + "epoch": 1.7559016190829317, + "grad_norm": 0.13740657840938517, + "learning_rate": 1.1942309558995672e-05, + "loss": 0.6299, + "step": 3552 + }, + { + "epoch": 1.7563959955506117, + "grad_norm": 0.1447651148642724, + "learning_rate": 1.193848967507733e-05, + "loss": 0.5954, + "step": 3553 + }, + { + "epoch": 1.756890372018292, + "grad_norm": 0.1413895357087322, + "learning_rate": 1.1934669497237423e-05, + "loss": 0.6184, + "step": 3554 + }, + { + "epoch": 1.757384748485972, + "grad_norm": 0.1349098241309228, + "learning_rate": 1.193084902605518e-05, + "loss": 0.6339, + "step": 3555 + }, + { + "epoch": 1.7578791249536523, + "grad_norm": 0.13620014137909928, + "learning_rate": 1.1927028262109874e-05, + "loss": 0.5414, + "step": 3556 + }, + { + "epoch": 1.7583735014213322, + "grad_norm": 0.1302454744189211, + "learning_rate": 1.1923207205980829e-05, + "loss": 0.5968, + "step": 3557 + }, + { + "epoch": 1.7588678778890126, + "grad_norm": 0.13064173862794082, + "learning_rate": 1.1919385858247408e-05, + "loss": 0.631, + "step": 3558 + }, + { + "epoch": 1.7593622543566925, + "grad_norm": 0.1429720925350131, + "learning_rate": 1.1915564219489018e-05, + "loss": 0.6078, + "step": 3559 + }, + { + "epoch": 1.7598566308243728, + "grad_norm": 0.13815359529064167, + "learning_rate": 1.1911742290285111e-05, + "loss": 0.6421, + "step": 3560 + }, + { + "epoch": 1.7603510072920527, + "grad_norm": 0.12989824739836636, + "learning_rate": 1.1907920071215184e-05, + "loss": 0.6077, + "step": 3561 + }, + { + "epoch": 1.760845383759733, + "grad_norm": 0.13950473515410886, + "learning_rate": 1.1904097562858776e-05, + "loss": 0.5671, + "step": 3562 + }, + { + "epoch": 1.761339760227413, + "grad_norm": 0.13160181326240433, + "learning_rate": 1.1900274765795472e-05, + "loss": 0.5918, + "step": 3563 + }, + { + "epoch": 1.7618341366950934, + "grad_norm": 0.14259242362726268, + "learning_rate": 1.18964516806049e-05, + "loss": 0.58, + "step": 3564 + }, + { + "epoch": 1.7623285131627735, + "grad_norm": 0.14113920747079686, + "learning_rate": 1.1892628307866729e-05, + "loss": 0.6414, + "step": 3565 + }, + { + "epoch": 1.7628228896304536, + "grad_norm": 0.13186692128607527, + "learning_rate": 1.1888804648160677e-05, + "loss": 0.6135, + "step": 3566 + }, + { + "epoch": 1.7633172660981338, + "grad_norm": 0.14130097284275295, + "learning_rate": 1.1884980702066502e-05, + "loss": 0.6089, + "step": 3567 + }, + { + "epoch": 1.763811642565814, + "grad_norm": 0.13606106606392482, + "learning_rate": 1.1881156470164006e-05, + "loss": 0.5658, + "step": 3568 + }, + { + "epoch": 1.764306019033494, + "grad_norm": 0.1271395860267746, + "learning_rate": 1.1877331953033031e-05, + "loss": 0.6127, + "step": 3569 + }, + { + "epoch": 1.7648003955011742, + "grad_norm": 0.13558102315469062, + "learning_rate": 1.1873507151253472e-05, + "loss": 0.6146, + "step": 3570 + }, + { + "epoch": 1.7652947719688543, + "grad_norm": 0.1408268400814914, + "learning_rate": 1.1869682065405258e-05, + "loss": 0.6329, + "step": 3571 + }, + { + "epoch": 1.7657891484365345, + "grad_norm": 0.1472823858874109, + "learning_rate": 1.1865856696068361e-05, + "loss": 0.562, + "step": 3572 + }, + { + "epoch": 1.7662835249042146, + "grad_norm": 0.1370516432515508, + "learning_rate": 1.1862031043822802e-05, + "loss": 0.5586, + "step": 3573 + }, + { + "epoch": 1.7667779013718947, + "grad_norm": 0.13749417499433556, + "learning_rate": 1.1858205109248642e-05, + "loss": 0.644, + "step": 3574 + }, + { + "epoch": 1.7672722778395749, + "grad_norm": 0.15048263362825132, + "learning_rate": 1.185437889292598e-05, + "loss": 0.601, + "step": 3575 + }, + { + "epoch": 1.767766654307255, + "grad_norm": 0.13305064277440795, + "learning_rate": 1.1850552395434967e-05, + "loss": 0.6065, + "step": 3576 + }, + { + "epoch": 1.7682610307749351, + "grad_norm": 0.13057770051025167, + "learning_rate": 1.1846725617355789e-05, + "loss": 0.6519, + "step": 3577 + }, + { + "epoch": 1.7687554072426153, + "grad_norm": 0.14160429130209648, + "learning_rate": 1.1842898559268682e-05, + "loss": 0.6281, + "step": 3578 + }, + { + "epoch": 1.7692497837102954, + "grad_norm": 0.1447057821032413, + "learning_rate": 1.1839071221753916e-05, + "loss": 0.6419, + "step": 3579 + }, + { + "epoch": 1.7697441601779755, + "grad_norm": 0.13750713683701526, + "learning_rate": 1.1835243605391806e-05, + "loss": 0.6, + "step": 3580 + }, + { + "epoch": 1.7702385366456557, + "grad_norm": 0.13534333659723904, + "learning_rate": 1.1831415710762713e-05, + "loss": 0.5762, + "step": 3581 + }, + { + "epoch": 1.7707329131133358, + "grad_norm": 0.1334538222466362, + "learning_rate": 1.1827587538447036e-05, + "loss": 0.6074, + "step": 3582 + }, + { + "epoch": 1.771227289581016, + "grad_norm": 0.13912546674540366, + "learning_rate": 1.1823759089025219e-05, + "loss": 0.6324, + "step": 3583 + }, + { + "epoch": 1.771721666048696, + "grad_norm": 0.1375982678045244, + "learning_rate": 1.181993036307775e-05, + "loss": 0.5572, + "step": 3584 + }, + { + "epoch": 1.7722160425163762, + "grad_norm": 0.135501338609511, + "learning_rate": 1.181610136118515e-05, + "loss": 0.6021, + "step": 3585 + }, + { + "epoch": 1.7727104189840563, + "grad_norm": 0.13654693260531628, + "learning_rate": 1.1812272083927989e-05, + "loss": 0.6371, + "step": 3586 + }, + { + "epoch": 1.7732047954517365, + "grad_norm": 0.13425776869195777, + "learning_rate": 1.180844253188688e-05, + "loss": 0.6196, + "step": 3587 + }, + { + "epoch": 1.7736991719194166, + "grad_norm": 0.1675721000102422, + "learning_rate": 1.1804612705642476e-05, + "loss": 0.6297, + "step": 3588 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.13469460373267297, + "learning_rate": 1.1800782605775463e-05, + "loss": 0.6611, + "step": 3589 + }, + { + "epoch": 1.7746879248547769, + "grad_norm": 0.13265998798745973, + "learning_rate": 1.1796952232866584e-05, + "loss": 0.584, + "step": 3590 + }, + { + "epoch": 1.775182301322457, + "grad_norm": 0.1368969027177881, + "learning_rate": 1.1793121587496612e-05, + "loss": 0.5798, + "step": 3591 + }, + { + "epoch": 1.7756766777901372, + "grad_norm": 0.1420072187983131, + "learning_rate": 1.1789290670246365e-05, + "loss": 0.6771, + "step": 3592 + }, + { + "epoch": 1.7761710542578173, + "grad_norm": 0.14725298190547065, + "learning_rate": 1.1785459481696704e-05, + "loss": 0.6184, + "step": 3593 + }, + { + "epoch": 1.7766654307254974, + "grad_norm": 0.13650838031201315, + "learning_rate": 1.1781628022428527e-05, + "loss": 0.6115, + "step": 3594 + }, + { + "epoch": 1.7771598071931776, + "grad_norm": 0.14046803432340718, + "learning_rate": 1.1777796293022774e-05, + "loss": 0.6018, + "step": 3595 + }, + { + "epoch": 1.7776541836608577, + "grad_norm": 0.13360754650402404, + "learning_rate": 1.1773964294060431e-05, + "loss": 0.6534, + "step": 3596 + }, + { + "epoch": 1.7781485601285378, + "grad_norm": 0.13552490237368728, + "learning_rate": 1.1770132026122518e-05, + "loss": 0.6282, + "step": 3597 + }, + { + "epoch": 1.778642936596218, + "grad_norm": 0.13368865482043646, + "learning_rate": 1.1766299489790098e-05, + "loss": 0.606, + "step": 3598 + }, + { + "epoch": 1.779137313063898, + "grad_norm": 0.14211800372965244, + "learning_rate": 1.1762466685644278e-05, + "loss": 0.6486, + "step": 3599 + }, + { + "epoch": 1.7796316895315782, + "grad_norm": 0.14171743218440175, + "learning_rate": 1.1758633614266206e-05, + "loss": 0.6463, + "step": 3600 + }, + { + "epoch": 1.7801260659992586, + "grad_norm": 0.28408496531472066, + "learning_rate": 1.1754800276237061e-05, + "loss": 0.6759, + "step": 3601 + }, + { + "epoch": 1.7806204424669385, + "grad_norm": 0.1402384361753884, + "learning_rate": 1.175096667213807e-05, + "loss": 0.5998, + "step": 3602 + }, + { + "epoch": 1.7811148189346189, + "grad_norm": 0.1450090784304344, + "learning_rate": 1.1747132802550504e-05, + "loss": 0.6052, + "step": 3603 + }, + { + "epoch": 1.7816091954022988, + "grad_norm": 0.142549622725205, + "learning_rate": 1.174329866805567e-05, + "loss": 0.6366, + "step": 3604 + }, + { + "epoch": 1.7821035718699791, + "grad_norm": 0.13786409285259327, + "learning_rate": 1.1739464269234908e-05, + "loss": 0.6634, + "step": 3605 + }, + { + "epoch": 1.782597948337659, + "grad_norm": 0.14860581711022133, + "learning_rate": 1.1735629606669609e-05, + "loss": 0.6454, + "step": 3606 + }, + { + "epoch": 1.7830923248053394, + "grad_norm": 0.13787353931315585, + "learning_rate": 1.1731794680941201e-05, + "loss": 0.6447, + "step": 3607 + }, + { + "epoch": 1.7835867012730193, + "grad_norm": 0.14579463864704126, + "learning_rate": 1.172795949263115e-05, + "loss": 0.6311, + "step": 3608 + }, + { + "epoch": 1.7840810777406997, + "grad_norm": 0.14827197350678023, + "learning_rate": 1.1724124042320958e-05, + "loss": 0.5754, + "step": 3609 + }, + { + "epoch": 1.7845754542083796, + "grad_norm": 0.12984280121072111, + "learning_rate": 1.172028833059218e-05, + "loss": 0.6225, + "step": 3610 + }, + { + "epoch": 1.78506983067606, + "grad_norm": 0.13243087251497795, + "learning_rate": 1.1716452358026396e-05, + "loss": 0.6083, + "step": 3611 + }, + { + "epoch": 1.7855642071437399, + "grad_norm": 0.14369413059078526, + "learning_rate": 1.1712616125205235e-05, + "loss": 0.6304, + "step": 3612 + }, + { + "epoch": 1.7860585836114202, + "grad_norm": 0.1418265232590439, + "learning_rate": 1.1708779632710357e-05, + "loss": 0.6533, + "step": 3613 + }, + { + "epoch": 1.7865529600791001, + "grad_norm": 0.13631416934947826, + "learning_rate": 1.1704942881123469e-05, + "loss": 0.637, + "step": 3614 + }, + { + "epoch": 1.7870473365467805, + "grad_norm": 0.13227708214511752, + "learning_rate": 1.1701105871026317e-05, + "loss": 0.6413, + "step": 3615 + }, + { + "epoch": 1.7875417130144604, + "grad_norm": 0.14475273605949107, + "learning_rate": 1.169726860300068e-05, + "loss": 0.6064, + "step": 3616 + }, + { + "epoch": 1.7880360894821408, + "grad_norm": 0.13821394957900196, + "learning_rate": 1.1693431077628383e-05, + "loss": 0.6536, + "step": 3617 + }, + { + "epoch": 1.7885304659498207, + "grad_norm": 0.13806774879378733, + "learning_rate": 1.1689593295491286e-05, + "loss": 0.6229, + "step": 3618 + }, + { + "epoch": 1.789024842417501, + "grad_norm": 0.15176632981875868, + "learning_rate": 1.1685755257171286e-05, + "loss": 0.6246, + "step": 3619 + }, + { + "epoch": 1.789519218885181, + "grad_norm": 0.13915127095640842, + "learning_rate": 1.1681916963250326e-05, + "loss": 0.6175, + "step": 3620 + }, + { + "epoch": 1.7900135953528613, + "grad_norm": 0.14142794479886717, + "learning_rate": 1.1678078414310382e-05, + "loss": 0.6122, + "step": 3621 + }, + { + "epoch": 1.7905079718205412, + "grad_norm": 0.14122207434796838, + "learning_rate": 1.167423961093347e-05, + "loss": 0.5832, + "step": 3622 + }, + { + "epoch": 1.7910023482882216, + "grad_norm": 0.20716468222557743, + "learning_rate": 1.1670400553701644e-05, + "loss": 0.6238, + "step": 3623 + }, + { + "epoch": 1.7914967247559015, + "grad_norm": 0.13932155118588202, + "learning_rate": 1.1666561243196997e-05, + "loss": 0.6679, + "step": 3624 + }, + { + "epoch": 1.7919911012235819, + "grad_norm": 0.14443627324193933, + "learning_rate": 1.1662721680001664e-05, + "loss": 0.6642, + "step": 3625 + }, + { + "epoch": 1.7924854776912618, + "grad_norm": 0.13946995124600117, + "learning_rate": 1.1658881864697808e-05, + "loss": 0.6069, + "step": 3626 + }, + { + "epoch": 1.7929798541589421, + "grad_norm": 0.13646795544813933, + "learning_rate": 1.1655041797867645e-05, + "loss": 0.6489, + "step": 3627 + }, + { + "epoch": 1.793474230626622, + "grad_norm": 0.13504490442994727, + "learning_rate": 1.165120148009342e-05, + "loss": 0.6167, + "step": 3628 + }, + { + "epoch": 1.7939686070943024, + "grad_norm": 0.13835307474851222, + "learning_rate": 1.1647360911957413e-05, + "loss": 0.6633, + "step": 3629 + }, + { + "epoch": 1.7944629835619823, + "grad_norm": 0.14027393835662635, + "learning_rate": 1.1643520094041949e-05, + "loss": 0.6141, + "step": 3630 + }, + { + "epoch": 1.7949573600296627, + "grad_norm": 0.13701794066121892, + "learning_rate": 1.1639679026929387e-05, + "loss": 0.6254, + "step": 3631 + }, + { + "epoch": 1.7954517364973426, + "grad_norm": 0.13394133939176056, + "learning_rate": 1.1635837711202125e-05, + "loss": 0.6388, + "step": 3632 + }, + { + "epoch": 1.795946112965023, + "grad_norm": 0.13617128263909803, + "learning_rate": 1.1631996147442604e-05, + "loss": 0.5706, + "step": 3633 + }, + { + "epoch": 1.7964404894327028, + "grad_norm": 0.13723028450966337, + "learning_rate": 1.1628154336233288e-05, + "loss": 0.6166, + "step": 3634 + }, + { + "epoch": 1.7969348659003832, + "grad_norm": 0.13445265723335498, + "learning_rate": 1.1624312278156693e-05, + "loss": 0.6386, + "step": 3635 + }, + { + "epoch": 1.7974292423680631, + "grad_norm": 0.17913626707422337, + "learning_rate": 1.162046997379537e-05, + "loss": 0.6527, + "step": 3636 + }, + { + "epoch": 1.7979236188357435, + "grad_norm": 0.14165867018127085, + "learning_rate": 1.1616627423731898e-05, + "loss": 0.6042, + "step": 3637 + }, + { + "epoch": 1.7984179953034236, + "grad_norm": 0.1383809116437864, + "learning_rate": 1.1612784628548902e-05, + "loss": 0.631, + "step": 3638 + }, + { + "epoch": 1.7989123717711037, + "grad_norm": 0.13883971643134907, + "learning_rate": 1.1608941588829045e-05, + "loss": 0.6018, + "step": 3639 + }, + { + "epoch": 1.7994067482387839, + "grad_norm": 0.1367893693554347, + "learning_rate": 1.1605098305155025e-05, + "loss": 0.6276, + "step": 3640 + }, + { + "epoch": 1.799901124706464, + "grad_norm": 0.13235517339559186, + "learning_rate": 1.1601254778109572e-05, + "loss": 0.5947, + "step": 3641 + }, + { + "epoch": 1.8003955011741442, + "grad_norm": 0.1328662966314614, + "learning_rate": 1.1597411008275456e-05, + "loss": 0.6471, + "step": 3642 + }, + { + "epoch": 1.8008898776418243, + "grad_norm": 0.14263502753638296, + "learning_rate": 1.1593566996235487e-05, + "loss": 0.6279, + "step": 3643 + }, + { + "epoch": 1.8013842541095044, + "grad_norm": 0.14876697109348927, + "learning_rate": 1.1589722742572513e-05, + "loss": 0.5933, + "step": 3644 + }, + { + "epoch": 1.8018786305771846, + "grad_norm": 0.13128969393622325, + "learning_rate": 1.1585878247869408e-05, + "loss": 0.6332, + "step": 3645 + }, + { + "epoch": 1.8023730070448647, + "grad_norm": 0.13600781345241322, + "learning_rate": 1.1582033512709096e-05, + "loss": 0.6619, + "step": 3646 + }, + { + "epoch": 1.8028673835125448, + "grad_norm": 0.48231126208272335, + "learning_rate": 1.1578188537674529e-05, + "loss": 0.6336, + "step": 3647 + }, + { + "epoch": 1.803361759980225, + "grad_norm": 0.14286224853947663, + "learning_rate": 1.1574343323348693e-05, + "loss": 0.6256, + "step": 3648 + }, + { + "epoch": 1.803856136447905, + "grad_norm": 0.13416533709870176, + "learning_rate": 1.1570497870314622e-05, + "loss": 0.6232, + "step": 3649 + }, + { + "epoch": 1.8043505129155852, + "grad_norm": 0.1328810415515225, + "learning_rate": 1.1566652179155375e-05, + "loss": 0.6053, + "step": 3650 + }, + { + "epoch": 1.8048448893832654, + "grad_norm": 0.14642837839361644, + "learning_rate": 1.1562806250454051e-05, + "loss": 0.6783, + "step": 3651 + }, + { + "epoch": 1.8053392658509455, + "grad_norm": 0.13959521261738947, + "learning_rate": 1.1558960084793786e-05, + "loss": 0.6034, + "step": 3652 + }, + { + "epoch": 1.8058336423186256, + "grad_norm": 0.1421659732222494, + "learning_rate": 1.1555113682757754e-05, + "loss": 0.5938, + "step": 3653 + }, + { + "epoch": 1.8063280187863058, + "grad_norm": 0.13896827959272426, + "learning_rate": 1.1551267044929155e-05, + "loss": 0.62, + "step": 3654 + }, + { + "epoch": 1.806822395253986, + "grad_norm": 0.1376674139545606, + "learning_rate": 1.1547420171891237e-05, + "loss": 0.6155, + "step": 3655 + }, + { + "epoch": 1.807316771721666, + "grad_norm": 0.14422540369694817, + "learning_rate": 1.1543573064227278e-05, + "loss": 0.5797, + "step": 3656 + }, + { + "epoch": 1.8078111481893462, + "grad_norm": 0.13836198204588177, + "learning_rate": 1.1539725722520587e-05, + "loss": 0.6104, + "step": 3657 + }, + { + "epoch": 1.8083055246570263, + "grad_norm": 0.13654319461344241, + "learning_rate": 1.153587814735452e-05, + "loss": 0.6114, + "step": 3658 + }, + { + "epoch": 1.8087999011247065, + "grad_norm": 0.1537400594432464, + "learning_rate": 1.1532030339312459e-05, + "loss": 0.6067, + "step": 3659 + }, + { + "epoch": 1.8092942775923866, + "grad_norm": 0.14361477606347353, + "learning_rate": 1.1528182298977824e-05, + "loss": 0.592, + "step": 3660 + }, + { + "epoch": 1.8097886540600667, + "grad_norm": 0.13088997231037222, + "learning_rate": 1.152433402693407e-05, + "loss": 0.5956, + "step": 3661 + }, + { + "epoch": 1.8102830305277469, + "grad_norm": 0.15256014045421337, + "learning_rate": 1.1520485523764686e-05, + "loss": 0.6197, + "step": 3662 + }, + { + "epoch": 1.810777406995427, + "grad_norm": 0.13527255174958053, + "learning_rate": 1.15166367900532e-05, + "loss": 0.5793, + "step": 3663 + }, + { + "epoch": 1.8112717834631071, + "grad_norm": 0.13003777968267957, + "learning_rate": 1.1512787826383172e-05, + "loss": 0.6088, + "step": 3664 + }, + { + "epoch": 1.8117661599307873, + "grad_norm": 0.13663462006489252, + "learning_rate": 1.1508938633338191e-05, + "loss": 0.6026, + "step": 3665 + }, + { + "epoch": 1.8122605363984674, + "grad_norm": 0.14251252957346722, + "learning_rate": 1.15050892115019e-05, + "loss": 0.6567, + "step": 3666 + }, + { + "epoch": 1.8127549128661475, + "grad_norm": 0.1418626268094188, + "learning_rate": 1.1501239561457955e-05, + "loss": 0.6402, + "step": 3667 + }, + { + "epoch": 1.8132492893338277, + "grad_norm": 0.1360702233092261, + "learning_rate": 1.1497389683790055e-05, + "loss": 0.6444, + "step": 3668 + }, + { + "epoch": 1.8137436658015078, + "grad_norm": 0.13785211806299907, + "learning_rate": 1.1493539579081938e-05, + "loss": 0.5913, + "step": 3669 + }, + { + "epoch": 1.814238042269188, + "grad_norm": 0.1366929015291679, + "learning_rate": 1.1489689247917368e-05, + "loss": 0.6241, + "step": 3670 + }, + { + "epoch": 1.814732418736868, + "grad_norm": 0.1418891591772876, + "learning_rate": 1.1485838690880148e-05, + "loss": 0.6151, + "step": 3671 + }, + { + "epoch": 1.8152267952045482, + "grad_norm": 0.1451077513593688, + "learning_rate": 1.148198790855412e-05, + "loss": 0.5953, + "step": 3672 + }, + { + "epoch": 1.8157211716722284, + "grad_norm": 0.1499317063279626, + "learning_rate": 1.147813690152315e-05, + "loss": 0.5824, + "step": 3673 + }, + { + "epoch": 1.8162155481399085, + "grad_norm": 0.13311457490532375, + "learning_rate": 1.1474285670371146e-05, + "loss": 0.6341, + "step": 3674 + }, + { + "epoch": 1.8167099246075886, + "grad_norm": 0.15168215528781176, + "learning_rate": 1.1470434215682045e-05, + "loss": 0.6258, + "step": 3675 + }, + { + "epoch": 1.817204301075269, + "grad_norm": 0.15267765111841078, + "learning_rate": 1.1466582538039821e-05, + "loss": 0.5957, + "step": 3676 + }, + { + "epoch": 1.817698677542949, + "grad_norm": 0.8663517158264161, + "learning_rate": 1.1462730638028479e-05, + "loss": 0.629, + "step": 3677 + }, + { + "epoch": 1.8181930540106293, + "grad_norm": 0.1464392402431932, + "learning_rate": 1.1458878516232061e-05, + "loss": 0.62, + "step": 3678 + }, + { + "epoch": 1.8186874304783092, + "grad_norm": 0.14506737880869933, + "learning_rate": 1.1455026173234644e-05, + "loss": 0.5852, + "step": 3679 + }, + { + "epoch": 1.8191818069459895, + "grad_norm": 0.13615370046321715, + "learning_rate": 1.1451173609620331e-05, + "loss": 0.5682, + "step": 3680 + }, + { + "epoch": 1.8196761834136694, + "grad_norm": 0.13858145639590855, + "learning_rate": 1.1447320825973263e-05, + "loss": 0.6097, + "step": 3681 + }, + { + "epoch": 1.8201705598813498, + "grad_norm": 0.14515075448752837, + "learning_rate": 1.144346782287762e-05, + "loss": 0.6244, + "step": 3682 + }, + { + "epoch": 1.8206649363490297, + "grad_norm": 0.15056798604305555, + "learning_rate": 1.1439614600917604e-05, + "loss": 0.6232, + "step": 3683 + }, + { + "epoch": 1.82115931281671, + "grad_norm": 0.13922602679663723, + "learning_rate": 1.1435761160677457e-05, + "loss": 0.6288, + "step": 3684 + }, + { + "epoch": 1.82165368928439, + "grad_norm": 0.142801622451074, + "learning_rate": 1.1431907502741455e-05, + "loss": 0.6038, + "step": 3685 + }, + { + "epoch": 1.8221480657520703, + "grad_norm": 0.14359067949255108, + "learning_rate": 1.1428053627693908e-05, + "loss": 0.6287, + "step": 3686 + }, + { + "epoch": 1.8226424422197502, + "grad_norm": 0.13696280331595762, + "learning_rate": 1.1424199536119147e-05, + "loss": 0.6941, + "step": 3687 + }, + { + "epoch": 1.8231368186874306, + "grad_norm": 0.6615021319943484, + "learning_rate": 1.1420345228601553e-05, + "loss": 0.6333, + "step": 3688 + }, + { + "epoch": 1.8236311951551105, + "grad_norm": 0.14151667008885718, + "learning_rate": 1.141649070572553e-05, + "loss": 0.6268, + "step": 3689 + }, + { + "epoch": 1.8241255716227909, + "grad_norm": 0.1404135317062887, + "learning_rate": 1.141263596807551e-05, + "loss": 0.6389, + "step": 3690 + }, + { + "epoch": 1.8246199480904708, + "grad_norm": 0.13245873766341854, + "learning_rate": 1.140878101623597e-05, + "loss": 0.6537, + "step": 3691 + }, + { + "epoch": 1.8251143245581511, + "grad_norm": 0.14111577511156925, + "learning_rate": 1.1404925850791414e-05, + "loss": 0.6201, + "step": 3692 + }, + { + "epoch": 1.825608701025831, + "grad_norm": 0.14021961232679483, + "learning_rate": 1.1401070472326372e-05, + "loss": 0.6183, + "step": 3693 + }, + { + "epoch": 1.8261030774935114, + "grad_norm": 0.17514463946321543, + "learning_rate": 1.1397214881425417e-05, + "loss": 0.6126, + "step": 3694 + }, + { + "epoch": 1.8265974539611913, + "grad_norm": 0.13460050308128, + "learning_rate": 1.1393359078673148e-05, + "loss": 0.5772, + "step": 3695 + }, + { + "epoch": 1.8270918304288717, + "grad_norm": 0.1365706648973131, + "learning_rate": 1.1389503064654194e-05, + "loss": 0.594, + "step": 3696 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 0.1321564034654902, + "learning_rate": 1.1385646839953223e-05, + "loss": 0.6456, + "step": 3697 + }, + { + "epoch": 1.828080583364232, + "grad_norm": 0.14114822075213798, + "learning_rate": 1.1381790405154933e-05, + "loss": 0.61, + "step": 3698 + }, + { + "epoch": 1.8285749598319119, + "grad_norm": 0.1372869554999072, + "learning_rate": 1.1377933760844047e-05, + "loss": 0.5941, + "step": 3699 + }, + { + "epoch": 1.8290693362995922, + "grad_norm": 0.1383622948283573, + "learning_rate": 1.1374076907605326e-05, + "loss": 0.5972, + "step": 3700 + }, + { + "epoch": 1.8295637127672721, + "grad_norm": 0.12823285127948922, + "learning_rate": 1.1370219846023562e-05, + "loss": 0.6203, + "step": 3701 + }, + { + "epoch": 1.8300580892349525, + "grad_norm": 0.14110039014996117, + "learning_rate": 1.1366362576683583e-05, + "loss": 0.6114, + "step": 3702 + }, + { + "epoch": 1.8305524657026324, + "grad_norm": 0.13348119943783038, + "learning_rate": 1.1362505100170234e-05, + "loss": 0.5737, + "step": 3703 + }, + { + "epoch": 1.8310468421703128, + "grad_norm": 0.1402279978975788, + "learning_rate": 1.1358647417068408e-05, + "loss": 0.6314, + "step": 3704 + }, + { + "epoch": 1.8315412186379927, + "grad_norm": 0.12828884835559984, + "learning_rate": 1.1354789527963026e-05, + "loss": 0.6077, + "step": 3705 + }, + { + "epoch": 1.832035595105673, + "grad_norm": 0.14305907031131035, + "learning_rate": 1.1350931433439026e-05, + "loss": 0.5855, + "step": 3706 + }, + { + "epoch": 1.832529971573353, + "grad_norm": 0.13277722768773806, + "learning_rate": 1.1347073134081392e-05, + "loss": 0.6137, + "step": 3707 + }, + { + "epoch": 1.8330243480410333, + "grad_norm": 0.13866800411000962, + "learning_rate": 1.1343214630475139e-05, + "loss": 0.5716, + "step": 3708 + }, + { + "epoch": 1.8335187245087132, + "grad_norm": 0.13852131003588125, + "learning_rate": 1.1339355923205304e-05, + "loss": 0.5987, + "step": 3709 + }, + { + "epoch": 1.8340131009763936, + "grad_norm": 0.13820450634283876, + "learning_rate": 1.1335497012856963e-05, + "loss": 0.622, + "step": 3710 + }, + { + "epoch": 1.8345074774440735, + "grad_norm": 0.13871911671345524, + "learning_rate": 1.1331637900015215e-05, + "loss": 0.6081, + "step": 3711 + }, + { + "epoch": 1.8350018539117539, + "grad_norm": 0.1399948540938063, + "learning_rate": 1.13277785852652e-05, + "loss": 0.5691, + "step": 3712 + }, + { + "epoch": 1.835496230379434, + "grad_norm": 0.13685318406715816, + "learning_rate": 1.1323919069192075e-05, + "loss": 0.6654, + "step": 3713 + }, + { + "epoch": 1.8359906068471141, + "grad_norm": 0.13807984637933338, + "learning_rate": 1.1320059352381044e-05, + "loss": 0.6053, + "step": 3714 + }, + { + "epoch": 1.8364849833147943, + "grad_norm": 0.12689504535455803, + "learning_rate": 1.1316199435417328e-05, + "loss": 0.6325, + "step": 3715 + }, + { + "epoch": 1.8369793597824744, + "grad_norm": 0.1418229766012228, + "learning_rate": 1.1312339318886183e-05, + "loss": 0.6095, + "step": 3716 + }, + { + "epoch": 1.8374737362501545, + "grad_norm": 0.12861897932968994, + "learning_rate": 1.1308479003372895e-05, + "loss": 0.6264, + "step": 3717 + }, + { + "epoch": 1.8379681127178347, + "grad_norm": 0.1344573019788116, + "learning_rate": 1.1304618489462782e-05, + "loss": 0.5956, + "step": 3718 + }, + { + "epoch": 1.8384624891855148, + "grad_norm": 0.13437660778293975, + "learning_rate": 1.1300757777741191e-05, + "loss": 0.5946, + "step": 3719 + }, + { + "epoch": 1.838956865653195, + "grad_norm": 0.1435106119265411, + "learning_rate": 1.1296896868793494e-05, + "loss": 0.629, + "step": 3720 + }, + { + "epoch": 1.839451242120875, + "grad_norm": 0.13488192303947122, + "learning_rate": 1.1293035763205108e-05, + "loss": 0.6205, + "step": 3721 + }, + { + "epoch": 1.8399456185885552, + "grad_norm": 0.13423772402377232, + "learning_rate": 1.1289174461561456e-05, + "loss": 0.6225, + "step": 3722 + }, + { + "epoch": 1.8404399950562353, + "grad_norm": 0.14134242419669074, + "learning_rate": 1.1285312964448014e-05, + "loss": 0.6239, + "step": 3723 + }, + { + "epoch": 1.8409343715239155, + "grad_norm": 0.1380977215437364, + "learning_rate": 1.1281451272450271e-05, + "loss": 0.6038, + "step": 3724 + }, + { + "epoch": 1.8414287479915956, + "grad_norm": 0.14086219079918236, + "learning_rate": 1.1277589386153757e-05, + "loss": 0.6297, + "step": 3725 + }, + { + "epoch": 1.8419231244592758, + "grad_norm": 0.13572500624998132, + "learning_rate": 1.1273727306144027e-05, + "loss": 0.6274, + "step": 3726 + }, + { + "epoch": 1.8424175009269559, + "grad_norm": 0.13565977032046253, + "learning_rate": 1.1269865033006661e-05, + "loss": 0.6018, + "step": 3727 + }, + { + "epoch": 1.842911877394636, + "grad_norm": 0.13993773344900806, + "learning_rate": 1.1266002567327275e-05, + "loss": 0.6078, + "step": 3728 + }, + { + "epoch": 1.8434062538623162, + "grad_norm": 0.13110308742858745, + "learning_rate": 1.126213990969151e-05, + "loss": 0.6099, + "step": 3729 + }, + { + "epoch": 1.8439006303299963, + "grad_norm": 0.13088706743220807, + "learning_rate": 1.125827706068504e-05, + "loss": 0.6274, + "step": 3730 + }, + { + "epoch": 1.8443950067976764, + "grad_norm": 0.13264898220955748, + "learning_rate": 1.125441402089356e-05, + "loss": 0.6128, + "step": 3731 + }, + { + "epoch": 1.8448893832653566, + "grad_norm": 0.1359676349707901, + "learning_rate": 1.1250550790902808e-05, + "loss": 0.6026, + "step": 3732 + }, + { + "epoch": 1.8453837597330367, + "grad_norm": 0.12897166451760292, + "learning_rate": 1.1246687371298532e-05, + "loss": 0.583, + "step": 3733 + }, + { + "epoch": 1.8458781362007168, + "grad_norm": 0.13044364825541985, + "learning_rate": 1.124282376266653e-05, + "loss": 0.6087, + "step": 3734 + }, + { + "epoch": 1.846372512668397, + "grad_norm": 0.1335798911535538, + "learning_rate": 1.1238959965592615e-05, + "loss": 0.6321, + "step": 3735 + }, + { + "epoch": 1.846866889136077, + "grad_norm": 0.1360492481779964, + "learning_rate": 1.1235095980662623e-05, + "loss": 0.6347, + "step": 3736 + }, + { + "epoch": 1.8473612656037572, + "grad_norm": 0.13522822490791264, + "learning_rate": 1.1231231808462438e-05, + "loss": 0.5846, + "step": 3737 + }, + { + "epoch": 1.8478556420714374, + "grad_norm": 0.1333942638384183, + "learning_rate": 1.1227367449577958e-05, + "loss": 0.6148, + "step": 3738 + }, + { + "epoch": 1.8483500185391175, + "grad_norm": 0.12831518935315323, + "learning_rate": 1.1223502904595105e-05, + "loss": 0.6118, + "step": 3739 + }, + { + "epoch": 1.8488443950067976, + "grad_norm": 0.14186733209179284, + "learning_rate": 1.1219638174099846e-05, + "loss": 0.6433, + "step": 3740 + }, + { + "epoch": 1.8493387714744778, + "grad_norm": 0.1399930740166073, + "learning_rate": 1.1215773258678161e-05, + "loss": 0.623, + "step": 3741 + }, + { + "epoch": 1.849833147942158, + "grad_norm": 0.13879766816079656, + "learning_rate": 1.1211908158916072e-05, + "loss": 0.657, + "step": 3742 + }, + { + "epoch": 1.850327524409838, + "grad_norm": 0.13435304991185187, + "learning_rate": 1.1208042875399611e-05, + "loss": 0.6205, + "step": 3743 + }, + { + "epoch": 1.8508219008775182, + "grad_norm": 0.1361577499463384, + "learning_rate": 1.1204177408714856e-05, + "loss": 0.603, + "step": 3744 + }, + { + "epoch": 1.8513162773451983, + "grad_norm": 0.13371811129895272, + "learning_rate": 1.12003117594479e-05, + "loss": 0.5962, + "step": 3745 + }, + { + "epoch": 1.8518106538128785, + "grad_norm": 0.1402080914049837, + "learning_rate": 1.1196445928184866e-05, + "loss": 0.6231, + "step": 3746 + }, + { + "epoch": 1.8523050302805586, + "grad_norm": 0.13719729213908677, + "learning_rate": 1.119257991551191e-05, + "loss": 0.6472, + "step": 3747 + }, + { + "epoch": 1.8527994067482387, + "grad_norm": 0.13698090987647082, + "learning_rate": 1.1188713722015217e-05, + "loss": 0.6314, + "step": 3748 + }, + { + "epoch": 1.853293783215919, + "grad_norm": 0.1278396657954513, + "learning_rate": 1.1184847348280987e-05, + "loss": 0.63, + "step": 3749 + }, + { + "epoch": 1.853788159683599, + "grad_norm": 0.12976583483594686, + "learning_rate": 1.1180980794895458e-05, + "loss": 0.5836, + "step": 3750 + }, + { + "epoch": 1.8542825361512794, + "grad_norm": 0.13066892188392026, + "learning_rate": 1.1177114062444894e-05, + "loss": 0.6057, + "step": 3751 + }, + { + "epoch": 1.8547769126189593, + "grad_norm": 0.13517770500099938, + "learning_rate": 1.1173247151515578e-05, + "loss": 0.621, + "step": 3752 + }, + { + "epoch": 1.8552712890866396, + "grad_norm": 0.12701747749882217, + "learning_rate": 1.1169380062693835e-05, + "loss": 0.6002, + "step": 3753 + }, + { + "epoch": 1.8557656655543195, + "grad_norm": 0.13158374959687064, + "learning_rate": 1.1165512796566006e-05, + "loss": 0.6074, + "step": 3754 + }, + { + "epoch": 1.856260042022, + "grad_norm": 0.13337236458551957, + "learning_rate": 1.1161645353718458e-05, + "loss": 0.6322, + "step": 3755 + }, + { + "epoch": 1.8567544184896798, + "grad_norm": 0.13270887143995755, + "learning_rate": 1.1157777734737589e-05, + "loss": 0.6161, + "step": 3756 + }, + { + "epoch": 1.8572487949573602, + "grad_norm": 0.13706596748480662, + "learning_rate": 1.1153909940209829e-05, + "loss": 0.6095, + "step": 3757 + }, + { + "epoch": 1.85774317142504, + "grad_norm": 0.1308638046050449, + "learning_rate": 1.1150041970721618e-05, + "loss": 0.5815, + "step": 3758 + }, + { + "epoch": 1.8582375478927204, + "grad_norm": 0.1308890969316707, + "learning_rate": 1.1146173826859443e-05, + "loss": 0.589, + "step": 3759 + }, + { + "epoch": 1.8587319243604004, + "grad_norm": 0.1395630309190326, + "learning_rate": 1.1142305509209801e-05, + "loss": 0.6328, + "step": 3760 + }, + { + "epoch": 1.8592263008280807, + "grad_norm": 0.14243909965829687, + "learning_rate": 1.1138437018359225e-05, + "loss": 0.5972, + "step": 3761 + }, + { + "epoch": 1.8597206772957606, + "grad_norm": 0.13254646062627454, + "learning_rate": 1.1134568354894271e-05, + "loss": 0.6273, + "step": 3762 + }, + { + "epoch": 1.860215053763441, + "grad_norm": 0.13564452830734333, + "learning_rate": 1.1130699519401515e-05, + "loss": 0.5785, + "step": 3763 + }, + { + "epoch": 1.860709430231121, + "grad_norm": 0.13302426203446746, + "learning_rate": 1.112683051246758e-05, + "loss": 0.6042, + "step": 3764 + }, + { + "epoch": 1.8612038066988013, + "grad_norm": 0.13328749164263978, + "learning_rate": 1.1122961334679086e-05, + "loss": 0.615, + "step": 3765 + }, + { + "epoch": 1.8616981831664812, + "grad_norm": 0.13793399768962342, + "learning_rate": 1.1119091986622695e-05, + "loss": 0.6067, + "step": 3766 + }, + { + "epoch": 1.8621925596341615, + "grad_norm": 0.1358059502390317, + "learning_rate": 1.1115222468885098e-05, + "loss": 0.6141, + "step": 3767 + }, + { + "epoch": 1.8626869361018414, + "grad_norm": 0.13770274809388955, + "learning_rate": 1.1111352782053008e-05, + "loss": 0.6067, + "step": 3768 + }, + { + "epoch": 1.8631813125695218, + "grad_norm": 0.14068598339735375, + "learning_rate": 1.1107482926713156e-05, + "loss": 0.6366, + "step": 3769 + }, + { + "epoch": 1.8636756890372017, + "grad_norm": 0.1441677558178572, + "learning_rate": 1.110361290345231e-05, + "loss": 0.6076, + "step": 3770 + }, + { + "epoch": 1.864170065504882, + "grad_norm": 0.13056594707169697, + "learning_rate": 1.109974271285726e-05, + "loss": 0.6284, + "step": 3771 + }, + { + "epoch": 1.864664441972562, + "grad_norm": 0.1342861626331938, + "learning_rate": 1.109587235551481e-05, + "loss": 0.6031, + "step": 3772 + }, + { + "epoch": 1.8651588184402423, + "grad_norm": 0.13798761605279372, + "learning_rate": 1.109200183201181e-05, + "loss": 0.6312, + "step": 3773 + }, + { + "epoch": 1.8656531949079223, + "grad_norm": 0.13374258663632027, + "learning_rate": 1.1088131142935119e-05, + "loss": 0.5925, + "step": 3774 + }, + { + "epoch": 1.8661475713756026, + "grad_norm": 0.14007087529208395, + "learning_rate": 1.1084260288871627e-05, + "loss": 0.6048, + "step": 3775 + }, + { + "epoch": 1.8666419478432825, + "grad_norm": 0.13369447744046206, + "learning_rate": 1.108038927040825e-05, + "loss": 0.6037, + "step": 3776 + }, + { + "epoch": 1.8671363243109629, + "grad_norm": 0.14047722616992891, + "learning_rate": 1.1076518088131924e-05, + "loss": 0.6191, + "step": 3777 + }, + { + "epoch": 1.8676307007786428, + "grad_norm": 0.13685403298601173, + "learning_rate": 1.1072646742629617e-05, + "loss": 0.613, + "step": 3778 + }, + { + "epoch": 1.8681250772463232, + "grad_norm": 0.1327366611556623, + "learning_rate": 1.1068775234488317e-05, + "loss": 0.6217, + "step": 3779 + }, + { + "epoch": 1.868619453714003, + "grad_norm": 0.13148345574887138, + "learning_rate": 1.1064903564295033e-05, + "loss": 0.6108, + "step": 3780 + }, + { + "epoch": 1.8691138301816834, + "grad_norm": 0.15789782361089072, + "learning_rate": 1.1061031732636809e-05, + "loss": 0.6339, + "step": 3781 + }, + { + "epoch": 1.8696082066493633, + "grad_norm": 0.13710803496086227, + "learning_rate": 1.1057159740100705e-05, + "loss": 0.6323, + "step": 3782 + }, + { + "epoch": 1.8701025831170437, + "grad_norm": 0.13319165523739118, + "learning_rate": 1.1053287587273806e-05, + "loss": 0.561, + "step": 3783 + }, + { + "epoch": 1.8705969595847236, + "grad_norm": 0.12955862489699824, + "learning_rate": 1.1049415274743231e-05, + "loss": 0.6567, + "step": 3784 + }, + { + "epoch": 1.871091336052404, + "grad_norm": 0.13277116733085598, + "learning_rate": 1.1045542803096106e-05, + "loss": 0.5876, + "step": 3785 + }, + { + "epoch": 1.871585712520084, + "grad_norm": 0.1489607930563271, + "learning_rate": 1.1041670172919597e-05, + "loss": 0.6442, + "step": 3786 + }, + { + "epoch": 1.8720800889877642, + "grad_norm": 0.1330079929296991, + "learning_rate": 1.1037797384800886e-05, + "loss": 0.6253, + "step": 3787 + }, + { + "epoch": 1.8725744654554444, + "grad_norm": 0.13015494094908692, + "learning_rate": 1.1033924439327177e-05, + "loss": 0.6078, + "step": 3788 + }, + { + "epoch": 1.8730688419231245, + "grad_norm": 0.13926801076164175, + "learning_rate": 1.1030051337085708e-05, + "loss": 0.6189, + "step": 3789 + }, + { + "epoch": 1.8735632183908046, + "grad_norm": 0.1361677322382266, + "learning_rate": 1.1026178078663729e-05, + "loss": 0.584, + "step": 3790 + }, + { + "epoch": 1.8740575948584848, + "grad_norm": 0.1434223786658035, + "learning_rate": 1.1022304664648524e-05, + "loss": 0.6208, + "step": 3791 + }, + { + "epoch": 1.874551971326165, + "grad_norm": 0.13665393236875528, + "learning_rate": 1.1018431095627391e-05, + "loss": 0.609, + "step": 3792 + }, + { + "epoch": 1.875046347793845, + "grad_norm": 0.14508765510051863, + "learning_rate": 1.1014557372187658e-05, + "loss": 0.6645, + "step": 3793 + }, + { + "epoch": 1.8755407242615252, + "grad_norm": 0.1382928465338786, + "learning_rate": 1.1010683494916672e-05, + "loss": 0.607, + "step": 3794 + }, + { + "epoch": 1.8760351007292053, + "grad_norm": 0.1560579134534535, + "learning_rate": 1.1006809464401811e-05, + "loss": 0.6093, + "step": 3795 + }, + { + "epoch": 1.8765294771968855, + "grad_norm": 0.13540829839478274, + "learning_rate": 1.1002935281230463e-05, + "loss": 0.5982, + "step": 3796 + }, + { + "epoch": 1.8770238536645656, + "grad_norm": 0.13497575020249086, + "learning_rate": 1.0999060945990057e-05, + "loss": 0.6572, + "step": 3797 + }, + { + "epoch": 1.8775182301322457, + "grad_norm": 0.14146116670963038, + "learning_rate": 1.0995186459268028e-05, + "loss": 0.6127, + "step": 3798 + }, + { + "epoch": 1.8780126065999259, + "grad_norm": 0.13043426770815716, + "learning_rate": 1.0991311821651842e-05, + "loss": 0.6225, + "step": 3799 + }, + { + "epoch": 1.878506983067606, + "grad_norm": 0.13001801557769888, + "learning_rate": 1.0987437033728991e-05, + "loss": 0.6387, + "step": 3800 + }, + { + "epoch": 1.8790013595352861, + "grad_norm": 0.14169781941492623, + "learning_rate": 1.0983562096086984e-05, + "loss": 0.5907, + "step": 3801 + }, + { + "epoch": 1.8794957360029663, + "grad_norm": 0.13091335339397359, + "learning_rate": 1.097968700931335e-05, + "loss": 0.6174, + "step": 3802 + }, + { + "epoch": 1.8799901124706464, + "grad_norm": 0.13127772758112177, + "learning_rate": 1.097581177399565e-05, + "loss": 0.6193, + "step": 3803 + }, + { + "epoch": 1.8804844889383265, + "grad_norm": 0.13215845737387338, + "learning_rate": 1.0971936390721465e-05, + "loss": 0.6149, + "step": 3804 + }, + { + "epoch": 1.8809788654060067, + "grad_norm": 0.13819418965969837, + "learning_rate": 1.096806086007839e-05, + "loss": 0.5994, + "step": 3805 + }, + { + "epoch": 1.8814732418736868, + "grad_norm": 0.135669725778545, + "learning_rate": 1.0964185182654052e-05, + "loss": 0.633, + "step": 3806 + }, + { + "epoch": 1.881967618341367, + "grad_norm": 0.13482180170424973, + "learning_rate": 1.0960309359036096e-05, + "loss": 0.5923, + "step": 3807 + }, + { + "epoch": 1.882461994809047, + "grad_norm": 0.13832753284257085, + "learning_rate": 1.0956433389812192e-05, + "loss": 0.6119, + "step": 3808 + }, + { + "epoch": 1.8829563712767272, + "grad_norm": 0.13878895159201338, + "learning_rate": 1.0952557275570026e-05, + "loss": 0.5966, + "step": 3809 + }, + { + "epoch": 1.8834507477444073, + "grad_norm": 0.13562190784638575, + "learning_rate": 1.0948681016897312e-05, + "loss": 0.6349, + "step": 3810 + }, + { + "epoch": 1.8839451242120875, + "grad_norm": 0.14213671706011416, + "learning_rate": 1.0944804614381784e-05, + "loss": 0.6166, + "step": 3811 + }, + { + "epoch": 1.8844395006797676, + "grad_norm": 0.1393020257671835, + "learning_rate": 1.0940928068611199e-05, + "loss": 0.6464, + "step": 3812 + }, + { + "epoch": 1.8849338771474478, + "grad_norm": 0.13237890738093933, + "learning_rate": 1.0937051380173328e-05, + "loss": 0.6185, + "step": 3813 + }, + { + "epoch": 1.885428253615128, + "grad_norm": 0.138655009126555, + "learning_rate": 1.0933174549655981e-05, + "loss": 0.6268, + "step": 3814 + }, + { + "epoch": 1.885922630082808, + "grad_norm": 0.13453691043794289, + "learning_rate": 1.0929297577646967e-05, + "loss": 0.6124, + "step": 3815 + }, + { + "epoch": 1.8864170065504882, + "grad_norm": 0.13210119825552838, + "learning_rate": 1.0925420464734135e-05, + "loss": 0.5967, + "step": 3816 + }, + { + "epoch": 1.8869113830181683, + "grad_norm": 0.13870185860186623, + "learning_rate": 1.092154321150535e-05, + "loss": 0.6115, + "step": 3817 + }, + { + "epoch": 1.8874057594858484, + "grad_norm": 0.13011806589197186, + "learning_rate": 1.0917665818548491e-05, + "loss": 0.6443, + "step": 3818 + }, + { + "epoch": 1.8879001359535286, + "grad_norm": 0.14531866633496382, + "learning_rate": 1.0913788286451465e-05, + "loss": 0.5987, + "step": 3819 + }, + { + "epoch": 1.8883945124212087, + "grad_norm": 0.1325884624452367, + "learning_rate": 1.0909910615802207e-05, + "loss": 0.5824, + "step": 3820 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.13357960254975065, + "learning_rate": 1.0906032807188649e-05, + "loss": 0.6301, + "step": 3821 + }, + { + "epoch": 1.889383265356569, + "grad_norm": 0.1470703424487774, + "learning_rate": 1.0902154861198775e-05, + "loss": 0.599, + "step": 3822 + }, + { + "epoch": 1.889877641824249, + "grad_norm": 0.1315006665873097, + "learning_rate": 1.0898276778420566e-05, + "loss": 0.5892, + "step": 3823 + }, + { + "epoch": 1.8903720182919295, + "grad_norm": 0.1286008842125209, + "learning_rate": 1.0894398559442036e-05, + "loss": 0.6321, + "step": 3824 + }, + { + "epoch": 1.8908663947596094, + "grad_norm": 0.13790036200710473, + "learning_rate": 1.0890520204851217e-05, + "loss": 0.6273, + "step": 3825 + }, + { + "epoch": 1.8913607712272897, + "grad_norm": 0.13905695848348426, + "learning_rate": 1.0886641715236158e-05, + "loss": 0.6186, + "step": 3826 + }, + { + "epoch": 1.8918551476949697, + "grad_norm": 0.13145209223331358, + "learning_rate": 1.0882763091184932e-05, + "loss": 0.6008, + "step": 3827 + }, + { + "epoch": 1.89234952416265, + "grad_norm": 0.13208040369637164, + "learning_rate": 1.0878884333285631e-05, + "loss": 0.5634, + "step": 3828 + }, + { + "epoch": 1.89284390063033, + "grad_norm": 0.12987215931679838, + "learning_rate": 1.0875005442126366e-05, + "loss": 0.6121, + "step": 3829 + }, + { + "epoch": 1.8933382770980103, + "grad_norm": 0.1317680923132745, + "learning_rate": 1.0871126418295277e-05, + "loss": 0.6558, + "step": 3830 + }, + { + "epoch": 1.8938326535656902, + "grad_norm": 0.13451988253874073, + "learning_rate": 1.0867247262380512e-05, + "loss": 0.6079, + "step": 3831 + }, + { + "epoch": 1.8943270300333706, + "grad_norm": 0.12978067183779624, + "learning_rate": 1.086336797497024e-05, + "loss": 0.6301, + "step": 3832 + }, + { + "epoch": 1.8948214065010505, + "grad_norm": 0.13298207592494238, + "learning_rate": 1.0859488556652664e-05, + "loss": 0.6062, + "step": 3833 + }, + { + "epoch": 1.8953157829687308, + "grad_norm": 0.13202670852644743, + "learning_rate": 1.0855609008015989e-05, + "loss": 0.5985, + "step": 3834 + }, + { + "epoch": 1.8958101594364107, + "grad_norm": 0.13014380479269794, + "learning_rate": 1.0851729329648451e-05, + "loss": 0.6374, + "step": 3835 + }, + { + "epoch": 1.896304535904091, + "grad_norm": 0.13242621008389774, + "learning_rate": 1.08478495221383e-05, + "loss": 0.6294, + "step": 3836 + }, + { + "epoch": 1.896798912371771, + "grad_norm": 0.13299261436399978, + "learning_rate": 1.0843969586073812e-05, + "loss": 0.6113, + "step": 3837 + }, + { + "epoch": 1.8972932888394514, + "grad_norm": 0.13527228606169284, + "learning_rate": 1.0840089522043278e-05, + "loss": 0.6416, + "step": 3838 + }, + { + "epoch": 1.8977876653071313, + "grad_norm": 0.13295658972209473, + "learning_rate": 1.0836209330635004e-05, + "loss": 0.6057, + "step": 3839 + }, + { + "epoch": 1.8982820417748116, + "grad_norm": 0.13751636446681598, + "learning_rate": 1.0832329012437324e-05, + "loss": 0.635, + "step": 3840 + }, + { + "epoch": 1.8987764182424915, + "grad_norm": 0.13454500841890224, + "learning_rate": 1.0828448568038588e-05, + "loss": 0.6227, + "step": 3841 + }, + { + "epoch": 1.899270794710172, + "grad_norm": 0.13273948841916294, + "learning_rate": 1.0824567998027162e-05, + "loss": 0.604, + "step": 3842 + }, + { + "epoch": 1.8997651711778518, + "grad_norm": 0.1444821780450374, + "learning_rate": 1.0820687302991435e-05, + "loss": 0.6093, + "step": 3843 + }, + { + "epoch": 1.9002595476455322, + "grad_norm": 0.12890076978714773, + "learning_rate": 1.081680648351981e-05, + "loss": 0.5992, + "step": 3844 + }, + { + "epoch": 1.900753924113212, + "grad_norm": 0.18822276074557137, + "learning_rate": 1.0812925540200718e-05, + "loss": 0.6378, + "step": 3845 + }, + { + "epoch": 1.9012483005808924, + "grad_norm": 0.14203164728881987, + "learning_rate": 1.0809044473622602e-05, + "loss": 0.6114, + "step": 3846 + }, + { + "epoch": 1.9017426770485724, + "grad_norm": 0.1342660343269294, + "learning_rate": 1.0805163284373922e-05, + "loss": 0.6655, + "step": 3847 + }, + { + "epoch": 1.9022370535162527, + "grad_norm": 0.13309132806282087, + "learning_rate": 1.0801281973043161e-05, + "loss": 0.6243, + "step": 3848 + }, + { + "epoch": 1.9027314299839326, + "grad_norm": 0.13549405986276516, + "learning_rate": 1.0797400540218819e-05, + "loss": 0.6398, + "step": 3849 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 0.15017962184810404, + "learning_rate": 1.0793518986489417e-05, + "loss": 0.6388, + "step": 3850 + }, + { + "epoch": 1.903720182919293, + "grad_norm": 0.12922499208154964, + "learning_rate": 1.0789637312443483e-05, + "loss": 0.6351, + "step": 3851 + }, + { + "epoch": 1.9042145593869733, + "grad_norm": 0.13595406439113947, + "learning_rate": 1.0785755518669583e-05, + "loss": 0.5884, + "step": 3852 + }, + { + "epoch": 1.9047089358546532, + "grad_norm": 0.13270718467935844, + "learning_rate": 1.0781873605756289e-05, + "loss": 0.6131, + "step": 3853 + }, + { + "epoch": 1.9052033123223335, + "grad_norm": 0.13373754930134246, + "learning_rate": 1.077799157429218e-05, + "loss": 0.6046, + "step": 3854 + }, + { + "epoch": 1.9056976887900134, + "grad_norm": 0.13515408832691264, + "learning_rate": 1.0774109424865875e-05, + "loss": 0.6049, + "step": 3855 + }, + { + "epoch": 1.9061920652576938, + "grad_norm": 0.13618263748285486, + "learning_rate": 1.0770227158066002e-05, + "loss": 0.5951, + "step": 3856 + }, + { + "epoch": 1.9066864417253737, + "grad_norm": 0.13145969314218303, + "learning_rate": 1.0766344774481203e-05, + "loss": 0.6103, + "step": 3857 + }, + { + "epoch": 1.907180818193054, + "grad_norm": 0.14218531912580934, + "learning_rate": 1.076246227470014e-05, + "loss": 0.6049, + "step": 3858 + }, + { + "epoch": 1.907675194660734, + "grad_norm": 0.1357045701056064, + "learning_rate": 1.0758579659311496e-05, + "loss": 0.6242, + "step": 3859 + }, + { + "epoch": 1.9081695711284143, + "grad_norm": 0.14420136658576588, + "learning_rate": 1.0754696928903965e-05, + "loss": 0.6274, + "step": 3860 + }, + { + "epoch": 1.9086639475960945, + "grad_norm": 0.13096359065919802, + "learning_rate": 1.0750814084066265e-05, + "loss": 0.6446, + "step": 3861 + }, + { + "epoch": 1.9091583240637746, + "grad_norm": 0.1441042695567318, + "learning_rate": 1.0746931125387128e-05, + "loss": 0.6266, + "step": 3862 + }, + { + "epoch": 1.9096527005314547, + "grad_norm": 0.141876112610439, + "learning_rate": 1.07430480534553e-05, + "loss": 0.5978, + "step": 3863 + }, + { + "epoch": 1.9101470769991349, + "grad_norm": 0.13243238878665986, + "learning_rate": 1.0739164868859555e-05, + "loss": 0.6089, + "step": 3864 + }, + { + "epoch": 1.910641453466815, + "grad_norm": 0.12527199515370027, + "learning_rate": 1.0735281572188667e-05, + "loss": 0.5751, + "step": 3865 + }, + { + "epoch": 1.9111358299344952, + "grad_norm": 0.14481125116957713, + "learning_rate": 1.073139816403145e-05, + "loss": 0.6226, + "step": 3866 + }, + { + "epoch": 1.9116302064021753, + "grad_norm": 0.12679239249481455, + "learning_rate": 1.072751464497671e-05, + "loss": 0.5828, + "step": 3867 + }, + { + "epoch": 1.9121245828698554, + "grad_norm": 0.1350383154577462, + "learning_rate": 1.0723631015613289e-05, + "loss": 0.5843, + "step": 3868 + }, + { + "epoch": 1.9126189593375356, + "grad_norm": 0.13803379488129944, + "learning_rate": 1.0719747276530037e-05, + "loss": 0.6265, + "step": 3869 + }, + { + "epoch": 1.9131133358052157, + "grad_norm": 0.13087853305909264, + "learning_rate": 1.0715863428315819e-05, + "loss": 0.5957, + "step": 3870 + }, + { + "epoch": 1.9136077122728958, + "grad_norm": 0.137179370318582, + "learning_rate": 1.0711979471559521e-05, + "loss": 0.5798, + "step": 3871 + }, + { + "epoch": 1.914102088740576, + "grad_norm": 0.13016691462088542, + "learning_rate": 1.0708095406850048e-05, + "loss": 0.6312, + "step": 3872 + }, + { + "epoch": 1.914596465208256, + "grad_norm": 0.13651028782446645, + "learning_rate": 1.0704211234776311e-05, + "loss": 0.64, + "step": 3873 + }, + { + "epoch": 1.9150908416759362, + "grad_norm": 0.13283820074998368, + "learning_rate": 1.070032695592725e-05, + "loss": 0.6673, + "step": 3874 + }, + { + "epoch": 1.9155852181436164, + "grad_norm": 0.13216738867294173, + "learning_rate": 1.069644257089181e-05, + "loss": 0.6517, + "step": 3875 + }, + { + "epoch": 1.9160795946112965, + "grad_norm": 0.13494076546424605, + "learning_rate": 1.0692558080258959e-05, + "loss": 0.6327, + "step": 3876 + }, + { + "epoch": 1.9165739710789766, + "grad_norm": 0.13108055677677458, + "learning_rate": 1.0688673484617679e-05, + "loss": 0.6075, + "step": 3877 + }, + { + "epoch": 1.9170683475466568, + "grad_norm": 0.13135110274647976, + "learning_rate": 1.0684788784556965e-05, + "loss": 0.6087, + "step": 3878 + }, + { + "epoch": 1.917562724014337, + "grad_norm": 0.1403321379535541, + "learning_rate": 1.068090398066584e-05, + "loss": 0.6232, + "step": 3879 + }, + { + "epoch": 1.918057100482017, + "grad_norm": 0.13702390212162, + "learning_rate": 1.0677019073533324e-05, + "loss": 0.6149, + "step": 3880 + }, + { + "epoch": 1.9185514769496972, + "grad_norm": 0.129140067613515, + "learning_rate": 1.0673134063748463e-05, + "loss": 0.6047, + "step": 3881 + }, + { + "epoch": 1.9190458534173773, + "grad_norm": 0.13819452888314956, + "learning_rate": 1.0669248951900326e-05, + "loss": 0.6098, + "step": 3882 + }, + { + "epoch": 1.9195402298850575, + "grad_norm": 0.13252602113552978, + "learning_rate": 1.0665363738577978e-05, + "loss": 0.6246, + "step": 3883 + }, + { + "epoch": 1.9200346063527376, + "grad_norm": 0.13347780106534626, + "learning_rate": 1.0661478424370518e-05, + "loss": 0.5815, + "step": 3884 + }, + { + "epoch": 1.9205289828204177, + "grad_norm": 0.1283770974255093, + "learning_rate": 1.0657593009867048e-05, + "loss": 0.6141, + "step": 3885 + }, + { + "epoch": 1.9210233592880979, + "grad_norm": 0.12965949379602434, + "learning_rate": 1.0653707495656696e-05, + "loss": 0.6062, + "step": 3886 + }, + { + "epoch": 1.921517735755778, + "grad_norm": 0.13469898662250815, + "learning_rate": 1.0649821882328595e-05, + "loss": 0.6425, + "step": 3887 + }, + { + "epoch": 1.9220121122234581, + "grad_norm": 0.13971598379036124, + "learning_rate": 1.06459361704719e-05, + "loss": 0.593, + "step": 3888 + }, + { + "epoch": 1.9225064886911383, + "grad_norm": 0.12932244710521448, + "learning_rate": 1.0642050360675776e-05, + "loss": 0.625, + "step": 3889 + }, + { + "epoch": 1.9230008651588184, + "grad_norm": 0.13959957842279805, + "learning_rate": 1.0638164453529403e-05, + "loss": 0.6263, + "step": 3890 + }, + { + "epoch": 1.9234952416264985, + "grad_norm": 0.13949906713285487, + "learning_rate": 1.0634278449621982e-05, + "loss": 0.631, + "step": 3891 + }, + { + "epoch": 1.9239896180941787, + "grad_norm": 0.13592206338143215, + "learning_rate": 1.0630392349542724e-05, + "loss": 0.6193, + "step": 3892 + }, + { + "epoch": 1.9244839945618588, + "grad_norm": 0.1344639387133387, + "learning_rate": 1.0626506153880854e-05, + "loss": 0.5719, + "step": 3893 + }, + { + "epoch": 1.924978371029539, + "grad_norm": 0.12959004540309993, + "learning_rate": 1.062261986322561e-05, + "loss": 0.6388, + "step": 3894 + }, + { + "epoch": 1.925472747497219, + "grad_norm": 0.1386738105667734, + "learning_rate": 1.0618733478166252e-05, + "loss": 0.5989, + "step": 3895 + }, + { + "epoch": 1.9259671239648992, + "grad_norm": 0.13415035816851087, + "learning_rate": 1.0614846999292045e-05, + "loss": 0.6203, + "step": 3896 + }, + { + "epoch": 1.9264615004325794, + "grad_norm": 0.13051801428497137, + "learning_rate": 1.0610960427192273e-05, + "loss": 0.578, + "step": 3897 + }, + { + "epoch": 1.9269558769002595, + "grad_norm": 0.12881414411141673, + "learning_rate": 1.0607073762456236e-05, + "loss": 0.6274, + "step": 3898 + }, + { + "epoch": 1.9274502533679398, + "grad_norm": 0.1381686711874273, + "learning_rate": 1.0603187005673247e-05, + "loss": 0.5812, + "step": 3899 + }, + { + "epoch": 1.9279446298356198, + "grad_norm": 0.13956276414770916, + "learning_rate": 1.0599300157432625e-05, + "loss": 0.6251, + "step": 3900 + }, + { + "epoch": 1.9284390063033001, + "grad_norm": 0.14430834596989078, + "learning_rate": 1.0595413218323715e-05, + "loss": 0.6905, + "step": 3901 + }, + { + "epoch": 1.92893338277098, + "grad_norm": 0.1380391743253743, + "learning_rate": 1.0591526188935872e-05, + "loss": 0.6228, + "step": 3902 + }, + { + "epoch": 1.9294277592386604, + "grad_norm": 0.13489987509129425, + "learning_rate": 1.0587639069858458e-05, + "loss": 0.5731, + "step": 3903 + }, + { + "epoch": 1.9299221357063403, + "grad_norm": 0.1330114180083171, + "learning_rate": 1.0583751861680854e-05, + "loss": 0.6152, + "step": 3904 + }, + { + "epoch": 1.9304165121740207, + "grad_norm": 0.13058263035231624, + "learning_rate": 1.0579864564992455e-05, + "loss": 0.6443, + "step": 3905 + }, + { + "epoch": 1.9309108886417006, + "grad_norm": 0.4301667004804796, + "learning_rate": 1.057597718038267e-05, + "loss": 0.6411, + "step": 3906 + }, + { + "epoch": 1.931405265109381, + "grad_norm": 0.13603295631712178, + "learning_rate": 1.057208970844092e-05, + "loss": 0.6012, + "step": 3907 + }, + { + "epoch": 1.9318996415770608, + "grad_norm": 0.13802158577178542, + "learning_rate": 1.0568202149756637e-05, + "loss": 0.6049, + "step": 3908 + }, + { + "epoch": 1.9323940180447412, + "grad_norm": 0.13759038539695823, + "learning_rate": 1.0564314504919269e-05, + "loss": 0.6357, + "step": 3909 + }, + { + "epoch": 1.9328883945124211, + "grad_norm": 0.14724794583106188, + "learning_rate": 1.0560426774518275e-05, + "loss": 0.6133, + "step": 3910 + }, + { + "epoch": 1.9333827709801015, + "grad_norm": 0.13566706917558405, + "learning_rate": 1.0556538959143128e-05, + "loss": 0.6251, + "step": 3911 + }, + { + "epoch": 1.9338771474477814, + "grad_norm": 0.12488645746098025, + "learning_rate": 1.055265105938332e-05, + "loss": 0.6096, + "step": 3912 + }, + { + "epoch": 1.9343715239154617, + "grad_norm": 0.13628193244477305, + "learning_rate": 1.0548763075828346e-05, + "loss": 0.6085, + "step": 3913 + }, + { + "epoch": 1.9348659003831417, + "grad_norm": 0.25449781510714853, + "learning_rate": 1.0544875009067713e-05, + "loss": 0.6638, + "step": 3914 + }, + { + "epoch": 1.935360276850822, + "grad_norm": 0.13728888457911823, + "learning_rate": 1.0540986859690953e-05, + "loss": 0.637, + "step": 3915 + }, + { + "epoch": 1.935854653318502, + "grad_norm": 0.1356625025844829, + "learning_rate": 1.0537098628287596e-05, + "loss": 0.6361, + "step": 3916 + }, + { + "epoch": 1.9363490297861823, + "grad_norm": 0.13766432415629223, + "learning_rate": 1.0533210315447196e-05, + "loss": 0.6594, + "step": 3917 + }, + { + "epoch": 1.9368434062538622, + "grad_norm": 0.14241402079989424, + "learning_rate": 1.0529321921759312e-05, + "loss": 0.591, + "step": 3918 + }, + { + "epoch": 1.9373377827215426, + "grad_norm": 0.13741817542564677, + "learning_rate": 1.0525433447813522e-05, + "loss": 0.5675, + "step": 3919 + }, + { + "epoch": 1.9378321591892225, + "grad_norm": 0.14234520454061902, + "learning_rate": 1.0521544894199407e-05, + "loss": 0.5847, + "step": 3920 + }, + { + "epoch": 1.9383265356569028, + "grad_norm": 0.13948752132347528, + "learning_rate": 1.0517656261506566e-05, + "loss": 0.6203, + "step": 3921 + }, + { + "epoch": 1.9388209121245827, + "grad_norm": 0.13270993867483039, + "learning_rate": 1.0513767550324611e-05, + "loss": 0.6131, + "step": 3922 + }, + { + "epoch": 1.939315288592263, + "grad_norm": 0.1303151325957187, + "learning_rate": 1.0509878761243164e-05, + "loss": 0.6354, + "step": 3923 + }, + { + "epoch": 1.939809665059943, + "grad_norm": 0.13351249457867262, + "learning_rate": 1.0505989894851856e-05, + "loss": 0.6126, + "step": 3924 + }, + { + "epoch": 1.9403040415276234, + "grad_norm": 0.1375755978089479, + "learning_rate": 1.0502100951740335e-05, + "loss": 0.6025, + "step": 3925 + }, + { + "epoch": 1.9407984179953033, + "grad_norm": 0.13626155103039297, + "learning_rate": 1.0498211932498257e-05, + "loss": 0.575, + "step": 3926 + }, + { + "epoch": 1.9412927944629836, + "grad_norm": 0.13804388937063208, + "learning_rate": 1.0494322837715289e-05, + "loss": 0.6309, + "step": 3927 + }, + { + "epoch": 1.9417871709306636, + "grad_norm": 0.13773600475986297, + "learning_rate": 1.0490433667981116e-05, + "loss": 0.6185, + "step": 3928 + }, + { + "epoch": 1.942281547398344, + "grad_norm": 0.13310069821632603, + "learning_rate": 1.0486544423885428e-05, + "loss": 0.6449, + "step": 3929 + }, + { + "epoch": 1.9427759238660238, + "grad_norm": 0.14077306891410318, + "learning_rate": 1.0482655106017922e-05, + "loss": 0.616, + "step": 3930 + }, + { + "epoch": 1.9432703003337042, + "grad_norm": 0.1301922602319807, + "learning_rate": 1.0478765714968318e-05, + "loss": 0.5766, + "step": 3931 + }, + { + "epoch": 1.943764676801384, + "grad_norm": 0.12933177799137222, + "learning_rate": 1.0474876251326342e-05, + "loss": 0.5806, + "step": 3932 + }, + { + "epoch": 1.9442590532690645, + "grad_norm": 0.14401913417921178, + "learning_rate": 1.0470986715681724e-05, + "loss": 0.6187, + "step": 3933 + }, + { + "epoch": 1.9447534297367444, + "grad_norm": 0.13342923461450432, + "learning_rate": 1.0467097108624215e-05, + "loss": 0.5994, + "step": 3934 + }, + { + "epoch": 1.9452478062044247, + "grad_norm": 0.13825565768351625, + "learning_rate": 1.0463207430743576e-05, + "loss": 0.6071, + "step": 3935 + }, + { + "epoch": 1.9457421826721049, + "grad_norm": 0.13608086941756717, + "learning_rate": 1.0459317682629566e-05, + "loss": 0.6143, + "step": 3936 + }, + { + "epoch": 1.946236559139785, + "grad_norm": 0.13296417905379956, + "learning_rate": 1.0455427864871971e-05, + "loss": 0.5755, + "step": 3937 + }, + { + "epoch": 1.9467309356074651, + "grad_norm": 0.14026609068523702, + "learning_rate": 1.0451537978060582e-05, + "loss": 0.6336, + "step": 3938 + }, + { + "epoch": 1.9472253120751453, + "grad_norm": 0.1394179118684672, + "learning_rate": 1.0447648022785197e-05, + "loss": 0.6198, + "step": 3939 + }, + { + "epoch": 1.9477196885428254, + "grad_norm": 0.14169483809630212, + "learning_rate": 1.0443757999635625e-05, + "loss": 0.6264, + "step": 3940 + }, + { + "epoch": 1.9482140650105055, + "grad_norm": 0.13870595554260157, + "learning_rate": 1.0439867909201689e-05, + "loss": 0.619, + "step": 3941 + }, + { + "epoch": 1.9487084414781857, + "grad_norm": 0.13532767491131956, + "learning_rate": 1.0435977752073219e-05, + "loss": 0.5979, + "step": 3942 + }, + { + "epoch": 1.9492028179458658, + "grad_norm": 0.1364584931682323, + "learning_rate": 1.0432087528840056e-05, + "loss": 0.589, + "step": 3943 + }, + { + "epoch": 1.949697194413546, + "grad_norm": 0.12964338047238314, + "learning_rate": 1.0428197240092053e-05, + "loss": 0.6074, + "step": 3944 + }, + { + "epoch": 1.950191570881226, + "grad_norm": 0.13112694388241164, + "learning_rate": 1.0424306886419069e-05, + "loss": 0.5876, + "step": 3945 + }, + { + "epoch": 1.9506859473489062, + "grad_norm": 0.1392845371155043, + "learning_rate": 1.0420416468410976e-05, + "loss": 0.6343, + "step": 3946 + }, + { + "epoch": 1.9511803238165863, + "grad_norm": 0.13758550201804134, + "learning_rate": 1.0416525986657654e-05, + "loss": 0.5934, + "step": 3947 + }, + { + "epoch": 1.9516747002842665, + "grad_norm": 0.12586947690659414, + "learning_rate": 1.0412635441748997e-05, + "loss": 0.632, + "step": 3948 + }, + { + "epoch": 1.9521690767519466, + "grad_norm": 0.1596138024064431, + "learning_rate": 1.04087448342749e-05, + "loss": 0.6057, + "step": 3949 + }, + { + "epoch": 1.9526634532196268, + "grad_norm": 0.13993696463780195, + "learning_rate": 1.0404854164825275e-05, + "loss": 0.61, + "step": 3950 + }, + { + "epoch": 1.953157829687307, + "grad_norm": 0.1431442400126163, + "learning_rate": 1.0400963433990044e-05, + "loss": 0.6507, + "step": 3951 + }, + { + "epoch": 1.953652206154987, + "grad_norm": 0.1385917366699111, + "learning_rate": 1.0397072642359125e-05, + "loss": 0.6315, + "step": 3952 + }, + { + "epoch": 1.9541465826226672, + "grad_norm": 0.13572164142799056, + "learning_rate": 1.0393181790522467e-05, + "loss": 0.6101, + "step": 3953 + }, + { + "epoch": 1.9546409590903473, + "grad_norm": 0.13300247283963781, + "learning_rate": 1.0389290879070008e-05, + "loss": 0.5875, + "step": 3954 + }, + { + "epoch": 1.9551353355580274, + "grad_norm": 0.13451607868355928, + "learning_rate": 1.0385399908591712e-05, + "loss": 0.6299, + "step": 3955 + }, + { + "epoch": 1.9556297120257076, + "grad_norm": 0.1361592139245782, + "learning_rate": 1.0381508879677535e-05, + "loss": 0.6447, + "step": 3956 + }, + { + "epoch": 1.9561240884933877, + "grad_norm": 0.14479292589771112, + "learning_rate": 1.0377617792917456e-05, + "loss": 0.6355, + "step": 3957 + }, + { + "epoch": 1.9566184649610678, + "grad_norm": 0.13276975399250457, + "learning_rate": 1.0373726648901454e-05, + "loss": 0.6078, + "step": 3958 + }, + { + "epoch": 1.957112841428748, + "grad_norm": 0.1382673401598148, + "learning_rate": 1.0369835448219521e-05, + "loss": 0.629, + "step": 3959 + }, + { + "epoch": 1.957607217896428, + "grad_norm": 0.20377538881428142, + "learning_rate": 1.0365944191461656e-05, + "loss": 0.6011, + "step": 3960 + }, + { + "epoch": 1.9581015943641082, + "grad_norm": 0.13773672083675775, + "learning_rate": 1.036205287921787e-05, + "loss": 0.6067, + "step": 3961 + }, + { + "epoch": 1.9585959708317884, + "grad_norm": 0.13634679572035252, + "learning_rate": 1.0358161512078178e-05, + "loss": 0.572, + "step": 3962 + }, + { + "epoch": 1.9590903472994685, + "grad_norm": 0.13223983330160677, + "learning_rate": 1.0354270090632596e-05, + "loss": 0.6229, + "step": 3963 + }, + { + "epoch": 1.9595847237671487, + "grad_norm": 0.13381623352305438, + "learning_rate": 1.0350378615471173e-05, + "loss": 0.6628, + "step": 3964 + }, + { + "epoch": 1.9600791002348288, + "grad_norm": 0.14213611895958692, + "learning_rate": 1.0346487087183939e-05, + "loss": 0.5808, + "step": 3965 + }, + { + "epoch": 1.960573476702509, + "grad_norm": 0.1293877749940894, + "learning_rate": 1.0342595506360942e-05, + "loss": 0.6748, + "step": 3966 + }, + { + "epoch": 1.961067853170189, + "grad_norm": 0.14019842176842828, + "learning_rate": 1.0338703873592244e-05, + "loss": 0.6282, + "step": 3967 + }, + { + "epoch": 1.9615622296378692, + "grad_norm": 0.13489758841821306, + "learning_rate": 1.0334812189467912e-05, + "loss": 0.6153, + "step": 3968 + }, + { + "epoch": 1.9620566061055493, + "grad_norm": 0.1327401420745092, + "learning_rate": 1.0330920454578011e-05, + "loss": 0.6235, + "step": 3969 + }, + { + "epoch": 1.9625509825732295, + "grad_norm": 0.12983943307414422, + "learning_rate": 1.0327028669512629e-05, + "loss": 0.6423, + "step": 3970 + }, + { + "epoch": 1.9630453590409096, + "grad_norm": 0.13062296236990573, + "learning_rate": 1.0323136834861849e-05, + "loss": 0.5941, + "step": 3971 + }, + { + "epoch": 1.96353973550859, + "grad_norm": 0.14640500117081842, + "learning_rate": 1.0319244951215768e-05, + "loss": 0.6155, + "step": 3972 + }, + { + "epoch": 1.9640341119762699, + "grad_norm": 0.13479153455817716, + "learning_rate": 1.0315353019164489e-05, + "loss": 0.5902, + "step": 3973 + }, + { + "epoch": 1.9645284884439502, + "grad_norm": 0.1254790856105758, + "learning_rate": 1.0311461039298125e-05, + "loss": 0.5856, + "step": 3974 + }, + { + "epoch": 1.9650228649116301, + "grad_norm": 0.13938918989836888, + "learning_rate": 1.0307569012206788e-05, + "loss": 0.6106, + "step": 3975 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 0.13838560574325287, + "learning_rate": 1.0303676938480608e-05, + "loss": 0.6037, + "step": 3976 + }, + { + "epoch": 1.9660116178469904, + "grad_norm": 0.13905535192838653, + "learning_rate": 1.0299784818709714e-05, + "loss": 0.6437, + "step": 3977 + }, + { + "epoch": 1.9665059943146708, + "grad_norm": 0.1320153102858723, + "learning_rate": 1.0295892653484247e-05, + "loss": 0.6231, + "step": 3978 + }, + { + "epoch": 1.9670003707823507, + "grad_norm": 0.12945571343290918, + "learning_rate": 1.0292000443394347e-05, + "loss": 0.6077, + "step": 3979 + }, + { + "epoch": 1.967494747250031, + "grad_norm": 0.13429041182999293, + "learning_rate": 1.0288108189030175e-05, + "loss": 0.6032, + "step": 3980 + }, + { + "epoch": 1.967989123717711, + "grad_norm": 0.13413204236136458, + "learning_rate": 1.0284215890981885e-05, + "loss": 0.5933, + "step": 3981 + }, + { + "epoch": 1.9684835001853913, + "grad_norm": 0.13652405321278444, + "learning_rate": 1.0280323549839642e-05, + "loss": 0.6274, + "step": 3982 + }, + { + "epoch": 1.9689778766530712, + "grad_norm": 0.12726613185397057, + "learning_rate": 1.0276431166193621e-05, + "loss": 0.6159, + "step": 3983 + }, + { + "epoch": 1.9694722531207516, + "grad_norm": 0.13703044198737807, + "learning_rate": 1.0272538740634002e-05, + "loss": 0.616, + "step": 3984 + }, + { + "epoch": 1.9699666295884315, + "grad_norm": 0.14147114922568219, + "learning_rate": 1.0268646273750961e-05, + "loss": 0.5896, + "step": 3985 + }, + { + "epoch": 1.9704610060561119, + "grad_norm": 0.13705069561454866, + "learning_rate": 1.0264753766134703e-05, + "loss": 0.6248, + "step": 3986 + }, + { + "epoch": 1.9709553825237918, + "grad_norm": 0.13479477723903127, + "learning_rate": 1.0260861218375416e-05, + "loss": 0.5868, + "step": 3987 + }, + { + "epoch": 1.9714497589914721, + "grad_norm": 0.1487888493351546, + "learning_rate": 1.025696863106331e-05, + "loss": 0.6143, + "step": 3988 + }, + { + "epoch": 1.971944135459152, + "grad_norm": 0.13002586338867136, + "learning_rate": 1.0253076004788587e-05, + "loss": 0.5694, + "step": 3989 + }, + { + "epoch": 1.9724385119268324, + "grad_norm": 0.1322978955747011, + "learning_rate": 1.0249183340141469e-05, + "loss": 0.6214, + "step": 3990 + }, + { + "epoch": 1.9729328883945123, + "grad_norm": 0.1406352144594129, + "learning_rate": 1.0245290637712172e-05, + "loss": 0.6277, + "step": 3991 + }, + { + "epoch": 1.9734272648621927, + "grad_norm": 0.13782826052937036, + "learning_rate": 1.0241397898090933e-05, + "loss": 0.6219, + "step": 3992 + }, + { + "epoch": 1.9739216413298726, + "grad_norm": 0.13439954405053384, + "learning_rate": 1.023750512186797e-05, + "loss": 0.6129, + "step": 3993 + }, + { + "epoch": 1.974416017797553, + "grad_norm": 0.13753499804042008, + "learning_rate": 1.0233612309633537e-05, + "loss": 0.6359, + "step": 3994 + }, + { + "epoch": 1.9749103942652328, + "grad_norm": 0.12791174824658524, + "learning_rate": 1.0229719461977868e-05, + "loss": 0.6235, + "step": 3995 + }, + { + "epoch": 1.9754047707329132, + "grad_norm": 0.13774979819986177, + "learning_rate": 1.022582657949121e-05, + "loss": 0.587, + "step": 3996 + }, + { + "epoch": 1.9758991472005931, + "grad_norm": 0.13032263661956145, + "learning_rate": 1.0221933662763828e-05, + "loss": 0.6156, + "step": 3997 + }, + { + "epoch": 1.9763935236682735, + "grad_norm": 0.13178961794802202, + "learning_rate": 1.021804071238597e-05, + "loss": 0.5899, + "step": 3998 + }, + { + "epoch": 1.9768879001359534, + "grad_norm": 0.17947098103262638, + "learning_rate": 1.021414772894791e-05, + "loss": 0.6228, + "step": 3999 + }, + { + "epoch": 1.9773822766036337, + "grad_norm": 0.13413031510042242, + "learning_rate": 1.0210254713039913e-05, + "loss": 0.631, + "step": 4000 + }, + { + "epoch": 1.9778766530713137, + "grad_norm": 0.17034510671157624, + "learning_rate": 1.0206361665252253e-05, + "loss": 0.6423, + "step": 4001 + }, + { + "epoch": 1.978371029538994, + "grad_norm": 0.13550604597427207, + "learning_rate": 1.0202468586175214e-05, + "loss": 0.6203, + "step": 4002 + }, + { + "epoch": 1.978865406006674, + "grad_norm": 0.13414613575771447, + "learning_rate": 1.0198575476399076e-05, + "loss": 0.6096, + "step": 4003 + }, + { + "epoch": 1.9793597824743543, + "grad_norm": 0.13022119346710148, + "learning_rate": 1.0194682336514128e-05, + "loss": 0.6222, + "step": 4004 + }, + { + "epoch": 1.9798541589420342, + "grad_norm": 0.135396480765568, + "learning_rate": 1.0190789167110667e-05, + "loss": 0.6254, + "step": 4005 + }, + { + "epoch": 1.9803485354097146, + "grad_norm": 0.13380336833810652, + "learning_rate": 1.0186895968778987e-05, + "loss": 0.5796, + "step": 4006 + }, + { + "epoch": 1.9808429118773945, + "grad_norm": 0.13605602359146474, + "learning_rate": 1.0183002742109392e-05, + "loss": 0.6623, + "step": 4007 + }, + { + "epoch": 1.9813372883450748, + "grad_norm": 0.1350395329958356, + "learning_rate": 1.0179109487692188e-05, + "loss": 0.64, + "step": 4008 + }, + { + "epoch": 1.981831664812755, + "grad_norm": 0.13413640616527112, + "learning_rate": 1.0175216206117684e-05, + "loss": 0.6146, + "step": 4009 + }, + { + "epoch": 1.982326041280435, + "grad_norm": 0.13436309284169004, + "learning_rate": 1.0171322897976203e-05, + "loss": 0.5883, + "step": 4010 + }, + { + "epoch": 1.9828204177481152, + "grad_norm": 0.13267315033029922, + "learning_rate": 1.0167429563858055e-05, + "loss": 0.6346, + "step": 4011 + }, + { + "epoch": 1.9833147942157954, + "grad_norm": 0.13375995281477088, + "learning_rate": 1.0163536204353565e-05, + "loss": 0.6201, + "step": 4012 + }, + { + "epoch": 1.9838091706834755, + "grad_norm": 0.13487801128416843, + "learning_rate": 1.0159642820053062e-05, + "loss": 0.6074, + "step": 4013 + }, + { + "epoch": 1.9843035471511556, + "grad_norm": 0.1315153724442807, + "learning_rate": 1.0155749411546877e-05, + "loss": 0.6199, + "step": 4014 + }, + { + "epoch": 1.9847979236188358, + "grad_norm": 0.1373241762526805, + "learning_rate": 1.015185597942534e-05, + "loss": 0.6355, + "step": 4015 + }, + { + "epoch": 1.985292300086516, + "grad_norm": 0.1423965281474022, + "learning_rate": 1.0147962524278794e-05, + "loss": 0.6088, + "step": 4016 + }, + { + "epoch": 1.985786676554196, + "grad_norm": 0.1330414123643978, + "learning_rate": 1.014406904669758e-05, + "loss": 0.6108, + "step": 4017 + }, + { + "epoch": 1.9862810530218762, + "grad_norm": 0.13769262275412844, + "learning_rate": 1.0140175547272033e-05, + "loss": 0.6305, + "step": 4018 + }, + { + "epoch": 1.9867754294895563, + "grad_norm": 0.12853965764636485, + "learning_rate": 1.0136282026592512e-05, + "loss": 0.6107, + "step": 4019 + }, + { + "epoch": 1.9872698059572365, + "grad_norm": 0.13465234938839288, + "learning_rate": 1.0132388485249365e-05, + "loss": 0.5956, + "step": 4020 + }, + { + "epoch": 1.9877641824249166, + "grad_norm": 0.13121576869607204, + "learning_rate": 1.0128494923832945e-05, + "loss": 0.6281, + "step": 4021 + }, + { + "epoch": 1.9882585588925967, + "grad_norm": 0.14072719375732148, + "learning_rate": 1.012460134293361e-05, + "loss": 0.6204, + "step": 4022 + }, + { + "epoch": 1.9887529353602769, + "grad_norm": 0.135858679265063, + "learning_rate": 1.0120707743141722e-05, + "loss": 0.5818, + "step": 4023 + }, + { + "epoch": 1.989247311827957, + "grad_norm": 0.1305187832221183, + "learning_rate": 1.0116814125047643e-05, + "loss": 0.618, + "step": 4024 + }, + { + "epoch": 1.9897416882956371, + "grad_norm": 0.13318456094489212, + "learning_rate": 1.0112920489241738e-05, + "loss": 0.6074, + "step": 4025 + }, + { + "epoch": 1.9902360647633173, + "grad_norm": 0.14398738148423365, + "learning_rate": 1.0109026836314376e-05, + "loss": 0.6179, + "step": 4026 + }, + { + "epoch": 1.9907304412309974, + "grad_norm": 0.15565757512206446, + "learning_rate": 1.0105133166855927e-05, + "loss": 0.6093, + "step": 4027 + }, + { + "epoch": 1.9912248176986775, + "grad_norm": 0.1304196123330571, + "learning_rate": 1.0101239481456769e-05, + "loss": 0.5987, + "step": 4028 + }, + { + "epoch": 1.9917191941663577, + "grad_norm": 0.13455518879408176, + "learning_rate": 1.0097345780707271e-05, + "loss": 0.6276, + "step": 4029 + }, + { + "epoch": 1.9922135706340378, + "grad_norm": 0.13566408460817989, + "learning_rate": 1.009345206519782e-05, + "loss": 0.6734, + "step": 4030 + }, + { + "epoch": 1.992707947101718, + "grad_norm": 0.14091953083260783, + "learning_rate": 1.0089558335518789e-05, + "loss": 0.6211, + "step": 4031 + }, + { + "epoch": 1.993202323569398, + "grad_norm": 0.13227595367038936, + "learning_rate": 1.0085664592260569e-05, + "loss": 0.6419, + "step": 4032 + }, + { + "epoch": 1.9936967000370782, + "grad_norm": 0.13137615404019906, + "learning_rate": 1.008177083601354e-05, + "loss": 0.5753, + "step": 4033 + }, + { + "epoch": 1.9941910765047584, + "grad_norm": 0.12996965727916934, + "learning_rate": 1.0077877067368087e-05, + "loss": 0.5993, + "step": 4034 + }, + { + "epoch": 1.9946854529724385, + "grad_norm": 0.1293830850024956, + "learning_rate": 1.0073983286914602e-05, + "loss": 0.5764, + "step": 4035 + }, + { + "epoch": 1.9951798294401186, + "grad_norm": 0.1365025120549272, + "learning_rate": 1.0070089495243476e-05, + "loss": 0.607, + "step": 4036 + }, + { + "epoch": 1.9956742059077988, + "grad_norm": 0.14003264990087325, + "learning_rate": 1.0066195692945098e-05, + "loss": 0.571, + "step": 4037 + }, + { + "epoch": 1.996168582375479, + "grad_norm": 0.13489288828698714, + "learning_rate": 1.0062301880609867e-05, + "loss": 0.6018, + "step": 4038 + }, + { + "epoch": 1.996662958843159, + "grad_norm": 0.13514694610131794, + "learning_rate": 1.0058408058828173e-05, + "loss": 0.6399, + "step": 4039 + }, + { + "epoch": 1.9971573353108392, + "grad_norm": 0.14661717435116195, + "learning_rate": 1.0054514228190415e-05, + "loss": 0.6133, + "step": 4040 + }, + { + "epoch": 1.9976517117785193, + "grad_norm": 0.14021801108751952, + "learning_rate": 1.0050620389286994e-05, + "loss": 0.5996, + "step": 4041 + }, + { + "epoch": 1.9981460882461994, + "grad_norm": 0.13916714052771423, + "learning_rate": 1.0046726542708303e-05, + "loss": 0.6257, + "step": 4042 + }, + { + "epoch": 1.9986404647138796, + "grad_norm": 0.13570432398044124, + "learning_rate": 1.004283268904475e-05, + "loss": 0.6032, + "step": 4043 + }, + { + "epoch": 1.9991348411815597, + "grad_norm": 0.13967927401384383, + "learning_rate": 1.0038938828886732e-05, + "loss": 0.6285, + "step": 4044 + }, + { + "epoch": 1.9996292176492398, + "grad_norm": 0.13063230225180056, + "learning_rate": 1.0035044962824652e-05, + "loss": 0.6213, + "step": 4045 + }, + { + "epoch": 2.0, + "grad_norm": 0.14093646334766338, + "learning_rate": 1.0031151091448917e-05, + "loss": 0.5915, + "step": 4046 + }, + { + "epoch": 2.0004943764676804, + "grad_norm": 0.17322941896230343, + "learning_rate": 1.0027257215349928e-05, + "loss": 0.5643, + "step": 4047 + }, + { + "epoch": 2.0009887529353603, + "grad_norm": 0.1738624357332444, + "learning_rate": 1.0023363335118088e-05, + "loss": 0.5211, + "step": 4048 + }, + { + "epoch": 2.0009887529353603, + "eval_loss": 0.6498528718948364, + "eval_runtime": 81.623, + "eval_samples_per_second": 371.881, + "eval_steps_per_second": 46.494, + "step": 4048 + }, + { + "epoch": 2.0014831294030406, + "grad_norm": 0.13754097959183514, + "learning_rate": 1.0019469451343806e-05, + "loss": 0.6073, + "step": 4049 + }, + { + "epoch": 2.0019775058707205, + "grad_norm": 0.15162194142249094, + "learning_rate": 1.001557556461749e-05, + "loss": 0.5677, + "step": 4050 + }, + { + "epoch": 2.002471882338401, + "grad_norm": 0.16575938025047332, + "learning_rate": 1.0011681675529545e-05, + "loss": 0.5665, + "step": 4051 + }, + { + "epoch": 2.002966258806081, + "grad_norm": 0.1532989832685918, + "learning_rate": 1.0007787784670376e-05, + "loss": 0.5473, + "step": 4052 + }, + { + "epoch": 2.003460635273761, + "grad_norm": 0.15055003109312176, + "learning_rate": 1.0003893892630391e-05, + "loss": 0.5693, + "step": 4053 + }, + { + "epoch": 2.003955011741441, + "grad_norm": 0.15681823947875012, + "learning_rate": 1e-05, + "loss": 0.5454, + "step": 4054 + }, + { + "epoch": 2.0044493882091214, + "grad_norm": 0.15294693527026695, + "learning_rate": 9.99610610736961e-06, + "loss": 0.5651, + "step": 4055 + }, + { + "epoch": 2.0049437646768014, + "grad_norm": 0.15029201834997752, + "learning_rate": 9.992212215329626e-06, + "loss": 0.58, + "step": 4056 + }, + { + "epoch": 2.0054381411444817, + "grad_norm": 0.1426496144379601, + "learning_rate": 9.988318324470456e-06, + "loss": 0.5759, + "step": 4057 + }, + { + "epoch": 2.0059325176121616, + "grad_norm": 0.15597613046701997, + "learning_rate": 9.98442443538251e-06, + "loss": 0.55, + "step": 4058 + }, + { + "epoch": 2.006426894079842, + "grad_norm": 0.15440164241836815, + "learning_rate": 9.980530548656195e-06, + "loss": 0.5494, + "step": 4059 + }, + { + "epoch": 2.006921270547522, + "grad_norm": 0.14573399366833645, + "learning_rate": 9.976636664881916e-06, + "loss": 0.5741, + "step": 4060 + }, + { + "epoch": 2.0074156470152023, + "grad_norm": 0.139697446189044, + "learning_rate": 9.972742784650079e-06, + "loss": 0.5377, + "step": 4061 + }, + { + "epoch": 2.007910023482882, + "grad_norm": 0.1445918215122678, + "learning_rate": 9.968848908551088e-06, + "loss": 0.5465, + "step": 4062 + }, + { + "epoch": 2.0084043999505625, + "grad_norm": 0.14992917840299927, + "learning_rate": 9.964955037175348e-06, + "loss": 0.5982, + "step": 4063 + }, + { + "epoch": 2.0088987764182424, + "grad_norm": 0.13407302383153666, + "learning_rate": 9.96106117111327e-06, + "loss": 0.544, + "step": 4064 + }, + { + "epoch": 2.009393152885923, + "grad_norm": 0.14994873849222662, + "learning_rate": 9.957167310955253e-06, + "loss": 0.5714, + "step": 4065 + }, + { + "epoch": 2.0098875293536027, + "grad_norm": 0.14532836603914454, + "learning_rate": 9.9532734572917e-06, + "loss": 0.5618, + "step": 4066 + }, + { + "epoch": 2.010381905821283, + "grad_norm": 0.14632484989785263, + "learning_rate": 9.94937961071301e-06, + "loss": 0.5811, + "step": 4067 + }, + { + "epoch": 2.010876282288963, + "grad_norm": 0.13265906630973806, + "learning_rate": 9.945485771809585e-06, + "loss": 0.5642, + "step": 4068 + }, + { + "epoch": 2.0113706587566433, + "grad_norm": 0.13496773301304607, + "learning_rate": 9.94159194117183e-06, + "loss": 0.5599, + "step": 4069 + }, + { + "epoch": 2.0118650352243233, + "grad_norm": 0.1432626926797487, + "learning_rate": 9.937698119390137e-06, + "loss": 0.5521, + "step": 4070 + }, + { + "epoch": 2.0123594116920036, + "grad_norm": 0.13392088884877318, + "learning_rate": 9.933804307054904e-06, + "loss": 0.5731, + "step": 4071 + }, + { + "epoch": 2.0128537881596835, + "grad_norm": 0.14116018589918064, + "learning_rate": 9.929910504756529e-06, + "loss": 0.5503, + "step": 4072 + }, + { + "epoch": 2.013348164627364, + "grad_norm": 0.13413944318285687, + "learning_rate": 9.926016713085403e-06, + "loss": 0.5731, + "step": 4073 + }, + { + "epoch": 2.013842541095044, + "grad_norm": 0.1372206372797363, + "learning_rate": 9.922122932631915e-06, + "loss": 0.6083, + "step": 4074 + }, + { + "epoch": 2.014336917562724, + "grad_norm": 0.1397600922078369, + "learning_rate": 9.918229163986463e-06, + "loss": 0.5535, + "step": 4075 + }, + { + "epoch": 2.014831294030404, + "grad_norm": 0.1416647147337223, + "learning_rate": 9.914335407739435e-06, + "loss": 0.6088, + "step": 4076 + }, + { + "epoch": 2.0153256704980844, + "grad_norm": 0.1437724516584708, + "learning_rate": 9.910441664481213e-06, + "loss": 0.581, + "step": 4077 + }, + { + "epoch": 2.0158200469657643, + "grad_norm": 0.1404096313200748, + "learning_rate": 9.906547934802184e-06, + "loss": 0.5539, + "step": 4078 + }, + { + "epoch": 2.0163144234334447, + "grad_norm": 0.1390257974239863, + "learning_rate": 9.90265421929273e-06, + "loss": 0.5635, + "step": 4079 + }, + { + "epoch": 2.0168087999011246, + "grad_norm": 0.14837738761313193, + "learning_rate": 9.898760518543236e-06, + "loss": 0.5895, + "step": 4080 + }, + { + "epoch": 2.017303176368805, + "grad_norm": 0.1384440597855662, + "learning_rate": 9.894866833144076e-06, + "loss": 0.546, + "step": 4081 + }, + { + "epoch": 2.017797552836485, + "grad_norm": 0.1336373499888548, + "learning_rate": 9.890973163685627e-06, + "loss": 0.5533, + "step": 4082 + }, + { + "epoch": 2.0182919293041652, + "grad_norm": 0.13518082534417578, + "learning_rate": 9.887079510758268e-06, + "loss": 0.5867, + "step": 4083 + }, + { + "epoch": 2.018786305771845, + "grad_norm": 0.14882915919533377, + "learning_rate": 9.883185874952362e-06, + "loss": 0.5724, + "step": 4084 + }, + { + "epoch": 2.0192806822395255, + "grad_norm": 0.14536482001588294, + "learning_rate": 9.879292256858281e-06, + "loss": 0.5717, + "step": 4085 + }, + { + "epoch": 2.0197750587072054, + "grad_norm": 0.15105313147169855, + "learning_rate": 9.875398657066391e-06, + "loss": 0.5549, + "step": 4086 + }, + { + "epoch": 2.0202694351748858, + "grad_norm": 0.13541235441757857, + "learning_rate": 9.871505076167057e-06, + "loss": 0.5773, + "step": 4087 + }, + { + "epoch": 2.0207638116425657, + "grad_norm": 0.14168994976788873, + "learning_rate": 9.867611514750637e-06, + "loss": 0.5797, + "step": 4088 + }, + { + "epoch": 2.021258188110246, + "grad_norm": 0.1533752076835178, + "learning_rate": 9.86371797340749e-06, + "loss": 0.5385, + "step": 4089 + }, + { + "epoch": 2.021752564577926, + "grad_norm": 0.28344134595160203, + "learning_rate": 9.859824452727967e-06, + "loss": 0.5927, + "step": 4090 + }, + { + "epoch": 2.0222469410456063, + "grad_norm": 0.15899994673329296, + "learning_rate": 9.855930953302425e-06, + "loss": 0.5346, + "step": 4091 + }, + { + "epoch": 2.0227413175132862, + "grad_norm": 0.13362716040211362, + "learning_rate": 9.852037475721209e-06, + "loss": 0.5526, + "step": 4092 + }, + { + "epoch": 2.0232356939809666, + "grad_norm": 0.14148516688480278, + "learning_rate": 9.84814402057466e-06, + "loss": 0.5668, + "step": 4093 + }, + { + "epoch": 2.0237300704486465, + "grad_norm": 0.13587203840907494, + "learning_rate": 9.844250588453126e-06, + "loss": 0.5683, + "step": 4094 + }, + { + "epoch": 2.024224446916327, + "grad_norm": 0.137519722514532, + "learning_rate": 9.840357179946938e-06, + "loss": 0.5578, + "step": 4095 + }, + { + "epoch": 2.0247188233840068, + "grad_norm": 0.1285802826715201, + "learning_rate": 9.836463795646437e-06, + "loss": 0.5349, + "step": 4096 + }, + { + "epoch": 2.025213199851687, + "grad_norm": 0.14832768827133258, + "learning_rate": 9.832570436141948e-06, + "loss": 0.5962, + "step": 4097 + }, + { + "epoch": 2.025707576319367, + "grad_norm": 0.14327264798636058, + "learning_rate": 9.8286771020238e-06, + "loss": 0.5683, + "step": 4098 + }, + { + "epoch": 2.0262019527870474, + "grad_norm": 0.1446353356530882, + "learning_rate": 9.824783793882319e-06, + "loss": 0.5866, + "step": 4099 + }, + { + "epoch": 2.0266963292547273, + "grad_norm": 0.13271817416076467, + "learning_rate": 9.820890512307817e-06, + "loss": 0.5447, + "step": 4100 + }, + { + "epoch": 2.0271907057224077, + "grad_norm": 0.1387384849946903, + "learning_rate": 9.816997257890612e-06, + "loss": 0.5502, + "step": 4101 + }, + { + "epoch": 2.0276850821900876, + "grad_norm": 0.13438529577404398, + "learning_rate": 9.813104031221016e-06, + "loss": 0.5981, + "step": 4102 + }, + { + "epoch": 2.028179458657768, + "grad_norm": 0.13921267250737868, + "learning_rate": 9.809210832889338e-06, + "loss": 0.5815, + "step": 4103 + }, + { + "epoch": 2.028673835125448, + "grad_norm": 0.13463932287518662, + "learning_rate": 9.805317663485875e-06, + "loss": 0.5525, + "step": 4104 + }, + { + "epoch": 2.029168211593128, + "grad_norm": 0.13128579168711443, + "learning_rate": 9.801424523600928e-06, + "loss": 0.527, + "step": 4105 + }, + { + "epoch": 2.029662588060808, + "grad_norm": 0.13868051139242446, + "learning_rate": 9.797531413824787e-06, + "loss": 0.5549, + "step": 4106 + }, + { + "epoch": 2.0301569645284885, + "grad_norm": 0.1419733070470783, + "learning_rate": 9.793638334747747e-06, + "loss": 0.5828, + "step": 4107 + }, + { + "epoch": 2.0306513409961684, + "grad_norm": 0.14028506659560513, + "learning_rate": 9.78974528696009e-06, + "loss": 0.5656, + "step": 4108 + }, + { + "epoch": 2.0311457174638488, + "grad_norm": 0.14098000190011944, + "learning_rate": 9.785852271052092e-06, + "loss": 0.5639, + "step": 4109 + }, + { + "epoch": 2.0316400939315287, + "grad_norm": 0.13120549270387602, + "learning_rate": 9.781959287614032e-06, + "loss": 0.5369, + "step": 4110 + }, + { + "epoch": 2.032134470399209, + "grad_norm": 0.1441397277703467, + "learning_rate": 9.778066337236177e-06, + "loss": 0.5864, + "step": 4111 + }, + { + "epoch": 2.032628846866889, + "grad_norm": 0.14516198032269081, + "learning_rate": 9.77417342050879e-06, + "loss": 0.5793, + "step": 4112 + }, + { + "epoch": 2.0331232233345693, + "grad_norm": 0.14803948847411982, + "learning_rate": 9.770280538022137e-06, + "loss": 0.5713, + "step": 4113 + }, + { + "epoch": 2.033617599802249, + "grad_norm": 0.14154828125728655, + "learning_rate": 9.766387690366466e-06, + "loss": 0.5665, + "step": 4114 + }, + { + "epoch": 2.0341119762699296, + "grad_norm": 0.130807102856764, + "learning_rate": 9.762494878132033e-06, + "loss": 0.5499, + "step": 4115 + }, + { + "epoch": 2.0346063527376095, + "grad_norm": 0.1401853793853016, + "learning_rate": 9.758602101909074e-06, + "loss": 0.5716, + "step": 4116 + }, + { + "epoch": 2.03510072920529, + "grad_norm": 0.14324623685311458, + "learning_rate": 9.754709362287826e-06, + "loss": 0.5686, + "step": 4117 + }, + { + "epoch": 2.0355951056729698, + "grad_norm": 0.13592697545506768, + "learning_rate": 9.750816659858536e-06, + "loss": 0.5757, + "step": 4118 + }, + { + "epoch": 2.03608948214065, + "grad_norm": 0.14967620750143576, + "learning_rate": 9.746923995211417e-06, + "loss": 0.5479, + "step": 4119 + }, + { + "epoch": 2.0365838586083305, + "grad_norm": 0.140604996350216, + "learning_rate": 9.743031368936696e-06, + "loss": 0.586, + "step": 4120 + }, + { + "epoch": 2.0370782350760104, + "grad_norm": 0.13665317771740887, + "learning_rate": 9.739138781624586e-06, + "loss": 0.5739, + "step": 4121 + }, + { + "epoch": 2.0375726115436907, + "grad_norm": 0.1349594609547172, + "learning_rate": 9.735246233865302e-06, + "loss": 0.6042, + "step": 4122 + }, + { + "epoch": 2.0380669880113707, + "grad_norm": 0.1438606516397185, + "learning_rate": 9.731353726249038e-06, + "loss": 0.5479, + "step": 4123 + }, + { + "epoch": 2.038561364479051, + "grad_norm": 0.1321039121019582, + "learning_rate": 9.727461259366003e-06, + "loss": 0.582, + "step": 4124 + }, + { + "epoch": 2.039055740946731, + "grad_norm": 0.13521632144075185, + "learning_rate": 9.723568833806382e-06, + "loss": 0.5699, + "step": 4125 + }, + { + "epoch": 2.0395501174144113, + "grad_norm": 0.13513395316965338, + "learning_rate": 9.719676450160361e-06, + "loss": 0.5391, + "step": 4126 + }, + { + "epoch": 2.040044493882091, + "grad_norm": 0.14803130842496062, + "learning_rate": 9.71578410901812e-06, + "loss": 0.5953, + "step": 4127 + }, + { + "epoch": 2.0405388703497715, + "grad_norm": 0.14231477536394613, + "learning_rate": 9.711891810969826e-06, + "loss": 0.5709, + "step": 4128 + }, + { + "epoch": 2.0410332468174515, + "grad_norm": 0.14268380997921953, + "learning_rate": 9.707999556605653e-06, + "loss": 0.5525, + "step": 4129 + }, + { + "epoch": 2.041527623285132, + "grad_norm": 0.13959767277180685, + "learning_rate": 9.704107346515756e-06, + "loss": 0.5525, + "step": 4130 + }, + { + "epoch": 2.0420219997528117, + "grad_norm": 0.14366376344197435, + "learning_rate": 9.700215181290287e-06, + "loss": 0.5769, + "step": 4131 + }, + { + "epoch": 2.042516376220492, + "grad_norm": 0.13770020876207736, + "learning_rate": 9.696323061519397e-06, + "loss": 0.5571, + "step": 4132 + }, + { + "epoch": 2.043010752688172, + "grad_norm": 0.14285097122357965, + "learning_rate": 9.692430987793215e-06, + "loss": 0.5584, + "step": 4133 + }, + { + "epoch": 2.0435051291558524, + "grad_norm": 0.14618567102297403, + "learning_rate": 9.688538960701878e-06, + "loss": 0.5493, + "step": 4134 + }, + { + "epoch": 2.0439995056235323, + "grad_norm": 0.13921799276153993, + "learning_rate": 9.684646980835513e-06, + "loss": 0.5729, + "step": 4135 + }, + { + "epoch": 2.0444938820912126, + "grad_norm": 0.14395900014508817, + "learning_rate": 9.680755048784235e-06, + "loss": 0.5257, + "step": 4136 + }, + { + "epoch": 2.0449882585588925, + "grad_norm": 0.14235916282617103, + "learning_rate": 9.676863165138156e-06, + "loss": 0.5369, + "step": 4137 + }, + { + "epoch": 2.045482635026573, + "grad_norm": 0.1349565203501828, + "learning_rate": 9.672971330487375e-06, + "loss": 0.5564, + "step": 4138 + }, + { + "epoch": 2.045977011494253, + "grad_norm": 0.14768693196304178, + "learning_rate": 9.669079545421989e-06, + "loss": 0.5616, + "step": 4139 + }, + { + "epoch": 2.046471387961933, + "grad_norm": 0.14239177059862093, + "learning_rate": 9.66518781053209e-06, + "loss": 0.5778, + "step": 4140 + }, + { + "epoch": 2.046965764429613, + "grad_norm": 0.13531087642475034, + "learning_rate": 9.661296126407757e-06, + "loss": 0.5724, + "step": 4141 + }, + { + "epoch": 2.0474601408972934, + "grad_norm": 0.14236618159119588, + "learning_rate": 9.657404493639061e-06, + "loss": 0.5934, + "step": 4142 + }, + { + "epoch": 2.0479545173649734, + "grad_norm": 0.14185050339916652, + "learning_rate": 9.653512912816067e-06, + "loss": 0.5705, + "step": 4143 + }, + { + "epoch": 2.0484488938326537, + "grad_norm": 0.13398244812959112, + "learning_rate": 9.649621384528832e-06, + "loss": 0.5761, + "step": 4144 + }, + { + "epoch": 2.0489432703003336, + "grad_norm": 0.1432685739232881, + "learning_rate": 9.645729909367402e-06, + "loss": 0.5406, + "step": 4145 + }, + { + "epoch": 2.049437646768014, + "grad_norm": 0.13442488403433744, + "learning_rate": 9.641838487921827e-06, + "loss": 0.5747, + "step": 4146 + }, + { + "epoch": 2.049932023235694, + "grad_norm": 0.13494026847363308, + "learning_rate": 9.637947120782131e-06, + "loss": 0.5317, + "step": 4147 + }, + { + "epoch": 2.0504263997033743, + "grad_norm": 0.13651680699648636, + "learning_rate": 9.634055808538347e-06, + "loss": 0.5526, + "step": 4148 + }, + { + "epoch": 2.050920776171054, + "grad_norm": 0.14698910447912314, + "learning_rate": 9.630164551780484e-06, + "loss": 0.5541, + "step": 4149 + }, + { + "epoch": 2.0514151526387345, + "grad_norm": 0.13793996575122716, + "learning_rate": 9.626273351098547e-06, + "loss": 0.5881, + "step": 4150 + }, + { + "epoch": 2.0519095291064144, + "grad_norm": 0.1352071328407657, + "learning_rate": 9.622382207082548e-06, + "loss": 0.6034, + "step": 4151 + }, + { + "epoch": 2.052403905574095, + "grad_norm": 0.1453344260865873, + "learning_rate": 9.618491120322468e-06, + "loss": 0.5308, + "step": 4152 + }, + { + "epoch": 2.0528982820417747, + "grad_norm": 0.13551964339708825, + "learning_rate": 9.614600091408293e-06, + "loss": 0.5705, + "step": 4153 + }, + { + "epoch": 2.053392658509455, + "grad_norm": 0.14916269840716848, + "learning_rate": 9.610709120929993e-06, + "loss": 0.5817, + "step": 4154 + }, + { + "epoch": 2.053887034977135, + "grad_norm": 0.13954916690578958, + "learning_rate": 9.60681820947754e-06, + "loss": 0.5775, + "step": 4155 + }, + { + "epoch": 2.0543814114448153, + "grad_norm": 0.13553141812382158, + "learning_rate": 9.602927357640876e-06, + "loss": 0.5559, + "step": 4156 + }, + { + "epoch": 2.0548757879124953, + "grad_norm": 0.13470373786512407, + "learning_rate": 9.599036566009961e-06, + "loss": 0.5515, + "step": 4157 + }, + { + "epoch": 2.0553701643801756, + "grad_norm": 0.13926770866420754, + "learning_rate": 9.595145835174729e-06, + "loss": 0.5598, + "step": 4158 + }, + { + "epoch": 2.0558645408478555, + "grad_norm": 0.13374342048784335, + "learning_rate": 9.591255165725104e-06, + "loss": 0.54, + "step": 4159 + }, + { + "epoch": 2.056358917315536, + "grad_norm": 0.13574989860704564, + "learning_rate": 9.587364558251008e-06, + "loss": 0.5259, + "step": 4160 + }, + { + "epoch": 2.056853293783216, + "grad_norm": 0.14637948073139276, + "learning_rate": 9.583474013342347e-06, + "loss": 0.6153, + "step": 4161 + }, + { + "epoch": 2.057347670250896, + "grad_norm": 0.13740848194269853, + "learning_rate": 9.579583531589027e-06, + "loss": 0.557, + "step": 4162 + }, + { + "epoch": 2.057842046718576, + "grad_norm": 0.14786000003257574, + "learning_rate": 9.575693113580935e-06, + "loss": 0.5507, + "step": 4163 + }, + { + "epoch": 2.0583364231862564, + "grad_norm": 0.142643253483024, + "learning_rate": 9.57180275990795e-06, + "loss": 0.5544, + "step": 4164 + }, + { + "epoch": 2.0588307996539363, + "grad_norm": 0.13715351738865086, + "learning_rate": 9.567912471159949e-06, + "loss": 0.5559, + "step": 4165 + }, + { + "epoch": 2.0593251761216167, + "grad_norm": 0.13533824406435013, + "learning_rate": 9.564022247926786e-06, + "loss": 0.6005, + "step": 4166 + }, + { + "epoch": 2.0598195525892966, + "grad_norm": 0.14452137405699145, + "learning_rate": 9.560132090798314e-06, + "loss": 0.5471, + "step": 4167 + }, + { + "epoch": 2.060313929056977, + "grad_norm": 0.1374791085250075, + "learning_rate": 9.556242000364378e-06, + "loss": 0.5673, + "step": 4168 + }, + { + "epoch": 2.060808305524657, + "grad_norm": 0.14097680363661558, + "learning_rate": 9.552351977214806e-06, + "loss": 0.5546, + "step": 4169 + }, + { + "epoch": 2.0613026819923372, + "grad_norm": 0.13719764214698935, + "learning_rate": 9.54846202193942e-06, + "loss": 0.5625, + "step": 4170 + }, + { + "epoch": 2.061797058460017, + "grad_norm": 0.13601794477843632, + "learning_rate": 9.544572135128034e-06, + "loss": 0.5386, + "step": 4171 + }, + { + "epoch": 2.0622914349276975, + "grad_norm": 0.13337073029885177, + "learning_rate": 9.540682317370436e-06, + "loss": 0.5657, + "step": 4172 + }, + { + "epoch": 2.0627858113953774, + "grad_norm": 0.13982737464533454, + "learning_rate": 9.536792569256429e-06, + "loss": 0.5577, + "step": 4173 + }, + { + "epoch": 2.063280187863058, + "grad_norm": 0.1421272109771025, + "learning_rate": 9.532902891375788e-06, + "loss": 0.5785, + "step": 4174 + }, + { + "epoch": 2.0637745643307377, + "grad_norm": 0.1418196892923885, + "learning_rate": 9.52901328431828e-06, + "loss": 0.5728, + "step": 4175 + }, + { + "epoch": 2.064268940798418, + "grad_norm": 0.13790739313654155, + "learning_rate": 9.525123748673663e-06, + "loss": 0.6009, + "step": 4176 + }, + { + "epoch": 2.064763317266098, + "grad_norm": 0.13763647100154652, + "learning_rate": 9.521234285031682e-06, + "loss": 0.5528, + "step": 4177 + }, + { + "epoch": 2.0652576937337783, + "grad_norm": 0.13761179813353053, + "learning_rate": 9.51734489398208e-06, + "loss": 0.5582, + "step": 4178 + }, + { + "epoch": 2.0657520702014582, + "grad_norm": 0.1316896544808256, + "learning_rate": 9.513455576114575e-06, + "loss": 0.5653, + "step": 4179 + }, + { + "epoch": 2.0662464466691386, + "grad_norm": 0.13780145153729362, + "learning_rate": 9.509566332018885e-06, + "loss": 0.5422, + "step": 4180 + }, + { + "epoch": 2.0667408231368185, + "grad_norm": 0.13543712036688568, + "learning_rate": 9.505677162284713e-06, + "loss": 0.5651, + "step": 4181 + }, + { + "epoch": 2.067235199604499, + "grad_norm": 0.1373619150662551, + "learning_rate": 9.501788067501748e-06, + "loss": 0.5561, + "step": 4182 + }, + { + "epoch": 2.0677295760721788, + "grad_norm": 0.13758691158060166, + "learning_rate": 9.497899048259668e-06, + "loss": 0.5662, + "step": 4183 + }, + { + "epoch": 2.068223952539859, + "grad_norm": 0.1429045199989255, + "learning_rate": 9.494010105148148e-06, + "loss": 0.5735, + "step": 4184 + }, + { + "epoch": 2.068718329007539, + "grad_norm": 0.13476212640335133, + "learning_rate": 9.49012123875684e-06, + "loss": 0.541, + "step": 4185 + }, + { + "epoch": 2.0692127054752194, + "grad_norm": 0.13816814071414735, + "learning_rate": 9.48623244967539e-06, + "loss": 0.5507, + "step": 4186 + }, + { + "epoch": 2.0697070819428993, + "grad_norm": 0.13980838038697233, + "learning_rate": 9.482343738493436e-06, + "loss": 0.6038, + "step": 4187 + }, + { + "epoch": 2.0702014584105797, + "grad_norm": 0.14145540124818218, + "learning_rate": 9.478455105800594e-06, + "loss": 0.6382, + "step": 4188 + }, + { + "epoch": 2.0706958348782596, + "grad_norm": 0.1570918092992698, + "learning_rate": 9.47456655218648e-06, + "loss": 0.5585, + "step": 4189 + }, + { + "epoch": 2.07119021134594, + "grad_norm": 0.13828977352451033, + "learning_rate": 9.47067807824069e-06, + "loss": 0.5512, + "step": 4190 + }, + { + "epoch": 2.07168458781362, + "grad_norm": 0.14260924332709846, + "learning_rate": 9.466789684552808e-06, + "loss": 0.5427, + "step": 4191 + }, + { + "epoch": 2.0721789642813, + "grad_norm": 0.1422239638829419, + "learning_rate": 9.462901371712408e-06, + "loss": 0.5474, + "step": 4192 + }, + { + "epoch": 2.07267334074898, + "grad_norm": 0.1535802034472443, + "learning_rate": 9.459013140309052e-06, + "loss": 0.5878, + "step": 4193 + }, + { + "epoch": 2.0731677172166605, + "grad_norm": 0.13796923444360062, + "learning_rate": 9.455124990932289e-06, + "loss": 0.5602, + "step": 4194 + }, + { + "epoch": 2.073662093684341, + "grad_norm": 0.14591625223535068, + "learning_rate": 9.451236924171657e-06, + "loss": 0.5351, + "step": 4195 + }, + { + "epoch": 2.0741564701520208, + "grad_norm": 0.1547554154444469, + "learning_rate": 9.447348940616683e-06, + "loss": 0.592, + "step": 4196 + }, + { + "epoch": 2.074650846619701, + "grad_norm": 0.14562665122492507, + "learning_rate": 9.443461040856873e-06, + "loss": 0.5593, + "step": 4197 + }, + { + "epoch": 2.075145223087381, + "grad_norm": 0.14532805470124108, + "learning_rate": 9.439573225481729e-06, + "loss": 0.5618, + "step": 4198 + }, + { + "epoch": 2.0756395995550614, + "grad_norm": 0.13957951843845268, + "learning_rate": 9.435685495080731e-06, + "loss": 0.5786, + "step": 4199 + }, + { + "epoch": 2.0761339760227413, + "grad_norm": 0.14018800845429222, + "learning_rate": 9.431797850243367e-06, + "loss": 0.578, + "step": 4200 + }, + { + "epoch": 2.0766283524904217, + "grad_norm": 0.15188267486584134, + "learning_rate": 9.427910291559083e-06, + "loss": 0.5686, + "step": 4201 + }, + { + "epoch": 2.0771227289581016, + "grad_norm": 0.14377341858822748, + "learning_rate": 9.424022819617332e-06, + "loss": 0.5592, + "step": 4202 + }, + { + "epoch": 2.077617105425782, + "grad_norm": 0.1429674725860256, + "learning_rate": 9.420135435007547e-06, + "loss": 0.5645, + "step": 4203 + }, + { + "epoch": 2.078111481893462, + "grad_norm": 0.13813288525777215, + "learning_rate": 9.416248138319152e-06, + "loss": 0.5643, + "step": 4204 + }, + { + "epoch": 2.078605858361142, + "grad_norm": 0.14221778445907898, + "learning_rate": 9.412360930141544e-06, + "loss": 0.5598, + "step": 4205 + }, + { + "epoch": 2.079100234828822, + "grad_norm": 0.1368327167921205, + "learning_rate": 9.40847381106413e-06, + "loss": 0.5725, + "step": 4206 + }, + { + "epoch": 2.0795946112965025, + "grad_norm": 0.13873942576571494, + "learning_rate": 9.404586781676286e-06, + "loss": 0.5466, + "step": 4207 + }, + { + "epoch": 2.0800889877641824, + "grad_norm": 0.13872445982079523, + "learning_rate": 9.400699842567376e-06, + "loss": 0.5465, + "step": 4208 + }, + { + "epoch": 2.0805833642318627, + "grad_norm": 0.14164242302974092, + "learning_rate": 9.396812994326756e-06, + "loss": 0.5646, + "step": 4209 + }, + { + "epoch": 2.0810777406995427, + "grad_norm": 0.13411221676188426, + "learning_rate": 9.392926237543765e-06, + "loss": 0.524, + "step": 4210 + }, + { + "epoch": 2.081572117167223, + "grad_norm": 0.13071198803505443, + "learning_rate": 9.389039572807727e-06, + "loss": 0.5535, + "step": 4211 + }, + { + "epoch": 2.082066493634903, + "grad_norm": 0.1487440871827829, + "learning_rate": 9.385153000707958e-06, + "loss": 0.5896, + "step": 4212 + }, + { + "epoch": 2.0825608701025833, + "grad_norm": 0.13729283835714504, + "learning_rate": 9.381266521833751e-06, + "loss": 0.5451, + "step": 4213 + }, + { + "epoch": 2.083055246570263, + "grad_norm": 0.13977290957080243, + "learning_rate": 9.377380136774394e-06, + "loss": 0.5643, + "step": 4214 + }, + { + "epoch": 2.0835496230379436, + "grad_norm": 0.1356884949572954, + "learning_rate": 9.373493846119153e-06, + "loss": 0.6019, + "step": 4215 + }, + { + "epoch": 2.0840439995056235, + "grad_norm": 0.3182640872496196, + "learning_rate": 9.36960765045728e-06, + "loss": 0.5753, + "step": 4216 + }, + { + "epoch": 2.084538375973304, + "grad_norm": 0.14656931264514034, + "learning_rate": 9.365721550378021e-06, + "loss": 0.5567, + "step": 4217 + }, + { + "epoch": 2.0850327524409837, + "grad_norm": 0.1415855071508352, + "learning_rate": 9.3618355464706e-06, + "loss": 0.556, + "step": 4218 + }, + { + "epoch": 2.085527128908664, + "grad_norm": 0.13687144993380634, + "learning_rate": 9.357949639324229e-06, + "loss": 0.5406, + "step": 4219 + }, + { + "epoch": 2.086021505376344, + "grad_norm": 0.13887201475788566, + "learning_rate": 9.354063829528105e-06, + "loss": 0.5335, + "step": 4220 + }, + { + "epoch": 2.0865158818440244, + "grad_norm": 0.13509146916379028, + "learning_rate": 9.350178117671405e-06, + "loss": 0.5701, + "step": 4221 + }, + { + "epoch": 2.0870102583117043, + "grad_norm": 0.14855946606645132, + "learning_rate": 9.346292504343306e-06, + "loss": 0.5432, + "step": 4222 + }, + { + "epoch": 2.0875046347793846, + "grad_norm": 0.13799303703259513, + "learning_rate": 9.342406990132954e-06, + "loss": 0.6082, + "step": 4223 + }, + { + "epoch": 2.0879990112470646, + "grad_norm": 0.13740211101857377, + "learning_rate": 9.338521575629487e-06, + "loss": 0.5716, + "step": 4224 + }, + { + "epoch": 2.088493387714745, + "grad_norm": 0.13723304613291196, + "learning_rate": 9.334636261422027e-06, + "loss": 0.5325, + "step": 4225 + }, + { + "epoch": 2.088987764182425, + "grad_norm": 0.1395350984936875, + "learning_rate": 9.33075104809968e-06, + "loss": 0.5539, + "step": 4226 + }, + { + "epoch": 2.089482140650105, + "grad_norm": 0.1407968904549311, + "learning_rate": 9.326865936251537e-06, + "loss": 0.585, + "step": 4227 + }, + { + "epoch": 2.089976517117785, + "grad_norm": 0.15110975569216212, + "learning_rate": 9.322980926466678e-06, + "loss": 0.6024, + "step": 4228 + }, + { + "epoch": 2.0904708935854655, + "grad_norm": 0.13154261209911094, + "learning_rate": 9.319096019334163e-06, + "loss": 0.5729, + "step": 4229 + }, + { + "epoch": 2.0909652700531454, + "grad_norm": 0.1447057739407865, + "learning_rate": 9.315211215443037e-06, + "loss": 0.5833, + "step": 4230 + }, + { + "epoch": 2.0914596465208257, + "grad_norm": 0.142432231195333, + "learning_rate": 9.311326515382326e-06, + "loss": 0.61, + "step": 4231 + }, + { + "epoch": 2.0919540229885056, + "grad_norm": 0.1445741287711705, + "learning_rate": 9.307441919741041e-06, + "loss": 0.5727, + "step": 4232 + }, + { + "epoch": 2.092448399456186, + "grad_norm": 0.13609161647468146, + "learning_rate": 9.303557429108193e-06, + "loss": 0.5563, + "step": 4233 + }, + { + "epoch": 2.092942775923866, + "grad_norm": 0.13720661449850136, + "learning_rate": 9.299673044072753e-06, + "loss": 0.564, + "step": 4234 + }, + { + "epoch": 2.0934371523915463, + "grad_norm": 0.13948698715233657, + "learning_rate": 9.295788765223692e-06, + "loss": 0.5716, + "step": 4235 + }, + { + "epoch": 2.093931528859226, + "grad_norm": 0.13465889501994482, + "learning_rate": 9.291904593149957e-06, + "loss": 0.5563, + "step": 4236 + }, + { + "epoch": 2.0944259053269065, + "grad_norm": 0.13430454511762752, + "learning_rate": 9.288020528440484e-06, + "loss": 0.563, + "step": 4237 + }, + { + "epoch": 2.0949202817945864, + "grad_norm": 0.1355335234158672, + "learning_rate": 9.284136571684183e-06, + "loss": 0.5906, + "step": 4238 + }, + { + "epoch": 2.095414658262267, + "grad_norm": 0.140511370556862, + "learning_rate": 9.280252723469965e-06, + "loss": 0.603, + "step": 4239 + }, + { + "epoch": 2.0959090347299467, + "grad_norm": 0.13629818923180986, + "learning_rate": 9.276368984386715e-06, + "loss": 0.5719, + "step": 4240 + }, + { + "epoch": 2.096403411197627, + "grad_norm": 0.1372814291072013, + "learning_rate": 9.272485355023293e-06, + "loss": 0.5693, + "step": 4241 + }, + { + "epoch": 2.096897787665307, + "grad_norm": 0.13737915545196674, + "learning_rate": 9.268601835968555e-06, + "loss": 0.5398, + "step": 4242 + }, + { + "epoch": 2.0973921641329873, + "grad_norm": 0.12926808265544124, + "learning_rate": 9.264718427811333e-06, + "loss": 0.5689, + "step": 4243 + }, + { + "epoch": 2.0978865406006673, + "grad_norm": 0.14205629260338637, + "learning_rate": 9.260835131140448e-06, + "loss": 0.591, + "step": 4244 + }, + { + "epoch": 2.0983809170683476, + "grad_norm": 0.1415817125881677, + "learning_rate": 9.256951946544701e-06, + "loss": 0.5758, + "step": 4245 + }, + { + "epoch": 2.0988752935360275, + "grad_norm": 0.13558911720380185, + "learning_rate": 9.253068874612876e-06, + "loss": 0.584, + "step": 4246 + }, + { + "epoch": 2.099369670003708, + "grad_norm": 0.13830189273150464, + "learning_rate": 9.24918591593374e-06, + "loss": 0.5366, + "step": 4247 + }, + { + "epoch": 2.099864046471388, + "grad_norm": 0.1398331711719029, + "learning_rate": 9.245303071096038e-06, + "loss": 0.5554, + "step": 4248 + }, + { + "epoch": 2.100358422939068, + "grad_norm": 0.1348702622961078, + "learning_rate": 9.241420340688507e-06, + "loss": 0.5837, + "step": 4249 + }, + { + "epoch": 2.100852799406748, + "grad_norm": 0.13653879533593846, + "learning_rate": 9.237537725299861e-06, + "loss": 0.5746, + "step": 4250 + }, + { + "epoch": 2.1013471758744284, + "grad_norm": 0.1348486288958463, + "learning_rate": 9.2336552255188e-06, + "loss": 0.577, + "step": 4251 + }, + { + "epoch": 2.1018415523421083, + "grad_norm": 0.13167043335007722, + "learning_rate": 9.229772841934e-06, + "loss": 0.5372, + "step": 4252 + }, + { + "epoch": 2.1023359288097887, + "grad_norm": 0.130255843297564, + "learning_rate": 9.225890575134128e-06, + "loss": 0.5613, + "step": 4253 + }, + { + "epoch": 2.1028303052774686, + "grad_norm": 0.1355868602297709, + "learning_rate": 9.222008425707822e-06, + "loss": 0.5525, + "step": 4254 + }, + { + "epoch": 2.103324681745149, + "grad_norm": 0.13976665790351234, + "learning_rate": 9.218126394243716e-06, + "loss": 0.5854, + "step": 4255 + }, + { + "epoch": 2.103819058212829, + "grad_norm": 0.13104404247195112, + "learning_rate": 9.214244481330419e-06, + "loss": 0.5404, + "step": 4256 + }, + { + "epoch": 2.1043134346805092, + "grad_norm": 0.13634451718738586, + "learning_rate": 9.210362687556518e-06, + "loss": 0.61, + "step": 4257 + }, + { + "epoch": 2.104807811148189, + "grad_norm": 0.1347588230382206, + "learning_rate": 9.20648101351059e-06, + "loss": 0.545, + "step": 4258 + }, + { + "epoch": 2.1053021876158695, + "grad_norm": 0.1411016799077994, + "learning_rate": 9.202599459781183e-06, + "loss": 0.5641, + "step": 4259 + }, + { + "epoch": 2.1057965640835494, + "grad_norm": 0.13526411731829455, + "learning_rate": 9.19871802695684e-06, + "loss": 0.5843, + "step": 4260 + }, + { + "epoch": 2.10629094055123, + "grad_norm": 0.14348139962437184, + "learning_rate": 9.19483671562608e-06, + "loss": 0.5833, + "step": 4261 + }, + { + "epoch": 2.1067853170189097, + "grad_norm": 0.13757956808398825, + "learning_rate": 9.1909555263774e-06, + "loss": 0.5661, + "step": 4262 + }, + { + "epoch": 2.10727969348659, + "grad_norm": 0.14010682584830844, + "learning_rate": 9.187074459799285e-06, + "loss": 0.5809, + "step": 4263 + }, + { + "epoch": 2.10777406995427, + "grad_norm": 0.13853114665614816, + "learning_rate": 9.183193516480193e-06, + "loss": 0.5963, + "step": 4264 + }, + { + "epoch": 2.1082684464219503, + "grad_norm": 0.14396179172672083, + "learning_rate": 9.179312697008569e-06, + "loss": 0.5697, + "step": 4265 + }, + { + "epoch": 2.1087628228896302, + "grad_norm": 0.1452247304872144, + "learning_rate": 9.17543200197284e-06, + "loss": 0.5459, + "step": 4266 + }, + { + "epoch": 2.1092571993573106, + "grad_norm": 0.1433294528316872, + "learning_rate": 9.171551431961416e-06, + "loss": 0.5884, + "step": 4267 + }, + { + "epoch": 2.1097515758249905, + "grad_norm": 0.14438164415822952, + "learning_rate": 9.167670987562677e-06, + "loss": 0.5837, + "step": 4268 + }, + { + "epoch": 2.110245952292671, + "grad_norm": 0.1393509455559721, + "learning_rate": 9.163790669364998e-06, + "loss": 0.5809, + "step": 4269 + }, + { + "epoch": 2.1107403287603512, + "grad_norm": 0.14784239817802047, + "learning_rate": 9.159910477956724e-06, + "loss": 0.5742, + "step": 4270 + }, + { + "epoch": 2.111234705228031, + "grad_norm": 0.14052085296797695, + "learning_rate": 9.156030413926188e-06, + "loss": 0.5611, + "step": 4271 + }, + { + "epoch": 2.1117290816957115, + "grad_norm": 0.14122208493521818, + "learning_rate": 9.152150477861701e-06, + "loss": 0.5539, + "step": 4272 + }, + { + "epoch": 2.1122234581633914, + "grad_norm": 0.13463484210988422, + "learning_rate": 9.148270670351552e-06, + "loss": 0.5507, + "step": 4273 + }, + { + "epoch": 2.1127178346310718, + "grad_norm": 0.1455279163860856, + "learning_rate": 9.144390991984014e-06, + "loss": 0.5746, + "step": 4274 + }, + { + "epoch": 2.1132122110987517, + "grad_norm": 0.1404307913524575, + "learning_rate": 9.140511443347341e-06, + "loss": 0.5636, + "step": 4275 + }, + { + "epoch": 2.113706587566432, + "grad_norm": 0.1326864761315675, + "learning_rate": 9.136632025029762e-06, + "loss": 0.5474, + "step": 4276 + }, + { + "epoch": 2.114200964034112, + "grad_norm": 0.14926558634070736, + "learning_rate": 9.132752737619493e-06, + "loss": 0.576, + "step": 4277 + }, + { + "epoch": 2.1146953405017923, + "grad_norm": 0.13693625186755864, + "learning_rate": 9.128873581704726e-06, + "loss": 0.5564, + "step": 4278 + }, + { + "epoch": 2.1151897169694722, + "grad_norm": 0.1341580302085258, + "learning_rate": 9.124994557873638e-06, + "loss": 0.5624, + "step": 4279 + }, + { + "epoch": 2.1156840934371526, + "grad_norm": 0.145325334750497, + "learning_rate": 9.121115666714375e-06, + "loss": 0.5985, + "step": 4280 + }, + { + "epoch": 2.1161784699048325, + "grad_norm": 0.13984633505233435, + "learning_rate": 9.11723690881507e-06, + "loss": 0.5406, + "step": 4281 + }, + { + "epoch": 2.116672846372513, + "grad_norm": 0.1406823731224702, + "learning_rate": 9.113358284763846e-06, + "loss": 0.5802, + "step": 4282 + }, + { + "epoch": 2.1171672228401928, + "grad_norm": 0.14021079356406554, + "learning_rate": 9.109479795148787e-06, + "loss": 0.5519, + "step": 4283 + }, + { + "epoch": 2.117661599307873, + "grad_norm": 0.1390452340028371, + "learning_rate": 9.105601440557966e-06, + "loss": 0.5539, + "step": 4284 + }, + { + "epoch": 2.118155975775553, + "grad_norm": 0.14120113122469874, + "learning_rate": 9.101723221579437e-06, + "loss": 0.5623, + "step": 4285 + }, + { + "epoch": 2.1186503522432334, + "grad_norm": 0.14749615857014844, + "learning_rate": 9.097845138801232e-06, + "loss": 0.546, + "step": 4286 + }, + { + "epoch": 2.1191447287109133, + "grad_norm": 0.14625940368737814, + "learning_rate": 9.093967192811351e-06, + "loss": 0.5782, + "step": 4287 + }, + { + "epoch": 2.1196391051785937, + "grad_norm": 0.1400806044075594, + "learning_rate": 9.090089384197798e-06, + "loss": 0.5515, + "step": 4288 + }, + { + "epoch": 2.1201334816462736, + "grad_norm": 0.13622917883573066, + "learning_rate": 9.086211713548537e-06, + "loss": 0.5684, + "step": 4289 + }, + { + "epoch": 2.120627858113954, + "grad_norm": 0.1448632058217366, + "learning_rate": 9.082334181451514e-06, + "loss": 0.5356, + "step": 4290 + }, + { + "epoch": 2.121122234581634, + "grad_norm": 0.12954501794519413, + "learning_rate": 9.078456788494654e-06, + "loss": 0.5686, + "step": 4291 + }, + { + "epoch": 2.121616611049314, + "grad_norm": 0.1400298063873926, + "learning_rate": 9.074579535265864e-06, + "loss": 0.5559, + "step": 4292 + }, + { + "epoch": 2.122110987516994, + "grad_norm": 0.1405927708222211, + "learning_rate": 9.070702422353033e-06, + "loss": 0.5867, + "step": 4293 + }, + { + "epoch": 2.1226053639846745, + "grad_norm": 0.13696047554488228, + "learning_rate": 9.066825450344022e-06, + "loss": 0.5817, + "step": 4294 + }, + { + "epoch": 2.1230997404523544, + "grad_norm": 0.1361541365721378, + "learning_rate": 9.062948619826673e-06, + "loss": 0.5742, + "step": 4295 + }, + { + "epoch": 2.1235941169200347, + "grad_norm": 0.14413495785540162, + "learning_rate": 9.059071931388808e-06, + "loss": 0.5636, + "step": 4296 + }, + { + "epoch": 2.1240884933877147, + "grad_norm": 0.1366983517138749, + "learning_rate": 9.055195385618221e-06, + "loss": 0.5623, + "step": 4297 + }, + { + "epoch": 2.124582869855395, + "grad_norm": 0.1391090822924653, + "learning_rate": 9.05131898310269e-06, + "loss": 0.5897, + "step": 4298 + }, + { + "epoch": 2.125077246323075, + "grad_norm": 0.14166854353291528, + "learning_rate": 9.047442724429977e-06, + "loss": 0.5703, + "step": 4299 + }, + { + "epoch": 2.1255716227907553, + "grad_norm": 0.1387322257736246, + "learning_rate": 9.043566610187812e-06, + "loss": 0.558, + "step": 4300 + }, + { + "epoch": 2.126065999258435, + "grad_norm": 0.14603446856785404, + "learning_rate": 9.039690640963906e-06, + "loss": 0.5624, + "step": 4301 + }, + { + "epoch": 2.1265603757261156, + "grad_norm": 0.14228312097332624, + "learning_rate": 9.035814817345951e-06, + "loss": 0.572, + "step": 4302 + }, + { + "epoch": 2.1270547521937955, + "grad_norm": 0.1502259041141741, + "learning_rate": 9.03193913992161e-06, + "loss": 0.5377, + "step": 4303 + }, + { + "epoch": 2.127549128661476, + "grad_norm": 0.14563464699093356, + "learning_rate": 9.028063609278537e-06, + "loss": 0.5813, + "step": 4304 + }, + { + "epoch": 2.1280435051291557, + "grad_norm": 0.14508483963878072, + "learning_rate": 9.024188226004353e-06, + "loss": 0.558, + "step": 4305 + }, + { + "epoch": 2.128537881596836, + "grad_norm": 0.1374500887534965, + "learning_rate": 9.020312990686654e-06, + "loss": 0.5614, + "step": 4306 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.1377592487077335, + "learning_rate": 9.016437903913022e-06, + "loss": 0.607, + "step": 4307 + }, + { + "epoch": 2.1295266345321964, + "grad_norm": 0.13893846929527048, + "learning_rate": 9.012562966271014e-06, + "loss": 0.555, + "step": 4308 + }, + { + "epoch": 2.1300210109998763, + "grad_norm": 0.14327756865929736, + "learning_rate": 9.00868817834816e-06, + "loss": 0.5752, + "step": 4309 + }, + { + "epoch": 2.1305153874675566, + "grad_norm": 0.15335215370193506, + "learning_rate": 9.004813540731976e-06, + "loss": 0.5734, + "step": 4310 + }, + { + "epoch": 2.1310097639352366, + "grad_norm": 0.1398110970926819, + "learning_rate": 9.000939054009947e-06, + "loss": 0.5332, + "step": 4311 + }, + { + "epoch": 2.131504140402917, + "grad_norm": 0.15897918446510817, + "learning_rate": 8.99706471876954e-06, + "loss": 0.5352, + "step": 4312 + }, + { + "epoch": 2.131998516870597, + "grad_norm": 0.1458464132027153, + "learning_rate": 8.993190535598196e-06, + "loss": 0.5616, + "step": 4313 + }, + { + "epoch": 2.132492893338277, + "grad_norm": 0.14284411029904365, + "learning_rate": 8.989316505083328e-06, + "loss": 0.5361, + "step": 4314 + }, + { + "epoch": 2.132987269805957, + "grad_norm": 0.14361594461110938, + "learning_rate": 8.985442627812345e-06, + "loss": 0.5357, + "step": 4315 + }, + { + "epoch": 2.1334816462736375, + "grad_norm": 0.14029544040537847, + "learning_rate": 8.981568904372612e-06, + "loss": 0.5854, + "step": 4316 + }, + { + "epoch": 2.1339760227413174, + "grad_norm": 0.14324047500159562, + "learning_rate": 8.977695335351479e-06, + "loss": 0.5599, + "step": 4317 + }, + { + "epoch": 2.1344703992089977, + "grad_norm": 0.1376150923034715, + "learning_rate": 8.973821921336273e-06, + "loss": 0.5613, + "step": 4318 + }, + { + "epoch": 2.1349647756766776, + "grad_norm": 0.14388786855203115, + "learning_rate": 8.969948662914297e-06, + "loss": 0.5767, + "step": 4319 + }, + { + "epoch": 2.135459152144358, + "grad_norm": 0.1417825932047592, + "learning_rate": 8.966075560672823e-06, + "loss": 0.5981, + "step": 4320 + }, + { + "epoch": 2.135953528612038, + "grad_norm": 0.14336662017793986, + "learning_rate": 8.962202615199116e-06, + "loss": 0.5748, + "step": 4321 + }, + { + "epoch": 2.1364479050797183, + "grad_norm": 0.1378384900148898, + "learning_rate": 8.958329827080406e-06, + "loss": 0.5532, + "step": 4322 + }, + { + "epoch": 2.136942281547398, + "grad_norm": 0.1325049136893976, + "learning_rate": 8.954457196903897e-06, + "loss": 0.5622, + "step": 4323 + }, + { + "epoch": 2.1374366580150785, + "grad_norm": 0.13739515754354364, + "learning_rate": 8.950584725256774e-06, + "loss": 0.5819, + "step": 4324 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 0.1481060776295292, + "learning_rate": 8.946712412726193e-06, + "loss": 0.5805, + "step": 4325 + }, + { + "epoch": 2.138425410950439, + "grad_norm": 0.13563130990749267, + "learning_rate": 8.942840259899298e-06, + "loss": 0.597, + "step": 4326 + }, + { + "epoch": 2.1389197874181187, + "grad_norm": 0.14057789692198955, + "learning_rate": 8.938968267363195e-06, + "loss": 0.5689, + "step": 4327 + }, + { + "epoch": 2.139414163885799, + "grad_norm": 0.1442003923955781, + "learning_rate": 8.93509643570497e-06, + "loss": 0.5869, + "step": 4328 + }, + { + "epoch": 2.139908540353479, + "grad_norm": 0.13130729258365764, + "learning_rate": 8.93122476551169e-06, + "loss": 0.5697, + "step": 4329 + }, + { + "epoch": 2.1404029168211594, + "grad_norm": 0.1421392668921455, + "learning_rate": 8.927353257370388e-06, + "loss": 0.5664, + "step": 4330 + }, + { + "epoch": 2.1408972932888393, + "grad_norm": 0.14574463395598689, + "learning_rate": 8.923481911868078e-06, + "loss": 0.547, + "step": 4331 + }, + { + "epoch": 2.1413916697565196, + "grad_norm": 0.13245963170669237, + "learning_rate": 8.919610729591754e-06, + "loss": 0.5706, + "step": 4332 + }, + { + "epoch": 2.1418860462241995, + "grad_norm": 0.3560826175419385, + "learning_rate": 8.915739711128376e-06, + "loss": 0.6, + "step": 4333 + }, + { + "epoch": 2.14238042269188, + "grad_norm": 0.1339876546140954, + "learning_rate": 8.911868857064885e-06, + "loss": 0.5772, + "step": 4334 + }, + { + "epoch": 2.14287479915956, + "grad_norm": 0.14221225266435195, + "learning_rate": 8.907998167988195e-06, + "loss": 0.6047, + "step": 4335 + }, + { + "epoch": 2.14336917562724, + "grad_norm": 0.1391554414038667, + "learning_rate": 8.90412764448519e-06, + "loss": 0.5677, + "step": 4336 + }, + { + "epoch": 2.14386355209492, + "grad_norm": 0.14809255571663388, + "learning_rate": 8.900257287142744e-06, + "loss": 0.613, + "step": 4337 + }, + { + "epoch": 2.1443579285626004, + "grad_norm": 0.1378827751377483, + "learning_rate": 8.896387096547693e-06, + "loss": 0.5589, + "step": 4338 + }, + { + "epoch": 2.1448523050302803, + "grad_norm": 0.14387969214914748, + "learning_rate": 8.892517073286847e-06, + "loss": 0.5601, + "step": 4339 + }, + { + "epoch": 2.1453466814979607, + "grad_norm": 0.13467048450102515, + "learning_rate": 8.888647217946997e-06, + "loss": 0.5772, + "step": 4340 + }, + { + "epoch": 2.145841057965641, + "grad_norm": 0.13748381754386602, + "learning_rate": 8.884777531114902e-06, + "loss": 0.5842, + "step": 4341 + }, + { + "epoch": 2.146335434433321, + "grad_norm": 0.1379545506646214, + "learning_rate": 8.880908013377307e-06, + "loss": 0.5427, + "step": 4342 + }, + { + "epoch": 2.146829810901001, + "grad_norm": 0.13518643618967216, + "learning_rate": 8.877038665320918e-06, + "loss": 0.5839, + "step": 4343 + }, + { + "epoch": 2.1473241873686812, + "grad_norm": 0.1418982128104446, + "learning_rate": 8.873169487532425e-06, + "loss": 0.5719, + "step": 4344 + }, + { + "epoch": 2.1478185638363616, + "grad_norm": 0.14203249093627376, + "learning_rate": 8.869300480598486e-06, + "loss": 0.5813, + "step": 4345 + }, + { + "epoch": 2.1483129403040415, + "grad_norm": 0.1375581827401386, + "learning_rate": 8.865431645105734e-06, + "loss": 0.5657, + "step": 4346 + }, + { + "epoch": 2.1488073167717214, + "grad_norm": 0.14186396676459753, + "learning_rate": 8.861562981640776e-06, + "loss": 0.5759, + "step": 4347 + }, + { + "epoch": 2.149301693239402, + "grad_norm": 0.14250728939410676, + "learning_rate": 8.8576944907902e-06, + "loss": 0.5743, + "step": 4348 + }, + { + "epoch": 2.149796069707082, + "grad_norm": 0.14214506302376012, + "learning_rate": 8.853826173140559e-06, + "loss": 0.5558, + "step": 4349 + }, + { + "epoch": 2.150290446174762, + "grad_norm": 0.14196755128588134, + "learning_rate": 8.849958029278383e-06, + "loss": 0.6052, + "step": 4350 + }, + { + "epoch": 2.1507848226424424, + "grad_norm": 0.1376978620699883, + "learning_rate": 8.846090059790176e-06, + "loss": 0.5665, + "step": 4351 + }, + { + "epoch": 2.1512791991101223, + "grad_norm": 0.1446223610310357, + "learning_rate": 8.84222226526241e-06, + "loss": 0.5781, + "step": 4352 + }, + { + "epoch": 2.1517735755778027, + "grad_norm": 0.14323274185640728, + "learning_rate": 8.838354646281544e-06, + "loss": 0.5497, + "step": 4353 + }, + { + "epoch": 2.1522679520454826, + "grad_norm": 0.13282959696988086, + "learning_rate": 8.834487203433998e-06, + "loss": 0.5381, + "step": 4354 + }, + { + "epoch": 2.152762328513163, + "grad_norm": 0.13632211265698216, + "learning_rate": 8.830619937306168e-06, + "loss": 0.5617, + "step": 4355 + }, + { + "epoch": 2.153256704980843, + "grad_norm": 0.14508124774468117, + "learning_rate": 8.826752848484425e-06, + "loss": 0.5799, + "step": 4356 + }, + { + "epoch": 2.1537510814485232, + "grad_norm": 0.1425861193697375, + "learning_rate": 8.822885937555113e-06, + "loss": 0.5682, + "step": 4357 + }, + { + "epoch": 2.154245457916203, + "grad_norm": 0.13771186422748455, + "learning_rate": 8.819019205104544e-06, + "loss": 0.5205, + "step": 4358 + }, + { + "epoch": 2.1547398343838835, + "grad_norm": 0.13421265622487757, + "learning_rate": 8.815152651719015e-06, + "loss": 0.5548, + "step": 4359 + }, + { + "epoch": 2.1552342108515634, + "grad_norm": 0.14265860785263593, + "learning_rate": 8.811286277984785e-06, + "loss": 0.5427, + "step": 4360 + }, + { + "epoch": 2.1557285873192438, + "grad_norm": 0.13136512056413335, + "learning_rate": 8.807420084488092e-06, + "loss": 0.5587, + "step": 4361 + }, + { + "epoch": 2.1562229637869237, + "grad_norm": 0.13858761122229266, + "learning_rate": 8.803554071815139e-06, + "loss": 0.5204, + "step": 4362 + }, + { + "epoch": 2.156717340254604, + "grad_norm": 0.15639989785203523, + "learning_rate": 8.799688240552102e-06, + "loss": 0.5928, + "step": 4363 + }, + { + "epoch": 2.157211716722284, + "grad_norm": 0.14581865330111485, + "learning_rate": 8.795822591285147e-06, + "loss": 0.5441, + "step": 4364 + }, + { + "epoch": 2.1577060931899643, + "grad_norm": 0.1319400588951781, + "learning_rate": 8.79195712460039e-06, + "loss": 0.5601, + "step": 4365 + }, + { + "epoch": 2.1582004696576442, + "grad_norm": 0.1430944473429892, + "learning_rate": 8.788091841083932e-06, + "loss": 0.5616, + "step": 4366 + }, + { + "epoch": 2.1586948461253246, + "grad_norm": 0.1450067378732743, + "learning_rate": 8.78422674132184e-06, + "loss": 0.6173, + "step": 4367 + }, + { + "epoch": 2.1591892225930045, + "grad_norm": 0.14437677527343376, + "learning_rate": 8.78036182590016e-06, + "loss": 0.5547, + "step": 4368 + }, + { + "epoch": 2.159683599060685, + "grad_norm": 0.14419014632370594, + "learning_rate": 8.776497095404897e-06, + "loss": 0.537, + "step": 4369 + }, + { + "epoch": 2.1601779755283648, + "grad_norm": 0.15500787145645062, + "learning_rate": 8.772632550422047e-06, + "loss": 0.5853, + "step": 4370 + }, + { + "epoch": 2.160672351996045, + "grad_norm": 0.14175832773601965, + "learning_rate": 8.768768191537565e-06, + "loss": 0.5621, + "step": 4371 + }, + { + "epoch": 2.161166728463725, + "grad_norm": 0.14557850897535551, + "learning_rate": 8.764904019337378e-06, + "loss": 0.5693, + "step": 4372 + }, + { + "epoch": 2.1616611049314054, + "grad_norm": 0.13680950546412088, + "learning_rate": 8.76104003440739e-06, + "loss": 0.5566, + "step": 4373 + }, + { + "epoch": 2.1621554813990853, + "grad_norm": 0.13804721354814764, + "learning_rate": 8.75717623733347e-06, + "loss": 0.5167, + "step": 4374 + }, + { + "epoch": 2.1626498578667657, + "grad_norm": 0.13964008351085225, + "learning_rate": 8.753312628701468e-06, + "loss": 0.5379, + "step": 4375 + }, + { + "epoch": 2.1631442343344456, + "grad_norm": 0.14135323267193206, + "learning_rate": 8.749449209097197e-06, + "loss": 0.5418, + "step": 4376 + }, + { + "epoch": 2.163638610802126, + "grad_norm": 0.13633031101971113, + "learning_rate": 8.745585979106443e-06, + "loss": 0.548, + "step": 4377 + }, + { + "epoch": 2.164132987269806, + "grad_norm": 0.14151564986087833, + "learning_rate": 8.741722939314967e-06, + "loss": 0.5754, + "step": 4378 + }, + { + "epoch": 2.164627363737486, + "grad_norm": 0.14108077884473552, + "learning_rate": 8.737860090308495e-06, + "loss": 0.5455, + "step": 4379 + }, + { + "epoch": 2.165121740205166, + "grad_norm": 0.1402717218442066, + "learning_rate": 8.733997432672729e-06, + "loss": 0.5614, + "step": 4380 + }, + { + "epoch": 2.1656161166728465, + "grad_norm": 0.13842961933243772, + "learning_rate": 8.730134966993342e-06, + "loss": 0.5478, + "step": 4381 + }, + { + "epoch": 2.1661104931405264, + "grad_norm": 0.13760008248380334, + "learning_rate": 8.726272693855976e-06, + "loss": 0.5624, + "step": 4382 + }, + { + "epoch": 2.1666048696082068, + "grad_norm": 0.14234225010081536, + "learning_rate": 8.722410613846244e-06, + "loss": 0.614, + "step": 4383 + }, + { + "epoch": 2.1670992460758867, + "grad_norm": 0.14354074053211255, + "learning_rate": 8.71854872754973e-06, + "loss": 0.599, + "step": 4384 + }, + { + "epoch": 2.167593622543567, + "grad_norm": 0.14400296596964465, + "learning_rate": 8.714687035551988e-06, + "loss": 0.5647, + "step": 4385 + }, + { + "epoch": 2.168087999011247, + "grad_norm": 0.13769910827838225, + "learning_rate": 8.710825538438544e-06, + "loss": 0.5936, + "step": 4386 + }, + { + "epoch": 2.1685823754789273, + "grad_norm": 0.1316404366804844, + "learning_rate": 8.706964236794897e-06, + "loss": 0.6073, + "step": 4387 + }, + { + "epoch": 2.169076751946607, + "grad_norm": 0.14104795571907275, + "learning_rate": 8.703103131206508e-06, + "loss": 0.5749, + "step": 4388 + }, + { + "epoch": 2.1695711284142876, + "grad_norm": 0.13296253207809963, + "learning_rate": 8.699242222258814e-06, + "loss": 0.5647, + "step": 4389 + }, + { + "epoch": 2.1700655048819675, + "grad_norm": 0.13236799939118024, + "learning_rate": 8.695381510537221e-06, + "loss": 0.569, + "step": 4390 + }, + { + "epoch": 2.170559881349648, + "grad_norm": 0.1340565949153966, + "learning_rate": 8.691520996627107e-06, + "loss": 0.5445, + "step": 4391 + }, + { + "epoch": 2.1710542578173277, + "grad_norm": 0.13543117247583122, + "learning_rate": 8.68766068111382e-06, + "loss": 0.5887, + "step": 4392 + }, + { + "epoch": 2.171548634285008, + "grad_norm": 0.13878783089071742, + "learning_rate": 8.683800564582675e-06, + "loss": 0.5516, + "step": 4393 + }, + { + "epoch": 2.172043010752688, + "grad_norm": 0.14276615817278157, + "learning_rate": 8.679940647618961e-06, + "loss": 0.5825, + "step": 4394 + }, + { + "epoch": 2.1725373872203684, + "grad_norm": 0.14221185628360256, + "learning_rate": 8.676080930807928e-06, + "loss": 0.5942, + "step": 4395 + }, + { + "epoch": 2.1730317636880483, + "grad_norm": 0.139362231830131, + "learning_rate": 8.672221414734802e-06, + "loss": 0.5817, + "step": 4396 + }, + { + "epoch": 2.1735261401557286, + "grad_norm": 0.154963668320803, + "learning_rate": 8.668362099984786e-06, + "loss": 0.5554, + "step": 4397 + }, + { + "epoch": 2.1740205166234086, + "grad_norm": 0.16730110553073085, + "learning_rate": 8.66450298714304e-06, + "loss": 0.5291, + "step": 4398 + }, + { + "epoch": 2.174514893091089, + "grad_norm": 0.13365962940666026, + "learning_rate": 8.660644076794699e-06, + "loss": 0.559, + "step": 4399 + }, + { + "epoch": 2.175009269558769, + "grad_norm": 0.15277362376102918, + "learning_rate": 8.656785369524864e-06, + "loss": 0.5927, + "step": 4400 + }, + { + "epoch": 2.175503646026449, + "grad_norm": 0.14411492199806808, + "learning_rate": 8.652926865918613e-06, + "loss": 0.5667, + "step": 4401 + }, + { + "epoch": 2.175998022494129, + "grad_norm": 0.14323839466369143, + "learning_rate": 8.649068566560976e-06, + "loss": 0.5633, + "step": 4402 + }, + { + "epoch": 2.1764923989618095, + "grad_norm": 0.1350876575383756, + "learning_rate": 8.645210472036978e-06, + "loss": 0.5681, + "step": 4403 + }, + { + "epoch": 2.1769867754294894, + "grad_norm": 0.1411310876027112, + "learning_rate": 8.641352582931593e-06, + "loss": 0.5402, + "step": 4404 + }, + { + "epoch": 2.1774811518971697, + "grad_norm": 0.1380521990774921, + "learning_rate": 8.637494899829768e-06, + "loss": 0.5411, + "step": 4405 + }, + { + "epoch": 2.1779755283648496, + "grad_norm": 0.13233268407322754, + "learning_rate": 8.633637423316422e-06, + "loss": 0.5887, + "step": 4406 + }, + { + "epoch": 2.17846990483253, + "grad_norm": 0.14425508126645464, + "learning_rate": 8.629780153976438e-06, + "loss": 0.6137, + "step": 4407 + }, + { + "epoch": 2.17896428130021, + "grad_norm": 0.15154524012553078, + "learning_rate": 8.625923092394675e-06, + "loss": 0.5738, + "step": 4408 + }, + { + "epoch": 2.1794586577678903, + "grad_norm": 0.13977435218727086, + "learning_rate": 8.622066239155957e-06, + "loss": 0.5836, + "step": 4409 + }, + { + "epoch": 2.17995303423557, + "grad_norm": 0.13593680535128083, + "learning_rate": 8.61820959484507e-06, + "loss": 0.5555, + "step": 4410 + }, + { + "epoch": 2.1804474107032505, + "grad_norm": 0.13401252813490405, + "learning_rate": 8.61435316004678e-06, + "loss": 0.5214, + "step": 4411 + }, + { + "epoch": 2.1809417871709305, + "grad_norm": 0.13306276266838218, + "learning_rate": 8.610496935345811e-06, + "loss": 0.5475, + "step": 4412 + }, + { + "epoch": 2.181436163638611, + "grad_norm": 0.14329488074821975, + "learning_rate": 8.606640921326855e-06, + "loss": 0.5696, + "step": 4413 + }, + { + "epoch": 2.1819305401062907, + "grad_norm": 0.13971085025849322, + "learning_rate": 8.602785118574586e-06, + "loss": 0.5632, + "step": 4414 + }, + { + "epoch": 2.182424916573971, + "grad_norm": 0.1387035927129756, + "learning_rate": 8.598929527673631e-06, + "loss": 0.5681, + "step": 4415 + }, + { + "epoch": 2.1829192930416514, + "grad_norm": 0.13889607620285552, + "learning_rate": 8.595074149208591e-06, + "loss": 0.5724, + "step": 4416 + }, + { + "epoch": 2.1834136695093314, + "grad_norm": 0.13542385921439862, + "learning_rate": 8.591218983764036e-06, + "loss": 0.5701, + "step": 4417 + }, + { + "epoch": 2.1839080459770113, + "grad_norm": 0.14105907892012326, + "learning_rate": 8.587364031924492e-06, + "loss": 0.5746, + "step": 4418 + }, + { + "epoch": 2.1844024224446916, + "grad_norm": 0.141432964258941, + "learning_rate": 8.583509294274474e-06, + "loss": 0.6008, + "step": 4419 + }, + { + "epoch": 2.184896798912372, + "grad_norm": 0.13961585431215376, + "learning_rate": 8.57965477139845e-06, + "loss": 0.5374, + "step": 4420 + }, + { + "epoch": 2.185391175380052, + "grad_norm": 0.13634414536855638, + "learning_rate": 8.575800463880856e-06, + "loss": 0.5982, + "step": 4421 + }, + { + "epoch": 2.1858855518477323, + "grad_norm": 0.1403198711797279, + "learning_rate": 8.571946372306097e-06, + "loss": 0.5894, + "step": 4422 + }, + { + "epoch": 2.186379928315412, + "grad_norm": 0.13725135573397673, + "learning_rate": 8.568092497258544e-06, + "loss": 0.565, + "step": 4423 + }, + { + "epoch": 2.1868743047830925, + "grad_norm": 0.14261743286332465, + "learning_rate": 8.564238839322544e-06, + "loss": 0.563, + "step": 4424 + }, + { + "epoch": 2.1873686812507724, + "grad_norm": 0.13802714175937966, + "learning_rate": 8.560385399082398e-06, + "loss": 0.5685, + "step": 4425 + }, + { + "epoch": 2.187863057718453, + "grad_norm": 0.1335445981336087, + "learning_rate": 8.556532177122383e-06, + "loss": 0.5275, + "step": 4426 + }, + { + "epoch": 2.1883574341861327, + "grad_norm": 0.14538937987056688, + "learning_rate": 8.55267917402674e-06, + "loss": 0.5687, + "step": 4427 + }, + { + "epoch": 2.188851810653813, + "grad_norm": 0.1371332704026663, + "learning_rate": 8.548826390379674e-06, + "loss": 0.5473, + "step": 4428 + }, + { + "epoch": 2.189346187121493, + "grad_norm": 0.13219029621421075, + "learning_rate": 8.54497382676536e-06, + "loss": 0.5409, + "step": 4429 + }, + { + "epoch": 2.1898405635891733, + "grad_norm": 0.1368538450333644, + "learning_rate": 8.54112148376794e-06, + "loss": 0.5652, + "step": 4430 + }, + { + "epoch": 2.1903349400568533, + "grad_norm": 0.14195299661592575, + "learning_rate": 8.537269361971523e-06, + "loss": 0.5645, + "step": 4431 + }, + { + "epoch": 2.1908293165245336, + "grad_norm": 0.13181587580897564, + "learning_rate": 8.533417461960182e-06, + "loss": 0.5468, + "step": 4432 + }, + { + "epoch": 2.1913236929922135, + "grad_norm": 0.13480091851922787, + "learning_rate": 8.529565784317958e-06, + "loss": 0.5563, + "step": 4433 + }, + { + "epoch": 2.191818069459894, + "grad_norm": 0.1407942753065929, + "learning_rate": 8.525714329628855e-06, + "loss": 0.6026, + "step": 4434 + }, + { + "epoch": 2.192312445927574, + "grad_norm": 0.13907888851899708, + "learning_rate": 8.521863098476851e-06, + "loss": 0.5247, + "step": 4435 + }, + { + "epoch": 2.192806822395254, + "grad_norm": 0.3260204351291746, + "learning_rate": 8.518012091445884e-06, + "loss": 0.5859, + "step": 4436 + }, + { + "epoch": 2.193301198862934, + "grad_norm": 0.142855629671955, + "learning_rate": 8.514161309119853e-06, + "loss": 0.6168, + "step": 4437 + }, + { + "epoch": 2.1937955753306144, + "grad_norm": 0.14483322680459917, + "learning_rate": 8.510310752082635e-06, + "loss": 0.5672, + "step": 4438 + }, + { + "epoch": 2.1942899517982943, + "grad_norm": 0.1317825515220901, + "learning_rate": 8.506460420918067e-06, + "loss": 0.5535, + "step": 4439 + }, + { + "epoch": 2.1947843282659747, + "grad_norm": 0.13423322543443564, + "learning_rate": 8.502610316209947e-06, + "loss": 0.5483, + "step": 4440 + }, + { + "epoch": 2.1952787047336546, + "grad_norm": 0.14493054858216595, + "learning_rate": 8.498760438542048e-06, + "loss": 0.5598, + "step": 4441 + }, + { + "epoch": 2.195773081201335, + "grad_norm": 0.14161100480451477, + "learning_rate": 8.494910788498101e-06, + "loss": 0.5942, + "step": 4442 + }, + { + "epoch": 2.196267457669015, + "grad_norm": 0.13699438052468482, + "learning_rate": 8.49106136666181e-06, + "loss": 0.566, + "step": 4443 + }, + { + "epoch": 2.1967618341366952, + "grad_norm": 0.13987872697951698, + "learning_rate": 8.487212173616835e-06, + "loss": 0.5814, + "step": 4444 + }, + { + "epoch": 2.197256210604375, + "grad_norm": 0.14110510036441803, + "learning_rate": 8.4833632099468e-06, + "loss": 0.5485, + "step": 4445 + }, + { + "epoch": 2.1977505870720555, + "grad_norm": 0.13679383478706814, + "learning_rate": 8.479514476235317e-06, + "loss": 0.5756, + "step": 4446 + }, + { + "epoch": 2.1982449635397354, + "grad_norm": 0.13660348750885973, + "learning_rate": 8.475665973065934e-06, + "loss": 0.5692, + "step": 4447 + }, + { + "epoch": 2.1987393400074158, + "grad_norm": 0.14697377989117272, + "learning_rate": 8.47181770102218e-06, + "loss": 0.5937, + "step": 4448 + }, + { + "epoch": 2.1992337164750957, + "grad_norm": 0.13798260262694548, + "learning_rate": 8.467969660687543e-06, + "loss": 0.5479, + "step": 4449 + }, + { + "epoch": 2.199728092942776, + "grad_norm": 0.1436842525223068, + "learning_rate": 8.464121852645484e-06, + "loss": 0.5674, + "step": 4450 + }, + { + "epoch": 2.200222469410456, + "grad_norm": 0.1425900713817668, + "learning_rate": 8.460274277479413e-06, + "loss": 0.5741, + "step": 4451 + }, + { + "epoch": 2.2007168458781363, + "grad_norm": 0.1427783578045205, + "learning_rate": 8.456426935772724e-06, + "loss": 0.5841, + "step": 4452 + }, + { + "epoch": 2.2012112223458162, + "grad_norm": 0.14908043350597647, + "learning_rate": 8.452579828108766e-06, + "loss": 0.5793, + "step": 4453 + }, + { + "epoch": 2.2017055988134966, + "grad_norm": 0.14408490868234808, + "learning_rate": 8.448732955070848e-06, + "loss": 0.5732, + "step": 4454 + }, + { + "epoch": 2.2021999752811765, + "grad_norm": 0.13365066633998532, + "learning_rate": 8.444886317242251e-06, + "loss": 0.5333, + "step": 4455 + }, + { + "epoch": 2.202694351748857, + "grad_norm": 0.13518097710858573, + "learning_rate": 8.441039915206215e-06, + "loss": 0.5752, + "step": 4456 + }, + { + "epoch": 2.2031887282165368, + "grad_norm": 0.14752158364971, + "learning_rate": 8.43719374954595e-06, + "loss": 0.5857, + "step": 4457 + }, + { + "epoch": 2.203683104684217, + "grad_norm": 0.14384090630720794, + "learning_rate": 8.433347820844628e-06, + "loss": 0.5564, + "step": 4458 + }, + { + "epoch": 2.204177481151897, + "grad_norm": 0.13760919887244058, + "learning_rate": 8.429502129685381e-06, + "loss": 0.5403, + "step": 4459 + }, + { + "epoch": 2.2046718576195774, + "grad_norm": 0.1467239680077207, + "learning_rate": 8.42565667665131e-06, + "loss": 0.5782, + "step": 4460 + }, + { + "epoch": 2.2051662340872573, + "grad_norm": 0.1365498249992146, + "learning_rate": 8.421811462325478e-06, + "loss": 0.543, + "step": 4461 + }, + { + "epoch": 2.2056606105549377, + "grad_norm": 0.13315603519574207, + "learning_rate": 8.417966487290906e-06, + "loss": 0.5774, + "step": 4462 + }, + { + "epoch": 2.2061549870226176, + "grad_norm": 0.1353966830457209, + "learning_rate": 8.414121752130594e-06, + "loss": 0.5872, + "step": 4463 + }, + { + "epoch": 2.206649363490298, + "grad_norm": 0.1397139875419035, + "learning_rate": 8.41027725742749e-06, + "loss": 0.5627, + "step": 4464 + }, + { + "epoch": 2.207143739957978, + "grad_norm": 0.13584872375747178, + "learning_rate": 8.406433003764514e-06, + "loss": 0.5734, + "step": 4465 + }, + { + "epoch": 2.207638116425658, + "grad_norm": 0.13915378839316597, + "learning_rate": 8.402588991724545e-06, + "loss": 0.5638, + "step": 4466 + }, + { + "epoch": 2.208132492893338, + "grad_norm": 0.1379675519245401, + "learning_rate": 8.39874522189043e-06, + "loss": 0.6049, + "step": 4467 + }, + { + "epoch": 2.2086268693610185, + "grad_norm": 0.13735313414240666, + "learning_rate": 8.394901694844975e-06, + "loss": 0.5822, + "step": 4468 + }, + { + "epoch": 2.2091212458286984, + "grad_norm": 0.14498933416080056, + "learning_rate": 8.391058411170957e-06, + "loss": 0.6397, + "step": 4469 + }, + { + "epoch": 2.2096156222963788, + "grad_norm": 0.14117507117609343, + "learning_rate": 8.387215371451099e-06, + "loss": 0.5737, + "step": 4470 + }, + { + "epoch": 2.2101099987640587, + "grad_norm": 0.13517348822587408, + "learning_rate": 8.383372576268107e-06, + "loss": 0.5256, + "step": 4471 + }, + { + "epoch": 2.210604375231739, + "grad_norm": 0.13427275027091876, + "learning_rate": 8.379530026204635e-06, + "loss": 0.5709, + "step": 4472 + }, + { + "epoch": 2.211098751699419, + "grad_norm": 0.1502852785039408, + "learning_rate": 8.375687721843308e-06, + "loss": 0.5966, + "step": 4473 + }, + { + "epoch": 2.2115931281670993, + "grad_norm": 0.1444266309557329, + "learning_rate": 8.371845663766715e-06, + "loss": 0.5743, + "step": 4474 + }, + { + "epoch": 2.212087504634779, + "grad_norm": 0.13751178317221158, + "learning_rate": 8.3680038525574e-06, + "loss": 0.5817, + "step": 4475 + }, + { + "epoch": 2.2125818811024596, + "grad_norm": 0.1307656929112932, + "learning_rate": 8.364162288797879e-06, + "loss": 0.5431, + "step": 4476 + }, + { + "epoch": 2.2130762575701395, + "grad_norm": 0.1366323135209336, + "learning_rate": 8.360320973070618e-06, + "loss": 0.5572, + "step": 4477 + }, + { + "epoch": 2.21357063403782, + "grad_norm": 0.13237217609939625, + "learning_rate": 8.356479905958053e-06, + "loss": 0.5749, + "step": 4478 + }, + { + "epoch": 2.2140650105054998, + "grad_norm": 0.14056697840803173, + "learning_rate": 8.35263908804259e-06, + "loss": 0.5518, + "step": 4479 + }, + { + "epoch": 2.21455938697318, + "grad_norm": 0.14066316724394246, + "learning_rate": 8.348798519906583e-06, + "loss": 0.5614, + "step": 4480 + }, + { + "epoch": 2.21505376344086, + "grad_norm": 0.14533761058883468, + "learning_rate": 8.344958202132357e-06, + "loss": 0.5775, + "step": 4481 + }, + { + "epoch": 2.2155481399085404, + "grad_norm": 0.1391994515631408, + "learning_rate": 8.341118135302193e-06, + "loss": 0.5667, + "step": 4482 + }, + { + "epoch": 2.2160425163762203, + "grad_norm": 0.13752048565670033, + "learning_rate": 8.337278319998343e-06, + "loss": 0.5638, + "step": 4483 + }, + { + "epoch": 2.2165368928439007, + "grad_norm": 0.13907040561283687, + "learning_rate": 8.333438756803004e-06, + "loss": 0.5486, + "step": 4484 + }, + { + "epoch": 2.2170312693115806, + "grad_norm": 0.14329672081158512, + "learning_rate": 8.32959944629836e-06, + "loss": 0.6079, + "step": 4485 + }, + { + "epoch": 2.217525645779261, + "grad_norm": 0.1406721823287691, + "learning_rate": 8.325760389066535e-06, + "loss": 0.565, + "step": 4486 + }, + { + "epoch": 2.2180200222469413, + "grad_norm": 0.14102904189161025, + "learning_rate": 8.321921585689623e-06, + "loss": 0.6088, + "step": 4487 + }, + { + "epoch": 2.218514398714621, + "grad_norm": 0.13770007162338085, + "learning_rate": 8.318083036749677e-06, + "loss": 0.5375, + "step": 4488 + }, + { + "epoch": 2.219008775182301, + "grad_norm": 0.1404520476411777, + "learning_rate": 8.314244742828716e-06, + "loss": 0.5622, + "step": 4489 + }, + { + "epoch": 2.2195031516499815, + "grad_norm": 0.13800358978757496, + "learning_rate": 8.310406704508718e-06, + "loss": 0.604, + "step": 4490 + }, + { + "epoch": 2.219997528117662, + "grad_norm": 0.13895033808295512, + "learning_rate": 8.30656892237162e-06, + "loss": 0.5727, + "step": 4491 + }, + { + "epoch": 2.2204919045853417, + "grad_norm": 0.14354089323034505, + "learning_rate": 8.302731396999324e-06, + "loss": 0.5554, + "step": 4492 + }, + { + "epoch": 2.2209862810530216, + "grad_norm": 0.14141732838426033, + "learning_rate": 8.298894128973688e-06, + "loss": 0.5885, + "step": 4493 + }, + { + "epoch": 2.221480657520702, + "grad_norm": 0.15035990159428286, + "learning_rate": 8.295057118876536e-06, + "loss": 0.5397, + "step": 4494 + }, + { + "epoch": 2.2219750339883824, + "grad_norm": 0.1431976523539539, + "learning_rate": 8.291220367289648e-06, + "loss": 0.5701, + "step": 4495 + }, + { + "epoch": 2.2224694104560623, + "grad_norm": 0.13945790835518904, + "learning_rate": 8.28738387479477e-06, + "loss": 0.5765, + "step": 4496 + }, + { + "epoch": 2.2229637869237426, + "grad_norm": 0.13964278717624704, + "learning_rate": 8.283547641973606e-06, + "loss": 0.5429, + "step": 4497 + }, + { + "epoch": 2.2234581633914225, + "grad_norm": 0.14436593068764417, + "learning_rate": 8.279711669407822e-06, + "loss": 0.5835, + "step": 4498 + }, + { + "epoch": 2.223952539859103, + "grad_norm": 0.14483092965703467, + "learning_rate": 8.275875957679045e-06, + "loss": 0.6439, + "step": 4499 + }, + { + "epoch": 2.224446916326783, + "grad_norm": 0.1605695774749751, + "learning_rate": 8.272040507368852e-06, + "loss": 0.5465, + "step": 4500 + }, + { + "epoch": 2.224941292794463, + "grad_norm": 0.138445696890551, + "learning_rate": 8.2682053190588e-06, + "loss": 0.5837, + "step": 4501 + }, + { + "epoch": 2.225435669262143, + "grad_norm": 0.14176236205733844, + "learning_rate": 8.264370393330394e-06, + "loss": 0.5905, + "step": 4502 + }, + { + "epoch": 2.2259300457298234, + "grad_norm": 0.13487109231291325, + "learning_rate": 8.260535730765096e-06, + "loss": 0.5292, + "step": 4503 + }, + { + "epoch": 2.2264244221975034, + "grad_norm": 0.1395045534628219, + "learning_rate": 8.256701331944334e-06, + "loss": 0.5645, + "step": 4504 + }, + { + "epoch": 2.2269187986651837, + "grad_norm": 0.13722036689464256, + "learning_rate": 8.252867197449496e-06, + "loss": 0.591, + "step": 4505 + }, + { + "epoch": 2.2274131751328636, + "grad_norm": 0.14046749782310813, + "learning_rate": 8.24903332786193e-06, + "loss": 0.5584, + "step": 4506 + }, + { + "epoch": 2.227907551600544, + "grad_norm": 0.1381797236393888, + "learning_rate": 8.24519972376294e-06, + "loss": 0.5565, + "step": 4507 + }, + { + "epoch": 2.228401928068224, + "grad_norm": 0.13779911856785834, + "learning_rate": 8.241366385733797e-06, + "loss": 0.5935, + "step": 4508 + }, + { + "epoch": 2.2288963045359043, + "grad_norm": 0.1376390749196434, + "learning_rate": 8.237533314355725e-06, + "loss": 0.5747, + "step": 4509 + }, + { + "epoch": 2.229390681003584, + "grad_norm": 0.13342448125465942, + "learning_rate": 8.233700510209905e-06, + "loss": 0.5581, + "step": 4510 + }, + { + "epoch": 2.2298850574712645, + "grad_norm": 0.14399027926371316, + "learning_rate": 8.229867973877485e-06, + "loss": 0.5468, + "step": 4511 + }, + { + "epoch": 2.2303794339389444, + "grad_norm": 0.13659730218681163, + "learning_rate": 8.226035705939572e-06, + "loss": 0.5612, + "step": 4512 + }, + { + "epoch": 2.230873810406625, + "grad_norm": 0.13184577939282072, + "learning_rate": 8.222203706977229e-06, + "loss": 0.5514, + "step": 4513 + }, + { + "epoch": 2.2313681868743047, + "grad_norm": 0.1406861788209103, + "learning_rate": 8.218371977571476e-06, + "loss": 0.5709, + "step": 4514 + }, + { + "epoch": 2.231862563341985, + "grad_norm": 0.14213178913678579, + "learning_rate": 8.2145405183033e-06, + "loss": 0.5947, + "step": 4515 + }, + { + "epoch": 2.232356939809665, + "grad_norm": 0.13490530113404436, + "learning_rate": 8.210709329753635e-06, + "loss": 0.5455, + "step": 4516 + }, + { + "epoch": 2.2328513162773453, + "grad_norm": 0.13958158895401238, + "learning_rate": 8.20687841250339e-06, + "loss": 0.5397, + "step": 4517 + }, + { + "epoch": 2.2333456927450253, + "grad_norm": 0.1343375200389987, + "learning_rate": 8.20304776713342e-06, + "loss": 0.5556, + "step": 4518 + }, + { + "epoch": 2.2338400692127056, + "grad_norm": 0.1362687786229661, + "learning_rate": 8.19921739422454e-06, + "loss": 0.6053, + "step": 4519 + }, + { + "epoch": 2.2343344456803855, + "grad_norm": 0.14814723872087693, + "learning_rate": 8.19538729435753e-06, + "loss": 0.6105, + "step": 4520 + }, + { + "epoch": 2.234828822148066, + "grad_norm": 0.1420013410494249, + "learning_rate": 8.191557468113123e-06, + "loss": 0.5713, + "step": 4521 + }, + { + "epoch": 2.235323198615746, + "grad_norm": 3.123364722072112, + "learning_rate": 8.187727916072013e-06, + "loss": 0.6842, + "step": 4522 + }, + { + "epoch": 2.235817575083426, + "grad_norm": 0.14798023908050573, + "learning_rate": 8.183898638814852e-06, + "loss": 0.5796, + "step": 4523 + }, + { + "epoch": 2.236311951551106, + "grad_norm": 0.1488255904829912, + "learning_rate": 8.180069636922252e-06, + "loss": 0.56, + "step": 4524 + }, + { + "epoch": 2.2368063280187864, + "grad_norm": 0.14050762471615216, + "learning_rate": 8.176240910974784e-06, + "loss": 0.5391, + "step": 4525 + }, + { + "epoch": 2.2373007044864663, + "grad_norm": 0.1397152328157497, + "learning_rate": 8.172412461552967e-06, + "loss": 0.5757, + "step": 4526 + }, + { + "epoch": 2.2377950809541467, + "grad_norm": 0.13993047382434776, + "learning_rate": 8.168584289237289e-06, + "loss": 0.5772, + "step": 4527 + }, + { + "epoch": 2.2382894574218266, + "grad_norm": 0.14656823181696624, + "learning_rate": 8.164756394608198e-06, + "loss": 0.5263, + "step": 4528 + }, + { + "epoch": 2.238783833889507, + "grad_norm": 0.15137533664998679, + "learning_rate": 8.16092877824609e-06, + "loss": 0.6011, + "step": 4529 + }, + { + "epoch": 2.239278210357187, + "grad_norm": 0.14222550482836255, + "learning_rate": 8.15710144073132e-06, + "loss": 0.5662, + "step": 4530 + }, + { + "epoch": 2.2397725868248672, + "grad_norm": 0.14852877888143506, + "learning_rate": 8.153274382644213e-06, + "loss": 0.5456, + "step": 4531 + }, + { + "epoch": 2.240266963292547, + "grad_norm": 0.14186590406479374, + "learning_rate": 8.149447604565038e-06, + "loss": 0.5831, + "step": 4532 + }, + { + "epoch": 2.2407613397602275, + "grad_norm": 0.14911183886451984, + "learning_rate": 8.14562110707402e-06, + "loss": 0.565, + "step": 4533 + }, + { + "epoch": 2.2412557162279074, + "grad_norm": 0.14798789065854656, + "learning_rate": 8.141794890751361e-06, + "loss": 0.5533, + "step": 4534 + }, + { + "epoch": 2.241750092695588, + "grad_norm": 0.14143116828265254, + "learning_rate": 8.137968956177201e-06, + "loss": 0.5931, + "step": 4535 + }, + { + "epoch": 2.2422444691632677, + "grad_norm": 0.14059481460588452, + "learning_rate": 8.134143303931642e-06, + "loss": 0.5984, + "step": 4536 + }, + { + "epoch": 2.242738845630948, + "grad_norm": 0.1509673211877721, + "learning_rate": 8.130317934594747e-06, + "loss": 0.6312, + "step": 4537 + }, + { + "epoch": 2.243233222098628, + "grad_norm": 0.1446668989408346, + "learning_rate": 8.12649284874653e-06, + "loss": 0.5598, + "step": 4538 + }, + { + "epoch": 2.2437275985663083, + "grad_norm": 0.14002812346923485, + "learning_rate": 8.122668046966969e-06, + "loss": 0.5602, + "step": 4539 + }, + { + "epoch": 2.2442219750339882, + "grad_norm": 0.14145789004291676, + "learning_rate": 8.118843529835995e-06, + "loss": 0.5643, + "step": 4540 + }, + { + "epoch": 2.2447163515016686, + "grad_norm": 0.13864602344971505, + "learning_rate": 8.1150192979335e-06, + "loss": 0.5573, + "step": 4541 + }, + { + "epoch": 2.2452107279693485, + "grad_norm": 0.13875590345448363, + "learning_rate": 8.111195351839327e-06, + "loss": 0.5722, + "step": 4542 + }, + { + "epoch": 2.245705104437029, + "grad_norm": 0.1405527396051482, + "learning_rate": 8.107371692133276e-06, + "loss": 0.5753, + "step": 4543 + }, + { + "epoch": 2.246199480904709, + "grad_norm": 0.13873753786391932, + "learning_rate": 8.103548319395104e-06, + "loss": 0.5509, + "step": 4544 + }, + { + "epoch": 2.246693857372389, + "grad_norm": 0.13579528820086492, + "learning_rate": 8.09972523420453e-06, + "loss": 0.5882, + "step": 4545 + }, + { + "epoch": 2.247188233840069, + "grad_norm": 0.141229610317828, + "learning_rate": 8.095902437141228e-06, + "loss": 0.5647, + "step": 4546 + }, + { + "epoch": 2.2476826103077494, + "grad_norm": 0.13691809088586362, + "learning_rate": 8.09207992878482e-06, + "loss": 0.5983, + "step": 4547 + }, + { + "epoch": 2.2481769867754293, + "grad_norm": 0.13846570801734667, + "learning_rate": 8.088257709714892e-06, + "loss": 0.5553, + "step": 4548 + }, + { + "epoch": 2.2486713632431097, + "grad_norm": 0.13754432648565712, + "learning_rate": 8.084435780510983e-06, + "loss": 0.5707, + "step": 4549 + }, + { + "epoch": 2.2491657397107896, + "grad_norm": 0.13372435745576625, + "learning_rate": 8.080614141752594e-06, + "loss": 0.5492, + "step": 4550 + }, + { + "epoch": 2.24966011617847, + "grad_norm": 0.13452317424834895, + "learning_rate": 8.076792794019175e-06, + "loss": 0.5864, + "step": 4551 + }, + { + "epoch": 2.25015449264615, + "grad_norm": 0.1344069771087108, + "learning_rate": 8.072971737890129e-06, + "loss": 0.5447, + "step": 4552 + }, + { + "epoch": 2.25064886911383, + "grad_norm": 0.13641400967580553, + "learning_rate": 8.069150973944826e-06, + "loss": 0.5922, + "step": 4553 + }, + { + "epoch": 2.25114324558151, + "grad_norm": 0.13111638810364887, + "learning_rate": 8.065330502762583e-06, + "loss": 0.5516, + "step": 4554 + }, + { + "epoch": 2.25114324558151, + "eval_loss": 0.6508591175079346, + "eval_runtime": 81.8108, + "eval_samples_per_second": 371.027, + "eval_steps_per_second": 46.388, + "step": 4554 + }, + { + "epoch": 2.2516376220491905, + "grad_norm": 0.1483097677310547, + "learning_rate": 8.061510324922672e-06, + "loss": 0.5915, + "step": 4555 + }, + { + "epoch": 2.2521319985168704, + "grad_norm": 0.13887203765788747, + "learning_rate": 8.057690441004331e-06, + "loss": 0.5757, + "step": 4556 + }, + { + "epoch": 2.2526263749845508, + "grad_norm": 0.14311855060947548, + "learning_rate": 8.053870851586741e-06, + "loss": 0.5623, + "step": 4557 + }, + { + "epoch": 2.253120751452231, + "grad_norm": 0.1380673875824312, + "learning_rate": 8.050051557249046e-06, + "loss": 0.5507, + "step": 4558 + }, + { + "epoch": 2.253615127919911, + "grad_norm": 0.14864404346247417, + "learning_rate": 8.046232558570341e-06, + "loss": 0.5609, + "step": 4559 + }, + { + "epoch": 2.254109504387591, + "grad_norm": 0.13847681907547213, + "learning_rate": 8.042413856129675e-06, + "loss": 0.5684, + "step": 4560 + }, + { + "epoch": 2.2546038808552713, + "grad_norm": 0.1358068763521679, + "learning_rate": 8.038595450506061e-06, + "loss": 0.5528, + "step": 4561 + }, + { + "epoch": 2.2550982573229517, + "grad_norm": 0.13670641246165438, + "learning_rate": 8.034777342278459e-06, + "loss": 0.5534, + "step": 4562 + }, + { + "epoch": 2.2555926337906316, + "grad_norm": 0.1383512195180018, + "learning_rate": 8.030959532025783e-06, + "loss": 0.5593, + "step": 4563 + }, + { + "epoch": 2.2560870102583115, + "grad_norm": 0.13935962356321488, + "learning_rate": 8.027142020326908e-06, + "loss": 0.5674, + "step": 4564 + }, + { + "epoch": 2.256581386725992, + "grad_norm": 0.148631060988877, + "learning_rate": 8.02332480776066e-06, + "loss": 0.5692, + "step": 4565 + }, + { + "epoch": 2.257075763193672, + "grad_norm": 0.13512913829442333, + "learning_rate": 8.019507894905814e-06, + "loss": 0.5501, + "step": 4566 + }, + { + "epoch": 2.257570139661352, + "grad_norm": 0.14486656372898585, + "learning_rate": 8.015691282341113e-06, + "loss": 0.5611, + "step": 4567 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 0.14206699936951275, + "learning_rate": 8.011874970645248e-06, + "loss": 0.597, + "step": 4568 + }, + { + "epoch": 2.2585588925967124, + "grad_norm": 0.1425659143930415, + "learning_rate": 8.008058960396858e-06, + "loss": 0.587, + "step": 4569 + }, + { + "epoch": 2.2590532690643927, + "grad_norm": 0.1454999999470158, + "learning_rate": 8.004243252174546e-06, + "loss": 0.5596, + "step": 4570 + }, + { + "epoch": 2.2595476455320727, + "grad_norm": 0.13923290153589193, + "learning_rate": 8.000427846556858e-06, + "loss": 0.5587, + "step": 4571 + }, + { + "epoch": 2.2600420219997526, + "grad_norm": 0.147952295833286, + "learning_rate": 7.99661274412231e-06, + "loss": 0.5739, + "step": 4572 + }, + { + "epoch": 2.260536398467433, + "grad_norm": 0.1476234239582274, + "learning_rate": 7.992797945449357e-06, + "loss": 0.6083, + "step": 4573 + }, + { + "epoch": 2.2610307749351133, + "grad_norm": 0.14254103969564458, + "learning_rate": 7.988983451116418e-06, + "loss": 0.5601, + "step": 4574 + }, + { + "epoch": 2.261525151402793, + "grad_norm": 0.1429777379561698, + "learning_rate": 7.985169261701862e-06, + "loss": 0.5464, + "step": 4575 + }, + { + "epoch": 2.2620195278704736, + "grad_norm": 0.1479610996511858, + "learning_rate": 7.981355377784008e-06, + "loss": 0.5927, + "step": 4576 + }, + { + "epoch": 2.2625139043381535, + "grad_norm": 0.14422754200736934, + "learning_rate": 7.97754179994113e-06, + "loss": 0.5653, + "step": 4577 + }, + { + "epoch": 2.263008280805834, + "grad_norm": 0.1454697191998191, + "learning_rate": 7.973728528751465e-06, + "loss": 0.5861, + "step": 4578 + }, + { + "epoch": 2.2635026572735137, + "grad_norm": 0.16137889605212719, + "learning_rate": 7.969915564793195e-06, + "loss": 0.602, + "step": 4579 + }, + { + "epoch": 2.263997033741194, + "grad_norm": 0.14212496373516834, + "learning_rate": 7.966102908644454e-06, + "loss": 0.5857, + "step": 4580 + }, + { + "epoch": 2.264491410208874, + "grad_norm": 0.13630621040617485, + "learning_rate": 7.962290560883336e-06, + "loss": 0.5335, + "step": 4581 + }, + { + "epoch": 2.2649857866765544, + "grad_norm": 0.14551767984732913, + "learning_rate": 7.958478522087876e-06, + "loss": 0.5324, + "step": 4582 + }, + { + "epoch": 2.2654801631442343, + "grad_norm": 0.14768511796486672, + "learning_rate": 7.95466679283608e-06, + "loss": 0.5935, + "step": 4583 + }, + { + "epoch": 2.2659745396119146, + "grad_norm": 0.14645850944883682, + "learning_rate": 7.950855373705897e-06, + "loss": 0.5823, + "step": 4584 + }, + { + "epoch": 2.2664689160795946, + "grad_norm": 0.14482787968888902, + "learning_rate": 7.947044265275224e-06, + "loss": 0.5841, + "step": 4585 + }, + { + "epoch": 2.266963292547275, + "grad_norm": 0.14631539669916147, + "learning_rate": 7.94323346812192e-06, + "loss": 0.5526, + "step": 4586 + }, + { + "epoch": 2.267457669014955, + "grad_norm": 0.1515276374370481, + "learning_rate": 7.939422982823792e-06, + "loss": 0.5553, + "step": 4587 + }, + { + "epoch": 2.267952045482635, + "grad_norm": 0.13750883992685403, + "learning_rate": 7.935612809958602e-06, + "loss": 0.5422, + "step": 4588 + }, + { + "epoch": 2.268446421950315, + "grad_norm": 0.13790028684319974, + "learning_rate": 7.931802950104063e-06, + "loss": 0.5321, + "step": 4589 + }, + { + "epoch": 2.2689407984179955, + "grad_norm": 0.14483464953734426, + "learning_rate": 7.927993403837842e-06, + "loss": 0.5778, + "step": 4590 + }, + { + "epoch": 2.2694351748856754, + "grad_norm": 0.13911942072339994, + "learning_rate": 7.92418417173756e-06, + "loss": 0.5802, + "step": 4591 + }, + { + "epoch": 2.2699295513533557, + "grad_norm": 0.14751293134524515, + "learning_rate": 7.920375254380783e-06, + "loss": 0.5625, + "step": 4592 + }, + { + "epoch": 2.2704239278210356, + "grad_norm": 0.1448378126044396, + "learning_rate": 7.916566652345033e-06, + "loss": 0.5621, + "step": 4593 + }, + { + "epoch": 2.270918304288716, + "grad_norm": 0.13188388661185843, + "learning_rate": 7.912758366207793e-06, + "loss": 0.5358, + "step": 4594 + }, + { + "epoch": 2.271412680756396, + "grad_norm": 0.13660435691788506, + "learning_rate": 7.908950396546487e-06, + "loss": 0.5559, + "step": 4595 + }, + { + "epoch": 2.2719070572240763, + "grad_norm": 0.1448725625123837, + "learning_rate": 7.905142743938494e-06, + "loss": 0.5757, + "step": 4596 + }, + { + "epoch": 2.272401433691756, + "grad_norm": 0.1309529655928332, + "learning_rate": 7.901335408961143e-06, + "loss": 0.5509, + "step": 4597 + }, + { + "epoch": 2.2728958101594365, + "grad_norm": 0.13812319376445972, + "learning_rate": 7.897528392191722e-06, + "loss": 0.5685, + "step": 4598 + }, + { + "epoch": 2.2733901866271164, + "grad_norm": 0.1370166327072945, + "learning_rate": 7.893721694207464e-06, + "loss": 0.5442, + "step": 4599 + }, + { + "epoch": 2.273884563094797, + "grad_norm": 0.148469874737963, + "learning_rate": 7.889915315585558e-06, + "loss": 0.5836, + "step": 4600 + }, + { + "epoch": 2.2743789395624767, + "grad_norm": 0.13888008757348333, + "learning_rate": 7.88610925690314e-06, + "loss": 0.5222, + "step": 4601 + }, + { + "epoch": 2.274873316030157, + "grad_norm": 0.14032876852103202, + "learning_rate": 7.882303518737299e-06, + "loss": 0.5977, + "step": 4602 + }, + { + "epoch": 2.275367692497837, + "grad_norm": 0.1463215110148236, + "learning_rate": 7.878498101665079e-06, + "loss": 0.5553, + "step": 4603 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 0.14056470872058416, + "learning_rate": 7.874693006263467e-06, + "loss": 0.5738, + "step": 4604 + }, + { + "epoch": 2.2763564454331973, + "grad_norm": 0.14227453084489391, + "learning_rate": 7.870888233109415e-06, + "loss": 0.5755, + "step": 4605 + }, + { + "epoch": 2.2768508219008776, + "grad_norm": 0.1402968443014923, + "learning_rate": 7.867083782779813e-06, + "loss": 0.5378, + "step": 4606 + }, + { + "epoch": 2.2773451983685575, + "grad_norm": 0.13424579337321196, + "learning_rate": 7.86327965585151e-06, + "loss": 0.5417, + "step": 4607 + }, + { + "epoch": 2.277839574836238, + "grad_norm": 0.14593660749630222, + "learning_rate": 7.859475852901298e-06, + "loss": 0.5666, + "step": 4608 + }, + { + "epoch": 2.278333951303918, + "grad_norm": 0.15897054605066177, + "learning_rate": 7.855672374505924e-06, + "loss": 0.5733, + "step": 4609 + }, + { + "epoch": 2.278828327771598, + "grad_norm": 0.13961758046706896, + "learning_rate": 7.851869221242097e-06, + "loss": 0.5775, + "step": 4610 + }, + { + "epoch": 2.279322704239278, + "grad_norm": 0.13923776480707034, + "learning_rate": 7.848066393686457e-06, + "loss": 0.5772, + "step": 4611 + }, + { + "epoch": 2.2798170807069584, + "grad_norm": 0.15409108697667653, + "learning_rate": 7.844263892415608e-06, + "loss": 0.5769, + "step": 4612 + }, + { + "epoch": 2.2803114571746383, + "grad_norm": 0.14420068624336332, + "learning_rate": 7.840461718006098e-06, + "loss": 0.5936, + "step": 4613 + }, + { + "epoch": 2.2808058336423187, + "grad_norm": 0.13125650030708627, + "learning_rate": 7.83665987103443e-06, + "loss": 0.5514, + "step": 4614 + }, + { + "epoch": 2.2813002101099986, + "grad_norm": 0.13901505988729637, + "learning_rate": 7.83285835207705e-06, + "loss": 0.5947, + "step": 4615 + }, + { + "epoch": 2.281794586577679, + "grad_norm": 0.14101795042034243, + "learning_rate": 7.829057161710367e-06, + "loss": 0.5927, + "step": 4616 + }, + { + "epoch": 2.282288963045359, + "grad_norm": 0.13769483050913783, + "learning_rate": 7.825256300510731e-06, + "loss": 0.5685, + "step": 4617 + }, + { + "epoch": 2.2827833395130392, + "grad_norm": 0.13717910774913772, + "learning_rate": 7.82145576905444e-06, + "loss": 0.5234, + "step": 4618 + }, + { + "epoch": 2.283277715980719, + "grad_norm": 0.13136488584880981, + "learning_rate": 7.817655567917747e-06, + "loss": 0.5674, + "step": 4619 + }, + { + "epoch": 2.2837720924483995, + "grad_norm": 0.13748601427152152, + "learning_rate": 7.813855697676856e-06, + "loss": 0.548, + "step": 4620 + }, + { + "epoch": 2.2842664689160794, + "grad_norm": 0.13828721034867553, + "learning_rate": 7.810056158907916e-06, + "loss": 0.5602, + "step": 4621 + }, + { + "epoch": 2.28476084538376, + "grad_norm": 0.14730919904857764, + "learning_rate": 7.80625695218703e-06, + "loss": 0.5498, + "step": 4622 + }, + { + "epoch": 2.2852552218514397, + "grad_norm": 0.13607341735340248, + "learning_rate": 7.80245807809025e-06, + "loss": 0.556, + "step": 4623 + }, + { + "epoch": 2.28574959831912, + "grad_norm": 0.13754166042749671, + "learning_rate": 7.798659537193577e-06, + "loss": 0.581, + "step": 4624 + }, + { + "epoch": 2.2862439747868, + "grad_norm": 0.12998512461857395, + "learning_rate": 7.794861330072956e-06, + "loss": 0.5365, + "step": 4625 + }, + { + "epoch": 2.2867383512544803, + "grad_norm": 0.13400460824193805, + "learning_rate": 7.791063457304287e-06, + "loss": 0.5944, + "step": 4626 + }, + { + "epoch": 2.2872327277221602, + "grad_norm": 0.14460501251903465, + "learning_rate": 7.787265919463424e-06, + "loss": 0.5861, + "step": 4627 + }, + { + "epoch": 2.2877271041898406, + "grad_norm": 0.13202040421023098, + "learning_rate": 7.783468717126162e-06, + "loss": 0.5704, + "step": 4628 + }, + { + "epoch": 2.2882214806575205, + "grad_norm": 0.14157026215691848, + "learning_rate": 7.779671850868248e-06, + "loss": 0.5491, + "step": 4629 + }, + { + "epoch": 2.288715857125201, + "grad_norm": 0.13866918010752644, + "learning_rate": 7.775875321265376e-06, + "loss": 0.5574, + "step": 4630 + }, + { + "epoch": 2.289210233592881, + "grad_norm": 0.1470041813273312, + "learning_rate": 7.772079128893192e-06, + "loss": 0.5808, + "step": 4631 + }, + { + "epoch": 2.289704610060561, + "grad_norm": 0.13158317922307058, + "learning_rate": 7.768283274327295e-06, + "loss": 0.5712, + "step": 4632 + }, + { + "epoch": 2.2901989865282415, + "grad_norm": 0.1457214605729983, + "learning_rate": 7.764487758143224e-06, + "loss": 0.5347, + "step": 4633 + }, + { + "epoch": 2.2906933629959214, + "grad_norm": 0.13162060363180245, + "learning_rate": 7.760692580916467e-06, + "loss": 0.5836, + "step": 4634 + }, + { + "epoch": 2.2911877394636013, + "grad_norm": 0.14176444332136437, + "learning_rate": 7.756897743222468e-06, + "loss": 0.5615, + "step": 4635 + }, + { + "epoch": 2.2916821159312817, + "grad_norm": 0.13898511317726328, + "learning_rate": 7.753103245636614e-06, + "loss": 0.5584, + "step": 4636 + }, + { + "epoch": 2.292176492398962, + "grad_norm": 0.1420251408346584, + "learning_rate": 7.74930908873424e-06, + "loss": 0.5481, + "step": 4637 + }, + { + "epoch": 2.292670868866642, + "grad_norm": 0.13742165034900047, + "learning_rate": 7.745515273090636e-06, + "loss": 0.5678, + "step": 4638 + }, + { + "epoch": 2.293165245334322, + "grad_norm": 0.14081610260590044, + "learning_rate": 7.741721799281033e-06, + "loss": 0.5519, + "step": 4639 + }, + { + "epoch": 2.2936596218020022, + "grad_norm": 0.1421307849634302, + "learning_rate": 7.737928667880616e-06, + "loss": 0.555, + "step": 4640 + }, + { + "epoch": 2.2941539982696826, + "grad_norm": 0.14752163009320474, + "learning_rate": 7.734135879464507e-06, + "loss": 0.5742, + "step": 4641 + }, + { + "epoch": 2.2946483747373625, + "grad_norm": 0.1365292077232775, + "learning_rate": 7.730343434607786e-06, + "loss": 0.589, + "step": 4642 + }, + { + "epoch": 2.2951427512050424, + "grad_norm": 0.13834503261542483, + "learning_rate": 7.726551333885486e-06, + "loss": 0.5734, + "step": 4643 + }, + { + "epoch": 2.2956371276727228, + "grad_norm": 0.1395414371349879, + "learning_rate": 7.722759577872575e-06, + "loss": 0.5959, + "step": 4644 + }, + { + "epoch": 2.296131504140403, + "grad_norm": 0.13661407374770307, + "learning_rate": 7.718968167143972e-06, + "loss": 0.5776, + "step": 4645 + }, + { + "epoch": 2.296625880608083, + "grad_norm": 0.14029181226987422, + "learning_rate": 7.71517710227455e-06, + "loss": 0.5907, + "step": 4646 + }, + { + "epoch": 2.297120257075763, + "grad_norm": 0.14511961242825508, + "learning_rate": 7.711386383839127e-06, + "loss": 0.5435, + "step": 4647 + }, + { + "epoch": 2.2976146335434433, + "grad_norm": 0.13512311890310466, + "learning_rate": 7.707596012412458e-06, + "loss": 0.5359, + "step": 4648 + }, + { + "epoch": 2.2981090100111237, + "grad_norm": 0.13317789342368894, + "learning_rate": 7.703805988569262e-06, + "loss": 0.5336, + "step": 4649 + }, + { + "epoch": 2.2986033864788036, + "grad_norm": 0.14133052954740635, + "learning_rate": 7.7000163128842e-06, + "loss": 0.5959, + "step": 4650 + }, + { + "epoch": 2.299097762946484, + "grad_norm": 0.1354646420801379, + "learning_rate": 7.69622698593187e-06, + "loss": 0.5829, + "step": 4651 + }, + { + "epoch": 2.299592139414164, + "grad_norm": 0.14070469774246683, + "learning_rate": 7.692438008286828e-06, + "loss": 0.5707, + "step": 4652 + }, + { + "epoch": 2.300086515881844, + "grad_norm": 0.1409907700813224, + "learning_rate": 7.688649380523573e-06, + "loss": 0.5431, + "step": 4653 + }, + { + "epoch": 2.300580892349524, + "grad_norm": 0.13441075440062988, + "learning_rate": 7.684861103216558e-06, + "loss": 0.585, + "step": 4654 + }, + { + "epoch": 2.3010752688172045, + "grad_norm": 0.1455356121176148, + "learning_rate": 7.681073176940171e-06, + "loss": 0.5763, + "step": 4655 + }, + { + "epoch": 2.3015696452848844, + "grad_norm": 0.1354918259982736, + "learning_rate": 7.677285602268751e-06, + "loss": 0.5659, + "step": 4656 + }, + { + "epoch": 2.3020640217525647, + "grad_norm": 0.1374461202460602, + "learning_rate": 7.673498379776593e-06, + "loss": 0.6001, + "step": 4657 + }, + { + "epoch": 2.3025583982202447, + "grad_norm": 0.13313606749015341, + "learning_rate": 7.669711510037923e-06, + "loss": 0.5665, + "step": 4658 + }, + { + "epoch": 2.303052774687925, + "grad_norm": 0.13684175852741653, + "learning_rate": 7.665924993626921e-06, + "loss": 0.5656, + "step": 4659 + }, + { + "epoch": 2.303547151155605, + "grad_norm": 0.1471196378016571, + "learning_rate": 7.66213883111772e-06, + "loss": 0.5706, + "step": 4660 + }, + { + "epoch": 2.3040415276232853, + "grad_norm": 0.1379249611736692, + "learning_rate": 7.658353023084388e-06, + "loss": 0.5637, + "step": 4661 + }, + { + "epoch": 2.304535904090965, + "grad_norm": 0.14506253218904214, + "learning_rate": 7.654567570100949e-06, + "loss": 0.5984, + "step": 4662 + }, + { + "epoch": 2.3050302805586456, + "grad_norm": 0.13998256827923602, + "learning_rate": 7.650782472741367e-06, + "loss": 0.572, + "step": 4663 + }, + { + "epoch": 2.3055246570263255, + "grad_norm": 0.14395114834964698, + "learning_rate": 7.646997731579546e-06, + "loss": 0.5498, + "step": 4664 + }, + { + "epoch": 2.306019033494006, + "grad_norm": 0.1409007638143667, + "learning_rate": 7.643213347189356e-06, + "loss": 0.5477, + "step": 4665 + }, + { + "epoch": 2.3065134099616857, + "grad_norm": 0.13735345866662468, + "learning_rate": 7.639429320144594e-06, + "loss": 0.5654, + "step": 4666 + }, + { + "epoch": 2.307007786429366, + "grad_norm": 0.13829399217787916, + "learning_rate": 7.63564565101901e-06, + "loss": 0.5316, + "step": 4667 + }, + { + "epoch": 2.307502162897046, + "grad_norm": 0.13654545005645816, + "learning_rate": 7.631862340386299e-06, + "loss": 0.5942, + "step": 4668 + }, + { + "epoch": 2.3079965393647264, + "grad_norm": 0.13939546718401824, + "learning_rate": 7.628079388820099e-06, + "loss": 0.5703, + "step": 4669 + }, + { + "epoch": 2.3084909158324063, + "grad_norm": 0.14269913704300155, + "learning_rate": 7.624296796894001e-06, + "loss": 0.5933, + "step": 4670 + }, + { + "epoch": 2.3089852923000866, + "grad_norm": 0.1394908881060395, + "learning_rate": 7.620514565181535e-06, + "loss": 0.578, + "step": 4671 + }, + { + "epoch": 2.3094796687677666, + "grad_norm": 0.13872299463505006, + "learning_rate": 7.616732694256178e-06, + "loss": 0.5828, + "step": 4672 + }, + { + "epoch": 2.309974045235447, + "grad_norm": 0.13722410801590612, + "learning_rate": 7.612951184691355e-06, + "loss": 0.5796, + "step": 4673 + }, + { + "epoch": 2.310468421703127, + "grad_norm": 0.1487922534841006, + "learning_rate": 7.609170037060427e-06, + "loss": 0.5698, + "step": 4674 + }, + { + "epoch": 2.310962798170807, + "grad_norm": 0.13847625346182618, + "learning_rate": 7.60538925193671e-06, + "loss": 0.5538, + "step": 4675 + }, + { + "epoch": 2.311457174638487, + "grad_norm": 0.13187223954153024, + "learning_rate": 7.601608829893465e-06, + "loss": 0.5687, + "step": 4676 + }, + { + "epoch": 2.3119515511061675, + "grad_norm": 0.1349427820418832, + "learning_rate": 7.597828771503891e-06, + "loss": 0.5532, + "step": 4677 + }, + { + "epoch": 2.3124459275738474, + "grad_norm": 0.13293024190425715, + "learning_rate": 7.594049077341137e-06, + "loss": 0.5395, + "step": 4678 + }, + { + "epoch": 2.3129403040415277, + "grad_norm": 0.14253431328528052, + "learning_rate": 7.590269747978296e-06, + "loss": 0.5888, + "step": 4679 + }, + { + "epoch": 2.3134346805092076, + "grad_norm": 0.14233555437235693, + "learning_rate": 7.5864907839884005e-06, + "loss": 0.5923, + "step": 4680 + }, + { + "epoch": 2.313929056976888, + "grad_norm": 0.1413083329089622, + "learning_rate": 7.58271218594444e-06, + "loss": 0.5564, + "step": 4681 + }, + { + "epoch": 2.314423433444568, + "grad_norm": 0.1361592692726393, + "learning_rate": 7.578933954419336e-06, + "loss": 0.5972, + "step": 4682 + }, + { + "epoch": 2.3149178099122483, + "grad_norm": 0.14565772604821595, + "learning_rate": 7.57515608998596e-06, + "loss": 0.5354, + "step": 4683 + }, + { + "epoch": 2.315412186379928, + "grad_norm": 0.1377387764490606, + "learning_rate": 7.571378593217125e-06, + "loss": 0.563, + "step": 4684 + }, + { + "epoch": 2.3159065628476085, + "grad_norm": 0.1352252807997654, + "learning_rate": 7.567601464685592e-06, + "loss": 0.5661, + "step": 4685 + }, + { + "epoch": 2.3164009393152885, + "grad_norm": 0.1347091312817379, + "learning_rate": 7.5638247049640626e-06, + "loss": 0.5235, + "step": 4686 + }, + { + "epoch": 2.316895315782969, + "grad_norm": 0.14985710770980815, + "learning_rate": 7.560048314625187e-06, + "loss": 0.5642, + "step": 4687 + }, + { + "epoch": 2.3173896922506487, + "grad_norm": 0.13670445533758027, + "learning_rate": 7.556272294241556e-06, + "loss": 0.5574, + "step": 4688 + }, + { + "epoch": 2.317884068718329, + "grad_norm": 0.1357554300235672, + "learning_rate": 7.552496644385705e-06, + "loss": 0.5845, + "step": 4689 + }, + { + "epoch": 2.318378445186009, + "grad_norm": 0.14563188437360602, + "learning_rate": 7.548721365630112e-06, + "loss": 0.5486, + "step": 4690 + }, + { + "epoch": 2.3188728216536894, + "grad_norm": 0.1340619372899859, + "learning_rate": 7.544946458547195e-06, + "loss": 0.5417, + "step": 4691 + }, + { + "epoch": 2.3193671981213693, + "grad_norm": 0.13789208234683928, + "learning_rate": 7.5411719237093314e-06, + "loss": 0.5784, + "step": 4692 + }, + { + "epoch": 2.3198615745890496, + "grad_norm": 0.13397426486388386, + "learning_rate": 7.537397761688825e-06, + "loss": 0.5756, + "step": 4693 + }, + { + "epoch": 2.3203559510567295, + "grad_norm": 0.1379234352123076, + "learning_rate": 7.53362397305793e-06, + "loss": 0.5324, + "step": 4694 + }, + { + "epoch": 2.32085032752441, + "grad_norm": 0.14137583090280062, + "learning_rate": 7.5298505583888424e-06, + "loss": 0.6004, + "step": 4695 + }, + { + "epoch": 2.32134470399209, + "grad_norm": 0.1419325334132686, + "learning_rate": 7.526077518253706e-06, + "loss": 0.5863, + "step": 4696 + }, + { + "epoch": 2.32183908045977, + "grad_norm": 0.1463485568452939, + "learning_rate": 7.5223048532245955e-06, + "loss": 0.596, + "step": 4697 + }, + { + "epoch": 2.32233345692745, + "grad_norm": 0.13988063855227195, + "learning_rate": 7.518532563873548e-06, + "loss": 0.5733, + "step": 4698 + }, + { + "epoch": 2.3228278333951304, + "grad_norm": 0.13412404424783358, + "learning_rate": 7.51476065077253e-06, + "loss": 0.5994, + "step": 4699 + }, + { + "epoch": 2.3233222098628104, + "grad_norm": 0.14130473986844777, + "learning_rate": 7.5109891144934525e-06, + "loss": 0.5633, + "step": 4700 + }, + { + "epoch": 2.3238165863304907, + "grad_norm": 0.14082426959619107, + "learning_rate": 7.5072179556081696e-06, + "loss": 0.5943, + "step": 4701 + }, + { + "epoch": 2.3243109627981706, + "grad_norm": 0.1372167198518899, + "learning_rate": 7.503447174688479e-06, + "loss": 0.5536, + "step": 4702 + }, + { + "epoch": 2.324805339265851, + "grad_norm": 0.13728613606672718, + "learning_rate": 7.499676772306126e-06, + "loss": 0.5657, + "step": 4703 + }, + { + "epoch": 2.325299715733531, + "grad_norm": 0.13294522382954072, + "learning_rate": 7.495906749032793e-06, + "loss": 0.5686, + "step": 4704 + }, + { + "epoch": 2.3257940922012112, + "grad_norm": 0.13869578996487034, + "learning_rate": 7.492137105440104e-06, + "loss": 0.5468, + "step": 4705 + }, + { + "epoch": 2.326288468668891, + "grad_norm": 0.13861524312477705, + "learning_rate": 7.488367842099631e-06, + "loss": 0.5875, + "step": 4706 + }, + { + "epoch": 2.3267828451365715, + "grad_norm": 0.1391522754852786, + "learning_rate": 7.484598959582879e-06, + "loss": 0.5636, + "step": 4707 + }, + { + "epoch": 2.327277221604252, + "grad_norm": 0.13560475866417704, + "learning_rate": 7.480830458461303e-06, + "loss": 0.5829, + "step": 4708 + }, + { + "epoch": 2.327771598071932, + "grad_norm": 0.13708101934457081, + "learning_rate": 7.477062339306301e-06, + "loss": 0.5491, + "step": 4709 + }, + { + "epoch": 2.3282659745396117, + "grad_norm": 0.14427133849173512, + "learning_rate": 7.473294602689209e-06, + "loss": 0.5643, + "step": 4710 + }, + { + "epoch": 2.328760351007292, + "grad_norm": 0.13636126091015804, + "learning_rate": 7.469527249181307e-06, + "loss": 0.5505, + "step": 4711 + }, + { + "epoch": 2.3292547274749724, + "grad_norm": 0.13226796178038724, + "learning_rate": 7.4657602793538135e-06, + "loss": 0.5464, + "step": 4712 + }, + { + "epoch": 2.3297491039426523, + "grad_norm": 0.14397613583632962, + "learning_rate": 7.461993693777893e-06, + "loss": 0.5571, + "step": 4713 + }, + { + "epoch": 2.3302434804103322, + "grad_norm": 0.1354506419912517, + "learning_rate": 7.458227493024651e-06, + "loss": 0.5879, + "step": 4714 + }, + { + "epoch": 2.3307378568780126, + "grad_norm": 0.14296173890321567, + "learning_rate": 7.454461677665137e-06, + "loss": 0.5512, + "step": 4715 + }, + { + "epoch": 2.331232233345693, + "grad_norm": 0.1369206023500929, + "learning_rate": 7.450696248270333e-06, + "loss": 0.5949, + "step": 4716 + }, + { + "epoch": 2.331726609813373, + "grad_norm": 0.1384265098770094, + "learning_rate": 7.4469312054111695e-06, + "loss": 0.5942, + "step": 4717 + }, + { + "epoch": 2.332220986281053, + "grad_norm": 0.13773663240746598, + "learning_rate": 7.443166549658521e-06, + "loss": 0.6351, + "step": 4718 + }, + { + "epoch": 2.332715362748733, + "grad_norm": 0.13885860128994215, + "learning_rate": 7.4394022815831945e-06, + "loss": 0.5577, + "step": 4719 + }, + { + "epoch": 2.3332097392164135, + "grad_norm": 0.14415674076044133, + "learning_rate": 7.435638401755949e-06, + "loss": 0.5767, + "step": 4720 + }, + { + "epoch": 2.3337041156840934, + "grad_norm": 0.13980091273261286, + "learning_rate": 7.4318749107474776e-06, + "loss": 0.5635, + "step": 4721 + }, + { + "epoch": 2.3341984921517733, + "grad_norm": 0.14073591422635962, + "learning_rate": 7.428111809128415e-06, + "loss": 0.5249, + "step": 4722 + }, + { + "epoch": 2.3346928686194537, + "grad_norm": 0.138995709113919, + "learning_rate": 7.424349097469337e-06, + "loss": 0.5715, + "step": 4723 + }, + { + "epoch": 2.335187245087134, + "grad_norm": 0.14151324277889513, + "learning_rate": 7.420586776340757e-06, + "loss": 0.5777, + "step": 4724 + }, + { + "epoch": 2.335681621554814, + "grad_norm": 0.13201584038718078, + "learning_rate": 7.416824846313142e-06, + "loss": 0.5449, + "step": 4725 + }, + { + "epoch": 2.3361759980224943, + "grad_norm": 0.1407961940826418, + "learning_rate": 7.413063307956887e-06, + "loss": 0.5671, + "step": 4726 + }, + { + "epoch": 2.3366703744901742, + "grad_norm": 0.13959971318975298, + "learning_rate": 7.40930216184233e-06, + "loss": 0.5789, + "step": 4727 + }, + { + "epoch": 2.3371647509578546, + "grad_norm": 0.13765935942574267, + "learning_rate": 7.405541408539752e-06, + "loss": 0.554, + "step": 4728 + }, + { + "epoch": 2.3376591274255345, + "grad_norm": 0.1358985177154898, + "learning_rate": 7.401781048619377e-06, + "loss": 0.5499, + "step": 4729 + }, + { + "epoch": 2.338153503893215, + "grad_norm": 0.13104575178466307, + "learning_rate": 7.398021082651354e-06, + "loss": 0.5774, + "step": 4730 + }, + { + "epoch": 2.3386478803608948, + "grad_norm": 0.14827927115140618, + "learning_rate": 7.394261511205798e-06, + "loss": 0.556, + "step": 4731 + }, + { + "epoch": 2.339142256828575, + "grad_norm": 0.13319842531284237, + "learning_rate": 7.390502334852747e-06, + "loss": 0.5711, + "step": 4732 + }, + { + "epoch": 2.339636633296255, + "grad_norm": 0.13168186111176125, + "learning_rate": 7.386743554162179e-06, + "loss": 0.5726, + "step": 4733 + }, + { + "epoch": 2.3401310097639354, + "grad_norm": 0.1421090638541728, + "learning_rate": 7.382985169704016e-06, + "loss": 0.5851, + "step": 4734 + }, + { + "epoch": 2.3406253862316153, + "grad_norm": 0.13487415711728842, + "learning_rate": 7.379227182048117e-06, + "loss": 0.5434, + "step": 4735 + }, + { + "epoch": 2.3411197626992957, + "grad_norm": 0.1407256074417287, + "learning_rate": 7.375469591764288e-06, + "loss": 0.5723, + "step": 4736 + }, + { + "epoch": 2.3416141391669756, + "grad_norm": 0.13630497604936143, + "learning_rate": 7.371712399422269e-06, + "loss": 0.5744, + "step": 4737 + }, + { + "epoch": 2.342108515634656, + "grad_norm": 0.1438766006471117, + "learning_rate": 7.367955605591739e-06, + "loss": 0.5868, + "step": 4738 + }, + { + "epoch": 2.342602892102336, + "grad_norm": 0.37770707222519473, + "learning_rate": 7.36419921084232e-06, + "loss": 0.5784, + "step": 4739 + }, + { + "epoch": 2.343097268570016, + "grad_norm": 0.13701820954110497, + "learning_rate": 7.360443215743565e-06, + "loss": 0.6153, + "step": 4740 + }, + { + "epoch": 2.343591645037696, + "grad_norm": 0.1458189473530566, + "learning_rate": 7.356687620864984e-06, + "loss": 0.5771, + "step": 4741 + }, + { + "epoch": 2.3440860215053765, + "grad_norm": 0.14147038327679573, + "learning_rate": 7.352932426776008e-06, + "loss": 0.5966, + "step": 4742 + }, + { + "epoch": 2.3445803979730564, + "grad_norm": 0.14209859158046662, + "learning_rate": 7.349177634046014e-06, + "loss": 0.5869, + "step": 4743 + }, + { + "epoch": 2.3450747744407368, + "grad_norm": 0.14699975167553275, + "learning_rate": 7.345423243244323e-06, + "loss": 0.5848, + "step": 4744 + }, + { + "epoch": 2.3455691509084167, + "grad_norm": 0.14143361540908034, + "learning_rate": 7.3416692549401905e-06, + "loss": 0.537, + "step": 4745 + }, + { + "epoch": 2.346063527376097, + "grad_norm": 0.14159274518475434, + "learning_rate": 7.337915669702802e-06, + "loss": 0.5832, + "step": 4746 + }, + { + "epoch": 2.346557903843777, + "grad_norm": 0.14032381536649974, + "learning_rate": 7.334162488101303e-06, + "loss": 0.6146, + "step": 4747 + }, + { + "epoch": 2.3470522803114573, + "grad_norm": 0.14554097563385363, + "learning_rate": 7.330409710704764e-06, + "loss": 0.567, + "step": 4748 + }, + { + "epoch": 2.347546656779137, + "grad_norm": 0.1469778073435606, + "learning_rate": 7.326657338082191e-06, + "loss": 0.5712, + "step": 4749 + }, + { + "epoch": 2.3480410332468176, + "grad_norm": 0.14023154706483865, + "learning_rate": 7.322905370802535e-06, + "loss": 0.5595, + "step": 4750 + }, + { + "epoch": 2.3485354097144975, + "grad_norm": 0.13616353728002603, + "learning_rate": 7.319153809434684e-06, + "loss": 0.5736, + "step": 4751 + }, + { + "epoch": 2.349029786182178, + "grad_norm": 0.13647410878096386, + "learning_rate": 7.3154026545474696e-06, + "loss": 0.5401, + "step": 4752 + }, + { + "epoch": 2.3495241626498578, + "grad_norm": 0.13567790520422923, + "learning_rate": 7.311651906709654e-06, + "loss": 0.5808, + "step": 4753 + }, + { + "epoch": 2.350018539117538, + "grad_norm": 0.13757327283150092, + "learning_rate": 7.307901566489939e-06, + "loss": 0.5645, + "step": 4754 + }, + { + "epoch": 2.350512915585218, + "grad_norm": 0.15898444887308533, + "learning_rate": 7.30415163445697e-06, + "loss": 0.5707, + "step": 4755 + }, + { + "epoch": 2.3510072920528984, + "grad_norm": 0.14195444929002554, + "learning_rate": 7.300402111179321e-06, + "loss": 0.5877, + "step": 4756 + }, + { + "epoch": 2.3515016685205783, + "grad_norm": 0.1439342928377243, + "learning_rate": 7.296652997225512e-06, + "loss": 0.5909, + "step": 4757 + }, + { + "epoch": 2.3519960449882586, + "grad_norm": 0.14232749778837714, + "learning_rate": 7.292904293164e-06, + "loss": 0.58, + "step": 4758 + }, + { + "epoch": 2.3524904214559386, + "grad_norm": 0.13039421404931814, + "learning_rate": 7.28915599956318e-06, + "loss": 0.5777, + "step": 4759 + }, + { + "epoch": 2.352984797923619, + "grad_norm": 0.13167958015786937, + "learning_rate": 7.285408116991382e-06, + "loss": 0.5441, + "step": 4760 + }, + { + "epoch": 2.353479174391299, + "grad_norm": 0.1334497625195665, + "learning_rate": 7.281660646016873e-06, + "loss": 0.588, + "step": 4761 + }, + { + "epoch": 2.353973550858979, + "grad_norm": 0.13914480889810182, + "learning_rate": 7.277913587207857e-06, + "loss": 0.544, + "step": 4762 + }, + { + "epoch": 2.354467927326659, + "grad_norm": 0.13601622624238033, + "learning_rate": 7.274166941132485e-06, + "loss": 0.6224, + "step": 4763 + }, + { + "epoch": 2.3549623037943395, + "grad_norm": 0.1464718612160722, + "learning_rate": 7.27042070835884e-06, + "loss": 0.5901, + "step": 4764 + }, + { + "epoch": 2.3554566802620194, + "grad_norm": 0.13561502283485843, + "learning_rate": 7.266674889454932e-06, + "loss": 0.5457, + "step": 4765 + }, + { + "epoch": 2.3559510567296997, + "grad_norm": 0.1373119127969007, + "learning_rate": 7.262929484988721e-06, + "loss": 0.5685, + "step": 4766 + }, + { + "epoch": 2.3564454331973796, + "grad_norm": 0.13330760330099106, + "learning_rate": 7.259184495528102e-06, + "loss": 0.5741, + "step": 4767 + }, + { + "epoch": 2.35693980966506, + "grad_norm": 0.13648503171440313, + "learning_rate": 7.255439921640901e-06, + "loss": 0.5569, + "step": 4768 + }, + { + "epoch": 2.35743418613274, + "grad_norm": 0.14176059017078582, + "learning_rate": 7.251695763894889e-06, + "loss": 0.5758, + "step": 4769 + }, + { + "epoch": 2.3579285626004203, + "grad_norm": 0.12812338788296634, + "learning_rate": 7.2479520228577705e-06, + "loss": 0.5707, + "step": 4770 + }, + { + "epoch": 2.3584229390681, + "grad_norm": 0.13607668272630533, + "learning_rate": 7.244208699097187e-06, + "loss": 0.583, + "step": 4771 + }, + { + "epoch": 2.3589173155357805, + "grad_norm": 0.14074506692398836, + "learning_rate": 7.240465793180713e-06, + "loss": 0.6245, + "step": 4772 + }, + { + "epoch": 2.3594116920034605, + "grad_norm": 0.14143530507691968, + "learning_rate": 7.236723305675859e-06, + "loss": 0.5295, + "step": 4773 + }, + { + "epoch": 2.359906068471141, + "grad_norm": 0.13028060385797233, + "learning_rate": 7.232981237150089e-06, + "loss": 0.548, + "step": 4774 + }, + { + "epoch": 2.3604004449388207, + "grad_norm": 0.13214193931416884, + "learning_rate": 7.22923958817078e-06, + "loss": 0.5718, + "step": 4775 + }, + { + "epoch": 2.360894821406501, + "grad_norm": 0.13668776265950436, + "learning_rate": 7.225498359305257e-06, + "loss": 0.5563, + "step": 4776 + }, + { + "epoch": 2.361389197874181, + "grad_norm": 0.13162095208096147, + "learning_rate": 7.221757551120783e-06, + "loss": 0.5469, + "step": 4777 + }, + { + "epoch": 2.3618835743418614, + "grad_norm": 0.14049658639379267, + "learning_rate": 7.218017164184557e-06, + "loss": 0.5712, + "step": 4778 + }, + { + "epoch": 2.3623779508095413, + "grad_norm": 0.1316063734587211, + "learning_rate": 7.214277199063697e-06, + "loss": 0.5418, + "step": 4779 + }, + { + "epoch": 2.3628723272772216, + "grad_norm": 0.13410844047111448, + "learning_rate": 7.2105376563252895e-06, + "loss": 0.577, + "step": 4780 + }, + { + "epoch": 2.3633667037449015, + "grad_norm": 0.14632641666422633, + "learning_rate": 7.206798536536333e-06, + "loss": 0.5865, + "step": 4781 + }, + { + "epoch": 2.363861080212582, + "grad_norm": 0.13677121114183272, + "learning_rate": 7.2030598402637615e-06, + "loss": 0.5751, + "step": 4782 + }, + { + "epoch": 2.3643554566802623, + "grad_norm": 0.13933334583581547, + "learning_rate": 7.199321568074458e-06, + "loss": 0.549, + "step": 4783 + }, + { + "epoch": 2.364849833147942, + "grad_norm": 0.13002358760895183, + "learning_rate": 7.1955837205352295e-06, + "loss": 0.5818, + "step": 4784 + }, + { + "epoch": 2.365344209615622, + "grad_norm": 0.14021650925965706, + "learning_rate": 7.1918462982128275e-06, + "loss": 0.5552, + "step": 4785 + }, + { + "epoch": 2.3658385860833024, + "grad_norm": 0.13804086324722398, + "learning_rate": 7.188109301673935e-06, + "loss": 0.5702, + "step": 4786 + }, + { + "epoch": 2.366332962550983, + "grad_norm": 0.13595179553517714, + "learning_rate": 7.184372731485167e-06, + "loss": 0.5426, + "step": 4787 + }, + { + "epoch": 2.3668273390186627, + "grad_norm": 0.13342839026259307, + "learning_rate": 7.180636588213083e-06, + "loss": 0.5407, + "step": 4788 + }, + { + "epoch": 2.3673217154863426, + "grad_norm": 0.1340087475197535, + "learning_rate": 7.176900872424164e-06, + "loss": 0.5323, + "step": 4789 + }, + { + "epoch": 2.367816091954023, + "grad_norm": 0.14683257146255882, + "learning_rate": 7.173165584684836e-06, + "loss": 0.6128, + "step": 4790 + }, + { + "epoch": 2.3683104684217033, + "grad_norm": 0.14657042680338672, + "learning_rate": 7.169430725561463e-06, + "loss": 0.5734, + "step": 4791 + }, + { + "epoch": 2.3688048448893833, + "grad_norm": 0.14697649810918836, + "learning_rate": 7.165696295620338e-06, + "loss": 0.5486, + "step": 4792 + }, + { + "epoch": 2.369299221357063, + "grad_norm": 0.14062560745997446, + "learning_rate": 7.161962295427688e-06, + "loss": 0.5746, + "step": 4793 + }, + { + "epoch": 2.3697935978247435, + "grad_norm": 0.14661792389425424, + "learning_rate": 7.158228725549679e-06, + "loss": 0.578, + "step": 4794 + }, + { + "epoch": 2.370287974292424, + "grad_norm": 0.13992746060746164, + "learning_rate": 7.154495586552405e-06, + "loss": 0.6063, + "step": 4795 + }, + { + "epoch": 2.370782350760104, + "grad_norm": 0.13751146133022454, + "learning_rate": 7.150762879001906e-06, + "loss": 0.5534, + "step": 4796 + }, + { + "epoch": 2.3712767272277837, + "grad_norm": 0.14006681874513574, + "learning_rate": 7.147030603464149e-06, + "loss": 0.5284, + "step": 4797 + }, + { + "epoch": 2.371771103695464, + "grad_norm": 0.23727685084619599, + "learning_rate": 7.1432987605050345e-06, + "loss": 0.6393, + "step": 4798 + }, + { + "epoch": 2.3722654801631444, + "grad_norm": 0.146060517788893, + "learning_rate": 7.1395673506903985e-06, + "loss": 0.6151, + "step": 4799 + }, + { + "epoch": 2.3727598566308243, + "grad_norm": 0.14458684510465813, + "learning_rate": 7.135836374586013e-06, + "loss": 0.5628, + "step": 4800 + }, + { + "epoch": 2.3732542330985047, + "grad_norm": 0.13418996163520272, + "learning_rate": 7.132105832757585e-06, + "loss": 0.5609, + "step": 4801 + }, + { + "epoch": 2.3737486095661846, + "grad_norm": 0.13433847915569297, + "learning_rate": 7.128375725770753e-06, + "loss": 0.5676, + "step": 4802 + }, + { + "epoch": 2.374242986033865, + "grad_norm": 0.13123512360906886, + "learning_rate": 7.124646054191093e-06, + "loss": 0.5561, + "step": 4803 + }, + { + "epoch": 2.374737362501545, + "grad_norm": 0.13310218471357624, + "learning_rate": 7.120916818584112e-06, + "loss": 0.5441, + "step": 4804 + }, + { + "epoch": 2.3752317389692252, + "grad_norm": 0.14070979962086383, + "learning_rate": 7.1171880195152485e-06, + "loss": 0.5803, + "step": 4805 + }, + { + "epoch": 2.375726115436905, + "grad_norm": 0.13682739827800625, + "learning_rate": 7.113459657549876e-06, + "loss": 0.58, + "step": 4806 + }, + { + "epoch": 2.3762204919045855, + "grad_norm": 0.1369318400082183, + "learning_rate": 7.109731733253313e-06, + "loss": 0.5739, + "step": 4807 + }, + { + "epoch": 2.3767148683722654, + "grad_norm": 0.131082103762803, + "learning_rate": 7.106004247190797e-06, + "loss": 0.5577, + "step": 4808 + }, + { + "epoch": 2.377209244839946, + "grad_norm": 0.13598756955187602, + "learning_rate": 7.102277199927503e-06, + "loss": 0.5568, + "step": 4809 + }, + { + "epoch": 2.3777036213076257, + "grad_norm": 0.5773522110460195, + "learning_rate": 7.098550592028542e-06, + "loss": 0.6768, + "step": 4810 + }, + { + "epoch": 2.378197997775306, + "grad_norm": 0.13372919599579902, + "learning_rate": 7.09482442405896e-06, + "loss": 0.5378, + "step": 4811 + }, + { + "epoch": 2.378692374242986, + "grad_norm": 0.14048124934448278, + "learning_rate": 7.091098696583724e-06, + "loss": 0.5589, + "step": 4812 + }, + { + "epoch": 2.3791867507106663, + "grad_norm": 0.13950621494196172, + "learning_rate": 7.087373410167757e-06, + "loss": 0.554, + "step": 4813 + }, + { + "epoch": 2.3796811271783462, + "grad_norm": 0.13555665674089193, + "learning_rate": 7.0836485653758956e-06, + "loss": 0.5472, + "step": 4814 + }, + { + "epoch": 2.3801755036460266, + "grad_norm": 0.13794235106230404, + "learning_rate": 7.079924162772913e-06, + "loss": 0.5674, + "step": 4815 + }, + { + "epoch": 2.3806698801137065, + "grad_norm": 0.14031901762390536, + "learning_rate": 7.076200202923522e-06, + "loss": 0.6262, + "step": 4816 + }, + { + "epoch": 2.381164256581387, + "grad_norm": 0.14029020051911567, + "learning_rate": 7.07247668639236e-06, + "loss": 0.5881, + "step": 4817 + }, + { + "epoch": 2.3816586330490668, + "grad_norm": 0.13763627194447026, + "learning_rate": 7.068753613744006e-06, + "loss": 0.5902, + "step": 4818 + }, + { + "epoch": 2.382153009516747, + "grad_norm": 0.1329622996598588, + "learning_rate": 7.065030985542967e-06, + "loss": 0.5401, + "step": 4819 + }, + { + "epoch": 2.382647385984427, + "grad_norm": 0.1319120213173194, + "learning_rate": 7.061308802353683e-06, + "loss": 0.5271, + "step": 4820 + }, + { + "epoch": 2.3831417624521074, + "grad_norm": 0.13233406084971164, + "learning_rate": 7.057587064740521e-06, + "loss": 0.5889, + "step": 4821 + }, + { + "epoch": 2.3836361389197873, + "grad_norm": 0.1374500506672673, + "learning_rate": 7.0538657732677875e-06, + "loss": 0.5576, + "step": 4822 + }, + { + "epoch": 2.3841305153874677, + "grad_norm": 0.13579308636100812, + "learning_rate": 7.050144928499727e-06, + "loss": 0.6042, + "step": 4823 + }, + { + "epoch": 2.3846248918551476, + "grad_norm": 0.1420436727910456, + "learning_rate": 7.0464245310005e-06, + "loss": 0.5763, + "step": 4824 + }, + { + "epoch": 2.385119268322828, + "grad_norm": 0.12989523576839346, + "learning_rate": 7.042704581334212e-06, + "loss": 0.5546, + "step": 4825 + }, + { + "epoch": 2.385613644790508, + "grad_norm": 0.1369513541117079, + "learning_rate": 7.038985080064897e-06, + "loss": 0.5914, + "step": 4826 + }, + { + "epoch": 2.386108021258188, + "grad_norm": 0.13876830634146983, + "learning_rate": 7.035266027756522e-06, + "loss": 0.5939, + "step": 4827 + }, + { + "epoch": 2.386602397725868, + "grad_norm": 0.14086247987597264, + "learning_rate": 7.031547424972975e-06, + "loss": 0.5863, + "step": 4828 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.14196921956004208, + "learning_rate": 7.0278292722781e-06, + "loss": 0.5605, + "step": 4829 + }, + { + "epoch": 2.3875911506612284, + "grad_norm": 0.13591225791342842, + "learning_rate": 7.02411157023565e-06, + "loss": 0.5359, + "step": 4830 + }, + { + "epoch": 2.3880855271289088, + "grad_norm": 0.13110829807870414, + "learning_rate": 7.0203943194093185e-06, + "loss": 0.5541, + "step": 4831 + }, + { + "epoch": 2.3885799035965887, + "grad_norm": 0.14380653735683965, + "learning_rate": 7.016677520362729e-06, + "loss": 0.5814, + "step": 4832 + }, + { + "epoch": 2.389074280064269, + "grad_norm": 0.14072553180828992, + "learning_rate": 7.012961173659437e-06, + "loss": 0.5689, + "step": 4833 + }, + { + "epoch": 2.389568656531949, + "grad_norm": 0.14361679223139984, + "learning_rate": 7.009245279862934e-06, + "loss": 0.5874, + "step": 4834 + }, + { + "epoch": 2.3900630329996293, + "grad_norm": 0.14004240245593808, + "learning_rate": 7.0055298395366365e-06, + "loss": 0.5997, + "step": 4835 + }, + { + "epoch": 2.390557409467309, + "grad_norm": 0.14038289186471775, + "learning_rate": 7.0018148532438955e-06, + "loss": 0.6095, + "step": 4836 + }, + { + "epoch": 2.3910517859349896, + "grad_norm": 0.1441451299945719, + "learning_rate": 6.998100321547991e-06, + "loss": 0.604, + "step": 4837 + }, + { + "epoch": 2.3915461624026695, + "grad_norm": 0.13956843046363512, + "learning_rate": 6.994386245012135e-06, + "loss": 0.5553, + "step": 4838 + }, + { + "epoch": 2.39204053887035, + "grad_norm": 0.139089960597512, + "learning_rate": 6.990672624199467e-06, + "loss": 0.5523, + "step": 4839 + }, + { + "epoch": 2.3925349153380298, + "grad_norm": 0.13757539028962784, + "learning_rate": 6.986959459673068e-06, + "loss": 0.5716, + "step": 4840 + }, + { + "epoch": 2.39302929180571, + "grad_norm": 0.12750792524334423, + "learning_rate": 6.983246751995939e-06, + "loss": 0.5492, + "step": 4841 + }, + { + "epoch": 2.39352366827339, + "grad_norm": 0.13829837866023248, + "learning_rate": 6.979534501731017e-06, + "loss": 0.5628, + "step": 4842 + }, + { + "epoch": 2.3940180447410704, + "grad_norm": 0.14117324656721775, + "learning_rate": 6.975822709441166e-06, + "loss": 0.5868, + "step": 4843 + }, + { + "epoch": 2.3945124212087503, + "grad_norm": 0.13972051340047903, + "learning_rate": 6.972111375689183e-06, + "loss": 0.5556, + "step": 4844 + }, + { + "epoch": 2.3950067976764307, + "grad_norm": 0.13313358381188725, + "learning_rate": 6.9684005010378e-06, + "loss": 0.5258, + "step": 4845 + }, + { + "epoch": 2.3955011741441106, + "grad_norm": 0.1342424342719155, + "learning_rate": 6.964690086049673e-06, + "loss": 0.6051, + "step": 4846 + }, + { + "epoch": 2.395995550611791, + "grad_norm": 0.1332076890573369, + "learning_rate": 6.960980131287385e-06, + "loss": 0.5603, + "step": 4847 + }, + { + "epoch": 2.396489927079471, + "grad_norm": 0.1455953727491397, + "learning_rate": 6.957270637313458e-06, + "loss": 0.593, + "step": 4848 + }, + { + "epoch": 2.396984303547151, + "grad_norm": 0.13269769240336426, + "learning_rate": 6.95356160469034e-06, + "loss": 0.5618, + "step": 4849 + }, + { + "epoch": 2.397478680014831, + "grad_norm": 0.13493770463986907, + "learning_rate": 6.949853033980407e-06, + "loss": 0.5917, + "step": 4850 + }, + { + "epoch": 2.3979730564825115, + "grad_norm": 0.13605552647992872, + "learning_rate": 6.946144925745972e-06, + "loss": 0.5378, + "step": 4851 + }, + { + "epoch": 2.3984674329501914, + "grad_norm": 0.13286002733556246, + "learning_rate": 6.94243728054927e-06, + "loss": 0.5641, + "step": 4852 + }, + { + "epoch": 2.3989618094178717, + "grad_norm": 0.13904030450695495, + "learning_rate": 6.938730098952473e-06, + "loss": 0.5764, + "step": 4853 + }, + { + "epoch": 2.399456185885552, + "grad_norm": 0.13166349016980997, + "learning_rate": 6.935023381517672e-06, + "loss": 0.5727, + "step": 4854 + }, + { + "epoch": 2.399950562353232, + "grad_norm": 0.1352666763561264, + "learning_rate": 6.931317128806895e-06, + "loss": 0.5758, + "step": 4855 + }, + { + "epoch": 2.400444938820912, + "grad_norm": 0.1370815465623046, + "learning_rate": 6.9276113413821075e-06, + "loss": 0.5406, + "step": 4856 + }, + { + "epoch": 2.4009393152885923, + "grad_norm": 0.1345931545030555, + "learning_rate": 6.923906019805187e-06, + "loss": 0.5871, + "step": 4857 + }, + { + "epoch": 2.4014336917562726, + "grad_norm": 0.14378529033692214, + "learning_rate": 6.920201164637953e-06, + "loss": 0.5925, + "step": 4858 + }, + { + "epoch": 2.4019280682239526, + "grad_norm": 0.13240735203289017, + "learning_rate": 6.9164967764421494e-06, + "loss": 0.5478, + "step": 4859 + }, + { + "epoch": 2.4024224446916325, + "grad_norm": 0.14226966336103336, + "learning_rate": 6.912792855779453e-06, + "loss": 0.5641, + "step": 4860 + }, + { + "epoch": 2.402916821159313, + "grad_norm": 0.13826987594638257, + "learning_rate": 6.909089403211459e-06, + "loss": 0.5905, + "step": 4861 + }, + { + "epoch": 2.403411197626993, + "grad_norm": 0.13006140471445057, + "learning_rate": 6.905386419299709e-06, + "loss": 0.5861, + "step": 4862 + }, + { + "epoch": 2.403905574094673, + "grad_norm": 0.1407856633452731, + "learning_rate": 6.901683904605663e-06, + "loss": 0.5843, + "step": 4863 + }, + { + "epoch": 2.404399950562353, + "grad_norm": 0.13684556020321995, + "learning_rate": 6.897981859690706e-06, + "loss": 0.5763, + "step": 4864 + }, + { + "epoch": 2.4048943270300334, + "grad_norm": 0.14879966563802635, + "learning_rate": 6.894280285116159e-06, + "loss": 0.5582, + "step": 4865 + }, + { + "epoch": 2.4053887034977137, + "grad_norm": 0.1347196702728685, + "learning_rate": 6.89057918144327e-06, + "loss": 0.5365, + "step": 4866 + }, + { + "epoch": 2.4058830799653936, + "grad_norm": 0.14335652379744104, + "learning_rate": 6.886878549233215e-06, + "loss": 0.5553, + "step": 4867 + }, + { + "epoch": 2.4063774564330735, + "grad_norm": 0.14048098188069588, + "learning_rate": 6.8831783890471025e-06, + "loss": 0.5866, + "step": 4868 + }, + { + "epoch": 2.406871832900754, + "grad_norm": 0.13640071201631446, + "learning_rate": 6.879478701445961e-06, + "loss": 0.5523, + "step": 4869 + }, + { + "epoch": 2.4073662093684343, + "grad_norm": 0.14200793344414217, + "learning_rate": 6.875779486990754e-06, + "loss": 0.579, + "step": 4870 + }, + { + "epoch": 2.407860585836114, + "grad_norm": 0.13543115688377766, + "learning_rate": 6.872080746242369e-06, + "loss": 0.5792, + "step": 4871 + }, + { + "epoch": 2.4083549623037945, + "grad_norm": 0.15298375478037782, + "learning_rate": 6.868382479761621e-06, + "loss": 0.5365, + "step": 4872 + }, + { + "epoch": 2.4088493387714744, + "grad_norm": 0.13682285092648594, + "learning_rate": 6.864684688109266e-06, + "loss": 0.5703, + "step": 4873 + }, + { + "epoch": 2.409343715239155, + "grad_norm": 0.15031341925887698, + "learning_rate": 6.86098737184597e-06, + "loss": 0.5496, + "step": 4874 + }, + { + "epoch": 2.4098380917068347, + "grad_norm": 0.13478036501921203, + "learning_rate": 6.8572905315323365e-06, + "loss": 0.5783, + "step": 4875 + }, + { + "epoch": 2.410332468174515, + "grad_norm": 0.14147854807444757, + "learning_rate": 6.853594167728896e-06, + "loss": 0.534, + "step": 4876 + }, + { + "epoch": 2.410826844642195, + "grad_norm": 0.13798929122499257, + "learning_rate": 6.849898280996106e-06, + "loss": 0.5607, + "step": 4877 + }, + { + "epoch": 2.4113212211098753, + "grad_norm": 0.14401112498511698, + "learning_rate": 6.8462028718943505e-06, + "loss": 0.613, + "step": 4878 + }, + { + "epoch": 2.4118155975775553, + "grad_norm": 0.1436368280714619, + "learning_rate": 6.842507940983947e-06, + "loss": 0.5472, + "step": 4879 + }, + { + "epoch": 2.4123099740452356, + "grad_norm": 0.1548027282511389, + "learning_rate": 6.838813488825129e-06, + "loss": 0.582, + "step": 4880 + }, + { + "epoch": 2.4128043505129155, + "grad_norm": 0.13825039400744535, + "learning_rate": 6.835119515978067e-06, + "loss": 0.5541, + "step": 4881 + }, + { + "epoch": 2.413298726980596, + "grad_norm": 0.1334495490766065, + "learning_rate": 6.831426023002856e-06, + "loss": 0.5518, + "step": 4882 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 0.13884911526478136, + "learning_rate": 6.827733010459516e-06, + "loss": 0.5273, + "step": 4883 + }, + { + "epoch": 2.414287479915956, + "grad_norm": 0.14612610444259116, + "learning_rate": 6.8240404789080006e-06, + "loss": 0.5881, + "step": 4884 + }, + { + "epoch": 2.414781856383636, + "grad_norm": 0.1425297790118483, + "learning_rate": 6.820348428908183e-06, + "loss": 0.5969, + "step": 4885 + }, + { + "epoch": 2.4152762328513164, + "grad_norm": 0.13718914565527496, + "learning_rate": 6.816656861019871e-06, + "loss": 0.5591, + "step": 4886 + }, + { + "epoch": 2.4157706093189963, + "grad_norm": 0.1405019797044762, + "learning_rate": 6.812965775802789e-06, + "loss": 0.539, + "step": 4887 + }, + { + "epoch": 2.4162649857866767, + "grad_norm": 0.1346063163950698, + "learning_rate": 6.809275173816594e-06, + "loss": 0.5873, + "step": 4888 + }, + { + "epoch": 2.4167593622543566, + "grad_norm": 0.14346949986438565, + "learning_rate": 6.805585055620877e-06, + "loss": 0.6022, + "step": 4889 + }, + { + "epoch": 2.417253738722037, + "grad_norm": 0.14659862954915948, + "learning_rate": 6.801895421775142e-06, + "loss": 0.5554, + "step": 4890 + }, + { + "epoch": 2.417748115189717, + "grad_norm": 0.13667998509971066, + "learning_rate": 6.79820627283883e-06, + "loss": 0.5806, + "step": 4891 + }, + { + "epoch": 2.4182424916573972, + "grad_norm": 0.13520663435219132, + "learning_rate": 6.794517609371301e-06, + "loss": 0.5758, + "step": 4892 + }, + { + "epoch": 2.418736868125077, + "grad_norm": 0.14003971047017783, + "learning_rate": 6.790829431931848e-06, + "loss": 0.5635, + "step": 4893 + }, + { + "epoch": 2.4192312445927575, + "grad_norm": 0.1391321387301704, + "learning_rate": 6.78714174107968e-06, + "loss": 0.5664, + "step": 4894 + }, + { + "epoch": 2.4197256210604374, + "grad_norm": 0.13603697934724995, + "learning_rate": 6.78345453737395e-06, + "loss": 0.5612, + "step": 4895 + }, + { + "epoch": 2.420219997528118, + "grad_norm": 0.13633174445579255, + "learning_rate": 6.7797678213737236e-06, + "loss": 0.5877, + "step": 4896 + }, + { + "epoch": 2.4207143739957977, + "grad_norm": 0.13879555556419182, + "learning_rate": 6.776081593637992e-06, + "loss": 0.5405, + "step": 4897 + }, + { + "epoch": 2.421208750463478, + "grad_norm": 0.13406990212313674, + "learning_rate": 6.772395854725677e-06, + "loss": 0.5686, + "step": 4898 + }, + { + "epoch": 2.421703126931158, + "grad_norm": 0.1354419118985812, + "learning_rate": 6.768710605195624e-06, + "loss": 0.5265, + "step": 4899 + }, + { + "epoch": 2.4221975033988383, + "grad_norm": 0.13617760242181084, + "learning_rate": 6.765025845606609e-06, + "loss": 0.5606, + "step": 4900 + }, + { + "epoch": 2.4226918798665182, + "grad_norm": 0.13655469249509938, + "learning_rate": 6.761341576517326e-06, + "loss": 0.5261, + "step": 4901 + }, + { + "epoch": 2.4231862563341986, + "grad_norm": 0.13758873421588752, + "learning_rate": 6.757657798486405e-06, + "loss": 0.5786, + "step": 4902 + }, + { + "epoch": 2.4236806328018785, + "grad_norm": 0.13533256966396723, + "learning_rate": 6.753974512072387e-06, + "loss": 0.5561, + "step": 4903 + }, + { + "epoch": 2.424175009269559, + "grad_norm": 0.14195799241452617, + "learning_rate": 6.750291717833748e-06, + "loss": 0.5926, + "step": 4904 + }, + { + "epoch": 2.424669385737239, + "grad_norm": 0.14067065945855645, + "learning_rate": 6.7466094163288955e-06, + "loss": 0.5971, + "step": 4905 + }, + { + "epoch": 2.425163762204919, + "grad_norm": 0.1392991577186857, + "learning_rate": 6.7429276081161465e-06, + "loss": 0.5795, + "step": 4906 + }, + { + "epoch": 2.425658138672599, + "grad_norm": 0.14336918938337098, + "learning_rate": 6.739246293753756e-06, + "loss": 0.5778, + "step": 4907 + }, + { + "epoch": 2.4261525151402794, + "grad_norm": 0.13575765880801624, + "learning_rate": 6.735565473799896e-06, + "loss": 0.5569, + "step": 4908 + }, + { + "epoch": 2.4266468916079593, + "grad_norm": 0.1484415522833821, + "learning_rate": 6.731885148812674e-06, + "loss": 0.5396, + "step": 4909 + }, + { + "epoch": 2.4271412680756397, + "grad_norm": 0.13247934852124738, + "learning_rate": 6.728205319350104e-06, + "loss": 0.5371, + "step": 4910 + }, + { + "epoch": 2.4276356445433196, + "grad_norm": 0.13417928557998648, + "learning_rate": 6.724525985970147e-06, + "loss": 0.5978, + "step": 4911 + }, + { + "epoch": 2.428130021011, + "grad_norm": 0.1472588316348362, + "learning_rate": 6.720847149230678e-06, + "loss": 0.5552, + "step": 4912 + }, + { + "epoch": 2.42862439747868, + "grad_norm": 0.14049351221930031, + "learning_rate": 6.717168809689491e-06, + "loss": 0.5771, + "step": 4913 + }, + { + "epoch": 2.42911877394636, + "grad_norm": 0.13863388215543238, + "learning_rate": 6.713490967904313e-06, + "loss": 0.5529, + "step": 4914 + }, + { + "epoch": 2.42961315041404, + "grad_norm": 0.1339558668294605, + "learning_rate": 6.7098136244327915e-06, + "loss": 0.6021, + "step": 4915 + }, + { + "epoch": 2.4301075268817205, + "grad_norm": 0.14306990796239583, + "learning_rate": 6.7061367798325035e-06, + "loss": 0.5646, + "step": 4916 + }, + { + "epoch": 2.4306019033494004, + "grad_norm": 0.14934616495769348, + "learning_rate": 6.702460434660947e-06, + "loss": 0.5863, + "step": 4917 + }, + { + "epoch": 2.4310962798170808, + "grad_norm": 0.13655113846246172, + "learning_rate": 6.6987845894755396e-06, + "loss": 0.5568, + "step": 4918 + }, + { + "epoch": 2.4315906562847607, + "grad_norm": 0.13950933094323506, + "learning_rate": 6.695109244833635e-06, + "loss": 0.5449, + "step": 4919 + }, + { + "epoch": 2.432085032752441, + "grad_norm": 0.14442583099418468, + "learning_rate": 6.691434401292497e-06, + "loss": 0.559, + "step": 4920 + }, + { + "epoch": 2.432579409220121, + "grad_norm": 0.13341020176790813, + "learning_rate": 6.687760059409319e-06, + "loss": 0.5604, + "step": 4921 + }, + { + "epoch": 2.4330737856878013, + "grad_norm": 0.13849948566662756, + "learning_rate": 6.684086219741226e-06, + "loss": 0.5377, + "step": 4922 + }, + { + "epoch": 2.433568162155481, + "grad_norm": 0.13298803533787376, + "learning_rate": 6.680412882845256e-06, + "loss": 0.5306, + "step": 4923 + }, + { + "epoch": 2.4340625386231616, + "grad_norm": 0.14220745466944446, + "learning_rate": 6.676740049278376e-06, + "loss": 0.5403, + "step": 4924 + }, + { + "epoch": 2.4345569150908415, + "grad_norm": 0.1427168934086323, + "learning_rate": 6.673067719597477e-06, + "loss": 0.6056, + "step": 4925 + }, + { + "epoch": 2.435051291558522, + "grad_norm": 0.1417116676672339, + "learning_rate": 6.669395894359369e-06, + "loss": 0.5663, + "step": 4926 + }, + { + "epoch": 2.4355456680262018, + "grad_norm": 0.13980237577722254, + "learning_rate": 6.665724574120791e-06, + "loss": 0.588, + "step": 4927 + }, + { + "epoch": 2.436040044493882, + "grad_norm": 0.15024634254407684, + "learning_rate": 6.662053759438407e-06, + "loss": 0.5889, + "step": 4928 + }, + { + "epoch": 2.4365344209615625, + "grad_norm": 0.13802120632045176, + "learning_rate": 6.658383450868795e-06, + "loss": 0.554, + "step": 4929 + }, + { + "epoch": 2.4370287974292424, + "grad_norm": 0.13436937854679662, + "learning_rate": 6.654713648968463e-06, + "loss": 0.5879, + "step": 4930 + }, + { + "epoch": 2.4375231738969223, + "grad_norm": 0.13746103517168343, + "learning_rate": 6.651044354293842e-06, + "loss": 0.5879, + "step": 4931 + }, + { + "epoch": 2.4380175503646027, + "grad_norm": 0.13870468841976905, + "learning_rate": 6.647375567401283e-06, + "loss": 0.5422, + "step": 4932 + }, + { + "epoch": 2.438511926832283, + "grad_norm": 0.13722049585679255, + "learning_rate": 6.643707288847066e-06, + "loss": 0.5759, + "step": 4933 + }, + { + "epoch": 2.439006303299963, + "grad_norm": 0.1327432657380177, + "learning_rate": 6.640039519187388e-06, + "loss": 0.5464, + "step": 4934 + }, + { + "epoch": 2.439500679767643, + "grad_norm": 0.13626312352321815, + "learning_rate": 6.636372258978374e-06, + "loss": 0.5848, + "step": 4935 + }, + { + "epoch": 2.439995056235323, + "grad_norm": 0.13277744459621085, + "learning_rate": 6.632705508776063e-06, + "loss": 0.5311, + "step": 4936 + }, + { + "epoch": 2.4404894327030036, + "grad_norm": 0.13734173462234708, + "learning_rate": 6.62903926913642e-06, + "loss": 0.5939, + "step": 4937 + }, + { + "epoch": 2.4409838091706835, + "grad_norm": 0.14604143755695836, + "learning_rate": 6.625373540615348e-06, + "loss": 0.556, + "step": 4938 + }, + { + "epoch": 2.4414781856383634, + "grad_norm": 0.1413984671662168, + "learning_rate": 6.621708323768649e-06, + "loss": 0.5636, + "step": 4939 + }, + { + "epoch": 2.4419725621060437, + "grad_norm": 0.13366963689067685, + "learning_rate": 6.618043619152059e-06, + "loss": 0.5344, + "step": 4940 + }, + { + "epoch": 2.442466938573724, + "grad_norm": 0.13554167042916906, + "learning_rate": 6.614379427321238e-06, + "loss": 0.5693, + "step": 4941 + }, + { + "epoch": 2.442961315041404, + "grad_norm": 0.13330183449569905, + "learning_rate": 6.610715748831766e-06, + "loss": 0.5374, + "step": 4942 + }, + { + "epoch": 2.443455691509084, + "grad_norm": 0.13900774171354094, + "learning_rate": 6.607052584239137e-06, + "loss": 0.5562, + "step": 4943 + }, + { + "epoch": 2.4439500679767643, + "grad_norm": 0.1316601165156174, + "learning_rate": 6.603389934098783e-06, + "loss": 0.539, + "step": 4944 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.14021489197491924, + "learning_rate": 6.5997277989660495e-06, + "loss": 0.5622, + "step": 4945 + }, + { + "epoch": 2.4449388209121246, + "grad_norm": 0.13868818777434752, + "learning_rate": 6.5960661793961985e-06, + "loss": 0.5618, + "step": 4946 + }, + { + "epoch": 2.445433197379805, + "grad_norm": 0.13415274721179565, + "learning_rate": 6.592405075944424e-06, + "loss": 0.5879, + "step": 4947 + }, + { + "epoch": 2.445927573847485, + "grad_norm": 0.14390376805496521, + "learning_rate": 6.588744489165832e-06, + "loss": 0.5794, + "step": 4948 + }, + { + "epoch": 2.446421950315165, + "grad_norm": 0.146410337105774, + "learning_rate": 6.585084419615463e-06, + "loss": 0.5781, + "step": 4949 + }, + { + "epoch": 2.446916326782845, + "grad_norm": 0.1380486770713858, + "learning_rate": 6.581424867848266e-06, + "loss": 0.5775, + "step": 4950 + }, + { + "epoch": 2.4474107032505255, + "grad_norm": 0.1350904348881832, + "learning_rate": 6.577765834419119e-06, + "loss": 0.5578, + "step": 4951 + }, + { + "epoch": 2.4479050797182054, + "grad_norm": 0.14457660232741018, + "learning_rate": 6.5741073198828195e-06, + "loss": 0.5636, + "step": 4952 + }, + { + "epoch": 2.4483994561858857, + "grad_norm": 0.13551161252711091, + "learning_rate": 6.570449324794084e-06, + "loss": 0.5667, + "step": 4953 + }, + { + "epoch": 2.4488938326535656, + "grad_norm": 0.13712342020451027, + "learning_rate": 6.566791849707551e-06, + "loss": 0.5755, + "step": 4954 + }, + { + "epoch": 2.449388209121246, + "grad_norm": 0.13692925607824022, + "learning_rate": 6.563134895177786e-06, + "loss": 0.5598, + "step": 4955 + }, + { + "epoch": 2.449882585588926, + "grad_norm": 0.1406333899948023, + "learning_rate": 6.55947846175927e-06, + "loss": 0.5587, + "step": 4956 + }, + { + "epoch": 2.4503769620566063, + "grad_norm": 0.13485501804855332, + "learning_rate": 6.555822550006404e-06, + "loss": 0.5475, + "step": 4957 + }, + { + "epoch": 2.450871338524286, + "grad_norm": 0.1388206836659195, + "learning_rate": 6.552167160473515e-06, + "loss": 0.5641, + "step": 4958 + }, + { + "epoch": 2.4513657149919665, + "grad_norm": 0.13296503606814725, + "learning_rate": 6.548512293714841e-06, + "loss": 0.5806, + "step": 4959 + }, + { + "epoch": 2.4518600914596465, + "grad_norm": 0.1433109520588628, + "learning_rate": 6.544857950284558e-06, + "loss": 0.5836, + "step": 4960 + }, + { + "epoch": 2.452354467927327, + "grad_norm": 0.14379670219157534, + "learning_rate": 6.5412041307367455e-06, + "loss": 0.5538, + "step": 4961 + }, + { + "epoch": 2.4528488443950067, + "grad_norm": 0.135818554237603, + "learning_rate": 6.537550835625411e-06, + "loss": 0.596, + "step": 4962 + }, + { + "epoch": 2.453343220862687, + "grad_norm": 0.13192770589972955, + "learning_rate": 6.533898065504483e-06, + "loss": 0.5847, + "step": 4963 + }, + { + "epoch": 2.453837597330367, + "grad_norm": 0.1362789231615103, + "learning_rate": 6.530245820927806e-06, + "loss": 0.5439, + "step": 4964 + }, + { + "epoch": 2.4543319737980474, + "grad_norm": 0.13649823080455187, + "learning_rate": 6.52659410244915e-06, + "loss": 0.5493, + "step": 4965 + }, + { + "epoch": 2.4548263502657273, + "grad_norm": 0.1361675732732714, + "learning_rate": 6.522942910622206e-06, + "loss": 0.569, + "step": 4966 + }, + { + "epoch": 2.4553207267334076, + "grad_norm": 0.13458197669462643, + "learning_rate": 6.519292246000577e-06, + "loss": 0.5635, + "step": 4967 + }, + { + "epoch": 2.4558151032010875, + "grad_norm": 0.14070130010169019, + "learning_rate": 6.515642109137799e-06, + "loss": 0.5776, + "step": 4968 + }, + { + "epoch": 2.456309479668768, + "grad_norm": 0.14138145323421666, + "learning_rate": 6.511992500587312e-06, + "loss": 0.572, + "step": 4969 + }, + { + "epoch": 2.456803856136448, + "grad_norm": 0.13960404336561377, + "learning_rate": 6.5083434209024835e-06, + "loss": 0.5826, + "step": 4970 + }, + { + "epoch": 2.457298232604128, + "grad_norm": 0.1352524453040527, + "learning_rate": 6.504694870636612e-06, + "loss": 0.5478, + "step": 4971 + }, + { + "epoch": 2.457792609071808, + "grad_norm": 0.13871706353432284, + "learning_rate": 6.501046850342898e-06, + "loss": 0.5564, + "step": 4972 + }, + { + "epoch": 2.4582869855394884, + "grad_norm": 0.1466039093870748, + "learning_rate": 6.49739936057447e-06, + "loss": 0.5412, + "step": 4973 + }, + { + "epoch": 2.4587813620071683, + "grad_norm": 0.13955879806387325, + "learning_rate": 6.493752401884374e-06, + "loss": 0.5966, + "step": 4974 + }, + { + "epoch": 2.4592757384748487, + "grad_norm": 0.1377795740852767, + "learning_rate": 6.49010597482558e-06, + "loss": 0.5764, + "step": 4975 + }, + { + "epoch": 2.4597701149425286, + "grad_norm": 0.13794453147758273, + "learning_rate": 6.486460079950966e-06, + "loss": 0.5344, + "step": 4976 + }, + { + "epoch": 2.460264491410209, + "grad_norm": 0.1465970930700998, + "learning_rate": 6.482814717813346e-06, + "loss": 0.5284, + "step": 4977 + }, + { + "epoch": 2.460758867877889, + "grad_norm": 0.14266428091049757, + "learning_rate": 6.4791698889654445e-06, + "loss": 0.5572, + "step": 4978 + }, + { + "epoch": 2.4612532443455692, + "grad_norm": 0.13988067025848133, + "learning_rate": 6.475525593959897e-06, + "loss": 0.5332, + "step": 4979 + }, + { + "epoch": 2.461747620813249, + "grad_norm": 0.1431307885926062, + "learning_rate": 6.471881833349272e-06, + "loss": 0.584, + "step": 4980 + }, + { + "epoch": 2.4622419972809295, + "grad_norm": 0.1343504465304987, + "learning_rate": 6.4682386076860486e-06, + "loss": 0.5561, + "step": 4981 + }, + { + "epoch": 2.4627363737486094, + "grad_norm": 0.13503673575871164, + "learning_rate": 6.464595917522629e-06, + "loss": 0.5801, + "step": 4982 + }, + { + "epoch": 2.46323075021629, + "grad_norm": 0.13938916505142568, + "learning_rate": 6.460953763411332e-06, + "loss": 0.578, + "step": 4983 + }, + { + "epoch": 2.4637251266839697, + "grad_norm": 0.1433731046658596, + "learning_rate": 6.457312145904398e-06, + "loss": 0.533, + "step": 4984 + }, + { + "epoch": 2.46421950315165, + "grad_norm": 0.13417357278469214, + "learning_rate": 6.453671065553979e-06, + "loss": 0.5927, + "step": 4985 + }, + { + "epoch": 2.46471387961933, + "grad_norm": 0.13786861228796415, + "learning_rate": 6.450030522912147e-06, + "loss": 0.5444, + "step": 4986 + }, + { + "epoch": 2.4652082560870103, + "grad_norm": 0.1419230217189921, + "learning_rate": 6.446390518530909e-06, + "loss": 0.5796, + "step": 4987 + }, + { + "epoch": 2.4657026325546902, + "grad_norm": 0.15036096452484415, + "learning_rate": 6.442751052962165e-06, + "loss": 0.5639, + "step": 4988 + }, + { + "epoch": 2.4661970090223706, + "grad_norm": 0.13289447807030405, + "learning_rate": 6.439112126757751e-06, + "loss": 0.5479, + "step": 4989 + }, + { + "epoch": 2.4666913854900505, + "grad_norm": 0.15053792968668364, + "learning_rate": 6.435473740469413e-06, + "loss": 0.5636, + "step": 4990 + }, + { + "epoch": 2.467185761957731, + "grad_norm": 0.13873248982068773, + "learning_rate": 6.43183589464882e-06, + "loss": 0.5324, + "step": 4991 + }, + { + "epoch": 2.467680138425411, + "grad_norm": 0.1344400299253313, + "learning_rate": 6.42819858984755e-06, + "loss": 0.5408, + "step": 4992 + }, + { + "epoch": 2.468174514893091, + "grad_norm": 0.1361803601311337, + "learning_rate": 6.424561826617115e-06, + "loss": 0.54, + "step": 4993 + }, + { + "epoch": 2.468668891360771, + "grad_norm": 0.14470491450120831, + "learning_rate": 6.420925605508933e-06, + "loss": 0.5732, + "step": 4994 + }, + { + "epoch": 2.4691632678284514, + "grad_norm": 0.13907907301089892, + "learning_rate": 6.41728992707434e-06, + "loss": 0.5438, + "step": 4995 + }, + { + "epoch": 2.4696576442961313, + "grad_norm": 0.13534949128055743, + "learning_rate": 6.413654791864592e-06, + "loss": 0.5779, + "step": 4996 + }, + { + "epoch": 2.4701520207638117, + "grad_norm": 0.18468872530062463, + "learning_rate": 6.410020200430862e-06, + "loss": 0.6428, + "step": 4997 + }, + { + "epoch": 2.4706463972314916, + "grad_norm": 0.1415576358426016, + "learning_rate": 6.406386153324247e-06, + "loss": 0.5568, + "step": 4998 + }, + { + "epoch": 2.471140773699172, + "grad_norm": 0.13560145165354012, + "learning_rate": 6.40275265109575e-06, + "loss": 0.5537, + "step": 4999 + }, + { + "epoch": 2.471635150166852, + "grad_norm": 0.13388220595605843, + "learning_rate": 6.3991196942963e-06, + "loss": 0.5822, + "step": 5000 + }, + { + "epoch": 2.4721295266345322, + "grad_norm": 0.1401080159671051, + "learning_rate": 6.395487283476741e-06, + "loss": 0.5168, + "step": 5001 + }, + { + "epoch": 2.472623903102212, + "grad_norm": 0.14176964875065595, + "learning_rate": 6.391855419187831e-06, + "loss": 0.585, + "step": 5002 + }, + { + "epoch": 2.4731182795698925, + "grad_norm": 0.13368599393117767, + "learning_rate": 6.388224101980247e-06, + "loss": 0.5606, + "step": 5003 + }, + { + "epoch": 2.473612656037573, + "grad_norm": 0.14728779335328027, + "learning_rate": 6.384593332404588e-06, + "loss": 0.5998, + "step": 5004 + }, + { + "epoch": 2.4741070325052528, + "grad_norm": 0.14321927779505955, + "learning_rate": 6.380963111011362e-06, + "loss": 0.5929, + "step": 5005 + }, + { + "epoch": 2.4746014089729327, + "grad_norm": 0.144636459982788, + "learning_rate": 6.3773334383510014e-06, + "loss": 0.5633, + "step": 5006 + }, + { + "epoch": 2.475095785440613, + "grad_norm": 0.13912230217443766, + "learning_rate": 6.373704314973849e-06, + "loss": 0.5462, + "step": 5007 + }, + { + "epoch": 2.4755901619082934, + "grad_norm": 0.14145294668899147, + "learning_rate": 6.370075741430166e-06, + "loss": 0.5602, + "step": 5008 + }, + { + "epoch": 2.4760845383759733, + "grad_norm": 0.13874277507723778, + "learning_rate": 6.366447718270134e-06, + "loss": 0.5456, + "step": 5009 + }, + { + "epoch": 2.4765789148436532, + "grad_norm": 0.1484233589975126, + "learning_rate": 6.36282024604385e-06, + "loss": 0.5703, + "step": 5010 + }, + { + "epoch": 2.4770732913113336, + "grad_norm": 0.14207427533007017, + "learning_rate": 6.359193325301323e-06, + "loss": 0.5502, + "step": 5011 + }, + { + "epoch": 2.477567667779014, + "grad_norm": 0.13946022950081371, + "learning_rate": 6.355566956592478e-06, + "loss": 0.5326, + "step": 5012 + }, + { + "epoch": 2.478062044246694, + "grad_norm": 0.14151584304323928, + "learning_rate": 6.351941140467166e-06, + "loss": 0.5647, + "step": 5013 + }, + { + "epoch": 2.4785564207143738, + "grad_norm": 0.14610341732927845, + "learning_rate": 6.348315877475142e-06, + "loss": 0.5782, + "step": 5014 + }, + { + "epoch": 2.479050797182054, + "grad_norm": 0.14655802015050345, + "learning_rate": 6.344691168166087e-06, + "loss": 0.5813, + "step": 5015 + }, + { + "epoch": 2.4795451736497345, + "grad_norm": 0.13715441260205755, + "learning_rate": 6.341067013089594e-06, + "loss": 0.5572, + "step": 5016 + }, + { + "epoch": 2.4800395501174144, + "grad_norm": 0.13525806585046946, + "learning_rate": 6.337443412795171e-06, + "loss": 0.5515, + "step": 5017 + }, + { + "epoch": 2.4805339265850943, + "grad_norm": 0.13921208102502353, + "learning_rate": 6.333820367832242e-06, + "loss": 0.5767, + "step": 5018 + }, + { + "epoch": 2.4810283030527747, + "grad_norm": 0.13823215940467207, + "learning_rate": 6.330197878750142e-06, + "loss": 0.5665, + "step": 5019 + }, + { + "epoch": 2.481522679520455, + "grad_norm": 0.14265407261534357, + "learning_rate": 6.326575946098141e-06, + "loss": 0.573, + "step": 5020 + }, + { + "epoch": 2.482017055988135, + "grad_norm": 0.13979711959729305, + "learning_rate": 6.322954570425399e-06, + "loss": 0.5684, + "step": 5021 + }, + { + "epoch": 2.4825114324558153, + "grad_norm": 0.13953849062150125, + "learning_rate": 6.319333752281009e-06, + "loss": 0.5943, + "step": 5022 + }, + { + "epoch": 2.483005808923495, + "grad_norm": 0.14149787952898726, + "learning_rate": 6.315713492213973e-06, + "loss": 0.547, + "step": 5023 + }, + { + "epoch": 2.4835001853911756, + "grad_norm": 0.1316534816954443, + "learning_rate": 6.3120937907732104e-06, + "loss": 0.5501, + "step": 5024 + }, + { + "epoch": 2.4839945618588555, + "grad_norm": 0.1348477943982468, + "learning_rate": 6.308474648507547e-06, + "loss": 0.5382, + "step": 5025 + }, + { + "epoch": 2.484488938326536, + "grad_norm": 0.13372307887127136, + "learning_rate": 6.3048560659657435e-06, + "loss": 0.5873, + "step": 5026 + }, + { + "epoch": 2.4849833147942157, + "grad_norm": 0.13398852250578847, + "learning_rate": 6.301238043696458e-06, + "loss": 0.535, + "step": 5027 + }, + { + "epoch": 2.485477691261896, + "grad_norm": 0.14204479614230645, + "learning_rate": 6.2976205822482696e-06, + "loss": 0.5803, + "step": 5028 + }, + { + "epoch": 2.485972067729576, + "grad_norm": 0.139007328434182, + "learning_rate": 6.2940036821696715e-06, + "loss": 0.5705, + "step": 5029 + }, + { + "epoch": 2.4864664441972564, + "grad_norm": 0.13413877774614766, + "learning_rate": 6.290387344009072e-06, + "loss": 0.5606, + "step": 5030 + }, + { + "epoch": 2.4869608206649363, + "grad_norm": 0.13768679210189477, + "learning_rate": 6.286771568314798e-06, + "loss": 0.5695, + "step": 5031 + }, + { + "epoch": 2.4874551971326166, + "grad_norm": 0.14196607040123327, + "learning_rate": 6.283156355635087e-06, + "loss": 0.5516, + "step": 5032 + }, + { + "epoch": 2.4879495736002966, + "grad_norm": 0.1380086333147212, + "learning_rate": 6.279541706518091e-06, + "loss": 0.5527, + "step": 5033 + }, + { + "epoch": 2.488443950067977, + "grad_norm": 0.1326156394185181, + "learning_rate": 6.27592762151188e-06, + "loss": 0.5553, + "step": 5034 + }, + { + "epoch": 2.488938326535657, + "grad_norm": 0.13401750654909403, + "learning_rate": 6.272314101164431e-06, + "loss": 0.5745, + "step": 5035 + }, + { + "epoch": 2.489432703003337, + "grad_norm": 0.13530007837584, + "learning_rate": 6.268701146023644e-06, + "loss": 0.5674, + "step": 5036 + }, + { + "epoch": 2.489927079471017, + "grad_norm": 0.13424043960409318, + "learning_rate": 6.265088756637328e-06, + "loss": 0.5584, + "step": 5037 + }, + { + "epoch": 2.4904214559386975, + "grad_norm": 0.13610000216315296, + "learning_rate": 6.26147693355321e-06, + "loss": 0.5444, + "step": 5038 + }, + { + "epoch": 2.4909158324063774, + "grad_norm": 0.14173084404276837, + "learning_rate": 6.257865677318928e-06, + "loss": 0.5701, + "step": 5039 + }, + { + "epoch": 2.4914102088740577, + "grad_norm": 0.13818973217019054, + "learning_rate": 6.254254988482036e-06, + "loss": 0.5587, + "step": 5040 + }, + { + "epoch": 2.4919045853417376, + "grad_norm": 0.14226127704924074, + "learning_rate": 6.250644867589997e-06, + "loss": 0.593, + "step": 5041 + }, + { + "epoch": 2.492398961809418, + "grad_norm": 0.1385944220186209, + "learning_rate": 6.247035315190198e-06, + "loss": 0.5823, + "step": 5042 + }, + { + "epoch": 2.492893338277098, + "grad_norm": 0.13491297236676672, + "learning_rate": 6.243426331829934e-06, + "loss": 0.5359, + "step": 5043 + }, + { + "epoch": 2.4933877147447783, + "grad_norm": 0.13802766754761556, + "learning_rate": 6.239817918056406e-06, + "loss": 0.537, + "step": 5044 + }, + { + "epoch": 2.493882091212458, + "grad_norm": 0.14186811110080388, + "learning_rate": 6.236210074416743e-06, + "loss": 0.5634, + "step": 5045 + }, + { + "epoch": 2.4943764676801385, + "grad_norm": 0.12919757470000975, + "learning_rate": 6.232602801457979e-06, + "loss": 0.5472, + "step": 5046 + }, + { + "epoch": 2.4948708441478185, + "grad_norm": 0.13646137564557406, + "learning_rate": 6.228996099727058e-06, + "loss": 0.5428, + "step": 5047 + }, + { + "epoch": 2.495365220615499, + "grad_norm": 0.13272682710026296, + "learning_rate": 6.225389969770851e-06, + "loss": 0.5762, + "step": 5048 + }, + { + "epoch": 2.4958595970831787, + "grad_norm": 0.13820366482144095, + "learning_rate": 6.221784412136129e-06, + "loss": 0.6095, + "step": 5049 + }, + { + "epoch": 2.496353973550859, + "grad_norm": 0.1456072810738114, + "learning_rate": 6.218179427369585e-06, + "loss": 0.5729, + "step": 5050 + }, + { + "epoch": 2.496848350018539, + "grad_norm": 0.13181481924282457, + "learning_rate": 6.214575016017816e-06, + "loss": 0.5444, + "step": 5051 + }, + { + "epoch": 2.4973427264862194, + "grad_norm": 0.14307931461311935, + "learning_rate": 6.210971178627335e-06, + "loss": 0.5653, + "step": 5052 + }, + { + "epoch": 2.4978371029538993, + "grad_norm": 0.14121567760825818, + "learning_rate": 6.2073679157445824e-06, + "loss": 0.5692, + "step": 5053 + }, + { + "epoch": 2.4983314794215796, + "grad_norm": 0.1330109441398484, + "learning_rate": 6.203765227915887e-06, + "loss": 0.5974, + "step": 5054 + }, + { + "epoch": 2.4988258558892595, + "grad_norm": 0.14062903013126113, + "learning_rate": 6.20016311568751e-06, + "loss": 0.5914, + "step": 5055 + }, + { + "epoch": 2.49932023235694, + "grad_norm": 0.13895130652835572, + "learning_rate": 6.196561579605616e-06, + "loss": 0.5506, + "step": 5056 + }, + { + "epoch": 2.49981460882462, + "grad_norm": 0.1426923563986254, + "learning_rate": 6.192960620216284e-06, + "loss": 0.5467, + "step": 5057 + }, + { + "epoch": 2.5003089852923, + "grad_norm": 0.1338714644867948, + "learning_rate": 6.189360238065501e-06, + "loss": 0.562, + "step": 5058 + }, + { + "epoch": 2.50080336175998, + "grad_norm": 0.13458799713295977, + "learning_rate": 6.185760433699179e-06, + "loss": 0.5447, + "step": 5059 + }, + { + "epoch": 2.5012977382276604, + "grad_norm": 0.14288467591546483, + "learning_rate": 6.182161207663136e-06, + "loss": 0.5696, + "step": 5060 + }, + { + "epoch": 2.5012977382276604, + "eval_loss": 0.6454212069511414, + "eval_runtime": 81.8227, + "eval_samples_per_second": 370.973, + "eval_steps_per_second": 46.381, + "step": 5060 + }, + { + "epoch": 2.5017921146953404, + "grad_norm": 0.13671451321012748, + "learning_rate": 6.1785625605030945e-06, + "loss": 0.5606, + "step": 5061 + }, + { + "epoch": 2.5022864911630207, + "grad_norm": 0.14211156331338168, + "learning_rate": 6.174964492764697e-06, + "loss": 0.5757, + "step": 5062 + }, + { + "epoch": 2.5027808676307006, + "grad_norm": 0.14153439399301382, + "learning_rate": 6.171367004993497e-06, + "loss": 0.567, + "step": 5063 + }, + { + "epoch": 2.503275244098381, + "grad_norm": 0.13901773916457483, + "learning_rate": 6.167770097734963e-06, + "loss": 0.5547, + "step": 5064 + }, + { + "epoch": 2.503769620566061, + "grad_norm": 0.1346811636693347, + "learning_rate": 6.164173771534471e-06, + "loss": 0.5788, + "step": 5065 + }, + { + "epoch": 2.5042639970337413, + "grad_norm": 0.13345356449728493, + "learning_rate": 6.160578026937312e-06, + "loss": 0.5974, + "step": 5066 + }, + { + "epoch": 2.5047583735014216, + "grad_norm": 0.14245490891221552, + "learning_rate": 6.156982864488683e-06, + "loss": 0.5618, + "step": 5067 + }, + { + "epoch": 2.5052527499691015, + "grad_norm": 0.13543335125021472, + "learning_rate": 6.153388284733695e-06, + "loss": 0.5765, + "step": 5068 + }, + { + "epoch": 2.5057471264367814, + "grad_norm": 0.14107149396518814, + "learning_rate": 6.149794288217383e-06, + "loss": 0.5856, + "step": 5069 + }, + { + "epoch": 2.506241502904462, + "grad_norm": 0.14070024365426634, + "learning_rate": 6.146200875484676e-06, + "loss": 0.591, + "step": 5070 + }, + { + "epoch": 2.506735879372142, + "grad_norm": 0.14305503747998777, + "learning_rate": 6.142608047080422e-06, + "loss": 0.5487, + "step": 5071 + }, + { + "epoch": 2.507230255839822, + "grad_norm": 0.13817457200562888, + "learning_rate": 6.1390158035493795e-06, + "loss": 0.563, + "step": 5072 + }, + { + "epoch": 2.507724632307502, + "grad_norm": 0.13847246270331925, + "learning_rate": 6.135424145436224e-06, + "loss": 0.5895, + "step": 5073 + }, + { + "epoch": 2.5082190087751823, + "grad_norm": 0.14575236708009762, + "learning_rate": 6.131833073285525e-06, + "loss": 0.5385, + "step": 5074 + }, + { + "epoch": 2.5087133852428627, + "grad_norm": 0.13800802067000398, + "learning_rate": 6.128242587641788e-06, + "loss": 0.5459, + "step": 5075 + }, + { + "epoch": 2.5092077617105426, + "grad_norm": 0.13723003289639232, + "learning_rate": 6.124652689049414e-06, + "loss": 0.5808, + "step": 5076 + }, + { + "epoch": 2.5097021381782225, + "grad_norm": 0.1463338937272396, + "learning_rate": 6.121063378052714e-06, + "loss": 0.5826, + "step": 5077 + }, + { + "epoch": 2.510196514645903, + "grad_norm": 0.14116250826386137, + "learning_rate": 6.1174746551959165e-06, + "loss": 0.5853, + "step": 5078 + }, + { + "epoch": 2.5106908911135832, + "grad_norm": 0.13984977514565372, + "learning_rate": 6.113886521023153e-06, + "loss": 0.6071, + "step": 5079 + }, + { + "epoch": 2.511185267581263, + "grad_norm": 0.1377384729099503, + "learning_rate": 6.110298976078478e-06, + "loss": 0.5786, + "step": 5080 + }, + { + "epoch": 2.511679644048943, + "grad_norm": 0.13414035954402007, + "learning_rate": 6.106712020905846e-06, + "loss": 0.5816, + "step": 5081 + }, + { + "epoch": 2.5121740205166234, + "grad_norm": 0.14105782143716347, + "learning_rate": 6.103125656049127e-06, + "loss": 0.5656, + "step": 5082 + }, + { + "epoch": 2.5126683969843038, + "grad_norm": 0.1404535633886671, + "learning_rate": 6.099539882052099e-06, + "loss": 0.5657, + "step": 5083 + }, + { + "epoch": 2.5131627734519837, + "grad_norm": 0.13982073233015194, + "learning_rate": 6.0959546994584505e-06, + "loss": 0.5697, + "step": 5084 + }, + { + "epoch": 2.5136571499196636, + "grad_norm": 0.1304106330148455, + "learning_rate": 6.092370108811779e-06, + "loss": 0.5558, + "step": 5085 + }, + { + "epoch": 2.514151526387344, + "grad_norm": 0.1331431551528671, + "learning_rate": 6.0887861106556e-06, + "loss": 0.607, + "step": 5086 + }, + { + "epoch": 2.5146459028550243, + "grad_norm": 0.135736258756284, + "learning_rate": 6.085202705533331e-06, + "loss": 0.559, + "step": 5087 + }, + { + "epoch": 2.5151402793227042, + "grad_norm": 0.13464119390331528, + "learning_rate": 6.081619893988302e-06, + "loss": 0.5625, + "step": 5088 + }, + { + "epoch": 2.515634655790384, + "grad_norm": 0.13766327557526561, + "learning_rate": 6.078037676563755e-06, + "loss": 0.5712, + "step": 5089 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.13808907528080658, + "learning_rate": 6.074456053802835e-06, + "loss": 0.5585, + "step": 5090 + }, + { + "epoch": 2.516623408725745, + "grad_norm": 0.13911825817898651, + "learning_rate": 6.070875026248608e-06, + "loss": 0.5631, + "step": 5091 + }, + { + "epoch": 2.5171177851934248, + "grad_norm": 0.13827851401194247, + "learning_rate": 6.067294594444044e-06, + "loss": 0.5416, + "step": 5092 + }, + { + "epoch": 2.5176121616611047, + "grad_norm": 0.13243733812738903, + "learning_rate": 6.063714758932018e-06, + "loss": 0.561, + "step": 5093 + }, + { + "epoch": 2.518106538128785, + "grad_norm": 0.13360515085606262, + "learning_rate": 6.060135520255319e-06, + "loss": 0.5348, + "step": 5094 + }, + { + "epoch": 2.5186009145964654, + "grad_norm": 0.13559217303879365, + "learning_rate": 6.0565568789566486e-06, + "loss": 0.5808, + "step": 5095 + }, + { + "epoch": 2.5190952910641453, + "grad_norm": 0.13606230275862818, + "learning_rate": 6.0529788355786115e-06, + "loss": 0.5499, + "step": 5096 + }, + { + "epoch": 2.5195896675318252, + "grad_norm": 0.1415866196738306, + "learning_rate": 6.049401390663729e-06, + "loss": 0.5865, + "step": 5097 + }, + { + "epoch": 2.5200840439995056, + "grad_norm": 0.13425960903928613, + "learning_rate": 6.045824544754425e-06, + "loss": 0.5489, + "step": 5098 + }, + { + "epoch": 2.520578420467186, + "grad_norm": 0.13483345718045056, + "learning_rate": 6.0422482983930385e-06, + "loss": 0.6027, + "step": 5099 + }, + { + "epoch": 2.521072796934866, + "grad_norm": 0.1385066894669286, + "learning_rate": 6.038672652121809e-06, + "loss": 0.5527, + "step": 5100 + }, + { + "epoch": 2.5215671734025458, + "grad_norm": 0.1376719812082458, + "learning_rate": 6.035097606482889e-06, + "loss": 0.5791, + "step": 5101 + }, + { + "epoch": 2.522061549870226, + "grad_norm": 0.1349933504462646, + "learning_rate": 6.0315231620183515e-06, + "loss": 0.5659, + "step": 5102 + }, + { + "epoch": 2.5225559263379065, + "grad_norm": 0.1435789575406635, + "learning_rate": 6.027949319270159e-06, + "loss": 0.5805, + "step": 5103 + }, + { + "epoch": 2.5230503028055864, + "grad_norm": 0.13460486554926562, + "learning_rate": 6.0243760787801945e-06, + "loss": 0.5452, + "step": 5104 + }, + { + "epoch": 2.5235446792732668, + "grad_norm": 0.1425017428106859, + "learning_rate": 6.020803441090246e-06, + "loss": 0.529, + "step": 5105 + }, + { + "epoch": 2.5240390557409467, + "grad_norm": 0.1341684900224925, + "learning_rate": 6.017231406742015e-06, + "loss": 0.5556, + "step": 5106 + }, + { + "epoch": 2.524533432208627, + "grad_norm": 0.1412886418523525, + "learning_rate": 6.013659976277099e-06, + "loss": 0.611, + "step": 5107 + }, + { + "epoch": 2.525027808676307, + "grad_norm": 0.13814122494782005, + "learning_rate": 6.010089150237022e-06, + "loss": 0.5698, + "step": 5108 + }, + { + "epoch": 2.5255221851439873, + "grad_norm": 0.1388595273904115, + "learning_rate": 6.006518929163205e-06, + "loss": 0.5848, + "step": 5109 + }, + { + "epoch": 2.526016561611667, + "grad_norm": 0.14489147481315023, + "learning_rate": 6.002949313596977e-06, + "loss": 0.5732, + "step": 5110 + }, + { + "epoch": 2.5265109380793476, + "grad_norm": 0.1398253345928389, + "learning_rate": 5.999380304079577e-06, + "loss": 0.5485, + "step": 5111 + }, + { + "epoch": 2.5270053145470275, + "grad_norm": 0.13042883758973076, + "learning_rate": 5.995811901152151e-06, + "loss": 0.5724, + "step": 5112 + }, + { + "epoch": 2.527499691014708, + "grad_norm": 0.1360156956713546, + "learning_rate": 5.99224410535576e-06, + "loss": 0.5441, + "step": 5113 + }, + { + "epoch": 2.5279940674823878, + "grad_norm": 0.13461319719228393, + "learning_rate": 5.9886769172313645e-06, + "loss": 0.5649, + "step": 5114 + }, + { + "epoch": 2.528488443950068, + "grad_norm": 0.14323955781408296, + "learning_rate": 5.985110337319835e-06, + "loss": 0.5597, + "step": 5115 + }, + { + "epoch": 2.528982820417748, + "grad_norm": 0.1347101501081569, + "learning_rate": 5.981544366161953e-06, + "loss": 0.5748, + "step": 5116 + }, + { + "epoch": 2.5294771968854284, + "grad_norm": 0.14030216096744283, + "learning_rate": 5.977979004298403e-06, + "loss": 0.5772, + "step": 5117 + }, + { + "epoch": 2.5299715733531083, + "grad_norm": 0.13363593542270427, + "learning_rate": 5.974414252269778e-06, + "loss": 0.5363, + "step": 5118 + }, + { + "epoch": 2.5304659498207887, + "grad_norm": 0.13771022645786324, + "learning_rate": 5.970850110616584e-06, + "loss": 0.5719, + "step": 5119 + }, + { + "epoch": 2.5309603262884686, + "grad_norm": 0.14641076281820914, + "learning_rate": 5.967286579879228e-06, + "loss": 0.5846, + "step": 5120 + }, + { + "epoch": 2.531454702756149, + "grad_norm": 0.13627534150612894, + "learning_rate": 5.963723660598029e-06, + "loss": 0.5495, + "step": 5121 + }, + { + "epoch": 2.531949079223829, + "grad_norm": 0.1353944938661403, + "learning_rate": 5.960161353313207e-06, + "loss": 0.6087, + "step": 5122 + }, + { + "epoch": 2.532443455691509, + "grad_norm": 0.13725977046146806, + "learning_rate": 5.9565996585648965e-06, + "loss": 0.5619, + "step": 5123 + }, + { + "epoch": 2.532937832159189, + "grad_norm": 0.1418345674121483, + "learning_rate": 5.953038576893135e-06, + "loss": 0.5569, + "step": 5124 + }, + { + "epoch": 2.5334322086268695, + "grad_norm": 0.14564534248783906, + "learning_rate": 5.949478108837872e-06, + "loss": 0.54, + "step": 5125 + }, + { + "epoch": 2.5339265850945494, + "grad_norm": 0.14388736010141867, + "learning_rate": 5.945918254938953e-06, + "loss": 0.5818, + "step": 5126 + }, + { + "epoch": 2.5344209615622297, + "grad_norm": 0.1356589293995041, + "learning_rate": 5.942359015736141e-06, + "loss": 0.5405, + "step": 5127 + }, + { + "epoch": 2.5349153380299096, + "grad_norm": 0.13766533648211576, + "learning_rate": 5.938800391769101e-06, + "loss": 0.5565, + "step": 5128 + }, + { + "epoch": 2.53540971449759, + "grad_norm": 0.1368332937639521, + "learning_rate": 5.935242383577404e-06, + "loss": 0.5736, + "step": 5129 + }, + { + "epoch": 2.53590409096527, + "grad_norm": 0.13287983249493365, + "learning_rate": 5.931684991700535e-06, + "loss": 0.5352, + "step": 5130 + }, + { + "epoch": 2.5363984674329503, + "grad_norm": 0.13237057250072834, + "learning_rate": 5.928128216677875e-06, + "loss": 0.5775, + "step": 5131 + }, + { + "epoch": 2.53689284390063, + "grad_norm": 0.13487038563962978, + "learning_rate": 5.924572059048721e-06, + "loss": 0.5791, + "step": 5132 + }, + { + "epoch": 2.5373872203683105, + "grad_norm": 0.13598885363772512, + "learning_rate": 5.9210165193522675e-06, + "loss": 0.56, + "step": 5133 + }, + { + "epoch": 2.5378815968359905, + "grad_norm": 0.1354874361287378, + "learning_rate": 5.917461598127616e-06, + "loss": 0.5477, + "step": 5134 + }, + { + "epoch": 2.538375973303671, + "grad_norm": 0.13426199669294706, + "learning_rate": 5.913907295913791e-06, + "loss": 0.5414, + "step": 5135 + }, + { + "epoch": 2.5388703497713507, + "grad_norm": 0.13473775572199073, + "learning_rate": 5.9103536132497e-06, + "loss": 0.5545, + "step": 5136 + }, + { + "epoch": 2.539364726239031, + "grad_norm": 0.1347272489332051, + "learning_rate": 5.9068005506741675e-06, + "loss": 0.5622, + "step": 5137 + }, + { + "epoch": 2.539859102706711, + "grad_norm": 0.14052136549096797, + "learning_rate": 5.903248108725925e-06, + "loss": 0.5532, + "step": 5138 + }, + { + "epoch": 2.5403534791743914, + "grad_norm": 0.13066951135753777, + "learning_rate": 5.8996962879436085e-06, + "loss": 0.5405, + "step": 5139 + }, + { + "epoch": 2.5408478556420713, + "grad_norm": 0.13613271081183795, + "learning_rate": 5.896145088865753e-06, + "loss": 0.5425, + "step": 5140 + }, + { + "epoch": 2.5413422321097516, + "grad_norm": 0.20436536417871978, + "learning_rate": 5.892594512030814e-06, + "loss": 0.5919, + "step": 5141 + }, + { + "epoch": 2.541836608577432, + "grad_norm": 0.13229521556669033, + "learning_rate": 5.889044557977144e-06, + "loss": 0.5678, + "step": 5142 + }, + { + "epoch": 2.542330985045112, + "grad_norm": 0.13385990121070843, + "learning_rate": 5.885495227242995e-06, + "loss": 0.5659, + "step": 5143 + }, + { + "epoch": 2.542825361512792, + "grad_norm": 0.13630601977579634, + "learning_rate": 5.881946520366534e-06, + "loss": 0.5672, + "step": 5144 + }, + { + "epoch": 2.543319737980472, + "grad_norm": 0.13510218334151547, + "learning_rate": 5.878398437885828e-06, + "loss": 0.5289, + "step": 5145 + }, + { + "epoch": 2.5438141144481525, + "grad_norm": 0.1343645131288739, + "learning_rate": 5.8748509803388554e-06, + "loss": 0.5886, + "step": 5146 + }, + { + "epoch": 2.5443084909158324, + "grad_norm": 0.13069084927831, + "learning_rate": 5.8713041482634936e-06, + "loss": 0.5457, + "step": 5147 + }, + { + "epoch": 2.5448028673835124, + "grad_norm": 0.13124998904661628, + "learning_rate": 5.867757942197531e-06, + "loss": 0.5891, + "step": 5148 + }, + { + "epoch": 2.5452972438511927, + "grad_norm": 0.15134334035717964, + "learning_rate": 5.864212362678651e-06, + "loss": 0.6136, + "step": 5149 + }, + { + "epoch": 2.545791620318873, + "grad_norm": 0.13704422835433294, + "learning_rate": 5.860667410244448e-06, + "loss": 0.5463, + "step": 5150 + }, + { + "epoch": 2.546285996786553, + "grad_norm": 0.1323949415689311, + "learning_rate": 5.857123085432432e-06, + "loss": 0.5927, + "step": 5151 + }, + { + "epoch": 2.546780373254233, + "grad_norm": 0.14163766848474274, + "learning_rate": 5.85357938878e-06, + "loss": 0.5947, + "step": 5152 + }, + { + "epoch": 2.5472747497219133, + "grad_norm": 0.13161065633524616, + "learning_rate": 5.850036320824462e-06, + "loss": 0.5728, + "step": 5153 + }, + { + "epoch": 2.5477691261895936, + "grad_norm": 0.14359945301349145, + "learning_rate": 5.846493882103035e-06, + "loss": 0.5787, + "step": 5154 + }, + { + "epoch": 2.5482635026572735, + "grad_norm": 0.14090501132212632, + "learning_rate": 5.842952073152837e-06, + "loss": 0.5899, + "step": 5155 + }, + { + "epoch": 2.5487578791249534, + "grad_norm": 0.13203721539179386, + "learning_rate": 5.839410894510884e-06, + "loss": 0.559, + "step": 5156 + }, + { + "epoch": 2.549252255592634, + "grad_norm": 0.13994437810239838, + "learning_rate": 5.835870346714114e-06, + "loss": 0.5426, + "step": 5157 + }, + { + "epoch": 2.549746632060314, + "grad_norm": 0.1331638843992615, + "learning_rate": 5.832330430299353e-06, + "loss": 0.5169, + "step": 5158 + }, + { + "epoch": 2.550241008527994, + "grad_norm": 0.1370684066710575, + "learning_rate": 5.828791145803343e-06, + "loss": 0.589, + "step": 5159 + }, + { + "epoch": 2.550735384995674, + "grad_norm": 0.14436746552151286, + "learning_rate": 5.8252524937627204e-06, + "loss": 0.5752, + "step": 5160 + }, + { + "epoch": 2.5512297614633543, + "grad_norm": 0.1337817633571879, + "learning_rate": 5.821714474714022e-06, + "loss": 0.5569, + "step": 5161 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 0.14016128209139214, + "learning_rate": 5.818177089193713e-06, + "loss": 0.5646, + "step": 5162 + }, + { + "epoch": 2.5522185143987146, + "grad_norm": 0.14087097903280013, + "learning_rate": 5.814640337738137e-06, + "loss": 0.5737, + "step": 5163 + }, + { + "epoch": 2.5527128908663945, + "grad_norm": 0.13270100756316827, + "learning_rate": 5.811104220883547e-06, + "loss": 0.5291, + "step": 5164 + }, + { + "epoch": 2.553207267334075, + "grad_norm": 0.13610253882380807, + "learning_rate": 5.807568739166109e-06, + "loss": 0.6021, + "step": 5165 + }, + { + "epoch": 2.5537016438017552, + "grad_norm": 0.13549865744804673, + "learning_rate": 5.8040338931218845e-06, + "loss": 0.5639, + "step": 5166 + }, + { + "epoch": 2.554196020269435, + "grad_norm": 0.13417789716454775, + "learning_rate": 5.80049968328684e-06, + "loss": 0.548, + "step": 5167 + }, + { + "epoch": 2.554690396737115, + "grad_norm": 0.1409852793735077, + "learning_rate": 5.796966110196851e-06, + "loss": 0.6172, + "step": 5168 + }, + { + "epoch": 2.5551847732047954, + "grad_norm": 0.12964744676560236, + "learning_rate": 5.793433174387686e-06, + "loss": 0.5542, + "step": 5169 + }, + { + "epoch": 2.555679149672476, + "grad_norm": 0.14010532770444495, + "learning_rate": 5.7899008763950295e-06, + "loss": 0.5466, + "step": 5170 + }, + { + "epoch": 2.5561735261401557, + "grad_norm": 0.14029805834487538, + "learning_rate": 5.7863692167544585e-06, + "loss": 0.5538, + "step": 5171 + }, + { + "epoch": 2.5566679026078356, + "grad_norm": 0.13501210330691607, + "learning_rate": 5.782838196001454e-06, + "loss": 0.5568, + "step": 5172 + }, + { + "epoch": 2.557162279075516, + "grad_norm": 0.13466777657973134, + "learning_rate": 5.779307814671408e-06, + "loss": 0.5937, + "step": 5173 + }, + { + "epoch": 2.5576566555431963, + "grad_norm": 0.1385187553330484, + "learning_rate": 5.7757780732996136e-06, + "loss": 0.5668, + "step": 5174 + }, + { + "epoch": 2.5581510320108762, + "grad_norm": 0.1420900008574152, + "learning_rate": 5.772248972421257e-06, + "loss": 0.5289, + "step": 5175 + }, + { + "epoch": 2.558645408478556, + "grad_norm": 0.1304839200179741, + "learning_rate": 5.768720512571444e-06, + "loss": 0.5154, + "step": 5176 + }, + { + "epoch": 2.5591397849462365, + "grad_norm": 0.13177552806808418, + "learning_rate": 5.765192694285169e-06, + "loss": 0.5816, + "step": 5177 + }, + { + "epoch": 2.559634161413917, + "grad_norm": 0.13182912333126165, + "learning_rate": 5.761665518097323e-06, + "loss": 0.5431, + "step": 5178 + }, + { + "epoch": 2.5601285378815968, + "grad_norm": 0.13914692519707828, + "learning_rate": 5.758138984542731e-06, + "loss": 0.5526, + "step": 5179 + }, + { + "epoch": 2.560622914349277, + "grad_norm": 0.1372897965735163, + "learning_rate": 5.75461309415609e-06, + "loss": 0.5674, + "step": 5180 + }, + { + "epoch": 2.561117290816957, + "grad_norm": 0.13447969747285368, + "learning_rate": 5.751087847472005e-06, + "loss": 0.5537, + "step": 5181 + }, + { + "epoch": 2.5616116672846374, + "grad_norm": 0.1341361669237955, + "learning_rate": 5.747563245024999e-06, + "loss": 0.5966, + "step": 5182 + }, + { + "epoch": 2.5621060437523173, + "grad_norm": 0.13299240988117333, + "learning_rate": 5.744039287349474e-06, + "loss": 0.5673, + "step": 5183 + }, + { + "epoch": 2.5626004202199977, + "grad_norm": 0.13282555456590575, + "learning_rate": 5.740515974979755e-06, + "loss": 0.5581, + "step": 5184 + }, + { + "epoch": 2.5630947966876776, + "grad_norm": 0.1361983981244297, + "learning_rate": 5.736993308450061e-06, + "loss": 0.5457, + "step": 5185 + }, + { + "epoch": 2.563589173155358, + "grad_norm": 0.1312234499909148, + "learning_rate": 5.73347128829451e-06, + "loss": 0.5476, + "step": 5186 + }, + { + "epoch": 2.564083549623038, + "grad_norm": 0.1349702109688897, + "learning_rate": 5.72994991504712e-06, + "loss": 0.5272, + "step": 5187 + }, + { + "epoch": 2.564577926090718, + "grad_norm": 0.1319979854177678, + "learning_rate": 5.726429189241827e-06, + "loss": 0.54, + "step": 5188 + }, + { + "epoch": 2.565072302558398, + "grad_norm": 0.13386670159842987, + "learning_rate": 5.722909111412447e-06, + "loss": 0.5678, + "step": 5189 + }, + { + "epoch": 2.5655666790260785, + "grad_norm": 0.1357465067701742, + "learning_rate": 5.719389682092712e-06, + "loss": 0.5497, + "step": 5190 + }, + { + "epoch": 2.5660610554937584, + "grad_norm": 0.14165482074366484, + "learning_rate": 5.715870901816256e-06, + "loss": 0.546, + "step": 5191 + }, + { + "epoch": 2.5665554319614388, + "grad_norm": 0.1394482221002608, + "learning_rate": 5.712352771116605e-06, + "loss": 0.5817, + "step": 5192 + }, + { + "epoch": 2.5670498084291187, + "grad_norm": 0.1355827204612357, + "learning_rate": 5.708835290527197e-06, + "loss": 0.5899, + "step": 5193 + }, + { + "epoch": 2.567544184896799, + "grad_norm": 0.15265779914835018, + "learning_rate": 5.705318460581359e-06, + "loss": 0.5441, + "step": 5194 + }, + { + "epoch": 2.568038561364479, + "grad_norm": 0.14495670009278117, + "learning_rate": 5.701802281812338e-06, + "loss": 0.5598, + "step": 5195 + }, + { + "epoch": 2.5685329378321593, + "grad_norm": 0.13637233147920053, + "learning_rate": 5.698286754753258e-06, + "loss": 0.569, + "step": 5196 + }, + { + "epoch": 2.569027314299839, + "grad_norm": 0.1370753960681341, + "learning_rate": 5.69477187993717e-06, + "loss": 0.6231, + "step": 5197 + }, + { + "epoch": 2.5695216907675196, + "grad_norm": 0.1409729036438364, + "learning_rate": 5.691257657897003e-06, + "loss": 0.605, + "step": 5198 + }, + { + "epoch": 2.5700160672351995, + "grad_norm": 0.13704658959761384, + "learning_rate": 5.687744089165604e-06, + "loss": 0.547, + "step": 5199 + }, + { + "epoch": 2.57051044370288, + "grad_norm": 0.14236805542973222, + "learning_rate": 5.68423117427571e-06, + "loss": 0.5796, + "step": 5200 + }, + { + "epoch": 2.5710048201705598, + "grad_norm": 0.14214734855706937, + "learning_rate": 5.680718913759964e-06, + "loss": 0.5465, + "step": 5201 + }, + { + "epoch": 2.57149919663824, + "grad_norm": 0.14316912776807245, + "learning_rate": 5.677207308150916e-06, + "loss": 0.5386, + "step": 5202 + }, + { + "epoch": 2.57199357310592, + "grad_norm": 0.13695942946567777, + "learning_rate": 5.673696357981002e-06, + "loss": 0.5796, + "step": 5203 + }, + { + "epoch": 2.5724879495736004, + "grad_norm": 0.14110132950758728, + "learning_rate": 5.670186063782566e-06, + "loss": 0.5894, + "step": 5204 + }, + { + "epoch": 2.5729823260412803, + "grad_norm": 0.13919344070109343, + "learning_rate": 5.666676426087855e-06, + "loss": 0.5898, + "step": 5205 + }, + { + "epoch": 2.5734767025089607, + "grad_norm": 0.13527483619001088, + "learning_rate": 5.663167445429019e-06, + "loss": 0.5687, + "step": 5206 + }, + { + "epoch": 2.5739710789766406, + "grad_norm": 0.14148963813308751, + "learning_rate": 5.659659122338092e-06, + "loss": 0.6074, + "step": 5207 + }, + { + "epoch": 2.574465455444321, + "grad_norm": 0.13649596671572015, + "learning_rate": 5.656151457347034e-06, + "loss": 0.5614, + "step": 5208 + }, + { + "epoch": 2.574959831912001, + "grad_norm": 0.13236109587415187, + "learning_rate": 5.652644450987685e-06, + "loss": 0.5467, + "step": 5209 + }, + { + "epoch": 2.575454208379681, + "grad_norm": 0.13244619085081472, + "learning_rate": 5.649138103791787e-06, + "loss": 0.5448, + "step": 5210 + }, + { + "epoch": 2.575948584847361, + "grad_norm": 0.13808181747538006, + "learning_rate": 5.6456324162909885e-06, + "loss": 0.5745, + "step": 5211 + }, + { + "epoch": 2.5764429613150415, + "grad_norm": 0.13619281625330007, + "learning_rate": 5.642127389016842e-06, + "loss": 0.5732, + "step": 5212 + }, + { + "epoch": 2.5769373377827214, + "grad_norm": 0.1314720255309068, + "learning_rate": 5.638623022500786e-06, + "loss": 0.5623, + "step": 5213 + }, + { + "epoch": 2.5774317142504017, + "grad_norm": 0.133336171614363, + "learning_rate": 5.635119317274174e-06, + "loss": 0.5293, + "step": 5214 + }, + { + "epoch": 2.5779260907180817, + "grad_norm": 0.13723843109310424, + "learning_rate": 5.631616273868242e-06, + "loss": 0.538, + "step": 5215 + }, + { + "epoch": 2.578420467185762, + "grad_norm": 0.13659746502667983, + "learning_rate": 5.628113892814142e-06, + "loss": 0.5651, + "step": 5216 + }, + { + "epoch": 2.5789148436534424, + "grad_norm": 0.13834816426914157, + "learning_rate": 5.624612174642922e-06, + "loss": 0.5733, + "step": 5217 + }, + { + "epoch": 2.5794092201211223, + "grad_norm": 0.1392633202835842, + "learning_rate": 5.621111119885521e-06, + "loss": 0.5529, + "step": 5218 + }, + { + "epoch": 2.579903596588802, + "grad_norm": 0.13547191665929237, + "learning_rate": 5.617610729072787e-06, + "loss": 0.5459, + "step": 5219 + }, + { + "epoch": 2.5803979730564826, + "grad_norm": 0.13733601899398257, + "learning_rate": 5.614111002735461e-06, + "loss": 0.57, + "step": 5220 + }, + { + "epoch": 2.580892349524163, + "grad_norm": 0.14167246059858785, + "learning_rate": 5.610611941404181e-06, + "loss": 0.5742, + "step": 5221 + }, + { + "epoch": 2.581386725991843, + "grad_norm": 0.14033193299613764, + "learning_rate": 5.607113545609495e-06, + "loss": 0.5367, + "step": 5222 + }, + { + "epoch": 2.5818811024595227, + "grad_norm": 0.137109110780268, + "learning_rate": 5.603615815881845e-06, + "loss": 0.5767, + "step": 5223 + }, + { + "epoch": 2.582375478927203, + "grad_norm": 0.13493630968053336, + "learning_rate": 5.600118752751562e-06, + "loss": 0.5718, + "step": 5224 + }, + { + "epoch": 2.5828698553948835, + "grad_norm": 0.13589682016237123, + "learning_rate": 5.5966223567488975e-06, + "loss": 0.5254, + "step": 5225 + }, + { + "epoch": 2.5833642318625634, + "grad_norm": 0.137552475458758, + "learning_rate": 5.59312662840398e-06, + "loss": 0.5496, + "step": 5226 + }, + { + "epoch": 2.5838586083302433, + "grad_norm": 0.13936631940944, + "learning_rate": 5.589631568246841e-06, + "loss": 0.5407, + "step": 5227 + }, + { + "epoch": 2.5843529847979236, + "grad_norm": 0.1311039859050573, + "learning_rate": 5.586137176807429e-06, + "loss": 0.5556, + "step": 5228 + }, + { + "epoch": 2.584847361265604, + "grad_norm": 0.13644017582585014, + "learning_rate": 5.582643454615572e-06, + "loss": 0.5408, + "step": 5229 + }, + { + "epoch": 2.585341737733284, + "grad_norm": 0.1422154539244573, + "learning_rate": 5.579150402200997e-06, + "loss": 0.5815, + "step": 5230 + }, + { + "epoch": 2.585836114200964, + "grad_norm": 0.1323229163125091, + "learning_rate": 5.575658020093342e-06, + "loss": 0.6389, + "step": 5231 + }, + { + "epoch": 2.586330490668644, + "grad_norm": 0.1392546203625532, + "learning_rate": 5.57216630882213e-06, + "loss": 0.6024, + "step": 5232 + }, + { + "epoch": 2.5868248671363245, + "grad_norm": 0.13699428968422855, + "learning_rate": 5.56867526891679e-06, + "loss": 0.5556, + "step": 5233 + }, + { + "epoch": 2.5873192436040044, + "grad_norm": 0.1370789618214025, + "learning_rate": 5.565184900906653e-06, + "loss": 0.5635, + "step": 5234 + }, + { + "epoch": 2.5878136200716844, + "grad_norm": 0.1394451755891981, + "learning_rate": 5.561695205320937e-06, + "loss": 0.5975, + "step": 5235 + }, + { + "epoch": 2.5883079965393647, + "grad_norm": 0.13745668256006813, + "learning_rate": 5.558206182688762e-06, + "loss": 0.5611, + "step": 5236 + }, + { + "epoch": 2.588802373007045, + "grad_norm": 0.1378454421676196, + "learning_rate": 5.5547178335391536e-06, + "loss": 0.5531, + "step": 5237 + }, + { + "epoch": 2.589296749474725, + "grad_norm": 0.1388748169029275, + "learning_rate": 5.551230158401021e-06, + "loss": 0.535, + "step": 5238 + }, + { + "epoch": 2.589791125942405, + "grad_norm": 0.14128370144923633, + "learning_rate": 5.547743157803185e-06, + "loss": 0.5705, + "step": 5239 + }, + { + "epoch": 2.5902855024100853, + "grad_norm": 0.13218609296392855, + "learning_rate": 5.544256832274362e-06, + "loss": 0.5482, + "step": 5240 + }, + { + "epoch": 2.5907798788777656, + "grad_norm": 0.13547435696243781, + "learning_rate": 5.5407711823431545e-06, + "loss": 0.5574, + "step": 5241 + }, + { + "epoch": 2.5912742553454455, + "grad_norm": 0.13016316354432098, + "learning_rate": 5.537286208538077e-06, + "loss": 0.5428, + "step": 5242 + }, + { + "epoch": 2.5917686318131254, + "grad_norm": 0.13865404097235146, + "learning_rate": 5.53380191138753e-06, + "loss": 0.5715, + "step": 5243 + }, + { + "epoch": 2.592263008280806, + "grad_norm": 0.13602015234567444, + "learning_rate": 5.530318291419821e-06, + "loss": 0.5848, + "step": 5244 + }, + { + "epoch": 2.592757384748486, + "grad_norm": 0.1330808259521017, + "learning_rate": 5.5268353491631525e-06, + "loss": 0.5651, + "step": 5245 + }, + { + "epoch": 2.593251761216166, + "grad_norm": 0.13548824015587269, + "learning_rate": 5.523353085145617e-06, + "loss": 0.5414, + "step": 5246 + }, + { + "epoch": 2.593746137683846, + "grad_norm": 0.13562742060202956, + "learning_rate": 5.519871499895208e-06, + "loss": 0.5943, + "step": 5247 + }, + { + "epoch": 2.5942405141515263, + "grad_norm": 0.1392908554708138, + "learning_rate": 5.516390593939824e-06, + "loss": 0.55, + "step": 5248 + }, + { + "epoch": 2.5947348906192067, + "grad_norm": 0.13380010221270774, + "learning_rate": 5.512910367807246e-06, + "loss": 0.5761, + "step": 5249 + }, + { + "epoch": 2.5952292670868866, + "grad_norm": 0.1422386051689916, + "learning_rate": 5.509430822025163e-06, + "loss": 0.5557, + "step": 5250 + }, + { + "epoch": 2.5957236435545665, + "grad_norm": 0.13372119037681351, + "learning_rate": 5.505951957121165e-06, + "loss": 0.534, + "step": 5251 + }, + { + "epoch": 2.596218020022247, + "grad_norm": 0.1348243804698149, + "learning_rate": 5.502473773622723e-06, + "loss": 0.5589, + "step": 5252 + }, + { + "epoch": 2.5967123964899272, + "grad_norm": 0.13313638178540368, + "learning_rate": 5.498996272057213e-06, + "loss": 0.5784, + "step": 5253 + }, + { + "epoch": 2.597206772957607, + "grad_norm": 0.13316696683597348, + "learning_rate": 5.495519452951908e-06, + "loss": 0.562, + "step": 5254 + }, + { + "epoch": 2.5977011494252875, + "grad_norm": 0.13420396802449572, + "learning_rate": 5.492043316833984e-06, + "loss": 0.5449, + "step": 5255 + }, + { + "epoch": 2.5981955258929674, + "grad_norm": 0.13321676874720134, + "learning_rate": 5.488567864230499e-06, + "loss": 0.5845, + "step": 5256 + }, + { + "epoch": 2.598689902360648, + "grad_norm": 0.13395682086707142, + "learning_rate": 5.485093095668419e-06, + "loss": 0.5466, + "step": 5257 + }, + { + "epoch": 2.5991842788283277, + "grad_norm": 0.13070175008500517, + "learning_rate": 5.4816190116746e-06, + "loss": 0.5767, + "step": 5258 + }, + { + "epoch": 2.599678655296008, + "grad_norm": 0.1333871264833943, + "learning_rate": 5.478145612775799e-06, + "loss": 0.5646, + "step": 5259 + }, + { + "epoch": 2.600173031763688, + "grad_norm": 0.13363328973011115, + "learning_rate": 5.474672899498663e-06, + "loss": 0.5565, + "step": 5260 + }, + { + "epoch": 2.6006674082313683, + "grad_norm": 0.13433894219532594, + "learning_rate": 5.471200872369744e-06, + "loss": 0.5745, + "step": 5261 + }, + { + "epoch": 2.6011617846990482, + "grad_norm": 0.1366490717792372, + "learning_rate": 5.46772953191548e-06, + "loss": 0.5553, + "step": 5262 + }, + { + "epoch": 2.6016561611667286, + "grad_norm": 0.1379765140417098, + "learning_rate": 5.464258878662212e-06, + "loss": 0.5702, + "step": 5263 + }, + { + "epoch": 2.6021505376344085, + "grad_norm": 0.13491393287897155, + "learning_rate": 5.460788913136173e-06, + "loss": 0.5896, + "step": 5264 + }, + { + "epoch": 2.602644914102089, + "grad_norm": 0.14111356756072313, + "learning_rate": 5.45731963586349e-06, + "loss": 0.5997, + "step": 5265 + }, + { + "epoch": 2.603139290569769, + "grad_norm": 0.14631189947206102, + "learning_rate": 5.453851047370198e-06, + "loss": 0.5502, + "step": 5266 + }, + { + "epoch": 2.603633667037449, + "grad_norm": 0.13223846032211092, + "learning_rate": 5.45038314818221e-06, + "loss": 0.5614, + "step": 5267 + }, + { + "epoch": 2.604128043505129, + "grad_norm": 0.1430047288503426, + "learning_rate": 5.4469159388253475e-06, + "loss": 0.5491, + "step": 5268 + }, + { + "epoch": 2.6046224199728094, + "grad_norm": 0.13141463737767817, + "learning_rate": 5.443449419825321e-06, + "loss": 0.5325, + "step": 5269 + }, + { + "epoch": 2.6051167964404893, + "grad_norm": 0.13409822970667296, + "learning_rate": 5.439983591707734e-06, + "loss": 0.5472, + "step": 5270 + }, + { + "epoch": 2.6056111729081697, + "grad_norm": 0.1354709877939259, + "learning_rate": 5.436518454998092e-06, + "loss": 0.5527, + "step": 5271 + }, + { + "epoch": 2.6061055493758496, + "grad_norm": 0.13349245955887826, + "learning_rate": 5.433054010221798e-06, + "loss": 0.5405, + "step": 5272 + }, + { + "epoch": 2.60659992584353, + "grad_norm": 0.14172680974661847, + "learning_rate": 5.429590257904136e-06, + "loss": 0.592, + "step": 5273 + }, + { + "epoch": 2.60709430231121, + "grad_norm": 0.13732790246836113, + "learning_rate": 5.426127198570303e-06, + "loss": 0.592, + "step": 5274 + }, + { + "epoch": 2.6075886787788902, + "grad_norm": 0.13225664389147856, + "learning_rate": 5.422664832745379e-06, + "loss": 0.5387, + "step": 5275 + }, + { + "epoch": 2.60808305524657, + "grad_norm": 0.13657944136371117, + "learning_rate": 5.41920316095433e-06, + "loss": 0.5507, + "step": 5276 + }, + { + "epoch": 2.6085774317142505, + "grad_norm": 0.1369890617276846, + "learning_rate": 5.415742183722048e-06, + "loss": 0.5446, + "step": 5277 + }, + { + "epoch": 2.6090718081819304, + "grad_norm": 0.13485211380996828, + "learning_rate": 5.4122819015732915e-06, + "loss": 0.5328, + "step": 5278 + }, + { + "epoch": 2.6095661846496108, + "grad_norm": 0.13852889905166998, + "learning_rate": 5.408822315032718e-06, + "loss": 0.5708, + "step": 5279 + }, + { + "epoch": 2.6100605611172907, + "grad_norm": 0.13476570143135772, + "learning_rate": 5.405363424624891e-06, + "loss": 0.5797, + "step": 5280 + }, + { + "epoch": 2.610554937584971, + "grad_norm": 0.1407174282739234, + "learning_rate": 5.4019052308742545e-06, + "loss": 0.5282, + "step": 5281 + }, + { + "epoch": 2.611049314052651, + "grad_norm": 0.13419477390422466, + "learning_rate": 5.398447734305157e-06, + "loss": 0.5505, + "step": 5282 + }, + { + "epoch": 2.6115436905203313, + "grad_norm": 0.13464503089355787, + "learning_rate": 5.394990935441843e-06, + "loss": 0.553, + "step": 5283 + }, + { + "epoch": 2.612038066988011, + "grad_norm": 0.13990560564249624, + "learning_rate": 5.39153483480844e-06, + "loss": 0.5766, + "step": 5284 + }, + { + "epoch": 2.6125324434556916, + "grad_norm": 0.1356405661456084, + "learning_rate": 5.388079432928974e-06, + "loss": 0.5527, + "step": 5285 + }, + { + "epoch": 2.6130268199233715, + "grad_norm": 0.13638508651717707, + "learning_rate": 5.384624730327375e-06, + "loss": 0.5572, + "step": 5286 + }, + { + "epoch": 2.613521196391052, + "grad_norm": 0.13662130049149765, + "learning_rate": 5.38117072752745e-06, + "loss": 0.5277, + "step": 5287 + }, + { + "epoch": 2.6140155728587318, + "grad_norm": 0.13516939984578016, + "learning_rate": 5.377717425052912e-06, + "loss": 0.5852, + "step": 5288 + }, + { + "epoch": 2.614509949326412, + "grad_norm": 0.14357515134249943, + "learning_rate": 5.374264823427368e-06, + "loss": 0.5873, + "step": 5289 + }, + { + "epoch": 2.615004325794092, + "grad_norm": 0.13852087602834992, + "learning_rate": 5.370812923174311e-06, + "loss": 0.5634, + "step": 5290 + }, + { + "epoch": 2.6154987022617724, + "grad_norm": 0.13155154648444806, + "learning_rate": 5.367361724817136e-06, + "loss": 0.5368, + "step": 5291 + }, + { + "epoch": 2.6159930787294527, + "grad_norm": 0.1340987383037723, + "learning_rate": 5.363911228879125e-06, + "loss": 0.589, + "step": 5292 + }, + { + "epoch": 2.6164874551971327, + "grad_norm": 0.14283350444600165, + "learning_rate": 5.360461435883448e-06, + "loss": 0.5785, + "step": 5293 + }, + { + "epoch": 2.6169818316648126, + "grad_norm": 0.1330065822058321, + "learning_rate": 5.3570123463531935e-06, + "loss": 0.5438, + "step": 5294 + }, + { + "epoch": 2.617476208132493, + "grad_norm": 0.134625196682025, + "learning_rate": 5.3535639608113165e-06, + "loss": 0.5713, + "step": 5295 + }, + { + "epoch": 2.6179705846001733, + "grad_norm": 0.12865936671036304, + "learning_rate": 5.3501162797806706e-06, + "loss": 0.5466, + "step": 5296 + }, + { + "epoch": 2.618464961067853, + "grad_norm": 0.13820791518421544, + "learning_rate": 5.346669303784018e-06, + "loss": 0.5707, + "step": 5297 + }, + { + "epoch": 2.618959337535533, + "grad_norm": 0.13819700576191093, + "learning_rate": 5.343223033343992e-06, + "loss": 0.5928, + "step": 5298 + }, + { + "epoch": 2.6194537140032135, + "grad_norm": 0.14011053907285936, + "learning_rate": 5.339777468983135e-06, + "loss": 0.5576, + "step": 5299 + }, + { + "epoch": 2.619948090470894, + "grad_norm": 0.13376414634442502, + "learning_rate": 5.3363326112238825e-06, + "loss": 0.5658, + "step": 5300 + }, + { + "epoch": 2.6204424669385737, + "grad_norm": 0.14190990815410418, + "learning_rate": 5.33288846058855e-06, + "loss": 0.5554, + "step": 5301 + }, + { + "epoch": 2.6209368434062537, + "grad_norm": 0.14357016961842536, + "learning_rate": 5.329445017599354e-06, + "loss": 0.5591, + "step": 5302 + }, + { + "epoch": 2.621431219873934, + "grad_norm": 0.1375148362447249, + "learning_rate": 5.326002282778409e-06, + "loss": 0.5879, + "step": 5303 + }, + { + "epoch": 2.6219255963416144, + "grad_norm": 0.1321482096350083, + "learning_rate": 5.322560256647706e-06, + "loss": 0.5445, + "step": 5304 + }, + { + "epoch": 2.6224199728092943, + "grad_norm": 0.1403315430224791, + "learning_rate": 5.319118939729146e-06, + "loss": 0.5705, + "step": 5305 + }, + { + "epoch": 2.622914349276974, + "grad_norm": 0.1461875240062784, + "learning_rate": 5.31567833254452e-06, + "loss": 0.5545, + "step": 5306 + }, + { + "epoch": 2.6234087257446546, + "grad_norm": 0.13695677573127416, + "learning_rate": 5.312238435615495e-06, + "loss": 0.5799, + "step": 5307 + }, + { + "epoch": 2.623903102212335, + "grad_norm": 0.13685155325915985, + "learning_rate": 5.308799249463652e-06, + "loss": 0.5864, + "step": 5308 + }, + { + "epoch": 2.624397478680015, + "grad_norm": 0.13658273279076222, + "learning_rate": 5.305360774610446e-06, + "loss": 0.5789, + "step": 5309 + }, + { + "epoch": 2.6248918551476947, + "grad_norm": 0.139791444045451, + "learning_rate": 5.301923011577242e-06, + "loss": 0.5321, + "step": 5310 + }, + { + "epoch": 2.625386231615375, + "grad_norm": 0.13733518645086257, + "learning_rate": 5.298485960885276e-06, + "loss": 0.5381, + "step": 5311 + }, + { + "epoch": 2.6258806080830555, + "grad_norm": 0.13460079503377595, + "learning_rate": 5.295049623055697e-06, + "loss": 0.5875, + "step": 5312 + }, + { + "epoch": 2.6263749845507354, + "grad_norm": 0.13384223890171656, + "learning_rate": 5.291613998609528e-06, + "loss": 0.5718, + "step": 5313 + }, + { + "epoch": 2.6268693610184153, + "grad_norm": 0.1392955708219336, + "learning_rate": 5.288179088067697e-06, + "loss": 0.562, + "step": 5314 + }, + { + "epoch": 2.6273637374860956, + "grad_norm": 0.1374187687008103, + "learning_rate": 5.284744891951024e-06, + "loss": 0.5974, + "step": 5315 + }, + { + "epoch": 2.627858113953776, + "grad_norm": 0.13748556035769835, + "learning_rate": 5.281311410780203e-06, + "loss": 0.5395, + "step": 5316 + }, + { + "epoch": 2.628352490421456, + "grad_norm": 0.13755016414235494, + "learning_rate": 5.277878645075845e-06, + "loss": 0.5533, + "step": 5317 + }, + { + "epoch": 2.628846866889136, + "grad_norm": 0.13774269395257413, + "learning_rate": 5.274446595358434e-06, + "loss": 0.5565, + "step": 5318 + }, + { + "epoch": 2.629341243356816, + "grad_norm": 0.14140517176382156, + "learning_rate": 5.2710152621483465e-06, + "loss": 0.5544, + "step": 5319 + }, + { + "epoch": 2.6298356198244965, + "grad_norm": 0.1343481863695424, + "learning_rate": 5.26758464596586e-06, + "loss": 0.5663, + "step": 5320 + }, + { + "epoch": 2.6303299962921765, + "grad_norm": 0.13288068039025858, + "learning_rate": 5.2641547473311405e-06, + "loss": 0.5642, + "step": 5321 + }, + { + "epoch": 2.6308243727598564, + "grad_norm": 0.1410329871386872, + "learning_rate": 5.260725566764237e-06, + "loss": 0.5732, + "step": 5322 + }, + { + "epoch": 2.6313187492275367, + "grad_norm": 0.13508937099229693, + "learning_rate": 5.257297104785103e-06, + "loss": 0.5307, + "step": 5323 + }, + { + "epoch": 2.631813125695217, + "grad_norm": 0.12631549249282828, + "learning_rate": 5.253869361913571e-06, + "loss": 0.5643, + "step": 5324 + }, + { + "epoch": 2.632307502162897, + "grad_norm": 0.13902225854011463, + "learning_rate": 5.250442338669362e-06, + "loss": 0.5661, + "step": 5325 + }, + { + "epoch": 2.632801878630577, + "grad_norm": 0.134189235251694, + "learning_rate": 5.247016035572109e-06, + "loss": 0.6254, + "step": 5326 + }, + { + "epoch": 2.6332962550982573, + "grad_norm": 0.14537221646866702, + "learning_rate": 5.2435904531413165e-06, + "loss": 0.5553, + "step": 5327 + }, + { + "epoch": 2.6337906315659376, + "grad_norm": 0.14227023945451397, + "learning_rate": 5.240165591896378e-06, + "loss": 0.5902, + "step": 5328 + }, + { + "epoch": 2.6342850080336175, + "grad_norm": 0.12677310007700315, + "learning_rate": 5.236741452356596e-06, + "loss": 0.5561, + "step": 5329 + }, + { + "epoch": 2.634779384501298, + "grad_norm": 0.13537884589346044, + "learning_rate": 5.233318035041143e-06, + "loss": 0.5727, + "step": 5330 + }, + { + "epoch": 2.635273760968978, + "grad_norm": 0.13902094663320064, + "learning_rate": 5.229895340469093e-06, + "loss": 0.5891, + "step": 5331 + }, + { + "epoch": 2.635768137436658, + "grad_norm": 0.13556062449238176, + "learning_rate": 5.226473369159417e-06, + "loss": 0.5792, + "step": 5332 + }, + { + "epoch": 2.636262513904338, + "grad_norm": 0.13286149599214853, + "learning_rate": 5.223052121630956e-06, + "loss": 0.5568, + "step": 5333 + }, + { + "epoch": 2.6367568903720184, + "grad_norm": 0.13741512047506382, + "learning_rate": 5.219631598402464e-06, + "loss": 0.5753, + "step": 5334 + }, + { + "epoch": 2.6372512668396983, + "grad_norm": 0.13760416188231805, + "learning_rate": 5.216211799992568e-06, + "loss": 0.5351, + "step": 5335 + }, + { + "epoch": 2.6377456433073787, + "grad_norm": 0.13638281410563294, + "learning_rate": 5.21279272691979e-06, + "loss": 0.5447, + "step": 5336 + }, + { + "epoch": 2.6382400197750586, + "grad_norm": 0.13249700965999678, + "learning_rate": 5.209374379702545e-06, + "loss": 0.5623, + "step": 5337 + }, + { + "epoch": 2.638734396242739, + "grad_norm": 0.1377709641056506, + "learning_rate": 5.205956758859143e-06, + "loss": 0.576, + "step": 5338 + }, + { + "epoch": 2.639228772710419, + "grad_norm": 0.13715147150101092, + "learning_rate": 5.202539864907767e-06, + "loss": 0.5491, + "step": 5339 + }, + { + "epoch": 2.6397231491780992, + "grad_norm": 0.13317580988005082, + "learning_rate": 5.19912369836651e-06, + "loss": 0.5348, + "step": 5340 + }, + { + "epoch": 2.640217525645779, + "grad_norm": 0.13543027692856527, + "learning_rate": 5.195708259753341e-06, + "loss": 0.5441, + "step": 5341 + }, + { + "epoch": 2.6407119021134595, + "grad_norm": 0.13285602506026392, + "learning_rate": 5.1922935495861125e-06, + "loss": 0.5714, + "step": 5342 + }, + { + "epoch": 2.6412062785811394, + "grad_norm": 0.13542672724016222, + "learning_rate": 5.188879568382595e-06, + "loss": 0.544, + "step": 5343 + }, + { + "epoch": 2.64170065504882, + "grad_norm": 0.14041171518807824, + "learning_rate": 5.185466316660419e-06, + "loss": 0.566, + "step": 5344 + }, + { + "epoch": 2.6421950315164997, + "grad_norm": 0.13371689963166813, + "learning_rate": 5.182053794937114e-06, + "loss": 0.5673, + "step": 5345 + }, + { + "epoch": 2.64268940798418, + "grad_norm": 0.14207548191850722, + "learning_rate": 5.178642003730107e-06, + "loss": 0.5335, + "step": 5346 + }, + { + "epoch": 2.64318378445186, + "grad_norm": 0.13586259149632693, + "learning_rate": 5.1752309435567e-06, + "loss": 0.5381, + "step": 5347 + }, + { + "epoch": 2.6436781609195403, + "grad_norm": 0.13064461736676833, + "learning_rate": 5.171820614934094e-06, + "loss": 0.5539, + "step": 5348 + }, + { + "epoch": 2.6441725373872202, + "grad_norm": 0.1591610598998963, + "learning_rate": 5.168411018379384e-06, + "loss": 0.6313, + "step": 5349 + }, + { + "epoch": 2.6446669138549006, + "grad_norm": 0.15098380693789795, + "learning_rate": 5.165002154409538e-06, + "loss": 0.5685, + "step": 5350 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.12796664811105032, + "learning_rate": 5.161594023541423e-06, + "loss": 0.5496, + "step": 5351 + }, + { + "epoch": 2.645655666790261, + "grad_norm": 0.13545117215583685, + "learning_rate": 5.1581866262917965e-06, + "loss": 0.556, + "step": 5352 + }, + { + "epoch": 2.646150043257941, + "grad_norm": 0.13555686446667511, + "learning_rate": 5.154779963177299e-06, + "loss": 0.5683, + "step": 5353 + }, + { + "epoch": 2.646644419725621, + "grad_norm": 0.135597067824681, + "learning_rate": 5.15137403471446e-06, + "loss": 0.5459, + "step": 5354 + }, + { + "epoch": 2.647138796193301, + "grad_norm": 0.13414103200742816, + "learning_rate": 5.1479688414197095e-06, + "loss": 0.5533, + "step": 5355 + }, + { + "epoch": 2.6476331726609814, + "grad_norm": 0.13309793184258273, + "learning_rate": 5.144564383809345e-06, + "loss": 0.6217, + "step": 5356 + }, + { + "epoch": 2.6481275491286613, + "grad_norm": 0.13472566951322185, + "learning_rate": 5.141160662399575e-06, + "loss": 0.5509, + "step": 5357 + }, + { + "epoch": 2.6486219255963417, + "grad_norm": 0.13397139728989735, + "learning_rate": 5.1377576777064745e-06, + "loss": 0.5643, + "step": 5358 + }, + { + "epoch": 2.6491163020640216, + "grad_norm": 0.13506979758247264, + "learning_rate": 5.134355430246027e-06, + "loss": 0.5576, + "step": 5359 + }, + { + "epoch": 2.649610678531702, + "grad_norm": 0.1366299696853015, + "learning_rate": 5.1309539205340875e-06, + "loss": 0.5662, + "step": 5360 + }, + { + "epoch": 2.650105054999382, + "grad_norm": 0.13941271886302423, + "learning_rate": 5.1275531490864135e-06, + "loss": 0.5589, + "step": 5361 + }, + { + "epoch": 2.6505994314670622, + "grad_norm": 0.13213179133830266, + "learning_rate": 5.124153116418636e-06, + "loss": 0.5619, + "step": 5362 + }, + { + "epoch": 2.651093807934742, + "grad_norm": 0.130585359181859, + "learning_rate": 5.1207538230462896e-06, + "loss": 0.5389, + "step": 5363 + }, + { + "epoch": 2.6515881844024225, + "grad_norm": 0.13996254315096154, + "learning_rate": 5.1173552694847804e-06, + "loss": 0.5517, + "step": 5364 + }, + { + "epoch": 2.6520825608701024, + "grad_norm": 0.13457736274606316, + "learning_rate": 5.113957456249414e-06, + "loss": 0.5753, + "step": 5365 + }, + { + "epoch": 2.6525769373377828, + "grad_norm": 0.13608597690647511, + "learning_rate": 5.110560383855387e-06, + "loss": 0.566, + "step": 5366 + }, + { + "epoch": 2.653071313805463, + "grad_norm": 0.13292322760233347, + "learning_rate": 5.10716405281777e-06, + "loss": 0.5357, + "step": 5367 + }, + { + "epoch": 2.653565690273143, + "grad_norm": 0.13356440918659307, + "learning_rate": 5.103768463651528e-06, + "loss": 0.5507, + "step": 5368 + }, + { + "epoch": 2.654060066740823, + "grad_norm": 0.13735968139873206, + "learning_rate": 5.100373616871514e-06, + "loss": 0.5252, + "step": 5369 + }, + { + "epoch": 2.6545544432085033, + "grad_norm": 0.1429672209774706, + "learning_rate": 5.096979512992475e-06, + "loss": 0.6026, + "step": 5370 + }, + { + "epoch": 2.6550488196761837, + "grad_norm": 0.14553447604585484, + "learning_rate": 5.093586152529028e-06, + "loss": 0.5911, + "step": 5371 + }, + { + "epoch": 2.6555431961438636, + "grad_norm": 0.1374129832450093, + "learning_rate": 5.090193535995698e-06, + "loss": 0.5595, + "step": 5372 + }, + { + "epoch": 2.6560375726115435, + "grad_norm": 0.1344629901308611, + "learning_rate": 5.0868016639068825e-06, + "loss": 0.5494, + "step": 5373 + }, + { + "epoch": 2.656531949079224, + "grad_norm": 0.1279550641990402, + "learning_rate": 5.083410536776867e-06, + "loss": 0.5574, + "step": 5374 + }, + { + "epoch": 2.657026325546904, + "grad_norm": 0.13398968291212626, + "learning_rate": 5.0800201551198315e-06, + "loss": 0.5631, + "step": 5375 + }, + { + "epoch": 2.657520702014584, + "grad_norm": 0.12931389544533936, + "learning_rate": 5.076630519449843e-06, + "loss": 0.5343, + "step": 5376 + }, + { + "epoch": 2.658015078482264, + "grad_norm": 0.13900090732941756, + "learning_rate": 5.073241630280845e-06, + "loss": 0.5636, + "step": 5377 + }, + { + "epoch": 2.6585094549499444, + "grad_norm": 0.13563194025490313, + "learning_rate": 5.06985348812668e-06, + "loss": 0.526, + "step": 5378 + }, + { + "epoch": 2.6590038314176248, + "grad_norm": 0.1399008101488407, + "learning_rate": 5.066466093501066e-06, + "loss": 0.5487, + "step": 5379 + }, + { + "epoch": 2.6594982078853047, + "grad_norm": 0.13345875541723445, + "learning_rate": 5.063079446917616e-06, + "loss": 0.573, + "step": 5380 + }, + { + "epoch": 2.6599925843529846, + "grad_norm": 0.13140841784293172, + "learning_rate": 5.059693548889832e-06, + "loss": 0.5164, + "step": 5381 + }, + { + "epoch": 2.660486960820665, + "grad_norm": 0.13430898489122528, + "learning_rate": 5.056308399931087e-06, + "loss": 0.5657, + "step": 5382 + }, + { + "epoch": 2.6609813372883453, + "grad_norm": 0.13379150728465516, + "learning_rate": 5.052924000554662e-06, + "loss": 0.5516, + "step": 5383 + }, + { + "epoch": 2.661475713756025, + "grad_norm": 0.13674178494440745, + "learning_rate": 5.049540351273708e-06, + "loss": 0.5937, + "step": 5384 + }, + { + "epoch": 2.661970090223705, + "grad_norm": 0.1419835145521016, + "learning_rate": 5.0461574526012616e-06, + "loss": 0.5767, + "step": 5385 + }, + { + "epoch": 2.6624644666913855, + "grad_norm": 0.1371254411881354, + "learning_rate": 5.042775305050258e-06, + "loss": 0.5638, + "step": 5386 + }, + { + "epoch": 2.662958843159066, + "grad_norm": 0.1336607241524411, + "learning_rate": 5.039393909133515e-06, + "loss": 0.5581, + "step": 5387 + }, + { + "epoch": 2.6634532196267457, + "grad_norm": 0.1401078913842988, + "learning_rate": 5.036013265363724e-06, + "loss": 0.561, + "step": 5388 + }, + { + "epoch": 2.6639475960944257, + "grad_norm": 0.13267968265526422, + "learning_rate": 5.0326333742534814e-06, + "loss": 0.5567, + "step": 5389 + }, + { + "epoch": 2.664441972562106, + "grad_norm": 0.13349717248773205, + "learning_rate": 5.029254236315257e-06, + "loss": 0.5372, + "step": 5390 + }, + { + "epoch": 2.6649363490297864, + "grad_norm": 0.13932243164645625, + "learning_rate": 5.025875852061399e-06, + "loss": 0.5398, + "step": 5391 + }, + { + "epoch": 2.6654307254974663, + "grad_norm": 0.12891413854783587, + "learning_rate": 5.0224982220041686e-06, + "loss": 0.603, + "step": 5392 + }, + { + "epoch": 2.665925101965146, + "grad_norm": 0.14086952923295853, + "learning_rate": 5.019121346655687e-06, + "loss": 0.586, + "step": 5393 + }, + { + "epoch": 2.6664194784328266, + "grad_norm": 0.1399670543753958, + "learning_rate": 5.015745226527966e-06, + "loss": 0.5359, + "step": 5394 + }, + { + "epoch": 2.666913854900507, + "grad_norm": 0.13410369956490156, + "learning_rate": 5.0123698621329145e-06, + "loss": 0.5564, + "step": 5395 + }, + { + "epoch": 2.667408231368187, + "grad_norm": 0.13697935126626307, + "learning_rate": 5.0089952539823095e-06, + "loss": 0.5514, + "step": 5396 + }, + { + "epoch": 2.6679026078358667, + "grad_norm": 0.13984284892333057, + "learning_rate": 5.005621402587829e-06, + "loss": 0.5547, + "step": 5397 + }, + { + "epoch": 2.668396984303547, + "grad_norm": 0.13451117559544679, + "learning_rate": 5.002248308461032e-06, + "loss": 0.5275, + "step": 5398 + }, + { + "epoch": 2.6688913607712275, + "grad_norm": 0.13111798696712, + "learning_rate": 4.998875972113356e-06, + "loss": 0.5528, + "step": 5399 + }, + { + "epoch": 2.6693857372389074, + "grad_norm": 0.14159321428139324, + "learning_rate": 4.9955043940561264e-06, + "loss": 0.5856, + "step": 5400 + }, + { + "epoch": 2.6698801137065877, + "grad_norm": 0.13612796851867, + "learning_rate": 4.992133574800563e-06, + "loss": 0.5575, + "step": 5401 + }, + { + "epoch": 2.6703744901742676, + "grad_norm": 0.13515391568726723, + "learning_rate": 4.988763514857753e-06, + "loss": 0.5596, + "step": 5402 + }, + { + "epoch": 2.670868866641948, + "grad_norm": 0.13280485831763256, + "learning_rate": 4.985394214738683e-06, + "loss": 0.5354, + "step": 5403 + }, + { + "epoch": 2.671363243109628, + "grad_norm": 0.13600598019616403, + "learning_rate": 4.9820256749542255e-06, + "loss": 0.5483, + "step": 5404 + }, + { + "epoch": 2.6718576195773083, + "grad_norm": 0.13764633827186276, + "learning_rate": 4.978657896015121e-06, + "loss": 0.5653, + "step": 5405 + }, + { + "epoch": 2.672351996044988, + "grad_norm": 0.1371107105839227, + "learning_rate": 4.975290878432016e-06, + "loss": 0.5683, + "step": 5406 + }, + { + "epoch": 2.6728463725126685, + "grad_norm": 0.13292060241922315, + "learning_rate": 4.971924622715423e-06, + "loss": 0.5451, + "step": 5407 + }, + { + "epoch": 2.6733407489803485, + "grad_norm": 0.14459130569446282, + "learning_rate": 4.968559129375751e-06, + "loss": 0.5531, + "step": 5408 + }, + { + "epoch": 2.673835125448029, + "grad_norm": 0.13177928807553918, + "learning_rate": 4.965194398923293e-06, + "loss": 0.5472, + "step": 5409 + }, + { + "epoch": 2.6743295019157087, + "grad_norm": 0.13921445610823155, + "learning_rate": 4.9618304318682185e-06, + "loss": 0.5875, + "step": 5410 + }, + { + "epoch": 2.674823878383389, + "grad_norm": 0.13441939015003757, + "learning_rate": 4.958467228720583e-06, + "loss": 0.5414, + "step": 5411 + }, + { + "epoch": 2.675318254851069, + "grad_norm": 0.13529321301650596, + "learning_rate": 4.955104789990336e-06, + "loss": 0.5659, + "step": 5412 + }, + { + "epoch": 2.6758126313187494, + "grad_norm": 0.14239160441905085, + "learning_rate": 4.9517431161872964e-06, + "loss": 0.5757, + "step": 5413 + }, + { + "epoch": 2.6763070077864293, + "grad_norm": 0.1391565651196586, + "learning_rate": 4.9483822078211775e-06, + "loss": 0.5598, + "step": 5414 + }, + { + "epoch": 2.6768013842541096, + "grad_norm": 0.13484667621672203, + "learning_rate": 4.945022065401579e-06, + "loss": 0.5971, + "step": 5415 + }, + { + "epoch": 2.6772957607217895, + "grad_norm": 0.13613531686655572, + "learning_rate": 4.941662689437975e-06, + "loss": 0.5522, + "step": 5416 + }, + { + "epoch": 2.67779013718947, + "grad_norm": 0.13882659368624278, + "learning_rate": 4.938304080439722e-06, + "loss": 0.5565, + "step": 5417 + }, + { + "epoch": 2.67828451365715, + "grad_norm": 0.13372942464278864, + "learning_rate": 4.934946238916071e-06, + "loss": 0.5734, + "step": 5418 + }, + { + "epoch": 2.67877889012483, + "grad_norm": 0.14055394691033882, + "learning_rate": 4.931589165376157e-06, + "loss": 0.5817, + "step": 5419 + }, + { + "epoch": 2.67927326659251, + "grad_norm": 0.13711611935462015, + "learning_rate": 4.928232860328983e-06, + "loss": 0.5639, + "step": 5420 + }, + { + "epoch": 2.6797676430601904, + "grad_norm": 0.13275666832453134, + "learning_rate": 4.924877324283452e-06, + "loss": 0.5363, + "step": 5421 + }, + { + "epoch": 2.6802620195278704, + "grad_norm": 0.1403960648470411, + "learning_rate": 4.92152255774834e-06, + "loss": 0.5944, + "step": 5422 + }, + { + "epoch": 2.6807563959955507, + "grad_norm": 0.13687708561393236, + "learning_rate": 4.918168561232313e-06, + "loss": 0.5803, + "step": 5423 + }, + { + "epoch": 2.6812507724632306, + "grad_norm": 0.130050473898903, + "learning_rate": 4.9148153352439135e-06, + "loss": 0.5888, + "step": 5424 + }, + { + "epoch": 2.681745148930911, + "grad_norm": 0.13352487743278538, + "learning_rate": 4.911462880291576e-06, + "loss": 0.5398, + "step": 5425 + }, + { + "epoch": 2.682239525398591, + "grad_norm": 0.13200682939777558, + "learning_rate": 4.908111196883608e-06, + "loss": 0.5545, + "step": 5426 + }, + { + "epoch": 2.6827339018662713, + "grad_norm": 0.13993763224431563, + "learning_rate": 4.904760285528211e-06, + "loss": 0.5415, + "step": 5427 + }, + { + "epoch": 2.683228278333951, + "grad_norm": 0.13272772612038963, + "learning_rate": 4.901410146733459e-06, + "loss": 0.6191, + "step": 5428 + }, + { + "epoch": 2.6837226548016315, + "grad_norm": 0.13422039350595794, + "learning_rate": 4.898060781007312e-06, + "loss": 0.5249, + "step": 5429 + }, + { + "epoch": 2.6842170312693114, + "grad_norm": 0.13287421895483054, + "learning_rate": 4.894712188857622e-06, + "loss": 0.5611, + "step": 5430 + }, + { + "epoch": 2.684711407736992, + "grad_norm": 0.1324330258918816, + "learning_rate": 4.8913643707921075e-06, + "loss": 0.5422, + "step": 5431 + }, + { + "epoch": 2.6852057842046717, + "grad_norm": 0.1299994427955663, + "learning_rate": 4.888017327318385e-06, + "loss": 0.5334, + "step": 5432 + }, + { + "epoch": 2.685700160672352, + "grad_norm": 0.13652619028830576, + "learning_rate": 4.8846710589439435e-06, + "loss": 0.5583, + "step": 5433 + }, + { + "epoch": 2.686194537140032, + "grad_norm": 0.13139712283548693, + "learning_rate": 4.881325566176154e-06, + "loss": 0.5957, + "step": 5434 + }, + { + "epoch": 2.6866889136077123, + "grad_norm": 0.1330481029117425, + "learning_rate": 4.8779808495222755e-06, + "loss": 0.5416, + "step": 5435 + }, + { + "epoch": 2.6871832900753923, + "grad_norm": 0.14165399105306406, + "learning_rate": 4.8746369094894544e-06, + "loss": 0.585, + "step": 5436 + }, + { + "epoch": 2.6876776665430726, + "grad_norm": 0.13326623623165518, + "learning_rate": 4.871293746584701e-06, + "loss": 0.5627, + "step": 5437 + }, + { + "epoch": 2.688172043010753, + "grad_norm": 0.1416237015049329, + "learning_rate": 4.86795136131493e-06, + "loss": 0.541, + "step": 5438 + }, + { + "epoch": 2.688666419478433, + "grad_norm": 0.13617702163457635, + "learning_rate": 4.864609754186921e-06, + "loss": 0.5149, + "step": 5439 + }, + { + "epoch": 2.689160795946113, + "grad_norm": 0.1399329215010266, + "learning_rate": 4.861268925707335e-06, + "loss": 0.5476, + "step": 5440 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 0.14346113935278804, + "learning_rate": 4.8579288763827384e-06, + "loss": 0.5615, + "step": 5441 + }, + { + "epoch": 2.6901495488814735, + "grad_norm": 0.13388503161808943, + "learning_rate": 4.854589606719553e-06, + "loss": 0.5822, + "step": 5442 + }, + { + "epoch": 2.6906439253491534, + "grad_norm": 0.1479465152329293, + "learning_rate": 4.851251117224089e-06, + "loss": 0.5712, + "step": 5443 + }, + { + "epoch": 2.6911383018168333, + "grad_norm": 0.14007362765515322, + "learning_rate": 4.84791340840255e-06, + "loss": 0.6213, + "step": 5444 + }, + { + "epoch": 2.6916326782845137, + "grad_norm": 0.14122508520702173, + "learning_rate": 4.844576480761005e-06, + "loss": 0.5365, + "step": 5445 + }, + { + "epoch": 2.692127054752194, + "grad_norm": 0.13186533035501058, + "learning_rate": 4.841240334805416e-06, + "loss": 0.5255, + "step": 5446 + }, + { + "epoch": 2.692621431219874, + "grad_norm": 0.13682027874204328, + "learning_rate": 4.837904971041626e-06, + "loss": 0.5522, + "step": 5447 + }, + { + "epoch": 2.693115807687554, + "grad_norm": 0.1415584404978849, + "learning_rate": 4.834570389975354e-06, + "loss": 0.5667, + "step": 5448 + }, + { + "epoch": 2.6936101841552342, + "grad_norm": 0.1447778258275978, + "learning_rate": 4.8312365921121965e-06, + "loss": 0.5778, + "step": 5449 + }, + { + "epoch": 2.6941045606229146, + "grad_norm": 0.1366910996309786, + "learning_rate": 4.827903577957646e-06, + "loss": 0.5759, + "step": 5450 + }, + { + "epoch": 2.6945989370905945, + "grad_norm": 0.13820558504406913, + "learning_rate": 4.8245713480170594e-06, + "loss": 0.5933, + "step": 5451 + }, + { + "epoch": 2.6950933135582744, + "grad_norm": 0.13674343323389088, + "learning_rate": 4.821239902795689e-06, + "loss": 0.6141, + "step": 5452 + }, + { + "epoch": 2.6955876900259548, + "grad_norm": 0.14034248461604887, + "learning_rate": 4.817909242798662e-06, + "loss": 0.581, + "step": 5453 + }, + { + "epoch": 2.696082066493635, + "grad_norm": 0.13684291957819902, + "learning_rate": 4.8145793685309805e-06, + "loss": 0.56, + "step": 5454 + }, + { + "epoch": 2.696576442961315, + "grad_norm": 0.1349713957181729, + "learning_rate": 4.811250280497541e-06, + "loss": 0.5386, + "step": 5455 + }, + { + "epoch": 2.697070819428995, + "grad_norm": 0.13024317488975187, + "learning_rate": 4.80792197920311e-06, + "loss": 0.5532, + "step": 5456 + }, + { + "epoch": 2.6975651958966753, + "grad_norm": 0.13432602901438184, + "learning_rate": 4.804594465152329e-06, + "loss": 0.5482, + "step": 5457 + }, + { + "epoch": 2.6980595723643557, + "grad_norm": 0.13847559075859922, + "learning_rate": 4.801267738849745e-06, + "loss": 0.591, + "step": 5458 + }, + { + "epoch": 2.6985539488320356, + "grad_norm": 0.13924273012645028, + "learning_rate": 4.797941800799763e-06, + "loss": 0.5646, + "step": 5459 + }, + { + "epoch": 2.6990483252997155, + "grad_norm": 0.13622378262936383, + "learning_rate": 4.794616651506667e-06, + "loss": 0.6019, + "step": 5460 + }, + { + "epoch": 2.699542701767396, + "grad_norm": 0.13164595224041678, + "learning_rate": 4.791292291474643e-06, + "loss": 0.5412, + "step": 5461 + }, + { + "epoch": 2.700037078235076, + "grad_norm": 0.13366372872657556, + "learning_rate": 4.787968721207731e-06, + "loss": 0.5716, + "step": 5462 + }, + { + "epoch": 2.700531454702756, + "grad_norm": 0.1343540893543147, + "learning_rate": 4.7846459412098715e-06, + "loss": 0.5433, + "step": 5463 + }, + { + "epoch": 2.701025831170436, + "grad_norm": 0.13894463191752346, + "learning_rate": 4.7813239519848795e-06, + "loss": 0.5724, + "step": 5464 + }, + { + "epoch": 2.7015202076381164, + "grad_norm": 0.13568216952229573, + "learning_rate": 4.778002754036445e-06, + "loss": 0.5847, + "step": 5465 + }, + { + "epoch": 2.7020145841057968, + "grad_norm": 0.13619215714542332, + "learning_rate": 4.774682347868137e-06, + "loss": 0.5911, + "step": 5466 + }, + { + "epoch": 2.7025089605734767, + "grad_norm": 0.1360385302135351, + "learning_rate": 4.7713627339834146e-06, + "loss": 0.5819, + "step": 5467 + }, + { + "epoch": 2.7030033370411566, + "grad_norm": 0.1372723835538, + "learning_rate": 4.768043912885612e-06, + "loss": 0.5562, + "step": 5468 + }, + { + "epoch": 2.703497713508837, + "grad_norm": 0.132620717591341, + "learning_rate": 4.7647258850779364e-06, + "loss": 0.5641, + "step": 5469 + }, + { + "epoch": 2.7039920899765173, + "grad_norm": 0.13488954532349956, + "learning_rate": 4.761408651063487e-06, + "loss": 0.5856, + "step": 5470 + }, + { + "epoch": 2.704486466444197, + "grad_norm": 0.13552745223568988, + "learning_rate": 4.75809221134523e-06, + "loss": 0.5428, + "step": 5471 + }, + { + "epoch": 2.704980842911877, + "grad_norm": 0.13311553107381377, + "learning_rate": 4.7547765664260225e-06, + "loss": 0.5747, + "step": 5472 + }, + { + "epoch": 2.7054752193795575, + "grad_norm": 0.14010023225143672, + "learning_rate": 4.751461716808591e-06, + "loss": 0.5659, + "step": 5473 + }, + { + "epoch": 2.705969595847238, + "grad_norm": 0.13271608595594683, + "learning_rate": 4.7481476629955515e-06, + "loss": 0.5519, + "step": 5474 + }, + { + "epoch": 2.7064639723149178, + "grad_norm": 0.14159968488722385, + "learning_rate": 4.744834405489388e-06, + "loss": 0.5616, + "step": 5475 + }, + { + "epoch": 2.706958348782598, + "grad_norm": 0.13163865998503058, + "learning_rate": 4.7415219447924775e-06, + "loss": 0.5684, + "step": 5476 + }, + { + "epoch": 2.707452725250278, + "grad_norm": 0.13494142164618123, + "learning_rate": 4.73821028140706e-06, + "loss": 0.5761, + "step": 5477 + }, + { + "epoch": 2.7079471017179584, + "grad_norm": 0.13236172518672004, + "learning_rate": 4.734899415835267e-06, + "loss": 0.5465, + "step": 5478 + }, + { + "epoch": 2.7084414781856383, + "grad_norm": 0.13342795444789118, + "learning_rate": 4.73158934857911e-06, + "loss": 0.568, + "step": 5479 + }, + { + "epoch": 2.7089358546533187, + "grad_norm": 0.13234909741375558, + "learning_rate": 4.728280080140466e-06, + "loss": 0.565, + "step": 5480 + }, + { + "epoch": 2.7094302311209986, + "grad_norm": 0.13917536779383743, + "learning_rate": 4.724971611021107e-06, + "loss": 0.5787, + "step": 5481 + }, + { + "epoch": 2.709924607588679, + "grad_norm": 0.13096421509825687, + "learning_rate": 4.721663941722675e-06, + "loss": 0.5671, + "step": 5482 + }, + { + "epoch": 2.710418984056359, + "grad_norm": 0.1387828914475262, + "learning_rate": 4.7183570727466855e-06, + "loss": 0.5577, + "step": 5483 + }, + { + "epoch": 2.710913360524039, + "grad_norm": 0.1379671251218075, + "learning_rate": 4.715051004594543e-06, + "loss": 0.5456, + "step": 5484 + }, + { + "epoch": 2.711407736991719, + "grad_norm": 0.13287711203484434, + "learning_rate": 4.7117457377675325e-06, + "loss": 0.5563, + "step": 5485 + }, + { + "epoch": 2.7119021134593995, + "grad_norm": 0.12588284931601337, + "learning_rate": 4.708441272766803e-06, + "loss": 0.5604, + "step": 5486 + }, + { + "epoch": 2.7123964899270794, + "grad_norm": 0.1343087554348564, + "learning_rate": 4.705137610093398e-06, + "loss": 0.5773, + "step": 5487 + }, + { + "epoch": 2.7128908663947597, + "grad_norm": 0.1335822643664267, + "learning_rate": 4.701834750248229e-06, + "loss": 0.542, + "step": 5488 + }, + { + "epoch": 2.7133852428624397, + "grad_norm": 0.13651459262853624, + "learning_rate": 4.698532693732081e-06, + "loss": 0.5549, + "step": 5489 + }, + { + "epoch": 2.71387961933012, + "grad_norm": 0.13304909838722997, + "learning_rate": 4.69523144104564e-06, + "loss": 0.5568, + "step": 5490 + }, + { + "epoch": 2.7143739957978, + "grad_norm": 0.12882112506139234, + "learning_rate": 4.691930992689449e-06, + "loss": 0.5711, + "step": 5491 + }, + { + "epoch": 2.7148683722654803, + "grad_norm": 0.13254422154101314, + "learning_rate": 4.6886313491639276e-06, + "loss": 0.5622, + "step": 5492 + }, + { + "epoch": 2.71536274873316, + "grad_norm": 0.13194737921581487, + "learning_rate": 4.685332510969394e-06, + "loss": 0.5405, + "step": 5493 + }, + { + "epoch": 2.7158571252008405, + "grad_norm": 0.13721172359413383, + "learning_rate": 4.682034478606019e-06, + "loss": 0.5554, + "step": 5494 + }, + { + "epoch": 2.7163515016685205, + "grad_norm": 0.1320623581318528, + "learning_rate": 4.67873725257387e-06, + "loss": 0.5716, + "step": 5495 + }, + { + "epoch": 2.716845878136201, + "grad_norm": 0.14009623749018427, + "learning_rate": 4.675440833372887e-06, + "loss": 0.5608, + "step": 5496 + }, + { + "epoch": 2.7173402546038807, + "grad_norm": 0.13757474452632804, + "learning_rate": 4.672145221502882e-06, + "loss": 0.5491, + "step": 5497 + }, + { + "epoch": 2.717834631071561, + "grad_norm": 0.13616766060482918, + "learning_rate": 4.668850417463553e-06, + "loss": 0.5508, + "step": 5498 + }, + { + "epoch": 2.718329007539241, + "grad_norm": 0.1324593689885869, + "learning_rate": 4.66555642175447e-06, + "loss": 0.5534, + "step": 5499 + }, + { + "epoch": 2.7188233840069214, + "grad_norm": 0.13297578782009228, + "learning_rate": 4.662263234875077e-06, + "loss": 0.5765, + "step": 5500 + }, + { + "epoch": 2.7193177604746013, + "grad_norm": 0.1362099508766291, + "learning_rate": 4.658970857324705e-06, + "loss": 0.5669, + "step": 5501 + }, + { + "epoch": 2.7198121369422816, + "grad_norm": 0.13455990497866702, + "learning_rate": 4.65567928960256e-06, + "loss": 0.5887, + "step": 5502 + }, + { + "epoch": 2.7203065134099615, + "grad_norm": 0.13624837468400963, + "learning_rate": 4.6523885322077145e-06, + "loss": 0.5718, + "step": 5503 + }, + { + "epoch": 2.720800889877642, + "grad_norm": 0.12878046826192957, + "learning_rate": 4.649098585639136e-06, + "loss": 0.5359, + "step": 5504 + }, + { + "epoch": 2.721295266345322, + "grad_norm": 0.13928159324629738, + "learning_rate": 4.645809450395654e-06, + "loss": 0.6157, + "step": 5505 + }, + { + "epoch": 2.721789642813002, + "grad_norm": 0.13548770210023756, + "learning_rate": 4.642521126975974e-06, + "loss": 0.5432, + "step": 5506 + }, + { + "epoch": 2.722284019280682, + "grad_norm": 0.14023678493838776, + "learning_rate": 4.6392336158786985e-06, + "loss": 0.587, + "step": 5507 + }, + { + "epoch": 2.7227783957483624, + "grad_norm": 0.1378315383777661, + "learning_rate": 4.635946917602287e-06, + "loss": 0.5171, + "step": 5508 + }, + { + "epoch": 2.7232727722160424, + "grad_norm": 0.13381654300582496, + "learning_rate": 4.632661032645076e-06, + "loss": 0.5992, + "step": 5509 + }, + { + "epoch": 2.7237671486837227, + "grad_norm": 0.13310560783045616, + "learning_rate": 4.6293759615052946e-06, + "loss": 0.5346, + "step": 5510 + }, + { + "epoch": 2.7242615251514026, + "grad_norm": 0.13868135318254324, + "learning_rate": 4.626091704681028e-06, + "loss": 0.5443, + "step": 5511 + }, + { + "epoch": 2.724755901619083, + "grad_norm": 0.13483848599730117, + "learning_rate": 4.622808262670256e-06, + "loss": 0.552, + "step": 5512 + }, + { + "epoch": 2.7252502780867633, + "grad_norm": 0.1390574544258047, + "learning_rate": 4.619525635970827e-06, + "loss": 0.5595, + "step": 5513 + }, + { + "epoch": 2.7257446545544433, + "grad_norm": 0.13468680041231107, + "learning_rate": 4.616243825080466e-06, + "loss": 0.5615, + "step": 5514 + }, + { + "epoch": 2.726239031022123, + "grad_norm": 0.1369864520174809, + "learning_rate": 4.612962830496767e-06, + "loss": 0.5589, + "step": 5515 + }, + { + "epoch": 2.7267334074898035, + "grad_norm": 0.13271986441517136, + "learning_rate": 4.609682652717218e-06, + "loss": 0.5974, + "step": 5516 + }, + { + "epoch": 2.727227783957484, + "grad_norm": 0.13560066139994834, + "learning_rate": 4.6064032922391624e-06, + "loss": 0.5639, + "step": 5517 + }, + { + "epoch": 2.727722160425164, + "grad_norm": 0.13606157228071158, + "learning_rate": 4.603124749559835e-06, + "loss": 0.543, + "step": 5518 + }, + { + "epoch": 2.7282165368928437, + "grad_norm": 0.13207438962556764, + "learning_rate": 4.599847025176347e-06, + "loss": 0.5893, + "step": 5519 + }, + { + "epoch": 2.728710913360524, + "grad_norm": 0.13978520465989855, + "learning_rate": 4.596570119585671e-06, + "loss": 0.5941, + "step": 5520 + }, + { + "epoch": 2.7292052898282044, + "grad_norm": 0.1396515899559599, + "learning_rate": 4.593294033284671e-06, + "loss": 0.547, + "step": 5521 + }, + { + "epoch": 2.7296996662958843, + "grad_norm": 0.13174248232657995, + "learning_rate": 4.590018766770074e-06, + "loss": 0.5737, + "step": 5522 + }, + { + "epoch": 2.7301940427635643, + "grad_norm": 0.13418364262414317, + "learning_rate": 4.5867443205384964e-06, + "loss": 0.5726, + "step": 5523 + }, + { + "epoch": 2.7306884192312446, + "grad_norm": 0.13421416151722593, + "learning_rate": 4.583470695086416e-06, + "loss": 0.5379, + "step": 5524 + }, + { + "epoch": 2.731182795698925, + "grad_norm": 0.1462566505917209, + "learning_rate": 4.5801978909102e-06, + "loss": 0.5759, + "step": 5525 + }, + { + "epoch": 2.731677172166605, + "grad_norm": 0.13271040840491896, + "learning_rate": 4.576925908506076e-06, + "loss": 0.5715, + "step": 5526 + }, + { + "epoch": 2.732171548634285, + "grad_norm": 0.13316360131390728, + "learning_rate": 4.573654748370163e-06, + "loss": 0.5666, + "step": 5527 + }, + { + "epoch": 2.732665925101965, + "grad_norm": 0.1410195986712342, + "learning_rate": 4.5703844109984395e-06, + "loss": 0.5624, + "step": 5528 + }, + { + "epoch": 2.7331603015696455, + "grad_norm": 0.13107247157798763, + "learning_rate": 4.567114896886773e-06, + "loss": 0.5375, + "step": 5529 + }, + { + "epoch": 2.7336546780373254, + "grad_norm": 0.13447579194042164, + "learning_rate": 4.563846206530901e-06, + "loss": 0.5808, + "step": 5530 + }, + { + "epoch": 2.7341490545050053, + "grad_norm": 0.13329476947194288, + "learning_rate": 4.5605783404264334e-06, + "loss": 0.544, + "step": 5531 + }, + { + "epoch": 2.7346434309726857, + "grad_norm": 0.13578689104035815, + "learning_rate": 4.557311299068853e-06, + "loss": 0.5943, + "step": 5532 + }, + { + "epoch": 2.735137807440366, + "grad_norm": 0.13782895344808277, + "learning_rate": 4.554045082953525e-06, + "loss": 0.5467, + "step": 5533 + }, + { + "epoch": 2.735632183908046, + "grad_norm": 0.1296461429765445, + "learning_rate": 4.550779692575692e-06, + "loss": 0.5622, + "step": 5534 + }, + { + "epoch": 2.736126560375726, + "grad_norm": 0.1303310459352239, + "learning_rate": 4.547515128430455e-06, + "loss": 0.5279, + "step": 5535 + }, + { + "epoch": 2.7366209368434062, + "grad_norm": 0.13486113201003874, + "learning_rate": 4.544251391012809e-06, + "loss": 0.5452, + "step": 5536 + }, + { + "epoch": 2.7371153133110866, + "grad_norm": 0.1367755718883588, + "learning_rate": 4.540988480817613e-06, + "loss": 0.566, + "step": 5537 + }, + { + "epoch": 2.7376096897787665, + "grad_norm": 0.1428142108984249, + "learning_rate": 4.537726398339597e-06, + "loss": 0.5557, + "step": 5538 + }, + { + "epoch": 2.7381040662464464, + "grad_norm": 0.14145845694408274, + "learning_rate": 4.534465144073374e-06, + "loss": 0.5877, + "step": 5539 + }, + { + "epoch": 2.738598442714127, + "grad_norm": 0.13399334713702288, + "learning_rate": 4.5312047185134336e-06, + "loss": 0.5859, + "step": 5540 + }, + { + "epoch": 2.739092819181807, + "grad_norm": 0.1330570479933706, + "learning_rate": 4.527945122154127e-06, + "loss": 0.5438, + "step": 5541 + }, + { + "epoch": 2.739587195649487, + "grad_norm": 0.13370265226264785, + "learning_rate": 4.524686355489693e-06, + "loss": 0.5743, + "step": 5542 + }, + { + "epoch": 2.740081572117167, + "grad_norm": 0.13554630832437278, + "learning_rate": 4.521428419014235e-06, + "loss": 0.5779, + "step": 5543 + }, + { + "epoch": 2.7405759485848473, + "grad_norm": 0.13602704412228298, + "learning_rate": 4.518171313221734e-06, + "loss": 0.5497, + "step": 5544 + }, + { + "epoch": 2.7410703250525277, + "grad_norm": 0.13778998649421906, + "learning_rate": 4.514915038606052e-06, + "loss": 0.5736, + "step": 5545 + }, + { + "epoch": 2.7415647015202076, + "grad_norm": 0.13398272885277515, + "learning_rate": 4.51165959566091e-06, + "loss": 0.5626, + "step": 5546 + }, + { + "epoch": 2.7420590779878875, + "grad_norm": 0.1384392574598779, + "learning_rate": 4.508404984879918e-06, + "loss": 0.5937, + "step": 5547 + }, + { + "epoch": 2.742553454455568, + "grad_norm": 0.1490308669714569, + "learning_rate": 4.50515120675655e-06, + "loss": 0.5925, + "step": 5548 + }, + { + "epoch": 2.743047830923248, + "grad_norm": 0.1410596828319607, + "learning_rate": 4.501898261784155e-06, + "loss": 0.6038, + "step": 5549 + }, + { + "epoch": 2.743542207390928, + "grad_norm": 0.13035444633487528, + "learning_rate": 4.498646150455957e-06, + "loss": 0.5612, + "step": 5550 + }, + { + "epoch": 2.7440365838586085, + "grad_norm": 0.14451175993628695, + "learning_rate": 4.495394873265061e-06, + "loss": 0.5573, + "step": 5551 + }, + { + "epoch": 2.7445309603262884, + "grad_norm": 0.1383052286200954, + "learning_rate": 4.492144430704432e-06, + "loss": 0.5596, + "step": 5552 + }, + { + "epoch": 2.7450253367939688, + "grad_norm": 0.13494562538390079, + "learning_rate": 4.4888948232669194e-06, + "loss": 0.5119, + "step": 5553 + }, + { + "epoch": 2.7455197132616487, + "grad_norm": 0.12890410196106142, + "learning_rate": 4.4856460514452405e-06, + "loss": 0.553, + "step": 5554 + }, + { + "epoch": 2.746014089729329, + "grad_norm": 0.12853690318995556, + "learning_rate": 4.482398115731979e-06, + "loss": 0.5588, + "step": 5555 + }, + { + "epoch": 2.746508466197009, + "grad_norm": 0.14037612879758782, + "learning_rate": 4.479151016619615e-06, + "loss": 0.5458, + "step": 5556 + }, + { + "epoch": 2.7470028426646893, + "grad_norm": 0.13981889210503654, + "learning_rate": 4.4759047546004785e-06, + "loss": 0.5669, + "step": 5557 + }, + { + "epoch": 2.747497219132369, + "grad_norm": 0.13524939571578887, + "learning_rate": 4.472659330166777e-06, + "loss": 0.5605, + "step": 5558 + }, + { + "epoch": 2.7479915956000496, + "grad_norm": 0.1346702160019176, + "learning_rate": 4.469414743810603e-06, + "loss": 0.5607, + "step": 5559 + }, + { + "epoch": 2.7484859720677295, + "grad_norm": 0.13665128069893345, + "learning_rate": 4.466170996023905e-06, + "loss": 0.5792, + "step": 5560 + }, + { + "epoch": 2.74898034853541, + "grad_norm": 0.1378978718708237, + "learning_rate": 4.462928087298519e-06, + "loss": 0.5476, + "step": 5561 + }, + { + "epoch": 2.7494747250030898, + "grad_norm": 0.12325379242690494, + "learning_rate": 4.459686018126149e-06, + "loss": 0.5423, + "step": 5562 + }, + { + "epoch": 2.74996910147077, + "grad_norm": 0.13303235911005165, + "learning_rate": 4.456444788998369e-06, + "loss": 0.5424, + "step": 5563 + }, + { + "epoch": 2.75046347793845, + "grad_norm": 0.13612060471277507, + "learning_rate": 4.453204400406621e-06, + "loss": 0.5395, + "step": 5564 + }, + { + "epoch": 2.7509578544061304, + "grad_norm": 0.1386614518549867, + "learning_rate": 4.449964852842236e-06, + "loss": 0.5373, + "step": 5565 + }, + { + "epoch": 2.7514522308738103, + "grad_norm": 0.13938435647123446, + "learning_rate": 4.446726146796396e-06, + "loss": 0.5744, + "step": 5566 + }, + { + "epoch": 2.7514522308738103, + "eval_loss": 0.6405937075614929, + "eval_runtime": 81.8624, + "eval_samples_per_second": 370.793, + "eval_steps_per_second": 46.358, + "step": 5566 + }, + { + "epoch": 2.7519466073414907, + "grad_norm": 0.13706126673375252, + "learning_rate": 4.443488282760174e-06, + "loss": 0.5859, + "step": 5567 + }, + { + "epoch": 2.7524409838091706, + "grad_norm": 0.137811806301739, + "learning_rate": 4.440251261224509e-06, + "loss": 0.5541, + "step": 5568 + }, + { + "epoch": 2.752935360276851, + "grad_norm": 0.13331997064933732, + "learning_rate": 4.437015082680208e-06, + "loss": 0.5667, + "step": 5569 + }, + { + "epoch": 2.753429736744531, + "grad_norm": 0.13806810165610434, + "learning_rate": 4.433779747617953e-06, + "loss": 0.5963, + "step": 5570 + }, + { + "epoch": 2.753924113212211, + "grad_norm": 0.13705253229147973, + "learning_rate": 4.4305452565282996e-06, + "loss": 0.559, + "step": 5571 + }, + { + "epoch": 2.754418489679891, + "grad_norm": 0.13444985969228265, + "learning_rate": 4.427311609901671e-06, + "loss": 0.5619, + "step": 5572 + }, + { + "epoch": 2.7549128661475715, + "grad_norm": 0.13481216008638172, + "learning_rate": 4.424078808228374e-06, + "loss": 0.546, + "step": 5573 + }, + { + "epoch": 2.7554072426152514, + "grad_norm": 0.13707116630583488, + "learning_rate": 4.420846851998574e-06, + "loss": 0.5391, + "step": 5574 + }, + { + "epoch": 2.7559016190829317, + "grad_norm": 0.13436174165208986, + "learning_rate": 4.417615741702308e-06, + "loss": 0.5511, + "step": 5575 + }, + { + "epoch": 2.7563959955506117, + "grad_norm": 0.13088371771577897, + "learning_rate": 4.4143854778294996e-06, + "loss": 0.5736, + "step": 5576 + }, + { + "epoch": 2.756890372018292, + "grad_norm": 0.13775166516616827, + "learning_rate": 4.4111560608699245e-06, + "loss": 0.5512, + "step": 5577 + }, + { + "epoch": 2.757384748485972, + "grad_norm": 0.13363648871453995, + "learning_rate": 4.407927491313245e-06, + "loss": 0.5319, + "step": 5578 + }, + { + "epoch": 2.7578791249536523, + "grad_norm": 0.13266048016328827, + "learning_rate": 4.404699769648993e-06, + "loss": 0.562, + "step": 5579 + }, + { + "epoch": 2.758373501421332, + "grad_norm": 0.13562071697993638, + "learning_rate": 4.4014728963665654e-06, + "loss": 0.5795, + "step": 5580 + }, + { + "epoch": 2.7588678778890126, + "grad_norm": 0.1362556189113658, + "learning_rate": 4.3982468719552295e-06, + "loss": 0.5784, + "step": 5581 + }, + { + "epoch": 2.7593622543566925, + "grad_norm": 0.1335811802823866, + "learning_rate": 4.395021696904132e-06, + "loss": 0.5518, + "step": 5582 + }, + { + "epoch": 2.759856630824373, + "grad_norm": 0.13533448166287654, + "learning_rate": 4.39179737170229e-06, + "loss": 0.5552, + "step": 5583 + }, + { + "epoch": 2.7603510072920527, + "grad_norm": 0.14104263033202868, + "learning_rate": 4.388573896838581e-06, + "loss": 0.5428, + "step": 5584 + }, + { + "epoch": 2.760845383759733, + "grad_norm": 0.1356345208331816, + "learning_rate": 4.385351272801771e-06, + "loss": 0.6123, + "step": 5585 + }, + { + "epoch": 2.761339760227413, + "grad_norm": 0.13074210840533232, + "learning_rate": 4.38212950008048e-06, + "loss": 0.5687, + "step": 5586 + }, + { + "epoch": 2.7618341366950934, + "grad_norm": 0.13334290462945972, + "learning_rate": 4.378908579163205e-06, + "loss": 0.5578, + "step": 5587 + }, + { + "epoch": 2.7623285131627737, + "grad_norm": 0.1382103474254213, + "learning_rate": 4.375688510538318e-06, + "loss": 0.5668, + "step": 5588 + }, + { + "epoch": 2.7628228896304536, + "grad_norm": 0.13267368349944872, + "learning_rate": 4.37246929469406e-06, + "loss": 0.5617, + "step": 5589 + }, + { + "epoch": 2.7633172660981336, + "grad_norm": 0.1411016159067213, + "learning_rate": 4.369250932118537e-06, + "loss": 0.5347, + "step": 5590 + }, + { + "epoch": 2.763811642565814, + "grad_norm": 0.13026735976069884, + "learning_rate": 4.366033423299737e-06, + "loss": 0.5559, + "step": 5591 + }, + { + "epoch": 2.7643060190334943, + "grad_norm": 0.13500879509144234, + "learning_rate": 4.362816768725503e-06, + "loss": 0.5412, + "step": 5592 + }, + { + "epoch": 2.764800395501174, + "grad_norm": 0.1269377287378035, + "learning_rate": 4.359600968883562e-06, + "loss": 0.5498, + "step": 5593 + }, + { + "epoch": 2.765294771968854, + "grad_norm": 0.1323188211871568, + "learning_rate": 4.356386024261508e-06, + "loss": 0.56, + "step": 5594 + }, + { + "epoch": 2.7657891484365345, + "grad_norm": 0.13077529752767234, + "learning_rate": 4.3531719353467995e-06, + "loss": 0.5394, + "step": 5595 + }, + { + "epoch": 2.766283524904215, + "grad_norm": 0.13078797339253662, + "learning_rate": 4.349958702626775e-06, + "loss": 0.5238, + "step": 5596 + }, + { + "epoch": 2.7667779013718947, + "grad_norm": 0.1356434704169907, + "learning_rate": 4.346746326588634e-06, + "loss": 0.5975, + "step": 5597 + }, + { + "epoch": 2.7672722778395746, + "grad_norm": 0.13470766654403735, + "learning_rate": 4.343534807719446e-06, + "loss": 0.5426, + "step": 5598 + }, + { + "epoch": 2.767766654307255, + "grad_norm": 0.13994009663033818, + "learning_rate": 4.34032414650616e-06, + "loss": 0.588, + "step": 5599 + }, + { + "epoch": 2.7682610307749353, + "grad_norm": 0.1297621667181513, + "learning_rate": 4.33711434343559e-06, + "loss": 0.5891, + "step": 5600 + }, + { + "epoch": 2.7687554072426153, + "grad_norm": 0.15514000882715176, + "learning_rate": 4.333905398994414e-06, + "loss": 0.5393, + "step": 5601 + }, + { + "epoch": 2.769249783710295, + "grad_norm": 0.128559987940894, + "learning_rate": 4.330697313669191e-06, + "loss": 0.5747, + "step": 5602 + }, + { + "epoch": 2.7697441601779755, + "grad_norm": 0.1356210703905623, + "learning_rate": 4.3274900879463414e-06, + "loss": 0.5708, + "step": 5603 + }, + { + "epoch": 2.770238536645656, + "grad_norm": 0.13525362766085106, + "learning_rate": 4.324283722312148e-06, + "loss": 0.5442, + "step": 5604 + }, + { + "epoch": 2.770732913113336, + "grad_norm": 0.13616291099022643, + "learning_rate": 4.321078217252791e-06, + "loss": 0.5659, + "step": 5605 + }, + { + "epoch": 2.7712272895810157, + "grad_norm": 0.13971835176678177, + "learning_rate": 4.317873573254292e-06, + "loss": 0.6068, + "step": 5606 + }, + { + "epoch": 2.771721666048696, + "grad_norm": 0.13410080742697864, + "learning_rate": 4.31466979080255e-06, + "loss": 0.5317, + "step": 5607 + }, + { + "epoch": 2.7722160425163764, + "grad_norm": 0.13785265621450193, + "learning_rate": 4.31146687038334e-06, + "loss": 0.6034, + "step": 5608 + }, + { + "epoch": 2.7727104189840563, + "grad_norm": 0.13465836639461182, + "learning_rate": 4.308264812482296e-06, + "loss": 0.5765, + "step": 5609 + }, + { + "epoch": 2.7732047954517363, + "grad_norm": 0.1368556380975686, + "learning_rate": 4.305063617584931e-06, + "loss": 0.5641, + "step": 5610 + }, + { + "epoch": 2.7736991719194166, + "grad_norm": 0.1429207163469386, + "learning_rate": 4.301863286176625e-06, + "loss": 0.5674, + "step": 5611 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.1358269755779847, + "learning_rate": 4.298663818742623e-06, + "loss": 0.5393, + "step": 5612 + }, + { + "epoch": 2.774687924854777, + "grad_norm": 0.13460037800603286, + "learning_rate": 4.2954652157680365e-06, + "loss": 0.6007, + "step": 5613 + }, + { + "epoch": 2.775182301322457, + "grad_norm": 0.1330241408321715, + "learning_rate": 4.292267477737859e-06, + "loss": 0.5611, + "step": 5614 + }, + { + "epoch": 2.775676677790137, + "grad_norm": 0.13713444428740798, + "learning_rate": 4.289070605136936e-06, + "loss": 0.5508, + "step": 5615 + }, + { + "epoch": 2.7761710542578175, + "grad_norm": 0.13294633782485077, + "learning_rate": 4.285874598449994e-06, + "loss": 0.5563, + "step": 5616 + }, + { + "epoch": 2.7766654307254974, + "grad_norm": 0.1359825158510843, + "learning_rate": 4.282679458161627e-06, + "loss": 0.5823, + "step": 5617 + }, + { + "epoch": 2.7771598071931773, + "grad_norm": 0.13603030003507366, + "learning_rate": 4.279485184756289e-06, + "loss": 0.5818, + "step": 5618 + }, + { + "epoch": 2.7776541836608577, + "grad_norm": 0.13762268289492524, + "learning_rate": 4.276291778718316e-06, + "loss": 0.5681, + "step": 5619 + }, + { + "epoch": 2.778148560128538, + "grad_norm": 0.1339348544479617, + "learning_rate": 4.273099240531901e-06, + "loss": 0.5599, + "step": 5620 + }, + { + "epoch": 2.778642936596218, + "grad_norm": 0.1336441190437951, + "learning_rate": 4.2699075706811e-06, + "loss": 0.5148, + "step": 5621 + }, + { + "epoch": 2.779137313063898, + "grad_norm": 0.1385893703785256, + "learning_rate": 4.266716769649864e-06, + "loss": 0.5811, + "step": 5622 + }, + { + "epoch": 2.7796316895315782, + "grad_norm": 0.13696089950972376, + "learning_rate": 4.263526837921988e-06, + "loss": 0.5629, + "step": 5623 + }, + { + "epoch": 2.7801260659992586, + "grad_norm": 0.1356667400955261, + "learning_rate": 4.260337775981137e-06, + "loss": 0.541, + "step": 5624 + }, + { + "epoch": 2.7806204424669385, + "grad_norm": 0.13455174784461182, + "learning_rate": 4.257149584310858e-06, + "loss": 0.5508, + "step": 5625 + }, + { + "epoch": 2.781114818934619, + "grad_norm": 0.13509256197363592, + "learning_rate": 4.253962263394547e-06, + "loss": 0.59, + "step": 5626 + }, + { + "epoch": 2.781609195402299, + "grad_norm": 0.13881800617243653, + "learning_rate": 4.2507758137154865e-06, + "loss": 0.5463, + "step": 5627 + }, + { + "epoch": 2.782103571869979, + "grad_norm": 0.13318697048715739, + "learning_rate": 4.24759023575682e-06, + "loss": 0.5896, + "step": 5628 + }, + { + "epoch": 2.782597948337659, + "grad_norm": 0.14008372466413901, + "learning_rate": 4.244405530001553e-06, + "loss": 0.5913, + "step": 5629 + }, + { + "epoch": 2.7830923248053394, + "grad_norm": 0.1277540497569438, + "learning_rate": 4.241221696932561e-06, + "loss": 0.5638, + "step": 5630 + }, + { + "epoch": 2.7835867012730193, + "grad_norm": 0.13845223279224458, + "learning_rate": 4.238038737032594e-06, + "loss": 0.5376, + "step": 5631 + }, + { + "epoch": 2.7840810777406997, + "grad_norm": 0.13698998579032023, + "learning_rate": 4.234856650784267e-06, + "loss": 0.546, + "step": 5632 + }, + { + "epoch": 2.7845754542083796, + "grad_norm": 0.13753903904116838, + "learning_rate": 4.2316754386700544e-06, + "loss": 0.5922, + "step": 5633 + }, + { + "epoch": 2.78506983067606, + "grad_norm": 0.14064570297990178, + "learning_rate": 4.228495101172312e-06, + "loss": 0.6271, + "step": 5634 + }, + { + "epoch": 2.78556420714374, + "grad_norm": 0.1330789113299396, + "learning_rate": 4.225315638773246e-06, + "loss": 0.5693, + "step": 5635 + }, + { + "epoch": 2.7860585836114202, + "grad_norm": 0.14074131730689515, + "learning_rate": 4.222137051954949e-06, + "loss": 0.5916, + "step": 5636 + }, + { + "epoch": 2.7865529600791, + "grad_norm": 0.13647074921654181, + "learning_rate": 4.2189593411993615e-06, + "loss": 0.5792, + "step": 5637 + }, + { + "epoch": 2.7870473365467805, + "grad_norm": 0.13220939337660725, + "learning_rate": 4.21578250698831e-06, + "loss": 0.5612, + "step": 5638 + }, + { + "epoch": 2.7875417130144604, + "grad_norm": 0.13029824196310666, + "learning_rate": 4.212606549803469e-06, + "loss": 0.5663, + "step": 5639 + }, + { + "epoch": 2.7880360894821408, + "grad_norm": 0.1387276744711629, + "learning_rate": 4.209431470126402e-06, + "loss": 0.541, + "step": 5640 + }, + { + "epoch": 2.7885304659498207, + "grad_norm": 0.13480941659742574, + "learning_rate": 4.206257268438514e-06, + "loss": 0.5648, + "step": 5641 + }, + { + "epoch": 2.789024842417501, + "grad_norm": 0.13227765856617896, + "learning_rate": 4.203083945221098e-06, + "loss": 0.543, + "step": 5642 + }, + { + "epoch": 2.789519218885181, + "grad_norm": 0.13261226181776015, + "learning_rate": 4.1999115009553075e-06, + "loss": 0.5514, + "step": 5643 + }, + { + "epoch": 2.7900135953528613, + "grad_norm": 0.1308133333283015, + "learning_rate": 4.196739936122155e-06, + "loss": 0.5551, + "step": 5644 + }, + { + "epoch": 2.790507971820541, + "grad_norm": 0.1383400395907739, + "learning_rate": 4.193569251202533e-06, + "loss": 0.5672, + "step": 5645 + }, + { + "epoch": 2.7910023482882216, + "grad_norm": 0.13984308823274808, + "learning_rate": 4.190399446677189e-06, + "loss": 0.5663, + "step": 5646 + }, + { + "epoch": 2.7914967247559015, + "grad_norm": 0.13353243824288547, + "learning_rate": 4.187230523026739e-06, + "loss": 0.5359, + "step": 5647 + }, + { + "epoch": 2.791991101223582, + "grad_norm": 0.13992790170233466, + "learning_rate": 4.184062480731671e-06, + "loss": 0.5996, + "step": 5648 + }, + { + "epoch": 2.7924854776912618, + "grad_norm": 0.1425305287556943, + "learning_rate": 4.180895320272339e-06, + "loss": 0.5943, + "step": 5649 + }, + { + "epoch": 2.792979854158942, + "grad_norm": 0.13321573153478308, + "learning_rate": 4.177729042128955e-06, + "loss": 0.5479, + "step": 5650 + }, + { + "epoch": 2.793474230626622, + "grad_norm": 0.13223528046214497, + "learning_rate": 4.174563646781608e-06, + "loss": 0.5584, + "step": 5651 + }, + { + "epoch": 2.7939686070943024, + "grad_norm": 0.12873896231284312, + "learning_rate": 4.171399134710248e-06, + "loss": 0.5722, + "step": 5652 + }, + { + "epoch": 2.7944629835619823, + "grad_norm": 0.13800750904278122, + "learning_rate": 4.168235506394679e-06, + "loss": 0.5911, + "step": 5653 + }, + { + "epoch": 2.7949573600296627, + "grad_norm": 0.13307184168076835, + "learning_rate": 4.1650727623146e-06, + "loss": 0.5691, + "step": 5654 + }, + { + "epoch": 2.7954517364973426, + "grad_norm": 0.13306630922086315, + "learning_rate": 4.161910902949552e-06, + "loss": 0.6095, + "step": 5655 + }, + { + "epoch": 2.795946112965023, + "grad_norm": 0.1382416634185812, + "learning_rate": 4.158749928778944e-06, + "loss": 0.5693, + "step": 5656 + }, + { + "epoch": 2.796440489432703, + "grad_norm": 0.13240723225423817, + "learning_rate": 4.155589840282063e-06, + "loss": 0.5648, + "step": 5657 + }, + { + "epoch": 2.796934865900383, + "grad_norm": 0.13362282760035343, + "learning_rate": 4.152430637938048e-06, + "loss": 0.5618, + "step": 5658 + }, + { + "epoch": 2.797429242368063, + "grad_norm": 0.1354932435629431, + "learning_rate": 4.149272322225913e-06, + "loss": 0.5555, + "step": 5659 + }, + { + "epoch": 2.7979236188357435, + "grad_norm": 0.13303892250522778, + "learning_rate": 4.146114893624537e-06, + "loss": 0.5558, + "step": 5660 + }, + { + "epoch": 2.7984179953034234, + "grad_norm": 0.136619382493998, + "learning_rate": 4.142958352612656e-06, + "loss": 0.5478, + "step": 5661 + }, + { + "epoch": 2.7989123717711037, + "grad_norm": 0.1352905981557214, + "learning_rate": 4.1398026996688844e-06, + "loss": 0.5741, + "step": 5662 + }, + { + "epoch": 2.799406748238784, + "grad_norm": 0.13314285868215506, + "learning_rate": 4.136647935271691e-06, + "loss": 0.5652, + "step": 5663 + }, + { + "epoch": 2.799901124706464, + "grad_norm": 0.13040043429241685, + "learning_rate": 4.133494059899411e-06, + "loss": 0.5321, + "step": 5664 + }, + { + "epoch": 2.800395501174144, + "grad_norm": 0.12968233514308328, + "learning_rate": 4.130341074030251e-06, + "loss": 0.5392, + "step": 5665 + }, + { + "epoch": 2.8008898776418243, + "grad_norm": 0.1364260230498649, + "learning_rate": 4.127188978142282e-06, + "loss": 0.5299, + "step": 5666 + }, + { + "epoch": 2.8013842541095046, + "grad_norm": 0.12893148049855405, + "learning_rate": 4.1240377727134305e-06, + "loss": 0.5304, + "step": 5667 + }, + { + "epoch": 2.8018786305771846, + "grad_norm": 0.135904158566675, + "learning_rate": 4.120887458221502e-06, + "loss": 0.5643, + "step": 5668 + }, + { + "epoch": 2.8023730070448645, + "grad_norm": 0.13374376434463456, + "learning_rate": 4.117738035144158e-06, + "loss": 0.5546, + "step": 5669 + }, + { + "epoch": 2.802867383512545, + "grad_norm": 0.13217680687069874, + "learning_rate": 4.114589503958917e-06, + "loss": 0.587, + "step": 5670 + }, + { + "epoch": 2.803361759980225, + "grad_norm": 0.13270001865794662, + "learning_rate": 4.111441865143187e-06, + "loss": 0.534, + "step": 5671 + }, + { + "epoch": 2.803856136447905, + "grad_norm": 0.13607558846228918, + "learning_rate": 4.108295119174219e-06, + "loss": 0.5697, + "step": 5672 + }, + { + "epoch": 2.804350512915585, + "grad_norm": 0.13647092234530236, + "learning_rate": 4.105149266529133e-06, + "loss": 0.5908, + "step": 5673 + }, + { + "epoch": 2.8048448893832654, + "grad_norm": 0.1501190829383998, + "learning_rate": 4.102004307684919e-06, + "loss": 0.5535, + "step": 5674 + }, + { + "epoch": 2.8053392658509457, + "grad_norm": 0.13407549608075353, + "learning_rate": 4.098860243118424e-06, + "loss": 0.5656, + "step": 5675 + }, + { + "epoch": 2.8058336423186256, + "grad_norm": 0.13410691590679336, + "learning_rate": 4.095717073306367e-06, + "loss": 0.5432, + "step": 5676 + }, + { + "epoch": 2.8063280187863056, + "grad_norm": 0.13830621544225255, + "learning_rate": 4.09257479872533e-06, + "loss": 0.5594, + "step": 5677 + }, + { + "epoch": 2.806822395253986, + "grad_norm": 0.13425932959249862, + "learning_rate": 4.089433419851757e-06, + "loss": 0.5666, + "step": 5678 + }, + { + "epoch": 2.8073167717216663, + "grad_norm": 0.12986790289255964, + "learning_rate": 4.08629293716195e-06, + "loss": 0.5685, + "step": 5679 + }, + { + "epoch": 2.807811148189346, + "grad_norm": 0.13388222701561334, + "learning_rate": 4.083153351132089e-06, + "loss": 0.5638, + "step": 5680 + }, + { + "epoch": 2.808305524657026, + "grad_norm": 0.13460219597321554, + "learning_rate": 4.080014662238203e-06, + "loss": 0.5699, + "step": 5681 + }, + { + "epoch": 2.8087999011247065, + "grad_norm": 0.12903989975503774, + "learning_rate": 4.076876870956198e-06, + "loss": 0.5444, + "step": 5682 + }, + { + "epoch": 2.809294277592387, + "grad_norm": 0.13054978236588222, + "learning_rate": 4.073739977761841e-06, + "loss": 0.5718, + "step": 5683 + }, + { + "epoch": 2.8097886540600667, + "grad_norm": 0.1360759282877486, + "learning_rate": 4.070603983130754e-06, + "loss": 0.5732, + "step": 5684 + }, + { + "epoch": 2.8102830305277466, + "grad_norm": 0.13560017489113377, + "learning_rate": 4.067468887538435e-06, + "loss": 0.5645, + "step": 5685 + }, + { + "epoch": 2.810777406995427, + "grad_norm": 0.13167262623804726, + "learning_rate": 4.064334691460232e-06, + "loss": 0.5318, + "step": 5686 + }, + { + "epoch": 2.8112717834631074, + "grad_norm": 0.1302672487383476, + "learning_rate": 4.061201395371373e-06, + "loss": 0.5797, + "step": 5687 + }, + { + "epoch": 2.8117661599307873, + "grad_norm": 0.13896747996454703, + "learning_rate": 4.058068999746935e-06, + "loss": 0.5531, + "step": 5688 + }, + { + "epoch": 2.812260536398467, + "grad_norm": 0.13477982219686208, + "learning_rate": 4.054937505061868e-06, + "loss": 0.5771, + "step": 5689 + }, + { + "epoch": 2.8127549128661475, + "grad_norm": 0.1351135737997134, + "learning_rate": 4.051806911790977e-06, + "loss": 0.5841, + "step": 5690 + }, + { + "epoch": 2.813249289333828, + "grad_norm": 0.13361047830720993, + "learning_rate": 4.048677220408942e-06, + "loss": 0.536, + "step": 5691 + }, + { + "epoch": 2.813743665801508, + "grad_norm": 0.13299267291218564, + "learning_rate": 4.045548431390291e-06, + "loss": 0.5577, + "step": 5692 + }, + { + "epoch": 2.8142380422691877, + "grad_norm": 0.13674149833802193, + "learning_rate": 4.042420545209429e-06, + "loss": 0.5861, + "step": 5693 + }, + { + "epoch": 2.814732418736868, + "grad_norm": 0.13376427880094996, + "learning_rate": 4.0392935623406205e-06, + "loss": 0.5536, + "step": 5694 + }, + { + "epoch": 2.8152267952045484, + "grad_norm": 0.1347978526629664, + "learning_rate": 4.036167483257989e-06, + "loss": 0.5894, + "step": 5695 + }, + { + "epoch": 2.8157211716722284, + "grad_norm": 0.13918956349934206, + "learning_rate": 4.033042308435519e-06, + "loss": 0.5389, + "step": 5696 + }, + { + "epoch": 2.8162155481399083, + "grad_norm": 0.13796408454658002, + "learning_rate": 4.029918038347064e-06, + "loss": 0.5723, + "step": 5697 + }, + { + "epoch": 2.8167099246075886, + "grad_norm": 0.1333560484611782, + "learning_rate": 4.026794673466344e-06, + "loss": 0.5621, + "step": 5698 + }, + { + "epoch": 2.817204301075269, + "grad_norm": 0.13628738789280068, + "learning_rate": 4.023672214266928e-06, + "loss": 0.533, + "step": 5699 + }, + { + "epoch": 2.817698677542949, + "grad_norm": 0.13876257241292025, + "learning_rate": 4.020550661222264e-06, + "loss": 0.5582, + "step": 5700 + }, + { + "epoch": 2.8181930540106293, + "grad_norm": 0.14302589559167075, + "learning_rate": 4.017430014805649e-06, + "loss": 0.609, + "step": 5701 + }, + { + "epoch": 2.818687430478309, + "grad_norm": 0.13275548642202345, + "learning_rate": 4.014310275490245e-06, + "loss": 0.5644, + "step": 5702 + }, + { + "epoch": 2.8191818069459895, + "grad_norm": 0.13477262835648865, + "learning_rate": 4.011191443749085e-06, + "loss": 0.5464, + "step": 5703 + }, + { + "epoch": 2.8196761834136694, + "grad_norm": 0.13244973911733648, + "learning_rate": 4.008073520055059e-06, + "loss": 0.5175, + "step": 5704 + }, + { + "epoch": 2.82017055988135, + "grad_norm": 0.13341949411778586, + "learning_rate": 4.004956504880914e-06, + "loss": 0.5771, + "step": 5705 + }, + { + "epoch": 2.8206649363490297, + "grad_norm": 0.14008205393195158, + "learning_rate": 4.001840398699271e-06, + "loss": 0.5591, + "step": 5706 + }, + { + "epoch": 2.82115931281671, + "grad_norm": 0.13708975139566126, + "learning_rate": 3.9987252019825995e-06, + "loss": 0.5613, + "step": 5707 + }, + { + "epoch": 2.82165368928439, + "grad_norm": 0.13437622466421192, + "learning_rate": 3.995610915203241e-06, + "loss": 0.5547, + "step": 5708 + }, + { + "epoch": 2.8221480657520703, + "grad_norm": 0.14020530332855596, + "learning_rate": 3.9924975388334004e-06, + "loss": 0.5535, + "step": 5709 + }, + { + "epoch": 2.8226424422197502, + "grad_norm": 0.13689456570650377, + "learning_rate": 3.9893850733451336e-06, + "loss": 0.5251, + "step": 5710 + }, + { + "epoch": 2.8231368186874306, + "grad_norm": 0.13469024695435622, + "learning_rate": 3.98627351921037e-06, + "loss": 0.586, + "step": 5711 + }, + { + "epoch": 2.8236311951551105, + "grad_norm": 0.13140024969820888, + "learning_rate": 3.983162876900896e-06, + "loss": 0.5424, + "step": 5712 + }, + { + "epoch": 2.824125571622791, + "grad_norm": 0.13883280426491124, + "learning_rate": 3.9800531468883515e-06, + "loss": 0.5706, + "step": 5713 + }, + { + "epoch": 2.824619948090471, + "grad_norm": 0.1381955223071475, + "learning_rate": 3.976944329644254e-06, + "loss": 0.5303, + "step": 5714 + }, + { + "epoch": 2.825114324558151, + "grad_norm": 0.14327187403535904, + "learning_rate": 3.973836425639976e-06, + "loss": 0.5655, + "step": 5715 + }, + { + "epoch": 2.825608701025831, + "grad_norm": 0.13629030699196706, + "learning_rate": 3.970729435346744e-06, + "loss": 0.5417, + "step": 5716 + }, + { + "epoch": 2.8261030774935114, + "grad_norm": 0.13170538884098695, + "learning_rate": 3.9676233592356595e-06, + "loss": 0.551, + "step": 5717 + }, + { + "epoch": 2.8265974539611913, + "grad_norm": 0.14269552523269113, + "learning_rate": 3.964518197777673e-06, + "loss": 0.5852, + "step": 5718 + }, + { + "epoch": 2.8270918304288717, + "grad_norm": 0.13055186782709155, + "learning_rate": 3.961413951443598e-06, + "loss": 0.5667, + "step": 5719 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 0.13609621779094117, + "learning_rate": 3.958310620704125e-06, + "loss": 0.5682, + "step": 5720 + }, + { + "epoch": 2.828080583364232, + "grad_norm": 0.1333997871266675, + "learning_rate": 3.9552082060297835e-06, + "loss": 0.5895, + "step": 5721 + }, + { + "epoch": 2.828574959831912, + "grad_norm": 0.14157910755769978, + "learning_rate": 3.952106707890975e-06, + "loss": 0.5726, + "step": 5722 + }, + { + "epoch": 2.8290693362995922, + "grad_norm": 0.1519691101091339, + "learning_rate": 3.949006126757966e-06, + "loss": 0.5369, + "step": 5723 + }, + { + "epoch": 2.829563712767272, + "grad_norm": 0.13088071391383158, + "learning_rate": 3.9459064631008715e-06, + "loss": 0.5364, + "step": 5724 + }, + { + "epoch": 2.8300580892349525, + "grad_norm": 0.13483170888419854, + "learning_rate": 3.94280771738968e-06, + "loss": 0.5357, + "step": 5725 + }, + { + "epoch": 2.8305524657026324, + "grad_norm": 0.13621725772688267, + "learning_rate": 3.939709890094237e-06, + "loss": 0.5761, + "step": 5726 + }, + { + "epoch": 2.8310468421703128, + "grad_norm": 0.134663682378755, + "learning_rate": 3.936612981684247e-06, + "loss": 0.5443, + "step": 5727 + }, + { + "epoch": 2.8315412186379927, + "grad_norm": 0.13504239394595557, + "learning_rate": 3.9335169926292704e-06, + "loss": 0.5268, + "step": 5728 + }, + { + "epoch": 2.832035595105673, + "grad_norm": 0.13106587871741776, + "learning_rate": 3.93042192339874e-06, + "loss": 0.5813, + "step": 5729 + }, + { + "epoch": 2.832529971573353, + "grad_norm": 0.13828619366172862, + "learning_rate": 3.927327774461937e-06, + "loss": 0.555, + "step": 5730 + }, + { + "epoch": 2.8330243480410333, + "grad_norm": 0.13370669364962787, + "learning_rate": 3.924234546288009e-06, + "loss": 0.5823, + "step": 5731 + }, + { + "epoch": 2.8335187245087132, + "grad_norm": 0.13681567205722728, + "learning_rate": 3.921142239345972e-06, + "loss": 0.516, + "step": 5732 + }, + { + "epoch": 2.8340131009763936, + "grad_norm": 0.13566419324032913, + "learning_rate": 3.918050854104683e-06, + "loss": 0.527, + "step": 5733 + }, + { + "epoch": 2.8345074774440735, + "grad_norm": 0.1338844210804193, + "learning_rate": 3.914960391032879e-06, + "loss": 0.5455, + "step": 5734 + }, + { + "epoch": 2.835001853911754, + "grad_norm": 0.13316473103543905, + "learning_rate": 3.911870850599141e-06, + "loss": 0.572, + "step": 5735 + }, + { + "epoch": 2.8354962303794338, + "grad_norm": 0.13965624382857372, + "learning_rate": 3.908782233271921e-06, + "loss": 0.5465, + "step": 5736 + }, + { + "epoch": 2.835990606847114, + "grad_norm": 0.13905975648756633, + "learning_rate": 3.905694539519531e-06, + "loss": 0.5634, + "step": 5737 + }, + { + "epoch": 2.8364849833147945, + "grad_norm": 0.13402880232738934, + "learning_rate": 3.9026077698101364e-06, + "loss": 0.5832, + "step": 5738 + }, + { + "epoch": 2.8369793597824744, + "grad_norm": 0.1336100639710742, + "learning_rate": 3.899521924611761e-06, + "loss": 0.5639, + "step": 5739 + }, + { + "epoch": 2.8374737362501543, + "grad_norm": 0.1323864522128982, + "learning_rate": 3.896437004392301e-06, + "loss": 0.5473, + "step": 5740 + }, + { + "epoch": 2.8379681127178347, + "grad_norm": 0.13557940102712865, + "learning_rate": 3.893353009619497e-06, + "loss": 0.5462, + "step": 5741 + }, + { + "epoch": 2.838462489185515, + "grad_norm": 0.13683389838370436, + "learning_rate": 3.890269940760961e-06, + "loss": 0.5658, + "step": 5742 + }, + { + "epoch": 2.838956865653195, + "grad_norm": 0.13576158914632588, + "learning_rate": 3.887187798284162e-06, + "loss": 0.5481, + "step": 5743 + }, + { + "epoch": 2.839451242120875, + "grad_norm": 0.13043173299638858, + "learning_rate": 3.884106582656425e-06, + "loss": 0.556, + "step": 5744 + }, + { + "epoch": 2.839945618588555, + "grad_norm": 0.13063775072860145, + "learning_rate": 3.881026294344932e-06, + "loss": 0.5399, + "step": 5745 + }, + { + "epoch": 2.8404399950562356, + "grad_norm": 0.14326159196686755, + "learning_rate": 3.877946933816731e-06, + "loss": 0.6047, + "step": 5746 + }, + { + "epoch": 2.8409343715239155, + "grad_norm": 0.14215567357388006, + "learning_rate": 3.874868501538732e-06, + "loss": 0.5423, + "step": 5747 + }, + { + "epoch": 2.8414287479915954, + "grad_norm": 0.14071401905190856, + "learning_rate": 3.871790997977692e-06, + "loss": 0.566, + "step": 5748 + }, + { + "epoch": 2.8419231244592758, + "grad_norm": 0.13436272422265325, + "learning_rate": 3.868714423600242e-06, + "loss": 0.5832, + "step": 5749 + }, + { + "epoch": 2.842417500926956, + "grad_norm": 0.13820933257336573, + "learning_rate": 3.865638778872859e-06, + "loss": 0.5637, + "step": 5750 + }, + { + "epoch": 2.842911877394636, + "grad_norm": 0.1343640790010925, + "learning_rate": 3.8625640642618824e-06, + "loss": 0.5317, + "step": 5751 + }, + { + "epoch": 2.843406253862316, + "grad_norm": 0.13848860522411133, + "learning_rate": 3.859490280233516e-06, + "loss": 0.5475, + "step": 5752 + }, + { + "epoch": 2.8439006303299963, + "grad_norm": 0.13438977625392587, + "learning_rate": 3.856417427253824e-06, + "loss": 0.5623, + "step": 5753 + }, + { + "epoch": 2.8443950067976767, + "grad_norm": 0.13352175914157574, + "learning_rate": 3.853345505788716e-06, + "loss": 0.5714, + "step": 5754 + }, + { + "epoch": 2.8448893832653566, + "grad_norm": 0.13764239502678507, + "learning_rate": 3.850274516303977e-06, + "loss": 0.5882, + "step": 5755 + }, + { + "epoch": 2.8453837597330365, + "grad_norm": 0.13600578637514224, + "learning_rate": 3.847204459265234e-06, + "loss": 0.5501, + "step": 5756 + }, + { + "epoch": 2.845878136200717, + "grad_norm": 0.14349738303271398, + "learning_rate": 3.844135335137989e-06, + "loss": 0.6, + "step": 5757 + }, + { + "epoch": 2.846372512668397, + "grad_norm": 0.1325890611575564, + "learning_rate": 3.841067144387594e-06, + "loss": 0.5274, + "step": 5758 + }, + { + "epoch": 2.846866889136077, + "grad_norm": 0.14364109620306972, + "learning_rate": 3.837999887479253e-06, + "loss": 0.5588, + "step": 5759 + }, + { + "epoch": 2.847361265603757, + "grad_norm": 0.13799297193179855, + "learning_rate": 3.834933564878048e-06, + "loss": 0.5667, + "step": 5760 + }, + { + "epoch": 2.8478556420714374, + "grad_norm": 0.13228152988185465, + "learning_rate": 3.831868177048897e-06, + "loss": 0.5812, + "step": 5761 + }, + { + "epoch": 2.8483500185391177, + "grad_norm": 0.14083494449563347, + "learning_rate": 3.828803724456589e-06, + "loss": 0.5603, + "step": 5762 + }, + { + "epoch": 2.8488443950067976, + "grad_norm": 0.13691467620427675, + "learning_rate": 3.8257402075657675e-06, + "loss": 0.5601, + "step": 5763 + }, + { + "epoch": 2.8493387714744776, + "grad_norm": 0.13353791300862644, + "learning_rate": 3.822677626840942e-06, + "loss": 0.544, + "step": 5764 + }, + { + "epoch": 2.849833147942158, + "grad_norm": 0.13804374265116595, + "learning_rate": 3.819615982746463e-06, + "loss": 0.5557, + "step": 5765 + }, + { + "epoch": 2.8503275244098383, + "grad_norm": 0.13965634603734625, + "learning_rate": 3.816555275746558e-06, + "loss": 0.5704, + "step": 5766 + }, + { + "epoch": 2.850821900877518, + "grad_norm": 0.1345991775115226, + "learning_rate": 3.8134955063053016e-06, + "loss": 0.5441, + "step": 5767 + }, + { + "epoch": 2.851316277345198, + "grad_norm": 0.13475657365860322, + "learning_rate": 3.8104366748866197e-06, + "loss": 0.5747, + "step": 5768 + }, + { + "epoch": 2.8518106538128785, + "grad_norm": 0.1356263931395629, + "learning_rate": 3.8073787819543175e-06, + "loss": 0.5599, + "step": 5769 + }, + { + "epoch": 2.852305030280559, + "grad_norm": 0.13819893739361847, + "learning_rate": 3.8043218279720396e-06, + "loss": 0.5397, + "step": 5770 + }, + { + "epoch": 2.8527994067482387, + "grad_norm": 0.13144612030966563, + "learning_rate": 3.8012658134032896e-06, + "loss": 0.5657, + "step": 5771 + }, + { + "epoch": 2.853293783215919, + "grad_norm": 0.13481149653240174, + "learning_rate": 3.7982107387114396e-06, + "loss": 0.5606, + "step": 5772 + }, + { + "epoch": 2.853788159683599, + "grad_norm": 0.12956509926371407, + "learning_rate": 3.7951566043597055e-06, + "loss": 0.527, + "step": 5773 + }, + { + "epoch": 2.8542825361512794, + "grad_norm": 0.13528731963353477, + "learning_rate": 3.792103410811171e-06, + "loss": 0.5807, + "step": 5774 + }, + { + "epoch": 2.8547769126189593, + "grad_norm": 0.13745303236910933, + "learning_rate": 3.789051158528776e-06, + "loss": 0.5643, + "step": 5775 + }, + { + "epoch": 2.8552712890866396, + "grad_norm": 0.13327500967097242, + "learning_rate": 3.7859998479753134e-06, + "loss": 0.5561, + "step": 5776 + }, + { + "epoch": 2.8557656655543195, + "grad_norm": 0.1309542307941561, + "learning_rate": 3.7829494796134304e-06, + "loss": 0.5574, + "step": 5777 + }, + { + "epoch": 2.856260042022, + "grad_norm": 0.13304901056346333, + "learning_rate": 3.779900053905643e-06, + "loss": 0.5616, + "step": 5778 + }, + { + "epoch": 2.85675441848968, + "grad_norm": 0.14149651031062713, + "learning_rate": 3.7768515713143106e-06, + "loss": 0.5344, + "step": 5779 + }, + { + "epoch": 2.85724879495736, + "grad_norm": 0.13985814670336155, + "learning_rate": 3.77380403230166e-06, + "loss": 0.6262, + "step": 5780 + }, + { + "epoch": 2.85774317142504, + "grad_norm": 0.12891614479619892, + "learning_rate": 3.770757437329775e-06, + "loss": 0.5479, + "step": 5781 + }, + { + "epoch": 2.8582375478927204, + "grad_norm": 0.14044301657165037, + "learning_rate": 3.767711786860585e-06, + "loss": 0.5408, + "step": 5782 + }, + { + "epoch": 2.8587319243604004, + "grad_norm": 0.12738567833524492, + "learning_rate": 3.7646670813558915e-06, + "loss": 0.5719, + "step": 5783 + }, + { + "epoch": 2.8592263008280807, + "grad_norm": 0.13488224798580617, + "learning_rate": 3.76162332127734e-06, + "loss": 0.5484, + "step": 5784 + }, + { + "epoch": 2.8597206772957606, + "grad_norm": 0.13055276900472781, + "learning_rate": 3.758580507086432e-06, + "loss": 0.5472, + "step": 5785 + }, + { + "epoch": 2.860215053763441, + "grad_norm": 0.13615579247689547, + "learning_rate": 3.7555386392445447e-06, + "loss": 0.5489, + "step": 5786 + }, + { + "epoch": 2.860709430231121, + "grad_norm": 0.13376225305539954, + "learning_rate": 3.752497718212892e-06, + "loss": 0.5982, + "step": 5787 + }, + { + "epoch": 2.8612038066988013, + "grad_norm": 0.13424177998626943, + "learning_rate": 3.749457744452545e-06, + "loss": 0.5531, + "step": 5788 + }, + { + "epoch": 2.861698183166481, + "grad_norm": 0.13713723444604384, + "learning_rate": 3.746418718424445e-06, + "loss": 0.6135, + "step": 5789 + }, + { + "epoch": 2.8621925596341615, + "grad_norm": 0.13602880858267669, + "learning_rate": 3.7433806405893745e-06, + "loss": 0.5748, + "step": 5790 + }, + { + "epoch": 2.8626869361018414, + "grad_norm": 0.13154317394134063, + "learning_rate": 3.7403435114079823e-06, + "loss": 0.5865, + "step": 5791 + }, + { + "epoch": 2.863181312569522, + "grad_norm": 0.13835126668784603, + "learning_rate": 3.737307331340774e-06, + "loss": 0.584, + "step": 5792 + }, + { + "epoch": 2.8636756890372017, + "grad_norm": 0.13151865402803226, + "learning_rate": 3.734272100848103e-06, + "loss": 0.5929, + "step": 5793 + }, + { + "epoch": 2.864170065504882, + "grad_norm": 0.13466372243887417, + "learning_rate": 3.73123782039018e-06, + "loss": 0.5914, + "step": 5794 + }, + { + "epoch": 2.864664441972562, + "grad_norm": 0.13194157863424796, + "learning_rate": 3.728204490427079e-06, + "loss": 0.5835, + "step": 5795 + }, + { + "epoch": 2.8651588184402423, + "grad_norm": 0.1318907106736092, + "learning_rate": 3.7251721114187266e-06, + "loss": 0.552, + "step": 5796 + }, + { + "epoch": 2.8656531949079223, + "grad_norm": 0.14312486496211801, + "learning_rate": 3.7221406838249006e-06, + "loss": 0.5512, + "step": 5797 + }, + { + "epoch": 2.8661475713756026, + "grad_norm": 0.12968510031761799, + "learning_rate": 3.7191102081052433e-06, + "loss": 0.577, + "step": 5798 + }, + { + "epoch": 2.8666419478432825, + "grad_norm": 0.13399252633635256, + "learning_rate": 3.716080684719241e-06, + "loss": 0.5939, + "step": 5799 + }, + { + "epoch": 2.867136324310963, + "grad_norm": 0.13728884266478758, + "learning_rate": 3.713052114126249e-06, + "loss": 0.577, + "step": 5800 + }, + { + "epoch": 2.867630700778643, + "grad_norm": 0.13495274997218565, + "learning_rate": 3.710024496785464e-06, + "loss": 0.5702, + "step": 5801 + }, + { + "epoch": 2.868125077246323, + "grad_norm": 0.13558570193297348, + "learning_rate": 3.706997833155953e-06, + "loss": 0.5484, + "step": 5802 + }, + { + "epoch": 2.868619453714003, + "grad_norm": 0.12987078202091257, + "learning_rate": 3.7039721236966243e-06, + "loss": 0.5446, + "step": 5803 + }, + { + "epoch": 2.8691138301816834, + "grad_norm": 0.13773606900142135, + "learning_rate": 3.7009473688662533e-06, + "loss": 0.5715, + "step": 5804 + }, + { + "epoch": 2.8696082066493633, + "grad_norm": 0.14084403273442908, + "learning_rate": 3.6979235691234606e-06, + "loss": 0.5436, + "step": 5805 + }, + { + "epoch": 2.8701025831170437, + "grad_norm": 0.1368000307758393, + "learning_rate": 3.6949007249267286e-06, + "loss": 0.5807, + "step": 5806 + }, + { + "epoch": 2.8705969595847236, + "grad_norm": 0.14013880395136494, + "learning_rate": 3.6918788367343984e-06, + "loss": 0.5551, + "step": 5807 + }, + { + "epoch": 2.871091336052404, + "grad_norm": 0.12976981436808188, + "learning_rate": 3.6888579050046515e-06, + "loss": 0.582, + "step": 5808 + }, + { + "epoch": 2.8715857125200843, + "grad_norm": 0.1327645351290881, + "learning_rate": 3.6858379301955427e-06, + "loss": 0.5846, + "step": 5809 + }, + { + "epoch": 2.8720800889877642, + "grad_norm": 0.13820389101184719, + "learning_rate": 3.6828189127649683e-06, + "loss": 0.5427, + "step": 5810 + }, + { + "epoch": 2.872574465455444, + "grad_norm": 0.13858110932289372, + "learning_rate": 3.6798008531706796e-06, + "loss": 0.5587, + "step": 5811 + }, + { + "epoch": 2.8730688419231245, + "grad_norm": 0.13835855077105894, + "learning_rate": 3.676783751870291e-06, + "loss": 0.555, + "step": 5812 + }, + { + "epoch": 2.873563218390805, + "grad_norm": 0.13893549420710677, + "learning_rate": 3.6737676093212716e-06, + "loss": 0.5526, + "step": 5813 + }, + { + "epoch": 2.8740575948584848, + "grad_norm": 0.12941533454871593, + "learning_rate": 3.6707524259809334e-06, + "loss": 0.5223, + "step": 5814 + }, + { + "epoch": 2.8745519713261647, + "grad_norm": 0.12746064975439567, + "learning_rate": 3.6677382023064577e-06, + "loss": 0.5637, + "step": 5815 + }, + { + "epoch": 2.875046347793845, + "grad_norm": 0.13032560617389954, + "learning_rate": 3.66472493875487e-06, + "loss": 0.5624, + "step": 5816 + }, + { + "epoch": 2.8755407242615254, + "grad_norm": 0.12960947997099423, + "learning_rate": 3.6617126357830458e-06, + "loss": 0.5613, + "step": 5817 + }, + { + "epoch": 2.8760351007292053, + "grad_norm": 0.13735168483174395, + "learning_rate": 3.658701293847736e-06, + "loss": 0.5788, + "step": 5818 + }, + { + "epoch": 2.8765294771968852, + "grad_norm": 0.13471009453289204, + "learning_rate": 3.6556909134055276e-06, + "loss": 0.582, + "step": 5819 + }, + { + "epoch": 2.8770238536645656, + "grad_norm": 0.13015221544446487, + "learning_rate": 3.65268149491286e-06, + "loss": 0.5335, + "step": 5820 + }, + { + "epoch": 2.877518230132246, + "grad_norm": 0.1356372861416037, + "learning_rate": 3.649673038826043e-06, + "loss": 0.5653, + "step": 5821 + }, + { + "epoch": 2.878012606599926, + "grad_norm": 0.1344114306484756, + "learning_rate": 3.646665545601221e-06, + "loss": 0.5749, + "step": 5822 + }, + { + "epoch": 2.8785069830676058, + "grad_norm": 0.13979533934267324, + "learning_rate": 3.6436590156944087e-06, + "loss": 0.556, + "step": 5823 + }, + { + "epoch": 2.879001359535286, + "grad_norm": 0.1326189127248368, + "learning_rate": 3.64065344956147e-06, + "loss": 0.5572, + "step": 5824 + }, + { + "epoch": 2.8794957360029665, + "grad_norm": 0.13466374298206166, + "learning_rate": 3.637648847658113e-06, + "loss": 0.5689, + "step": 5825 + }, + { + "epoch": 2.8799901124706464, + "grad_norm": 0.1378929884178871, + "learning_rate": 3.6346452104399165e-06, + "loss": 0.5771, + "step": 5826 + }, + { + "epoch": 2.8804844889383263, + "grad_norm": 0.13150265308969064, + "learning_rate": 3.631642538362299e-06, + "loss": 0.5317, + "step": 5827 + }, + { + "epoch": 2.8809788654060067, + "grad_norm": 0.14200455594520464, + "learning_rate": 3.6286408318805342e-06, + "loss": 0.54, + "step": 5828 + }, + { + "epoch": 2.881473241873687, + "grad_norm": 0.13220496665351575, + "learning_rate": 3.625640091449758e-06, + "loss": 0.5702, + "step": 5829 + }, + { + "epoch": 2.881967618341367, + "grad_norm": 0.12933444325076138, + "learning_rate": 3.622640317524957e-06, + "loss": 0.5298, + "step": 5830 + }, + { + "epoch": 2.882461994809047, + "grad_norm": 0.1326465295512727, + "learning_rate": 3.6196415105609616e-06, + "loss": 0.5338, + "step": 5831 + }, + { + "epoch": 2.882956371276727, + "grad_norm": 0.13560969555509716, + "learning_rate": 3.616643671012471e-06, + "loss": 0.555, + "step": 5832 + }, + { + "epoch": 2.8834507477444076, + "grad_norm": 0.14108985033925883, + "learning_rate": 3.613646799334024e-06, + "loss": 0.5776, + "step": 5833 + }, + { + "epoch": 2.8839451242120875, + "grad_norm": 0.1342067417141375, + "learning_rate": 3.6106508959800136e-06, + "loss": 0.5958, + "step": 5834 + }, + { + "epoch": 2.8844395006797674, + "grad_norm": 0.1286931160902919, + "learning_rate": 3.6076559614047035e-06, + "loss": 0.5644, + "step": 5835 + }, + { + "epoch": 2.8849338771474478, + "grad_norm": 0.13526763472129813, + "learning_rate": 3.604661996062191e-06, + "loss": 0.5525, + "step": 5836 + }, + { + "epoch": 2.885428253615128, + "grad_norm": 0.1378048107561889, + "learning_rate": 3.6016690004064305e-06, + "loss": 0.5528, + "step": 5837 + }, + { + "epoch": 2.885922630082808, + "grad_norm": 0.14112931955825234, + "learning_rate": 3.5986769748912363e-06, + "loss": 0.5866, + "step": 5838 + }, + { + "epoch": 2.886417006550488, + "grad_norm": 0.13923098707190248, + "learning_rate": 3.5956859199702678e-06, + "loss": 0.5605, + "step": 5839 + }, + { + "epoch": 2.8869113830181683, + "grad_norm": 0.13117811728056172, + "learning_rate": 3.592695836097041e-06, + "loss": 0.5805, + "step": 5840 + }, + { + "epoch": 2.8874057594858487, + "grad_norm": 0.14331849811060074, + "learning_rate": 3.5897067237249307e-06, + "loss": 0.5429, + "step": 5841 + }, + { + "epoch": 2.8879001359535286, + "grad_norm": 0.14141423618370544, + "learning_rate": 3.586718583307153e-06, + "loss": 0.5713, + "step": 5842 + }, + { + "epoch": 2.8883945124212085, + "grad_norm": 0.1334975039566118, + "learning_rate": 3.5837314152967773e-06, + "loss": 0.5428, + "step": 5843 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.13261573622207704, + "learning_rate": 3.5807452201467387e-06, + "loss": 0.5617, + "step": 5844 + }, + { + "epoch": 2.889383265356569, + "grad_norm": 0.13521480876371522, + "learning_rate": 3.577759998309809e-06, + "loss": 0.508, + "step": 5845 + }, + { + "epoch": 2.889877641824249, + "grad_norm": 0.1340833139880582, + "learning_rate": 3.5747757502386214e-06, + "loss": 0.5702, + "step": 5846 + }, + { + "epoch": 2.8903720182919295, + "grad_norm": 0.13180974951923002, + "learning_rate": 3.5717924763856648e-06, + "loss": 0.5593, + "step": 5847 + }, + { + "epoch": 2.8908663947596094, + "grad_norm": 0.1371721656990677, + "learning_rate": 3.568810177203268e-06, + "loss": 0.5955, + "step": 5848 + }, + { + "epoch": 2.8913607712272897, + "grad_norm": 0.1438052908924308, + "learning_rate": 3.5658288531436248e-06, + "loss": 0.5618, + "step": 5849 + }, + { + "epoch": 2.8918551476949697, + "grad_norm": 0.13293942881936446, + "learning_rate": 3.562848504658769e-06, + "loss": 0.5662, + "step": 5850 + }, + { + "epoch": 2.89234952416265, + "grad_norm": 0.13471161032723275, + "learning_rate": 3.5598691322006005e-06, + "loss": 0.5627, + "step": 5851 + }, + { + "epoch": 2.89284390063033, + "grad_norm": 0.1364618641766149, + "learning_rate": 3.556890736220857e-06, + "loss": 0.5502, + "step": 5852 + }, + { + "epoch": 2.8933382770980103, + "grad_norm": 0.1376573449708964, + "learning_rate": 3.5539133171711416e-06, + "loss": 0.5538, + "step": 5853 + }, + { + "epoch": 2.89383265356569, + "grad_norm": 0.13450766483849577, + "learning_rate": 3.550936875502894e-06, + "loss": 0.6058, + "step": 5854 + }, + { + "epoch": 2.8943270300333706, + "grad_norm": 0.13592062026751736, + "learning_rate": 3.547961411667423e-06, + "loss": 0.566, + "step": 5855 + }, + { + "epoch": 2.8948214065010505, + "grad_norm": 0.1652847980321432, + "learning_rate": 3.544986926115872e-06, + "loss": 0.566, + "step": 5856 + }, + { + "epoch": 2.895315782968731, + "grad_norm": 0.1369526372974898, + "learning_rate": 3.5420134192992493e-06, + "loss": 0.5644, + "step": 5857 + }, + { + "epoch": 2.8958101594364107, + "grad_norm": 0.131795595634477, + "learning_rate": 3.539040891668413e-06, + "loss": 0.5724, + "step": 5858 + }, + { + "epoch": 2.896304535904091, + "grad_norm": 0.13272487278748113, + "learning_rate": 3.5360693436740656e-06, + "loss": 0.5513, + "step": 5859 + }, + { + "epoch": 2.896798912371771, + "grad_norm": 0.13504117097004656, + "learning_rate": 3.5330987757667613e-06, + "loss": 0.5517, + "step": 5860 + }, + { + "epoch": 2.8972932888394514, + "grad_norm": 0.13987771320135006, + "learning_rate": 3.5301291883969136e-06, + "loss": 0.5766, + "step": 5861 + }, + { + "epoch": 2.8977876653071313, + "grad_norm": 0.1471709419961036, + "learning_rate": 3.527160582014787e-06, + "loss": 0.6099, + "step": 5862 + }, + { + "epoch": 2.8982820417748116, + "grad_norm": 0.1372345186808009, + "learning_rate": 3.524192957070487e-06, + "loss": 0.552, + "step": 5863 + }, + { + "epoch": 2.8987764182424915, + "grad_norm": 0.13223753250323217, + "learning_rate": 3.5212263140139813e-06, + "loss": 0.5461, + "step": 5864 + }, + { + "epoch": 2.899270794710172, + "grad_norm": 0.1386671405753166, + "learning_rate": 3.5182606532950836e-06, + "loss": 0.5748, + "step": 5865 + }, + { + "epoch": 2.899765171177852, + "grad_norm": 0.1306080162882018, + "learning_rate": 3.515295975363454e-06, + "loss": 0.5591, + "step": 5866 + }, + { + "epoch": 2.900259547645532, + "grad_norm": 0.1271762433873979, + "learning_rate": 3.5123322806686135e-06, + "loss": 0.5356, + "step": 5867 + }, + { + "epoch": 2.900753924113212, + "grad_norm": 0.13286113678775432, + "learning_rate": 3.5093695696599304e-06, + "loss": 0.609, + "step": 5868 + }, + { + "epoch": 2.9012483005808924, + "grad_norm": 0.1341770595335867, + "learning_rate": 3.506407842786619e-06, + "loss": 0.5607, + "step": 5869 + }, + { + "epoch": 2.9017426770485724, + "grad_norm": 0.13205253900048433, + "learning_rate": 3.5034471004977534e-06, + "loss": 0.5546, + "step": 5870 + }, + { + "epoch": 2.9022370535162527, + "grad_norm": 0.13662818344831704, + "learning_rate": 3.500487343242247e-06, + "loss": 0.5842, + "step": 5871 + }, + { + "epoch": 2.9027314299839326, + "grad_norm": 0.12930147901904526, + "learning_rate": 3.4975285714688734e-06, + "loss": 0.5702, + "step": 5872 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.1399718138096684, + "learning_rate": 3.4945707856262557e-06, + "loss": 0.5656, + "step": 5873 + }, + { + "epoch": 2.903720182919293, + "grad_norm": 0.13594162424384415, + "learning_rate": 3.4916139861628593e-06, + "loss": 0.5474, + "step": 5874 + }, + { + "epoch": 2.9042145593869733, + "grad_norm": 0.12919809183243222, + "learning_rate": 3.4886581735270133e-06, + "loss": 0.5833, + "step": 5875 + }, + { + "epoch": 2.904708935854653, + "grad_norm": 0.13105926379218968, + "learning_rate": 3.4857033481668856e-06, + "loss": 0.5079, + "step": 5876 + }, + { + "epoch": 2.9052033123223335, + "grad_norm": 0.13231526074606761, + "learning_rate": 3.4827495105304967e-06, + "loss": 0.558, + "step": 5877 + }, + { + "epoch": 2.9056976887900134, + "grad_norm": 0.1401401044694206, + "learning_rate": 3.4797966610657198e-06, + "loss": 0.5753, + "step": 5878 + }, + { + "epoch": 2.906192065257694, + "grad_norm": 0.13803984580247186, + "learning_rate": 3.476844800220284e-06, + "loss": 0.5344, + "step": 5879 + }, + { + "epoch": 2.9066864417253737, + "grad_norm": 0.13072114262838463, + "learning_rate": 3.473893928441754e-06, + "loss": 0.5644, + "step": 5880 + }, + { + "epoch": 2.907180818193054, + "grad_norm": 0.13419837575167065, + "learning_rate": 3.47094404617756e-06, + "loss": 0.5476, + "step": 5881 + }, + { + "epoch": 2.907675194660734, + "grad_norm": 0.13578576805043233, + "learning_rate": 3.4679951538749712e-06, + "loss": 0.573, + "step": 5882 + }, + { + "epoch": 2.9081695711284143, + "grad_norm": 0.1295502697190896, + "learning_rate": 3.465047251981104e-06, + "loss": 0.5614, + "step": 5883 + }, + { + "epoch": 2.9086639475960947, + "grad_norm": 0.12809910308408906, + "learning_rate": 3.4621003409429453e-06, + "loss": 0.5521, + "step": 5884 + }, + { + "epoch": 2.9091583240637746, + "grad_norm": 0.13264792624148217, + "learning_rate": 3.459154421207309e-06, + "loss": 0.5839, + "step": 5885 + }, + { + "epoch": 2.9096527005314545, + "grad_norm": 0.13223808014976185, + "learning_rate": 3.456209493220867e-06, + "loss": 0.5771, + "step": 5886 + }, + { + "epoch": 2.910147076999135, + "grad_norm": 0.1285713055752616, + "learning_rate": 3.4532655574301444e-06, + "loss": 0.5626, + "step": 5887 + }, + { + "epoch": 2.9106414534668152, + "grad_norm": 0.13617545010050186, + "learning_rate": 3.450322614281507e-06, + "loss": 0.6156, + "step": 5888 + }, + { + "epoch": 2.911135829934495, + "grad_norm": 0.1454615936883806, + "learning_rate": 3.4473806642211793e-06, + "loss": 0.5397, + "step": 5889 + }, + { + "epoch": 2.911630206402175, + "grad_norm": 0.158420425550027, + "learning_rate": 3.444439707695235e-06, + "loss": 0.5824, + "step": 5890 + }, + { + "epoch": 2.9121245828698554, + "grad_norm": 0.13549724499018487, + "learning_rate": 3.44149974514959e-06, + "loss": 0.5633, + "step": 5891 + }, + { + "epoch": 2.912618959337536, + "grad_norm": 0.1334967032551185, + "learning_rate": 3.43856077703001e-06, + "loss": 0.564, + "step": 5892 + }, + { + "epoch": 2.9131133358052157, + "grad_norm": 0.12952882359388257, + "learning_rate": 3.4356228037821206e-06, + "loss": 0.5852, + "step": 5893 + }, + { + "epoch": 2.9136077122728956, + "grad_norm": 0.13455066788954226, + "learning_rate": 3.4326858258513807e-06, + "loss": 0.5614, + "step": 5894 + }, + { + "epoch": 2.914102088740576, + "grad_norm": 0.13193732303867411, + "learning_rate": 3.4297498436831113e-06, + "loss": 0.5588, + "step": 5895 + }, + { + "epoch": 2.9145964652082563, + "grad_norm": 0.13185954139902412, + "learning_rate": 3.42681485772248e-06, + "loss": 0.5875, + "step": 5896 + }, + { + "epoch": 2.9150908416759362, + "grad_norm": 0.13403420766433072, + "learning_rate": 3.4238808684144964e-06, + "loss": 0.5564, + "step": 5897 + }, + { + "epoch": 2.915585218143616, + "grad_norm": 0.13202066108187704, + "learning_rate": 3.4209478762040284e-06, + "loss": 0.615, + "step": 5898 + }, + { + "epoch": 2.9160795946112965, + "grad_norm": 0.1381206524632094, + "learning_rate": 3.418015881535781e-06, + "loss": 0.5292, + "step": 5899 + }, + { + "epoch": 2.916573971078977, + "grad_norm": 0.13223038903814288, + "learning_rate": 3.4150848848543208e-06, + "loss": 0.5619, + "step": 5900 + }, + { + "epoch": 2.917068347546657, + "grad_norm": 0.13786378424986137, + "learning_rate": 3.4121548866040587e-06, + "loss": 0.579, + "step": 5901 + }, + { + "epoch": 2.9175627240143367, + "grad_norm": 0.1346308354130178, + "learning_rate": 3.4092258872292494e-06, + "loss": 0.5549, + "step": 5902 + }, + { + "epoch": 2.918057100482017, + "grad_norm": 0.13089513065253042, + "learning_rate": 3.406297887173997e-06, + "loss": 0.5622, + "step": 5903 + }, + { + "epoch": 2.9185514769496974, + "grad_norm": 0.1292522975093021, + "learning_rate": 3.4033708868822635e-06, + "loss": 0.5569, + "step": 5904 + }, + { + "epoch": 2.9190458534173773, + "grad_norm": 0.13396883722202263, + "learning_rate": 3.4004448867978445e-06, + "loss": 0.5749, + "step": 5905 + }, + { + "epoch": 2.9195402298850572, + "grad_norm": 0.13529934759778517, + "learning_rate": 3.3975198873643964e-06, + "loss": 0.5495, + "step": 5906 + }, + { + "epoch": 2.9200346063527376, + "grad_norm": 0.13210160336109295, + "learning_rate": 3.3945958890254215e-06, + "loss": 0.5906, + "step": 5907 + }, + { + "epoch": 2.920528982820418, + "grad_norm": 0.1353683894835274, + "learning_rate": 3.391672892224266e-06, + "loss": 0.5926, + "step": 5908 + }, + { + "epoch": 2.921023359288098, + "grad_norm": 0.13412432649085712, + "learning_rate": 3.3887508974041217e-06, + "loss": 0.5743, + "step": 5909 + }, + { + "epoch": 2.921517735755778, + "grad_norm": 0.14620490857374063, + "learning_rate": 3.3858299050080377e-06, + "loss": 0.5912, + "step": 5910 + }, + { + "epoch": 2.922012112223458, + "grad_norm": 0.13437863610549833, + "learning_rate": 3.382909915478909e-06, + "loss": 0.589, + "step": 5911 + }, + { + "epoch": 2.9225064886911385, + "grad_norm": 0.13560886882150844, + "learning_rate": 3.37999092925947e-06, + "loss": 0.5667, + "step": 5912 + }, + { + "epoch": 2.9230008651588184, + "grad_norm": 0.13512565512023475, + "learning_rate": 3.3770729467923156e-06, + "loss": 0.5765, + "step": 5913 + }, + { + "epoch": 2.9234952416264983, + "grad_norm": 0.13387376223033082, + "learning_rate": 3.3741559685198798e-06, + "loss": 0.5817, + "step": 5914 + }, + { + "epoch": 2.9239896180941787, + "grad_norm": 0.13774479660192387, + "learning_rate": 3.371239994884441e-06, + "loss": 0.5605, + "step": 5915 + }, + { + "epoch": 2.924483994561859, + "grad_norm": 0.13603121956805334, + "learning_rate": 3.3683250263281354e-06, + "loss": 0.5486, + "step": 5916 + }, + { + "epoch": 2.924978371029539, + "grad_norm": 0.13288840216277556, + "learning_rate": 3.365411063292945e-06, + "loss": 0.5312, + "step": 5917 + }, + { + "epoch": 2.925472747497219, + "grad_norm": 0.12892840780328224, + "learning_rate": 3.3624981062206907e-06, + "loss": 0.5033, + "step": 5918 + }, + { + "epoch": 2.925967123964899, + "grad_norm": 0.13043547086873203, + "learning_rate": 3.359586155553053e-06, + "loss": 0.5663, + "step": 5919 + }, + { + "epoch": 2.9264615004325796, + "grad_norm": 0.13172961509850778, + "learning_rate": 3.356675211731546e-06, + "loss": 0.5539, + "step": 5920 + }, + { + "epoch": 2.9269558769002595, + "grad_norm": 0.13063532492940155, + "learning_rate": 3.3537652751975424e-06, + "loss": 0.5444, + "step": 5921 + }, + { + "epoch": 2.92745025336794, + "grad_norm": 0.13181192526748844, + "learning_rate": 3.350856346392263e-06, + "loss": 0.557, + "step": 5922 + }, + { + "epoch": 2.9279446298356198, + "grad_norm": 0.1342691691906216, + "learning_rate": 3.347948425756764e-06, + "loss": 0.5649, + "step": 5923 + }, + { + "epoch": 2.9284390063033, + "grad_norm": 0.12787003032086525, + "learning_rate": 3.3450415137319613e-06, + "loss": 0.5519, + "step": 5924 + }, + { + "epoch": 2.92893338277098, + "grad_norm": 0.13979184125192928, + "learning_rate": 3.34213561075861e-06, + "loss": 0.6069, + "step": 5925 + }, + { + "epoch": 2.9294277592386604, + "grad_norm": 0.1392653940682098, + "learning_rate": 3.339230717277313e-06, + "loss": 0.5626, + "step": 5926 + }, + { + "epoch": 2.9299221357063403, + "grad_norm": 0.13051691266516632, + "learning_rate": 3.3363268337285224e-06, + "loss": 0.5632, + "step": 5927 + }, + { + "epoch": 2.9304165121740207, + "grad_norm": 0.13074284761614582, + "learning_rate": 3.333423960552542e-06, + "loss": 0.5709, + "step": 5928 + }, + { + "epoch": 2.9309108886417006, + "grad_norm": 0.13623785526242832, + "learning_rate": 3.3305220981895105e-06, + "loss": 0.5788, + "step": 5929 + }, + { + "epoch": 2.931405265109381, + "grad_norm": 0.13590980798081478, + "learning_rate": 3.3276212470794244e-06, + "loss": 0.577, + "step": 5930 + }, + { + "epoch": 2.931899641577061, + "grad_norm": 0.13378858491909137, + "learning_rate": 3.3247214076621214e-06, + "loss": 0.5506, + "step": 5931 + }, + { + "epoch": 2.932394018044741, + "grad_norm": 0.12943241891075036, + "learning_rate": 3.3218225803772798e-06, + "loss": 0.5582, + "step": 5932 + }, + { + "epoch": 2.932888394512421, + "grad_norm": 0.1388937673360239, + "learning_rate": 3.318924765664443e-06, + "loss": 0.6414, + "step": 5933 + }, + { + "epoch": 2.9333827709801015, + "grad_norm": 0.13905712544247462, + "learning_rate": 3.3160279639629833e-06, + "loss": 0.5266, + "step": 5934 + }, + { + "epoch": 2.9338771474477814, + "grad_norm": 0.1339912655570973, + "learning_rate": 3.313132175712124e-06, + "loss": 0.5306, + "step": 5935 + }, + { + "epoch": 2.9343715239154617, + "grad_norm": 0.13968822555712684, + "learning_rate": 3.31023740135094e-06, + "loss": 0.628, + "step": 5936 + }, + { + "epoch": 2.9348659003831417, + "grad_norm": 0.14121398013300865, + "learning_rate": 3.3073436413183437e-06, + "loss": 0.5548, + "step": 5937 + }, + { + "epoch": 2.935360276850822, + "grad_norm": 0.13100487592142454, + "learning_rate": 3.304450896053101e-06, + "loss": 0.5872, + "step": 5938 + }, + { + "epoch": 2.935854653318502, + "grad_norm": 0.1299827094001823, + "learning_rate": 3.301559165993825e-06, + "loss": 0.5232, + "step": 5939 + }, + { + "epoch": 2.9363490297861823, + "grad_norm": 0.13923860342527125, + "learning_rate": 3.298668451578969e-06, + "loss": 0.5517, + "step": 5940 + }, + { + "epoch": 2.936843406253862, + "grad_norm": 0.13429101103157426, + "learning_rate": 3.29577875324683e-06, + "loss": 0.5764, + "step": 5941 + }, + { + "epoch": 2.9373377827215426, + "grad_norm": 0.13699077275919028, + "learning_rate": 3.292890071435563e-06, + "loss": 0.566, + "step": 5942 + }, + { + "epoch": 2.9378321591892225, + "grad_norm": 0.13378073738645258, + "learning_rate": 3.290002406583155e-06, + "loss": 0.5914, + "step": 5943 + }, + { + "epoch": 2.938326535656903, + "grad_norm": 0.13595157154553486, + "learning_rate": 3.2871157591274483e-06, + "loss": 0.5512, + "step": 5944 + }, + { + "epoch": 2.9388209121245827, + "grad_norm": 0.13235067367243916, + "learning_rate": 3.2842301295061307e-06, + "loss": 0.598, + "step": 5945 + }, + { + "epoch": 2.939315288592263, + "grad_norm": 0.13318001284671466, + "learning_rate": 3.2813455181567278e-06, + "loss": 0.5582, + "step": 5946 + }, + { + "epoch": 2.939809665059943, + "grad_norm": 0.13653320143197295, + "learning_rate": 3.278461925516622e-06, + "loss": 0.5676, + "step": 5947 + }, + { + "epoch": 2.9403040415276234, + "grad_norm": 0.1302485369493429, + "learning_rate": 3.2755793520230305e-06, + "loss": 0.5431, + "step": 5948 + }, + { + "epoch": 2.9407984179953033, + "grad_norm": 0.13233583889422348, + "learning_rate": 3.272697798113016e-06, + "loss": 0.5711, + "step": 5949 + }, + { + "epoch": 2.9412927944629836, + "grad_norm": 0.1340623943632651, + "learning_rate": 3.2698172642235027e-06, + "loss": 0.5466, + "step": 5950 + }, + { + "epoch": 2.9417871709306636, + "grad_norm": 0.12765027425149597, + "learning_rate": 3.2669377507912435e-06, + "loss": 0.5616, + "step": 5951 + }, + { + "epoch": 2.942281547398344, + "grad_norm": 0.13144080410485431, + "learning_rate": 3.2640592582528372e-06, + "loss": 0.566, + "step": 5952 + }, + { + "epoch": 2.942775923866024, + "grad_norm": 0.1399055642684579, + "learning_rate": 3.2611817870447406e-06, + "loss": 0.5493, + "step": 5953 + }, + { + "epoch": 2.943270300333704, + "grad_norm": 0.1354598084335108, + "learning_rate": 3.258305337603239e-06, + "loss": 0.5543, + "step": 5954 + }, + { + "epoch": 2.943764676801384, + "grad_norm": 0.13386869685667957, + "learning_rate": 3.255429910364475e-06, + "loss": 0.531, + "step": 5955 + }, + { + "epoch": 2.9442590532690645, + "grad_norm": 0.13180067014451355, + "learning_rate": 3.2525555057644365e-06, + "loss": 0.5639, + "step": 5956 + }, + { + "epoch": 2.9447534297367444, + "grad_norm": 0.13424928154520574, + "learning_rate": 3.2496821242389488e-06, + "loss": 0.5637, + "step": 5957 + }, + { + "epoch": 2.9452478062044247, + "grad_norm": 0.13563846081660613, + "learning_rate": 3.246809766223682e-06, + "loss": 0.5635, + "step": 5958 + }, + { + "epoch": 2.945742182672105, + "grad_norm": 0.1324702089439966, + "learning_rate": 3.2439384321541567e-06, + "loss": 0.5618, + "step": 5959 + }, + { + "epoch": 2.946236559139785, + "grad_norm": 0.13720495427338733, + "learning_rate": 3.2410681224657415e-06, + "loss": 0.5751, + "step": 5960 + }, + { + "epoch": 2.946730935607465, + "grad_norm": 0.1326255212002794, + "learning_rate": 3.238198837593636e-06, + "loss": 0.5554, + "step": 5961 + }, + { + "epoch": 2.9472253120751453, + "grad_norm": 0.14048240462976333, + "learning_rate": 3.2353305779728983e-06, + "loss": 0.5598, + "step": 5962 + }, + { + "epoch": 2.9477196885428256, + "grad_norm": 0.13477726800010204, + "learning_rate": 3.2324633440384222e-06, + "loss": 0.5646, + "step": 5963 + }, + { + "epoch": 2.9482140650105055, + "grad_norm": 0.13464720434033087, + "learning_rate": 3.229597136224952e-06, + "loss": 0.5666, + "step": 5964 + }, + { + "epoch": 2.9487084414781854, + "grad_norm": 0.13585191944980576, + "learning_rate": 3.2267319549670707e-06, + "loss": 0.5665, + "step": 5965 + }, + { + "epoch": 2.949202817945866, + "grad_norm": 0.14410368013687397, + "learning_rate": 3.223867800699213e-06, + "loss": 0.5893, + "step": 5966 + }, + { + "epoch": 2.949697194413546, + "grad_norm": 0.1373299490663797, + "learning_rate": 3.2210046738556465e-06, + "loss": 0.5513, + "step": 5967 + }, + { + "epoch": 2.950191570881226, + "grad_norm": 0.1385558796553511, + "learning_rate": 3.2181425748704977e-06, + "loss": 0.5733, + "step": 5968 + }, + { + "epoch": 2.950685947348906, + "grad_norm": 0.1353304762957497, + "learning_rate": 3.2152815041777217e-06, + "loss": 0.5916, + "step": 5969 + }, + { + "epoch": 2.9511803238165863, + "grad_norm": 0.13686860645642368, + "learning_rate": 3.2124214622111294e-06, + "loss": 0.561, + "step": 5970 + }, + { + "epoch": 2.9516747002842667, + "grad_norm": 0.13722368909397414, + "learning_rate": 3.2095624494043763e-06, + "loss": 0.5672, + "step": 5971 + }, + { + "epoch": 2.9521690767519466, + "grad_norm": 0.13153739130712058, + "learning_rate": 3.2067044661909484e-06, + "loss": 0.5646, + "step": 5972 + }, + { + "epoch": 2.9526634532196265, + "grad_norm": 0.13287945430271858, + "learning_rate": 3.2038475130041937e-06, + "loss": 0.5622, + "step": 5973 + }, + { + "epoch": 2.953157829687307, + "grad_norm": 0.13477549454945653, + "learning_rate": 3.200991590277289e-06, + "loss": 0.5292, + "step": 5974 + }, + { + "epoch": 2.9536522061549872, + "grad_norm": 0.13782417757052248, + "learning_rate": 3.1981366984432594e-06, + "loss": 0.5559, + "step": 5975 + }, + { + "epoch": 2.954146582622667, + "grad_norm": 0.1350007186668212, + "learning_rate": 3.1952828379349774e-06, + "loss": 0.5698, + "step": 5976 + }, + { + "epoch": 2.954640959090347, + "grad_norm": 0.14148554527020646, + "learning_rate": 3.192430009185161e-06, + "loss": 0.5626, + "step": 5977 + }, + { + "epoch": 2.9551353355580274, + "grad_norm": 0.13415021221903664, + "learning_rate": 3.1895782126263598e-06, + "loss": 0.5731, + "step": 5978 + }, + { + "epoch": 2.955629712025708, + "grad_norm": 0.1384039278894043, + "learning_rate": 3.1867274486909828e-06, + "loss": 0.5934, + "step": 5979 + }, + { + "epoch": 2.9561240884933877, + "grad_norm": 0.13074648277771492, + "learning_rate": 3.183877717811268e-06, + "loss": 0.5282, + "step": 5980 + }, + { + "epoch": 2.9566184649610676, + "grad_norm": 0.13243625757523894, + "learning_rate": 3.1810290204192995e-06, + "loss": 0.5617, + "step": 5981 + }, + { + "epoch": 2.957112841428748, + "grad_norm": 0.13586022715135784, + "learning_rate": 3.178181356947019e-06, + "loss": 0.5433, + "step": 5982 + }, + { + "epoch": 2.9576072178964283, + "grad_norm": 0.138286306183284, + "learning_rate": 3.1753347278261957e-06, + "loss": 0.5576, + "step": 5983 + }, + { + "epoch": 2.9581015943641082, + "grad_norm": 0.13241281932728247, + "learning_rate": 3.1724891334884432e-06, + "loss": 0.5281, + "step": 5984 + }, + { + "epoch": 2.958595970831788, + "grad_norm": 0.13450429879597525, + "learning_rate": 3.169644574365228e-06, + "loss": 0.549, + "step": 5985 + }, + { + "epoch": 2.9590903472994685, + "grad_norm": 0.12856165472740325, + "learning_rate": 3.166801050887849e-06, + "loss": 0.5224, + "step": 5986 + }, + { + "epoch": 2.959584723767149, + "grad_norm": 0.13320570501573825, + "learning_rate": 3.1639585634874525e-06, + "loss": 0.5715, + "step": 5987 + }, + { + "epoch": 2.960079100234829, + "grad_norm": 0.138215402809899, + "learning_rate": 3.1611171125950325e-06, + "loss": 0.5351, + "step": 5988 + }, + { + "epoch": 2.9605734767025087, + "grad_norm": 0.1281513249274144, + "learning_rate": 3.158276698641416e-06, + "loss": 0.5476, + "step": 5989 + }, + { + "epoch": 2.961067853170189, + "grad_norm": 0.13261108478877653, + "learning_rate": 3.155437322057283e-06, + "loss": 0.5419, + "step": 5990 + }, + { + "epoch": 2.9615622296378694, + "grad_norm": 0.13193768863146513, + "learning_rate": 3.1525989832731486e-06, + "loss": 0.5726, + "step": 5991 + }, + { + "epoch": 2.9620566061055493, + "grad_norm": 0.13686232102234303, + "learning_rate": 3.149761682719369e-06, + "loss": 0.5764, + "step": 5992 + }, + { + "epoch": 2.9625509825732292, + "grad_norm": 0.13585575028525945, + "learning_rate": 3.1469254208261512e-06, + "loss": 0.5749, + "step": 5993 + }, + { + "epoch": 2.9630453590409096, + "grad_norm": 0.152040340559372, + "learning_rate": 3.144090198023544e-06, + "loss": 0.5471, + "step": 5994 + }, + { + "epoch": 2.96353973550859, + "grad_norm": 0.13573478322522955, + "learning_rate": 3.141256014741427e-06, + "loss": 0.563, + "step": 5995 + }, + { + "epoch": 2.96403411197627, + "grad_norm": 0.1295256066099502, + "learning_rate": 3.1384228714095387e-06, + "loss": 0.5092, + "step": 5996 + }, + { + "epoch": 2.9645284884439502, + "grad_norm": 0.13502583686839045, + "learning_rate": 3.1355907684574483e-06, + "loss": 0.563, + "step": 5997 + }, + { + "epoch": 2.96502286491163, + "grad_norm": 0.13384713927257263, + "learning_rate": 3.132759706314563e-06, + "loss": 0.5555, + "step": 5998 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 0.1369356417406386, + "learning_rate": 3.1299296854101536e-06, + "loss": 0.5966, + "step": 5999 + }, + { + "epoch": 2.9660116178469904, + "grad_norm": 0.12783290423686747, + "learning_rate": 3.1271007061733126e-06, + "loss": 0.5726, + "step": 6000 + }, + { + "epoch": 2.9665059943146708, + "grad_norm": 0.14016284581065194, + "learning_rate": 3.1242727690329776e-06, + "loss": 0.6028, + "step": 6001 + }, + { + "epoch": 2.9670003707823507, + "grad_norm": 0.13190332594850815, + "learning_rate": 3.121445874417939e-06, + "loss": 0.5378, + "step": 6002 + }, + { + "epoch": 2.967494747250031, + "grad_norm": 0.13174419254037179, + "learning_rate": 3.1186200227568143e-06, + "loss": 0.5868, + "step": 6003 + }, + { + "epoch": 2.967989123717711, + "grad_norm": 0.13848828100425034, + "learning_rate": 3.1157952144780744e-06, + "loss": 0.5685, + "step": 6004 + }, + { + "epoch": 2.9684835001853913, + "grad_norm": 0.13391081377081443, + "learning_rate": 3.1129714500100306e-06, + "loss": 0.5469, + "step": 6005 + }, + { + "epoch": 2.9689778766530712, + "grad_norm": 0.1309275293293014, + "learning_rate": 3.1101487297808307e-06, + "loss": 0.5554, + "step": 6006 + }, + { + "epoch": 2.9694722531207516, + "grad_norm": 0.13311395235665185, + "learning_rate": 3.107327054218464e-06, + "loss": 0.541, + "step": 6007 + }, + { + "epoch": 2.9699666295884315, + "grad_norm": 0.13545057012202202, + "learning_rate": 3.1045064237507704e-06, + "loss": 0.5789, + "step": 6008 + }, + { + "epoch": 2.970461006056112, + "grad_norm": 0.1320041998484565, + "learning_rate": 3.101686838805419e-06, + "loss": 0.5342, + "step": 6009 + }, + { + "epoch": 2.9709553825237918, + "grad_norm": 0.1338595079615825, + "learning_rate": 3.0988682998099282e-06, + "loss": 0.5597, + "step": 6010 + }, + { + "epoch": 2.971449758991472, + "grad_norm": 0.13564651164563576, + "learning_rate": 3.096050807191662e-06, + "loss": 0.5618, + "step": 6011 + }, + { + "epoch": 2.971944135459152, + "grad_norm": 0.1374641060498601, + "learning_rate": 3.0932343613778105e-06, + "loss": 0.5921, + "step": 6012 + }, + { + "epoch": 2.9724385119268324, + "grad_norm": 0.1361422583959906, + "learning_rate": 3.090418962795424e-06, + "loss": 0.5705, + "step": 6013 + }, + { + "epoch": 2.9729328883945123, + "grad_norm": 0.1282512791252018, + "learning_rate": 3.0876046118713756e-06, + "loss": 0.5413, + "step": 6014 + }, + { + "epoch": 2.9734272648621927, + "grad_norm": 0.1381066952702608, + "learning_rate": 3.0847913090323954e-06, + "loss": 0.5375, + "step": 6015 + }, + { + "epoch": 2.9739216413298726, + "grad_norm": 0.13252942818286675, + "learning_rate": 3.081979054705042e-06, + "loss": 0.5441, + "step": 6016 + }, + { + "epoch": 2.974416017797553, + "grad_norm": 0.12891233415621553, + "learning_rate": 3.079167849315727e-06, + "loss": 0.5278, + "step": 6017 + }, + { + "epoch": 2.974910394265233, + "grad_norm": 0.13924996495372013, + "learning_rate": 3.0763576932906903e-06, + "loss": 0.5993, + "step": 6018 + }, + { + "epoch": 2.975404770732913, + "grad_norm": 0.13324889212742705, + "learning_rate": 3.0735485870560245e-06, + "loss": 0.5892, + "step": 6019 + }, + { + "epoch": 2.975899147200593, + "grad_norm": 0.1391792708590785, + "learning_rate": 3.0707405310376513e-06, + "loss": 0.5952, + "step": 6020 + }, + { + "epoch": 2.9763935236682735, + "grad_norm": 0.13324798160934087, + "learning_rate": 3.067933525661343e-06, + "loss": 0.5646, + "step": 6021 + }, + { + "epoch": 2.9768879001359534, + "grad_norm": 0.13148843020511322, + "learning_rate": 3.065127571352713e-06, + "loss": 0.5387, + "step": 6022 + }, + { + "epoch": 2.9773822766036337, + "grad_norm": 0.13223815784375131, + "learning_rate": 3.0623226685372065e-06, + "loss": 0.5612, + "step": 6023 + }, + { + "epoch": 2.9778766530713137, + "grad_norm": 0.13842371852367297, + "learning_rate": 3.0595188176401126e-06, + "loss": 0.5664, + "step": 6024 + }, + { + "epoch": 2.978371029538994, + "grad_norm": 0.1385179972630082, + "learning_rate": 3.0567160190865643e-06, + "loss": 0.5709, + "step": 6025 + }, + { + "epoch": 2.978865406006674, + "grad_norm": 0.13341540463449178, + "learning_rate": 3.0539142733015358e-06, + "loss": 0.5399, + "step": 6026 + }, + { + "epoch": 2.9793597824743543, + "grad_norm": 0.13116039238360666, + "learning_rate": 3.051113580709835e-06, + "loss": 0.5479, + "step": 6027 + }, + { + "epoch": 2.979854158942034, + "grad_norm": 0.13133492766145524, + "learning_rate": 3.0483139417361175e-06, + "loss": 0.5367, + "step": 6028 + }, + { + "epoch": 2.9803485354097146, + "grad_norm": 0.13277177650227634, + "learning_rate": 3.045515356804876e-06, + "loss": 0.5593, + "step": 6029 + }, + { + "epoch": 2.9808429118773945, + "grad_norm": 0.1324422977550319, + "learning_rate": 3.0427178263404367e-06, + "loss": 0.5476, + "step": 6030 + }, + { + "epoch": 2.981337288345075, + "grad_norm": 0.13214497733400693, + "learning_rate": 3.0399213507669765e-06, + "loss": 0.579, + "step": 6031 + }, + { + "epoch": 2.9818316648127547, + "grad_norm": 0.13050119651176417, + "learning_rate": 3.037125930508513e-06, + "loss": 0.5462, + "step": 6032 + }, + { + "epoch": 2.982326041280435, + "grad_norm": 0.1427199358468394, + "learning_rate": 3.034331565988892e-06, + "loss": 0.5843, + "step": 6033 + }, + { + "epoch": 2.9828204177481155, + "grad_norm": 0.13831324289770996, + "learning_rate": 3.031538257631811e-06, + "loss": 0.5712, + "step": 6034 + }, + { + "epoch": 2.9833147942157954, + "grad_norm": 0.13639652112834688, + "learning_rate": 3.0287460058607975e-06, + "loss": 0.613, + "step": 6035 + }, + { + "epoch": 2.9838091706834753, + "grad_norm": 0.131167149404572, + "learning_rate": 3.0259548110992265e-06, + "loss": 0.5539, + "step": 6036 + }, + { + "epoch": 2.9843035471511556, + "grad_norm": 0.132516214075518, + "learning_rate": 3.023164673770315e-06, + "loss": 0.5658, + "step": 6037 + }, + { + "epoch": 2.984797923618836, + "grad_norm": 0.13751961396868975, + "learning_rate": 3.020375594297106e-06, + "loss": 0.5461, + "step": 6038 + }, + { + "epoch": 2.985292300086516, + "grad_norm": 0.14127646887697698, + "learning_rate": 3.0175875731024984e-06, + "loss": 0.5622, + "step": 6039 + }, + { + "epoch": 2.985786676554196, + "grad_norm": 0.1343142228951392, + "learning_rate": 3.014800610609221e-06, + "loss": 0.5812, + "step": 6040 + }, + { + "epoch": 2.986281053021876, + "grad_norm": 0.1326201790095572, + "learning_rate": 3.012014707239839e-06, + "loss": 0.5257, + "step": 6041 + }, + { + "epoch": 2.9867754294895565, + "grad_norm": 0.1308984213658557, + "learning_rate": 3.0092298634167672e-06, + "loss": 0.5608, + "step": 6042 + }, + { + "epoch": 2.9872698059572365, + "grad_norm": 0.12842674453574832, + "learning_rate": 3.0064460795622563e-06, + "loss": 0.5676, + "step": 6043 + }, + { + "epoch": 2.9877641824249164, + "grad_norm": 0.13707574265970984, + "learning_rate": 3.00366335609839e-06, + "loss": 0.6494, + "step": 6044 + }, + { + "epoch": 2.9882585588925967, + "grad_norm": 0.14091395568481177, + "learning_rate": 3.0008816934471007e-06, + "loss": 0.5458, + "step": 6045 + }, + { + "epoch": 2.988752935360277, + "grad_norm": 0.130459689522635, + "learning_rate": 2.9981010920301547e-06, + "loss": 0.5658, + "step": 6046 + }, + { + "epoch": 2.989247311827957, + "grad_norm": 0.13959669385273626, + "learning_rate": 2.9953215522691483e-06, + "loss": 0.5725, + "step": 6047 + }, + { + "epoch": 2.989741688295637, + "grad_norm": 0.136645952049938, + "learning_rate": 2.992543074585541e-06, + "loss": 0.5354, + "step": 6048 + }, + { + "epoch": 2.9902360647633173, + "grad_norm": 0.13209016540743457, + "learning_rate": 2.9897656594006095e-06, + "loss": 0.5506, + "step": 6049 + }, + { + "epoch": 2.9907304412309976, + "grad_norm": 0.13486723990626076, + "learning_rate": 2.986989307135475e-06, + "loss": 0.5328, + "step": 6050 + }, + { + "epoch": 2.9912248176986775, + "grad_norm": 0.13010710974863918, + "learning_rate": 2.9842140182111035e-06, + "loss": 0.5538, + "step": 6051 + }, + { + "epoch": 2.9917191941663575, + "grad_norm": 0.130834443221903, + "learning_rate": 2.981439793048291e-06, + "loss": 0.5998, + "step": 6052 + }, + { + "epoch": 2.992213570634038, + "grad_norm": 0.13435877367103766, + "learning_rate": 2.978666632067677e-06, + "loss": 0.5542, + "step": 6053 + }, + { + "epoch": 2.992707947101718, + "grad_norm": 0.13572252978659155, + "learning_rate": 2.975894535689746e-06, + "loss": 0.5627, + "step": 6054 + }, + { + "epoch": 2.993202323569398, + "grad_norm": 0.13225214624202064, + "learning_rate": 2.9731235043348093e-06, + "loss": 0.5726, + "step": 6055 + }, + { + "epoch": 2.993696700037078, + "grad_norm": 0.13408136918127794, + "learning_rate": 2.9703535384230173e-06, + "loss": 0.5481, + "step": 6056 + }, + { + "epoch": 2.9941910765047584, + "grad_norm": 0.13176897529452428, + "learning_rate": 2.9675846383743735e-06, + "loss": 0.5585, + "step": 6057 + }, + { + "epoch": 2.9946854529724387, + "grad_norm": 0.13258327966762617, + "learning_rate": 2.964816804608699e-06, + "loss": 0.5844, + "step": 6058 + }, + { + "epoch": 2.9951798294401186, + "grad_norm": 0.1499862971583867, + "learning_rate": 2.9620500375456695e-06, + "loss": 0.5665, + "step": 6059 + }, + { + "epoch": 2.9956742059077985, + "grad_norm": 0.1359471406040536, + "learning_rate": 2.959284337604795e-06, + "loss": 0.5604, + "step": 6060 + }, + { + "epoch": 2.996168582375479, + "grad_norm": 0.13034809674097828, + "learning_rate": 2.9565197052054184e-06, + "loss": 0.5177, + "step": 6061 + }, + { + "epoch": 2.9966629588431593, + "grad_norm": 0.13285858004602646, + "learning_rate": 2.953756140766727e-06, + "loss": 0.5452, + "step": 6062 + }, + { + "epoch": 2.997157335310839, + "grad_norm": 0.12915757019525384, + "learning_rate": 2.9509936447077392e-06, + "loss": 0.5366, + "step": 6063 + }, + { + "epoch": 2.997651711778519, + "grad_norm": 0.13495055944893022, + "learning_rate": 2.94823221744732e-06, + "loss": 0.5435, + "step": 6064 + }, + { + "epoch": 2.9981460882461994, + "grad_norm": 0.13434201684534952, + "learning_rate": 2.9454718594041686e-06, + "loss": 0.5493, + "step": 6065 + }, + { + "epoch": 2.99864046471388, + "grad_norm": 0.13271064012365721, + "learning_rate": 2.9427125709968196e-06, + "loss": 0.5799, + "step": 6066 + }, + { + "epoch": 2.9991348411815597, + "grad_norm": 0.13903452058236015, + "learning_rate": 2.939954352643645e-06, + "loss": 0.5522, + "step": 6067 + }, + { + "epoch": 2.9996292176492396, + "grad_norm": 0.13081062697054333, + "learning_rate": 2.9371972047628616e-06, + "loss": 0.5571, + "step": 6068 + }, + { + "epoch": 3.0, + "grad_norm": 0.1583545054566719, + "learning_rate": 2.934441127772514e-06, + "loss": 0.5939, + "step": 6069 + }, + { + "epoch": 3.0004943764676804, + "grad_norm": 0.17253718511532054, + "learning_rate": 2.9316861220904924e-06, + "loss": 0.5219, + "step": 6070 + }, + { + "epoch": 3.0009887529353603, + "grad_norm": 0.16621542660245503, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.5554, + "step": 6071 + }, + { + "epoch": 3.0014831294030406, + "grad_norm": 0.1499559983403861, + "learning_rate": 2.9261793263221717e-06, + "loss": 0.5276, + "step": 6072 + }, + { + "epoch": 3.0014831294030406, + "eval_loss": 0.64006507396698, + "eval_runtime": 82.0951, + "eval_samples_per_second": 369.742, + "eval_steps_per_second": 46.227, + "step": 6072 + }, + { + "epoch": 3.0019775058707205, + "grad_norm": 0.14087677584593664, + "learning_rate": 2.9234275370708286e-06, + "loss": 0.5308, + "step": 6073 + }, + { + "epoch": 3.002471882338401, + "grad_norm": 0.13889396113974678, + "learning_rate": 2.9206768207977365e-06, + "loss": 0.5352, + "step": 6074 + }, + { + "epoch": 3.002966258806081, + "grad_norm": 0.15204770895136568, + "learning_rate": 2.9179271779199738e-06, + "loss": 0.5153, + "step": 6075 + }, + { + "epoch": 3.003460635273761, + "grad_norm": 0.14212101189722953, + "learning_rate": 2.915178608854443e-06, + "loss": 0.4965, + "step": 6076 + }, + { + "epoch": 3.003955011741441, + "grad_norm": 0.14473768402472842, + "learning_rate": 2.912431114017904e-06, + "loss": 0.5372, + "step": 6077 + }, + { + "epoch": 3.0044493882091214, + "grad_norm": 0.1409920711922606, + "learning_rate": 2.9096846938269364e-06, + "loss": 0.5542, + "step": 6078 + }, + { + "epoch": 3.0049437646768014, + "grad_norm": 0.15539010059793132, + "learning_rate": 2.906939348697961e-06, + "loss": 0.5362, + "step": 6079 + }, + { + "epoch": 3.0054381411444817, + "grad_norm": 0.14282452485505953, + "learning_rate": 2.904195079047243e-06, + "loss": 0.5238, + "step": 6080 + }, + { + "epoch": 3.0059325176121616, + "grad_norm": 0.14069382342334155, + "learning_rate": 2.9014518852908804e-06, + "loss": 0.5139, + "step": 6081 + }, + { + "epoch": 3.006426894079842, + "grad_norm": 0.149449900127776, + "learning_rate": 2.8987097678448017e-06, + "loss": 0.5333, + "step": 6082 + }, + { + "epoch": 3.006921270547522, + "grad_norm": 0.1488328166298847, + "learning_rate": 2.8959687271247837e-06, + "loss": 0.5716, + "step": 6083 + }, + { + "epoch": 3.0074156470152023, + "grad_norm": 0.1406399780270235, + "learning_rate": 2.8932287635464294e-06, + "loss": 0.529, + "step": 6084 + }, + { + "epoch": 3.007910023482882, + "grad_norm": 0.1381893308660609, + "learning_rate": 2.8904898775251834e-06, + "loss": 0.5565, + "step": 6085 + }, + { + "epoch": 3.0084043999505625, + "grad_norm": 0.13890671236124474, + "learning_rate": 2.887752069476333e-06, + "loss": 0.5376, + "step": 6086 + }, + { + "epoch": 3.0088987764182424, + "grad_norm": 0.13917004369788963, + "learning_rate": 2.885015339814986e-06, + "loss": 0.5298, + "step": 6087 + }, + { + "epoch": 3.009393152885923, + "grad_norm": 0.14422835129338243, + "learning_rate": 2.8822796889561045e-06, + "loss": 0.5316, + "step": 6088 + }, + { + "epoch": 3.0098875293536027, + "grad_norm": 0.13407437948839931, + "learning_rate": 2.879545117314475e-06, + "loss": 0.5326, + "step": 6089 + }, + { + "epoch": 3.010381905821283, + "grad_norm": 0.13459689369228764, + "learning_rate": 2.876811625304723e-06, + "loss": 0.5757, + "step": 6090 + }, + { + "epoch": 3.010876282288963, + "grad_norm": 0.13551101763079182, + "learning_rate": 2.874079213341311e-06, + "loss": 0.5573, + "step": 6091 + }, + { + "epoch": 3.0113706587566433, + "grad_norm": 0.14018021929860364, + "learning_rate": 2.8713478818385443e-06, + "loss": 0.5785, + "step": 6092 + }, + { + "epoch": 3.0118650352243233, + "grad_norm": 0.13752525961438639, + "learning_rate": 2.8686176312105517e-06, + "loss": 0.5066, + "step": 6093 + }, + { + "epoch": 3.0123594116920036, + "grad_norm": 0.13237386011257765, + "learning_rate": 2.86588846187131e-06, + "loss": 0.5118, + "step": 6094 + }, + { + "epoch": 3.0128537881596835, + "grad_norm": 0.1343633478416837, + "learning_rate": 2.8631603742346235e-06, + "loss": 0.5839, + "step": 6095 + }, + { + "epoch": 3.013348164627364, + "grad_norm": 0.13980460055977126, + "learning_rate": 2.8604333687141306e-06, + "loss": 0.5313, + "step": 6096 + }, + { + "epoch": 3.013842541095044, + "grad_norm": 0.13050251321769202, + "learning_rate": 2.8577074457233233e-06, + "loss": 0.5384, + "step": 6097 + }, + { + "epoch": 3.014336917562724, + "grad_norm": 0.1360872194670727, + "learning_rate": 2.8549826056755093e-06, + "loss": 0.5288, + "step": 6098 + }, + { + "epoch": 3.014831294030404, + "grad_norm": 0.13564718889284114, + "learning_rate": 2.852258848983839e-06, + "loss": 0.5729, + "step": 6099 + }, + { + "epoch": 3.0153256704980844, + "grad_norm": 0.1411416033459456, + "learning_rate": 2.8495361760613027e-06, + "loss": 0.5508, + "step": 6100 + }, + { + "epoch": 3.0158200469657643, + "grad_norm": 0.12849405543871625, + "learning_rate": 2.8468145873207185e-06, + "loss": 0.5263, + "step": 6101 + }, + { + "epoch": 3.0163144234334447, + "grad_norm": 0.14028139308302048, + "learning_rate": 2.844094083174748e-06, + "loss": 0.5496, + "step": 6102 + }, + { + "epoch": 3.0168087999011246, + "grad_norm": 0.14406206752206338, + "learning_rate": 2.8413746640358863e-06, + "loss": 0.54, + "step": 6103 + }, + { + "epoch": 3.017303176368805, + "grad_norm": 0.14152091449964835, + "learning_rate": 2.8386563303164606e-06, + "loss": 0.522, + "step": 6104 + }, + { + "epoch": 3.017797552836485, + "grad_norm": 0.1373986690664549, + "learning_rate": 2.8359390824286337e-06, + "loss": 0.4778, + "step": 6105 + }, + { + "epoch": 3.0182919293041652, + "grad_norm": 0.14392284953501144, + "learning_rate": 2.8332229207844096e-06, + "loss": 0.5416, + "step": 6106 + }, + { + "epoch": 3.018786305771845, + "grad_norm": 0.1396882208113552, + "learning_rate": 2.8305078457956183e-06, + "loss": 0.5348, + "step": 6107 + }, + { + "epoch": 3.0192806822395255, + "grad_norm": 0.13437749751610167, + "learning_rate": 2.8277938578739338e-06, + "loss": 0.5075, + "step": 6108 + }, + { + "epoch": 3.0197750587072054, + "grad_norm": 0.13622559934358022, + "learning_rate": 2.8250809574308646e-06, + "loss": 0.5367, + "step": 6109 + }, + { + "epoch": 3.0202694351748858, + "grad_norm": 0.1381261395197164, + "learning_rate": 2.8223691448777444e-06, + "loss": 0.5402, + "step": 6110 + }, + { + "epoch": 3.0207638116425657, + "grad_norm": 0.13685717059634386, + "learning_rate": 2.8196584206257573e-06, + "loss": 0.5386, + "step": 6111 + }, + { + "epoch": 3.021258188110246, + "grad_norm": 0.13747147341431765, + "learning_rate": 2.816948785085907e-06, + "loss": 0.5364, + "step": 6112 + }, + { + "epoch": 3.021752564577926, + "grad_norm": 0.13404983314982388, + "learning_rate": 2.8142402386690414e-06, + "loss": 0.509, + "step": 6113 + }, + { + "epoch": 3.0222469410456063, + "grad_norm": 0.13513488242273658, + "learning_rate": 2.8115327817858453e-06, + "loss": 0.519, + "step": 6114 + }, + { + "epoch": 3.0227413175132862, + "grad_norm": 0.13219968158091455, + "learning_rate": 2.8088264148468314e-06, + "loss": 0.5239, + "step": 6115 + }, + { + "epoch": 3.0232356939809666, + "grad_norm": 0.13969344441198753, + "learning_rate": 2.8061211382623455e-06, + "loss": 0.5372, + "step": 6116 + }, + { + "epoch": 3.0237300704486465, + "grad_norm": 0.136378304877055, + "learning_rate": 2.8034169524425804e-06, + "loss": 0.5145, + "step": 6117 + }, + { + "epoch": 3.024224446916327, + "grad_norm": 0.13606319547520332, + "learning_rate": 2.800713857797548e-06, + "loss": 0.5585, + "step": 6118 + }, + { + "epoch": 3.0247188233840068, + "grad_norm": 0.13267395695157896, + "learning_rate": 2.7980118547371047e-06, + "loss": 0.5143, + "step": 6119 + }, + { + "epoch": 3.025213199851687, + "grad_norm": 0.13966514586260032, + "learning_rate": 2.7953109436709447e-06, + "loss": 0.5511, + "step": 6120 + }, + { + "epoch": 3.025707576319367, + "grad_norm": 0.13277676495400406, + "learning_rate": 2.792611125008586e-06, + "loss": 0.5228, + "step": 6121 + }, + { + "epoch": 3.0262019527870474, + "grad_norm": 0.13437207826834288, + "learning_rate": 2.7899123991593836e-06, + "loss": 0.5151, + "step": 6122 + }, + { + "epoch": 3.0266963292547273, + "grad_norm": 0.13668402228410564, + "learning_rate": 2.787214766532531e-06, + "loss": 0.5293, + "step": 6123 + }, + { + "epoch": 3.0271907057224077, + "grad_norm": 0.13717774771644778, + "learning_rate": 2.7845182275370586e-06, + "loss": 0.5467, + "step": 6124 + }, + { + "epoch": 3.0276850821900876, + "grad_norm": 0.13509422612625915, + "learning_rate": 2.781822782581819e-06, + "loss": 0.5146, + "step": 6125 + }, + { + "epoch": 3.028179458657768, + "grad_norm": 0.13814173099355778, + "learning_rate": 2.779128432075513e-06, + "loss": 0.5281, + "step": 6126 + }, + { + "epoch": 3.028673835125448, + "grad_norm": 0.13340516055939305, + "learning_rate": 2.776435176426664e-06, + "loss": 0.4975, + "step": 6127 + }, + { + "epoch": 3.029168211593128, + "grad_norm": 0.14274664926772387, + "learning_rate": 2.7737430160436387e-06, + "loss": 0.5831, + "step": 6128 + }, + { + "epoch": 3.029662588060808, + "grad_norm": 0.1370703271720517, + "learning_rate": 2.7710519513346266e-06, + "loss": 0.4999, + "step": 6129 + }, + { + "epoch": 3.0301569645284885, + "grad_norm": 0.1367575098866532, + "learning_rate": 2.768361982707666e-06, + "loss": 0.5352, + "step": 6130 + }, + { + "epoch": 3.0306513409961684, + "grad_norm": 0.13777079486639196, + "learning_rate": 2.7656731105706136e-06, + "loss": 0.5402, + "step": 6131 + }, + { + "epoch": 3.0311457174638488, + "grad_norm": 0.13489630388376614, + "learning_rate": 2.762985335331172e-06, + "loss": 0.5612, + "step": 6132 + }, + { + "epoch": 3.0316400939315287, + "grad_norm": 0.1351161439418679, + "learning_rate": 2.7602986573968673e-06, + "loss": 0.5075, + "step": 6133 + }, + { + "epoch": 3.032134470399209, + "grad_norm": 0.143228769417949, + "learning_rate": 2.7576130771750677e-06, + "loss": 0.5278, + "step": 6134 + }, + { + "epoch": 3.032628846866889, + "grad_norm": 0.13543238644628247, + "learning_rate": 2.754928595072974e-06, + "loss": 0.5258, + "step": 6135 + }, + { + "epoch": 3.0331232233345693, + "grad_norm": 0.13411172568806443, + "learning_rate": 2.7522452114976127e-06, + "loss": 0.5257, + "step": 6136 + }, + { + "epoch": 3.033617599802249, + "grad_norm": 0.13375044864669142, + "learning_rate": 2.7495629268558543e-06, + "loss": 0.4956, + "step": 6137 + }, + { + "epoch": 3.0341119762699296, + "grad_norm": 0.13219843191245248, + "learning_rate": 2.746881741554396e-06, + "loss": 0.5534, + "step": 6138 + }, + { + "epoch": 3.0346063527376095, + "grad_norm": 0.1385489089947911, + "learning_rate": 2.7442016559997654e-06, + "loss": 0.5107, + "step": 6139 + }, + { + "epoch": 3.03510072920529, + "grad_norm": 0.1395642663236756, + "learning_rate": 2.741522670598331e-06, + "loss": 0.5567, + "step": 6140 + }, + { + "epoch": 3.0355951056729698, + "grad_norm": 0.13265837079583792, + "learning_rate": 2.738844785756295e-06, + "loss": 0.5598, + "step": 6141 + }, + { + "epoch": 3.03608948214065, + "grad_norm": 0.14001106257812063, + "learning_rate": 2.736168001879683e-06, + "loss": 0.5421, + "step": 6142 + }, + { + "epoch": 3.0365838586083305, + "grad_norm": 0.13451192773623555, + "learning_rate": 2.7334923193743644e-06, + "loss": 0.566, + "step": 6143 + }, + { + "epoch": 3.0370782350760104, + "grad_norm": 0.13418965596727483, + "learning_rate": 2.7308177386460354e-06, + "loss": 0.5266, + "step": 6144 + }, + { + "epoch": 3.0375726115436907, + "grad_norm": 0.13796043467021638, + "learning_rate": 2.7281442601002207e-06, + "loss": 0.5417, + "step": 6145 + }, + { + "epoch": 3.0380669880113707, + "grad_norm": 0.14086143650484648, + "learning_rate": 2.7254718841422944e-06, + "loss": 0.5615, + "step": 6146 + }, + { + "epoch": 3.038561364479051, + "grad_norm": 0.1343788234245357, + "learning_rate": 2.7228006111774473e-06, + "loss": 0.5387, + "step": 6147 + }, + { + "epoch": 3.039055740946731, + "grad_norm": 0.12670001550442095, + "learning_rate": 2.7201304416107067e-06, + "loss": 0.5143, + "step": 6148 + }, + { + "epoch": 3.0395501174144113, + "grad_norm": 0.1296043637866339, + "learning_rate": 2.7174613758469393e-06, + "loss": 0.514, + "step": 6149 + }, + { + "epoch": 3.040044493882091, + "grad_norm": 0.13726115380075674, + "learning_rate": 2.7147934142908337e-06, + "loss": 0.5465, + "step": 6150 + }, + { + "epoch": 3.0405388703497715, + "grad_norm": 0.13666915584666428, + "learning_rate": 2.71212655734692e-06, + "loss": 0.5285, + "step": 6151 + }, + { + "epoch": 3.0410332468174515, + "grad_norm": 0.13226682604311998, + "learning_rate": 2.7094608054195605e-06, + "loss": 0.5378, + "step": 6152 + }, + { + "epoch": 3.041527623285132, + "grad_norm": 0.1331213675484645, + "learning_rate": 2.7067961589129456e-06, + "loss": 0.5081, + "step": 6153 + }, + { + "epoch": 3.0420219997528117, + "grad_norm": 0.1347615154688043, + "learning_rate": 2.7041326182310935e-06, + "loss": 0.5274, + "step": 6154 + }, + { + "epoch": 3.042516376220492, + "grad_norm": 0.132371110945278, + "learning_rate": 2.70147018377787e-06, + "loss": 0.4817, + "step": 6155 + }, + { + "epoch": 3.043010752688172, + "grad_norm": 0.13602729158166377, + "learning_rate": 2.698808855956957e-06, + "loss": 0.5754, + "step": 6156 + }, + { + "epoch": 3.0435051291558524, + "grad_norm": 0.13834051095936137, + "learning_rate": 2.696148635171878e-06, + "loss": 0.5374, + "step": 6157 + }, + { + "epoch": 3.0439995056235323, + "grad_norm": 0.14168464888622906, + "learning_rate": 2.6934895218259902e-06, + "loss": 0.5417, + "step": 6158 + }, + { + "epoch": 3.0444938820912126, + "grad_norm": 0.1334705800863527, + "learning_rate": 2.690831516322473e-06, + "loss": 0.51, + "step": 6159 + }, + { + "epoch": 3.0449882585588925, + "grad_norm": 0.1358504106118342, + "learning_rate": 2.68817461906435e-06, + "loss": 0.5215, + "step": 6160 + }, + { + "epoch": 3.045482635026573, + "grad_norm": 0.13507230751707733, + "learning_rate": 2.6855188304544665e-06, + "loss": 0.5372, + "step": 6161 + }, + { + "epoch": 3.045977011494253, + "grad_norm": 0.13616655676672149, + "learning_rate": 2.682864150895499e-06, + "loss": 0.5034, + "step": 6162 + }, + { + "epoch": 3.046471387961933, + "grad_norm": 0.13796422258492966, + "learning_rate": 2.6802105807899713e-06, + "loss": 0.5266, + "step": 6163 + }, + { + "epoch": 3.046965764429613, + "grad_norm": 0.13512696204108057, + "learning_rate": 2.6775581205402246e-06, + "loss": 0.4968, + "step": 6164 + }, + { + "epoch": 3.0474601408972934, + "grad_norm": 0.13723376188209935, + "learning_rate": 2.67490677054843e-06, + "loss": 0.5551, + "step": 6165 + }, + { + "epoch": 3.0479545173649734, + "grad_norm": 0.13829971285572565, + "learning_rate": 2.6722565312166026e-06, + "loss": 0.5321, + "step": 6166 + }, + { + "epoch": 3.0484488938326537, + "grad_norm": 0.13314985546656452, + "learning_rate": 2.669607402946578e-06, + "loss": 0.5349, + "step": 6167 + }, + { + "epoch": 3.0489432703003336, + "grad_norm": 0.13380839429676245, + "learning_rate": 2.666959386140028e-06, + "loss": 0.5383, + "step": 6168 + }, + { + "epoch": 3.049437646768014, + "grad_norm": 0.13845668089942334, + "learning_rate": 2.66431248119846e-06, + "loss": 0.5178, + "step": 6169 + }, + { + "epoch": 3.049932023235694, + "grad_norm": 0.13607072313186652, + "learning_rate": 2.6616666885232056e-06, + "loss": 0.5359, + "step": 6170 + }, + { + "epoch": 3.0504263997033743, + "grad_norm": 0.13485734746795344, + "learning_rate": 2.6590220085154273e-06, + "loss": 0.5526, + "step": 6171 + }, + { + "epoch": 3.050920776171054, + "grad_norm": 0.13467119730107815, + "learning_rate": 2.656378441576126e-06, + "loss": 0.5207, + "step": 6172 + }, + { + "epoch": 3.0514151526387345, + "grad_norm": 0.13376074703360719, + "learning_rate": 2.6537359881061275e-06, + "loss": 0.5454, + "step": 6173 + }, + { + "epoch": 3.0519095291064144, + "grad_norm": 0.1325377901872702, + "learning_rate": 2.6510946485060916e-06, + "loss": 0.5226, + "step": 6174 + }, + { + "epoch": 3.052403905574095, + "grad_norm": 0.135254409402792, + "learning_rate": 2.648454423176513e-06, + "loss": 0.514, + "step": 6175 + }, + { + "epoch": 3.0528982820417747, + "grad_norm": 0.12933575772950584, + "learning_rate": 2.6458153125177055e-06, + "loss": 0.5196, + "step": 6176 + }, + { + "epoch": 3.053392658509455, + "grad_norm": 0.14171950445222503, + "learning_rate": 2.6431773169298302e-06, + "loss": 0.5271, + "step": 6177 + }, + { + "epoch": 3.053887034977135, + "grad_norm": 0.13440467020419183, + "learning_rate": 2.640540436812863e-06, + "loss": 0.5714, + "step": 6178 + }, + { + "epoch": 3.0543814114448153, + "grad_norm": 0.1423047302630041, + "learning_rate": 2.6379046725666237e-06, + "loss": 0.521, + "step": 6179 + }, + { + "epoch": 3.0548757879124953, + "grad_norm": 0.13470958539918584, + "learning_rate": 2.6352700245907526e-06, + "loss": 0.5339, + "step": 6180 + }, + { + "epoch": 3.0553701643801756, + "grad_norm": 0.14060408060373117, + "learning_rate": 2.6326364932847303e-06, + "loss": 0.5834, + "step": 6181 + }, + { + "epoch": 3.0558645408478555, + "grad_norm": 0.13860149931194807, + "learning_rate": 2.63000407904786e-06, + "loss": 0.5248, + "step": 6182 + }, + { + "epoch": 3.056358917315536, + "grad_norm": 0.13481643623386919, + "learning_rate": 2.6273727822792818e-06, + "loss": 0.5371, + "step": 6183 + }, + { + "epoch": 3.056853293783216, + "grad_norm": 0.13661257864128748, + "learning_rate": 2.624742603377959e-06, + "loss": 0.5149, + "step": 6184 + }, + { + "epoch": 3.057347670250896, + "grad_norm": 0.14177940970287534, + "learning_rate": 2.6221135427426925e-06, + "loss": 0.5585, + "step": 6185 + }, + { + "epoch": 3.057842046718576, + "grad_norm": 0.14263039629970092, + "learning_rate": 2.6194856007721137e-06, + "loss": 0.572, + "step": 6186 + }, + { + "epoch": 3.0583364231862564, + "grad_norm": 0.13920470297084794, + "learning_rate": 2.61685877786468e-06, + "loss": 0.5344, + "step": 6187 + }, + { + "epoch": 3.0588307996539363, + "grad_norm": 0.13844400720960207, + "learning_rate": 2.6142330744186763e-06, + "loss": 0.5108, + "step": 6188 + }, + { + "epoch": 3.0593251761216167, + "grad_norm": 0.13935348398692052, + "learning_rate": 2.6116084908322258e-06, + "loss": 0.5408, + "step": 6189 + }, + { + "epoch": 3.0598195525892966, + "grad_norm": 0.1365919568712902, + "learning_rate": 2.6089850275032823e-06, + "loss": 0.5208, + "step": 6190 + }, + { + "epoch": 3.060313929056977, + "grad_norm": 0.1344007254062926, + "learning_rate": 2.6063626848296187e-06, + "loss": 0.5219, + "step": 6191 + }, + { + "epoch": 3.060808305524657, + "grad_norm": 0.14426391116356782, + "learning_rate": 2.6037414632088507e-06, + "loss": 0.5357, + "step": 6192 + }, + { + "epoch": 3.0613026819923372, + "grad_norm": 0.1367517954470043, + "learning_rate": 2.601121363038416e-06, + "loss": 0.5281, + "step": 6193 + }, + { + "epoch": 3.061797058460017, + "grad_norm": 0.1357963585941095, + "learning_rate": 2.598502384715579e-06, + "loss": 0.4928, + "step": 6194 + }, + { + "epoch": 3.0622914349276975, + "grad_norm": 0.133762693837912, + "learning_rate": 2.5958845286374514e-06, + "loss": 0.4844, + "step": 6195 + }, + { + "epoch": 3.0627858113953774, + "grad_norm": 0.13850972719804316, + "learning_rate": 2.5932677952009577e-06, + "loss": 0.5232, + "step": 6196 + }, + { + "epoch": 3.063280187863058, + "grad_norm": 0.1374899217576655, + "learning_rate": 2.590652184802853e-06, + "loss": 0.5206, + "step": 6197 + }, + { + "epoch": 3.0637745643307377, + "grad_norm": 0.14389094025262406, + "learning_rate": 2.5880376978397335e-06, + "loss": 0.5528, + "step": 6198 + }, + { + "epoch": 3.064268940798418, + "grad_norm": 0.13362499638107936, + "learning_rate": 2.585424334708012e-06, + "loss": 0.5274, + "step": 6199 + }, + { + "epoch": 3.064763317266098, + "grad_norm": 0.1349238628900542, + "learning_rate": 2.5828120958039414e-06, + "loss": 0.5223, + "step": 6200 + }, + { + "epoch": 3.0652576937337783, + "grad_norm": 0.1403725729207918, + "learning_rate": 2.5802009815236006e-06, + "loss": 0.5201, + "step": 6201 + }, + { + "epoch": 3.0657520702014582, + "grad_norm": 0.14054001195359142, + "learning_rate": 2.577590992262893e-06, + "loss": 0.5602, + "step": 6202 + }, + { + "epoch": 3.0662464466691386, + "grad_norm": 0.13737323285013361, + "learning_rate": 2.574982128417559e-06, + "loss": 0.5577, + "step": 6203 + }, + { + "epoch": 3.0667408231368185, + "grad_norm": 0.1332255923243462, + "learning_rate": 2.5723743903831653e-06, + "loss": 0.5205, + "step": 6204 + }, + { + "epoch": 3.067235199604499, + "grad_norm": 0.13037213872352077, + "learning_rate": 2.569767778555103e-06, + "loss": 0.4872, + "step": 6205 + }, + { + "epoch": 3.0677295760721788, + "grad_norm": 0.13623448205566835, + "learning_rate": 2.567162293328601e-06, + "loss": 0.5042, + "step": 6206 + }, + { + "epoch": 3.068223952539859, + "grad_norm": 0.13872147975408952, + "learning_rate": 2.564557935098715e-06, + "loss": 0.5161, + "step": 6207 + }, + { + "epoch": 3.068718329007539, + "grad_norm": 0.13552769738336587, + "learning_rate": 2.5619547042603222e-06, + "loss": 0.538, + "step": 6208 + }, + { + "epoch": 3.0692127054752194, + "grad_norm": 0.13290812142293368, + "learning_rate": 2.5593526012081417e-06, + "loss": 0.5184, + "step": 6209 + }, + { + "epoch": 3.0697070819428993, + "grad_norm": 0.13967910826850405, + "learning_rate": 2.5567516263367107e-06, + "loss": 0.5591, + "step": 6210 + }, + { + "epoch": 3.0702014584105797, + "grad_norm": 0.13960691509098722, + "learning_rate": 2.554151780040396e-06, + "loss": 0.5738, + "step": 6211 + }, + { + "epoch": 3.0706958348782596, + "grad_norm": 0.13558758754786712, + "learning_rate": 2.5515530627134055e-06, + "loss": 0.4979, + "step": 6212 + }, + { + "epoch": 3.07119021134594, + "grad_norm": 0.132990081098656, + "learning_rate": 2.5489554747497636e-06, + "loss": 0.5424, + "step": 6213 + }, + { + "epoch": 3.07168458781362, + "grad_norm": 0.13451804728540104, + "learning_rate": 2.546359016543323e-06, + "loss": 0.5143, + "step": 6214 + }, + { + "epoch": 3.0721789642813, + "grad_norm": 0.131096940642597, + "learning_rate": 2.5437636884877746e-06, + "loss": 0.5292, + "step": 6215 + }, + { + "epoch": 3.07267334074898, + "grad_norm": 0.1359836171942057, + "learning_rate": 2.5411694909766273e-06, + "loss": 0.541, + "step": 6216 + }, + { + "epoch": 3.0731677172166605, + "grad_norm": 0.13549482295048046, + "learning_rate": 2.538576424403225e-06, + "loss": 0.5263, + "step": 6217 + }, + { + "epoch": 3.073662093684341, + "grad_norm": 0.1370105844681941, + "learning_rate": 2.5359844891607443e-06, + "loss": 0.5097, + "step": 6218 + }, + { + "epoch": 3.0741564701520208, + "grad_norm": 0.13011285168503742, + "learning_rate": 2.5333936856421814e-06, + "loss": 0.5182, + "step": 6219 + }, + { + "epoch": 3.074650846619701, + "grad_norm": 0.1364704979877386, + "learning_rate": 2.530804014240359e-06, + "loss": 0.5135, + "step": 6220 + }, + { + "epoch": 3.075145223087381, + "grad_norm": 0.13478055111421766, + "learning_rate": 2.5282154753479413e-06, + "loss": 0.5179, + "step": 6221 + }, + { + "epoch": 3.0756395995550614, + "grad_norm": 0.13339535095750132, + "learning_rate": 2.5256280693574078e-06, + "loss": 0.5618, + "step": 6222 + }, + { + "epoch": 3.0761339760227413, + "grad_norm": 0.1344851777479725, + "learning_rate": 2.523041796661072e-06, + "loss": 0.5317, + "step": 6223 + }, + { + "epoch": 3.0766283524904217, + "grad_norm": 0.13740648851487094, + "learning_rate": 2.5204566576510804e-06, + "loss": 0.5043, + "step": 6224 + }, + { + "epoch": 3.0771227289581016, + "grad_norm": 0.13822998586284324, + "learning_rate": 2.517872652719393e-06, + "loss": 0.5344, + "step": 6225 + }, + { + "epoch": 3.077617105425782, + "grad_norm": 0.14778406162469662, + "learning_rate": 2.515289782257816e-06, + "loss": 0.5903, + "step": 6226 + }, + { + "epoch": 3.078111481893462, + "grad_norm": 0.13036287024647633, + "learning_rate": 2.512708046657968e-06, + "loss": 0.5264, + "step": 6227 + }, + { + "epoch": 3.078605858361142, + "grad_norm": 0.1363076552777128, + "learning_rate": 2.5101274463113034e-06, + "loss": 0.5301, + "step": 6228 + }, + { + "epoch": 3.079100234828822, + "grad_norm": 0.17506631498801659, + "learning_rate": 2.5075479816091073e-06, + "loss": 0.5096, + "step": 6229 + }, + { + "epoch": 3.0795946112965025, + "grad_norm": 0.14252681498956213, + "learning_rate": 2.504969652942486e-06, + "loss": 0.5978, + "step": 6230 + }, + { + "epoch": 3.0800889877641824, + "grad_norm": 0.1417037789110035, + "learning_rate": 2.502392460702372e-06, + "loss": 0.5412, + "step": 6231 + }, + { + "epoch": 3.0805833642318627, + "grad_norm": 0.13272767115537235, + "learning_rate": 2.4998164052795347e-06, + "loss": 0.502, + "step": 6232 + }, + { + "epoch": 3.0810777406995427, + "grad_norm": 0.14009195797683172, + "learning_rate": 2.4972414870645632e-06, + "loss": 0.5233, + "step": 6233 + }, + { + "epoch": 3.081572117167223, + "grad_norm": 0.13502028420536105, + "learning_rate": 2.4946677064478763e-06, + "loss": 0.4986, + "step": 6234 + }, + { + "epoch": 3.082066493634903, + "grad_norm": 0.13840477787229116, + "learning_rate": 2.4920950638197274e-06, + "loss": 0.5148, + "step": 6235 + }, + { + "epoch": 3.0825608701025833, + "grad_norm": 0.1348694160847659, + "learning_rate": 2.4895235595701837e-06, + "loss": 0.5343, + "step": 6236 + }, + { + "epoch": 3.083055246570263, + "grad_norm": 0.13974846229318857, + "learning_rate": 2.486953194089148e-06, + "loss": 0.5246, + "step": 6237 + }, + { + "epoch": 3.0835496230379436, + "grad_norm": 0.14200073786359904, + "learning_rate": 2.4843839677663495e-06, + "loss": 0.5298, + "step": 6238 + }, + { + "epoch": 3.0840439995056235, + "grad_norm": 0.13604021505431982, + "learning_rate": 2.4818158809913497e-06, + "loss": 0.5201, + "step": 6239 + }, + { + "epoch": 3.084538375973304, + "grad_norm": 0.13846513791833623, + "learning_rate": 2.479248934153524e-06, + "loss": 0.4983, + "step": 6240 + }, + { + "epoch": 3.0850327524409837, + "grad_norm": 0.13077454245572626, + "learning_rate": 2.476683127642091e-06, + "loss": 0.511, + "step": 6241 + }, + { + "epoch": 3.085527128908664, + "grad_norm": 0.13805478533702442, + "learning_rate": 2.4741184618460844e-06, + "loss": 0.5355, + "step": 6242 + }, + { + "epoch": 3.086021505376344, + "grad_norm": 0.13616988713090544, + "learning_rate": 2.471554937154368e-06, + "loss": 0.5202, + "step": 6243 + }, + { + "epoch": 3.0865158818440244, + "grad_norm": 0.13868934443130917, + "learning_rate": 2.4689925539556336e-06, + "loss": 0.5196, + "step": 6244 + }, + { + "epoch": 3.0870102583117043, + "grad_norm": 0.13912581352134154, + "learning_rate": 2.4664313126384065e-06, + "loss": 0.5284, + "step": 6245 + }, + { + "epoch": 3.0875046347793846, + "grad_norm": 0.13603731988038334, + "learning_rate": 2.4638712135910235e-06, + "loss": 0.5337, + "step": 6246 + }, + { + "epoch": 3.0879990112470646, + "grad_norm": 0.1353585596520398, + "learning_rate": 2.4613122572016632e-06, + "loss": 0.5113, + "step": 6247 + }, + { + "epoch": 3.088493387714745, + "grad_norm": 0.13435359070886757, + "learning_rate": 2.458754443858321e-06, + "loss": 0.5141, + "step": 6248 + }, + { + "epoch": 3.088987764182425, + "grad_norm": 0.13439765512506124, + "learning_rate": 2.4561977739488253e-06, + "loss": 0.4956, + "step": 6249 + }, + { + "epoch": 3.089482140650105, + "grad_norm": 0.13680769625234804, + "learning_rate": 2.453642247860829e-06, + "loss": 0.5123, + "step": 6250 + }, + { + "epoch": 3.089976517117785, + "grad_norm": 0.13768992862912277, + "learning_rate": 2.4510878659818082e-06, + "loss": 0.5226, + "step": 6251 + }, + { + "epoch": 3.0904708935854655, + "grad_norm": 0.13347971303702127, + "learning_rate": 2.448534628699073e-06, + "loss": 0.5306, + "step": 6252 + }, + { + "epoch": 3.0909652700531454, + "grad_norm": 0.13414598919968637, + "learning_rate": 2.4459825363997535e-06, + "loss": 0.5329, + "step": 6253 + }, + { + "epoch": 3.0914596465208257, + "grad_norm": 0.13255003974474924, + "learning_rate": 2.4434315894708037e-06, + "loss": 0.5076, + "step": 6254 + }, + { + "epoch": 3.0919540229885056, + "grad_norm": 0.13885204668961562, + "learning_rate": 2.4408817882990134e-06, + "loss": 0.534, + "step": 6255 + }, + { + "epoch": 3.092448399456186, + "grad_norm": 0.13947008756938106, + "learning_rate": 2.4383331332709936e-06, + "loss": 0.5293, + "step": 6256 + }, + { + "epoch": 3.092942775923866, + "grad_norm": 0.13636110448548025, + "learning_rate": 2.435785624773178e-06, + "loss": 0.5333, + "step": 6257 + }, + { + "epoch": 3.0934371523915463, + "grad_norm": 0.13797312254190086, + "learning_rate": 2.4332392631918365e-06, + "loss": 0.525, + "step": 6258 + }, + { + "epoch": 3.093931528859226, + "grad_norm": 0.13820947361440372, + "learning_rate": 2.4306940489130535e-06, + "loss": 0.549, + "step": 6259 + }, + { + "epoch": 3.0944259053269065, + "grad_norm": 0.1356666596466104, + "learning_rate": 2.4281499823227407e-06, + "loss": 0.5159, + "step": 6260 + }, + { + "epoch": 3.0949202817945864, + "grad_norm": 0.13444281978023814, + "learning_rate": 2.42560706380665e-06, + "loss": 0.5177, + "step": 6261 + }, + { + "epoch": 3.095414658262267, + "grad_norm": 0.12872092618450875, + "learning_rate": 2.423065293750343e-06, + "loss": 0.491, + "step": 6262 + }, + { + "epoch": 3.0959090347299467, + "grad_norm": 0.13388498404276747, + "learning_rate": 2.420524672539212e-06, + "loss": 0.5081, + "step": 6263 + }, + { + "epoch": 3.096403411197627, + "grad_norm": 0.13354402921419425, + "learning_rate": 2.4179852005584794e-06, + "loss": 0.5229, + "step": 6264 + }, + { + "epoch": 3.096897787665307, + "grad_norm": 0.1423893262297247, + "learning_rate": 2.415446878193186e-06, + "loss": 0.5188, + "step": 6265 + }, + { + "epoch": 3.0973921641329873, + "grad_norm": 0.14244135649381165, + "learning_rate": 2.412909705828205e-06, + "loss": 0.5717, + "step": 6266 + }, + { + "epoch": 3.0978865406006673, + "grad_norm": 0.1333853872223485, + "learning_rate": 2.410373683848235e-06, + "loss": 0.5043, + "step": 6267 + }, + { + "epoch": 3.0983809170683476, + "grad_norm": 0.13745982422396794, + "learning_rate": 2.407838812637796e-06, + "loss": 0.5176, + "step": 6268 + }, + { + "epoch": 3.0988752935360275, + "grad_norm": 0.13483849324781094, + "learning_rate": 2.4053050925812305e-06, + "loss": 0.4967, + "step": 6269 + }, + { + "epoch": 3.099369670003708, + "grad_norm": 0.1347635891904945, + "learning_rate": 2.4027725240627198e-06, + "loss": 0.5178, + "step": 6270 + }, + { + "epoch": 3.099864046471388, + "grad_norm": 0.13447216606801263, + "learning_rate": 2.4002411074662536e-06, + "loss": 0.5177, + "step": 6271 + }, + { + "epoch": 3.100358422939068, + "grad_norm": 0.1314617072045183, + "learning_rate": 2.3977108431756602e-06, + "loss": 0.49, + "step": 6272 + }, + { + "epoch": 3.100852799406748, + "grad_norm": 0.13520716518771753, + "learning_rate": 2.3951817315745906e-06, + "loss": 0.5287, + "step": 6273 + }, + { + "epoch": 3.1013471758744284, + "grad_norm": 0.13937359992785583, + "learning_rate": 2.3926537730465126e-06, + "loss": 0.544, + "step": 6274 + }, + { + "epoch": 3.1018415523421083, + "grad_norm": 0.13913758384208966, + "learning_rate": 2.390126967974732e-06, + "loss": 0.5503, + "step": 6275 + }, + { + "epoch": 3.1023359288097887, + "grad_norm": 0.1395112885487003, + "learning_rate": 2.3876013167423663e-06, + "loss": 0.5224, + "step": 6276 + }, + { + "epoch": 3.1028303052774686, + "grad_norm": 0.1313183172192515, + "learning_rate": 2.3850768197323694e-06, + "loss": 0.5377, + "step": 6277 + }, + { + "epoch": 3.103324681745149, + "grad_norm": 0.133958615269009, + "learning_rate": 2.3825534773275163e-06, + "loss": 0.5203, + "step": 6278 + }, + { + "epoch": 3.103819058212829, + "grad_norm": 0.13686327864480344, + "learning_rate": 2.380031289910405e-06, + "loss": 0.4988, + "step": 6279 + }, + { + "epoch": 3.1043134346805092, + "grad_norm": 0.12962062187131293, + "learning_rate": 2.377510257863457e-06, + "loss": 0.4999, + "step": 6280 + }, + { + "epoch": 3.104807811148189, + "grad_norm": 0.14213995927029455, + "learning_rate": 2.3749903815689256e-06, + "loss": 0.5531, + "step": 6281 + }, + { + "epoch": 3.1053021876158695, + "grad_norm": 0.13053730406923947, + "learning_rate": 2.3724716614088795e-06, + "loss": 0.4984, + "step": 6282 + }, + { + "epoch": 3.1057965640835494, + "grad_norm": 0.13432128231739157, + "learning_rate": 2.3699540977652214e-06, + "loss": 0.5016, + "step": 6283 + }, + { + "epoch": 3.10629094055123, + "grad_norm": 0.1358078464681723, + "learning_rate": 2.367437691019675e-06, + "loss": 0.5561, + "step": 6284 + }, + { + "epoch": 3.1067853170189097, + "grad_norm": 0.1364432429567574, + "learning_rate": 2.364922441553785e-06, + "loss": 0.5005, + "step": 6285 + }, + { + "epoch": 3.10727969348659, + "grad_norm": 0.13991253277008509, + "learning_rate": 2.362408349748924e-06, + "loss": 0.4937, + "step": 6286 + }, + { + "epoch": 3.10777406995427, + "grad_norm": 0.1374700395583838, + "learning_rate": 2.3598954159862873e-06, + "loss": 0.5148, + "step": 6287 + }, + { + "epoch": 3.1082684464219503, + "grad_norm": 0.13410556139205618, + "learning_rate": 2.357383640646902e-06, + "loss": 0.4998, + "step": 6288 + }, + { + "epoch": 3.1087628228896302, + "grad_norm": 0.13665669043717513, + "learning_rate": 2.354873024111606e-06, + "loss": 0.5391, + "step": 6289 + }, + { + "epoch": 3.1092571993573106, + "grad_norm": 0.133738740219279, + "learning_rate": 2.3523635667610755e-06, + "loss": 0.5305, + "step": 6290 + }, + { + "epoch": 3.1097515758249905, + "grad_norm": 0.13137511093714818, + "learning_rate": 2.349855268975799e-06, + "loss": 0.5122, + "step": 6291 + }, + { + "epoch": 3.110245952292671, + "grad_norm": 0.13404860286575956, + "learning_rate": 2.3473481311361e-06, + "loss": 0.5413, + "step": 6292 + }, + { + "epoch": 3.1107403287603512, + "grad_norm": 0.13485224456930103, + "learning_rate": 2.3448421536221155e-06, + "loss": 0.5183, + "step": 6293 + }, + { + "epoch": 3.111234705228031, + "grad_norm": 0.1352579809248161, + "learning_rate": 2.342337336813817e-06, + "loss": 0.5523, + "step": 6294 + }, + { + "epoch": 3.1117290816957115, + "grad_norm": 0.13446998098893684, + "learning_rate": 2.3398336810909884e-06, + "loss": 0.4962, + "step": 6295 + }, + { + "epoch": 3.1122234581633914, + "grad_norm": 0.13915234161419096, + "learning_rate": 2.3373311868332516e-06, + "loss": 0.5138, + "step": 6296 + }, + { + "epoch": 3.1127178346310718, + "grad_norm": 0.14319646862929286, + "learning_rate": 2.3348298544200374e-06, + "loss": 0.5331, + "step": 6297 + }, + { + "epoch": 3.1132122110987517, + "grad_norm": 0.15090764188752023, + "learning_rate": 2.332329684230612e-06, + "loss": 0.5747, + "step": 6298 + }, + { + "epoch": 3.113706587566432, + "grad_norm": 0.13518662390317612, + "learning_rate": 2.3298306766440627e-06, + "loss": 0.5118, + "step": 6299 + }, + { + "epoch": 3.114200964034112, + "grad_norm": 0.13138985639882766, + "learning_rate": 2.327332832039294e-06, + "loss": 0.5233, + "step": 6300 + }, + { + "epoch": 3.1146953405017923, + "grad_norm": 0.1342771846364822, + "learning_rate": 2.3248361507950457e-06, + "loss": 0.5218, + "step": 6301 + }, + { + "epoch": 3.1151897169694722, + "grad_norm": 0.14135521595883077, + "learning_rate": 2.32234063328987e-06, + "loss": 0.5602, + "step": 6302 + }, + { + "epoch": 3.1156840934371526, + "grad_norm": 0.13137654908032395, + "learning_rate": 2.319846279902147e-06, + "loss": 0.5282, + "step": 6303 + }, + { + "epoch": 3.1161784699048325, + "grad_norm": 0.13665365885639444, + "learning_rate": 2.3173530910100804e-06, + "loss": 0.5145, + "step": 6304 + }, + { + "epoch": 3.116672846372513, + "grad_norm": 0.1330037910255093, + "learning_rate": 2.3148610669917016e-06, + "loss": 0.5261, + "step": 6305 + }, + { + "epoch": 3.1171672228401928, + "grad_norm": 0.13589383253317994, + "learning_rate": 2.3123702082248568e-06, + "loss": 0.5404, + "step": 6306 + }, + { + "epoch": 3.117661599307873, + "grad_norm": 0.1328669071698612, + "learning_rate": 2.3098805150872226e-06, + "loss": 0.4863, + "step": 6307 + }, + { + "epoch": 3.118155975775553, + "grad_norm": 0.13440185849627276, + "learning_rate": 2.3073919879562968e-06, + "loss": 0.5475, + "step": 6308 + }, + { + "epoch": 3.1186503522432334, + "grad_norm": 0.13619052785092498, + "learning_rate": 2.304904627209392e-06, + "loss": 0.5128, + "step": 6309 + }, + { + "epoch": 3.1191447287109133, + "grad_norm": 0.13393064433072158, + "learning_rate": 2.302418433223664e-06, + "loss": 0.5568, + "step": 6310 + }, + { + "epoch": 3.1196391051785937, + "grad_norm": 0.14027013241621578, + "learning_rate": 2.2999334063760738e-06, + "loss": 0.5301, + "step": 6311 + }, + { + "epoch": 3.1201334816462736, + "grad_norm": 0.12747386698893012, + "learning_rate": 2.297449547043409e-06, + "loss": 0.503, + "step": 6312 + }, + { + "epoch": 3.120627858113954, + "grad_norm": 0.13702072838173526, + "learning_rate": 2.294966855602285e-06, + "loss": 0.515, + "step": 6313 + }, + { + "epoch": 3.121122234581634, + "grad_norm": 0.12823041957024947, + "learning_rate": 2.292485332429135e-06, + "loss": 0.5035, + "step": 6314 + }, + { + "epoch": 3.121616611049314, + "grad_norm": 0.1320484269220559, + "learning_rate": 2.2900049779002197e-06, + "loss": 0.4983, + "step": 6315 + }, + { + "epoch": 3.122110987516994, + "grad_norm": 0.13550517268700452, + "learning_rate": 2.2875257923916215e-06, + "loss": 0.5357, + "step": 6316 + }, + { + "epoch": 3.1226053639846745, + "grad_norm": 0.143382120264739, + "learning_rate": 2.2850477762792446e-06, + "loss": 0.5595, + "step": 6317 + }, + { + "epoch": 3.1230997404523544, + "grad_norm": 0.1373484454842762, + "learning_rate": 2.282570929938809e-06, + "loss": 0.5387, + "step": 6318 + }, + { + "epoch": 3.1235941169200347, + "grad_norm": 0.13460687053155437, + "learning_rate": 2.280095253745873e-06, + "loss": 0.5688, + "step": 6319 + }, + { + "epoch": 3.1240884933877147, + "grad_norm": 0.13545363397724264, + "learning_rate": 2.2776207480758017e-06, + "loss": 0.5238, + "step": 6320 + }, + { + "epoch": 3.124582869855395, + "grad_norm": 0.13890509659814135, + "learning_rate": 2.275147413303791e-06, + "loss": 0.5528, + "step": 6321 + }, + { + "epoch": 3.125077246323075, + "grad_norm": 0.14126846728657785, + "learning_rate": 2.272675249804863e-06, + "loss": 0.5492, + "step": 6322 + }, + { + "epoch": 3.1255716227907553, + "grad_norm": 0.13537236791796606, + "learning_rate": 2.270204257953851e-06, + "loss": 0.5779, + "step": 6323 + }, + { + "epoch": 3.126065999258435, + "grad_norm": 0.13370334375486448, + "learning_rate": 2.2677344381254207e-06, + "loss": 0.4994, + "step": 6324 + }, + { + "epoch": 3.1265603757261156, + "grad_norm": 0.1339753959523645, + "learning_rate": 2.2652657906940545e-06, + "loss": 0.5322, + "step": 6325 + }, + { + "epoch": 3.1270547521937955, + "grad_norm": 0.13767690422641018, + "learning_rate": 2.262798316034053e-06, + "loss": 0.5593, + "step": 6326 + }, + { + "epoch": 3.127549128661476, + "grad_norm": 0.13823564806678484, + "learning_rate": 2.2603320145195553e-06, + "loss": 0.5098, + "step": 6327 + }, + { + "epoch": 3.1280435051291557, + "grad_norm": 0.135200354101789, + "learning_rate": 2.257866886524507e-06, + "loss": 0.5134, + "step": 6328 + }, + { + "epoch": 3.128537881596836, + "grad_norm": 0.13518376983065838, + "learning_rate": 2.2554029324226767e-06, + "loss": 0.4969, + "step": 6329 + }, + { + "epoch": 3.129032258064516, + "grad_norm": 0.1366074802451495, + "learning_rate": 2.2529401525876647e-06, + "loss": 0.508, + "step": 6330 + }, + { + "epoch": 3.1295266345321964, + "grad_norm": 0.13422474768502998, + "learning_rate": 2.2504785473928836e-06, + "loss": 0.5074, + "step": 6331 + }, + { + "epoch": 3.1300210109998763, + "grad_norm": 0.13927823929332542, + "learning_rate": 2.2480181172115735e-06, + "loss": 0.5747, + "step": 6332 + }, + { + "epoch": 3.1305153874675566, + "grad_norm": 0.14335065919270126, + "learning_rate": 2.245558862416798e-06, + "loss": 0.5224, + "step": 6333 + }, + { + "epoch": 3.1310097639352366, + "grad_norm": 0.1366884160442147, + "learning_rate": 2.2431007833814356e-06, + "loss": 0.568, + "step": 6334 + }, + { + "epoch": 3.131504140402917, + "grad_norm": 0.19264557064905263, + "learning_rate": 2.240643880478187e-06, + "loss": 0.57, + "step": 6335 + }, + { + "epoch": 3.131998516870597, + "grad_norm": 0.13417847927586823, + "learning_rate": 2.2381881540795847e-06, + "loss": 0.5188, + "step": 6336 + }, + { + "epoch": 3.132492893338277, + "grad_norm": 0.13841513219439844, + "learning_rate": 2.2357336045579694e-06, + "loss": 0.547, + "step": 6337 + }, + { + "epoch": 3.132987269805957, + "grad_norm": 0.13919036473837884, + "learning_rate": 2.2332802322855117e-06, + "loss": 0.519, + "step": 6338 + }, + { + "epoch": 3.1334816462736375, + "grad_norm": 0.1311282508176853, + "learning_rate": 2.230828037634205e-06, + "loss": 0.5291, + "step": 6339 + }, + { + "epoch": 3.1339760227413174, + "grad_norm": 0.14001468401969175, + "learning_rate": 2.228377020975856e-06, + "loss": 0.5479, + "step": 6340 + }, + { + "epoch": 3.1344703992089977, + "grad_norm": 0.13413236723741725, + "learning_rate": 2.2259271826821036e-06, + "loss": 0.5464, + "step": 6341 + }, + { + "epoch": 3.1349647756766776, + "grad_norm": 0.13431344152892918, + "learning_rate": 2.223478523124395e-06, + "loss": 0.5256, + "step": 6342 + }, + { + "epoch": 3.135459152144358, + "grad_norm": 0.14068584655701594, + "learning_rate": 2.2210310426740122e-06, + "loss": 0.5642, + "step": 6343 + }, + { + "epoch": 3.135953528612038, + "grad_norm": 0.13777570633610867, + "learning_rate": 2.2185847417020457e-06, + "loss": 0.5654, + "step": 6344 + }, + { + "epoch": 3.1364479050797183, + "grad_norm": 0.13651115778686704, + "learning_rate": 2.2161396205794205e-06, + "loss": 0.5173, + "step": 6345 + }, + { + "epoch": 3.136942281547398, + "grad_norm": 0.13805292419816442, + "learning_rate": 2.2136956796768693e-06, + "loss": 0.5288, + "step": 6346 + }, + { + "epoch": 3.1374366580150785, + "grad_norm": 0.13478801730978865, + "learning_rate": 2.2112529193649568e-06, + "loss": 0.5031, + "step": 6347 + }, + { + "epoch": 3.1379310344827585, + "grad_norm": 0.13459799569766662, + "learning_rate": 2.20881134001406e-06, + "loss": 0.5434, + "step": 6348 + }, + { + "epoch": 3.138425410950439, + "grad_norm": 0.1381218308640791, + "learning_rate": 2.2063709419943825e-06, + "loss": 0.5754, + "step": 6349 + }, + { + "epoch": 3.1389197874181187, + "grad_norm": 0.134976288104411, + "learning_rate": 2.203931725675951e-06, + "loss": 0.5209, + "step": 6350 + }, + { + "epoch": 3.139414163885799, + "grad_norm": 0.13352875222727234, + "learning_rate": 2.2014936914286056e-06, + "loss": 0.5204, + "step": 6351 + }, + { + "epoch": 3.139908540353479, + "grad_norm": 0.13393463763379393, + "learning_rate": 2.199056839622009e-06, + "loss": 0.5157, + "step": 6352 + }, + { + "epoch": 3.1404029168211594, + "grad_norm": 0.13524396192262372, + "learning_rate": 2.1966211706256492e-06, + "loss": 0.5128, + "step": 6353 + }, + { + "epoch": 3.1408972932888393, + "grad_norm": 0.13945056607033426, + "learning_rate": 2.1941866848088335e-06, + "loss": 0.5432, + "step": 6354 + }, + { + "epoch": 3.1413916697565196, + "grad_norm": 0.13452860652614446, + "learning_rate": 2.1917533825406844e-06, + "loss": 0.4901, + "step": 6355 + }, + { + "epoch": 3.1418860462241995, + "grad_norm": 0.1337565998991981, + "learning_rate": 2.189321264190153e-06, + "loss": 0.4957, + "step": 6356 + }, + { + "epoch": 3.14238042269188, + "grad_norm": 0.13311785067977577, + "learning_rate": 2.186890330126006e-06, + "loss": 0.5323, + "step": 6357 + }, + { + "epoch": 3.14287479915956, + "grad_norm": 0.13687994431118636, + "learning_rate": 2.184460580716823e-06, + "loss": 0.4914, + "step": 6358 + }, + { + "epoch": 3.14336917562724, + "grad_norm": 0.1349578837664706, + "learning_rate": 2.182032016331026e-06, + "loss": 0.5254, + "step": 6359 + }, + { + "epoch": 3.14386355209492, + "grad_norm": 0.14319583616577103, + "learning_rate": 2.1796046373368375e-06, + "loss": 0.5346, + "step": 6360 + }, + { + "epoch": 3.1443579285626004, + "grad_norm": 0.14057065574468183, + "learning_rate": 2.1771784441023023e-06, + "loss": 0.5287, + "step": 6361 + }, + { + "epoch": 3.1448523050302803, + "grad_norm": 0.13392236815525857, + "learning_rate": 2.174753436995297e-06, + "loss": 0.51, + "step": 6362 + }, + { + "epoch": 3.1453466814979607, + "grad_norm": 0.1349403848464933, + "learning_rate": 2.172329616383505e-06, + "loss": 0.5393, + "step": 6363 + }, + { + "epoch": 3.145841057965641, + "grad_norm": 0.13850829240877377, + "learning_rate": 2.1699069826344365e-06, + "loss": 0.5248, + "step": 6364 + }, + { + "epoch": 3.146335434433321, + "grad_norm": 0.1383318249670346, + "learning_rate": 2.1674855361154257e-06, + "loss": 0.5357, + "step": 6365 + }, + { + "epoch": 3.146829810901001, + "grad_norm": 0.14016966689548954, + "learning_rate": 2.1650652771936164e-06, + "loss": 0.5174, + "step": 6366 + }, + { + "epoch": 3.1473241873686812, + "grad_norm": 0.13632662992011976, + "learning_rate": 2.162646206235982e-06, + "loss": 0.5093, + "step": 6367 + }, + { + "epoch": 3.1478185638363616, + "grad_norm": 0.13229206862579498, + "learning_rate": 2.16022832360931e-06, + "loss": 0.5537, + "step": 6368 + }, + { + "epoch": 3.1483129403040415, + "grad_norm": 0.13365606580457703, + "learning_rate": 2.1578116296802077e-06, + "loss": 0.5163, + "step": 6369 + }, + { + "epoch": 3.1488073167717214, + "grad_norm": 0.13608509173231897, + "learning_rate": 2.1553961248151045e-06, + "loss": 0.5357, + "step": 6370 + }, + { + "epoch": 3.149301693239402, + "grad_norm": 0.1360342376426291, + "learning_rate": 2.1529818093802534e-06, + "loss": 0.4948, + "step": 6371 + }, + { + "epoch": 3.149796069707082, + "grad_norm": 0.1396987406010729, + "learning_rate": 2.150568683741716e-06, + "loss": 0.4796, + "step": 6372 + }, + { + "epoch": 3.150290446174762, + "grad_norm": 0.13452081372310384, + "learning_rate": 2.1481567482653863e-06, + "loss": 0.531, + "step": 6373 + }, + { + "epoch": 3.1507848226424424, + "grad_norm": 0.13768829073900357, + "learning_rate": 2.145746003316969e-06, + "loss": 0.518, + "step": 6374 + }, + { + "epoch": 3.1512791991101223, + "grad_norm": 0.1325271650327455, + "learning_rate": 2.1433364492619846e-06, + "loss": 0.5119, + "step": 6375 + }, + { + "epoch": 3.1517735755778027, + "grad_norm": 0.1430150442550929, + "learning_rate": 2.14092808646579e-06, + "loss": 0.5329, + "step": 6376 + }, + { + "epoch": 3.1522679520454826, + "grad_norm": 0.12795063917966765, + "learning_rate": 2.138520915293547e-06, + "loss": 0.5191, + "step": 6377 + }, + { + "epoch": 3.152762328513163, + "grad_norm": 0.13511163764883785, + "learning_rate": 2.1361149361102364e-06, + "loss": 0.5152, + "step": 6378 + }, + { + "epoch": 3.153256704980843, + "grad_norm": 0.14078513879145135, + "learning_rate": 2.1337101492806676e-06, + "loss": 0.556, + "step": 6379 + }, + { + "epoch": 3.1537510814485232, + "grad_norm": 0.13458853260429854, + "learning_rate": 2.13130655516946e-06, + "loss": 0.551, + "step": 6380 + }, + { + "epoch": 3.154245457916203, + "grad_norm": 0.13905451480587494, + "learning_rate": 2.128904154141058e-06, + "loss": 0.5593, + "step": 6381 + }, + { + "epoch": 3.1547398343838835, + "grad_norm": 0.13296508154102418, + "learning_rate": 2.1265029465597263e-06, + "loss": 0.5008, + "step": 6382 + }, + { + "epoch": 3.1552342108515634, + "grad_norm": 0.13804671528509652, + "learning_rate": 2.1241029327895425e-06, + "loss": 0.5186, + "step": 6383 + }, + { + "epoch": 3.1557285873192438, + "grad_norm": 0.1363518337643648, + "learning_rate": 2.1217041131944037e-06, + "loss": 0.5072, + "step": 6384 + }, + { + "epoch": 3.1562229637869237, + "grad_norm": 0.13425492323538732, + "learning_rate": 2.119306488138034e-06, + "loss": 0.5108, + "step": 6385 + }, + { + "epoch": 3.156717340254604, + "grad_norm": 0.1355652711115016, + "learning_rate": 2.116910057983965e-06, + "loss": 0.5273, + "step": 6386 + }, + { + "epoch": 3.157211716722284, + "grad_norm": 0.13385284509285295, + "learning_rate": 2.114514823095557e-06, + "loss": 0.543, + "step": 6387 + }, + { + "epoch": 3.1577060931899643, + "grad_norm": 0.13716191493947397, + "learning_rate": 2.112120783835987e-06, + "loss": 0.5375, + "step": 6388 + }, + { + "epoch": 3.1582004696576442, + "grad_norm": 0.1344862114244329, + "learning_rate": 2.1097279405682435e-06, + "loss": 0.5388, + "step": 6389 + }, + { + "epoch": 3.1586948461253246, + "grad_norm": 0.1286630949150916, + "learning_rate": 2.1073362936551435e-06, + "loss": 0.4816, + "step": 6390 + }, + { + "epoch": 3.1591892225930045, + "grad_norm": 0.13605030389272224, + "learning_rate": 2.104945843459314e-06, + "loss": 0.5193, + "step": 6391 + }, + { + "epoch": 3.159683599060685, + "grad_norm": 0.13289597183377916, + "learning_rate": 2.102556590343208e-06, + "loss": 0.5149, + "step": 6392 + }, + { + "epoch": 3.1601779755283648, + "grad_norm": 0.1417165919337047, + "learning_rate": 2.1001685346690936e-06, + "loss": 0.5754, + "step": 6393 + }, + { + "epoch": 3.160672351996045, + "grad_norm": 0.1330153467160395, + "learning_rate": 2.0977816767990567e-06, + "loss": 0.5316, + "step": 6394 + }, + { + "epoch": 3.161166728463725, + "grad_norm": 0.13358765960139168, + "learning_rate": 2.0953960170949995e-06, + "loss": 0.5324, + "step": 6395 + }, + { + "epoch": 3.1616611049314054, + "grad_norm": 0.1610165533048913, + "learning_rate": 2.0930115559186502e-06, + "loss": 0.5206, + "step": 6396 + }, + { + "epoch": 3.1621554813990853, + "grad_norm": 0.1408638546793124, + "learning_rate": 2.0906282936315448e-06, + "loss": 0.5364, + "step": 6397 + }, + { + "epoch": 3.1626498578667657, + "grad_norm": 0.14031053168325877, + "learning_rate": 2.0882462305950457e-06, + "loss": 0.5234, + "step": 6398 + }, + { + "epoch": 3.1631442343344456, + "grad_norm": 0.13348106253352687, + "learning_rate": 2.0858653671703346e-06, + "loss": 0.5216, + "step": 6399 + }, + { + "epoch": 3.163638610802126, + "grad_norm": 0.13729280910219724, + "learning_rate": 2.083485703718403e-06, + "loss": 0.5307, + "step": 6400 + }, + { + "epoch": 3.164132987269806, + "grad_norm": 0.14027843155566505, + "learning_rate": 2.0811072406000643e-06, + "loss": 0.526, + "step": 6401 + }, + { + "epoch": 3.164627363737486, + "grad_norm": 0.13901941807308954, + "learning_rate": 2.078729978175952e-06, + "loss": 0.5442, + "step": 6402 + }, + { + "epoch": 3.165121740205166, + "grad_norm": 0.130926283867922, + "learning_rate": 2.0763539168065196e-06, + "loss": 0.495, + "step": 6403 + }, + { + "epoch": 3.1656161166728465, + "grad_norm": 0.13566939248090087, + "learning_rate": 2.073979056852029e-06, + "loss": 0.4984, + "step": 6404 + }, + { + "epoch": 3.1661104931405264, + "grad_norm": 0.13504265942556767, + "learning_rate": 2.0716053986725714e-06, + "loss": 0.5316, + "step": 6405 + }, + { + "epoch": 3.1666048696082068, + "grad_norm": 0.13364479562417722, + "learning_rate": 2.0692329426280477e-06, + "loss": 0.4901, + "step": 6406 + }, + { + "epoch": 3.1670992460758867, + "grad_norm": 0.1335520339947215, + "learning_rate": 2.0668616890781777e-06, + "loss": 0.5103, + "step": 6407 + }, + { + "epoch": 3.167593622543567, + "grad_norm": 0.13421317404132357, + "learning_rate": 2.064491638382501e-06, + "loss": 0.5505, + "step": 6408 + }, + { + "epoch": 3.168087999011247, + "grad_norm": 0.1344335964304134, + "learning_rate": 2.0621227909003782e-06, + "loss": 0.5154, + "step": 6409 + }, + { + "epoch": 3.1685823754789273, + "grad_norm": 0.13600542937602156, + "learning_rate": 2.059755146990978e-06, + "loss": 0.5207, + "step": 6410 + }, + { + "epoch": 3.169076751946607, + "grad_norm": 0.13673987270437316, + "learning_rate": 2.057388707013297e-06, + "loss": 0.5343, + "step": 6411 + }, + { + "epoch": 3.1695711284142876, + "grad_norm": 0.13586581888923527, + "learning_rate": 2.05502347132614e-06, + "loss": 0.4963, + "step": 6412 + }, + { + "epoch": 3.1700655048819675, + "grad_norm": 0.1373481849168852, + "learning_rate": 2.052659440288134e-06, + "loss": 0.5118, + "step": 6413 + }, + { + "epoch": 3.170559881349648, + "grad_norm": 0.12886349425867344, + "learning_rate": 2.0502966142577274e-06, + "loss": 0.5063, + "step": 6414 + }, + { + "epoch": 3.1710542578173277, + "grad_norm": 0.13284505239500574, + "learning_rate": 2.0479349935931747e-06, + "loss": 0.5123, + "step": 6415 + }, + { + "epoch": 3.171548634285008, + "grad_norm": 0.13542284898431733, + "learning_rate": 2.0455745786525616e-06, + "loss": 0.4889, + "step": 6416 + }, + { + "epoch": 3.172043010752688, + "grad_norm": 0.12930924347980355, + "learning_rate": 2.0432153697937786e-06, + "loss": 0.5398, + "step": 6417 + }, + { + "epoch": 3.1725373872203684, + "grad_norm": 0.13199591516084874, + "learning_rate": 2.040857367374538e-06, + "loss": 0.5301, + "step": 6418 + }, + { + "epoch": 3.1730317636880483, + "grad_norm": 0.13430828206675166, + "learning_rate": 2.03850057175237e-06, + "loss": 0.519, + "step": 6419 + }, + { + "epoch": 3.1735261401557286, + "grad_norm": 0.13350129809013028, + "learning_rate": 2.036144983284626e-06, + "loss": 0.5145, + "step": 6420 + }, + { + "epoch": 3.1740205166234086, + "grad_norm": 0.13264807402375298, + "learning_rate": 2.0337906023284628e-06, + "loss": 0.4872, + "step": 6421 + }, + { + "epoch": 3.174514893091089, + "grad_norm": 0.13084030764853552, + "learning_rate": 2.0314374292408677e-06, + "loss": 0.4949, + "step": 6422 + }, + { + "epoch": 3.175009269558769, + "grad_norm": 0.13889565973446186, + "learning_rate": 2.029085464378635e-06, + "loss": 0.5349, + "step": 6423 + }, + { + "epoch": 3.175503646026449, + "grad_norm": 0.144496151977773, + "learning_rate": 2.0267347080983733e-06, + "loss": 0.5367, + "step": 6424 + }, + { + "epoch": 3.175998022494129, + "grad_norm": 0.13305233740893957, + "learning_rate": 2.0243851607565258e-06, + "loss": 0.5355, + "step": 6425 + }, + { + "epoch": 3.1764923989618095, + "grad_norm": 0.13620209033050734, + "learning_rate": 2.0220368227093336e-06, + "loss": 0.5544, + "step": 6426 + }, + { + "epoch": 3.1769867754294894, + "grad_norm": 0.13569379307511373, + "learning_rate": 2.0196896943128584e-06, + "loss": 0.5226, + "step": 6427 + }, + { + "epoch": 3.1774811518971697, + "grad_norm": 0.13010806817637108, + "learning_rate": 2.0173437759229875e-06, + "loss": 0.5177, + "step": 6428 + }, + { + "epoch": 3.1779755283648496, + "grad_norm": 0.13430562577934838, + "learning_rate": 2.0149990678954124e-06, + "loss": 0.5172, + "step": 6429 + }, + { + "epoch": 3.17846990483253, + "grad_norm": 0.13743445959951656, + "learning_rate": 2.012655570585649e-06, + "loss": 0.5351, + "step": 6430 + }, + { + "epoch": 3.17896428130021, + "grad_norm": 0.13994065034641442, + "learning_rate": 2.0103132843490313e-06, + "loss": 0.5114, + "step": 6431 + }, + { + "epoch": 3.1794586577678903, + "grad_norm": 0.13484795679150588, + "learning_rate": 2.007972209540705e-06, + "loss": 0.5374, + "step": 6432 + }, + { + "epoch": 3.17995303423557, + "grad_norm": 0.13411714397088847, + "learning_rate": 2.005632346515628e-06, + "loss": 0.5179, + "step": 6433 + }, + { + "epoch": 3.1804474107032505, + "grad_norm": 0.1345188100697839, + "learning_rate": 2.0032936956285866e-06, + "loss": 0.5352, + "step": 6434 + }, + { + "epoch": 3.1809417871709305, + "grad_norm": 0.13440699935533224, + "learning_rate": 2.00095625723417e-06, + "loss": 0.5504, + "step": 6435 + }, + { + "epoch": 3.181436163638611, + "grad_norm": 0.13406246274357597, + "learning_rate": 1.998620031686793e-06, + "loss": 0.4902, + "step": 6436 + }, + { + "epoch": 3.1819305401062907, + "grad_norm": 0.13439247850193056, + "learning_rate": 1.996285019340687e-06, + "loss": 0.5317, + "step": 6437 + }, + { + "epoch": 3.182424916573971, + "grad_norm": 0.13296233023860096, + "learning_rate": 1.993951220549889e-06, + "loss": 0.5271, + "step": 6438 + }, + { + "epoch": 3.1829192930416514, + "grad_norm": 0.1391460420787914, + "learning_rate": 1.9916186356682667e-06, + "loss": 0.6015, + "step": 6439 + }, + { + "epoch": 3.1834136695093314, + "grad_norm": 0.134370424739954, + "learning_rate": 1.9892872650494875e-06, + "loss": 0.5376, + "step": 6440 + }, + { + "epoch": 3.1839080459770113, + "grad_norm": 0.13756900088619747, + "learning_rate": 1.9869571090470484e-06, + "loss": 0.5186, + "step": 6441 + }, + { + "epoch": 3.1844024224446916, + "grad_norm": 0.13644938293182274, + "learning_rate": 1.9846281680142587e-06, + "loss": 0.4931, + "step": 6442 + }, + { + "epoch": 3.184896798912372, + "grad_norm": 0.13994074353284067, + "learning_rate": 1.982300442304238e-06, + "loss": 0.5106, + "step": 6443 + }, + { + "epoch": 3.185391175380052, + "grad_norm": 0.13314522299617346, + "learning_rate": 1.9799739322699252e-06, + "loss": 0.5282, + "step": 6444 + }, + { + "epoch": 3.1858855518477323, + "grad_norm": 0.14303994177409876, + "learning_rate": 1.977648638264078e-06, + "loss": 0.5504, + "step": 6445 + }, + { + "epoch": 3.186379928315412, + "grad_norm": 0.13786345091220198, + "learning_rate": 1.9753245606392622e-06, + "loss": 0.5417, + "step": 6446 + }, + { + "epoch": 3.1868743047830925, + "grad_norm": 0.13692324708504747, + "learning_rate": 1.9730016997478673e-06, + "loss": 0.5309, + "step": 6447 + }, + { + "epoch": 3.1873686812507724, + "grad_norm": 0.1415513535566022, + "learning_rate": 1.970680055942097e-06, + "loss": 0.5368, + "step": 6448 + }, + { + "epoch": 3.187863057718453, + "grad_norm": 0.13413162212763147, + "learning_rate": 1.968359629573965e-06, + "loss": 0.5228, + "step": 6449 + }, + { + "epoch": 3.1883574341861327, + "grad_norm": 0.13967014629711266, + "learning_rate": 1.966040420995301e-06, + "loss": 0.522, + "step": 6450 + }, + { + "epoch": 3.188851810653813, + "grad_norm": 0.13565475345068703, + "learning_rate": 1.963722430557756e-06, + "loss": 0.5182, + "step": 6451 + }, + { + "epoch": 3.189346187121493, + "grad_norm": 0.13534505306896272, + "learning_rate": 1.961405658612795e-06, + "loss": 0.5364, + "step": 6452 + }, + { + "epoch": 3.1898405635891733, + "grad_norm": 0.1335345146533755, + "learning_rate": 1.9590901055116918e-06, + "loss": 0.5394, + "step": 6453 + }, + { + "epoch": 3.1903349400568533, + "grad_norm": 0.13918256883025568, + "learning_rate": 1.9567757716055446e-06, + "loss": 0.529, + "step": 6454 + }, + { + "epoch": 3.1908293165245336, + "grad_norm": 0.13303518122911895, + "learning_rate": 1.9544626572452563e-06, + "loss": 0.5159, + "step": 6455 + }, + { + "epoch": 3.1913236929922135, + "grad_norm": 0.12912233061021508, + "learning_rate": 1.952150762781556e-06, + "loss": 0.4873, + "step": 6456 + }, + { + "epoch": 3.191818069459894, + "grad_norm": 0.14284657592655342, + "learning_rate": 1.9498400885649783e-06, + "loss": 0.5059, + "step": 6457 + }, + { + "epoch": 3.192312445927574, + "grad_norm": 0.13657947342519697, + "learning_rate": 1.94753063494588e-06, + "loss": 0.5757, + "step": 6458 + }, + { + "epoch": 3.192806822395254, + "grad_norm": 0.13410764105848302, + "learning_rate": 1.945222402274427e-06, + "loss": 0.5128, + "step": 6459 + }, + { + "epoch": 3.193301198862934, + "grad_norm": 0.13552871686208764, + "learning_rate": 1.942915390900606e-06, + "loss": 0.5095, + "step": 6460 + }, + { + "epoch": 3.1937955753306144, + "grad_norm": 0.13498300753315512, + "learning_rate": 1.9406096011742102e-06, + "loss": 0.5271, + "step": 6461 + }, + { + "epoch": 3.1942899517982943, + "grad_norm": 0.1391522924403562, + "learning_rate": 1.9383050334448563e-06, + "loss": 0.5313, + "step": 6462 + }, + { + "epoch": 3.1947843282659747, + "grad_norm": 0.13314899013561995, + "learning_rate": 1.9360016880619735e-06, + "loss": 0.5073, + "step": 6463 + }, + { + "epoch": 3.1952787047336546, + "grad_norm": 0.13111168171437743, + "learning_rate": 1.9336995653748005e-06, + "loss": 0.5348, + "step": 6464 + }, + { + "epoch": 3.195773081201335, + "grad_norm": 0.1335274090787306, + "learning_rate": 1.9313986657323993e-06, + "loss": 0.5048, + "step": 6465 + }, + { + "epoch": 3.196267457669015, + "grad_norm": 0.14169320312450104, + "learning_rate": 1.9290989894836375e-06, + "loss": 0.5269, + "step": 6466 + }, + { + "epoch": 3.1967618341366952, + "grad_norm": 0.13142519240498798, + "learning_rate": 1.926800536977199e-06, + "loss": 0.5226, + "step": 6467 + }, + { + "epoch": 3.197256210604375, + "grad_norm": 0.1375846181270154, + "learning_rate": 1.9245033085615893e-06, + "loss": 0.5293, + "step": 6468 + }, + { + "epoch": 3.1977505870720555, + "grad_norm": 0.13244455331523025, + "learning_rate": 1.922207304585123e-06, + "loss": 0.5197, + "step": 6469 + }, + { + "epoch": 3.1982449635397354, + "grad_norm": 0.13732741788258973, + "learning_rate": 1.919912525395925e-06, + "loss": 0.5627, + "step": 6470 + }, + { + "epoch": 3.1987393400074158, + "grad_norm": 0.13824615175431318, + "learning_rate": 1.917618971341946e-06, + "loss": 0.5174, + "step": 6471 + }, + { + "epoch": 3.1992337164750957, + "grad_norm": 0.13625752732247762, + "learning_rate": 1.9153266427709385e-06, + "loss": 0.5009, + "step": 6472 + }, + { + "epoch": 3.199728092942776, + "grad_norm": 0.1304476989310387, + "learning_rate": 1.913035540030471e-06, + "loss": 0.4977, + "step": 6473 + }, + { + "epoch": 3.200222469410456, + "grad_norm": 0.1343508188622306, + "learning_rate": 1.910745663467939e-06, + "loss": 0.5194, + "step": 6474 + }, + { + "epoch": 3.2007168458781363, + "grad_norm": 0.13652152953115437, + "learning_rate": 1.908457013430539e-06, + "loss": 0.5374, + "step": 6475 + }, + { + "epoch": 3.2012112223458162, + "grad_norm": 0.14098243636913668, + "learning_rate": 1.9061695902652821e-06, + "loss": 0.5408, + "step": 6476 + }, + { + "epoch": 3.2017055988134966, + "grad_norm": 0.1371777304058209, + "learning_rate": 1.9038833943190004e-06, + "loss": 0.54, + "step": 6477 + }, + { + "epoch": 3.2021999752811765, + "grad_norm": 0.13550077355265205, + "learning_rate": 1.9015984259383336e-06, + "loss": 0.5508, + "step": 6478 + }, + { + "epoch": 3.202694351748857, + "grad_norm": 0.13026182981150117, + "learning_rate": 1.8993146854697376e-06, + "loss": 0.5215, + "step": 6479 + }, + { + "epoch": 3.2031887282165368, + "grad_norm": 0.13468179069646455, + "learning_rate": 1.8970321732594866e-06, + "loss": 0.5095, + "step": 6480 + }, + { + "epoch": 3.203683104684217, + "grad_norm": 0.13773434055814882, + "learning_rate": 1.8947508896536603e-06, + "loss": 0.5229, + "step": 6481 + }, + { + "epoch": 3.204177481151897, + "grad_norm": 0.13955035416184355, + "learning_rate": 1.8924708349981557e-06, + "loss": 0.5669, + "step": 6482 + }, + { + "epoch": 3.2046718576195774, + "grad_norm": 0.13053009906959784, + "learning_rate": 1.8901920096386862e-06, + "loss": 0.502, + "step": 6483 + }, + { + "epoch": 3.2051662340872573, + "grad_norm": 0.13528908302018122, + "learning_rate": 1.8879144139207727e-06, + "loss": 0.5137, + "step": 6484 + }, + { + "epoch": 3.2056606105549377, + "grad_norm": 0.13372433840184692, + "learning_rate": 1.8856380481897574e-06, + "loss": 0.523, + "step": 6485 + }, + { + "epoch": 3.2061549870226176, + "grad_norm": 0.13757556657435271, + "learning_rate": 1.8833629127907916e-06, + "loss": 0.5238, + "step": 6486 + }, + { + "epoch": 3.206649363490298, + "grad_norm": 0.13255141980189902, + "learning_rate": 1.8810890080688371e-06, + "loss": 0.5176, + "step": 6487 + }, + { + "epoch": 3.207143739957978, + "grad_norm": 0.13800919106424653, + "learning_rate": 1.878816334368677e-06, + "loss": 0.535, + "step": 6488 + }, + { + "epoch": 3.207638116425658, + "grad_norm": 0.13765261635910764, + "learning_rate": 1.8765448920349017e-06, + "loss": 0.5087, + "step": 6489 + }, + { + "epoch": 3.208132492893338, + "grad_norm": 0.13502399821108974, + "learning_rate": 1.8742746814119095e-06, + "loss": 0.5095, + "step": 6490 + }, + { + "epoch": 3.2086268693610185, + "grad_norm": 0.1388471326154822, + "learning_rate": 1.8720057028439298e-06, + "loss": 0.5387, + "step": 6491 + }, + { + "epoch": 3.2091212458286984, + "grad_norm": 0.1390523146732163, + "learning_rate": 1.8697379566749907e-06, + "loss": 0.5314, + "step": 6492 + }, + { + "epoch": 3.2096156222963788, + "grad_norm": 0.1403606920779692, + "learning_rate": 1.8674714432489315e-06, + "loss": 0.5369, + "step": 6493 + }, + { + "epoch": 3.2101099987640587, + "grad_norm": 0.13755461099996452, + "learning_rate": 1.8652061629094166e-06, + "loss": 0.5699, + "step": 6494 + }, + { + "epoch": 3.210604375231739, + "grad_norm": 0.15127559339724547, + "learning_rate": 1.8629421159999128e-06, + "loss": 0.5356, + "step": 6495 + }, + { + "epoch": 3.211098751699419, + "grad_norm": 0.1312627558428505, + "learning_rate": 1.860679302863705e-06, + "loss": 0.516, + "step": 6496 + }, + { + "epoch": 3.2115931281670993, + "grad_norm": 0.13712996666070926, + "learning_rate": 1.8584177238438927e-06, + "loss": 0.5454, + "step": 6497 + }, + { + "epoch": 3.212087504634779, + "grad_norm": 0.12906300532879292, + "learning_rate": 1.8561573792833831e-06, + "loss": 0.5108, + "step": 6498 + }, + { + "epoch": 3.2125818811024596, + "grad_norm": 0.1378562386094819, + "learning_rate": 1.8538982695248975e-06, + "loss": 0.5236, + "step": 6499 + }, + { + "epoch": 3.2130762575701395, + "grad_norm": 0.1388678772129554, + "learning_rate": 1.851640394910974e-06, + "loss": 0.5424, + "step": 6500 + }, + { + "epoch": 3.21357063403782, + "grad_norm": 0.13492332039614308, + "learning_rate": 1.8493837557839579e-06, + "loss": 0.5345, + "step": 6501 + }, + { + "epoch": 3.2140650105054998, + "grad_norm": 0.1329047451634621, + "learning_rate": 1.8471283524860095e-06, + "loss": 0.483, + "step": 6502 + }, + { + "epoch": 3.21455938697318, + "grad_norm": 0.12995461462325342, + "learning_rate": 1.8448741853591078e-06, + "loss": 0.5126, + "step": 6503 + }, + { + "epoch": 3.21505376344086, + "grad_norm": 0.1328398134874236, + "learning_rate": 1.8426212547450307e-06, + "loss": 0.5385, + "step": 6504 + }, + { + "epoch": 3.2155481399085404, + "grad_norm": 0.14274318950987597, + "learning_rate": 1.8403695609853845e-06, + "loss": 0.5233, + "step": 6505 + }, + { + "epoch": 3.2160425163762203, + "grad_norm": 0.13596393820717403, + "learning_rate": 1.8381191044215718e-06, + "loss": 0.5485, + "step": 6506 + }, + { + "epoch": 3.2165368928439007, + "grad_norm": 0.14046875480836923, + "learning_rate": 1.8358698853948232e-06, + "loss": 0.5355, + "step": 6507 + }, + { + "epoch": 3.2170312693115806, + "grad_norm": 0.13913679830628475, + "learning_rate": 1.833621904246169e-06, + "loss": 0.5158, + "step": 6508 + }, + { + "epoch": 3.217525645779261, + "grad_norm": 0.13224242100174924, + "learning_rate": 1.831375161316461e-06, + "loss": 0.5119, + "step": 6509 + }, + { + "epoch": 3.2180200222469413, + "grad_norm": 0.13506293501345085, + "learning_rate": 1.8291296569463556e-06, + "loss": 0.5022, + "step": 6510 + }, + { + "epoch": 3.218514398714621, + "grad_norm": 0.13327475751616163, + "learning_rate": 1.8268853914763297e-06, + "loss": 0.5081, + "step": 6511 + }, + { + "epoch": 3.219008775182301, + "grad_norm": 0.13991120210065394, + "learning_rate": 1.8246423652466616e-06, + "loss": 0.5239, + "step": 6512 + }, + { + "epoch": 3.2195031516499815, + "grad_norm": 0.13181270516324764, + "learning_rate": 1.822400578597453e-06, + "loss": 0.5312, + "step": 6513 + }, + { + "epoch": 3.219997528117662, + "grad_norm": 0.13596176570490712, + "learning_rate": 1.820160031868613e-06, + "loss": 0.5169, + "step": 6514 + }, + { + "epoch": 3.2204919045853417, + "grad_norm": 0.13294279366530815, + "learning_rate": 1.8179207253998598e-06, + "loss": 0.5188, + "step": 6515 + }, + { + "epoch": 3.2209862810530216, + "grad_norm": 0.1371503328260008, + "learning_rate": 1.8156826595307243e-06, + "loss": 0.5237, + "step": 6516 + }, + { + "epoch": 3.221480657520702, + "grad_norm": 0.13247014677432808, + "learning_rate": 1.8134458346005535e-06, + "loss": 0.5352, + "step": 6517 + }, + { + "epoch": 3.2219750339883824, + "grad_norm": 0.13582034233031992, + "learning_rate": 1.8112102509485063e-06, + "loss": 0.5016, + "step": 6518 + }, + { + "epoch": 3.2224694104560623, + "grad_norm": 0.13710340570112214, + "learning_rate": 1.8089759089135462e-06, + "loss": 0.5604, + "step": 6519 + }, + { + "epoch": 3.2229637869237426, + "grad_norm": 0.13456460104055887, + "learning_rate": 1.8067428088344563e-06, + "loss": 0.5285, + "step": 6520 + }, + { + "epoch": 3.2234581633914225, + "grad_norm": 0.13631131508475916, + "learning_rate": 1.8045109510498272e-06, + "loss": 0.4895, + "step": 6521 + }, + { + "epoch": 3.223952539859103, + "grad_norm": 0.13500658364666507, + "learning_rate": 1.8022803358980567e-06, + "loss": 0.5571, + "step": 6522 + }, + { + "epoch": 3.224446916326783, + "grad_norm": 0.13791101625907368, + "learning_rate": 1.8000509637173702e-06, + "loss": 0.548, + "step": 6523 + }, + { + "epoch": 3.224941292794463, + "grad_norm": 0.1388380614233981, + "learning_rate": 1.7978228348457882e-06, + "loss": 0.5029, + "step": 6524 + }, + { + "epoch": 3.225435669262143, + "grad_norm": 0.13503726505866545, + "learning_rate": 1.7955959496211462e-06, + "loss": 0.5221, + "step": 6525 + }, + { + "epoch": 3.2259300457298234, + "grad_norm": 0.13385562539753573, + "learning_rate": 1.7933703083810983e-06, + "loss": 0.531, + "step": 6526 + }, + { + "epoch": 3.2264244221975034, + "grad_norm": 0.13325961774606818, + "learning_rate": 1.7911459114631003e-06, + "loss": 0.5326, + "step": 6527 + }, + { + "epoch": 3.2269187986651837, + "grad_norm": 0.12912600959496798, + "learning_rate": 1.7889227592044278e-06, + "loss": 0.5375, + "step": 6528 + }, + { + "epoch": 3.2274131751328636, + "grad_norm": 0.12862406998386897, + "learning_rate": 1.7867008519421647e-06, + "loss": 0.5328, + "step": 6529 + }, + { + "epoch": 3.227907551600544, + "grad_norm": 0.13184501715924996, + "learning_rate": 1.7844801900132014e-06, + "loss": 0.519, + "step": 6530 + }, + { + "epoch": 3.228401928068224, + "grad_norm": 0.14305734560625452, + "learning_rate": 1.7822607737542485e-06, + "loss": 0.5341, + "step": 6531 + }, + { + "epoch": 3.2288963045359043, + "grad_norm": 0.13730505720371222, + "learning_rate": 1.7800426035018204e-06, + "loss": 0.5481, + "step": 6532 + }, + { + "epoch": 3.229390681003584, + "grad_norm": 0.1341427665037422, + "learning_rate": 1.777825679592242e-06, + "loss": 0.5102, + "step": 6533 + }, + { + "epoch": 3.2298850574712645, + "grad_norm": 0.13644951260464758, + "learning_rate": 1.7756100023616552e-06, + "loss": 0.534, + "step": 6534 + }, + { + "epoch": 3.2303794339389444, + "grad_norm": 0.1344157614046281, + "learning_rate": 1.7733955721460116e-06, + "loss": 0.5053, + "step": 6535 + }, + { + "epoch": 3.230873810406625, + "grad_norm": 0.13443352639639553, + "learning_rate": 1.7711823892810687e-06, + "loss": 0.5035, + "step": 6536 + }, + { + "epoch": 3.2313681868743047, + "grad_norm": 0.12591387881340846, + "learning_rate": 1.768970454102401e-06, + "loss": 0.5037, + "step": 6537 + }, + { + "epoch": 3.231862563341985, + "grad_norm": 0.1368462614707958, + "learning_rate": 1.7667597669453895e-06, + "loss": 0.5281, + "step": 6538 + }, + { + "epoch": 3.232356939809665, + "grad_norm": 0.13455520400900012, + "learning_rate": 1.7645503281452226e-06, + "loss": 0.5106, + "step": 6539 + }, + { + "epoch": 3.2328513162773453, + "grad_norm": 0.1322768806336325, + "learning_rate": 1.7623421380369133e-06, + "loss": 0.5145, + "step": 6540 + }, + { + "epoch": 3.2333456927450253, + "grad_norm": 0.12932922820306939, + "learning_rate": 1.7601351969552726e-06, + "loss": 0.5177, + "step": 6541 + }, + { + "epoch": 3.2338400692127056, + "grad_norm": 0.1319050450213728, + "learning_rate": 1.7579295052349232e-06, + "loss": 0.5063, + "step": 6542 + }, + { + "epoch": 3.2343344456803855, + "grad_norm": 0.13321210723778903, + "learning_rate": 1.755725063210304e-06, + "loss": 0.5227, + "step": 6543 + }, + { + "epoch": 3.234828822148066, + "grad_norm": 0.13687262495562832, + "learning_rate": 1.7535218712156587e-06, + "loss": 0.5108, + "step": 6544 + }, + { + "epoch": 3.235323198615746, + "grad_norm": 0.13865724401930657, + "learning_rate": 1.7513199295850446e-06, + "loss": 0.5292, + "step": 6545 + }, + { + "epoch": 3.235817575083426, + "grad_norm": 0.1282387308899099, + "learning_rate": 1.7491192386523325e-06, + "loss": 0.5282, + "step": 6546 + }, + { + "epoch": 3.236311951551106, + "grad_norm": 0.1386811035914259, + "learning_rate": 1.7469197987511977e-06, + "loss": 0.5541, + "step": 6547 + }, + { + "epoch": 3.2368063280187864, + "grad_norm": 0.14448087988024363, + "learning_rate": 1.7447216102151254e-06, + "loss": 0.5668, + "step": 6548 + }, + { + "epoch": 3.2373007044864663, + "grad_norm": 0.13212582364517148, + "learning_rate": 1.742524673377418e-06, + "loss": 0.5341, + "step": 6549 + }, + { + "epoch": 3.2377950809541467, + "grad_norm": 0.1344174417370179, + "learning_rate": 1.7403289885711793e-06, + "loss": 0.5083, + "step": 6550 + }, + { + "epoch": 3.2382894574218266, + "grad_norm": 0.14362229513215638, + "learning_rate": 1.7381345561293306e-06, + "loss": 0.5391, + "step": 6551 + }, + { + "epoch": 3.238783833889507, + "grad_norm": 0.1339733758086635, + "learning_rate": 1.7359413763846022e-06, + "loss": 0.5548, + "step": 6552 + }, + { + "epoch": 3.239278210357187, + "grad_norm": 0.13869364741165865, + "learning_rate": 1.7337494496695295e-06, + "loss": 0.5319, + "step": 6553 + }, + { + "epoch": 3.2397725868248672, + "grad_norm": 0.1334712609205356, + "learning_rate": 1.7315587763164642e-06, + "loss": 0.5641, + "step": 6554 + }, + { + "epoch": 3.240266963292547, + "grad_norm": 0.13940472890181307, + "learning_rate": 1.729369356657562e-06, + "loss": 0.5423, + "step": 6555 + }, + { + "epoch": 3.2407613397602275, + "grad_norm": 0.13619021595201616, + "learning_rate": 1.7271811910247916e-06, + "loss": 0.5473, + "step": 6556 + }, + { + "epoch": 3.2412557162279074, + "grad_norm": 0.13091455669434604, + "learning_rate": 1.7249942797499364e-06, + "loss": 0.5298, + "step": 6557 + }, + { + "epoch": 3.241750092695588, + "grad_norm": 0.1346068157339882, + "learning_rate": 1.7228086231645803e-06, + "loss": 0.5416, + "step": 6558 + }, + { + "epoch": 3.2422444691632677, + "grad_norm": 0.13278371257059746, + "learning_rate": 1.720624221600119e-06, + "loss": 0.523, + "step": 6559 + }, + { + "epoch": 3.242738845630948, + "grad_norm": 0.13476751879273322, + "learning_rate": 1.7184410753877668e-06, + "loss": 0.5237, + "step": 6560 + }, + { + "epoch": 3.243233222098628, + "grad_norm": 0.13664846342118128, + "learning_rate": 1.7162591848585341e-06, + "loss": 0.5337, + "step": 6561 + }, + { + "epoch": 3.2437275985663083, + "grad_norm": 0.1308615654589546, + "learning_rate": 1.7140785503432511e-06, + "loss": 0.4988, + "step": 6562 + }, + { + "epoch": 3.2442219750339882, + "grad_norm": 0.137737489922039, + "learning_rate": 1.7118991721725576e-06, + "loss": 0.5091, + "step": 6563 + }, + { + "epoch": 3.2447163515016686, + "grad_norm": 0.13454268723456067, + "learning_rate": 1.7097210506768958e-06, + "loss": 0.5457, + "step": 6564 + }, + { + "epoch": 3.2452107279693485, + "grad_norm": 0.13970749783617348, + "learning_rate": 1.7075441861865193e-06, + "loss": 0.5497, + "step": 6565 + }, + { + "epoch": 3.245705104437029, + "grad_norm": 0.14943710791229645, + "learning_rate": 1.7053685790314966e-06, + "loss": 0.579, + "step": 6566 + }, + { + "epoch": 3.246199480904709, + "grad_norm": 0.140360936413045, + "learning_rate": 1.7031942295417025e-06, + "loss": 0.5726, + "step": 6567 + }, + { + "epoch": 3.246693857372389, + "grad_norm": 0.13449395876314207, + "learning_rate": 1.7010211380468167e-06, + "loss": 0.5012, + "step": 6568 + }, + { + "epoch": 3.247188233840069, + "grad_norm": 0.13074914585684771, + "learning_rate": 1.6988493048763376e-06, + "loss": 0.5067, + "step": 6569 + }, + { + "epoch": 3.2476826103077494, + "grad_norm": 0.1341878174222683, + "learning_rate": 1.6966787303595633e-06, + "loss": 0.5309, + "step": 6570 + }, + { + "epoch": 3.2481769867754293, + "grad_norm": 0.13319278474392338, + "learning_rate": 1.6945094148256035e-06, + "loss": 0.5596, + "step": 6571 + }, + { + "epoch": 3.2486713632431097, + "grad_norm": 0.13660275500181035, + "learning_rate": 1.6923413586033799e-06, + "loss": 0.5435, + "step": 6572 + }, + { + "epoch": 3.2491657397107896, + "grad_norm": 0.14048190455226922, + "learning_rate": 1.6901745620216258e-06, + "loss": 0.5648, + "step": 6573 + }, + { + "epoch": 3.24966011617847, + "grad_norm": 0.1344772806912378, + "learning_rate": 1.6880090254088744e-06, + "loss": 0.5058, + "step": 6574 + }, + { + "epoch": 3.25015449264615, + "grad_norm": 0.13961552901409616, + "learning_rate": 1.6858447490934771e-06, + "loss": 0.5494, + "step": 6575 + }, + { + "epoch": 3.25064886911383, + "grad_norm": 0.13481942831690882, + "learning_rate": 1.683681733403586e-06, + "loss": 0.5339, + "step": 6576 + }, + { + "epoch": 3.25114324558151, + "grad_norm": 0.13708456990764556, + "learning_rate": 1.6815199786671688e-06, + "loss": 0.5686, + "step": 6577 + }, + { + "epoch": 3.2516376220491905, + "grad_norm": 0.1340460959458437, + "learning_rate": 1.679359485212001e-06, + "loss": 0.4915, + "step": 6578 + }, + { + "epoch": 3.2516376220491905, + "eval_loss": 0.6430450677871704, + "eval_runtime": 81.6988, + "eval_samples_per_second": 371.535, + "eval_steps_per_second": 46.451, + "step": 6578 + }, + { + "epoch": 3.2521319985168704, + "grad_norm": 0.13675071190557772, + "learning_rate": 1.6772002533656594e-06, + "loss": 0.5365, + "step": 6579 + }, + { + "epoch": 3.2526263749845508, + "grad_norm": 0.13371500542111484, + "learning_rate": 1.6750422834555434e-06, + "loss": 0.5062, + "step": 6580 + }, + { + "epoch": 3.253120751452231, + "grad_norm": 0.1283074570389727, + "learning_rate": 1.672885575808848e-06, + "loss": 0.5032, + "step": 6581 + }, + { + "epoch": 3.253615127919911, + "grad_norm": 0.1345088837528528, + "learning_rate": 1.6707301307525803e-06, + "loss": 0.5291, + "step": 6582 + }, + { + "epoch": 3.254109504387591, + "grad_norm": 0.1352894149049901, + "learning_rate": 1.66857594861356e-06, + "loss": 0.4902, + "step": 6583 + }, + { + "epoch": 3.2546038808552713, + "grad_norm": 0.1332104684264614, + "learning_rate": 1.666423029718416e-06, + "loss": 0.5152, + "step": 6584 + }, + { + "epoch": 3.2550982573229517, + "grad_norm": 0.12975548220849556, + "learning_rate": 1.6642713743935756e-06, + "loss": 0.5118, + "step": 6585 + }, + { + "epoch": 3.2555926337906316, + "grad_norm": 0.1326609054302735, + "learning_rate": 1.6621209829652872e-06, + "loss": 0.5227, + "step": 6586 + }, + { + "epoch": 3.2560870102583115, + "grad_norm": 0.13375710595437962, + "learning_rate": 1.6599718557596002e-06, + "loss": 0.5132, + "step": 6587 + }, + { + "epoch": 3.256581386725992, + "grad_norm": 0.13618738903387992, + "learning_rate": 1.6578239931023687e-06, + "loss": 0.5095, + "step": 6588 + }, + { + "epoch": 3.257075763193672, + "grad_norm": 0.1351241918178531, + "learning_rate": 1.6556773953192685e-06, + "loss": 0.5597, + "step": 6589 + }, + { + "epoch": 3.257570139661352, + "grad_norm": 0.13706818670450208, + "learning_rate": 1.6535320627357732e-06, + "loss": 0.5471, + "step": 6590 + }, + { + "epoch": 3.258064516129032, + "grad_norm": 0.13685228037738936, + "learning_rate": 1.6513879956771617e-06, + "loss": 0.5135, + "step": 6591 + }, + { + "epoch": 3.2585588925967124, + "grad_norm": 0.13590781705139207, + "learning_rate": 1.6492451944685317e-06, + "loss": 0.5594, + "step": 6592 + }, + { + "epoch": 3.2590532690643927, + "grad_norm": 0.13455619641163463, + "learning_rate": 1.647103659434779e-06, + "loss": 0.5352, + "step": 6593 + }, + { + "epoch": 3.2595476455320727, + "grad_norm": 0.13671979688710106, + "learning_rate": 1.6449633909006136e-06, + "loss": 0.4858, + "step": 6594 + }, + { + "epoch": 3.2600420219997526, + "grad_norm": 0.14344785798031343, + "learning_rate": 1.6428243891905539e-06, + "loss": 0.5301, + "step": 6595 + }, + { + "epoch": 3.260536398467433, + "grad_norm": 0.1318477504323378, + "learning_rate": 1.6406866546289212e-06, + "loss": 0.5123, + "step": 6596 + }, + { + "epoch": 3.2610307749351133, + "grad_norm": 0.13420792209900206, + "learning_rate": 1.6385501875398468e-06, + "loss": 0.5174, + "step": 6597 + }, + { + "epoch": 3.261525151402793, + "grad_norm": 0.13394027523789953, + "learning_rate": 1.6364149882472735e-06, + "loss": 0.5636, + "step": 6598 + }, + { + "epoch": 3.2620195278704736, + "grad_norm": 0.13578502947998353, + "learning_rate": 1.6342810570749446e-06, + "loss": 0.5415, + "step": 6599 + }, + { + "epoch": 3.2625139043381535, + "grad_norm": 0.13752504019676046, + "learning_rate": 1.6321483943464168e-06, + "loss": 0.5256, + "step": 6600 + }, + { + "epoch": 3.263008280805834, + "grad_norm": 0.136217554430685, + "learning_rate": 1.6300170003850568e-06, + "loss": 0.5319, + "step": 6601 + }, + { + "epoch": 3.2635026572735137, + "grad_norm": 0.13651405854787044, + "learning_rate": 1.6278868755140288e-06, + "loss": 0.5293, + "step": 6602 + }, + { + "epoch": 3.263997033741194, + "grad_norm": 0.1371308540319755, + "learning_rate": 1.6257580200563172e-06, + "loss": 0.5281, + "step": 6603 + }, + { + "epoch": 3.264491410208874, + "grad_norm": 0.13841550528657579, + "learning_rate": 1.623630434334701e-06, + "loss": 0.5471, + "step": 6604 + }, + { + "epoch": 3.2649857866765544, + "grad_norm": 0.1387461871182086, + "learning_rate": 1.6215041186717773e-06, + "loss": 0.5365, + "step": 6605 + }, + { + "epoch": 3.2654801631442343, + "grad_norm": 0.13485844902285507, + "learning_rate": 1.6193790733899485e-06, + "loss": 0.5258, + "step": 6606 + }, + { + "epoch": 3.2659745396119146, + "grad_norm": 0.13735869609389806, + "learning_rate": 1.6172552988114199e-06, + "loss": 0.5332, + "step": 6607 + }, + { + "epoch": 3.2664689160795946, + "grad_norm": 0.13405676295701963, + "learning_rate": 1.615132795258204e-06, + "loss": 0.5235, + "step": 6608 + }, + { + "epoch": 3.266963292547275, + "grad_norm": 0.13012569369863064, + "learning_rate": 1.613011563052128e-06, + "loss": 0.5218, + "step": 6609 + }, + { + "epoch": 3.267457669014955, + "grad_norm": 0.13943546525503014, + "learning_rate": 1.610891602514818e-06, + "loss": 0.5261, + "step": 6610 + }, + { + "epoch": 3.267952045482635, + "grad_norm": 0.13572233503013145, + "learning_rate": 1.6087729139677121e-06, + "loss": 0.5198, + "step": 6611 + }, + { + "epoch": 3.268446421950315, + "grad_norm": 0.1349347927923128, + "learning_rate": 1.606655497732057e-06, + "loss": 0.5477, + "step": 6612 + }, + { + "epoch": 3.2689407984179955, + "grad_norm": 0.13995193792335633, + "learning_rate": 1.6045393541289022e-06, + "loss": 0.5174, + "step": 6613 + }, + { + "epoch": 3.2694351748856754, + "grad_norm": 0.13566468293282166, + "learning_rate": 1.6024244834791025e-06, + "loss": 0.4989, + "step": 6614 + }, + { + "epoch": 3.2699295513533557, + "grad_norm": 0.13957246498972886, + "learning_rate": 1.6003108861033256e-06, + "loss": 0.5277, + "step": 6615 + }, + { + "epoch": 3.2704239278210356, + "grad_norm": 0.13439600801677673, + "learning_rate": 1.5981985623220465e-06, + "loss": 0.565, + "step": 6616 + }, + { + "epoch": 3.270918304288716, + "grad_norm": 0.13241296457464458, + "learning_rate": 1.5960875124555386e-06, + "loss": 0.559, + "step": 6617 + }, + { + "epoch": 3.271412680756396, + "grad_norm": 0.14065020590942562, + "learning_rate": 1.593977736823894e-06, + "loss": 0.5415, + "step": 6618 + }, + { + "epoch": 3.2719070572240763, + "grad_norm": 0.13639475257293146, + "learning_rate": 1.5918692357469988e-06, + "loss": 0.5115, + "step": 6619 + }, + { + "epoch": 3.272401433691756, + "grad_norm": 0.141238453001373, + "learning_rate": 1.5897620095445587e-06, + "loss": 0.539, + "step": 6620 + }, + { + "epoch": 3.2728958101594365, + "grad_norm": 0.1351927084825902, + "learning_rate": 1.5876560585360735e-06, + "loss": 0.5427, + "step": 6621 + }, + { + "epoch": 3.2733901866271164, + "grad_norm": 0.13084767157514432, + "learning_rate": 1.585551383040862e-06, + "loss": 0.5, + "step": 6622 + }, + { + "epoch": 3.273884563094797, + "grad_norm": 0.13648472498388486, + "learning_rate": 1.583447983378037e-06, + "loss": 0.5129, + "step": 6623 + }, + { + "epoch": 3.2743789395624767, + "grad_norm": 0.13357246165379133, + "learning_rate": 1.581345859866531e-06, + "loss": 0.5383, + "step": 6624 + }, + { + "epoch": 3.274873316030157, + "grad_norm": 0.1400134291154146, + "learning_rate": 1.5792450128250714e-06, + "loss": 0.5406, + "step": 6625 + }, + { + "epoch": 3.275367692497837, + "grad_norm": 0.13558953742509913, + "learning_rate": 1.577145442572199e-06, + "loss": 0.5365, + "step": 6626 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 0.1310800031991842, + "learning_rate": 1.5750471494262609e-06, + "loss": 0.5302, + "step": 6627 + }, + { + "epoch": 3.2763564454331973, + "grad_norm": 0.13730613884675744, + "learning_rate": 1.5729501337054042e-06, + "loss": 0.5383, + "step": 6628 + }, + { + "epoch": 3.2768508219008776, + "grad_norm": 0.1362368673828054, + "learning_rate": 1.5708543957275923e-06, + "loss": 0.5193, + "step": 6629 + }, + { + "epoch": 3.2773451983685575, + "grad_norm": 0.13896707470124342, + "learning_rate": 1.5687599358105866e-06, + "loss": 0.5574, + "step": 6630 + }, + { + "epoch": 3.277839574836238, + "grad_norm": 0.13345251778003503, + "learning_rate": 1.5666667542719548e-06, + "loss": 0.5293, + "step": 6631 + }, + { + "epoch": 3.278333951303918, + "grad_norm": 0.13953881911715071, + "learning_rate": 1.564574851429076e-06, + "loss": 0.5247, + "step": 6632 + }, + { + "epoch": 3.278828327771598, + "grad_norm": 0.13884400588811102, + "learning_rate": 1.562484227599136e-06, + "loss": 0.5177, + "step": 6633 + }, + { + "epoch": 3.279322704239278, + "grad_norm": 0.1359291195733351, + "learning_rate": 1.5603948830991167e-06, + "loss": 0.5332, + "step": 6634 + }, + { + "epoch": 3.2798170807069584, + "grad_norm": 0.13538551061460266, + "learning_rate": 1.5583068182458205e-06, + "loss": 0.537, + "step": 6635 + }, + { + "epoch": 3.2803114571746383, + "grad_norm": 0.13622519302285185, + "learning_rate": 1.5562200333558442e-06, + "loss": 0.546, + "step": 6636 + }, + { + "epoch": 3.2808058336423187, + "grad_norm": 0.1342102222967283, + "learning_rate": 1.5541345287455889e-06, + "loss": 0.5206, + "step": 6637 + }, + { + "epoch": 3.2813002101099986, + "grad_norm": 0.13710843900796732, + "learning_rate": 1.5520503047312786e-06, + "loss": 0.5471, + "step": 6638 + }, + { + "epoch": 3.281794586577679, + "grad_norm": 0.13522261434152238, + "learning_rate": 1.5499673616289256e-06, + "loss": 0.5123, + "step": 6639 + }, + { + "epoch": 3.282288963045359, + "grad_norm": 0.13475943590623957, + "learning_rate": 1.5478856997543522e-06, + "loss": 0.5435, + "step": 6640 + }, + { + "epoch": 3.2827833395130392, + "grad_norm": 0.14057135749255092, + "learning_rate": 1.5458053194231938e-06, + "loss": 0.565, + "step": 6641 + }, + { + "epoch": 3.283277715980719, + "grad_norm": 0.13448200999124157, + "learning_rate": 1.5437262209508785e-06, + "loss": 0.5496, + "step": 6642 + }, + { + "epoch": 3.2837720924483995, + "grad_norm": 0.14284198551962105, + "learning_rate": 1.5416484046526537e-06, + "loss": 0.5265, + "step": 6643 + }, + { + "epoch": 3.2842664689160794, + "grad_norm": 0.13185709593072317, + "learning_rate": 1.539571870843566e-06, + "loss": 0.5342, + "step": 6644 + }, + { + "epoch": 3.28476084538376, + "grad_norm": 0.13575198132739194, + "learning_rate": 1.5374966198384656e-06, + "loss": 0.5302, + "step": 6645 + }, + { + "epoch": 3.2852552218514397, + "grad_norm": 0.13450203409924077, + "learning_rate": 1.5354226519520088e-06, + "loss": 0.5531, + "step": 6646 + }, + { + "epoch": 3.28574959831912, + "grad_norm": 0.14201156771486384, + "learning_rate": 1.533349967498663e-06, + "loss": 0.5637, + "step": 6647 + }, + { + "epoch": 3.2862439747868, + "grad_norm": 0.13782242908102643, + "learning_rate": 1.5312785667926933e-06, + "loss": 0.5124, + "step": 6648 + }, + { + "epoch": 3.2867383512544803, + "grad_norm": 0.13794026972259532, + "learning_rate": 1.5292084501481751e-06, + "loss": 0.5366, + "step": 6649 + }, + { + "epoch": 3.2872327277221602, + "grad_norm": 0.13574360003747074, + "learning_rate": 1.5271396178789898e-06, + "loss": 0.5499, + "step": 6650 + }, + { + "epoch": 3.2877271041898406, + "grad_norm": 0.13787197391622089, + "learning_rate": 1.5250720702988187e-06, + "loss": 0.5254, + "step": 6651 + }, + { + "epoch": 3.2882214806575205, + "grad_norm": 0.1368248219087095, + "learning_rate": 1.5230058077211552e-06, + "loss": 0.5303, + "step": 6652 + }, + { + "epoch": 3.288715857125201, + "grad_norm": 0.13297325881692637, + "learning_rate": 1.5209408304592922e-06, + "loss": 0.5407, + "step": 6653 + }, + { + "epoch": 3.289210233592881, + "grad_norm": 0.1325036288685499, + "learning_rate": 1.5188771388263258e-06, + "loss": 0.5207, + "step": 6654 + }, + { + "epoch": 3.289704610060561, + "grad_norm": 0.13778431881431172, + "learning_rate": 1.5168147331351702e-06, + "loss": 0.5384, + "step": 6655 + }, + { + "epoch": 3.2901989865282415, + "grad_norm": 0.16806416601052995, + "learning_rate": 1.5147536136985297e-06, + "loss": 0.585, + "step": 6656 + }, + { + "epoch": 3.2906933629959214, + "grad_norm": 0.13803428467370094, + "learning_rate": 1.5126937808289188e-06, + "loss": 0.5308, + "step": 6657 + }, + { + "epoch": 3.2911877394636013, + "grad_norm": 0.1379984621116054, + "learning_rate": 1.510635234838661e-06, + "loss": 0.559, + "step": 6658 + }, + { + "epoch": 3.2916821159312817, + "grad_norm": 0.1377237893058879, + "learning_rate": 1.5085779760398777e-06, + "loss": 0.5457, + "step": 6659 + }, + { + "epoch": 3.292176492398962, + "grad_norm": 0.13842965021496886, + "learning_rate": 1.5065220047445018e-06, + "loss": 0.499, + "step": 6660 + }, + { + "epoch": 3.292670868866642, + "grad_norm": 0.13874452120456687, + "learning_rate": 1.504467321264268e-06, + "loss": 0.5379, + "step": 6661 + }, + { + "epoch": 3.293165245334322, + "grad_norm": 0.14226482579127586, + "learning_rate": 1.502413925910714e-06, + "loss": 0.5534, + "step": 6662 + }, + { + "epoch": 3.2936596218020022, + "grad_norm": 0.1384543139105087, + "learning_rate": 1.5003618189951818e-06, + "loss": 0.5487, + "step": 6663 + }, + { + "epoch": 3.2941539982696826, + "grad_norm": 0.13351792882057778, + "learning_rate": 1.498311000828826e-06, + "loss": 0.508, + "step": 6664 + }, + { + "epoch": 3.2946483747373625, + "grad_norm": 0.13694936701539215, + "learning_rate": 1.496261471722593e-06, + "loss": 0.5475, + "step": 6665 + }, + { + "epoch": 3.2951427512050424, + "grad_norm": 0.13430124277606784, + "learning_rate": 1.4942132319872439e-06, + "loss": 0.5327, + "step": 6666 + }, + { + "epoch": 3.2956371276727228, + "grad_norm": 0.13768349062141239, + "learning_rate": 1.4921662819333438e-06, + "loss": 0.5593, + "step": 6667 + }, + { + "epoch": 3.296131504140403, + "grad_norm": 0.14088359596075378, + "learning_rate": 1.490120621871254e-06, + "loss": 0.5437, + "step": 6668 + }, + { + "epoch": 3.296625880608083, + "grad_norm": 0.1377207076046821, + "learning_rate": 1.4880762521111502e-06, + "loss": 0.5287, + "step": 6669 + }, + { + "epoch": 3.297120257075763, + "grad_norm": 0.13309519100567963, + "learning_rate": 1.486033172963005e-06, + "loss": 0.4987, + "step": 6670 + }, + { + "epoch": 3.2976146335434433, + "grad_norm": 0.13931789402429334, + "learning_rate": 1.4839913847366006e-06, + "loss": 0.5572, + "step": 6671 + }, + { + "epoch": 3.2981090100111237, + "grad_norm": 0.13642793386084787, + "learning_rate": 1.4819508877415189e-06, + "loss": 0.5388, + "step": 6672 + }, + { + "epoch": 3.2986033864788036, + "grad_norm": 0.13477543372525672, + "learning_rate": 1.4799116822871506e-06, + "loss": 0.5341, + "step": 6673 + }, + { + "epoch": 3.299097762946484, + "grad_norm": 0.1432852852039617, + "learning_rate": 1.4778737686826838e-06, + "loss": 0.5496, + "step": 6674 + }, + { + "epoch": 3.299592139414164, + "grad_norm": 0.131771238736388, + "learning_rate": 1.4758371472371212e-06, + "loss": 0.5368, + "step": 6675 + }, + { + "epoch": 3.300086515881844, + "grad_norm": 0.13976619775114163, + "learning_rate": 1.4738018182592584e-06, + "loss": 0.5031, + "step": 6676 + }, + { + "epoch": 3.300580892349524, + "grad_norm": 0.12734945993410948, + "learning_rate": 1.4717677820577014e-06, + "loss": 0.5371, + "step": 6677 + }, + { + "epoch": 3.3010752688172045, + "grad_norm": 0.1372734718504485, + "learning_rate": 1.4697350389408615e-06, + "loss": 0.5356, + "step": 6678 + }, + { + "epoch": 3.3015696452848844, + "grad_norm": 0.13930370152077515, + "learning_rate": 1.4677035892169511e-06, + "loss": 0.5587, + "step": 6679 + }, + { + "epoch": 3.3020640217525647, + "grad_norm": 0.13598078492713536, + "learning_rate": 1.4656734331939814e-06, + "loss": 0.5149, + "step": 6680 + }, + { + "epoch": 3.3025583982202447, + "grad_norm": 0.13948463240318087, + "learning_rate": 1.4636445711797765e-06, + "loss": 0.5349, + "step": 6681 + }, + { + "epoch": 3.303052774687925, + "grad_norm": 0.13860293989512934, + "learning_rate": 1.4616170034819633e-06, + "loss": 0.5453, + "step": 6682 + }, + { + "epoch": 3.303547151155605, + "grad_norm": 0.13969033482366267, + "learning_rate": 1.4595907304079647e-06, + "loss": 0.5491, + "step": 6683 + }, + { + "epoch": 3.3040415276232853, + "grad_norm": 0.1468763762183844, + "learning_rate": 1.4575657522650176e-06, + "loss": 0.5116, + "step": 6684 + }, + { + "epoch": 3.304535904090965, + "grad_norm": 0.12964701050626667, + "learning_rate": 1.455542069360153e-06, + "loss": 0.5108, + "step": 6685 + }, + { + "epoch": 3.3050302805586456, + "grad_norm": 0.13306344034339282, + "learning_rate": 1.4535196820002073e-06, + "loss": 0.5222, + "step": 6686 + }, + { + "epoch": 3.3055246570263255, + "grad_norm": 0.1384910148550862, + "learning_rate": 1.4514985904918322e-06, + "loss": 0.5095, + "step": 6687 + }, + { + "epoch": 3.306019033494006, + "grad_norm": 0.13414240267772018, + "learning_rate": 1.4494787951414669e-06, + "loss": 0.5197, + "step": 6688 + }, + { + "epoch": 3.3065134099616857, + "grad_norm": 0.13656902193107237, + "learning_rate": 1.4474602962553608e-06, + "loss": 0.5195, + "step": 6689 + }, + { + "epoch": 3.307007786429366, + "grad_norm": 0.13457915715355276, + "learning_rate": 1.4454430941395703e-06, + "loss": 0.5238, + "step": 6690 + }, + { + "epoch": 3.307502162897046, + "grad_norm": 0.1403417194504881, + "learning_rate": 1.4434271890999474e-06, + "loss": 0.5466, + "step": 6691 + }, + { + "epoch": 3.3079965393647264, + "grad_norm": 0.1315124745539502, + "learning_rate": 1.4414125814421542e-06, + "loss": 0.5304, + "step": 6692 + }, + { + "epoch": 3.3084909158324063, + "grad_norm": 0.13160761210692268, + "learning_rate": 1.4393992714716543e-06, + "loss": 0.5198, + "step": 6693 + }, + { + "epoch": 3.3089852923000866, + "grad_norm": 0.13408629514124423, + "learning_rate": 1.4373872594937123e-06, + "loss": 0.5188, + "step": 6694 + }, + { + "epoch": 3.3094796687677666, + "grad_norm": 0.13636539355855598, + "learning_rate": 1.4353765458133994e-06, + "loss": 0.4898, + "step": 6695 + }, + { + "epoch": 3.309974045235447, + "grad_norm": 0.13136340965058124, + "learning_rate": 1.4333671307355868e-06, + "loss": 0.5212, + "step": 6696 + }, + { + "epoch": 3.310468421703127, + "grad_norm": 0.13443304798626846, + "learning_rate": 1.431359014564947e-06, + "loss": 0.5601, + "step": 6697 + }, + { + "epoch": 3.310962798170807, + "grad_norm": 0.1392794271490108, + "learning_rate": 1.429352197605962e-06, + "loss": 0.5472, + "step": 6698 + }, + { + "epoch": 3.311457174638487, + "grad_norm": 0.13798525636796152, + "learning_rate": 1.4273466801629154e-06, + "loss": 0.5069, + "step": 6699 + }, + { + "epoch": 3.3119515511061675, + "grad_norm": 0.13990201583916398, + "learning_rate": 1.425342462539887e-06, + "loss": 0.5871, + "step": 6700 + }, + { + "epoch": 3.3124459275738474, + "grad_norm": 0.13368905782136084, + "learning_rate": 1.4233395450407683e-06, + "loss": 0.5067, + "step": 6701 + }, + { + "epoch": 3.3129403040415277, + "grad_norm": 0.14005583876245195, + "learning_rate": 1.4213379279692497e-06, + "loss": 0.5151, + "step": 6702 + }, + { + "epoch": 3.3134346805092076, + "grad_norm": 0.13248867101127285, + "learning_rate": 1.419337611628816e-06, + "loss": 0.4978, + "step": 6703 + }, + { + "epoch": 3.313929056976888, + "grad_norm": 0.1397678964928703, + "learning_rate": 1.4173385963227759e-06, + "loss": 0.5787, + "step": 6704 + }, + { + "epoch": 3.314423433444568, + "grad_norm": 0.12975296202191128, + "learning_rate": 1.4153408823542214e-06, + "loss": 0.4898, + "step": 6705 + }, + { + "epoch": 3.3149178099122483, + "grad_norm": 0.13030727491269686, + "learning_rate": 1.4133444700260535e-06, + "loss": 0.5272, + "step": 6706 + }, + { + "epoch": 3.315412186379928, + "grad_norm": 0.13498674049300552, + "learning_rate": 1.4113493596409788e-06, + "loss": 0.5484, + "step": 6707 + }, + { + "epoch": 3.3159065628476085, + "grad_norm": 0.13388660349835166, + "learning_rate": 1.4093555515015e-06, + "loss": 0.5317, + "step": 6708 + }, + { + "epoch": 3.3164009393152885, + "grad_norm": 0.13172556394807694, + "learning_rate": 1.4073630459099285e-06, + "loss": 0.5165, + "step": 6709 + }, + { + "epoch": 3.316895315782969, + "grad_norm": 0.13621025485686397, + "learning_rate": 1.4053718431683782e-06, + "loss": 0.527, + "step": 6710 + }, + { + "epoch": 3.3173896922506487, + "grad_norm": 0.13321958920929844, + "learning_rate": 1.4033819435787622e-06, + "loss": 0.5145, + "step": 6711 + }, + { + "epoch": 3.317884068718329, + "grad_norm": 0.13979512963989651, + "learning_rate": 1.4013933474427932e-06, + "loss": 0.5762, + "step": 6712 + }, + { + "epoch": 3.318378445186009, + "grad_norm": 0.13687358535694027, + "learning_rate": 1.399406055061996e-06, + "loss": 0.5134, + "step": 6713 + }, + { + "epoch": 3.3188728216536894, + "grad_norm": 0.13711306384580735, + "learning_rate": 1.3974200667376858e-06, + "loss": 0.5302, + "step": 6714 + }, + { + "epoch": 3.3193671981213693, + "grad_norm": 0.13999004365847978, + "learning_rate": 1.3954353827709887e-06, + "loss": 0.5493, + "step": 6715 + }, + { + "epoch": 3.3198615745890496, + "grad_norm": 0.13209316790741876, + "learning_rate": 1.393452003462834e-06, + "loss": 0.4987, + "step": 6716 + }, + { + "epoch": 3.3203559510567295, + "grad_norm": 0.131284998816581, + "learning_rate": 1.3914699291139444e-06, + "loss": 0.5101, + "step": 6717 + }, + { + "epoch": 3.32085032752441, + "grad_norm": 0.13449968657348177, + "learning_rate": 1.389489160024854e-06, + "loss": 0.5131, + "step": 6718 + }, + { + "epoch": 3.32134470399209, + "grad_norm": 0.13730781594893146, + "learning_rate": 1.38750969649589e-06, + "loss": 0.5424, + "step": 6719 + }, + { + "epoch": 3.32183908045977, + "grad_norm": 0.1398686532936773, + "learning_rate": 1.3855315388271918e-06, + "loss": 0.5983, + "step": 6720 + }, + { + "epoch": 3.32233345692745, + "grad_norm": 0.13380085851006604, + "learning_rate": 1.383554687318691e-06, + "loss": 0.5148, + "step": 6721 + }, + { + "epoch": 3.3228278333951304, + "grad_norm": 0.13363917862060137, + "learning_rate": 1.3815791422701308e-06, + "loss": 0.5262, + "step": 6722 + }, + { + "epoch": 3.3233222098628104, + "grad_norm": 0.13780373282481734, + "learning_rate": 1.3796049039810467e-06, + "loss": 0.5382, + "step": 6723 + }, + { + "epoch": 3.3238165863304907, + "grad_norm": 0.13642892930157377, + "learning_rate": 1.3776319727507836e-06, + "loss": 0.551, + "step": 6724 + }, + { + "epoch": 3.3243109627981706, + "grad_norm": 0.14703877509787416, + "learning_rate": 1.3756603488784826e-06, + "loss": 0.5315, + "step": 6725 + }, + { + "epoch": 3.324805339265851, + "grad_norm": 0.12745193952519765, + "learning_rate": 1.3736900326630908e-06, + "loss": 0.5124, + "step": 6726 + }, + { + "epoch": 3.325299715733531, + "grad_norm": 0.13085459693350715, + "learning_rate": 1.3717210244033562e-06, + "loss": 0.5227, + "step": 6727 + }, + { + "epoch": 3.3257940922012112, + "grad_norm": 0.13892297457894542, + "learning_rate": 1.3697533243978277e-06, + "loss": 0.5223, + "step": 6728 + }, + { + "epoch": 3.326288468668891, + "grad_norm": 0.13220448435821905, + "learning_rate": 1.3677869329448535e-06, + "loss": 0.54, + "step": 6729 + }, + { + "epoch": 3.3267828451365715, + "grad_norm": 0.1328984998012589, + "learning_rate": 1.3658218503425858e-06, + "loss": 0.5323, + "step": 6730 + }, + { + "epoch": 3.327277221604252, + "grad_norm": 0.12919724326439108, + "learning_rate": 1.363858076888983e-06, + "loss": 0.5185, + "step": 6731 + }, + { + "epoch": 3.327771598071932, + "grad_norm": 0.13559124460093816, + "learning_rate": 1.361895612881794e-06, + "loss": 0.5311, + "step": 6732 + }, + { + "epoch": 3.3282659745396117, + "grad_norm": 0.1355044641270287, + "learning_rate": 1.3599344586185813e-06, + "loss": 0.5095, + "step": 6733 + }, + { + "epoch": 3.328760351007292, + "grad_norm": 0.13317939234361928, + "learning_rate": 1.3579746143966998e-06, + "loss": 0.5421, + "step": 6734 + }, + { + "epoch": 3.3292547274749724, + "grad_norm": 0.13940926307727278, + "learning_rate": 1.356016080513306e-06, + "loss": 0.5709, + "step": 6735 + }, + { + "epoch": 3.3297491039426523, + "grad_norm": 0.13872877999032626, + "learning_rate": 1.3540588572653657e-06, + "loss": 0.5246, + "step": 6736 + }, + { + "epoch": 3.3302434804103322, + "grad_norm": 0.1339820798153959, + "learning_rate": 1.3521029449496404e-06, + "loss": 0.5479, + "step": 6737 + }, + { + "epoch": 3.3307378568780126, + "grad_norm": 0.13458205953122607, + "learning_rate": 1.3501483438626894e-06, + "loss": 0.5371, + "step": 6738 + }, + { + "epoch": 3.331232233345693, + "grad_norm": 0.1356768984736242, + "learning_rate": 1.3481950543008825e-06, + "loss": 0.5414, + "step": 6739 + }, + { + "epoch": 3.331726609813373, + "grad_norm": 0.1411931986951223, + "learning_rate": 1.3462430765603806e-06, + "loss": 0.5195, + "step": 6740 + }, + { + "epoch": 3.332220986281053, + "grad_norm": 0.13337411524354087, + "learning_rate": 1.3442924109371513e-06, + "loss": 0.5046, + "step": 6741 + }, + { + "epoch": 3.332715362748733, + "grad_norm": 0.13715001997998535, + "learning_rate": 1.342343057726967e-06, + "loss": 0.5273, + "step": 6742 + }, + { + "epoch": 3.3332097392164135, + "grad_norm": 0.14159662119370683, + "learning_rate": 1.34039501722539e-06, + "loss": 0.5715, + "step": 6743 + }, + { + "epoch": 3.3337041156840934, + "grad_norm": 0.1360923958051231, + "learning_rate": 1.338448289727795e-06, + "loss": 0.5152, + "step": 6744 + }, + { + "epoch": 3.3341984921517733, + "grad_norm": 0.1370121512053211, + "learning_rate": 1.3365028755293507e-06, + "loss": 0.5049, + "step": 6745 + }, + { + "epoch": 3.3346928686194537, + "grad_norm": 0.1324045307016685, + "learning_rate": 1.3345587749250255e-06, + "loss": 0.529, + "step": 6746 + }, + { + "epoch": 3.335187245087134, + "grad_norm": 0.13454528781605996, + "learning_rate": 1.3326159882095957e-06, + "loss": 0.5532, + "step": 6747 + }, + { + "epoch": 3.335681621554814, + "grad_norm": 0.13173091627647499, + "learning_rate": 1.3306745156776346e-06, + "loss": 0.5334, + "step": 6748 + }, + { + "epoch": 3.3361759980224943, + "grad_norm": 0.13332127413656597, + "learning_rate": 1.3287343576235123e-06, + "loss": 0.5176, + "step": 6749 + }, + { + "epoch": 3.3366703744901742, + "grad_norm": 0.13231711036803806, + "learning_rate": 1.326795514341408e-06, + "loss": 0.5276, + "step": 6750 + }, + { + "epoch": 3.3371647509578546, + "grad_norm": 0.13650482024147437, + "learning_rate": 1.3248579861252953e-06, + "loss": 0.5626, + "step": 6751 + }, + { + "epoch": 3.3376591274255345, + "grad_norm": 0.13476861263181583, + "learning_rate": 1.3229217732689448e-06, + "loss": 0.5378, + "step": 6752 + }, + { + "epoch": 3.338153503893215, + "grad_norm": 0.1423073045972271, + "learning_rate": 1.3209868760659406e-06, + "loss": 0.5263, + "step": 6753 + }, + { + "epoch": 3.3386478803608948, + "grad_norm": 0.13368948200255956, + "learning_rate": 1.319053294809658e-06, + "loss": 0.5403, + "step": 6754 + }, + { + "epoch": 3.339142256828575, + "grad_norm": 0.13799713596416652, + "learning_rate": 1.317121029793269e-06, + "loss": 0.566, + "step": 6755 + }, + { + "epoch": 3.339636633296255, + "grad_norm": 0.1325319690504343, + "learning_rate": 1.3151900813097585e-06, + "loss": 0.5102, + "step": 6756 + }, + { + "epoch": 3.3401310097639354, + "grad_norm": 0.13245112590493377, + "learning_rate": 1.3132604496518975e-06, + "loss": 0.5056, + "step": 6757 + }, + { + "epoch": 3.3406253862316153, + "grad_norm": 0.13827846942997685, + "learning_rate": 1.3113321351122688e-06, + "loss": 0.5183, + "step": 6758 + }, + { + "epoch": 3.3411197626992957, + "grad_norm": 0.13341773637650484, + "learning_rate": 1.3094051379832528e-06, + "loss": 0.5389, + "step": 6759 + }, + { + "epoch": 3.3416141391669756, + "grad_norm": 0.13400824608545858, + "learning_rate": 1.307479458557026e-06, + "loss": 0.5353, + "step": 6760 + }, + { + "epoch": 3.342108515634656, + "grad_norm": 0.13970276684424515, + "learning_rate": 1.305555097125566e-06, + "loss": 0.555, + "step": 6761 + }, + { + "epoch": 3.342602892102336, + "grad_norm": 0.13526700891314503, + "learning_rate": 1.3036320539806558e-06, + "loss": 0.501, + "step": 6762 + }, + { + "epoch": 3.343097268570016, + "grad_norm": 0.13193390532984667, + "learning_rate": 1.3017103294138712e-06, + "loss": 0.5095, + "step": 6763 + }, + { + "epoch": 3.343591645037696, + "grad_norm": 0.13195683591777918, + "learning_rate": 1.2997899237165935e-06, + "loss": 0.54, + "step": 6764 + }, + { + "epoch": 3.3440860215053765, + "grad_norm": 0.14062927641665435, + "learning_rate": 1.2978708371800054e-06, + "loss": 0.5616, + "step": 6765 + }, + { + "epoch": 3.3445803979730564, + "grad_norm": 0.13666696624209543, + "learning_rate": 1.2959530700950807e-06, + "loss": 0.5575, + "step": 6766 + }, + { + "epoch": 3.3450747744407368, + "grad_norm": 0.1322846774658295, + "learning_rate": 1.2940366227526035e-06, + "loss": 0.5025, + "step": 6767 + }, + { + "epoch": 3.3455691509084167, + "grad_norm": 0.13473820456590216, + "learning_rate": 1.2921214954431495e-06, + "loss": 0.5139, + "step": 6768 + }, + { + "epoch": 3.346063527376097, + "grad_norm": 0.1314930742643631, + "learning_rate": 1.2902076884571002e-06, + "loss": 0.5206, + "step": 6769 + }, + { + "epoch": 3.346557903843777, + "grad_norm": 0.13384203209836198, + "learning_rate": 1.2882952020846374e-06, + "loss": 0.5228, + "step": 6770 + }, + { + "epoch": 3.3470522803114573, + "grad_norm": 0.13734049648006558, + "learning_rate": 1.2863840366157366e-06, + "loss": 0.5095, + "step": 6771 + }, + { + "epoch": 3.347546656779137, + "grad_norm": 0.13254760885370045, + "learning_rate": 1.2844741923401739e-06, + "loss": 0.5174, + "step": 6772 + }, + { + "epoch": 3.3480410332468176, + "grad_norm": 0.13545298181933593, + "learning_rate": 1.282565669547533e-06, + "loss": 0.5506, + "step": 6773 + }, + { + "epoch": 3.3485354097144975, + "grad_norm": 0.1361500667514627, + "learning_rate": 1.2806584685271871e-06, + "loss": 0.5213, + "step": 6774 + }, + { + "epoch": 3.349029786182178, + "grad_norm": 0.13620357128170849, + "learning_rate": 1.2787525895683161e-06, + "loss": 0.5126, + "step": 6775 + }, + { + "epoch": 3.3495241626498578, + "grad_norm": 0.1377658470450938, + "learning_rate": 1.2768480329598975e-06, + "loss": 0.556, + "step": 6776 + }, + { + "epoch": 3.350018539117538, + "grad_norm": 0.1344595173561864, + "learning_rate": 1.2749447989907083e-06, + "loss": 0.5251, + "step": 6777 + }, + { + "epoch": 3.350512915585218, + "grad_norm": 0.13627372570302862, + "learning_rate": 1.2730428879493206e-06, + "loss": 0.5421, + "step": 6778 + }, + { + "epoch": 3.3510072920528984, + "grad_norm": 0.13637650408053614, + "learning_rate": 1.2711423001241118e-06, + "loss": 0.5297, + "step": 6779 + }, + { + "epoch": 3.3515016685205783, + "grad_norm": 0.13685702856707369, + "learning_rate": 1.2692430358032593e-06, + "loss": 0.5409, + "step": 6780 + }, + { + "epoch": 3.3519960449882586, + "grad_norm": 0.139060254865207, + "learning_rate": 1.2673450952747336e-06, + "loss": 0.5265, + "step": 6781 + }, + { + "epoch": 3.3524904214559386, + "grad_norm": 0.136534793691308, + "learning_rate": 1.265448478826311e-06, + "loss": 0.4926, + "step": 6782 + }, + { + "epoch": 3.352984797923619, + "grad_norm": 0.13608115979415, + "learning_rate": 1.2635531867455597e-06, + "loss": 0.5344, + "step": 6783 + }, + { + "epoch": 3.353479174391299, + "grad_norm": 0.13163442509342524, + "learning_rate": 1.2616592193198573e-06, + "loss": 0.5447, + "step": 6784 + }, + { + "epoch": 3.353973550858979, + "grad_norm": 0.13484387875555917, + "learning_rate": 1.2597665768363687e-06, + "loss": 0.5277, + "step": 6785 + }, + { + "epoch": 3.354467927326659, + "grad_norm": 0.1380287271310464, + "learning_rate": 1.2578752595820698e-06, + "loss": 0.5405, + "step": 6786 + }, + { + "epoch": 3.3549623037943395, + "grad_norm": 0.1314230902045048, + "learning_rate": 1.2559852678437246e-06, + "loss": 0.5224, + "step": 6787 + }, + { + "epoch": 3.3554566802620194, + "grad_norm": 0.1342826455784126, + "learning_rate": 1.2540966019079048e-06, + "loss": 0.5206, + "step": 6788 + }, + { + "epoch": 3.3559510567296997, + "grad_norm": 0.13724944527587504, + "learning_rate": 1.252209262060975e-06, + "loss": 0.5273, + "step": 6789 + }, + { + "epoch": 3.3564454331973796, + "grad_norm": 0.13742993358425004, + "learning_rate": 1.2503232485891014e-06, + "loss": 0.5304, + "step": 6790 + }, + { + "epoch": 3.35693980966506, + "grad_norm": 0.1355249350915414, + "learning_rate": 1.2484385617782524e-06, + "loss": 0.5134, + "step": 6791 + }, + { + "epoch": 3.35743418613274, + "grad_norm": 0.1328057280572765, + "learning_rate": 1.2465552019141869e-06, + "loss": 0.5297, + "step": 6792 + }, + { + "epoch": 3.3579285626004203, + "grad_norm": 0.13996630349910796, + "learning_rate": 1.244673169282472e-06, + "loss": 0.5288, + "step": 6793 + }, + { + "epoch": 3.3584229390681, + "grad_norm": 0.14053935763994713, + "learning_rate": 1.2427924641684674e-06, + "loss": 0.5452, + "step": 6794 + }, + { + "epoch": 3.3589173155357805, + "grad_norm": 0.12758450372999278, + "learning_rate": 1.2409130868573294e-06, + "loss": 0.519, + "step": 6795 + }, + { + "epoch": 3.3594116920034605, + "grad_norm": 0.13675210019882275, + "learning_rate": 1.2390350376340199e-06, + "loss": 0.5245, + "step": 6796 + }, + { + "epoch": 3.359906068471141, + "grad_norm": 0.13417578471857255, + "learning_rate": 1.237158316783299e-06, + "loss": 0.528, + "step": 6797 + }, + { + "epoch": 3.3604004449388207, + "grad_norm": 0.13443717764469307, + "learning_rate": 1.2352829245897168e-06, + "loss": 0.5192, + "step": 6798 + }, + { + "epoch": 3.360894821406501, + "grad_norm": 0.13822577771947564, + "learning_rate": 1.2334088613376339e-06, + "loss": 0.5304, + "step": 6799 + }, + { + "epoch": 3.361389197874181, + "grad_norm": 0.1335228028249497, + "learning_rate": 1.2315361273111991e-06, + "loss": 0.5334, + "step": 6800 + }, + { + "epoch": 3.3618835743418614, + "grad_norm": 0.1340947712889393, + "learning_rate": 1.2296647227943615e-06, + "loss": 0.5443, + "step": 6801 + }, + { + "epoch": 3.3623779508095413, + "grad_norm": 0.13680688605407768, + "learning_rate": 1.227794648070878e-06, + "loss": 0.5697, + "step": 6802 + }, + { + "epoch": 3.3628723272772216, + "grad_norm": 0.1366429561883105, + "learning_rate": 1.2259259034242932e-06, + "loss": 0.5604, + "step": 6803 + }, + { + "epoch": 3.3633667037449015, + "grad_norm": 0.13232307784897954, + "learning_rate": 1.2240584891379526e-06, + "loss": 0.5271, + "step": 6804 + }, + { + "epoch": 3.363861080212582, + "grad_norm": 0.13613039735327984, + "learning_rate": 1.2221924054950029e-06, + "loss": 0.5648, + "step": 6805 + }, + { + "epoch": 3.3643554566802623, + "grad_norm": 0.13523970003515695, + "learning_rate": 1.2203276527783847e-06, + "loss": 0.5572, + "step": 6806 + }, + { + "epoch": 3.364849833147942, + "grad_norm": 0.13541394708857254, + "learning_rate": 1.2184642312708405e-06, + "loss": 0.5387, + "step": 6807 + }, + { + "epoch": 3.365344209615622, + "grad_norm": 0.13101819626105582, + "learning_rate": 1.2166021412549122e-06, + "loss": 0.5421, + "step": 6808 + }, + { + "epoch": 3.3658385860833024, + "grad_norm": 0.13457381663598186, + "learning_rate": 1.2147413830129351e-06, + "loss": 0.5198, + "step": 6809 + }, + { + "epoch": 3.366332962550983, + "grad_norm": 0.13535996468157674, + "learning_rate": 1.2128819568270434e-06, + "loss": 0.5057, + "step": 6810 + }, + { + "epoch": 3.3668273390186627, + "grad_norm": 0.1305998702687742, + "learning_rate": 1.2110238629791738e-06, + "loss": 0.5165, + "step": 6811 + }, + { + "epoch": 3.3673217154863426, + "grad_norm": 0.13569577825870222, + "learning_rate": 1.2091671017510554e-06, + "loss": 0.5456, + "step": 6812 + }, + { + "epoch": 3.367816091954023, + "grad_norm": 0.13448069383738767, + "learning_rate": 1.2073116734242174e-06, + "loss": 0.5091, + "step": 6813 + }, + { + "epoch": 3.3683104684217033, + "grad_norm": 0.13142780132382734, + "learning_rate": 1.2054575782799916e-06, + "loss": 0.5146, + "step": 6814 + }, + { + "epoch": 3.3688048448893833, + "grad_norm": 0.13178640192097218, + "learning_rate": 1.2036048165994985e-06, + "loss": 0.5123, + "step": 6815 + }, + { + "epoch": 3.369299221357063, + "grad_norm": 0.13404840434379306, + "learning_rate": 1.2017533886636645e-06, + "loss": 0.5256, + "step": 6816 + }, + { + "epoch": 3.3697935978247435, + "grad_norm": 0.13463222161580338, + "learning_rate": 1.1999032947532097e-06, + "loss": 0.5179, + "step": 6817 + }, + { + "epoch": 3.370287974292424, + "grad_norm": 0.13325360301800898, + "learning_rate": 1.1980545351486483e-06, + "loss": 0.5114, + "step": 6818 + }, + { + "epoch": 3.370782350760104, + "grad_norm": 0.13286907787232408, + "learning_rate": 1.1962071101303042e-06, + "loss": 0.4958, + "step": 6819 + }, + { + "epoch": 3.3712767272277837, + "grad_norm": 0.14032602922176465, + "learning_rate": 1.1943610199782874e-06, + "loss": 0.5302, + "step": 6820 + }, + { + "epoch": 3.371771103695464, + "grad_norm": 0.14008608234007688, + "learning_rate": 1.1925162649725085e-06, + "loss": 0.519, + "step": 6821 + }, + { + "epoch": 3.3722654801631444, + "grad_norm": 0.1340722630470232, + "learning_rate": 1.1906728453926798e-06, + "loss": 0.5372, + "step": 6822 + }, + { + "epoch": 3.3727598566308243, + "grad_norm": 0.1321423984018012, + "learning_rate": 1.1888307615183037e-06, + "loss": 0.5108, + "step": 6823 + }, + { + "epoch": 3.3732542330985047, + "grad_norm": 0.1340797794680627, + "learning_rate": 1.1869900136286872e-06, + "loss": 0.5146, + "step": 6824 + }, + { + "epoch": 3.3737486095661846, + "grad_norm": 0.1372365280008834, + "learning_rate": 1.1851506020029335e-06, + "loss": 0.5355, + "step": 6825 + }, + { + "epoch": 3.374242986033865, + "grad_norm": 0.1314011492302428, + "learning_rate": 1.1833125269199386e-06, + "loss": 0.514, + "step": 6826 + }, + { + "epoch": 3.374737362501545, + "grad_norm": 0.13176670158536233, + "learning_rate": 1.1814757886583984e-06, + "loss": 0.5187, + "step": 6827 + }, + { + "epoch": 3.3752317389692252, + "grad_norm": 0.1418086888444993, + "learning_rate": 1.1796403874968098e-06, + "loss": 0.5654, + "step": 6828 + }, + { + "epoch": 3.375726115436905, + "grad_norm": 0.1357039679238753, + "learning_rate": 1.1778063237134596e-06, + "loss": 0.5366, + "step": 6829 + }, + { + "epoch": 3.3762204919045855, + "grad_norm": 0.13502719063928478, + "learning_rate": 1.1759735975864372e-06, + "loss": 0.5367, + "step": 6830 + }, + { + "epoch": 3.3767148683722654, + "grad_norm": 0.136144263912696, + "learning_rate": 1.1741422093936317e-06, + "loss": 0.5301, + "step": 6831 + }, + { + "epoch": 3.377209244839946, + "grad_norm": 0.13310093433866357, + "learning_rate": 1.1723121594127195e-06, + "loss": 0.5222, + "step": 6832 + }, + { + "epoch": 3.3777036213076257, + "grad_norm": 0.1388855793474982, + "learning_rate": 1.1704834479211856e-06, + "loss": 0.5588, + "step": 6833 + }, + { + "epoch": 3.378197997775306, + "grad_norm": 0.13449114408325463, + "learning_rate": 1.1686560751963017e-06, + "loss": 0.4986, + "step": 6834 + }, + { + "epoch": 3.378692374242986, + "grad_norm": 0.1334200308369579, + "learning_rate": 1.1668300415151458e-06, + "loss": 0.5591, + "step": 6835 + }, + { + "epoch": 3.3791867507106663, + "grad_norm": 0.1343979024918561, + "learning_rate": 1.1650053471545842e-06, + "loss": 0.5461, + "step": 6836 + }, + { + "epoch": 3.3796811271783462, + "grad_norm": 0.13787541792749042, + "learning_rate": 1.163181992391289e-06, + "loss": 0.5188, + "step": 6837 + }, + { + "epoch": 3.3801755036460266, + "grad_norm": 0.13246745760069797, + "learning_rate": 1.1613599775017192e-06, + "loss": 0.521, + "step": 6838 + }, + { + "epoch": 3.3806698801137065, + "grad_norm": 0.1343634621945659, + "learning_rate": 1.1595393027621394e-06, + "loss": 0.5228, + "step": 6839 + }, + { + "epoch": 3.381164256581387, + "grad_norm": 0.13137354429743672, + "learning_rate": 1.1577199684486085e-06, + "loss": 0.5236, + "step": 6840 + }, + { + "epoch": 3.3816586330490668, + "grad_norm": 0.13515036181640383, + "learning_rate": 1.1559019748369782e-06, + "loss": 0.5458, + "step": 6841 + }, + { + "epoch": 3.382153009516747, + "grad_norm": 0.1352306400839177, + "learning_rate": 1.1540853222029025e-06, + "loss": 0.5356, + "step": 6842 + }, + { + "epoch": 3.382647385984427, + "grad_norm": 0.13403550435251557, + "learning_rate": 1.15227001082183e-06, + "loss": 0.5378, + "step": 6843 + }, + { + "epoch": 3.3831417624521074, + "grad_norm": 0.13373269486864578, + "learning_rate": 1.1504560409690013e-06, + "loss": 0.5147, + "step": 6844 + }, + { + "epoch": 3.3836361389197873, + "grad_norm": 0.136189864947682, + "learning_rate": 1.1486434129194602e-06, + "loss": 0.5268, + "step": 6845 + }, + { + "epoch": 3.3841305153874677, + "grad_norm": 0.13374688986978076, + "learning_rate": 1.1468321269480476e-06, + "loss": 0.5169, + "step": 6846 + }, + { + "epoch": 3.3846248918551476, + "grad_norm": 0.13253255607857192, + "learning_rate": 1.1450221833293928e-06, + "loss": 0.5465, + "step": 6847 + }, + { + "epoch": 3.385119268322828, + "grad_norm": 0.13261789727578527, + "learning_rate": 1.1432135823379308e-06, + "loss": 0.5581, + "step": 6848 + }, + { + "epoch": 3.385613644790508, + "grad_norm": 0.13615971909179148, + "learning_rate": 1.1414063242478879e-06, + "loss": 0.5487, + "step": 6849 + }, + { + "epoch": 3.386108021258188, + "grad_norm": 0.1310896610099215, + "learning_rate": 1.1396004093332835e-06, + "loss": 0.5225, + "step": 6850 + }, + { + "epoch": 3.386602397725868, + "grad_norm": 0.13577233911044248, + "learning_rate": 1.1377958378679455e-06, + "loss": 0.5058, + "step": 6851 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.13259458417248313, + "learning_rate": 1.1359926101254848e-06, + "loss": 0.5096, + "step": 6852 + }, + { + "epoch": 3.3875911506612284, + "grad_norm": 0.13138729811384317, + "learning_rate": 1.134190726379314e-06, + "loss": 0.5161, + "step": 6853 + }, + { + "epoch": 3.3880855271289088, + "grad_norm": 0.13410469148619353, + "learning_rate": 1.1323901869026455e-06, + "loss": 0.5162, + "step": 6854 + }, + { + "epoch": 3.3885799035965887, + "grad_norm": 0.13155718581480846, + "learning_rate": 1.13059099196848e-06, + "loss": 0.5241, + "step": 6855 + }, + { + "epoch": 3.389074280064269, + "grad_norm": 0.137525718548488, + "learning_rate": 1.12879314184962e-06, + "loss": 0.5376, + "step": 6856 + }, + { + "epoch": 3.389568656531949, + "grad_norm": 0.13922314117761822, + "learning_rate": 1.126996636818667e-06, + "loss": 0.5445, + "step": 6857 + }, + { + "epoch": 3.3900630329996293, + "grad_norm": 0.13555658545916155, + "learning_rate": 1.125201477148007e-06, + "loss": 0.5449, + "step": 6858 + }, + { + "epoch": 3.390557409467309, + "grad_norm": 0.13916891317439045, + "learning_rate": 1.1234076631098357e-06, + "loss": 0.5351, + "step": 6859 + }, + { + "epoch": 3.3910517859349896, + "grad_norm": 0.13524140259104883, + "learning_rate": 1.1216151949761356e-06, + "loss": 0.5063, + "step": 6860 + }, + { + "epoch": 3.3915461624026695, + "grad_norm": 0.13156070545855125, + "learning_rate": 1.119824073018686e-06, + "loss": 0.5641, + "step": 6861 + }, + { + "epoch": 3.39204053887035, + "grad_norm": 0.13316329324018028, + "learning_rate": 1.1180342975090663e-06, + "loss": 0.5287, + "step": 6862 + }, + { + "epoch": 3.3925349153380298, + "grad_norm": 0.14070240812079915, + "learning_rate": 1.1162458687186507e-06, + "loss": 0.5472, + "step": 6863 + }, + { + "epoch": 3.39302929180571, + "grad_norm": 0.13410700747392046, + "learning_rate": 1.1144587869186041e-06, + "loss": 0.5397, + "step": 6864 + }, + { + "epoch": 3.39352366827339, + "grad_norm": 0.14004791481543163, + "learning_rate": 1.1126730523798956e-06, + "loss": 0.5349, + "step": 6865 + }, + { + "epoch": 3.3940180447410704, + "grad_norm": 0.13042298720037107, + "learning_rate": 1.110888665373282e-06, + "loss": 0.5231, + "step": 6866 + }, + { + "epoch": 3.3945124212087503, + "grad_norm": 0.13526427438081784, + "learning_rate": 1.1091056261693178e-06, + "loss": 0.5565, + "step": 6867 + }, + { + "epoch": 3.3950067976764307, + "grad_norm": 0.14133015965571233, + "learning_rate": 1.1073239350383603e-06, + "loss": 0.5391, + "step": 6868 + }, + { + "epoch": 3.3955011741441106, + "grad_norm": 0.13200845654147836, + "learning_rate": 1.105543592250553e-06, + "loss": 0.5081, + "step": 6869 + }, + { + "epoch": 3.395995550611791, + "grad_norm": 0.1363800714074898, + "learning_rate": 1.1037645980758372e-06, + "loss": 0.5294, + "step": 6870 + }, + { + "epoch": 3.396489927079471, + "grad_norm": 0.13770790053553658, + "learning_rate": 1.1019869527839545e-06, + "loss": 0.5431, + "step": 6871 + }, + { + "epoch": 3.396984303547151, + "grad_norm": 0.13576296111305428, + "learning_rate": 1.100210656644435e-06, + "loss": 0.5359, + "step": 6872 + }, + { + "epoch": 3.397478680014831, + "grad_norm": 0.13764721523041912, + "learning_rate": 1.0984357099266096e-06, + "loss": 0.5263, + "step": 6873 + }, + { + "epoch": 3.3979730564825115, + "grad_norm": 0.13327563499074668, + "learning_rate": 1.0966621128996058e-06, + "loss": 0.5149, + "step": 6874 + }, + { + "epoch": 3.3984674329501914, + "grad_norm": 0.13340391406188143, + "learning_rate": 1.0948898658323404e-06, + "loss": 0.5043, + "step": 6875 + }, + { + "epoch": 3.3989618094178717, + "grad_norm": 0.1335910771004972, + "learning_rate": 1.0931189689935262e-06, + "loss": 0.5353, + "step": 6876 + }, + { + "epoch": 3.399456185885552, + "grad_norm": 0.1349530510201269, + "learning_rate": 1.0913494226516796e-06, + "loss": 0.5375, + "step": 6877 + }, + { + "epoch": 3.399950562353232, + "grad_norm": 0.14433104462196947, + "learning_rate": 1.0895812270750993e-06, + "loss": 0.5681, + "step": 6878 + }, + { + "epoch": 3.400444938820912, + "grad_norm": 0.18595560250108895, + "learning_rate": 1.087814382531891e-06, + "loss": 0.531, + "step": 6879 + }, + { + "epoch": 3.4009393152885923, + "grad_norm": 0.13265756657089983, + "learning_rate": 1.0860488892899524e-06, + "loss": 0.4991, + "step": 6880 + }, + { + "epoch": 3.4014336917562726, + "grad_norm": 0.1325796410612809, + "learning_rate": 1.0842847476169682e-06, + "loss": 0.5313, + "step": 6881 + }, + { + "epoch": 3.4019280682239526, + "grad_norm": 0.13425047762423822, + "learning_rate": 1.0825219577804313e-06, + "loss": 0.5246, + "step": 6882 + }, + { + "epoch": 3.4024224446916325, + "grad_norm": 0.13228023633718738, + "learning_rate": 1.080760520047619e-06, + "loss": 0.5182, + "step": 6883 + }, + { + "epoch": 3.402916821159313, + "grad_norm": 0.13353191155243693, + "learning_rate": 1.0790004346856086e-06, + "loss": 0.5199, + "step": 6884 + }, + { + "epoch": 3.403411197626993, + "grad_norm": 0.14246527915315202, + "learning_rate": 1.0772417019612702e-06, + "loss": 0.5351, + "step": 6885 + }, + { + "epoch": 3.403905574094673, + "grad_norm": 0.1392627728842706, + "learning_rate": 1.0754843221412737e-06, + "loss": 0.5705, + "step": 6886 + }, + { + "epoch": 3.404399950562353, + "grad_norm": 0.1332528458625962, + "learning_rate": 1.0737282954920737e-06, + "loss": 0.5473, + "step": 6887 + }, + { + "epoch": 3.4048943270300334, + "grad_norm": 0.13902610841847354, + "learning_rate": 1.0719736222799326e-06, + "loss": 0.5374, + "step": 6888 + }, + { + "epoch": 3.4053887034977137, + "grad_norm": 0.12991511769023323, + "learning_rate": 1.0702203027708958e-06, + "loss": 0.4965, + "step": 6889 + }, + { + "epoch": 3.4058830799653936, + "grad_norm": 0.13311155517759257, + "learning_rate": 1.0684683372308114e-06, + "loss": 0.5323, + "step": 6890 + }, + { + "epoch": 3.4063774564330735, + "grad_norm": 0.13222927952847086, + "learning_rate": 1.0667177259253192e-06, + "loss": 0.5168, + "step": 6891 + }, + { + "epoch": 3.406871832900754, + "grad_norm": 0.13206522330637305, + "learning_rate": 1.0649684691198548e-06, + "loss": 0.538, + "step": 6892 + }, + { + "epoch": 3.4073662093684343, + "grad_norm": 0.13590641094198747, + "learning_rate": 1.0632205670796448e-06, + "loss": 0.53, + "step": 6893 + }, + { + "epoch": 3.407860585836114, + "grad_norm": 0.1330524241257489, + "learning_rate": 1.0614740200697126e-06, + "loss": 0.4903, + "step": 6894 + }, + { + "epoch": 3.4083549623037945, + "grad_norm": 0.13681847146130527, + "learning_rate": 1.0597288283548824e-06, + "loss": 0.5611, + "step": 6895 + }, + { + "epoch": 3.4088493387714744, + "grad_norm": 0.13722901053340153, + "learning_rate": 1.057984992199761e-06, + "loss": 0.5367, + "step": 6896 + }, + { + "epoch": 3.409343715239155, + "grad_norm": 0.13416411715354573, + "learning_rate": 1.0562425118687592e-06, + "loss": 0.5339, + "step": 6897 + }, + { + "epoch": 3.4098380917068347, + "grad_norm": 0.1332786633597748, + "learning_rate": 1.054501387626079e-06, + "loss": 0.5003, + "step": 6898 + }, + { + "epoch": 3.410332468174515, + "grad_norm": 0.13674317601920574, + "learning_rate": 1.0527616197357126e-06, + "loss": 0.5418, + "step": 6899 + }, + { + "epoch": 3.410826844642195, + "grad_norm": 0.13798995208829426, + "learning_rate": 1.0510232084614535e-06, + "loss": 0.5598, + "step": 6900 + }, + { + "epoch": 3.4113212211098753, + "grad_norm": 0.1383695289006086, + "learning_rate": 1.0492861540668885e-06, + "loss": 0.5251, + "step": 6901 + }, + { + "epoch": 3.4118155975775553, + "grad_norm": 0.13507903611988314, + "learning_rate": 1.0475504568153937e-06, + "loss": 0.5223, + "step": 6902 + }, + { + "epoch": 3.4123099740452356, + "grad_norm": 0.13647824256495514, + "learning_rate": 1.0458161169701441e-06, + "loss": 0.515, + "step": 6903 + }, + { + "epoch": 3.4128043505129155, + "grad_norm": 0.13314644386034805, + "learning_rate": 1.044083134794106e-06, + "loss": 0.5147, + "step": 6904 + }, + { + "epoch": 3.413298726980596, + "grad_norm": 0.13552690068446507, + "learning_rate": 1.0423515105500404e-06, + "loss": 0.5067, + "step": 6905 + }, + { + "epoch": 3.413793103448276, + "grad_norm": 0.13167168896652734, + "learning_rate": 1.0406212445005071e-06, + "loss": 0.5103, + "step": 6906 + }, + { + "epoch": 3.414287479915956, + "grad_norm": 0.1331064458818392, + "learning_rate": 1.0388923369078519e-06, + "loss": 0.544, + "step": 6907 + }, + { + "epoch": 3.414781856383636, + "grad_norm": 0.13908612323387556, + "learning_rate": 1.0371647880342218e-06, + "loss": 0.5373, + "step": 6908 + }, + { + "epoch": 3.4152762328513164, + "grad_norm": 0.12919998349680592, + "learning_rate": 1.0354385981415527e-06, + "loss": 0.5619, + "step": 6909 + }, + { + "epoch": 3.4157706093189963, + "grad_norm": 0.13843162339198747, + "learning_rate": 1.0337137674915743e-06, + "loss": 0.5392, + "step": 6910 + }, + { + "epoch": 3.4162649857866767, + "grad_norm": 0.1355796368228542, + "learning_rate": 1.031990296345815e-06, + "loss": 0.5337, + "step": 6911 + }, + { + "epoch": 3.4167593622543566, + "grad_norm": 0.13635351967098305, + "learning_rate": 1.0302681849655971e-06, + "loss": 0.5396, + "step": 6912 + }, + { + "epoch": 3.417253738722037, + "grad_norm": 0.13900622786076589, + "learning_rate": 1.0285474336120283e-06, + "loss": 0.5674, + "step": 6913 + }, + { + "epoch": 3.417748115189717, + "grad_norm": 0.13503637066651566, + "learning_rate": 1.0268280425460198e-06, + "loss": 0.558, + "step": 6914 + }, + { + "epoch": 3.4182424916573972, + "grad_norm": 0.1360811003096712, + "learning_rate": 1.0251100120282719e-06, + "loss": 0.5477, + "step": 6915 + }, + { + "epoch": 3.418736868125077, + "grad_norm": 0.1292939840664812, + "learning_rate": 1.0233933423192755e-06, + "loss": 0.503, + "step": 6916 + }, + { + "epoch": 3.4192312445927575, + "grad_norm": 0.1353582174347379, + "learning_rate": 1.0216780336793252e-06, + "loss": 0.5192, + "step": 6917 + }, + { + "epoch": 3.4197256210604374, + "grad_norm": 0.1305527716905994, + "learning_rate": 1.0199640863685012e-06, + "loss": 0.5103, + "step": 6918 + }, + { + "epoch": 3.420219997528118, + "grad_norm": 0.1342753274697025, + "learning_rate": 1.0182515006466742e-06, + "loss": 0.4881, + "step": 6919 + }, + { + "epoch": 3.4207143739957977, + "grad_norm": 0.13708653049894828, + "learning_rate": 1.01654027677352e-06, + "loss": 0.5266, + "step": 6920 + }, + { + "epoch": 3.421208750463478, + "grad_norm": 0.13662900175549245, + "learning_rate": 1.0148304150084952e-06, + "loss": 0.5263, + "step": 6921 + }, + { + "epoch": 3.421703126931158, + "grad_norm": 0.1364640213304078, + "learning_rate": 1.0131219156108584e-06, + "loss": 0.5307, + "step": 6922 + }, + { + "epoch": 3.4221975033988383, + "grad_norm": 0.13370897497211892, + "learning_rate": 1.0114147788396623e-06, + "loss": 0.5156, + "step": 6923 + }, + { + "epoch": 3.4226918798665182, + "grad_norm": 0.14416845435566383, + "learning_rate": 1.0097090049537473e-06, + "loss": 0.5325, + "step": 6924 + }, + { + "epoch": 3.4231862563341986, + "grad_norm": 0.1311589137595495, + "learning_rate": 1.0080045942117467e-06, + "loss": 0.6006, + "step": 6925 + }, + { + "epoch": 3.4236806328018785, + "grad_norm": 0.1412563266627295, + "learning_rate": 1.0063015468720949e-06, + "loss": 0.528, + "step": 6926 + }, + { + "epoch": 3.424175009269559, + "grad_norm": 0.1322601773100567, + "learning_rate": 1.004599863193011e-06, + "loss": 0.495, + "step": 6927 + }, + { + "epoch": 3.424669385737239, + "grad_norm": 0.13711051855258224, + "learning_rate": 1.0028995434325116e-06, + "loss": 0.5678, + "step": 6928 + }, + { + "epoch": 3.425163762204919, + "grad_norm": 0.13406408073112064, + "learning_rate": 1.00120058784841e-06, + "loss": 0.5127, + "step": 6929 + }, + { + "epoch": 3.425658138672599, + "grad_norm": 0.13346457977169843, + "learning_rate": 9.99502996698304e-07, + "loss": 0.5243, + "step": 6930 + }, + { + "epoch": 3.4261525151402794, + "grad_norm": 0.13242840975774167, + "learning_rate": 9.978067702395922e-07, + "loss": 0.5214, + "step": 6931 + }, + { + "epoch": 3.4266468916079593, + "grad_norm": 0.1322703481617373, + "learning_rate": 9.961119087294602e-07, + "loss": 0.5221, + "step": 6932 + }, + { + "epoch": 3.4271412680756397, + "grad_norm": 0.1319582518755421, + "learning_rate": 9.944184124248913e-07, + "loss": 0.5304, + "step": 6933 + }, + { + "epoch": 3.4276356445433196, + "grad_norm": 0.13921360719719425, + "learning_rate": 9.92726281582662e-07, + "loss": 0.5227, + "step": 6934 + }, + { + "epoch": 3.428130021011, + "grad_norm": 0.13536802962102273, + "learning_rate": 9.910355164593388e-07, + "loss": 0.5273, + "step": 6935 + }, + { + "epoch": 3.42862439747868, + "grad_norm": 0.1339681649080112, + "learning_rate": 9.893461173112794e-07, + "loss": 0.564, + "step": 6936 + }, + { + "epoch": 3.42911877394636, + "grad_norm": 0.13325475935336517, + "learning_rate": 9.876580843946427e-07, + "loss": 0.4907, + "step": 6937 + }, + { + "epoch": 3.42961315041404, + "grad_norm": 0.1353018016167844, + "learning_rate": 9.859714179653678e-07, + "loss": 0.499, + "step": 6938 + }, + { + "epoch": 3.4301075268817205, + "grad_norm": 0.13266738974743184, + "learning_rate": 9.842861182791997e-07, + "loss": 0.4978, + "step": 6939 + }, + { + "epoch": 3.4306019033494004, + "grad_norm": 0.1282233195028724, + "learning_rate": 9.8260218559167e-07, + "loss": 0.5224, + "step": 6940 + }, + { + "epoch": 3.4310962798170808, + "grad_norm": 0.12918802778594915, + "learning_rate": 9.809196201581017e-07, + "loss": 0.4901, + "step": 6941 + }, + { + "epoch": 3.4315906562847607, + "grad_norm": 0.13962611385899934, + "learning_rate": 9.792384222336103e-07, + "loss": 0.574, + "step": 6942 + }, + { + "epoch": 3.432085032752441, + "grad_norm": 0.13678256038369344, + "learning_rate": 9.77558592073108e-07, + "loss": 0.5183, + "step": 6943 + }, + { + "epoch": 3.432579409220121, + "grad_norm": 0.13929083525239083, + "learning_rate": 9.758801299312992e-07, + "loss": 0.5109, + "step": 6944 + }, + { + "epoch": 3.4330737856878013, + "grad_norm": 0.13802655525916255, + "learning_rate": 9.742030360626742e-07, + "loss": 0.5429, + "step": 6945 + }, + { + "epoch": 3.433568162155481, + "grad_norm": 0.1374529025528385, + "learning_rate": 9.725273107215261e-07, + "loss": 0.514, + "step": 6946 + }, + { + "epoch": 3.4340625386231616, + "grad_norm": 0.13128227930821562, + "learning_rate": 9.708529541619293e-07, + "loss": 0.5176, + "step": 6947 + }, + { + "epoch": 3.4345569150908415, + "grad_norm": 0.13226475351384614, + "learning_rate": 9.691799666377632e-07, + "loss": 0.5376, + "step": 6948 + }, + { + "epoch": 3.435051291558522, + "grad_norm": 0.13411902756201693, + "learning_rate": 9.675083484026859e-07, + "loss": 0.5582, + "step": 6949 + }, + { + "epoch": 3.4355456680262018, + "grad_norm": 0.13991969209497737, + "learning_rate": 9.658380997101602e-07, + "loss": 0.5406, + "step": 6950 + }, + { + "epoch": 3.436040044493882, + "grad_norm": 0.1345175991445195, + "learning_rate": 9.641692208134324e-07, + "loss": 0.5121, + "step": 6951 + }, + { + "epoch": 3.4365344209615625, + "grad_norm": 0.1324571167304791, + "learning_rate": 9.625017119655488e-07, + "loss": 0.4951, + "step": 6952 + }, + { + "epoch": 3.4370287974292424, + "grad_norm": 0.13589333591446645, + "learning_rate": 9.608355734193376e-07, + "loss": 0.5251, + "step": 6953 + }, + { + "epoch": 3.4375231738969223, + "grad_norm": 0.13543566323918263, + "learning_rate": 9.591708054274295e-07, + "loss": 0.5229, + "step": 6954 + }, + { + "epoch": 3.4380175503646027, + "grad_norm": 0.1361476604057277, + "learning_rate": 9.575074082422441e-07, + "loss": 0.5496, + "step": 6955 + }, + { + "epoch": 3.438511926832283, + "grad_norm": 0.13151208365651207, + "learning_rate": 9.558453821159896e-07, + "loss": 0.5219, + "step": 6956 + }, + { + "epoch": 3.439006303299963, + "grad_norm": 0.1377905887237918, + "learning_rate": 9.541847273006722e-07, + "loss": 0.5528, + "step": 6957 + }, + { + "epoch": 3.439500679767643, + "grad_norm": 0.1362835091015484, + "learning_rate": 9.525254440480846e-07, + "loss": 0.5775, + "step": 6958 + }, + { + "epoch": 3.439995056235323, + "grad_norm": 0.13352726681728053, + "learning_rate": 9.508675326098127e-07, + "loss": 0.5454, + "step": 6959 + }, + { + "epoch": 3.4404894327030036, + "grad_norm": 0.14302772762584318, + "learning_rate": 9.492109932372384e-07, + "loss": 0.5546, + "step": 6960 + }, + { + "epoch": 3.4409838091706835, + "grad_norm": 0.13391547259617664, + "learning_rate": 9.475558261815332e-07, + "loss": 0.5174, + "step": 6961 + }, + { + "epoch": 3.4414781856383634, + "grad_norm": 0.13388190287043766, + "learning_rate": 9.459020316936562e-07, + "loss": 0.5524, + "step": 6962 + }, + { + "epoch": 3.4419725621060437, + "grad_norm": 0.13297686081658555, + "learning_rate": 9.442496100243681e-07, + "loss": 0.5392, + "step": 6963 + }, + { + "epoch": 3.442466938573724, + "grad_norm": 0.13780389023904574, + "learning_rate": 9.425985614242117e-07, + "loss": 0.4919, + "step": 6964 + }, + { + "epoch": 3.442961315041404, + "grad_norm": 0.13393315808608774, + "learning_rate": 9.409488861435234e-07, + "loss": 0.5589, + "step": 6965 + }, + { + "epoch": 3.443455691509084, + "grad_norm": 0.13814932630774124, + "learning_rate": 9.393005844324399e-07, + "loss": 0.5134, + "step": 6966 + }, + { + "epoch": 3.4439500679767643, + "grad_norm": 0.13828322864026094, + "learning_rate": 9.376536565408811e-07, + "loss": 0.5352, + "step": 6967 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.13009629396513336, + "learning_rate": 9.360081027185563e-07, + "loss": 0.5117, + "step": 6968 + }, + { + "epoch": 3.4449388209121246, + "grad_norm": 0.1319725245155449, + "learning_rate": 9.343639232149781e-07, + "loss": 0.4925, + "step": 6969 + }, + { + "epoch": 3.445433197379805, + "grad_norm": 0.13281739376616727, + "learning_rate": 9.327211182794372e-07, + "loss": 0.506, + "step": 6970 + }, + { + "epoch": 3.445927573847485, + "grad_norm": 0.13077514942409724, + "learning_rate": 9.31079688161024e-07, + "loss": 0.5638, + "step": 6971 + }, + { + "epoch": 3.446421950315165, + "grad_norm": 0.13102293164087078, + "learning_rate": 9.29439633108623e-07, + "loss": 0.519, + "step": 6972 + }, + { + "epoch": 3.446916326782845, + "grad_norm": 0.12975298471661453, + "learning_rate": 9.278009533709021e-07, + "loss": 0.4875, + "step": 6973 + }, + { + "epoch": 3.4474107032505255, + "grad_norm": 0.13835782973662925, + "learning_rate": 9.261636491963233e-07, + "loss": 0.5483, + "step": 6974 + }, + { + "epoch": 3.4479050797182054, + "grad_norm": 0.1358407950586929, + "learning_rate": 9.245277208331438e-07, + "loss": 0.5492, + "step": 6975 + }, + { + "epoch": 3.4483994561858857, + "grad_norm": 0.14019485677776264, + "learning_rate": 9.228931685294085e-07, + "loss": 0.5415, + "step": 6976 + }, + { + "epoch": 3.4488938326535656, + "grad_norm": 0.13453685310728625, + "learning_rate": 9.212599925329536e-07, + "loss": 0.5434, + "step": 6977 + }, + { + "epoch": 3.449388209121246, + "grad_norm": 0.13739399259009683, + "learning_rate": 9.19628193091412e-07, + "loss": 0.5303, + "step": 6978 + }, + { + "epoch": 3.449882585588926, + "grad_norm": 0.13720823039492427, + "learning_rate": 9.179977704521981e-07, + "loss": 0.5108, + "step": 6979 + }, + { + "epoch": 3.4503769620566063, + "grad_norm": 0.13526295669392227, + "learning_rate": 9.163687248625286e-07, + "loss": 0.5248, + "step": 6980 + }, + { + "epoch": 3.450871338524286, + "grad_norm": 0.13851763757732277, + "learning_rate": 9.147410565694026e-07, + "loss": 0.5691, + "step": 6981 + }, + { + "epoch": 3.4513657149919665, + "grad_norm": 0.1294808893201566, + "learning_rate": 9.131147658196104e-07, + "loss": 0.5035, + "step": 6982 + }, + { + "epoch": 3.4518600914596465, + "grad_norm": 0.1354187168766861, + "learning_rate": 9.114898528597437e-07, + "loss": 0.5092, + "step": 6983 + }, + { + "epoch": 3.452354467927327, + "grad_norm": 0.14106389099719563, + "learning_rate": 9.098663179361767e-07, + "loss": 0.5, + "step": 6984 + }, + { + "epoch": 3.4528488443950067, + "grad_norm": 0.1309084135318213, + "learning_rate": 9.082441612950721e-07, + "loss": 0.5293, + "step": 6985 + }, + { + "epoch": 3.453343220862687, + "grad_norm": 0.1329492108995973, + "learning_rate": 9.066233831823912e-07, + "loss": 0.5246, + "step": 6986 + }, + { + "epoch": 3.453837597330367, + "grad_norm": 0.13574071531666523, + "learning_rate": 9.050039838438817e-07, + "loss": 0.5403, + "step": 6987 + }, + { + "epoch": 3.4543319737980474, + "grad_norm": 0.1386421062861745, + "learning_rate": 9.03385963525083e-07, + "loss": 0.5377, + "step": 6988 + }, + { + "epoch": 3.4548263502657273, + "grad_norm": 0.13602261471405355, + "learning_rate": 9.017693224713286e-07, + "loss": 0.5113, + "step": 6989 + }, + { + "epoch": 3.4553207267334076, + "grad_norm": 0.13285363975354672, + "learning_rate": 9.001540609277381e-07, + "loss": 0.5182, + "step": 6990 + }, + { + "epoch": 3.4558151032010875, + "grad_norm": 0.1375819325192113, + "learning_rate": 8.985401791392223e-07, + "loss": 0.5182, + "step": 6991 + }, + { + "epoch": 3.456309479668768, + "grad_norm": 0.1299372126306242, + "learning_rate": 8.969276773504886e-07, + "loss": 0.5161, + "step": 6992 + }, + { + "epoch": 3.456803856136448, + "grad_norm": 0.134489362671343, + "learning_rate": 8.953165558060251e-07, + "loss": 0.5232, + "step": 6993 + }, + { + "epoch": 3.457298232604128, + "grad_norm": 0.13660728244972048, + "learning_rate": 8.937068147501205e-07, + "loss": 0.5146, + "step": 6994 + }, + { + "epoch": 3.457792609071808, + "grad_norm": 0.1344849762366074, + "learning_rate": 8.920984544268519e-07, + "loss": 0.5113, + "step": 6995 + }, + { + "epoch": 3.4582869855394884, + "grad_norm": 0.13689749470602022, + "learning_rate": 8.904914750800797e-07, + "loss": 0.5431, + "step": 6996 + }, + { + "epoch": 3.4587813620071683, + "grad_norm": 0.1350117000028532, + "learning_rate": 8.88885876953467e-07, + "loss": 0.5177, + "step": 6997 + }, + { + "epoch": 3.4592757384748487, + "grad_norm": 0.13374262829423417, + "learning_rate": 8.872816602904555e-07, + "loss": 0.5326, + "step": 6998 + }, + { + "epoch": 3.4597701149425286, + "grad_norm": 0.12961317809793954, + "learning_rate": 8.856788253342863e-07, + "loss": 0.5208, + "step": 6999 + }, + { + "epoch": 3.460264491410209, + "grad_norm": 0.13546169278534054, + "learning_rate": 8.840773723279861e-07, + "loss": 0.5636, + "step": 7000 + }, + { + "epoch": 3.460758867877889, + "grad_norm": 0.13835845862977694, + "learning_rate": 8.82477301514375e-07, + "loss": 0.502, + "step": 7001 + }, + { + "epoch": 3.4612532443455692, + "grad_norm": 0.13483761850427625, + "learning_rate": 8.808786131360591e-07, + "loss": 0.5211, + "step": 7002 + }, + { + "epoch": 3.461747620813249, + "grad_norm": 0.12937249029634743, + "learning_rate": 8.792813074354412e-07, + "loss": 0.5247, + "step": 7003 + }, + { + "epoch": 3.4622419972809295, + "grad_norm": 0.13858601512232144, + "learning_rate": 8.776853846547106e-07, + "loss": 0.5272, + "step": 7004 + }, + { + "epoch": 3.4627363737486094, + "grad_norm": 0.1344793559909614, + "learning_rate": 8.760908450358463e-07, + "loss": 0.5115, + "step": 7005 + }, + { + "epoch": 3.46323075021629, + "grad_norm": 0.13374558364981154, + "learning_rate": 8.744976888206213e-07, + "loss": 0.5196, + "step": 7006 + }, + { + "epoch": 3.4637251266839697, + "grad_norm": 0.13277886992117674, + "learning_rate": 8.729059162505938e-07, + "loss": 0.514, + "step": 7007 + }, + { + "epoch": 3.46421950315165, + "grad_norm": 0.13675486372132292, + "learning_rate": 8.713155275671137e-07, + "loss": 0.5245, + "step": 7008 + }, + { + "epoch": 3.46471387961933, + "grad_norm": 0.1310817250073247, + "learning_rate": 8.697265230113239e-07, + "loss": 0.4981, + "step": 7009 + }, + { + "epoch": 3.4652082560870103, + "grad_norm": 0.1349124196987497, + "learning_rate": 8.681389028241571e-07, + "loss": 0.4981, + "step": 7010 + }, + { + "epoch": 3.4657026325546902, + "grad_norm": 0.13237557457734186, + "learning_rate": 8.66552667246332e-07, + "loss": 0.5384, + "step": 7011 + }, + { + "epoch": 3.4661970090223706, + "grad_norm": 0.12838905088452815, + "learning_rate": 8.649678165183628e-07, + "loss": 0.5292, + "step": 7012 + }, + { + "epoch": 3.4666913854900505, + "grad_norm": 0.13264603304155823, + "learning_rate": 8.633843508805484e-07, + "loss": 0.5194, + "step": 7013 + }, + { + "epoch": 3.467185761957731, + "grad_norm": 0.1372292012894829, + "learning_rate": 8.618022705729778e-07, + "loss": 0.5533, + "step": 7014 + }, + { + "epoch": 3.467680138425411, + "grad_norm": 0.13421514478400587, + "learning_rate": 8.602215758355392e-07, + "loss": 0.5262, + "step": 7015 + }, + { + "epoch": 3.468174514893091, + "grad_norm": 0.13473420683680964, + "learning_rate": 8.586422669078997e-07, + "loss": 0.5476, + "step": 7016 + }, + { + "epoch": 3.468668891360771, + "grad_norm": 0.14053184153821177, + "learning_rate": 8.570643440295201e-07, + "loss": 0.5255, + "step": 7017 + }, + { + "epoch": 3.4691632678284514, + "grad_norm": 0.13194716910574097, + "learning_rate": 8.554878074396533e-07, + "loss": 0.5119, + "step": 7018 + }, + { + "epoch": 3.4696576442961313, + "grad_norm": 0.13522228942708894, + "learning_rate": 8.53912657377337e-07, + "loss": 0.5739, + "step": 7019 + }, + { + "epoch": 3.4701520207638117, + "grad_norm": 0.14602026642927143, + "learning_rate": 8.523388940814048e-07, + "loss": 0.5302, + "step": 7020 + }, + { + "epoch": 3.4706463972314916, + "grad_norm": 0.13358593124858287, + "learning_rate": 8.507665177904767e-07, + "loss": 0.5502, + "step": 7021 + }, + { + "epoch": 3.471140773699172, + "grad_norm": 0.13585641019616576, + "learning_rate": 8.49195528742961e-07, + "loss": 0.5242, + "step": 7022 + }, + { + "epoch": 3.471635150166852, + "grad_norm": 0.13351267253614416, + "learning_rate": 8.476259271770593e-07, + "loss": 0.5295, + "step": 7023 + }, + { + "epoch": 3.4721295266345322, + "grad_norm": 0.1335831302316546, + "learning_rate": 8.460577133307602e-07, + "loss": 0.562, + "step": 7024 + }, + { + "epoch": 3.472623903102212, + "grad_norm": 0.13800919770330272, + "learning_rate": 8.4449088744184e-07, + "loss": 0.5152, + "step": 7025 + }, + { + "epoch": 3.4731182795698925, + "grad_norm": 0.1353632993637421, + "learning_rate": 8.429254497478701e-07, + "loss": 0.535, + "step": 7026 + }, + { + "epoch": 3.473612656037573, + "grad_norm": 0.13713927216324337, + "learning_rate": 8.413614004862091e-07, + "loss": 0.53, + "step": 7027 + }, + { + "epoch": 3.4741070325052528, + "grad_norm": 0.13105935220369555, + "learning_rate": 8.397987398940011e-07, + "loss": 0.5111, + "step": 7028 + }, + { + "epoch": 3.4746014089729327, + "grad_norm": 0.135953575891023, + "learning_rate": 8.382374682081873e-07, + "loss": 0.5146, + "step": 7029 + }, + { + "epoch": 3.475095785440613, + "grad_norm": 0.13611171501687164, + "learning_rate": 8.366775856654908e-07, + "loss": 0.5092, + "step": 7030 + }, + { + "epoch": 3.4755901619082934, + "grad_norm": 0.1370590036575243, + "learning_rate": 8.351190925024244e-07, + "loss": 0.544, + "step": 7031 + }, + { + "epoch": 3.4760845383759733, + "grad_norm": 0.1297135650592498, + "learning_rate": 8.335619889553004e-07, + "loss": 0.5212, + "step": 7032 + }, + { + "epoch": 3.4765789148436532, + "grad_norm": 0.13210901519102156, + "learning_rate": 8.320062752602098e-07, + "loss": 0.5622, + "step": 7033 + }, + { + "epoch": 3.4770732913113336, + "grad_norm": 0.1428275901149459, + "learning_rate": 8.30451951653033e-07, + "loss": 0.5589, + "step": 7034 + }, + { + "epoch": 3.477567667779014, + "grad_norm": 0.13397457068943755, + "learning_rate": 8.288990183694479e-07, + "loss": 0.5065, + "step": 7035 + }, + { + "epoch": 3.478062044246694, + "grad_norm": 0.13814926270870462, + "learning_rate": 8.273474756449118e-07, + "loss": 0.5604, + "step": 7036 + }, + { + "epoch": 3.4785564207143738, + "grad_norm": 0.13335553273154233, + "learning_rate": 8.257973237146777e-07, + "loss": 0.5262, + "step": 7037 + }, + { + "epoch": 3.479050797182054, + "grad_norm": 0.13409883427818664, + "learning_rate": 8.242485628137887e-07, + "loss": 0.5537, + "step": 7038 + }, + { + "epoch": 3.4795451736497345, + "grad_norm": 0.14917288296226894, + "learning_rate": 8.227011931770701e-07, + "loss": 0.537, + "step": 7039 + }, + { + "epoch": 3.4800395501174144, + "grad_norm": 0.1310483141256775, + "learning_rate": 8.211552150391411e-07, + "loss": 0.5129, + "step": 7040 + }, + { + "epoch": 3.4805339265850943, + "grad_norm": 0.13351489929907626, + "learning_rate": 8.196106286344119e-07, + "loss": 0.5045, + "step": 7041 + }, + { + "epoch": 3.4810283030527747, + "grad_norm": 0.13579931378391263, + "learning_rate": 8.18067434197074e-07, + "loss": 0.5361, + "step": 7042 + }, + { + "epoch": 3.481522679520455, + "grad_norm": 0.1347831025960574, + "learning_rate": 8.165256319611158e-07, + "loss": 0.5245, + "step": 7043 + }, + { + "epoch": 3.482017055988135, + "grad_norm": 0.1305550075825026, + "learning_rate": 8.149852221603127e-07, + "loss": 0.5036, + "step": 7044 + }, + { + "epoch": 3.4825114324558153, + "grad_norm": 0.13382241945228782, + "learning_rate": 8.134462050282255e-07, + "loss": 0.5376, + "step": 7045 + }, + { + "epoch": 3.483005808923495, + "grad_norm": 0.13096908365247015, + "learning_rate": 8.119085807982074e-07, + "loss": 0.5012, + "step": 7046 + }, + { + "epoch": 3.4835001853911756, + "grad_norm": 0.13222631453201836, + "learning_rate": 8.103723497033977e-07, + "loss": 0.4986, + "step": 7047 + }, + { + "epoch": 3.4839945618588555, + "grad_norm": 0.13425756782428036, + "learning_rate": 8.088375119767278e-07, + "loss": 0.5055, + "step": 7048 + }, + { + "epoch": 3.484488938326536, + "grad_norm": 0.13629725682083024, + "learning_rate": 8.073040678509136e-07, + "loss": 0.5294, + "step": 7049 + }, + { + "epoch": 3.4849833147942157, + "grad_norm": 0.13770999168384743, + "learning_rate": 8.05772017558466e-07, + "loss": 0.5307, + "step": 7050 + }, + { + "epoch": 3.485477691261896, + "grad_norm": 0.13620664660313733, + "learning_rate": 8.042413613316757e-07, + "loss": 0.5246, + "step": 7051 + }, + { + "epoch": 3.485972067729576, + "grad_norm": 0.1346892885512603, + "learning_rate": 8.027120994026306e-07, + "loss": 0.5407, + "step": 7052 + }, + { + "epoch": 3.4864664441972564, + "grad_norm": 0.1306494254027105, + "learning_rate": 8.011842320032004e-07, + "loss": 0.5029, + "step": 7053 + }, + { + "epoch": 3.4869608206649363, + "grad_norm": 0.13537280251109088, + "learning_rate": 7.99657759365049e-07, + "loss": 0.5647, + "step": 7054 + }, + { + "epoch": 3.4874551971326166, + "grad_norm": 0.13422697522602478, + "learning_rate": 7.981326817196267e-07, + "loss": 0.5365, + "step": 7055 + }, + { + "epoch": 3.4879495736002966, + "grad_norm": 0.13678787609028023, + "learning_rate": 7.966089992981707e-07, + "loss": 0.5228, + "step": 7056 + }, + { + "epoch": 3.488443950067977, + "grad_norm": 0.13802881668997552, + "learning_rate": 7.950867123317064e-07, + "loss": 0.5249, + "step": 7057 + }, + { + "epoch": 3.488938326535657, + "grad_norm": 0.13590184580932377, + "learning_rate": 7.9356582105105e-07, + "loss": 0.5561, + "step": 7058 + }, + { + "epoch": 3.489432703003337, + "grad_norm": 0.13401250757298383, + "learning_rate": 7.920463256868083e-07, + "loss": 0.5288, + "step": 7059 + }, + { + "epoch": 3.489927079471017, + "grad_norm": 0.13532085277751632, + "learning_rate": 7.905282264693681e-07, + "loss": 0.526, + "step": 7060 + }, + { + "epoch": 3.4904214559386975, + "grad_norm": 0.13210021961064577, + "learning_rate": 7.890115236289142e-07, + "loss": 0.5286, + "step": 7061 + }, + { + "epoch": 3.4909158324063774, + "grad_norm": 0.1348781948533346, + "learning_rate": 7.874962173954126e-07, + "loss": 0.5466, + "step": 7062 + }, + { + "epoch": 3.4914102088740577, + "grad_norm": 0.14057273470851508, + "learning_rate": 7.859823079986195e-07, + "loss": 0.5264, + "step": 7063 + }, + { + "epoch": 3.4919045853417376, + "grad_norm": 0.13249911563700711, + "learning_rate": 7.844697956680803e-07, + "loss": 0.49, + "step": 7064 + }, + { + "epoch": 3.492398961809418, + "grad_norm": 0.13274060053714298, + "learning_rate": 7.829586806331313e-07, + "loss": 0.5116, + "step": 7065 + }, + { + "epoch": 3.492893338277098, + "grad_norm": 0.13775799972584457, + "learning_rate": 7.814489631228894e-07, + "loss": 0.5746, + "step": 7066 + }, + { + "epoch": 3.4933877147447783, + "grad_norm": 0.13554955688721312, + "learning_rate": 7.799406433662681e-07, + "loss": 0.5318, + "step": 7067 + }, + { + "epoch": 3.493882091212458, + "grad_norm": 0.13196930194481893, + "learning_rate": 7.784337215919613e-07, + "loss": 0.4969, + "step": 7068 + }, + { + "epoch": 3.4943764676801385, + "grad_norm": 0.13195565220051014, + "learning_rate": 7.769281980284548e-07, + "loss": 0.5254, + "step": 7069 + }, + { + "epoch": 3.4948708441478185, + "grad_norm": 0.13892836756624846, + "learning_rate": 7.754240729040263e-07, + "loss": 0.4936, + "step": 7070 + }, + { + "epoch": 3.495365220615499, + "grad_norm": 0.1337806239341357, + "learning_rate": 7.739213464467321e-07, + "loss": 0.5157, + "step": 7071 + }, + { + "epoch": 3.4958595970831787, + "grad_norm": 0.13081168074218927, + "learning_rate": 7.724200188844255e-07, + "loss": 0.506, + "step": 7072 + }, + { + "epoch": 3.496353973550859, + "grad_norm": 0.13968970833426253, + "learning_rate": 7.709200904447423e-07, + "loss": 0.5419, + "step": 7073 + }, + { + "epoch": 3.496848350018539, + "grad_norm": 0.1369333657044159, + "learning_rate": 7.694215613551059e-07, + "loss": 0.5222, + "step": 7074 + }, + { + "epoch": 3.4973427264862194, + "grad_norm": 0.13172326730253828, + "learning_rate": 7.679244318427303e-07, + "loss": 0.515, + "step": 7075 + }, + { + "epoch": 3.4978371029538993, + "grad_norm": 0.13713510673256485, + "learning_rate": 7.664287021346184e-07, + "loss": 0.5224, + "step": 7076 + }, + { + "epoch": 3.4983314794215796, + "grad_norm": 0.13999451205293667, + "learning_rate": 7.649343724575564e-07, + "loss": 0.5699, + "step": 7077 + }, + { + "epoch": 3.4988258558892595, + "grad_norm": 0.13429604988952107, + "learning_rate": 7.63441443038122e-07, + "loss": 0.5582, + "step": 7078 + }, + { + "epoch": 3.49932023235694, + "grad_norm": 0.13614348755687294, + "learning_rate": 7.619499141026787e-07, + "loss": 0.5539, + "step": 7079 + }, + { + "epoch": 3.49981460882462, + "grad_norm": 0.13435743379932993, + "learning_rate": 7.604597858773744e-07, + "loss": 0.5145, + "step": 7080 + }, + { + "epoch": 3.5003089852923, + "grad_norm": 0.1390000924426043, + "learning_rate": 7.58971058588156e-07, + "loss": 0.5207, + "step": 7081 + }, + { + "epoch": 3.50080336175998, + "grad_norm": 0.13019364831817556, + "learning_rate": 7.574837324607454e-07, + "loss": 0.5091, + "step": 7082 + }, + { + "epoch": 3.5012977382276604, + "grad_norm": 0.13370712693749634, + "learning_rate": 7.559978077206553e-07, + "loss": 0.5544, + "step": 7083 + }, + { + "epoch": 3.5017921146953404, + "grad_norm": 0.13267899062937846, + "learning_rate": 7.545132845931924e-07, + "loss": 0.5116, + "step": 7084 + }, + { + "epoch": 3.5017921146953404, + "eval_loss": 0.6415496468544006, + "eval_runtime": 81.7358, + "eval_samples_per_second": 371.367, + "eval_steps_per_second": 46.43, + "step": 7084 + }, + { + "epoch": 3.5022864911630207, + "grad_norm": 0.13478804133914438, + "learning_rate": 7.530301633034409e-07, + "loss": 0.4887, + "step": 7085 + }, + { + "epoch": 3.5027808676307006, + "grad_norm": 0.13669284708676213, + "learning_rate": 7.51548444076281e-07, + "loss": 0.5547, + "step": 7086 + }, + { + "epoch": 3.503275244098381, + "grad_norm": 0.136687888912503, + "learning_rate": 7.500681271363774e-07, + "loss": 0.5238, + "step": 7087 + }, + { + "epoch": 3.503769620566061, + "grad_norm": 0.1373143039222891, + "learning_rate": 7.485892127081795e-07, + "loss": 0.5576, + "step": 7088 + }, + { + "epoch": 3.5042639970337413, + "grad_norm": 0.1323783496616736, + "learning_rate": 7.471117010159268e-07, + "loss": 0.5015, + "step": 7089 + }, + { + "epoch": 3.5047583735014216, + "grad_norm": 0.1360406335501408, + "learning_rate": 7.456355922836467e-07, + "loss": 0.5411, + "step": 7090 + }, + { + "epoch": 3.5052527499691015, + "grad_norm": 0.12849512207181948, + "learning_rate": 7.441608867351502e-07, + "loss": 0.5212, + "step": 7091 + }, + { + "epoch": 3.5057471264367814, + "grad_norm": 0.13168508514822555, + "learning_rate": 7.426875845940395e-07, + "loss": 0.538, + "step": 7092 + }, + { + "epoch": 3.506241502904462, + "grad_norm": 0.13750357413218334, + "learning_rate": 7.41215686083705e-07, + "loss": 0.5636, + "step": 7093 + }, + { + "epoch": 3.506735879372142, + "grad_norm": 0.1341808484100001, + "learning_rate": 7.397451914273168e-07, + "loss": 0.5139, + "step": 7094 + }, + { + "epoch": 3.507230255839822, + "grad_norm": 0.1355773446480223, + "learning_rate": 7.382761008478423e-07, + "loss": 0.5379, + "step": 7095 + }, + { + "epoch": 3.507724632307502, + "grad_norm": 0.134989808473594, + "learning_rate": 7.368084145680276e-07, + "loss": 0.5238, + "step": 7096 + }, + { + "epoch": 3.5082190087751823, + "grad_norm": 0.13060924830757473, + "learning_rate": 7.353421328104094e-07, + "loss": 0.5385, + "step": 7097 + }, + { + "epoch": 3.5087133852428627, + "grad_norm": 0.13638135804203438, + "learning_rate": 7.33877255797314e-07, + "loss": 0.5643, + "step": 7098 + }, + { + "epoch": 3.5092077617105426, + "grad_norm": 0.13306608796031966, + "learning_rate": 7.324137837508494e-07, + "loss": 0.5238, + "step": 7099 + }, + { + "epoch": 3.5097021381782225, + "grad_norm": 0.13530210093254638, + "learning_rate": 7.309517168929115e-07, + "loss": 0.5398, + "step": 7100 + }, + { + "epoch": 3.510196514645903, + "grad_norm": 0.1315449097074158, + "learning_rate": 7.294910554451895e-07, + "loss": 0.5309, + "step": 7101 + }, + { + "epoch": 3.5106908911135832, + "grad_norm": 0.1332590225601099, + "learning_rate": 7.280317996291497e-07, + "loss": 0.5375, + "step": 7102 + }, + { + "epoch": 3.511185267581263, + "grad_norm": 0.12995505428933818, + "learning_rate": 7.265739496660529e-07, + "loss": 0.5113, + "step": 7103 + }, + { + "epoch": 3.511679644048943, + "grad_norm": 0.1386742208072461, + "learning_rate": 7.251175057769455e-07, + "loss": 0.5432, + "step": 7104 + }, + { + "epoch": 3.5121740205166234, + "grad_norm": 0.12837142246392189, + "learning_rate": 7.236624681826576e-07, + "loss": 0.511, + "step": 7105 + }, + { + "epoch": 3.5126683969843038, + "grad_norm": 0.1339662942281634, + "learning_rate": 7.22208837103806e-07, + "loss": 0.5015, + "step": 7106 + }, + { + "epoch": 3.5131627734519837, + "grad_norm": 0.13522837198549667, + "learning_rate": 7.20756612760799e-07, + "loss": 0.5259, + "step": 7107 + }, + { + "epoch": 3.5136571499196636, + "grad_norm": 0.13600029578776338, + "learning_rate": 7.19305795373828e-07, + "loss": 0.5214, + "step": 7108 + }, + { + "epoch": 3.514151526387344, + "grad_norm": 0.13102362207744914, + "learning_rate": 7.178563851628717e-07, + "loss": 0.5215, + "step": 7109 + }, + { + "epoch": 3.5146459028550243, + "grad_norm": 0.1328809305606353, + "learning_rate": 7.164083823476953e-07, + "loss": 0.4885, + "step": 7110 + }, + { + "epoch": 3.5151402793227042, + "grad_norm": 0.13771483302075355, + "learning_rate": 7.149617871478498e-07, + "loss": 0.5545, + "step": 7111 + }, + { + "epoch": 3.515634655790384, + "grad_norm": 0.13450458846182486, + "learning_rate": 7.135165997826754e-07, + "loss": 0.53, + "step": 7112 + }, + { + "epoch": 3.5161290322580645, + "grad_norm": 0.13241791531967492, + "learning_rate": 7.120728204712957e-07, + "loss": 0.5017, + "step": 7113 + }, + { + "epoch": 3.516623408725745, + "grad_norm": 0.13043247943715558, + "learning_rate": 7.106304494326243e-07, + "loss": 0.5187, + "step": 7114 + }, + { + "epoch": 3.5171177851934248, + "grad_norm": 0.13613553003203954, + "learning_rate": 7.091894868853566e-07, + "loss": 0.5425, + "step": 7115 + }, + { + "epoch": 3.5176121616611047, + "grad_norm": 0.13823831496040784, + "learning_rate": 7.077499330479797e-07, + "loss": 0.5228, + "step": 7116 + }, + { + "epoch": 3.518106538128785, + "grad_norm": 0.13618499572667522, + "learning_rate": 7.063117881387626e-07, + "loss": 0.5092, + "step": 7117 + }, + { + "epoch": 3.5186009145964654, + "grad_norm": 0.13382797297781046, + "learning_rate": 7.04875052375763e-07, + "loss": 0.5357, + "step": 7118 + }, + { + "epoch": 3.5190952910641453, + "grad_norm": 0.13745047992943485, + "learning_rate": 7.034397259768267e-07, + "loss": 0.4976, + "step": 7119 + }, + { + "epoch": 3.5195896675318252, + "grad_norm": 0.13398094408743907, + "learning_rate": 7.020058091595794e-07, + "loss": 0.5387, + "step": 7120 + }, + { + "epoch": 3.5200840439995056, + "grad_norm": 0.13705725436437335, + "learning_rate": 7.005733021414418e-07, + "loss": 0.5374, + "step": 7121 + }, + { + "epoch": 3.520578420467186, + "grad_norm": 0.13918408204116567, + "learning_rate": 6.991422051396146e-07, + "loss": 0.5263, + "step": 7122 + }, + { + "epoch": 3.521072796934866, + "grad_norm": 0.13621203964362857, + "learning_rate": 6.977125183710842e-07, + "loss": 0.5369, + "step": 7123 + }, + { + "epoch": 3.5215671734025458, + "grad_norm": 0.12861861561733975, + "learning_rate": 6.962842420526272e-07, + "loss": 0.4964, + "step": 7124 + }, + { + "epoch": 3.522061549870226, + "grad_norm": 0.13570772715306897, + "learning_rate": 6.94857376400806e-07, + "loss": 0.5827, + "step": 7125 + }, + { + "epoch": 3.5225559263379065, + "grad_norm": 0.13437023526611652, + "learning_rate": 6.934319216319651e-07, + "loss": 0.5025, + "step": 7126 + }, + { + "epoch": 3.5230503028055864, + "grad_norm": 0.13654251863127995, + "learning_rate": 6.920078779622407e-07, + "loss": 0.5309, + "step": 7127 + }, + { + "epoch": 3.5235446792732668, + "grad_norm": 0.14077177562072693, + "learning_rate": 6.905852456075513e-07, + "loss": 0.5458, + "step": 7128 + }, + { + "epoch": 3.5240390557409467, + "grad_norm": 0.1350865749265204, + "learning_rate": 6.891640247835963e-07, + "loss": 0.5144, + "step": 7129 + }, + { + "epoch": 3.524533432208627, + "grad_norm": 0.13442078076739028, + "learning_rate": 6.877442157058755e-07, + "loss": 0.5099, + "step": 7130 + }, + { + "epoch": 3.525027808676307, + "grad_norm": 0.13348553817256179, + "learning_rate": 6.863258185896627e-07, + "loss": 0.5264, + "step": 7131 + }, + { + "epoch": 3.5255221851439873, + "grad_norm": 0.13437700350735177, + "learning_rate": 6.849088336500176e-07, + "loss": 0.5174, + "step": 7132 + }, + { + "epoch": 3.526016561611667, + "grad_norm": 0.13601263969829522, + "learning_rate": 6.834932611017953e-07, + "loss": 0.5562, + "step": 7133 + }, + { + "epoch": 3.5265109380793476, + "grad_norm": 0.13505790963296813, + "learning_rate": 6.82079101159624e-07, + "loss": 0.5127, + "step": 7134 + }, + { + "epoch": 3.5270053145470275, + "grad_norm": 0.1320801231351672, + "learning_rate": 6.806663540379288e-07, + "loss": 0.5189, + "step": 7135 + }, + { + "epoch": 3.527499691014708, + "grad_norm": 0.13757312066880473, + "learning_rate": 6.792550199509162e-07, + "loss": 0.5248, + "step": 7136 + }, + { + "epoch": 3.5279940674823878, + "grad_norm": 0.13565681589346512, + "learning_rate": 6.778450991125762e-07, + "loss": 0.5239, + "step": 7137 + }, + { + "epoch": 3.528488443950068, + "grad_norm": 0.12872286615775833, + "learning_rate": 6.764365917366866e-07, + "loss": 0.5214, + "step": 7138 + }, + { + "epoch": 3.528982820417748, + "grad_norm": 0.1361175831262487, + "learning_rate": 6.750294980368133e-07, + "loss": 0.5183, + "step": 7139 + }, + { + "epoch": 3.5294771968854284, + "grad_norm": 0.13383757529021123, + "learning_rate": 6.736238182263022e-07, + "loss": 0.5143, + "step": 7140 + }, + { + "epoch": 3.5299715733531083, + "grad_norm": 0.13240592393958178, + "learning_rate": 6.722195525182895e-07, + "loss": 0.5099, + "step": 7141 + }, + { + "epoch": 3.5304659498207887, + "grad_norm": 0.13048728518471145, + "learning_rate": 6.708167011256972e-07, + "loss": 0.4852, + "step": 7142 + }, + { + "epoch": 3.5309603262884686, + "grad_norm": 0.13437471544088547, + "learning_rate": 6.694152642612295e-07, + "loss": 0.5473, + "step": 7143 + }, + { + "epoch": 3.531454702756149, + "grad_norm": 0.13545969636395205, + "learning_rate": 6.680152421373798e-07, + "loss": 0.5622, + "step": 7144 + }, + { + "epoch": 3.531949079223829, + "grad_norm": 0.13053720510769198, + "learning_rate": 6.666166349664227e-07, + "loss": 0.5071, + "step": 7145 + }, + { + "epoch": 3.532443455691509, + "grad_norm": 0.13322011759788943, + "learning_rate": 6.652194429604186e-07, + "loss": 0.5262, + "step": 7146 + }, + { + "epoch": 3.532937832159189, + "grad_norm": 0.13493528038903846, + "learning_rate": 6.638236663312214e-07, + "loss": 0.5262, + "step": 7147 + }, + { + "epoch": 3.5334322086268695, + "grad_norm": 0.13748144450329697, + "learning_rate": 6.624293052904618e-07, + "loss": 0.5481, + "step": 7148 + }, + { + "epoch": 3.5339265850945494, + "grad_norm": 0.13973453084494408, + "learning_rate": 6.610363600495551e-07, + "loss": 0.5614, + "step": 7149 + }, + { + "epoch": 3.5344209615622297, + "grad_norm": 0.1341430555995607, + "learning_rate": 6.596448308197112e-07, + "loss": 0.5333, + "step": 7150 + }, + { + "epoch": 3.5349153380299096, + "grad_norm": 0.13238742311810883, + "learning_rate": 6.582547178119125e-07, + "loss": 0.4907, + "step": 7151 + }, + { + "epoch": 3.53540971449759, + "grad_norm": 0.13254422554738676, + "learning_rate": 6.568660212369382e-07, + "loss": 0.5448, + "step": 7152 + }, + { + "epoch": 3.53590409096527, + "grad_norm": 0.1376886887926367, + "learning_rate": 6.554787413053487e-07, + "loss": 0.5184, + "step": 7153 + }, + { + "epoch": 3.5363984674329503, + "grad_norm": 0.13283809111705353, + "learning_rate": 6.540928782274869e-07, + "loss": 0.5454, + "step": 7154 + }, + { + "epoch": 3.53689284390063, + "grad_norm": 0.1308357884123498, + "learning_rate": 6.527084322134813e-07, + "loss": 0.5178, + "step": 7155 + }, + { + "epoch": 3.5373872203683105, + "grad_norm": 0.12865238879686494, + "learning_rate": 6.513254034732508e-07, + "loss": 0.5157, + "step": 7156 + }, + { + "epoch": 3.5378815968359905, + "grad_norm": 0.1317026174407076, + "learning_rate": 6.499437922164919e-07, + "loss": 0.5202, + "step": 7157 + }, + { + "epoch": 3.538375973303671, + "grad_norm": 0.13006304361291457, + "learning_rate": 6.485635986526928e-07, + "loss": 0.5067, + "step": 7158 + }, + { + "epoch": 3.5388703497713507, + "grad_norm": 0.13272466497482113, + "learning_rate": 6.471848229911238e-07, + "loss": 0.5241, + "step": 7159 + }, + { + "epoch": 3.539364726239031, + "grad_norm": 0.13178039935241345, + "learning_rate": 6.458074654408397e-07, + "loss": 0.5238, + "step": 7160 + }, + { + "epoch": 3.539859102706711, + "grad_norm": 0.14077864199825202, + "learning_rate": 6.444315262106815e-07, + "loss": 0.5617, + "step": 7161 + }, + { + "epoch": 3.5403534791743914, + "grad_norm": 0.13345894048214513, + "learning_rate": 6.430570055092733e-07, + "loss": 0.5318, + "step": 7162 + }, + { + "epoch": 3.5408478556420713, + "grad_norm": 0.1277169240854265, + "learning_rate": 6.416839035450273e-07, + "loss": 0.5157, + "step": 7163 + }, + { + "epoch": 3.5413422321097516, + "grad_norm": 0.13529734282179257, + "learning_rate": 6.40312220526137e-07, + "loss": 0.5043, + "step": 7164 + }, + { + "epoch": 3.541836608577432, + "grad_norm": 0.12980107798944263, + "learning_rate": 6.389419566605837e-07, + "loss": 0.5069, + "step": 7165 + }, + { + "epoch": 3.542330985045112, + "grad_norm": 0.13315213050188973, + "learning_rate": 6.375731121561313e-07, + "loss": 0.5193, + "step": 7166 + }, + { + "epoch": 3.542825361512792, + "grad_norm": 0.1303028534651801, + "learning_rate": 6.362056872203303e-07, + "loss": 0.5238, + "step": 7167 + }, + { + "epoch": 3.543319737980472, + "grad_norm": 0.13512666123340922, + "learning_rate": 6.348396820605152e-07, + "loss": 0.536, + "step": 7168 + }, + { + "epoch": 3.5438141144481525, + "grad_norm": 0.1351175880704852, + "learning_rate": 6.334750968838044e-07, + "loss": 0.5215, + "step": 7169 + }, + { + "epoch": 3.5443084909158324, + "grad_norm": 0.13497410777120078, + "learning_rate": 6.321119318971025e-07, + "loss": 0.5495, + "step": 7170 + }, + { + "epoch": 3.5448028673835124, + "grad_norm": 0.13774752518187036, + "learning_rate": 6.307501873070987e-07, + "loss": 0.5656, + "step": 7171 + }, + { + "epoch": 3.5452972438511927, + "grad_norm": 0.13623554196193158, + "learning_rate": 6.293898633202633e-07, + "loss": 0.5417, + "step": 7172 + }, + { + "epoch": 3.545791620318873, + "grad_norm": 0.1351775475639896, + "learning_rate": 6.280309601428569e-07, + "loss": 0.5434, + "step": 7173 + }, + { + "epoch": 3.546285996786553, + "grad_norm": 0.13362031520227566, + "learning_rate": 6.266734779809213e-07, + "loss": 0.5236, + "step": 7174 + }, + { + "epoch": 3.546780373254233, + "grad_norm": 0.1339896332701717, + "learning_rate": 6.253174170402821e-07, + "loss": 0.5197, + "step": 7175 + }, + { + "epoch": 3.5472747497219133, + "grad_norm": 0.13709805460315072, + "learning_rate": 6.239627775265523e-07, + "loss": 0.532, + "step": 7176 + }, + { + "epoch": 3.5477691261895936, + "grad_norm": 0.13401813038547813, + "learning_rate": 6.226095596451276e-07, + "loss": 0.5278, + "step": 7177 + }, + { + "epoch": 3.5482635026572735, + "grad_norm": 0.13248465653604594, + "learning_rate": 6.212577636011852e-07, + "loss": 0.527, + "step": 7178 + }, + { + "epoch": 3.5487578791249534, + "grad_norm": 0.13318046818442417, + "learning_rate": 6.199073895996944e-07, + "loss": 0.5202, + "step": 7179 + }, + { + "epoch": 3.549252255592634, + "grad_norm": 0.1352492018823223, + "learning_rate": 6.185584378454035e-07, + "loss": 0.5156, + "step": 7180 + }, + { + "epoch": 3.549746632060314, + "grad_norm": 0.13068175372505514, + "learning_rate": 6.172109085428424e-07, + "loss": 0.52, + "step": 7181 + }, + { + "epoch": 3.550241008527994, + "grad_norm": 0.13112734778061005, + "learning_rate": 6.158648018963331e-07, + "loss": 0.5244, + "step": 7182 + }, + { + "epoch": 3.550735384995674, + "grad_norm": 0.13853605439749272, + "learning_rate": 6.145201181099736e-07, + "loss": 0.5484, + "step": 7183 + }, + { + "epoch": 3.5512297614633543, + "grad_norm": 0.14331798405106166, + "learning_rate": 6.13176857387654e-07, + "loss": 0.5285, + "step": 7184 + }, + { + "epoch": 3.5517241379310347, + "grad_norm": 0.1349243018014402, + "learning_rate": 6.118350199330436e-07, + "loss": 0.5184, + "step": 7185 + }, + { + "epoch": 3.5522185143987146, + "grad_norm": 0.1417231312116388, + "learning_rate": 6.104946059495953e-07, + "loss": 0.5342, + "step": 7186 + }, + { + "epoch": 3.5527128908663945, + "grad_norm": 0.13725338187574057, + "learning_rate": 6.09155615640552e-07, + "loss": 0.5465, + "step": 7187 + }, + { + "epoch": 3.553207267334075, + "grad_norm": 0.13852313368268881, + "learning_rate": 6.078180492089337e-07, + "loss": 0.5284, + "step": 7188 + }, + { + "epoch": 3.5537016438017552, + "grad_norm": 0.12966421065163125, + "learning_rate": 6.064819068575478e-07, + "loss": 0.4921, + "step": 7189 + }, + { + "epoch": 3.554196020269435, + "grad_norm": 0.1359164056006188, + "learning_rate": 6.051471887889848e-07, + "loss": 0.5594, + "step": 7190 + }, + { + "epoch": 3.554690396737115, + "grad_norm": 0.13561825536486286, + "learning_rate": 6.038138952056239e-07, + "loss": 0.5541, + "step": 7191 + }, + { + "epoch": 3.5551847732047954, + "grad_norm": 0.13741768054066827, + "learning_rate": 6.024820263096198e-07, + "loss": 0.5479, + "step": 7192 + }, + { + "epoch": 3.555679149672476, + "grad_norm": 0.12962127385889824, + "learning_rate": 6.011515823029191e-07, + "loss": 0.5114, + "step": 7193 + }, + { + "epoch": 3.5561735261401557, + "grad_norm": 0.13475862240154082, + "learning_rate": 5.998225633872479e-07, + "loss": 0.5597, + "step": 7194 + }, + { + "epoch": 3.5566679026078356, + "grad_norm": 0.14011924160454836, + "learning_rate": 5.984949697641141e-07, + "loss": 0.5251, + "step": 7195 + }, + { + "epoch": 3.557162279075516, + "grad_norm": 0.13276645510676696, + "learning_rate": 5.971688016348187e-07, + "loss": 0.5094, + "step": 7196 + }, + { + "epoch": 3.5576566555431963, + "grad_norm": 0.1383049863275982, + "learning_rate": 5.958440592004378e-07, + "loss": 0.5582, + "step": 7197 + }, + { + "epoch": 3.5581510320108762, + "grad_norm": 0.1353442099353418, + "learning_rate": 5.945207426618326e-07, + "loss": 0.5549, + "step": 7198 + }, + { + "epoch": 3.558645408478556, + "grad_norm": 0.13313837839587891, + "learning_rate": 5.93198852219653e-07, + "loss": 0.5216, + "step": 7199 + }, + { + "epoch": 3.5591397849462365, + "grad_norm": 0.12912178398151972, + "learning_rate": 5.918783880743251e-07, + "loss": 0.5048, + "step": 7200 + }, + { + "epoch": 3.559634161413917, + "grad_norm": 0.1398839813645848, + "learning_rate": 5.905593504260665e-07, + "loss": 0.5401, + "step": 7201 + }, + { + "epoch": 3.5601285378815968, + "grad_norm": 0.13057760420451095, + "learning_rate": 5.892417394748751e-07, + "loss": 0.5332, + "step": 7202 + }, + { + "epoch": 3.560622914349277, + "grad_norm": 0.13351938082456732, + "learning_rate": 5.879255554205299e-07, + "loss": 0.569, + "step": 7203 + }, + { + "epoch": 3.561117290816957, + "grad_norm": 0.13549758491515615, + "learning_rate": 5.866107984625968e-07, + "loss": 0.5225, + "step": 7204 + }, + { + "epoch": 3.5616116672846374, + "grad_norm": 0.13097816684636399, + "learning_rate": 5.852974688004265e-07, + "loss": 0.5055, + "step": 7205 + }, + { + "epoch": 3.5621060437523173, + "grad_norm": 0.13155522479211643, + "learning_rate": 5.839855666331473e-07, + "loss": 0.5036, + "step": 7206 + }, + { + "epoch": 3.5626004202199977, + "grad_norm": 0.1342627894002623, + "learning_rate": 5.82675092159678e-07, + "loss": 0.5233, + "step": 7207 + }, + { + "epoch": 3.5630947966876776, + "grad_norm": 0.1367829781804646, + "learning_rate": 5.813660455787185e-07, + "loss": 0.5516, + "step": 7208 + }, + { + "epoch": 3.563589173155358, + "grad_norm": 0.138059293628406, + "learning_rate": 5.800584270887499e-07, + "loss": 0.5335, + "step": 7209 + }, + { + "epoch": 3.564083549623038, + "grad_norm": 0.13650880602346746, + "learning_rate": 5.787522368880394e-07, + "loss": 0.5777, + "step": 7210 + }, + { + "epoch": 3.564577926090718, + "grad_norm": 0.14058427151107772, + "learning_rate": 5.774474751746362e-07, + "loss": 0.5113, + "step": 7211 + }, + { + "epoch": 3.565072302558398, + "grad_norm": 0.13157590680926684, + "learning_rate": 5.761441421463754e-07, + "loss": 0.5244, + "step": 7212 + }, + { + "epoch": 3.5655666790260785, + "grad_norm": 0.14212423409510172, + "learning_rate": 5.7484223800087e-07, + "loss": 0.5663, + "step": 7213 + }, + { + "epoch": 3.5660610554937584, + "grad_norm": 0.13204590844499905, + "learning_rate": 5.735417629355245e-07, + "loss": 0.5042, + "step": 7214 + }, + { + "epoch": 3.5665554319614388, + "grad_norm": 0.13374278378190574, + "learning_rate": 5.722427171475175e-07, + "loss": 0.5316, + "step": 7215 + }, + { + "epoch": 3.5670498084291187, + "grad_norm": 0.13395885279342534, + "learning_rate": 5.709451008338196e-07, + "loss": 0.5169, + "step": 7216 + }, + { + "epoch": 3.567544184896799, + "grad_norm": 0.1356906146461268, + "learning_rate": 5.696489141911765e-07, + "loss": 0.5132, + "step": 7217 + }, + { + "epoch": 3.568038561364479, + "grad_norm": 0.1305554069777055, + "learning_rate": 5.683541574161244e-07, + "loss": 0.5162, + "step": 7218 + }, + { + "epoch": 3.5685329378321593, + "grad_norm": 0.13409523220445901, + "learning_rate": 5.670608307049786e-07, + "loss": 0.5422, + "step": 7219 + }, + { + "epoch": 3.569027314299839, + "grad_norm": 0.14084291498880264, + "learning_rate": 5.657689342538397e-07, + "loss": 0.5157, + "step": 7220 + }, + { + "epoch": 3.5695216907675196, + "grad_norm": 0.1320355170587652, + "learning_rate": 5.644784682585869e-07, + "loss": 0.5183, + "step": 7221 + }, + { + "epoch": 3.5700160672351995, + "grad_norm": 0.13139575237970672, + "learning_rate": 5.63189432914888e-07, + "loss": 0.5519, + "step": 7222 + }, + { + "epoch": 3.57051044370288, + "grad_norm": 0.135093271585416, + "learning_rate": 5.619018284181921e-07, + "loss": 0.5105, + "step": 7223 + }, + { + "epoch": 3.5710048201705598, + "grad_norm": 0.13312416049917064, + "learning_rate": 5.606156549637299e-07, + "loss": 0.5367, + "step": 7224 + }, + { + "epoch": 3.57149919663824, + "grad_norm": 0.13617612737295304, + "learning_rate": 5.593309127465174e-07, + "loss": 0.5166, + "step": 7225 + }, + { + "epoch": 3.57199357310592, + "grad_norm": 0.1343854324894382, + "learning_rate": 5.580476019613512e-07, + "loss": 0.5109, + "step": 7226 + }, + { + "epoch": 3.5724879495736004, + "grad_norm": 0.13053945737693676, + "learning_rate": 5.567657228028123e-07, + "loss": 0.5143, + "step": 7227 + }, + { + "epoch": 3.5729823260412803, + "grad_norm": 0.13252676917876635, + "learning_rate": 5.554852754652629e-07, + "loss": 0.5313, + "step": 7228 + }, + { + "epoch": 3.5734767025089607, + "grad_norm": 0.1323027859410791, + "learning_rate": 5.542062601428532e-07, + "loss": 0.5091, + "step": 7229 + }, + { + "epoch": 3.5739710789766406, + "grad_norm": 0.13126786111054298, + "learning_rate": 5.529286770295094e-07, + "loss": 0.5654, + "step": 7230 + }, + { + "epoch": 3.574465455444321, + "grad_norm": 0.13304665642395536, + "learning_rate": 5.516525263189465e-07, + "loss": 0.5459, + "step": 7231 + }, + { + "epoch": 3.574959831912001, + "grad_norm": 0.1357440868739228, + "learning_rate": 5.503778082046562e-07, + "loss": 0.535, + "step": 7232 + }, + { + "epoch": 3.575454208379681, + "grad_norm": 0.13835790554354255, + "learning_rate": 5.491045228799175e-07, + "loss": 0.5362, + "step": 7233 + }, + { + "epoch": 3.575948584847361, + "grad_norm": 0.14210160906561672, + "learning_rate": 5.478326705377934e-07, + "loss": 0.5813, + "step": 7234 + }, + { + "epoch": 3.5764429613150415, + "grad_norm": 0.13576147893497215, + "learning_rate": 5.465622513711244e-07, + "loss": 0.5235, + "step": 7235 + }, + { + "epoch": 3.5769373377827214, + "grad_norm": 0.13504787555669864, + "learning_rate": 5.452932655725396e-07, + "loss": 0.5329, + "step": 7236 + }, + { + "epoch": 3.5774317142504017, + "grad_norm": 0.1396395658258203, + "learning_rate": 5.440257133344451e-07, + "loss": 0.5107, + "step": 7237 + }, + { + "epoch": 3.5779260907180817, + "grad_norm": 0.1353695648890502, + "learning_rate": 5.427595948490316e-07, + "loss": 0.5033, + "step": 7238 + }, + { + "epoch": 3.578420467185762, + "grad_norm": 0.13650745346335286, + "learning_rate": 5.414949103082734e-07, + "loss": 0.5219, + "step": 7239 + }, + { + "epoch": 3.5789148436534424, + "grad_norm": 0.13760053990782323, + "learning_rate": 5.402316599039304e-07, + "loss": 0.5305, + "step": 7240 + }, + { + "epoch": 3.5794092201211223, + "grad_norm": 0.13995212310050098, + "learning_rate": 5.389698438275382e-07, + "loss": 0.5543, + "step": 7241 + }, + { + "epoch": 3.579903596588802, + "grad_norm": 0.13858596282167607, + "learning_rate": 5.377094622704204e-07, + "loss": 0.5362, + "step": 7242 + }, + { + "epoch": 3.5803979730564826, + "grad_norm": 0.13795861105588925, + "learning_rate": 5.364505154236799e-07, + "loss": 0.5238, + "step": 7243 + }, + { + "epoch": 3.580892349524163, + "grad_norm": 0.13848718914595495, + "learning_rate": 5.351930034782005e-07, + "loss": 0.5009, + "step": 7244 + }, + { + "epoch": 3.581386725991843, + "grad_norm": 0.13067367585229325, + "learning_rate": 5.339369266246575e-07, + "loss": 0.5151, + "step": 7245 + }, + { + "epoch": 3.5818811024595227, + "grad_norm": 0.1315400708783037, + "learning_rate": 5.326822850534985e-07, + "loss": 0.4949, + "step": 7246 + }, + { + "epoch": 3.582375478927203, + "grad_norm": 0.1352811208723691, + "learning_rate": 5.314290789549569e-07, + "loss": 0.5569, + "step": 7247 + }, + { + "epoch": 3.5828698553948835, + "grad_norm": 0.1358975480554276, + "learning_rate": 5.30177308519051e-07, + "loss": 0.5211, + "step": 7248 + }, + { + "epoch": 3.5833642318625634, + "grad_norm": 0.13283812280020255, + "learning_rate": 5.289269739355774e-07, + "loss": 0.5233, + "step": 7249 + }, + { + "epoch": 3.5838586083302433, + "grad_norm": 0.13549645482507747, + "learning_rate": 5.276780753941158e-07, + "loss": 0.5344, + "step": 7250 + }, + { + "epoch": 3.5843529847979236, + "grad_norm": 0.13552458294159433, + "learning_rate": 5.264306130840335e-07, + "loss": 0.5164, + "step": 7251 + }, + { + "epoch": 3.584847361265604, + "grad_norm": 0.1273798103223577, + "learning_rate": 5.251845871944727e-07, + "loss": 0.5196, + "step": 7252 + }, + { + "epoch": 3.585341737733284, + "grad_norm": 0.1347018238289667, + "learning_rate": 5.2393999791436e-07, + "loss": 0.5446, + "step": 7253 + }, + { + "epoch": 3.585836114200964, + "grad_norm": 0.12810627407106057, + "learning_rate": 5.226968454324066e-07, + "loss": 0.5051, + "step": 7254 + }, + { + "epoch": 3.586330490668644, + "grad_norm": 0.13359475057565795, + "learning_rate": 5.214551299371029e-07, + "loss": 0.5276, + "step": 7255 + }, + { + "epoch": 3.5868248671363245, + "grad_norm": 0.1395501447884612, + "learning_rate": 5.202148516167238e-07, + "loss": 0.5304, + "step": 7256 + }, + { + "epoch": 3.5873192436040044, + "grad_norm": 0.1334246725427225, + "learning_rate": 5.189760106593267e-07, + "loss": 0.5274, + "step": 7257 + }, + { + "epoch": 3.5878136200716844, + "grad_norm": 0.13534716017496573, + "learning_rate": 5.177386072527468e-07, + "loss": 0.5167, + "step": 7258 + }, + { + "epoch": 3.5883079965393647, + "grad_norm": 0.14389619265742226, + "learning_rate": 5.165026415846075e-07, + "loss": 0.5722, + "step": 7259 + }, + { + "epoch": 3.588802373007045, + "grad_norm": 0.1356244797815738, + "learning_rate": 5.152681138423065e-07, + "loss": 0.5491, + "step": 7260 + }, + { + "epoch": 3.589296749474725, + "grad_norm": 0.13236109825245668, + "learning_rate": 5.140350242130299e-07, + "loss": 0.5073, + "step": 7261 + }, + { + "epoch": 3.589791125942405, + "grad_norm": 0.1349965660199099, + "learning_rate": 5.128033728837456e-07, + "loss": 0.5624, + "step": 7262 + }, + { + "epoch": 3.5902855024100853, + "grad_norm": 0.13837709616009797, + "learning_rate": 5.115731600411999e-07, + "loss": 0.5256, + "step": 7263 + }, + { + "epoch": 3.5907798788777656, + "grad_norm": 0.12637300146692645, + "learning_rate": 5.103443858719215e-07, + "loss": 0.503, + "step": 7264 + }, + { + "epoch": 3.5912742553454455, + "grad_norm": 0.13826551726799294, + "learning_rate": 5.091170505622245e-07, + "loss": 0.5442, + "step": 7265 + }, + { + "epoch": 3.5917686318131254, + "grad_norm": 0.13549193530349105, + "learning_rate": 5.07891154298199e-07, + "loss": 0.5418, + "step": 7266 + }, + { + "epoch": 3.592263008280806, + "grad_norm": 0.139553516473668, + "learning_rate": 5.066666972657231e-07, + "loss": 0.5586, + "step": 7267 + }, + { + "epoch": 3.592757384748486, + "grad_norm": 0.13246522587144074, + "learning_rate": 5.054436796504536e-07, + "loss": 0.5286, + "step": 7268 + }, + { + "epoch": 3.593251761216166, + "grad_norm": 0.13240040176741408, + "learning_rate": 5.042221016378301e-07, + "loss": 0.5489, + "step": 7269 + }, + { + "epoch": 3.593746137683846, + "grad_norm": 0.1347609793071556, + "learning_rate": 5.03001963413069e-07, + "loss": 0.5357, + "step": 7270 + }, + { + "epoch": 3.5942405141515263, + "grad_norm": 0.13425046676515515, + "learning_rate": 5.017832651611765e-07, + "loss": 0.5295, + "step": 7271 + }, + { + "epoch": 3.5947348906192067, + "grad_norm": 0.13778583895590235, + "learning_rate": 5.00566007066936e-07, + "loss": 0.5295, + "step": 7272 + }, + { + "epoch": 3.5952292670868866, + "grad_norm": 0.13580016331294803, + "learning_rate": 4.99350189314911e-07, + "loss": 0.5771, + "step": 7273 + }, + { + "epoch": 3.5957236435545665, + "grad_norm": 0.1357805349306018, + "learning_rate": 4.981358120894531e-07, + "loss": 0.5369, + "step": 7274 + }, + { + "epoch": 3.596218020022247, + "grad_norm": 0.13926416839115222, + "learning_rate": 4.969228755746847e-07, + "loss": 0.5348, + "step": 7275 + }, + { + "epoch": 3.5967123964899272, + "grad_norm": 0.13100272291621823, + "learning_rate": 4.957113799545221e-07, + "loss": 0.5225, + "step": 7276 + }, + { + "epoch": 3.597206772957607, + "grad_norm": 0.13572569825220637, + "learning_rate": 4.945013254126518e-07, + "loss": 0.5265, + "step": 7277 + }, + { + "epoch": 3.5977011494252875, + "grad_norm": 0.13700966882942037, + "learning_rate": 4.932927121325525e-07, + "loss": 0.5632, + "step": 7278 + }, + { + "epoch": 3.5981955258929674, + "grad_norm": 0.13592756965066133, + "learning_rate": 4.920855402974745e-07, + "loss": 0.518, + "step": 7279 + }, + { + "epoch": 3.598689902360648, + "grad_norm": 0.13115175402001597, + "learning_rate": 4.908798100904566e-07, + "loss": 0.5226, + "step": 7280 + }, + { + "epoch": 3.5991842788283277, + "grad_norm": 0.14350777815310956, + "learning_rate": 4.89675521694315e-07, + "loss": 0.5484, + "step": 7281 + }, + { + "epoch": 3.599678655296008, + "grad_norm": 0.13229646775160814, + "learning_rate": 4.884726752916491e-07, + "loss": 0.5339, + "step": 7282 + }, + { + "epoch": 3.600173031763688, + "grad_norm": 0.12901777627809186, + "learning_rate": 4.872712710648408e-07, + "loss": 0.5146, + "step": 7283 + }, + { + "epoch": 3.6006674082313683, + "grad_norm": 0.13074706037503342, + "learning_rate": 4.860713091960489e-07, + "loss": 0.5272, + "step": 7284 + }, + { + "epoch": 3.6011617846990482, + "grad_norm": 0.1355539542345548, + "learning_rate": 4.8487278986722e-07, + "loss": 0.5558, + "step": 7285 + }, + { + "epoch": 3.6016561611667286, + "grad_norm": 0.13380491349180856, + "learning_rate": 4.836757132600767e-07, + "loss": 0.5412, + "step": 7286 + }, + { + "epoch": 3.6021505376344085, + "grad_norm": 0.1296033469633657, + "learning_rate": 4.824800795561224e-07, + "loss": 0.4967, + "step": 7287 + }, + { + "epoch": 3.602644914102089, + "grad_norm": 0.13266761084092513, + "learning_rate": 4.812858889366456e-07, + "loss": 0.5509, + "step": 7288 + }, + { + "epoch": 3.603139290569769, + "grad_norm": 0.13550043046711585, + "learning_rate": 4.80093141582716e-07, + "loss": 0.5343, + "step": 7289 + }, + { + "epoch": 3.603633667037449, + "grad_norm": 0.1359412562576795, + "learning_rate": 4.789018376751808e-07, + "loss": 0.5192, + "step": 7290 + }, + { + "epoch": 3.604128043505129, + "grad_norm": 0.1342146355171454, + "learning_rate": 4.777119773946704e-07, + "loss": 0.5345, + "step": 7291 + }, + { + "epoch": 3.6046224199728094, + "grad_norm": 0.1367192132309017, + "learning_rate": 4.765235609215979e-07, + "loss": 0.5315, + "step": 7292 + }, + { + "epoch": 3.6051167964404893, + "grad_norm": 0.13031427241221802, + "learning_rate": 4.753365884361505e-07, + "loss": 0.5179, + "step": 7293 + }, + { + "epoch": 3.6056111729081697, + "grad_norm": 0.13712575205359892, + "learning_rate": 4.7415106011830856e-07, + "loss": 0.5526, + "step": 7294 + }, + { + "epoch": 3.6061055493758496, + "grad_norm": 0.13048549676707924, + "learning_rate": 4.7296697614782395e-07, + "loss": 0.511, + "step": 7295 + }, + { + "epoch": 3.60659992584353, + "grad_norm": 0.13610855765849128, + "learning_rate": 4.717843367042307e-07, + "loss": 0.5405, + "step": 7296 + }, + { + "epoch": 3.60709430231121, + "grad_norm": 0.1313000798440397, + "learning_rate": 4.7060314196684777e-07, + "loss": 0.5281, + "step": 7297 + }, + { + "epoch": 3.6075886787788902, + "grad_norm": 0.13498482306678197, + "learning_rate": 4.694233921147695e-07, + "loss": 0.5235, + "step": 7298 + }, + { + "epoch": 3.60808305524657, + "grad_norm": 0.13156669908522914, + "learning_rate": 4.6824508732687625e-07, + "loss": 0.5143, + "step": 7299 + }, + { + "epoch": 3.6085774317142505, + "grad_norm": 0.1344250345376678, + "learning_rate": 4.670682277818284e-07, + "loss": 0.5264, + "step": 7300 + }, + { + "epoch": 3.6090718081819304, + "grad_norm": 0.13142398880276435, + "learning_rate": 4.658928136580654e-07, + "loss": 0.5349, + "step": 7301 + }, + { + "epoch": 3.6095661846496108, + "grad_norm": 0.1338543656986121, + "learning_rate": 4.6471884513380584e-07, + "loss": 0.5541, + "step": 7302 + }, + { + "epoch": 3.6100605611172907, + "grad_norm": 0.13160665252903786, + "learning_rate": 4.635463223870562e-07, + "loss": 0.5376, + "step": 7303 + }, + { + "epoch": 3.610554937584971, + "grad_norm": 0.1387311992560665, + "learning_rate": 4.6237524559559323e-07, + "loss": 0.5296, + "step": 7304 + }, + { + "epoch": 3.611049314052651, + "grad_norm": 0.1346883728856192, + "learning_rate": 4.612056149369848e-07, + "loss": 0.506, + "step": 7305 + }, + { + "epoch": 3.6115436905203313, + "grad_norm": 0.13403776084060937, + "learning_rate": 4.600374305885735e-07, + "loss": 0.5405, + "step": 7306 + }, + { + "epoch": 3.612038066988011, + "grad_norm": 0.13339139853932172, + "learning_rate": 4.588706927274844e-07, + "loss": 0.5311, + "step": 7307 + }, + { + "epoch": 3.6125324434556916, + "grad_norm": 0.13528818095437206, + "learning_rate": 4.577054015306226e-07, + "loss": 0.5089, + "step": 7308 + }, + { + "epoch": 3.6130268199233715, + "grad_norm": 0.12979315604954367, + "learning_rate": 4.5654155717467563e-07, + "loss": 0.5128, + "step": 7309 + }, + { + "epoch": 3.613521196391052, + "grad_norm": 0.1406825260649367, + "learning_rate": 4.553791598361057e-07, + "loss": 0.544, + "step": 7310 + }, + { + "epoch": 3.6140155728587318, + "grad_norm": 0.13241577740308552, + "learning_rate": 4.5421820969116625e-07, + "loss": 0.5268, + "step": 7311 + }, + { + "epoch": 3.614509949326412, + "grad_norm": 0.13735135881233937, + "learning_rate": 4.5305870691588203e-07, + "loss": 0.4898, + "step": 7312 + }, + { + "epoch": 3.615004325794092, + "grad_norm": 0.1353310597779077, + "learning_rate": 4.519006516860613e-07, + "loss": 0.5462, + "step": 7313 + }, + { + "epoch": 3.6154987022617724, + "grad_norm": 0.13419332758961705, + "learning_rate": 4.5074404417729477e-07, + "loss": 0.5028, + "step": 7314 + }, + { + "epoch": 3.6159930787294527, + "grad_norm": 0.13089163490920838, + "learning_rate": 4.4958888456494876e-07, + "loss": 0.5195, + "step": 7315 + }, + { + "epoch": 3.6164874551971327, + "grad_norm": 0.13668446440085266, + "learning_rate": 4.484351730241754e-07, + "loss": 0.5168, + "step": 7316 + }, + { + "epoch": 3.6169818316648126, + "grad_norm": 0.1324359610486145, + "learning_rate": 4.4728290972990585e-07, + "loss": 0.5022, + "step": 7317 + }, + { + "epoch": 3.617476208132493, + "grad_norm": 0.13547683928499835, + "learning_rate": 4.461320948568504e-07, + "loss": 0.4996, + "step": 7318 + }, + { + "epoch": 3.6179705846001733, + "grad_norm": 0.13528037141983043, + "learning_rate": 4.449827285794972e-07, + "loss": 0.5119, + "step": 7319 + }, + { + "epoch": 3.618464961067853, + "grad_norm": 0.13072968062841267, + "learning_rate": 4.438348110721224e-07, + "loss": 0.5126, + "step": 7320 + }, + { + "epoch": 3.618959337535533, + "grad_norm": 0.1291995641164004, + "learning_rate": 4.426883425087747e-07, + "loss": 0.5229, + "step": 7321 + }, + { + "epoch": 3.6194537140032135, + "grad_norm": 0.13117392327746377, + "learning_rate": 4.415433230632871e-07, + "loss": 0.512, + "step": 7322 + }, + { + "epoch": 3.619948090470894, + "grad_norm": 0.1368567865525863, + "learning_rate": 4.4039975290927293e-07, + "loss": 0.5196, + "step": 7323 + }, + { + "epoch": 3.6204424669385737, + "grad_norm": 0.1448336332867672, + "learning_rate": 4.3925763222012255e-07, + "loss": 0.537, + "step": 7324 + }, + { + "epoch": 3.6209368434062537, + "grad_norm": 0.12595403354640683, + "learning_rate": 4.381169611690117e-07, + "loss": 0.5128, + "step": 7325 + }, + { + "epoch": 3.621431219873934, + "grad_norm": 0.13272164347429533, + "learning_rate": 4.3697773992889105e-07, + "loss": 0.5185, + "step": 7326 + }, + { + "epoch": 3.6219255963416144, + "grad_norm": 0.13009712941081333, + "learning_rate": 4.358399686724968e-07, + "loss": 0.5126, + "step": 7327 + }, + { + "epoch": 3.6224199728092943, + "grad_norm": 0.13229413071128426, + "learning_rate": 4.347036475723387e-07, + "loss": 0.5337, + "step": 7328 + }, + { + "epoch": 3.622914349276974, + "grad_norm": 0.13544558641068213, + "learning_rate": 4.3356877680071464e-07, + "loss": 0.5439, + "step": 7329 + }, + { + "epoch": 3.6234087257446546, + "grad_norm": 0.13838210372203064, + "learning_rate": 4.324353565296935e-07, + "loss": 0.5502, + "step": 7330 + }, + { + "epoch": 3.623903102212335, + "grad_norm": 0.13586391286634422, + "learning_rate": 4.3130338693113115e-07, + "loss": 0.527, + "step": 7331 + }, + { + "epoch": 3.624397478680015, + "grad_norm": 0.13720153446170621, + "learning_rate": 4.301728681766637e-07, + "loss": 0.5366, + "step": 7332 + }, + { + "epoch": 3.6248918551476947, + "grad_norm": 0.1364687723255388, + "learning_rate": 4.290438004377007e-07, + "loss": 0.5344, + "step": 7333 + }, + { + "epoch": 3.625386231615375, + "grad_norm": 0.13220618002029136, + "learning_rate": 4.2791618388543955e-07, + "loss": 0.5418, + "step": 7334 + }, + { + "epoch": 3.6258806080830555, + "grad_norm": 0.13169082747446123, + "learning_rate": 4.2679001869085244e-07, + "loss": 0.5105, + "step": 7335 + }, + { + "epoch": 3.6263749845507354, + "grad_norm": 0.13211084322388075, + "learning_rate": 4.2566530502469174e-07, + "loss": 0.5311, + "step": 7336 + }, + { + "epoch": 3.6268693610184153, + "grad_norm": 0.13008372695121176, + "learning_rate": 4.245420430574931e-07, + "loss": 0.4905, + "step": 7337 + }, + { + "epoch": 3.6273637374860956, + "grad_norm": 0.1294878410369218, + "learning_rate": 4.2342023295956933e-07, + "loss": 0.5374, + "step": 7338 + }, + { + "epoch": 3.627858113953776, + "grad_norm": 0.13907842937470885, + "learning_rate": 4.2229987490101323e-07, + "loss": 0.5325, + "step": 7339 + }, + { + "epoch": 3.628352490421456, + "grad_norm": 0.13025946664582413, + "learning_rate": 4.211809690516999e-07, + "loss": 0.49, + "step": 7340 + }, + { + "epoch": 3.628846866889136, + "grad_norm": 0.13726545096644954, + "learning_rate": 4.2006351558127933e-07, + "loss": 0.5276, + "step": 7341 + }, + { + "epoch": 3.629341243356816, + "grad_norm": 0.12932451920401053, + "learning_rate": 4.189475146591848e-07, + "loss": 0.5317, + "step": 7342 + }, + { + "epoch": 3.6298356198244965, + "grad_norm": 0.12977330114446536, + "learning_rate": 4.178329664546321e-07, + "loss": 0.5189, + "step": 7343 + }, + { + "epoch": 3.6303299962921765, + "grad_norm": 0.1348603983980516, + "learning_rate": 4.167198711366105e-07, + "loss": 0.525, + "step": 7344 + }, + { + "epoch": 3.6308243727598564, + "grad_norm": 0.13644523382867357, + "learning_rate": 4.1560822887389165e-07, + "loss": 0.5144, + "step": 7345 + }, + { + "epoch": 3.6313187492275367, + "grad_norm": 0.13155190184911789, + "learning_rate": 4.144980398350307e-07, + "loss": 0.5108, + "step": 7346 + }, + { + "epoch": 3.631813125695217, + "grad_norm": 0.13216645625777385, + "learning_rate": 4.133893041883541e-07, + "loss": 0.4943, + "step": 7347 + }, + { + "epoch": 3.632307502162897, + "grad_norm": 0.13027990801392367, + "learning_rate": 4.122820221019752e-07, + "loss": 0.539, + "step": 7348 + }, + { + "epoch": 3.632801878630577, + "grad_norm": 0.13150702313523752, + "learning_rate": 4.1117619374378637e-07, + "loss": 0.5425, + "step": 7349 + }, + { + "epoch": 3.6332962550982573, + "grad_norm": 0.13310024921190955, + "learning_rate": 4.1007181928145343e-07, + "loss": 0.5602, + "step": 7350 + }, + { + "epoch": 3.6337906315659376, + "grad_norm": 0.13272323801843855, + "learning_rate": 4.089688988824314e-07, + "loss": 0.5581, + "step": 7351 + }, + { + "epoch": 3.6342850080336175, + "grad_norm": 0.1338421246051216, + "learning_rate": 4.0786743271394534e-07, + "loss": 0.5473, + "step": 7352 + }, + { + "epoch": 3.634779384501298, + "grad_norm": 0.13242750212870424, + "learning_rate": 4.0676742094300506e-07, + "loss": 0.4968, + "step": 7353 + }, + { + "epoch": 3.635273760968978, + "grad_norm": 0.1355434792720414, + "learning_rate": 4.056688637363981e-07, + "loss": 0.5311, + "step": 7354 + }, + { + "epoch": 3.635768137436658, + "grad_norm": 0.1426720250166445, + "learning_rate": 4.0457176126069473e-07, + "loss": 0.5218, + "step": 7355 + }, + { + "epoch": 3.636262513904338, + "grad_norm": 0.13100612809403817, + "learning_rate": 4.034761136822385e-07, + "loss": 0.5211, + "step": 7356 + }, + { + "epoch": 3.6367568903720184, + "grad_norm": 0.131565435543843, + "learning_rate": 4.023819211671587e-07, + "loss": 0.5232, + "step": 7357 + }, + { + "epoch": 3.6372512668396983, + "grad_norm": 0.13424138720219264, + "learning_rate": 4.012891838813604e-07, + "loss": 0.5253, + "step": 7358 + }, + { + "epoch": 3.6377456433073787, + "grad_norm": 0.13320603910212822, + "learning_rate": 4.0019790199052663e-07, + "loss": 0.5017, + "step": 7359 + }, + { + "epoch": 3.6382400197750586, + "grad_norm": 0.13240900262848412, + "learning_rate": 3.991080756601251e-07, + "loss": 0.5375, + "step": 7360 + }, + { + "epoch": 3.638734396242739, + "grad_norm": 0.13643615699782785, + "learning_rate": 3.9801970505539913e-07, + "loss": 0.5427, + "step": 7361 + }, + { + "epoch": 3.639228772710419, + "grad_norm": 0.13757788110853614, + "learning_rate": 3.9693279034137e-07, + "loss": 0.5018, + "step": 7362 + }, + { + "epoch": 3.6397231491780992, + "grad_norm": 0.13234556612234674, + "learning_rate": 3.958473316828415e-07, + "loss": 0.5153, + "step": 7363 + }, + { + "epoch": 3.640217525645779, + "grad_norm": 0.1354354517689986, + "learning_rate": 3.947633292443942e-07, + "loss": 0.5314, + "step": 7364 + }, + { + "epoch": 3.6407119021134595, + "grad_norm": 0.12914155564400745, + "learning_rate": 3.936807831903888e-07, + "loss": 0.5118, + "step": 7365 + }, + { + "epoch": 3.6412062785811394, + "grad_norm": 0.13319564532309863, + "learning_rate": 3.9259969368496855e-07, + "loss": 0.5287, + "step": 7366 + }, + { + "epoch": 3.64170065504882, + "grad_norm": 0.13571950206494376, + "learning_rate": 3.9152006089204884e-07, + "loss": 0.5615, + "step": 7367 + }, + { + "epoch": 3.6421950315164997, + "grad_norm": 0.13220690166906007, + "learning_rate": 3.9044188497532773e-07, + "loss": 0.5231, + "step": 7368 + }, + { + "epoch": 3.64268940798418, + "grad_norm": 0.13270154219527366, + "learning_rate": 3.8936516609828556e-07, + "loss": 0.5287, + "step": 7369 + }, + { + "epoch": 3.64318378445186, + "grad_norm": 0.133799714370867, + "learning_rate": 3.882899044241761e-07, + "loss": 0.5308, + "step": 7370 + }, + { + "epoch": 3.6436781609195403, + "grad_norm": 0.1398002140438045, + "learning_rate": 3.872161001160357e-07, + "loss": 0.5401, + "step": 7371 + }, + { + "epoch": 3.6441725373872202, + "grad_norm": 0.13446076172443902, + "learning_rate": 3.8614375333667966e-07, + "loss": 0.4944, + "step": 7372 + }, + { + "epoch": 3.6446669138549006, + "grad_norm": 0.1362104156201344, + "learning_rate": 3.8507286424870005e-07, + "loss": 0.5504, + "step": 7373 + }, + { + "epoch": 3.6451612903225805, + "grad_norm": 0.12949438680550138, + "learning_rate": 3.840034330144715e-07, + "loss": 0.5244, + "step": 7374 + }, + { + "epoch": 3.645655666790261, + "grad_norm": 0.6324984318690214, + "learning_rate": 3.829354597961421e-07, + "loss": 0.5095, + "step": 7375 + }, + { + "epoch": 3.646150043257941, + "grad_norm": 0.133013971844608, + "learning_rate": 3.818689447556456e-07, + "loss": 0.5069, + "step": 7376 + }, + { + "epoch": 3.646644419725621, + "grad_norm": 0.13194949285317387, + "learning_rate": 3.8080388805468936e-07, + "loss": 0.5074, + "step": 7377 + }, + { + "epoch": 3.647138796193301, + "grad_norm": 0.13239391965853176, + "learning_rate": 3.797402898547631e-07, + "loss": 0.5269, + "step": 7378 + }, + { + "epoch": 3.6476331726609814, + "grad_norm": 0.13393629704050083, + "learning_rate": 3.7867815031713105e-07, + "loss": 0.5479, + "step": 7379 + }, + { + "epoch": 3.6481275491286613, + "grad_norm": 0.13328521269683055, + "learning_rate": 3.776174696028434e-07, + "loss": 0.5258, + "step": 7380 + }, + { + "epoch": 3.6486219255963417, + "grad_norm": 0.13579131422427831, + "learning_rate": 3.765582478727203e-07, + "loss": 0.5436, + "step": 7381 + }, + { + "epoch": 3.6491163020640216, + "grad_norm": 0.1391584889450463, + "learning_rate": 3.7550048528736673e-07, + "loss": 0.5171, + "step": 7382 + }, + { + "epoch": 3.649610678531702, + "grad_norm": 0.13115203543374399, + "learning_rate": 3.7444418200716647e-07, + "loss": 0.4987, + "step": 7383 + }, + { + "epoch": 3.650105054999382, + "grad_norm": 0.1347828189101396, + "learning_rate": 3.733893381922793e-07, + "loss": 0.4983, + "step": 7384 + }, + { + "epoch": 3.6505994314670622, + "grad_norm": 0.1339113814214426, + "learning_rate": 3.723359540026439e-07, + "loss": 0.5726, + "step": 7385 + }, + { + "epoch": 3.651093807934742, + "grad_norm": 0.13252373889046143, + "learning_rate": 3.7128402959797914e-07, + "loss": 0.4954, + "step": 7386 + }, + { + "epoch": 3.6515881844024225, + "grad_norm": 0.12794884309730026, + "learning_rate": 3.7023356513778416e-07, + "loss": 0.4932, + "step": 7387 + }, + { + "epoch": 3.6520825608701024, + "grad_norm": 0.13173798134370795, + "learning_rate": 3.6918456078133156e-07, + "loss": 0.5216, + "step": 7388 + }, + { + "epoch": 3.6525769373377828, + "grad_norm": 0.13294711557751393, + "learning_rate": 3.6813701668767745e-07, + "loss": 0.5376, + "step": 7389 + }, + { + "epoch": 3.653071313805463, + "grad_norm": 0.14115477341136734, + "learning_rate": 3.670909330156547e-07, + "loss": 0.5288, + "step": 7390 + }, + { + "epoch": 3.653565690273143, + "grad_norm": 0.1348097090651635, + "learning_rate": 3.6604630992387316e-07, + "loss": 0.5353, + "step": 7391 + }, + { + "epoch": 3.654060066740823, + "grad_norm": 0.13342328272212503, + "learning_rate": 3.6500314757072273e-07, + "loss": 0.5177, + "step": 7392 + }, + { + "epoch": 3.6545544432085033, + "grad_norm": 0.13559134072225934, + "learning_rate": 3.639614461143748e-07, + "loss": 0.5338, + "step": 7393 + }, + { + "epoch": 3.6550488196761837, + "grad_norm": 0.1402030278875663, + "learning_rate": 3.629212057127729e-07, + "loss": 0.5414, + "step": 7394 + }, + { + "epoch": 3.6555431961438636, + "grad_norm": 0.1320104493575387, + "learning_rate": 3.6188242652364426e-07, + "loss": 0.5672, + "step": 7395 + }, + { + "epoch": 3.6560375726115435, + "grad_norm": 0.13711309045827388, + "learning_rate": 3.6084510870449175e-07, + "loss": 0.503, + "step": 7396 + }, + { + "epoch": 3.656531949079224, + "grad_norm": 0.13120970279812702, + "learning_rate": 3.5980925241259846e-07, + "loss": 0.5265, + "step": 7397 + }, + { + "epoch": 3.657026325546904, + "grad_norm": 0.13104702243418093, + "learning_rate": 3.587748578050254e-07, + "loss": 0.5385, + "step": 7398 + }, + { + "epoch": 3.657520702014584, + "grad_norm": 0.13719105082655686, + "learning_rate": 3.577419250386105e-07, + "loss": 0.5193, + "step": 7399 + }, + { + "epoch": 3.658015078482264, + "grad_norm": 0.1298806679260725, + "learning_rate": 3.567104542699729e-07, + "loss": 0.5212, + "step": 7400 + }, + { + "epoch": 3.6585094549499444, + "grad_norm": 0.12914732780873467, + "learning_rate": 3.556804456555074e-07, + "loss": 0.5496, + "step": 7401 + }, + { + "epoch": 3.6590038314176248, + "grad_norm": 0.13337394201229053, + "learning_rate": 3.5465189935138587e-07, + "loss": 0.5309, + "step": 7402 + }, + { + "epoch": 3.6594982078853047, + "grad_norm": 0.1346856470233224, + "learning_rate": 3.5362481551356355e-07, + "loss": 0.5621, + "step": 7403 + }, + { + "epoch": 3.6599925843529846, + "grad_norm": 0.12989097189100027, + "learning_rate": 3.525991942977702e-07, + "loss": 0.4722, + "step": 7404 + }, + { + "epoch": 3.660486960820665, + "grad_norm": 0.13500219363899518, + "learning_rate": 3.515750358595138e-07, + "loss": 0.5186, + "step": 7405 + }, + { + "epoch": 3.6609813372883453, + "grad_norm": 0.13097970908705941, + "learning_rate": 3.5055234035408226e-07, + "loss": 0.4852, + "step": 7406 + }, + { + "epoch": 3.661475713756025, + "grad_norm": 0.13327571574513425, + "learning_rate": 3.4953110793654154e-07, + "loss": 0.5308, + "step": 7407 + }, + { + "epoch": 3.661970090223705, + "grad_norm": 0.13367874176905223, + "learning_rate": 3.4851133876173114e-07, + "loss": 0.5581, + "step": 7408 + }, + { + "epoch": 3.6624644666913855, + "grad_norm": 0.13545695688066994, + "learning_rate": 3.474930329842774e-07, + "loss": 0.5227, + "step": 7409 + }, + { + "epoch": 3.662958843159066, + "grad_norm": 0.13426500569858166, + "learning_rate": 3.4647619075857784e-07, + "loss": 0.5164, + "step": 7410 + }, + { + "epoch": 3.6634532196267457, + "grad_norm": 0.13760050690023354, + "learning_rate": 3.4546081223880924e-07, + "loss": 0.5391, + "step": 7411 + }, + { + "epoch": 3.6639475960944257, + "grad_norm": 0.13308775969504913, + "learning_rate": 3.4444689757892945e-07, + "loss": 0.5088, + "step": 7412 + }, + { + "epoch": 3.664441972562106, + "grad_norm": 0.13648038127635984, + "learning_rate": 3.4343444693267e-07, + "loss": 0.5254, + "step": 7413 + }, + { + "epoch": 3.6649363490297864, + "grad_norm": 0.13358664796983208, + "learning_rate": 3.424234604535437e-07, + "loss": 0.5072, + "step": 7414 + }, + { + "epoch": 3.6654307254974663, + "grad_norm": 0.13335306077680734, + "learning_rate": 3.4141393829484113e-07, + "loss": 0.5458, + "step": 7415 + }, + { + "epoch": 3.665925101965146, + "grad_norm": 0.13202275511836195, + "learning_rate": 3.4040588060963e-07, + "loss": 0.5068, + "step": 7416 + }, + { + "epoch": 3.6664194784328266, + "grad_norm": 0.13563939990796364, + "learning_rate": 3.393992875507546e-07, + "loss": 0.5294, + "step": 7417 + }, + { + "epoch": 3.666913854900507, + "grad_norm": 0.13719432930323572, + "learning_rate": 3.3839415927084064e-07, + "loss": 0.5254, + "step": 7418 + }, + { + "epoch": 3.667408231368187, + "grad_norm": 0.13744611238805074, + "learning_rate": 3.3739049592228624e-07, + "loss": 0.5673, + "step": 7419 + }, + { + "epoch": 3.6679026078358667, + "grad_norm": 0.13476038291923828, + "learning_rate": 3.363882976572741e-07, + "loss": 0.5279, + "step": 7420 + }, + { + "epoch": 3.668396984303547, + "grad_norm": 0.13453441854733247, + "learning_rate": 3.3538756462776157e-07, + "loss": 0.5665, + "step": 7421 + }, + { + "epoch": 3.6688913607712275, + "grad_norm": 0.13730172583491831, + "learning_rate": 3.343882969854817e-07, + "loss": 0.5382, + "step": 7422 + }, + { + "epoch": 3.6693857372389074, + "grad_norm": 0.13222815753898803, + "learning_rate": 3.3339049488194996e-07, + "loss": 0.5306, + "step": 7423 + }, + { + "epoch": 3.6698801137065877, + "grad_norm": 0.14036973821114646, + "learning_rate": 3.3239415846845424e-07, + "loss": 0.5323, + "step": 7424 + }, + { + "epoch": 3.6703744901742676, + "grad_norm": 0.13295564208419483, + "learning_rate": 3.313992878960637e-07, + "loss": 0.5217, + "step": 7425 + }, + { + "epoch": 3.670868866641948, + "grad_norm": 0.12741551836515244, + "learning_rate": 3.304058833156276e-07, + "loss": 0.502, + "step": 7426 + }, + { + "epoch": 3.671363243109628, + "grad_norm": 0.13187952406158823, + "learning_rate": 3.294139448777678e-07, + "loss": 0.5059, + "step": 7427 + }, + { + "epoch": 3.6718576195773083, + "grad_norm": 0.12872875270283113, + "learning_rate": 3.284234727328839e-07, + "loss": 0.5394, + "step": 7428 + }, + { + "epoch": 3.672351996044988, + "grad_norm": 0.13709925714708612, + "learning_rate": 3.2743446703115913e-07, + "loss": 0.5343, + "step": 7429 + }, + { + "epoch": 3.6728463725126685, + "grad_norm": 0.13397369819530736, + "learning_rate": 3.264469279225468e-07, + "loss": 0.5039, + "step": 7430 + }, + { + "epoch": 3.6733407489803485, + "grad_norm": 0.13253158752564176, + "learning_rate": 3.2546085555678396e-07, + "loss": 0.5098, + "step": 7431 + }, + { + "epoch": 3.673835125448029, + "grad_norm": 0.13432689653199134, + "learning_rate": 3.2447625008338315e-07, + "loss": 0.5624, + "step": 7432 + }, + { + "epoch": 3.6743295019157087, + "grad_norm": 0.13425398983036924, + "learning_rate": 3.234931116516338e-07, + "loss": 0.5404, + "step": 7433 + }, + { + "epoch": 3.674823878383389, + "grad_norm": 0.13597897567437342, + "learning_rate": 3.225114404106011e-07, + "loss": 0.5514, + "step": 7434 + }, + { + "epoch": 3.675318254851069, + "grad_norm": 0.13217823040752721, + "learning_rate": 3.2153123650913163e-07, + "loss": 0.4828, + "step": 7435 + }, + { + "epoch": 3.6758126313187494, + "grad_norm": 0.1338075858805545, + "learning_rate": 3.2055250009584984e-07, + "loss": 0.5614, + "step": 7436 + }, + { + "epoch": 3.6763070077864293, + "grad_norm": 0.1334419899125147, + "learning_rate": 3.1957523131915134e-07, + "loss": 0.5462, + "step": 7437 + }, + { + "epoch": 3.6768013842541096, + "grad_norm": 0.13611801695105452, + "learning_rate": 3.185994303272177e-07, + "loss": 0.4932, + "step": 7438 + }, + { + "epoch": 3.6772957607217895, + "grad_norm": 0.1326763509889496, + "learning_rate": 3.176250972680006e-07, + "loss": 0.5053, + "step": 7439 + }, + { + "epoch": 3.67779013718947, + "grad_norm": 0.13021214018797378, + "learning_rate": 3.1665223228923514e-07, + "loss": 0.4974, + "step": 7440 + }, + { + "epoch": 3.67828451365715, + "grad_norm": 0.1339627302065828, + "learning_rate": 3.1568083553842887e-07, + "loss": 0.5165, + "step": 7441 + }, + { + "epoch": 3.67877889012483, + "grad_norm": 0.1301216564755629, + "learning_rate": 3.1471090716286956e-07, + "loss": 0.5278, + "step": 7442 + }, + { + "epoch": 3.67927326659251, + "grad_norm": 0.141331254712588, + "learning_rate": 3.1374244730962065e-07, + "loss": 0.5472, + "step": 7443 + }, + { + "epoch": 3.6797676430601904, + "grad_norm": 0.13203761376095735, + "learning_rate": 3.1277545612552695e-07, + "loss": 0.5357, + "step": 7444 + }, + { + "epoch": 3.6802620195278704, + "grad_norm": 0.1370906177749237, + "learning_rate": 3.1180993375720336e-07, + "loss": 0.4923, + "step": 7445 + }, + { + "epoch": 3.6807563959955507, + "grad_norm": 0.1277067002373486, + "learning_rate": 3.1084588035104835e-07, + "loss": 0.5249, + "step": 7446 + }, + { + "epoch": 3.6812507724632306, + "grad_norm": 0.13216552264155126, + "learning_rate": 3.098832960532372e-07, + "loss": 0.5257, + "step": 7447 + }, + { + "epoch": 3.681745148930911, + "grad_norm": 0.13415452430905656, + "learning_rate": 3.089221810097176e-07, + "loss": 0.5477, + "step": 7448 + }, + { + "epoch": 3.682239525398591, + "grad_norm": 0.13049833176609166, + "learning_rate": 3.079625353662208e-07, + "loss": 0.5055, + "step": 7449 + }, + { + "epoch": 3.6827339018662713, + "grad_norm": 0.13234441318971263, + "learning_rate": 3.070043592682503e-07, + "loss": 0.5125, + "step": 7450 + }, + { + "epoch": 3.683228278333951, + "grad_norm": 0.1333191882679837, + "learning_rate": 3.0604765286108673e-07, + "loss": 0.5196, + "step": 7451 + }, + { + "epoch": 3.6837226548016315, + "grad_norm": 0.13590178819230597, + "learning_rate": 3.050924162897928e-07, + "loss": 0.5193, + "step": 7452 + }, + { + "epoch": 3.6842170312693114, + "grad_norm": 0.13397743614953223, + "learning_rate": 3.0413864969920605e-07, + "loss": 0.5271, + "step": 7453 + }, + { + "epoch": 3.684711407736992, + "grad_norm": 0.13364631605311447, + "learning_rate": 3.031863532339363e-07, + "loss": 0.5265, + "step": 7454 + }, + { + "epoch": 3.6852057842046717, + "grad_norm": 0.13365799250690877, + "learning_rate": 3.022355270383781e-07, + "loss": 0.5726, + "step": 7455 + }, + { + "epoch": 3.685700160672352, + "grad_norm": 0.13442076509021336, + "learning_rate": 3.012861712566995e-07, + "loss": 0.5008, + "step": 7456 + }, + { + "epoch": 3.686194537140032, + "grad_norm": 0.13359502855596012, + "learning_rate": 3.0033828603284077e-07, + "loss": 0.5127, + "step": 7457 + }, + { + "epoch": 3.6866889136077123, + "grad_norm": 0.12990134558980354, + "learning_rate": 2.9939187151053153e-07, + "loss": 0.5197, + "step": 7458 + }, + { + "epoch": 3.6871832900753923, + "grad_norm": 0.13392930705194528, + "learning_rate": 2.984469278332658e-07, + "loss": 0.5226, + "step": 7459 + }, + { + "epoch": 3.6876776665430726, + "grad_norm": 0.15774621955646467, + "learning_rate": 2.975034551443201e-07, + "loss": 0.5615, + "step": 7460 + }, + { + "epoch": 3.688172043010753, + "grad_norm": 0.13983683070279054, + "learning_rate": 2.9656145358675e-07, + "loss": 0.5782, + "step": 7461 + }, + { + "epoch": 3.688666419478433, + "grad_norm": 0.13266209116076544, + "learning_rate": 2.9562092330338355e-07, + "loss": 0.525, + "step": 7462 + }, + { + "epoch": 3.689160795946113, + "grad_norm": 0.14223447224788616, + "learning_rate": 2.946818644368277e-07, + "loss": 0.5442, + "step": 7463 + }, + { + "epoch": 3.689655172413793, + "grad_norm": 0.13024743593042057, + "learning_rate": 2.937442771294674e-07, + "loss": 0.5178, + "step": 7464 + }, + { + "epoch": 3.6901495488814735, + "grad_norm": 0.13374132443968853, + "learning_rate": 2.928081615234635e-07, + "loss": 0.5775, + "step": 7465 + }, + { + "epoch": 3.6906439253491534, + "grad_norm": 0.134240611972253, + "learning_rate": 2.9187351776075235e-07, + "loss": 0.523, + "step": 7466 + }, + { + "epoch": 3.6911383018168333, + "grad_norm": 0.13428221433561605, + "learning_rate": 2.9094034598304957e-07, + "loss": 0.5284, + "step": 7467 + }, + { + "epoch": 3.6916326782845137, + "grad_norm": 0.13491181387848014, + "learning_rate": 2.900086463318441e-07, + "loss": 0.5319, + "step": 7468 + }, + { + "epoch": 3.692127054752194, + "grad_norm": 0.13624025950832366, + "learning_rate": 2.890784189484064e-07, + "loss": 0.5388, + "step": 7469 + }, + { + "epoch": 3.692621431219874, + "grad_norm": 0.14048772407185217, + "learning_rate": 2.881496639737813e-07, + "loss": 0.53, + "step": 7470 + }, + { + "epoch": 3.693115807687554, + "grad_norm": 0.12860783786050806, + "learning_rate": 2.8722238154878847e-07, + "loss": 0.5008, + "step": 7471 + }, + { + "epoch": 3.6936101841552342, + "grad_norm": 0.13556064838694432, + "learning_rate": 2.862965718140287e-07, + "loss": 0.5155, + "step": 7472 + }, + { + "epoch": 3.6941045606229146, + "grad_norm": 0.13100592811902873, + "learning_rate": 2.853722349098753e-07, + "loss": 0.528, + "step": 7473 + }, + { + "epoch": 3.6945989370905945, + "grad_norm": 0.13385460299653232, + "learning_rate": 2.844493709764784e-07, + "loss": 0.5189, + "step": 7474 + }, + { + "epoch": 3.6950933135582744, + "grad_norm": 0.13459263304087346, + "learning_rate": 2.835279801537705e-07, + "loss": 0.5439, + "step": 7475 + }, + { + "epoch": 3.6955876900259548, + "grad_norm": 0.13498558727509835, + "learning_rate": 2.826080625814542e-07, + "loss": 0.5392, + "step": 7476 + }, + { + "epoch": 3.696082066493635, + "grad_norm": 0.13405527216171587, + "learning_rate": 2.8168961839900924e-07, + "loss": 0.5266, + "step": 7477 + }, + { + "epoch": 3.696576442961315, + "grad_norm": 0.13024743591582083, + "learning_rate": 2.807726477456973e-07, + "loss": 0.5285, + "step": 7478 + }, + { + "epoch": 3.697070819428995, + "grad_norm": 0.13328180422119673, + "learning_rate": 2.7985715076054967e-07, + "loss": 0.5397, + "step": 7479 + }, + { + "epoch": 3.6975651958966753, + "grad_norm": 0.1366214101099874, + "learning_rate": 2.789431275823806e-07, + "loss": 0.593, + "step": 7480 + }, + { + "epoch": 3.6980595723643557, + "grad_norm": 0.1345253811790997, + "learning_rate": 2.780305783497772e-07, + "loss": 0.5416, + "step": 7481 + }, + { + "epoch": 3.6985539488320356, + "grad_norm": 0.13544783019298157, + "learning_rate": 2.771195032011031e-07, + "loss": 0.5241, + "step": 7482 + }, + { + "epoch": 3.6990483252997155, + "grad_norm": 0.13385371600491058, + "learning_rate": 2.7620990227449905e-07, + "loss": 0.5365, + "step": 7483 + }, + { + "epoch": 3.699542701767396, + "grad_norm": 0.1332120502196185, + "learning_rate": 2.753017757078835e-07, + "loss": 0.4836, + "step": 7484 + }, + { + "epoch": 3.700037078235076, + "grad_norm": 0.13818156217242086, + "learning_rate": 2.7439512363894973e-07, + "loss": 0.5531, + "step": 7485 + }, + { + "epoch": 3.700531454702756, + "grad_norm": 0.13388593251927985, + "learning_rate": 2.7348994620516764e-07, + "loss": 0.5143, + "step": 7486 + }, + { + "epoch": 3.701025831170436, + "grad_norm": 0.13487384981749317, + "learning_rate": 2.7258624354378426e-07, + "loss": 0.4831, + "step": 7487 + }, + { + "epoch": 3.7015202076381164, + "grad_norm": 0.14442856979461785, + "learning_rate": 2.71684015791821e-07, + "loss": 0.5715, + "step": 7488 + }, + { + "epoch": 3.7020145841057968, + "grad_norm": 0.1330975317856325, + "learning_rate": 2.707832630860807e-07, + "loss": 0.5026, + "step": 7489 + }, + { + "epoch": 3.7025089605734767, + "grad_norm": 0.1298061403082928, + "learning_rate": 2.698839855631352e-07, + "loss": 0.5117, + "step": 7490 + }, + { + "epoch": 3.7030033370411566, + "grad_norm": 0.12850744095216537, + "learning_rate": 2.689861833593399e-07, + "loss": 0.5491, + "step": 7491 + }, + { + "epoch": 3.703497713508837, + "grad_norm": 0.13030113412759353, + "learning_rate": 2.680898566108203e-07, + "loss": 0.5193, + "step": 7492 + }, + { + "epoch": 3.7039920899765173, + "grad_norm": 0.13142170193223515, + "learning_rate": 2.6719500545348444e-07, + "loss": 0.5084, + "step": 7493 + }, + { + "epoch": 3.704486466444197, + "grad_norm": 0.13101169575856617, + "learning_rate": 2.6630163002300926e-07, + "loss": 0.5333, + "step": 7494 + }, + { + "epoch": 3.704980842911877, + "grad_norm": 0.13065209867471025, + "learning_rate": 2.6540973045485417e-07, + "loss": 0.5297, + "step": 7495 + }, + { + "epoch": 3.7054752193795575, + "grad_norm": 0.1331635846976728, + "learning_rate": 2.6451930688425333e-07, + "loss": 0.5426, + "step": 7496 + }, + { + "epoch": 3.705969595847238, + "grad_norm": 0.14301496888422796, + "learning_rate": 2.6363035944621306e-07, + "loss": 0.5608, + "step": 7497 + }, + { + "epoch": 3.7064639723149178, + "grad_norm": 0.13764112946498616, + "learning_rate": 2.6274288827552344e-07, + "loss": 0.5426, + "step": 7498 + }, + { + "epoch": 3.706958348782598, + "grad_norm": 0.13112578213905565, + "learning_rate": 2.618568935067445e-07, + "loss": 0.524, + "step": 7499 + }, + { + "epoch": 3.707452725250278, + "grad_norm": 0.13482843735481165, + "learning_rate": 2.6097237527421217e-07, + "loss": 0.518, + "step": 7500 + }, + { + "epoch": 3.7079471017179584, + "grad_norm": 0.1326455916475558, + "learning_rate": 2.600893337120436e-07, + "loss": 0.5187, + "step": 7501 + }, + { + "epoch": 3.7084414781856383, + "grad_norm": 0.13274767605731919, + "learning_rate": 2.5920776895412836e-07, + "loss": 0.5259, + "step": 7502 + }, + { + "epoch": 3.7089358546533187, + "grad_norm": 0.13296399539358253, + "learning_rate": 2.5832768113413176e-07, + "loss": 0.5069, + "step": 7503 + }, + { + "epoch": 3.7094302311209986, + "grad_norm": 0.13135292500213508, + "learning_rate": 2.574490703854982e-07, + "loss": 0.511, + "step": 7504 + }, + { + "epoch": 3.709924607588679, + "grad_norm": 0.13198476939594497, + "learning_rate": 2.5657193684144434e-07, + "loss": 0.5238, + "step": 7505 + }, + { + "epoch": 3.710418984056359, + "grad_norm": 0.1322810877558683, + "learning_rate": 2.556962806349639e-07, + "loss": 0.5598, + "step": 7506 + }, + { + "epoch": 3.710913360524039, + "grad_norm": 0.13846622179231294, + "learning_rate": 2.548221018988306e-07, + "loss": 0.5468, + "step": 7507 + }, + { + "epoch": 3.711407736991719, + "grad_norm": 0.1347590487104857, + "learning_rate": 2.5394940076558847e-07, + "loss": 0.5271, + "step": 7508 + }, + { + "epoch": 3.7119021134593995, + "grad_norm": 0.1384469339332765, + "learning_rate": 2.5307817736756057e-07, + "loss": 0.5456, + "step": 7509 + }, + { + "epoch": 3.7123964899270794, + "grad_norm": 0.13433745222845545, + "learning_rate": 2.522084318368456e-07, + "loss": 0.525, + "step": 7510 + }, + { + "epoch": 3.7128908663947597, + "grad_norm": 0.13646662798259065, + "learning_rate": 2.5134016430531703e-07, + "loss": 0.5244, + "step": 7511 + }, + { + "epoch": 3.7133852428624397, + "grad_norm": 0.13003888305610706, + "learning_rate": 2.504733749046251e-07, + "loss": 0.4909, + "step": 7512 + }, + { + "epoch": 3.71387961933012, + "grad_norm": 0.13559880263535515, + "learning_rate": 2.4960806376619793e-07, + "loss": 0.5035, + "step": 7513 + }, + { + "epoch": 3.7143739957978, + "grad_norm": 0.13283612376151147, + "learning_rate": 2.4874423102123404e-07, + "loss": 0.5615, + "step": 7514 + }, + { + "epoch": 3.7148683722654803, + "grad_norm": 0.1330353290077898, + "learning_rate": 2.4788187680071517e-07, + "loss": 0.5144, + "step": 7515 + }, + { + "epoch": 3.71536274873316, + "grad_norm": 0.1374986128799882, + "learning_rate": 2.470210012353924e-07, + "loss": 0.5181, + "step": 7516 + }, + { + "epoch": 3.7158571252008405, + "grad_norm": 0.12988986155521928, + "learning_rate": 2.4616160445579465e-07, + "loss": 0.5448, + "step": 7517 + }, + { + "epoch": 3.7163515016685205, + "grad_norm": 0.13595593716834692, + "learning_rate": 2.453036865922276e-07, + "loss": 0.5184, + "step": 7518 + }, + { + "epoch": 3.716845878136201, + "grad_norm": 0.13230889629206846, + "learning_rate": 2.444472477747739e-07, + "loss": 0.5001, + "step": 7519 + }, + { + "epoch": 3.7173402546038807, + "grad_norm": 0.133626114251798, + "learning_rate": 2.435922881332875e-07, + "loss": 0.5468, + "step": 7520 + }, + { + "epoch": 3.717834631071561, + "grad_norm": 0.13529922375139947, + "learning_rate": 2.4273880779740357e-07, + "loss": 0.5094, + "step": 7521 + }, + { + "epoch": 3.718329007539241, + "grad_norm": 0.131124250199387, + "learning_rate": 2.4188680689652854e-07, + "loss": 0.5555, + "step": 7522 + }, + { + "epoch": 3.7188233840069214, + "grad_norm": 0.1353782130153397, + "learning_rate": 2.410362855598447e-07, + "loss": 0.5071, + "step": 7523 + }, + { + "epoch": 3.7193177604746013, + "grad_norm": 0.13311020836438117, + "learning_rate": 2.401872439163155e-07, + "loss": 0.5197, + "step": 7524 + }, + { + "epoch": 3.7198121369422816, + "grad_norm": 0.1336174841001742, + "learning_rate": 2.393396820946736e-07, + "loss": 0.526, + "step": 7525 + }, + { + "epoch": 3.7203065134099615, + "grad_norm": 0.13884726331920016, + "learning_rate": 2.384936002234295e-07, + "loss": 0.5163, + "step": 7526 + }, + { + "epoch": 3.720800889877642, + "grad_norm": 0.1367557201317476, + "learning_rate": 2.3764899843087052e-07, + "loss": 0.5133, + "step": 7527 + }, + { + "epoch": 3.721295266345322, + "grad_norm": 0.12982370950391073, + "learning_rate": 2.3680587684505762e-07, + "loss": 0.5097, + "step": 7528 + }, + { + "epoch": 3.721789642813002, + "grad_norm": 0.13133075178419393, + "learning_rate": 2.3596423559382742e-07, + "loss": 0.5206, + "step": 7529 + }, + { + "epoch": 3.722284019280682, + "grad_norm": 0.13382225019577002, + "learning_rate": 2.3512407480479672e-07, + "loss": 0.521, + "step": 7530 + }, + { + "epoch": 3.7227783957483624, + "grad_norm": 0.13940947776889318, + "learning_rate": 2.3428539460535026e-07, + "loss": 0.5633, + "step": 7531 + }, + { + "epoch": 3.7232727722160424, + "grad_norm": 0.13573981771980118, + "learning_rate": 2.3344819512265305e-07, + "loss": 0.5163, + "step": 7532 + }, + { + "epoch": 3.7237671486837227, + "grad_norm": 0.1316670832478916, + "learning_rate": 2.326124764836457e-07, + "loss": 0.4957, + "step": 7533 + }, + { + "epoch": 3.7242615251514026, + "grad_norm": 0.13402898714022585, + "learning_rate": 2.3177823881504246e-07, + "loss": 0.5151, + "step": 7534 + }, + { + "epoch": 3.724755901619083, + "grad_norm": 0.13057938967444374, + "learning_rate": 2.3094548224333325e-07, + "loss": 0.5143, + "step": 7535 + }, + { + "epoch": 3.7252502780867633, + "grad_norm": 0.13801278238587783, + "learning_rate": 2.301142068947848e-07, + "loss": 0.5114, + "step": 7536 + }, + { + "epoch": 3.7257446545544433, + "grad_norm": 0.13758778402433622, + "learning_rate": 2.2928441289543745e-07, + "loss": 0.5204, + "step": 7537 + }, + { + "epoch": 3.726239031022123, + "grad_norm": 0.13901344784712316, + "learning_rate": 2.2845610037111055e-07, + "loss": 0.5384, + "step": 7538 + }, + { + "epoch": 3.7267334074898035, + "grad_norm": 0.13268420158893618, + "learning_rate": 2.276292694473925e-07, + "loss": 0.5478, + "step": 7539 + }, + { + "epoch": 3.727227783957484, + "grad_norm": 0.13765579673513678, + "learning_rate": 2.2680392024965303e-07, + "loss": 0.5353, + "step": 7540 + }, + { + "epoch": 3.727722160425164, + "grad_norm": 0.13284053085266265, + "learning_rate": 2.2598005290303315e-07, + "loss": 0.5208, + "step": 7541 + }, + { + "epoch": 3.7282165368928437, + "grad_norm": 0.13156294482466374, + "learning_rate": 2.2515766753245295e-07, + "loss": 0.5343, + "step": 7542 + }, + { + "epoch": 3.728710913360524, + "grad_norm": 0.13365961030355086, + "learning_rate": 2.2433676426260488e-07, + "loss": 0.5323, + "step": 7543 + }, + { + "epoch": 3.7292052898282044, + "grad_norm": 0.13029153123980716, + "learning_rate": 2.2351734321795826e-07, + "loss": 0.5164, + "step": 7544 + }, + { + "epoch": 3.7296996662958843, + "grad_norm": 0.1345642340432335, + "learning_rate": 2.2269940452275484e-07, + "loss": 0.5171, + "step": 7545 + }, + { + "epoch": 3.7301940427635643, + "grad_norm": 0.1353622714664854, + "learning_rate": 2.2188294830101542e-07, + "loss": 0.5686, + "step": 7546 + }, + { + "epoch": 3.7306884192312446, + "grad_norm": 0.13501386042488508, + "learning_rate": 2.2106797467653428e-07, + "loss": 0.5384, + "step": 7547 + }, + { + "epoch": 3.731182795698925, + "grad_norm": 0.13448816164997346, + "learning_rate": 2.202544837728815e-07, + "loss": 0.5319, + "step": 7548 + }, + { + "epoch": 3.731677172166605, + "grad_norm": 0.1321562512289715, + "learning_rate": 2.194424757134006e-07, + "loss": 0.4884, + "step": 7549 + }, + { + "epoch": 3.732171548634285, + "grad_norm": 0.128460980418577, + "learning_rate": 2.18631950621212e-07, + "loss": 0.4827, + "step": 7550 + }, + { + "epoch": 3.732665925101965, + "grad_norm": 0.1312097588409577, + "learning_rate": 2.1782290861921186e-07, + "loss": 0.5155, + "step": 7551 + }, + { + "epoch": 3.7331603015696455, + "grad_norm": 0.13512397344487512, + "learning_rate": 2.1701534983006755e-07, + "loss": 0.5034, + "step": 7552 + }, + { + "epoch": 3.7336546780373254, + "grad_norm": 0.13023107753175278, + "learning_rate": 2.1620927437622786e-07, + "loss": 0.5476, + "step": 7553 + }, + { + "epoch": 3.7341490545050053, + "grad_norm": 0.13699055921739034, + "learning_rate": 2.1540468237991164e-07, + "loss": 0.5289, + "step": 7554 + }, + { + "epoch": 3.7346434309726857, + "grad_norm": 0.13554163296358362, + "learning_rate": 2.1460157396311242e-07, + "loss": 0.5289, + "step": 7555 + }, + { + "epoch": 3.735137807440366, + "grad_norm": 0.13261181534317934, + "learning_rate": 2.1379994924760395e-07, + "loss": 0.518, + "step": 7556 + }, + { + "epoch": 3.735632183908046, + "grad_norm": 0.13285548629709665, + "learning_rate": 2.1299980835493005e-07, + "loss": 0.5289, + "step": 7557 + }, + { + "epoch": 3.736126560375726, + "grad_norm": 0.14078280294664486, + "learning_rate": 2.122011514064104e-07, + "loss": 0.5694, + "step": 7558 + }, + { + "epoch": 3.7366209368434062, + "grad_norm": 0.13603964077231304, + "learning_rate": 2.1140397852314365e-07, + "loss": 0.5499, + "step": 7559 + }, + { + "epoch": 3.7371153133110866, + "grad_norm": 0.13499288809933283, + "learning_rate": 2.1060828982599645e-07, + "loss": 0.533, + "step": 7560 + }, + { + "epoch": 3.7376096897787665, + "grad_norm": 0.1331240017798492, + "learning_rate": 2.0981408543561676e-07, + "loss": 0.5256, + "step": 7561 + }, + { + "epoch": 3.7381040662464464, + "grad_norm": 0.1346420778192306, + "learning_rate": 2.09021365472426e-07, + "loss": 0.5065, + "step": 7562 + }, + { + "epoch": 3.738598442714127, + "grad_norm": 0.13997960934503628, + "learning_rate": 2.0823013005661695e-07, + "loss": 0.5384, + "step": 7563 + }, + { + "epoch": 3.739092819181807, + "grad_norm": 0.12928746638046085, + "learning_rate": 2.0744037930816142e-07, + "loss": 0.518, + "step": 7564 + }, + { + "epoch": 3.739587195649487, + "grad_norm": 0.13314859256886186, + "learning_rate": 2.0665211334680356e-07, + "loss": 0.547, + "step": 7565 + }, + { + "epoch": 3.740081572117167, + "grad_norm": 0.13673705681928727, + "learning_rate": 2.0586533229206451e-07, + "loss": 0.5179, + "step": 7566 + }, + { + "epoch": 3.7405759485848473, + "grad_norm": 0.13851537711451628, + "learning_rate": 2.050800362632377e-07, + "loss": 0.5278, + "step": 7567 + }, + { + "epoch": 3.7410703250525277, + "grad_norm": 0.13234844496647866, + "learning_rate": 2.0429622537939565e-07, + "loss": 0.5414, + "step": 7568 + }, + { + "epoch": 3.7415647015202076, + "grad_norm": 0.14494373046534323, + "learning_rate": 2.0351389975937998e-07, + "loss": 0.5157, + "step": 7569 + }, + { + "epoch": 3.7420590779878875, + "grad_norm": 0.128246824400279, + "learning_rate": 2.0273305952181133e-07, + "loss": 0.4914, + "step": 7570 + }, + { + "epoch": 3.742553454455568, + "grad_norm": 0.13124435629608566, + "learning_rate": 2.0195370478508392e-07, + "loss": 0.5191, + "step": 7571 + }, + { + "epoch": 3.743047830923248, + "grad_norm": 0.12718130599017677, + "learning_rate": 2.0117583566736544e-07, + "loss": 0.5109, + "step": 7572 + }, + { + "epoch": 3.743542207390928, + "grad_norm": 0.13572932054265624, + "learning_rate": 2.003994522866015e-07, + "loss": 0.5624, + "step": 7573 + }, + { + "epoch": 3.7440365838586085, + "grad_norm": 0.13887303860485747, + "learning_rate": 1.9962455476050913e-07, + "loss": 0.54, + "step": 7574 + }, + { + "epoch": 3.7445309603262884, + "grad_norm": 0.13010075331121912, + "learning_rate": 1.9885114320658093e-07, + "loss": 0.4981, + "step": 7575 + }, + { + "epoch": 3.7450253367939688, + "grad_norm": 0.13399477797132764, + "learning_rate": 1.980792177420865e-07, + "loss": 0.5624, + "step": 7576 + }, + { + "epoch": 3.7455197132616487, + "grad_norm": 0.13530925693087884, + "learning_rate": 1.973087784840666e-07, + "loss": 0.5007, + "step": 7577 + }, + { + "epoch": 3.746014089729329, + "grad_norm": 0.133011979255323, + "learning_rate": 1.965398255493378e-07, + "loss": 0.5474, + "step": 7578 + }, + { + "epoch": 3.746508466197009, + "grad_norm": 0.13324939210029565, + "learning_rate": 1.9577235905449465e-07, + "loss": 0.5061, + "step": 7579 + }, + { + "epoch": 3.7470028426646893, + "grad_norm": 0.13265568989604465, + "learning_rate": 1.950063791159018e-07, + "loss": 0.5211, + "step": 7580 + }, + { + "epoch": 3.747497219132369, + "grad_norm": 0.13294554908192865, + "learning_rate": 1.9424188584969862e-07, + "loss": 0.5039, + "step": 7581 + }, + { + "epoch": 3.7479915956000496, + "grad_norm": 0.13607560284007955, + "learning_rate": 1.9347887937180344e-07, + "loss": 0.4935, + "step": 7582 + }, + { + "epoch": 3.7484859720677295, + "grad_norm": 0.1351572147177695, + "learning_rate": 1.9271735979790485e-07, + "loss": 0.5274, + "step": 7583 + }, + { + "epoch": 3.74898034853541, + "grad_norm": 0.1286834110304449, + "learning_rate": 1.9195732724346604e-07, + "loss": 0.5159, + "step": 7584 + }, + { + "epoch": 3.7494747250030898, + "grad_norm": 0.1335621810382087, + "learning_rate": 1.9119878182373043e-07, + "loss": 0.5254, + "step": 7585 + }, + { + "epoch": 3.74996910147077, + "grad_norm": 0.13601706568522204, + "learning_rate": 1.9044172365370705e-07, + "loss": 0.5617, + "step": 7586 + }, + { + "epoch": 3.75046347793845, + "grad_norm": 0.13859997342161545, + "learning_rate": 1.8968615284818747e-07, + "loss": 0.5498, + "step": 7587 + }, + { + "epoch": 3.7509578544061304, + "grad_norm": 0.13524777965726564, + "learning_rate": 1.8893206952173338e-07, + "loss": 0.4966, + "step": 7588 + }, + { + "epoch": 3.7514522308738103, + "grad_norm": 0.12988911710394838, + "learning_rate": 1.8817947378867995e-07, + "loss": 0.5065, + "step": 7589 + }, + { + "epoch": 3.7519466073414907, + "grad_norm": 0.13404997882970762, + "learning_rate": 1.874283657631426e-07, + "loss": 0.5609, + "step": 7590 + }, + { + "epoch": 3.7519466073414907, + "eval_loss": 0.641017496585846, + "eval_runtime": 81.7594, + "eval_samples_per_second": 371.26, + "eval_steps_per_second": 46.417, + "step": 7590 + }, + { + "epoch": 3.7524409838091706, + "grad_norm": 0.13380659156102548, + "learning_rate": 1.8667874555900355e-07, + "loss": 0.5113, + "step": 7591 + }, + { + "epoch": 3.752935360276851, + "grad_norm": 0.13153587342491757, + "learning_rate": 1.8593061328992524e-07, + "loss": 0.5293, + "step": 7592 + }, + { + "epoch": 3.753429736744531, + "grad_norm": 0.13152312750045536, + "learning_rate": 1.8518396906934245e-07, + "loss": 0.5206, + "step": 7593 + }, + { + "epoch": 3.753924113212211, + "grad_norm": 0.13410461815928418, + "learning_rate": 1.8443881301046352e-07, + "loss": 0.5452, + "step": 7594 + }, + { + "epoch": 3.754418489679891, + "grad_norm": 0.13465094846051662, + "learning_rate": 1.8369514522627252e-07, + "loss": 0.5284, + "step": 7595 + }, + { + "epoch": 3.7549128661475715, + "grad_norm": 0.13123671613892296, + "learning_rate": 1.8295296582952704e-07, + "loss": 0.5361, + "step": 7596 + }, + { + "epoch": 3.7554072426152514, + "grad_norm": 0.137233408902342, + "learning_rate": 1.822122749327604e-07, + "loss": 0.5384, + "step": 7597 + }, + { + "epoch": 3.7559016190829317, + "grad_norm": 0.13421434195131848, + "learning_rate": 1.814730726482772e-07, + "loss": 0.5328, + "step": 7598 + }, + { + "epoch": 3.7563959955506117, + "grad_norm": 0.13079312860755646, + "learning_rate": 1.8073535908815997e-07, + "loss": 0.4812, + "step": 7599 + }, + { + "epoch": 3.756890372018292, + "grad_norm": 0.13250873507430036, + "learning_rate": 1.7999913436426375e-07, + "loss": 0.5351, + "step": 7600 + }, + { + "epoch": 3.757384748485972, + "grad_norm": 0.1531084609829419, + "learning_rate": 1.7926439858821586e-07, + "loss": 0.5415, + "step": 7601 + }, + { + "epoch": 3.7578791249536523, + "grad_norm": 0.13380010092399214, + "learning_rate": 1.7853115187142166e-07, + "loss": 0.5203, + "step": 7602 + }, + { + "epoch": 3.758373501421332, + "grad_norm": 0.13328825092694915, + "learning_rate": 1.7779939432506e-07, + "loss": 0.5362, + "step": 7603 + }, + { + "epoch": 3.7588678778890126, + "grad_norm": 0.13133707070463516, + "learning_rate": 1.7706912606007988e-07, + "loss": 0.5299, + "step": 7604 + }, + { + "epoch": 3.7593622543566925, + "grad_norm": 0.13629631146314836, + "learning_rate": 1.7634034718720827e-07, + "loss": 0.5202, + "step": 7605 + }, + { + "epoch": 3.759856630824373, + "grad_norm": 0.14223391709951444, + "learning_rate": 1.7561305781694792e-07, + "loss": 0.5447, + "step": 7606 + }, + { + "epoch": 3.7603510072920527, + "grad_norm": 0.13061268194519246, + "learning_rate": 1.7488725805957175e-07, + "loss": 0.5102, + "step": 7607 + }, + { + "epoch": 3.760845383759733, + "grad_norm": 0.13375218506650868, + "learning_rate": 1.7416294802512834e-07, + "loss": 0.5227, + "step": 7608 + }, + { + "epoch": 3.761339760227413, + "grad_norm": 0.13478920657411458, + "learning_rate": 1.7344012782343988e-07, + "loss": 0.5131, + "step": 7609 + }, + { + "epoch": 3.7618341366950934, + "grad_norm": 0.1313255086521153, + "learning_rate": 1.7271879756410425e-07, + "loss": 0.5185, + "step": 7610 + }, + { + "epoch": 3.7623285131627737, + "grad_norm": 0.13037255778160325, + "learning_rate": 1.7199895735649174e-07, + "loss": 0.5206, + "step": 7611 + }, + { + "epoch": 3.7628228896304536, + "grad_norm": 0.13443244211621325, + "learning_rate": 1.7128060730974837e-07, + "loss": 0.5661, + "step": 7612 + }, + { + "epoch": 3.7633172660981336, + "grad_norm": 0.1339530214783434, + "learning_rate": 1.705637475327937e-07, + "loss": 0.5082, + "step": 7613 + }, + { + "epoch": 3.763811642565814, + "grad_norm": 0.13759059049805342, + "learning_rate": 1.6984837813431854e-07, + "loss": 0.5211, + "step": 7614 + }, + { + "epoch": 3.7643060190334943, + "grad_norm": 0.13334391833078432, + "learning_rate": 1.6913449922279168e-07, + "loss": 0.5154, + "step": 7615 + }, + { + "epoch": 3.764800395501174, + "grad_norm": 0.13501271631528544, + "learning_rate": 1.6842211090645432e-07, + "loss": 0.5404, + "step": 7616 + }, + { + "epoch": 3.765294771968854, + "grad_norm": 0.14522963009579337, + "learning_rate": 1.6771121329332117e-07, + "loss": 0.535, + "step": 7617 + }, + { + "epoch": 3.7657891484365345, + "grad_norm": 0.13194873126555182, + "learning_rate": 1.6700180649118047e-07, + "loss": 0.5124, + "step": 7618 + }, + { + "epoch": 3.766283524904215, + "grad_norm": 0.1322566762689065, + "learning_rate": 1.6629389060759838e-07, + "loss": 0.502, + "step": 7619 + }, + { + "epoch": 3.7667779013718947, + "grad_norm": 0.1334172014643419, + "learning_rate": 1.6558746574990903e-07, + "loss": 0.5227, + "step": 7620 + }, + { + "epoch": 3.7672722778395746, + "grad_norm": 0.13441609399244353, + "learning_rate": 1.6488253202522343e-07, + "loss": 0.5067, + "step": 7621 + }, + { + "epoch": 3.767766654307255, + "grad_norm": 0.13534284137441566, + "learning_rate": 1.6417908954042826e-07, + "loss": 0.5398, + "step": 7622 + }, + { + "epoch": 3.7682610307749353, + "grad_norm": 0.13506630236432882, + "learning_rate": 1.634771384021816e-07, + "loss": 0.519, + "step": 7623 + }, + { + "epoch": 3.7687554072426153, + "grad_norm": 0.1310683237062051, + "learning_rate": 1.6277667871691495e-07, + "loss": 0.5118, + "step": 7624 + }, + { + "epoch": 3.769249783710295, + "grad_norm": 0.13089172201503746, + "learning_rate": 1.6207771059083665e-07, + "loss": 0.5197, + "step": 7625 + }, + { + "epoch": 3.7697441601779755, + "grad_norm": 0.13248520657083693, + "learning_rate": 1.613802341299253e-07, + "loss": 0.5006, + "step": 7626 + }, + { + "epoch": 3.770238536645656, + "grad_norm": 0.1288170923670464, + "learning_rate": 1.6068424943993633e-07, + "loss": 0.5485, + "step": 7627 + }, + { + "epoch": 3.770732913113336, + "grad_norm": 0.12840071340711479, + "learning_rate": 1.599897566263975e-07, + "loss": 0.5113, + "step": 7628 + }, + { + "epoch": 3.7712272895810157, + "grad_norm": 0.13254478485140564, + "learning_rate": 1.5929675579461012e-07, + "loss": 0.5192, + "step": 7629 + }, + { + "epoch": 3.771721666048696, + "grad_norm": 0.13149217356317255, + "learning_rate": 1.586052470496502e-07, + "loss": 0.5043, + "step": 7630 + }, + { + "epoch": 3.7722160425163764, + "grad_norm": 0.1331115191235143, + "learning_rate": 1.5791523049636714e-07, + "loss": 0.5292, + "step": 7631 + }, + { + "epoch": 3.7727104189840563, + "grad_norm": 0.13041389781027032, + "learning_rate": 1.5722670623938284e-07, + "loss": 0.5042, + "step": 7632 + }, + { + "epoch": 3.7732047954517363, + "grad_norm": 0.13179530890839697, + "learning_rate": 1.5653967438309493e-07, + "loss": 0.5428, + "step": 7633 + }, + { + "epoch": 3.7736991719194166, + "grad_norm": 0.1353653227214245, + "learning_rate": 1.5585413503167445e-07, + "loss": 0.5338, + "step": 7634 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.13560292230708354, + "learning_rate": 1.5517008828906498e-07, + "loss": 0.5461, + "step": 7635 + }, + { + "epoch": 3.774687924854777, + "grad_norm": 0.13601690628864616, + "learning_rate": 1.5448753425898467e-07, + "loss": 0.5331, + "step": 7636 + }, + { + "epoch": 3.775182301322457, + "grad_norm": 0.13601961811215008, + "learning_rate": 1.5380647304492625e-07, + "loss": 0.516, + "step": 7637 + }, + { + "epoch": 3.775676677790137, + "grad_norm": 0.1358869462622207, + "learning_rate": 1.5312690475015047e-07, + "loss": 0.5243, + "step": 7638 + }, + { + "epoch": 3.7761710542578175, + "grad_norm": 0.14328698146558358, + "learning_rate": 1.5244882947770269e-07, + "loss": 0.5514, + "step": 7639 + }, + { + "epoch": 3.7766654307254974, + "grad_norm": 0.13392973486502796, + "learning_rate": 1.5177224733039175e-07, + "loss": 0.5217, + "step": 7640 + }, + { + "epoch": 3.7771598071931773, + "grad_norm": 0.13531487668868988, + "learning_rate": 1.510971584108023e-07, + "loss": 0.5309, + "step": 7641 + }, + { + "epoch": 3.7776541836608577, + "grad_norm": 0.1380817683157095, + "learning_rate": 1.50423562821298e-07, + "loss": 0.5182, + "step": 7642 + }, + { + "epoch": 3.778148560128538, + "grad_norm": 0.13180005270173625, + "learning_rate": 1.4975146066400824e-07, + "loss": 0.5033, + "step": 7643 + }, + { + "epoch": 3.778642936596218, + "grad_norm": 0.1363620606756773, + "learning_rate": 1.4908085204084266e-07, + "loss": 0.5548, + "step": 7644 + }, + { + "epoch": 3.779137313063898, + "grad_norm": 0.1341096757499421, + "learning_rate": 1.4841173705347988e-07, + "loss": 0.526, + "step": 7645 + }, + { + "epoch": 3.7796316895315782, + "grad_norm": 0.13626459168642832, + "learning_rate": 1.4774411580337544e-07, + "loss": 0.5336, + "step": 7646 + }, + { + "epoch": 3.7801260659992586, + "grad_norm": 0.1331062036919269, + "learning_rate": 1.47077988391755e-07, + "loss": 0.5063, + "step": 7647 + }, + { + "epoch": 3.7806204424669385, + "grad_norm": 0.13035715224121575, + "learning_rate": 1.4641335491962006e-07, + "loss": 0.5098, + "step": 7648 + }, + { + "epoch": 3.781114818934619, + "grad_norm": 0.13166422759644955, + "learning_rate": 1.457502154877466e-07, + "loss": 0.5434, + "step": 7649 + }, + { + "epoch": 3.781609195402299, + "grad_norm": 0.13309698415259066, + "learning_rate": 1.4508857019667976e-07, + "loss": 0.5026, + "step": 7650 + }, + { + "epoch": 3.782103571869979, + "grad_norm": 0.13326598855229935, + "learning_rate": 1.4442841914674265e-07, + "loss": 0.5583, + "step": 7651 + }, + { + "epoch": 3.782597948337659, + "grad_norm": 0.13374941567793058, + "learning_rate": 1.4376976243802854e-07, + "loss": 0.5412, + "step": 7652 + }, + { + "epoch": 3.7830923248053394, + "grad_norm": 0.1328111197774785, + "learning_rate": 1.4311260017040863e-07, + "loss": 0.5131, + "step": 7653 + }, + { + "epoch": 3.7835867012730193, + "grad_norm": 0.13111445138972583, + "learning_rate": 1.42456932443521e-07, + "loss": 0.5248, + "step": 7654 + }, + { + "epoch": 3.7840810777406997, + "grad_norm": 0.13557074847784092, + "learning_rate": 1.4180275935678167e-07, + "loss": 0.5348, + "step": 7655 + }, + { + "epoch": 3.7845754542083796, + "grad_norm": 0.13670768307024125, + "learning_rate": 1.4115008100938022e-07, + "loss": 0.5253, + "step": 7656 + }, + { + "epoch": 3.78506983067606, + "grad_norm": 0.13173709685561025, + "learning_rate": 1.4049889750027745e-07, + "loss": 0.5393, + "step": 7657 + }, + { + "epoch": 3.78556420714374, + "grad_norm": 0.13374304700931083, + "learning_rate": 1.3984920892820775e-07, + "loss": 0.5085, + "step": 7658 + }, + { + "epoch": 3.7860585836114202, + "grad_norm": 0.1368262562269902, + "learning_rate": 1.392010153916812e-07, + "loss": 0.5369, + "step": 7659 + }, + { + "epoch": 3.7865529600791, + "grad_norm": 0.13226218467766843, + "learning_rate": 1.3855431698897803e-07, + "loss": 0.5128, + "step": 7660 + }, + { + "epoch": 3.7870473365467805, + "grad_norm": 0.13413326422047125, + "learning_rate": 1.379091138181532e-07, + "loss": 0.5439, + "step": 7661 + }, + { + "epoch": 3.7875417130144604, + "grad_norm": 0.13604186103929494, + "learning_rate": 1.372654059770373e-07, + "loss": 0.5273, + "step": 7662 + }, + { + "epoch": 3.7880360894821408, + "grad_norm": 0.1372177523129836, + "learning_rate": 1.3662319356322895e-07, + "loss": 0.5557, + "step": 7663 + }, + { + "epoch": 3.7885304659498207, + "grad_norm": 0.13260671878457955, + "learning_rate": 1.3598247667410359e-07, + "loss": 0.5431, + "step": 7664 + }, + { + "epoch": 3.789024842417501, + "grad_norm": 0.13759703488291716, + "learning_rate": 1.3534325540681015e-07, + "loss": 0.5389, + "step": 7665 + }, + { + "epoch": 3.789519218885181, + "grad_norm": 0.13352606549306775, + "learning_rate": 1.3470552985827e-07, + "loss": 0.547, + "step": 7666 + }, + { + "epoch": 3.7900135953528613, + "grad_norm": 0.1372600837420752, + "learning_rate": 1.3406930012517695e-07, + "loss": 0.5481, + "step": 7667 + }, + { + "epoch": 3.790507971820541, + "grad_norm": 0.13134388948721298, + "learning_rate": 1.3343456630399932e-07, + "loss": 0.5503, + "step": 7668 + }, + { + "epoch": 3.7910023482882216, + "grad_norm": 0.13004558484759512, + "learning_rate": 1.3280132849097793e-07, + "loss": 0.5219, + "step": 7669 + }, + { + "epoch": 3.7914967247559015, + "grad_norm": 0.13368475948757663, + "learning_rate": 1.3216958678212487e-07, + "loss": 0.5159, + "step": 7670 + }, + { + "epoch": 3.791991101223582, + "grad_norm": 0.1322258025929566, + "learning_rate": 1.3153934127323133e-07, + "loss": 0.5163, + "step": 7671 + }, + { + "epoch": 3.7924854776912618, + "grad_norm": 0.12975600616684893, + "learning_rate": 1.3091059205985413e-07, + "loss": 0.5128, + "step": 7672 + }, + { + "epoch": 3.792979854158942, + "grad_norm": 0.1371222884358027, + "learning_rate": 1.3028333923732816e-07, + "loss": 0.5271, + "step": 7673 + }, + { + "epoch": 3.793474230626622, + "grad_norm": 0.13144496654219, + "learning_rate": 1.296575829007607e-07, + "loss": 0.5077, + "step": 7674 + }, + { + "epoch": 3.7939686070943024, + "grad_norm": 0.12952281294445514, + "learning_rate": 1.2903332314502914e-07, + "loss": 0.4991, + "step": 7675 + }, + { + "epoch": 3.7944629835619823, + "grad_norm": 0.1370017078571514, + "learning_rate": 1.284105600647878e-07, + "loss": 0.5364, + "step": 7676 + }, + { + "epoch": 3.7949573600296627, + "grad_norm": 0.13194085488198343, + "learning_rate": 1.2778929375446337e-07, + "loss": 0.5204, + "step": 7677 + }, + { + "epoch": 3.7954517364973426, + "grad_norm": 0.13526273733933622, + "learning_rate": 1.271695243082527e-07, + "loss": 0.4946, + "step": 7678 + }, + { + "epoch": 3.795946112965023, + "grad_norm": 0.13116297107351296, + "learning_rate": 1.265512518201295e-07, + "loss": 0.583, + "step": 7679 + }, + { + "epoch": 3.796440489432703, + "grad_norm": 0.1345842854220472, + "learning_rate": 1.2593447638383772e-07, + "loss": 0.538, + "step": 7680 + }, + { + "epoch": 3.796934865900383, + "grad_norm": 0.13043110134858785, + "learning_rate": 1.2531919809289584e-07, + "loss": 0.4853, + "step": 7681 + }, + { + "epoch": 3.797429242368063, + "grad_norm": 0.13601858930190283, + "learning_rate": 1.247054170405937e-07, + "loss": 0.5564, + "step": 7682 + }, + { + "epoch": 3.7979236188357435, + "grad_norm": 0.13731433778240168, + "learning_rate": 1.2409313331999685e-07, + "loss": 0.5191, + "step": 7683 + }, + { + "epoch": 3.7984179953034234, + "grad_norm": 0.1347422542846958, + "learning_rate": 1.2348234702394102e-07, + "loss": 0.5609, + "step": 7684 + }, + { + "epoch": 3.7989123717711037, + "grad_norm": 0.1337853996089343, + "learning_rate": 1.228730582450366e-07, + "loss": 0.5378, + "step": 7685 + }, + { + "epoch": 3.799406748238784, + "grad_norm": 0.13489695308729557, + "learning_rate": 1.2226526707566744e-07, + "loss": 0.5532, + "step": 7686 + }, + { + "epoch": 3.799901124706464, + "grad_norm": 0.13053272877614622, + "learning_rate": 1.216589736079854e-07, + "loss": 0.5258, + "step": 7687 + }, + { + "epoch": 3.800395501174144, + "grad_norm": 0.1309168745166383, + "learning_rate": 1.210541779339247e-07, + "loss": 0.5074, + "step": 7688 + }, + { + "epoch": 3.8008898776418243, + "grad_norm": 0.13317188277882908, + "learning_rate": 1.2045088014518313e-07, + "loss": 0.5429, + "step": 7689 + }, + { + "epoch": 3.8013842541095046, + "grad_norm": 0.13657860906287791, + "learning_rate": 1.1984908033323528e-07, + "loss": 0.5016, + "step": 7690 + }, + { + "epoch": 3.8018786305771846, + "grad_norm": 0.13415226122702212, + "learning_rate": 1.1924877858933037e-07, + "loss": 0.5311, + "step": 7691 + }, + { + "epoch": 3.8023730070448645, + "grad_norm": 0.13383227992797814, + "learning_rate": 1.186499750044856e-07, + "loss": 0.543, + "step": 7692 + }, + { + "epoch": 3.802867383512545, + "grad_norm": 0.13546751569354562, + "learning_rate": 1.1805266966949725e-07, + "loss": 0.5318, + "step": 7693 + }, + { + "epoch": 3.803361759980225, + "grad_norm": 0.13161982214002502, + "learning_rate": 1.174568626749295e-07, + "loss": 0.5012, + "step": 7694 + }, + { + "epoch": 3.803856136447905, + "grad_norm": 0.13355779241638754, + "learning_rate": 1.1686255411112124e-07, + "loss": 0.4996, + "step": 7695 + }, + { + "epoch": 3.804350512915585, + "grad_norm": 0.1331958189951175, + "learning_rate": 1.1626974406818258e-07, + "loss": 0.5386, + "step": 7696 + }, + { + "epoch": 3.8048448893832654, + "grad_norm": 0.13688258238326723, + "learning_rate": 1.1567843263600054e-07, + "loss": 0.5346, + "step": 7697 + }, + { + "epoch": 3.8053392658509457, + "grad_norm": 0.1315945305089278, + "learning_rate": 1.1508861990422893e-07, + "loss": 0.5168, + "step": 7698 + }, + { + "epoch": 3.8058336423186256, + "grad_norm": 0.13206986350749586, + "learning_rate": 1.1450030596229955e-07, + "loss": 0.5267, + "step": 7699 + }, + { + "epoch": 3.8063280187863056, + "grad_norm": 0.13096353583680148, + "learning_rate": 1.139134908994155e-07, + "loss": 0.5151, + "step": 7700 + }, + { + "epoch": 3.806822395253986, + "grad_norm": 0.13284585231352689, + "learning_rate": 1.1332817480455005e-07, + "loss": 0.5072, + "step": 7701 + }, + { + "epoch": 3.8073167717216663, + "grad_norm": 0.135844981021941, + "learning_rate": 1.127443577664522e-07, + "loss": 0.5482, + "step": 7702 + }, + { + "epoch": 3.807811148189346, + "grad_norm": 0.13401637290304322, + "learning_rate": 1.1216203987364338e-07, + "loss": 0.5165, + "step": 7703 + }, + { + "epoch": 3.808305524657026, + "grad_norm": 0.12916353491537177, + "learning_rate": 1.1158122121441629e-07, + "loss": 0.5125, + "step": 7704 + }, + { + "epoch": 3.8087999011247065, + "grad_norm": 0.13455800013390218, + "learning_rate": 1.1100190187683602e-07, + "loss": 0.5254, + "step": 7705 + }, + { + "epoch": 3.809294277592387, + "grad_norm": 0.13252502483693282, + "learning_rate": 1.1042408194874232e-07, + "loss": 0.5423, + "step": 7706 + }, + { + "epoch": 3.8097886540600667, + "grad_norm": 0.13174761997096845, + "learning_rate": 1.0984776151774623e-07, + "loss": 0.497, + "step": 7707 + }, + { + "epoch": 3.8102830305277466, + "grad_norm": 0.13330064389437946, + "learning_rate": 1.0927294067123229e-07, + "loss": 0.5311, + "step": 7708 + }, + { + "epoch": 3.810777406995427, + "grad_norm": 0.1301236316099045, + "learning_rate": 1.0869961949635633e-07, + "loss": 0.5121, + "step": 7709 + }, + { + "epoch": 3.8112717834631074, + "grad_norm": 0.13308050629466697, + "learning_rate": 1.0812779808004769e-07, + "loss": 0.5162, + "step": 7710 + }, + { + "epoch": 3.8117661599307873, + "grad_norm": 0.13442751823329202, + "learning_rate": 1.0755747650900928e-07, + "loss": 0.5255, + "step": 7711 + }, + { + "epoch": 3.812260536398467, + "grad_norm": 0.13296170954443462, + "learning_rate": 1.0698865486971521e-07, + "loss": 0.533, + "step": 7712 + }, + { + "epoch": 3.8127549128661475, + "grad_norm": 0.1350132182519462, + "learning_rate": 1.0642133324841097e-07, + "loss": 0.5233, + "step": 7713 + }, + { + "epoch": 3.813249289333828, + "grad_norm": 0.128335895702254, + "learning_rate": 1.0585551173111775e-07, + "loss": 0.5227, + "step": 7714 + }, + { + "epoch": 3.813743665801508, + "grad_norm": 0.1348168482397941, + "learning_rate": 1.052911904036269e-07, + "loss": 0.5438, + "step": 7715 + }, + { + "epoch": 3.8142380422691877, + "grad_norm": 0.13631364173799937, + "learning_rate": 1.0472836935150332e-07, + "loss": 0.5008, + "step": 7716 + }, + { + "epoch": 3.814732418736868, + "grad_norm": 0.13670754601201265, + "learning_rate": 1.0416704866008543e-07, + "loss": 0.5072, + "step": 7717 + }, + { + "epoch": 3.8152267952045484, + "grad_norm": 0.14349566583497594, + "learning_rate": 1.0360722841448067e-07, + "loss": 0.5197, + "step": 7718 + }, + { + "epoch": 3.8157211716722284, + "grad_norm": 0.1344698065847337, + "learning_rate": 1.0304890869957229e-07, + "loss": 0.5245, + "step": 7719 + }, + { + "epoch": 3.8162155481399083, + "grad_norm": 0.13187905157697166, + "learning_rate": 1.0249208960001477e-07, + "loss": 0.5082, + "step": 7720 + }, + { + "epoch": 3.8167099246075886, + "grad_norm": 0.13109501796914214, + "learning_rate": 1.0193677120023615e-07, + "loss": 0.5295, + "step": 7721 + }, + { + "epoch": 3.817204301075269, + "grad_norm": 0.1307323774293087, + "learning_rate": 1.0138295358443462e-07, + "loss": 0.5446, + "step": 7722 + }, + { + "epoch": 3.817698677542949, + "grad_norm": 0.13159985845416441, + "learning_rate": 1.0083063683658412e-07, + "loss": 0.5437, + "step": 7723 + }, + { + "epoch": 3.8181930540106293, + "grad_norm": 0.13421717208496958, + "learning_rate": 1.0027982104042655e-07, + "loss": 0.5248, + "step": 7724 + }, + { + "epoch": 3.818687430478309, + "grad_norm": 0.13255562908013493, + "learning_rate": 9.973050627948067e-08, + "loss": 0.536, + "step": 7725 + }, + { + "epoch": 3.8191818069459895, + "grad_norm": 0.13138969204943082, + "learning_rate": 9.918269263703539e-08, + "loss": 0.5607, + "step": 7726 + }, + { + "epoch": 3.8196761834136694, + "grad_norm": 0.13853193229854302, + "learning_rate": 9.863638019615206e-08, + "loss": 0.5282, + "step": 7727 + }, + { + "epoch": 3.82017055988135, + "grad_norm": 0.13316752872858137, + "learning_rate": 9.809156903966555e-08, + "loss": 0.5194, + "step": 7728 + }, + { + "epoch": 3.8206649363490297, + "grad_norm": 0.1352918652633144, + "learning_rate": 9.754825925018085e-08, + "loss": 0.5698, + "step": 7729 + }, + { + "epoch": 3.82115931281671, + "grad_norm": 0.14026546122712943, + "learning_rate": 9.700645091007877e-08, + "loss": 0.5713, + "step": 7730 + }, + { + "epoch": 3.82165368928439, + "grad_norm": 0.13480154613560577, + "learning_rate": 9.646614410150801e-08, + "loss": 0.5248, + "step": 7731 + }, + { + "epoch": 3.8221480657520703, + "grad_norm": 0.1335530191243185, + "learning_rate": 9.592733890639416e-08, + "loss": 0.5214, + "step": 7732 + }, + { + "epoch": 3.8226424422197502, + "grad_norm": 0.13299181590075596, + "learning_rate": 9.539003540643299e-08, + "loss": 0.5601, + "step": 7733 + }, + { + "epoch": 3.8231368186874306, + "grad_norm": 0.1314668268978899, + "learning_rate": 9.485423368309154e-08, + "loss": 0.5173, + "step": 7734 + }, + { + "epoch": 3.8236311951551105, + "grad_norm": 0.13441218794954088, + "learning_rate": 9.431993381761039e-08, + "loss": 0.5338, + "step": 7735 + }, + { + "epoch": 3.824125571622791, + "grad_norm": 0.13427826179672953, + "learning_rate": 9.378713589100141e-08, + "loss": 0.4995, + "step": 7736 + }, + { + "epoch": 3.824619948090471, + "grad_norm": 0.1391179751293926, + "learning_rate": 9.325583998405107e-08, + "loss": 0.5312, + "step": 7737 + }, + { + "epoch": 3.825114324558151, + "grad_norm": 0.12944332731310812, + "learning_rate": 9.272604617731718e-08, + "loss": 0.534, + "step": 7738 + }, + { + "epoch": 3.825608701025831, + "grad_norm": 0.12908292164731527, + "learning_rate": 9.219775455112656e-08, + "loss": 0.5125, + "step": 7739 + }, + { + "epoch": 3.8261030774935114, + "grad_norm": 0.13733414968172625, + "learning_rate": 9.167096518558405e-08, + "loss": 0.5517, + "step": 7740 + }, + { + "epoch": 3.8265974539611913, + "grad_norm": 0.1297805090783984, + "learning_rate": 9.114567816056019e-08, + "loss": 0.5332, + "step": 7741 + }, + { + "epoch": 3.8270918304288717, + "grad_norm": 0.12791111983695885, + "learning_rate": 9.06218935557035e-08, + "loss": 0.4966, + "step": 7742 + }, + { + "epoch": 3.8275862068965516, + "grad_norm": 0.13212287375391482, + "learning_rate": 9.009961145043266e-08, + "loss": 0.5043, + "step": 7743 + }, + { + "epoch": 3.828080583364232, + "grad_norm": 0.13856902076528757, + "learning_rate": 8.957883192393657e-08, + "loss": 0.5762, + "step": 7744 + }, + { + "epoch": 3.828574959831912, + "grad_norm": 0.13218567389997146, + "learning_rate": 8.905955505517761e-08, + "loss": 0.5208, + "step": 7745 + }, + { + "epoch": 3.8290693362995922, + "grad_norm": 0.13970980998644822, + "learning_rate": 8.854178092289279e-08, + "loss": 0.5626, + "step": 7746 + }, + { + "epoch": 3.829563712767272, + "grad_norm": 0.13229947169621253, + "learning_rate": 8.80255096055871e-08, + "loss": 0.5108, + "step": 7747 + }, + { + "epoch": 3.8300580892349525, + "grad_norm": 0.14451182922628755, + "learning_rate": 8.751074118154012e-08, + "loss": 0.5389, + "step": 7748 + }, + { + "epoch": 3.8305524657026324, + "grad_norm": 0.13310493602578255, + "learning_rate": 8.69974757288039e-08, + "loss": 0.5485, + "step": 7749 + }, + { + "epoch": 3.8310468421703128, + "grad_norm": 0.141341779746732, + "learning_rate": 8.648571332520062e-08, + "loss": 0.5213, + "step": 7750 + }, + { + "epoch": 3.8315412186379927, + "grad_norm": 0.12971833895964344, + "learning_rate": 8.597545404832707e-08, + "loss": 0.5099, + "step": 7751 + }, + { + "epoch": 3.832035595105673, + "grad_norm": 0.13597468354593345, + "learning_rate": 8.546669797554919e-08, + "loss": 0.5079, + "step": 7752 + }, + { + "epoch": 3.832529971573353, + "grad_norm": 0.13229527516795161, + "learning_rate": 8.495944518400856e-08, + "loss": 0.5039, + "step": 7753 + }, + { + "epoch": 3.8330243480410333, + "grad_norm": 0.13401888102924642, + "learning_rate": 8.44536957506159e-08, + "loss": 0.5541, + "step": 7754 + }, + { + "epoch": 3.8335187245087132, + "grad_norm": 0.1317198230219419, + "learning_rate": 8.394944975205433e-08, + "loss": 0.51, + "step": 7755 + }, + { + "epoch": 3.8340131009763936, + "grad_norm": 0.1312306291875465, + "learning_rate": 8.344670726478044e-08, + "loss": 0.5533, + "step": 7756 + }, + { + "epoch": 3.8345074774440735, + "grad_norm": 0.1361255518536946, + "learning_rate": 8.294546836502215e-08, + "loss": 0.5312, + "step": 7757 + }, + { + "epoch": 3.835001853911754, + "grad_norm": 0.13275560133820538, + "learning_rate": 8.244573312877869e-08, + "loss": 0.5182, + "step": 7758 + }, + { + "epoch": 3.8354962303794338, + "grad_norm": 0.13893930212522154, + "learning_rate": 8.194750163182164e-08, + "loss": 0.5831, + "step": 7759 + }, + { + "epoch": 3.835990606847114, + "grad_norm": 0.13371363442343084, + "learning_rate": 8.145077394969614e-08, + "loss": 0.518, + "step": 7760 + }, + { + "epoch": 3.8364849833147945, + "grad_norm": 0.13821290997357621, + "learning_rate": 8.09555501577175e-08, + "loss": 0.5384, + "step": 7761 + }, + { + "epoch": 3.8369793597824744, + "grad_norm": 0.1324860978576002, + "learning_rate": 8.046183033097343e-08, + "loss": 0.5183, + "step": 7762 + }, + { + "epoch": 3.8374737362501543, + "grad_norm": 0.13532388831425712, + "learning_rate": 7.996961454432294e-08, + "loss": 0.5467, + "step": 7763 + }, + { + "epoch": 3.8379681127178347, + "grad_norm": 0.13351290119095166, + "learning_rate": 7.947890287239856e-08, + "loss": 0.5219, + "step": 7764 + }, + { + "epoch": 3.838462489185515, + "grad_norm": 0.12954832082895018, + "learning_rate": 7.898969538960411e-08, + "loss": 0.4914, + "step": 7765 + }, + { + "epoch": 3.838956865653195, + "grad_norm": 0.1364268663103952, + "learning_rate": 7.850199217011578e-08, + "loss": 0.5645, + "step": 7766 + }, + { + "epoch": 3.839451242120875, + "grad_norm": 0.13492085552556388, + "learning_rate": 7.801579328788001e-08, + "loss": 0.565, + "step": 7767 + }, + { + "epoch": 3.839945618588555, + "grad_norm": 0.13604274814390296, + "learning_rate": 7.753109881661558e-08, + "loss": 0.5302, + "step": 7768 + }, + { + "epoch": 3.8404399950562356, + "grad_norm": 0.135813956372055, + "learning_rate": 7.704790882981483e-08, + "loss": 0.5258, + "step": 7769 + }, + { + "epoch": 3.8409343715239155, + "grad_norm": 0.13093404303390982, + "learning_rate": 7.656622340074139e-08, + "loss": 0.4979, + "step": 7770 + }, + { + "epoch": 3.8414287479915954, + "grad_norm": 0.1500562736031621, + "learning_rate": 7.608604260242903e-08, + "loss": 0.5485, + "step": 7771 + }, + { + "epoch": 3.8419231244592758, + "grad_norm": 0.1353951282276174, + "learning_rate": 7.560736650768619e-08, + "loss": 0.5163, + "step": 7772 + }, + { + "epoch": 3.842417500926956, + "grad_norm": 0.13650400506640042, + "learning_rate": 7.513019518909037e-08, + "loss": 0.5324, + "step": 7773 + }, + { + "epoch": 3.842911877394636, + "grad_norm": 0.1302216903134105, + "learning_rate": 7.465452871899259e-08, + "loss": 0.518, + "step": 7774 + }, + { + "epoch": 3.843406253862316, + "grad_norm": 0.12974441727747066, + "learning_rate": 7.418036716951627e-08, + "loss": 0.5515, + "step": 7775 + }, + { + "epoch": 3.8439006303299963, + "grad_norm": 0.13699508436635693, + "learning_rate": 7.370771061255388e-08, + "loss": 0.498, + "step": 7776 + }, + { + "epoch": 3.8443950067976767, + "grad_norm": 0.13217889631706248, + "learning_rate": 7.323655911977367e-08, + "loss": 0.5228, + "step": 7777 + }, + { + "epoch": 3.8448893832653566, + "grad_norm": 0.13635499255423386, + "learning_rate": 7.276691276261182e-08, + "loss": 0.5519, + "step": 7778 + }, + { + "epoch": 3.8453837597330365, + "grad_norm": 0.14087387453756578, + "learning_rate": 7.229877161227805e-08, + "loss": 0.5311, + "step": 7779 + }, + { + "epoch": 3.845878136200717, + "grad_norm": 0.1336564020870909, + "learning_rate": 7.183213573975334e-08, + "loss": 0.5171, + "step": 7780 + }, + { + "epoch": 3.846372512668397, + "grad_norm": 0.13272038582282025, + "learning_rate": 7.136700521579331e-08, + "loss": 0.5343, + "step": 7781 + }, + { + "epoch": 3.846866889136077, + "grad_norm": 0.13088482471239854, + "learning_rate": 7.090338011092046e-08, + "loss": 0.5152, + "step": 7782 + }, + { + "epoch": 3.847361265603757, + "grad_norm": 0.13689438937962922, + "learning_rate": 7.044126049543409e-08, + "loss": 0.5121, + "step": 7783 + }, + { + "epoch": 3.8478556420714374, + "grad_norm": 0.13417023026206454, + "learning_rate": 6.998064643939928e-08, + "loss": 0.5298, + "step": 7784 + }, + { + "epoch": 3.8483500185391177, + "grad_norm": 0.13995405075382733, + "learning_rate": 6.952153801265793e-08, + "loss": 0.5624, + "step": 7785 + }, + { + "epoch": 3.8488443950067976, + "grad_norm": 0.1313747430269667, + "learning_rate": 6.906393528482214e-08, + "loss": 0.5252, + "step": 7786 + }, + { + "epoch": 3.8493387714744776, + "grad_norm": 0.1352244539585264, + "learning_rate": 6.860783832527529e-08, + "loss": 0.528, + "step": 7787 + }, + { + "epoch": 3.849833147942158, + "grad_norm": 0.13233348697683145, + "learning_rate": 6.815324720317207e-08, + "loss": 0.5503, + "step": 7788 + }, + { + "epoch": 3.8503275244098383, + "grad_norm": 0.13822431277528732, + "learning_rate": 6.770016198744067e-08, + "loss": 0.5441, + "step": 7789 + }, + { + "epoch": 3.850821900877518, + "grad_norm": 0.13896836301063065, + "learning_rate": 6.724858274677726e-08, + "loss": 0.548, + "step": 7790 + }, + { + "epoch": 3.851316277345198, + "grad_norm": 0.13081692246321475, + "learning_rate": 6.679850954965483e-08, + "loss": 0.5126, + "step": 7791 + }, + { + "epoch": 3.8518106538128785, + "grad_norm": 0.13486073125867698, + "learning_rate": 6.634994246431437e-08, + "loss": 0.5243, + "step": 7792 + }, + { + "epoch": 3.852305030280559, + "grad_norm": 0.1355282094194547, + "learning_rate": 6.590288155876922e-08, + "loss": 0.5375, + "step": 7793 + }, + { + "epoch": 3.8527994067482387, + "grad_norm": 0.1286247558070802, + "learning_rate": 6.545732690080298e-08, + "loss": 0.5631, + "step": 7794 + }, + { + "epoch": 3.853293783215919, + "grad_norm": 0.1332063165767906, + "learning_rate": 6.50132785579749e-08, + "loss": 0.514, + "step": 7795 + }, + { + "epoch": 3.853788159683599, + "grad_norm": 0.13432490355399285, + "learning_rate": 6.457073659761226e-08, + "loss": 0.4948, + "step": 7796 + }, + { + "epoch": 3.8542825361512794, + "grad_norm": 0.13385736074244364, + "learning_rate": 6.412970108681472e-08, + "loss": 0.5449, + "step": 7797 + }, + { + "epoch": 3.8547769126189593, + "grad_norm": 0.13752208764320187, + "learning_rate": 6.369017209245543e-08, + "loss": 0.5071, + "step": 7798 + }, + { + "epoch": 3.8552712890866396, + "grad_norm": 0.13145934883364477, + "learning_rate": 6.325214968117555e-08, + "loss": 0.5328, + "step": 7799 + }, + { + "epoch": 3.8557656655543195, + "grad_norm": 0.13452558530033198, + "learning_rate": 6.281563391939083e-08, + "loss": 0.5097, + "step": 7800 + }, + { + "epoch": 3.856260042022, + "grad_norm": 0.13568947363152312, + "learning_rate": 6.238062487328833e-08, + "loss": 0.5331, + "step": 7801 + }, + { + "epoch": 3.85675441848968, + "grad_norm": 0.1333997643830496, + "learning_rate": 6.194712260882307e-08, + "loss": 0.5337, + "step": 7802 + }, + { + "epoch": 3.85724879495736, + "grad_norm": 0.1373800113793547, + "learning_rate": 6.151512719172803e-08, + "loss": 0.5267, + "step": 7803 + }, + { + "epoch": 3.85774317142504, + "grad_norm": 0.13708029053452084, + "learning_rate": 6.108463868750081e-08, + "loss": 0.5503, + "step": 7804 + }, + { + "epoch": 3.8582375478927204, + "grad_norm": 0.1345207390552049, + "learning_rate": 6.065565716141586e-08, + "loss": 0.5267, + "step": 7805 + }, + { + "epoch": 3.8587319243604004, + "grad_norm": 0.1330729186655867, + "learning_rate": 6.022818267851671e-08, + "loss": 0.5248, + "step": 7806 + }, + { + "epoch": 3.8592263008280807, + "grad_norm": 0.13418482058629305, + "learning_rate": 5.980221530361819e-08, + "loss": 0.5381, + "step": 7807 + }, + { + "epoch": 3.8597206772957606, + "grad_norm": 0.13605214856840117, + "learning_rate": 5.93777551013075e-08, + "loss": 0.5156, + "step": 7808 + }, + { + "epoch": 3.860215053763441, + "grad_norm": 0.13494093146129707, + "learning_rate": 5.895480213594318e-08, + "loss": 0.5281, + "step": 7809 + }, + { + "epoch": 3.860709430231121, + "grad_norm": 0.1313463936722279, + "learning_rate": 5.8533356471655037e-08, + "loss": 0.5719, + "step": 7810 + }, + { + "epoch": 3.8612038066988013, + "grad_norm": 0.13471330502322026, + "learning_rate": 5.811341817234417e-08, + "loss": 0.5164, + "step": 7811 + }, + { + "epoch": 3.861698183166481, + "grad_norm": 0.1320279592513443, + "learning_rate": 5.7694987301682994e-08, + "loss": 0.5035, + "step": 7812 + }, + { + "epoch": 3.8621925596341615, + "grad_norm": 0.13091627645415274, + "learning_rate": 5.72780639231163e-08, + "loss": 0.5179, + "step": 7813 + }, + { + "epoch": 3.8626869361018414, + "grad_norm": 0.1379981712733415, + "learning_rate": 5.686264809985909e-08, + "loss": 0.5035, + "step": 7814 + }, + { + "epoch": 3.863181312569522, + "grad_norm": 0.13699214879363678, + "learning_rate": 5.644873989489985e-08, + "loss": 0.5587, + "step": 7815 + }, + { + "epoch": 3.8636756890372017, + "grad_norm": 0.1394373017860169, + "learning_rate": 5.603633937099395e-08, + "loss": 0.5197, + "step": 7816 + }, + { + "epoch": 3.864170065504882, + "grad_norm": 0.13100542476239793, + "learning_rate": 5.56254465906747e-08, + "loss": 0.5274, + "step": 7817 + }, + { + "epoch": 3.864664441972562, + "grad_norm": 0.12878861677540648, + "learning_rate": 5.521606161624116e-08, + "loss": 0.484, + "step": 7818 + }, + { + "epoch": 3.8651588184402423, + "grad_norm": 0.13709966584875177, + "learning_rate": 5.4808184509765884e-08, + "loss": 0.529, + "step": 7819 + }, + { + "epoch": 3.8656531949079223, + "grad_norm": 0.13375586504115552, + "learning_rate": 5.440181533309386e-08, + "loss": 0.5341, + "step": 7820 + }, + { + "epoch": 3.8661475713756026, + "grad_norm": 0.12731197359290106, + "learning_rate": 5.399695414784023e-08, + "loss": 0.5023, + "step": 7821 + }, + { + "epoch": 3.8666419478432825, + "grad_norm": 0.1334388966623611, + "learning_rate": 5.359360101539035e-08, + "loss": 0.5009, + "step": 7822 + }, + { + "epoch": 3.867136324310963, + "grad_norm": 0.1336524881072893, + "learning_rate": 5.319175599690418e-08, + "loss": 0.524, + "step": 7823 + }, + { + "epoch": 3.867630700778643, + "grad_norm": 0.1379507287139838, + "learning_rate": 5.2791419153310764e-08, + "loss": 0.5348, + "step": 7824 + }, + { + "epoch": 3.868125077246323, + "grad_norm": 0.12883682524954754, + "learning_rate": 5.239259054530821e-08, + "loss": 0.536, + "step": 7825 + }, + { + "epoch": 3.868619453714003, + "grad_norm": 0.1317369585543254, + "learning_rate": 5.199527023337259e-08, + "loss": 0.5473, + "step": 7826 + }, + { + "epoch": 3.8691138301816834, + "grad_norm": 0.13508091248865528, + "learning_rate": 5.1599458277744596e-08, + "loss": 0.5473, + "step": 7827 + }, + { + "epoch": 3.8696082066493633, + "grad_norm": 0.13856295902290583, + "learning_rate": 5.1205154738438457e-08, + "loss": 0.5043, + "step": 7828 + }, + { + "epoch": 3.8701025831170437, + "grad_norm": 0.1493063825242346, + "learning_rate": 5.0812359675240786e-08, + "loss": 0.5203, + "step": 7829 + }, + { + "epoch": 3.8705969595847236, + "grad_norm": 0.1318199984190395, + "learning_rate": 5.0421073147709495e-08, + "loss": 0.5039, + "step": 7830 + }, + { + "epoch": 3.871091336052404, + "grad_norm": 0.13403552044077463, + "learning_rate": 5.003129521517269e-08, + "loss": 0.5123, + "step": 7831 + }, + { + "epoch": 3.8715857125200843, + "grad_norm": 0.13102698239013424, + "learning_rate": 4.964302593672976e-08, + "loss": 0.5075, + "step": 7832 + }, + { + "epoch": 3.8720800889877642, + "grad_norm": 0.1330282603986519, + "learning_rate": 4.925626537125139e-08, + "loss": 0.5347, + "step": 7833 + }, + { + "epoch": 3.872574465455444, + "grad_norm": 0.13538802321975107, + "learning_rate": 4.8871013577379554e-08, + "loss": 0.5465, + "step": 7834 + }, + { + "epoch": 3.8730688419231245, + "grad_norm": 0.13380977125792828, + "learning_rate": 4.8487270613528644e-08, + "loss": 0.5425, + "step": 7835 + }, + { + "epoch": 3.873563218390805, + "grad_norm": 0.13359829510497723, + "learning_rate": 4.810503653788212e-08, + "loss": 0.5251, + "step": 7836 + }, + { + "epoch": 3.8740575948584848, + "grad_norm": 0.13181921878514177, + "learning_rate": 4.772431140839695e-08, + "loss": 0.5006, + "step": 7837 + }, + { + "epoch": 3.8745519713261647, + "grad_norm": 0.1322033125597414, + "learning_rate": 4.734509528280029e-08, + "loss": 0.5284, + "step": 7838 + }, + { + "epoch": 3.875046347793845, + "grad_norm": 0.13724916883325208, + "learning_rate": 4.696738821858948e-08, + "loss": 0.5659, + "step": 7839 + }, + { + "epoch": 3.8755407242615254, + "grad_norm": 0.13448372637920566, + "learning_rate": 4.659119027303427e-08, + "loss": 0.5172, + "step": 7840 + }, + { + "epoch": 3.8760351007292053, + "grad_norm": 0.13680036420765299, + "learning_rate": 4.621650150317569e-08, + "loss": 0.4948, + "step": 7841 + }, + { + "epoch": 3.8765294771968852, + "grad_norm": 0.13349270686298834, + "learning_rate": 4.584332196582497e-08, + "loss": 0.5189, + "step": 7842 + }, + { + "epoch": 3.8770238536645656, + "grad_norm": 0.13220958694814575, + "learning_rate": 4.547165171756574e-08, + "loss": 0.5187, + "step": 7843 + }, + { + "epoch": 3.877518230132246, + "grad_norm": 0.1366673767429753, + "learning_rate": 4.510149081475179e-08, + "loss": 0.5129, + "step": 7844 + }, + { + "epoch": 3.878012606599926, + "grad_norm": 0.13542805134353714, + "learning_rate": 4.473283931350825e-08, + "loss": 0.5308, + "step": 7845 + }, + { + "epoch": 3.8785069830676058, + "grad_norm": 0.131708238126416, + "learning_rate": 4.43656972697315e-08, + "loss": 0.5288, + "step": 7846 + }, + { + "epoch": 3.879001359535286, + "grad_norm": 0.1333011583951022, + "learning_rate": 4.400006473908924e-08, + "loss": 0.5158, + "step": 7847 + }, + { + "epoch": 3.8794957360029665, + "grad_norm": 0.13552862984599054, + "learning_rate": 4.3635941777020465e-08, + "loss": 0.512, + "step": 7848 + }, + { + "epoch": 3.8799901124706464, + "grad_norm": 0.1354006659385537, + "learning_rate": 4.327332843873433e-08, + "loss": 0.5406, + "step": 7849 + }, + { + "epoch": 3.8804844889383263, + "grad_norm": 0.1333364844535761, + "learning_rate": 4.291222477921242e-08, + "loss": 0.5148, + "step": 7850 + }, + { + "epoch": 3.8809788654060067, + "grad_norm": 0.13627680277792809, + "learning_rate": 4.255263085320538e-08, + "loss": 0.557, + "step": 7851 + }, + { + "epoch": 3.881473241873687, + "grad_norm": 0.14113835379263295, + "learning_rate": 4.219454671523848e-08, + "loss": 0.4958, + "step": 7852 + }, + { + "epoch": 3.881967618341367, + "grad_norm": 0.1315607114155915, + "learning_rate": 4.1837972419604966e-08, + "loss": 0.5342, + "step": 7853 + }, + { + "epoch": 3.882461994809047, + "grad_norm": 0.1393360155902254, + "learning_rate": 4.148290802036825e-08, + "loss": 0.528, + "step": 7854 + }, + { + "epoch": 3.882956371276727, + "grad_norm": 0.13417776354441519, + "learning_rate": 4.112935357136749e-08, + "loss": 0.5443, + "step": 7855 + }, + { + "epoch": 3.8834507477444076, + "grad_norm": 0.13197324849342684, + "learning_rate": 4.077730912620759e-08, + "loss": 0.5128, + "step": 7856 + }, + { + "epoch": 3.8839451242120875, + "grad_norm": 0.1392764669796679, + "learning_rate": 4.0426774738268056e-08, + "loss": 0.5358, + "step": 7857 + }, + { + "epoch": 3.8844395006797674, + "grad_norm": 0.12907089308999112, + "learning_rate": 4.0077750460698616e-08, + "loss": 0.4962, + "step": 7858 + }, + { + "epoch": 3.8849338771474478, + "grad_norm": 0.12934762053599697, + "learning_rate": 3.973023634642026e-08, + "loss": 0.5324, + "step": 7859 + }, + { + "epoch": 3.885428253615128, + "grad_norm": 0.13270262127634402, + "learning_rate": 3.938423244812195e-08, + "loss": 0.5, + "step": 7860 + }, + { + "epoch": 3.885922630082808, + "grad_norm": 0.13752171806025693, + "learning_rate": 3.903973881826839e-08, + "loss": 0.547, + "step": 7861 + }, + { + "epoch": 3.886417006550488, + "grad_norm": 0.13365181685511418, + "learning_rate": 3.8696755509093354e-08, + "loss": 0.5414, + "step": 7862 + }, + { + "epoch": 3.8869113830181683, + "grad_norm": 0.1353956031731928, + "learning_rate": 3.835528257259857e-08, + "loss": 0.5383, + "step": 7863 + }, + { + "epoch": 3.8874057594858487, + "grad_norm": 0.1340044269004273, + "learning_rate": 3.8015320060563745e-08, + "loss": 0.5199, + "step": 7864 + }, + { + "epoch": 3.8879001359535286, + "grad_norm": 0.1367581585314765, + "learning_rate": 3.7676868024530966e-08, + "loss": 0.5666, + "step": 7865 + }, + { + "epoch": 3.8883945124212085, + "grad_norm": 0.13835734673128758, + "learning_rate": 3.733992651582141e-08, + "loss": 0.5316, + "step": 7866 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.12958902033519984, + "learning_rate": 3.700449558552088e-08, + "loss": 0.5086, + "step": 7867 + }, + { + "epoch": 3.889383265356569, + "grad_norm": 0.13813898343218556, + "learning_rate": 3.6670575284489804e-08, + "loss": 0.546, + "step": 7868 + }, + { + "epoch": 3.889877641824249, + "grad_norm": 0.1343136798791209, + "learning_rate": 3.6338165663358796e-08, + "loss": 0.5111, + "step": 7869 + }, + { + "epoch": 3.8903720182919295, + "grad_norm": 0.13575037050715563, + "learning_rate": 3.6007266772528635e-08, + "loss": 0.5382, + "step": 7870 + }, + { + "epoch": 3.8908663947596094, + "grad_norm": 0.13350845527241495, + "learning_rate": 3.5677878662171425e-08, + "loss": 0.5237, + "step": 7871 + }, + { + "epoch": 3.8913607712272897, + "grad_norm": 0.13783241865686063, + "learning_rate": 3.535000138223166e-08, + "loss": 0.5699, + "step": 7872 + }, + { + "epoch": 3.8918551476949697, + "grad_norm": 0.1349427197697064, + "learning_rate": 3.5023634982420674e-08, + "loss": 0.5244, + "step": 7873 + }, + { + "epoch": 3.89234952416265, + "grad_norm": 0.1361745404703039, + "learning_rate": 3.469877951222667e-08, + "loss": 0.5419, + "step": 7874 + }, + { + "epoch": 3.89284390063033, + "grad_norm": 0.13682329605368312, + "learning_rate": 3.437543502090357e-08, + "loss": 0.5225, + "step": 7875 + }, + { + "epoch": 3.8933382770980103, + "grad_norm": 0.13039808045811352, + "learning_rate": 3.405360155747772e-08, + "loss": 0.5175, + "step": 7876 + }, + { + "epoch": 3.89383265356569, + "grad_norm": 0.1320691482687498, + "learning_rate": 3.373327917074787e-08, + "loss": 0.5198, + "step": 7877 + }, + { + "epoch": 3.8943270300333706, + "grad_norm": 0.13394170338963818, + "learning_rate": 3.341446790928293e-08, + "loss": 0.5682, + "step": 7878 + }, + { + "epoch": 3.8948214065010505, + "grad_norm": 0.13384923760738446, + "learning_rate": 3.3097167821422025e-08, + "loss": 0.5251, + "step": 7879 + }, + { + "epoch": 3.895315782968731, + "grad_norm": 0.132945088309175, + "learning_rate": 3.278137895527444e-08, + "loss": 0.5168, + "step": 7880 + }, + { + "epoch": 3.8958101594364107, + "grad_norm": 0.13164520209730043, + "learning_rate": 3.2467101358721886e-08, + "loss": 0.5354, + "step": 7881 + }, + { + "epoch": 3.896304535904091, + "grad_norm": 0.1345826506030179, + "learning_rate": 3.215433507941734e-08, + "loss": 0.499, + "step": 7882 + }, + { + "epoch": 3.896798912371771, + "grad_norm": 0.1284487432062547, + "learning_rate": 3.184308016478288e-08, + "loss": 0.5046, + "step": 7883 + }, + { + "epoch": 3.8972932888394514, + "grad_norm": 0.13269378112881997, + "learning_rate": 3.153333666201186e-08, + "loss": 0.4863, + "step": 7884 + }, + { + "epoch": 3.8977876653071313, + "grad_norm": 0.1340175122259528, + "learning_rate": 3.1225104618068934e-08, + "loss": 0.5354, + "step": 7885 + }, + { + "epoch": 3.8982820417748116, + "grad_norm": 0.1352661283130633, + "learning_rate": 3.0918384079690053e-08, + "loss": 0.5198, + "step": 7886 + }, + { + "epoch": 3.8987764182424915, + "grad_norm": 0.13278765724566166, + "learning_rate": 3.0613175093381356e-08, + "loss": 0.5136, + "step": 7887 + }, + { + "epoch": 3.899270794710172, + "grad_norm": 0.13089316514539961, + "learning_rate": 3.0309477705420255e-08, + "loss": 0.5259, + "step": 7888 + }, + { + "epoch": 3.899765171177852, + "grad_norm": 0.13208757860968437, + "learning_rate": 3.0007291961853255e-08, + "loss": 0.5117, + "step": 7889 + }, + { + "epoch": 3.900259547645532, + "grad_norm": 0.1337104993892066, + "learning_rate": 2.970661790849927e-08, + "loss": 0.5391, + "step": 7890 + }, + { + "epoch": 3.900753924113212, + "grad_norm": 0.13890524638231064, + "learning_rate": 2.9407455590948486e-08, + "loss": 0.5535, + "step": 7891 + }, + { + "epoch": 3.9012483005808924, + "grad_norm": 0.13367661632593683, + "learning_rate": 2.910980505456129e-08, + "loss": 0.5051, + "step": 7892 + }, + { + "epoch": 3.9017426770485724, + "grad_norm": 0.13289155268327277, + "learning_rate": 2.881366634446825e-08, + "loss": 0.5474, + "step": 7893 + }, + { + "epoch": 3.9022370535162527, + "grad_norm": 0.13245734417025984, + "learning_rate": 2.851903950557011e-08, + "loss": 0.5396, + "step": 7894 + }, + { + "epoch": 3.9027314299839326, + "grad_norm": 0.13464777684192558, + "learning_rate": 2.822592458254003e-08, + "loss": 0.5275, + "step": 7895 + }, + { + "epoch": 3.903225806451613, + "grad_norm": 0.13172984689737138, + "learning_rate": 2.7934321619822457e-08, + "loss": 0.5087, + "step": 7896 + }, + { + "epoch": 3.903720182919293, + "grad_norm": 0.1340761848542783, + "learning_rate": 2.764423066162869e-08, + "loss": 0.5227, + "step": 7897 + }, + { + "epoch": 3.9042145593869733, + "grad_norm": 0.1326740605900675, + "learning_rate": 2.7355651751946876e-08, + "loss": 0.5345, + "step": 7898 + }, + { + "epoch": 3.904708935854653, + "grad_norm": 0.1316307887512598, + "learning_rate": 2.7068584934529795e-08, + "loss": 0.5337, + "step": 7899 + }, + { + "epoch": 3.9052033123223335, + "grad_norm": 0.1374268272241843, + "learning_rate": 2.6783030252904853e-08, + "loss": 0.5561, + "step": 7900 + }, + { + "epoch": 3.9056976887900134, + "grad_norm": 0.13039152602640258, + "learning_rate": 2.649898775036963e-08, + "loss": 0.4811, + "step": 7901 + }, + { + "epoch": 3.906192065257694, + "grad_norm": 0.13217377253272486, + "learning_rate": 2.6216457469989685e-08, + "loss": 0.5255, + "step": 7902 + }, + { + "epoch": 3.9066864417253737, + "grad_norm": 0.13355761122294676, + "learning_rate": 2.5935439454605195e-08, + "loss": 0.5009, + "step": 7903 + }, + { + "epoch": 3.907180818193054, + "grad_norm": 0.13125280879523077, + "learning_rate": 2.5655933746825403e-08, + "loss": 0.5252, + "step": 7904 + }, + { + "epoch": 3.907675194660734, + "grad_norm": 0.13026190580663774, + "learning_rate": 2.5377940389028632e-08, + "loss": 0.5028, + "step": 7905 + }, + { + "epoch": 3.9081695711284143, + "grad_norm": 0.13495711331853671, + "learning_rate": 2.5101459423365614e-08, + "loss": 0.5297, + "step": 7906 + }, + { + "epoch": 3.9086639475960947, + "grad_norm": 0.13059535626949134, + "learning_rate": 2.482649089175837e-08, + "loss": 0.5221, + "step": 7907 + }, + { + "epoch": 3.9091583240637746, + "grad_norm": 0.13715593283101324, + "learning_rate": 2.4553034835899102e-08, + "loss": 0.5018, + "step": 7908 + }, + { + "epoch": 3.9096527005314545, + "grad_norm": 0.13675391078069585, + "learning_rate": 2.4281091297249094e-08, + "loss": 0.5222, + "step": 7909 + }, + { + "epoch": 3.910147076999135, + "grad_norm": 0.13153880305245064, + "learning_rate": 2.4010660317042023e-08, + "loss": 0.5234, + "step": 7910 + }, + { + "epoch": 3.9106414534668152, + "grad_norm": 0.13393466860017772, + "learning_rate": 2.374174193628176e-08, + "loss": 0.5282, + "step": 7911 + }, + { + "epoch": 3.911135829934495, + "grad_norm": 0.13483974147041608, + "learning_rate": 2.3474336195742353e-08, + "loss": 0.5216, + "step": 7912 + }, + { + "epoch": 3.911630206402175, + "grad_norm": 0.1335184598100238, + "learning_rate": 2.320844313596915e-08, + "loss": 0.5279, + "step": 7913 + }, + { + "epoch": 3.9121245828698554, + "grad_norm": 0.13334989545630307, + "learning_rate": 2.2944062797277676e-08, + "loss": 0.5034, + "step": 7914 + }, + { + "epoch": 3.912618959337536, + "grad_norm": 0.1290606336171957, + "learning_rate": 2.2681195219755868e-08, + "loss": 0.5141, + "step": 7915 + }, + { + "epoch": 3.9131133358052157, + "grad_norm": 0.14145127243307365, + "learning_rate": 2.241984044325962e-08, + "loss": 0.5498, + "step": 7916 + }, + { + "epoch": 3.9136077122728956, + "grad_norm": 0.13428371316587948, + "learning_rate": 2.2159998507415015e-08, + "loss": 0.5271, + "step": 7917 + }, + { + "epoch": 3.914102088740576, + "grad_norm": 0.13032175563263942, + "learning_rate": 2.1901669451623863e-08, + "loss": 0.4909, + "step": 7918 + }, + { + "epoch": 3.9145964652082563, + "grad_norm": 0.13133564444801574, + "learning_rate": 2.1644853315051505e-08, + "loss": 0.5278, + "step": 7919 + }, + { + "epoch": 3.9150908416759362, + "grad_norm": 0.1315984390291668, + "learning_rate": 2.1389550136640126e-08, + "loss": 0.537, + "step": 7920 + }, + { + "epoch": 3.915585218143616, + "grad_norm": 0.13349640575553537, + "learning_rate": 2.1135759955097646e-08, + "loss": 0.5435, + "step": 7921 + }, + { + "epoch": 3.9160795946112965, + "grad_norm": 0.1367791313436281, + "learning_rate": 2.088348280890662e-08, + "loss": 0.5303, + "step": 7922 + }, + { + "epoch": 3.916573971078977, + "grad_norm": 0.14331209012224652, + "learning_rate": 2.0632718736317558e-08, + "loss": 0.546, + "step": 7923 + }, + { + "epoch": 3.917068347546657, + "grad_norm": 0.13346202130374968, + "learning_rate": 2.038346777535116e-08, + "loss": 0.5505, + "step": 7924 + }, + { + "epoch": 3.9175627240143367, + "grad_norm": 0.12767308646651512, + "learning_rate": 2.0135729963801642e-08, + "loss": 0.471, + "step": 7925 + }, + { + "epoch": 3.918057100482017, + "grad_norm": 0.1303662514877901, + "learning_rate": 1.9889505339232284e-08, + "loss": 0.5302, + "step": 7926 + }, + { + "epoch": 3.9185514769496974, + "grad_norm": 0.12899862800223144, + "learning_rate": 1.964479393897434e-08, + "loss": 0.521, + "step": 7927 + }, + { + "epoch": 3.9190458534173773, + "grad_norm": 0.13409663729973345, + "learning_rate": 1.9401595800133677e-08, + "loss": 0.527, + "step": 7928 + }, + { + "epoch": 3.9195402298850572, + "grad_norm": 0.13532201088114457, + "learning_rate": 1.915991095958414e-08, + "loss": 0.5152, + "step": 7929 + }, + { + "epoch": 3.9200346063527376, + "grad_norm": 0.13870067993210358, + "learning_rate": 1.8919739453971963e-08, + "loss": 0.5436, + "step": 7930 + }, + { + "epoch": 3.920528982820418, + "grad_norm": 0.12650472693174972, + "learning_rate": 1.868108131971247e-08, + "loss": 0.4853, + "step": 7931 + }, + { + "epoch": 3.921023359288098, + "grad_norm": 0.13498455885721533, + "learning_rate": 1.8443936592991152e-08, + "loss": 0.5114, + "step": 7932 + }, + { + "epoch": 3.921517735755778, + "grad_norm": 0.14195498662677145, + "learning_rate": 1.8208305309765916e-08, + "loss": 0.5629, + "step": 7933 + }, + { + "epoch": 3.922012112223458, + "grad_norm": 0.13904994070119153, + "learning_rate": 1.797418750576374e-08, + "loss": 0.537, + "step": 7934 + }, + { + "epoch": 3.9225064886911385, + "grad_norm": 0.139263064136814, + "learning_rate": 1.7741583216481783e-08, + "loss": 0.5436, + "step": 7935 + }, + { + "epoch": 3.9230008651588184, + "grad_norm": 0.1313221309776564, + "learning_rate": 1.751049247718961e-08, + "loss": 0.5247, + "step": 7936 + }, + { + "epoch": 3.9234952416264983, + "grad_norm": 0.13303353830727035, + "learning_rate": 1.7280915322925862e-08, + "loss": 0.5217, + "step": 7937 + }, + { + "epoch": 3.9239896180941787, + "grad_norm": 0.13422063360776468, + "learning_rate": 1.7052851788499357e-08, + "loss": 0.5464, + "step": 7938 + }, + { + "epoch": 3.924483994561859, + "grad_norm": 0.13858372663400761, + "learning_rate": 1.6826301908490218e-08, + "loss": 0.5194, + "step": 7939 + }, + { + "epoch": 3.924978371029539, + "grad_norm": 0.13098972836825443, + "learning_rate": 1.660126571724874e-08, + "loss": 0.5148, + "step": 7940 + }, + { + "epoch": 3.925472747497219, + "grad_norm": 0.13187513584236693, + "learning_rate": 1.6377743248897628e-08, + "loss": 0.5298, + "step": 7941 + }, + { + "epoch": 3.925967123964899, + "grad_norm": 0.13221346872356388, + "learning_rate": 1.615573453732533e-08, + "loss": 0.5175, + "step": 7942 + }, + { + "epoch": 3.9264615004325796, + "grad_norm": 0.1347412103678186, + "learning_rate": 1.593523961619492e-08, + "loss": 0.5194, + "step": 7943 + }, + { + "epoch": 3.9269558769002595, + "grad_norm": 0.13248312400459328, + "learning_rate": 1.5716258518939655e-08, + "loss": 0.5357, + "step": 7944 + }, + { + "epoch": 3.92745025336794, + "grad_norm": 0.13278263270025653, + "learning_rate": 1.5498791278760748e-08, + "loss": 0.5114, + "step": 7945 + }, + { + "epoch": 3.9279446298356198, + "grad_norm": 0.1333901172461707, + "learning_rate": 1.5282837928632944e-08, + "loss": 0.5096, + "step": 7946 + }, + { + "epoch": 3.9284390063033, + "grad_norm": 0.1323801585695426, + "learning_rate": 1.506839850129893e-08, + "loss": 0.5442, + "step": 7947 + }, + { + "epoch": 3.92893338277098, + "grad_norm": 0.1320870057440134, + "learning_rate": 1.4855473029273814e-08, + "loss": 0.5356, + "step": 7948 + }, + { + "epoch": 3.9294277592386604, + "grad_norm": 0.13532136674102807, + "learning_rate": 1.464406154484066e-08, + "loss": 0.5301, + "step": 7949 + }, + { + "epoch": 3.9299221357063403, + "grad_norm": 0.13045553576685334, + "learning_rate": 1.4434164080054935e-08, + "loss": 0.5179, + "step": 7950 + }, + { + "epoch": 3.9304165121740207, + "grad_norm": 0.13355091758522855, + "learning_rate": 1.4225780666743404e-08, + "loss": 0.4831, + "step": 7951 + }, + { + "epoch": 3.9309108886417006, + "grad_norm": 0.12877828385364548, + "learning_rate": 1.4018911336501907e-08, + "loss": 0.5255, + "step": 7952 + }, + { + "epoch": 3.931405265109381, + "grad_norm": 0.128022600884825, + "learning_rate": 1.3813556120695349e-08, + "loss": 0.5066, + "step": 7953 + }, + { + "epoch": 3.931899641577061, + "grad_norm": 0.13048030247883366, + "learning_rate": 1.3609715050461047e-08, + "loss": 0.5147, + "step": 7954 + }, + { + "epoch": 3.932394018044741, + "grad_norm": 0.1287369186347202, + "learning_rate": 1.3407388156706502e-08, + "loss": 0.5141, + "step": 7955 + }, + { + "epoch": 3.932888394512421, + "grad_norm": 0.13480746031753515, + "learning_rate": 1.3206575470110506e-08, + "loss": 0.5586, + "step": 7956 + }, + { + "epoch": 3.9333827709801015, + "grad_norm": 0.134705826022526, + "learning_rate": 1.3007277021118703e-08, + "loss": 0.5613, + "step": 7957 + }, + { + "epoch": 3.9338771474477814, + "grad_norm": 0.13549200734945324, + "learning_rate": 1.2809492839950255e-08, + "loss": 0.5259, + "step": 7958 + }, + { + "epoch": 3.9343715239154617, + "grad_norm": 0.13647815051708162, + "learning_rate": 1.2613222956595617e-08, + "loss": 0.51, + "step": 7959 + }, + { + "epoch": 3.9348659003831417, + "grad_norm": 0.13273001174773774, + "learning_rate": 1.2418467400812095e-08, + "loss": 0.5062, + "step": 7960 + }, + { + "epoch": 3.935360276850822, + "grad_norm": 0.12911472638738702, + "learning_rate": 1.2225226202129404e-08, + "loss": 0.5041, + "step": 7961 + }, + { + "epoch": 3.935854653318502, + "grad_norm": 0.13405677853372305, + "learning_rate": 1.2033499389848546e-08, + "loss": 0.5269, + "step": 7962 + }, + { + "epoch": 3.9363490297861823, + "grad_norm": 0.1262923943457907, + "learning_rate": 1.1843286993039604e-08, + "loss": 0.511, + "step": 7963 + }, + { + "epoch": 3.936843406253862, + "grad_norm": 0.1318901857376441, + "learning_rate": 1.165458904054395e-08, + "loss": 0.5073, + "step": 7964 + }, + { + "epoch": 3.9373377827215426, + "grad_norm": 0.13689543891521905, + "learning_rate": 1.1467405560970923e-08, + "loss": 0.5557, + "step": 7965 + }, + { + "epoch": 3.9378321591892225, + "grad_norm": 0.1352797257145088, + "learning_rate": 1.1281736582703374e-08, + "loss": 0.5478, + "step": 7966 + }, + { + "epoch": 3.938326535656903, + "grad_norm": 0.13534903293720954, + "learning_rate": 1.1097582133893226e-08, + "loss": 0.564, + "step": 7967 + }, + { + "epoch": 3.9388209121245827, + "grad_norm": 0.12753103976215174, + "learning_rate": 1.0914942242462589e-08, + "loss": 0.5044, + "step": 7968 + }, + { + "epoch": 3.939315288592263, + "grad_norm": 0.13280876042882928, + "learning_rate": 1.0733816936103759e-08, + "loss": 0.5408, + "step": 7969 + }, + { + "epoch": 3.939809665059943, + "grad_norm": 0.13184382550674634, + "learning_rate": 1.0554206242280319e-08, + "loss": 0.5098, + "step": 7970 + }, + { + "epoch": 3.9403040415276234, + "grad_norm": 0.1354367266601739, + "learning_rate": 1.037611018822382e-08, + "loss": 0.5333, + "step": 7971 + }, + { + "epoch": 3.9407984179953033, + "grad_norm": 0.13731670429566487, + "learning_rate": 1.0199528800940439e-08, + "loss": 0.5442, + "step": 7972 + }, + { + "epoch": 3.9412927944629836, + "grad_norm": 0.13361639208477435, + "learning_rate": 1.0024462107202094e-08, + "loss": 0.5074, + "step": 7973 + }, + { + "epoch": 3.9417871709306636, + "grad_norm": 0.13738696750948265, + "learning_rate": 9.850910133553104e-09, + "loss": 0.5624, + "step": 7974 + }, + { + "epoch": 3.942281547398344, + "grad_norm": 0.13419108839273988, + "learning_rate": 9.6788729063102e-09, + "loss": 0.5352, + "step": 7975 + }, + { + "epoch": 3.942775923866024, + "grad_norm": 0.13505902128677325, + "learning_rate": 9.508350451555847e-09, + "loss": 0.5078, + "step": 7976 + }, + { + "epoch": 3.943270300333704, + "grad_norm": 0.13565046573469108, + "learning_rate": 9.339342795147143e-09, + "loss": 0.5173, + "step": 7977 + }, + { + "epoch": 3.943764676801384, + "grad_norm": 0.13510758088346084, + "learning_rate": 9.171849962709146e-09, + "loss": 0.5239, + "step": 7978 + }, + { + "epoch": 3.9442590532690645, + "grad_norm": 0.13164923538798037, + "learning_rate": 9.005871979638203e-09, + "loss": 0.5341, + "step": 7979 + }, + { + "epoch": 3.9447534297367444, + "grad_norm": 0.130279100633445, + "learning_rate": 8.841408871098633e-09, + "loss": 0.4865, + "step": 7980 + }, + { + "epoch": 3.9452478062044247, + "grad_norm": 0.13126270481857094, + "learning_rate": 8.678460662029375e-09, + "loss": 0.5324, + "step": 7981 + }, + { + "epoch": 3.945742182672105, + "grad_norm": 0.1372107178237645, + "learning_rate": 8.517027377136222e-09, + "loss": 0.5042, + "step": 7982 + }, + { + "epoch": 3.946236559139785, + "grad_norm": 0.13099287256787523, + "learning_rate": 8.357109040897371e-09, + "loss": 0.516, + "step": 7983 + }, + { + "epoch": 3.946730935607465, + "grad_norm": 0.13133863185792943, + "learning_rate": 8.198705677557872e-09, + "loss": 0.5145, + "step": 7984 + }, + { + "epoch": 3.9472253120751453, + "grad_norm": 0.13413121034195577, + "learning_rate": 8.041817311137401e-09, + "loss": 0.5447, + "step": 7985 + }, + { + "epoch": 3.9477196885428256, + "grad_norm": 0.13576694181659166, + "learning_rate": 7.886443965423595e-09, + "loss": 0.5232, + "step": 7986 + }, + { + "epoch": 3.9482140650105055, + "grad_norm": 0.1351958382921128, + "learning_rate": 7.73258566397539e-09, + "loss": 0.5359, + "step": 7987 + }, + { + "epoch": 3.9487084414781854, + "grad_norm": 0.12689084368380435, + "learning_rate": 7.580242430119678e-09, + "loss": 0.5279, + "step": 7988 + }, + { + "epoch": 3.949202817945866, + "grad_norm": 0.1343589455386087, + "learning_rate": 7.429414286956871e-09, + "loss": 0.5536, + "step": 7989 + }, + { + "epoch": 3.949697194413546, + "grad_norm": 0.13062908058017467, + "learning_rate": 7.2801012573564535e-09, + "loss": 0.58, + "step": 7990 + }, + { + "epoch": 3.950191570881226, + "grad_norm": 0.13228653009191477, + "learning_rate": 7.1323033639558725e-09, + "loss": 0.5321, + "step": 7991 + }, + { + "epoch": 3.950685947348906, + "grad_norm": 0.13447830640744174, + "learning_rate": 6.9860206291672e-09, + "loss": 0.5384, + "step": 7992 + }, + { + "epoch": 3.9511803238165863, + "grad_norm": 0.1380654072277416, + "learning_rate": 6.8412530751682525e-09, + "loss": 0.5163, + "step": 7993 + }, + { + "epoch": 3.9516747002842667, + "grad_norm": 0.1324636643993336, + "learning_rate": 6.698000723911469e-09, + "loss": 0.5347, + "step": 7994 + }, + { + "epoch": 3.9521690767519466, + "grad_norm": 0.12606232080374574, + "learning_rate": 6.556263597115031e-09, + "loss": 0.5045, + "step": 7995 + }, + { + "epoch": 3.9526634532196265, + "grad_norm": 0.13564045759464263, + "learning_rate": 6.416041716271748e-09, + "loss": 0.5556, + "step": 7996 + }, + { + "epoch": 3.953157829687307, + "grad_norm": 0.13299664371392586, + "learning_rate": 6.27733510264128e-09, + "loss": 0.5356, + "step": 7997 + }, + { + "epoch": 3.9536522061549872, + "grad_norm": 0.13882433328916668, + "learning_rate": 6.1401437772545815e-09, + "loss": 0.56, + "step": 7998 + }, + { + "epoch": 3.954146582622667, + "grad_norm": 0.13359694994110424, + "learning_rate": 6.004467760915012e-09, + "loss": 0.5153, + "step": 7999 + }, + { + "epoch": 3.954640959090347, + "grad_norm": 0.13739346738993866, + "learning_rate": 5.870307074192782e-09, + "loss": 0.5361, + "step": 8000 + }, + { + "epoch": 3.9551353355580274, + "grad_norm": 0.1363730122458128, + "learning_rate": 5.737661737430511e-09, + "loss": 0.5451, + "step": 8001 + }, + { + "epoch": 3.955629712025708, + "grad_norm": 0.13674859102969558, + "learning_rate": 5.606531770739887e-09, + "loss": 0.5134, + "step": 8002 + }, + { + "epoch": 3.9561240884933877, + "grad_norm": 0.1334996494521766, + "learning_rate": 5.476917194003894e-09, + "loss": 0.5092, + "step": 8003 + }, + { + "epoch": 3.9566184649610676, + "grad_norm": 0.12848032824416472, + "learning_rate": 5.348818026874591e-09, + "loss": 0.5486, + "step": 8004 + }, + { + "epoch": 3.957112841428748, + "grad_norm": 0.1347058998822092, + "learning_rate": 5.222234288775329e-09, + "loss": 0.5279, + "step": 8005 + }, + { + "epoch": 3.9576072178964283, + "grad_norm": 0.13447863259660064, + "learning_rate": 5.097165998898535e-09, + "loss": 0.5412, + "step": 8006 + }, + { + "epoch": 3.9581015943641082, + "grad_norm": 0.13655351250850448, + "learning_rate": 4.973613176209035e-09, + "loss": 0.5462, + "step": 8007 + }, + { + "epoch": 3.958595970831788, + "grad_norm": 0.1330808092388406, + "learning_rate": 4.8515758394396265e-09, + "loss": 0.5265, + "step": 8008 + }, + { + "epoch": 3.9590903472994685, + "grad_norm": 0.13126506903532334, + "learning_rate": 4.731054007094393e-09, + "loss": 0.5217, + "step": 8009 + }, + { + "epoch": 3.959584723767149, + "grad_norm": 0.12887433369414267, + "learning_rate": 4.612047697445387e-09, + "loss": 0.4844, + "step": 8010 + }, + { + "epoch": 3.960079100234829, + "grad_norm": 0.12901196625001546, + "learning_rate": 4.494556928540395e-09, + "loss": 0.5306, + "step": 8011 + }, + { + "epoch": 3.9605734767025087, + "grad_norm": 0.13369766567214103, + "learning_rate": 4.378581718190722e-09, + "loss": 0.5341, + "step": 8012 + }, + { + "epoch": 3.961067853170189, + "grad_norm": 0.14088609266708577, + "learning_rate": 4.264122083982303e-09, + "loss": 0.5384, + "step": 8013 + }, + { + "epoch": 3.9615622296378694, + "grad_norm": 0.13510880527358976, + "learning_rate": 4.151178043270143e-09, + "loss": 0.4963, + "step": 8014 + }, + { + "epoch": 3.9620566061055493, + "grad_norm": 0.12976941502128925, + "learning_rate": 4.039749613178323e-09, + "loss": 0.5174, + "step": 8015 + }, + { + "epoch": 3.9625509825732292, + "grad_norm": 0.13410017840471616, + "learning_rate": 3.929836810603327e-09, + "loss": 0.5434, + "step": 8016 + }, + { + "epoch": 3.9630453590409096, + "grad_norm": 0.1409125074935302, + "learning_rate": 3.821439652209602e-09, + "loss": 0.5174, + "step": 8017 + }, + { + "epoch": 3.96353973550859, + "grad_norm": 0.13203045169544864, + "learning_rate": 3.71455815443289e-09, + "loss": 0.4861, + "step": 8018 + }, + { + "epoch": 3.96403411197627, + "grad_norm": 0.13011942424378697, + "learning_rate": 3.609192333479117e-09, + "loss": 0.5301, + "step": 8019 + }, + { + "epoch": 3.9645284884439502, + "grad_norm": 0.12981275993062782, + "learning_rate": 3.5053422053243913e-09, + "loss": 0.5439, + "step": 8020 + }, + { + "epoch": 3.96502286491163, + "grad_norm": 0.13137418778357737, + "learning_rate": 3.403007785715007e-09, + "loss": 0.517, + "step": 8021 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 0.1324430252639712, + "learning_rate": 3.30218909016633e-09, + "loss": 0.5325, + "step": 8022 + }, + { + "epoch": 3.9660116178469904, + "grad_norm": 0.13364108433704985, + "learning_rate": 3.2028861339672424e-09, + "loss": 0.5109, + "step": 8023 + }, + { + "epoch": 3.9665059943146708, + "grad_norm": 0.1303101231860329, + "learning_rate": 3.1050989321723677e-09, + "loss": 0.5279, + "step": 8024 + }, + { + "epoch": 3.9670003707823507, + "grad_norm": 0.13226255836033635, + "learning_rate": 3.008827499608735e-09, + "loss": 0.5324, + "step": 8025 + }, + { + "epoch": 3.967494747250031, + "grad_norm": 0.13423911054077087, + "learning_rate": 2.9140718508735567e-09, + "loss": 0.5365, + "step": 8026 + }, + { + "epoch": 3.967989123717711, + "grad_norm": 0.1306429629925887, + "learning_rate": 2.8208320003342284e-09, + "loss": 0.4996, + "step": 8027 + }, + { + "epoch": 3.9684835001853913, + "grad_norm": 0.13140328977124446, + "learning_rate": 2.7291079621294403e-09, + "loss": 0.5147, + "step": 8028 + }, + { + "epoch": 3.9689778766530712, + "grad_norm": 0.1331823313296943, + "learning_rate": 2.6388997501647363e-09, + "loss": 0.5332, + "step": 8029 + }, + { + "epoch": 3.9694722531207516, + "grad_norm": 0.1366256840609546, + "learning_rate": 2.5502073781180635e-09, + "loss": 0.5219, + "step": 8030 + }, + { + "epoch": 3.9699666295884315, + "grad_norm": 0.13315901141979647, + "learning_rate": 2.463030859438664e-09, + "loss": 0.5377, + "step": 8031 + }, + { + "epoch": 3.970461006056112, + "grad_norm": 0.2892341990971902, + "learning_rate": 2.377370207343743e-09, + "loss": 0.5657, + "step": 8032 + }, + { + "epoch": 3.9709553825237918, + "grad_norm": 0.13270385473616567, + "learning_rate": 2.2932254348217996e-09, + "loss": 0.5482, + "step": 8033 + }, + { + "epoch": 3.971449758991472, + "grad_norm": 0.1346997730072988, + "learning_rate": 2.2105965546315165e-09, + "loss": 0.5403, + "step": 8034 + }, + { + "epoch": 3.971944135459152, + "grad_norm": 0.13112823054463316, + "learning_rate": 2.1294835792995404e-09, + "loss": 0.5159, + "step": 8035 + }, + { + "epoch": 3.9724385119268324, + "grad_norm": 0.14004141320266922, + "learning_rate": 2.0498865211271424e-09, + "loss": 0.5206, + "step": 8036 + }, + { + "epoch": 3.9729328883945123, + "grad_norm": 0.1401930167829086, + "learning_rate": 1.9718053921813365e-09, + "loss": 0.5238, + "step": 8037 + }, + { + "epoch": 3.9734272648621927, + "grad_norm": 0.13513473546032698, + "learning_rate": 1.8952402043015404e-09, + "loss": 0.5313, + "step": 8038 + }, + { + "epoch": 3.9739216413298726, + "grad_norm": 0.1366951616018316, + "learning_rate": 1.8201909690973574e-09, + "loss": 0.5381, + "step": 8039 + }, + { + "epoch": 3.974416017797553, + "grad_norm": 0.13079366804791318, + "learning_rate": 1.7466576979474625e-09, + "loss": 0.5071, + "step": 8040 + }, + { + "epoch": 3.974910394265233, + "grad_norm": 0.1316468091129158, + "learning_rate": 1.674640402001826e-09, + "loss": 0.5223, + "step": 8041 + }, + { + "epoch": 3.975404770732913, + "grad_norm": 0.1334940165784171, + "learning_rate": 1.6041390921794907e-09, + "loss": 0.5122, + "step": 8042 + }, + { + "epoch": 3.975899147200593, + "grad_norm": 0.13360075532358281, + "learning_rate": 1.5351537791696847e-09, + "loss": 0.5096, + "step": 8043 + }, + { + "epoch": 3.9763935236682735, + "grad_norm": 0.1295988347637062, + "learning_rate": 1.467684473434039e-09, + "loss": 0.5203, + "step": 8044 + }, + { + "epoch": 3.9768879001359534, + "grad_norm": 0.13016858372425175, + "learning_rate": 1.4017311852010385e-09, + "loss": 0.5354, + "step": 8045 + }, + { + "epoch": 3.9773822766036337, + "grad_norm": 0.13382818361803558, + "learning_rate": 1.337293924470462e-09, + "loss": 0.5067, + "step": 8046 + }, + { + "epoch": 3.9778766530713137, + "grad_norm": 0.13568186690275424, + "learning_rate": 1.2743727010133821e-09, + "loss": 0.4981, + "step": 8047 + }, + { + "epoch": 3.978371029538994, + "grad_norm": 0.13146501382626646, + "learning_rate": 1.212967524371056e-09, + "loss": 0.5144, + "step": 8048 + }, + { + "epoch": 3.978865406006674, + "grad_norm": 0.13250099983412086, + "learning_rate": 1.153078403851593e-09, + "loss": 0.5064, + "step": 8049 + }, + { + "epoch": 3.9793597824743543, + "grad_norm": 0.13059160149294563, + "learning_rate": 1.094705348537728e-09, + "loss": 0.5053, + "step": 8050 + }, + { + "epoch": 3.979854158942034, + "grad_norm": 0.13347230988697728, + "learning_rate": 1.0378483672790484e-09, + "loss": 0.5449, + "step": 8051 + }, + { + "epoch": 3.9803485354097146, + "grad_norm": 0.13061995263478446, + "learning_rate": 9.825074686975467e-10, + "loss": 0.4948, + "step": 8052 + }, + { + "epoch": 3.9808429118773945, + "grad_norm": 0.13218455012717953, + "learning_rate": 9.286826611820677e-10, + "loss": 0.5312, + "step": 8053 + }, + { + "epoch": 3.981337288345075, + "grad_norm": 0.13320927479878755, + "learning_rate": 8.763739528960813e-10, + "loss": 0.5322, + "step": 8054 + }, + { + "epoch": 3.9818316648127547, + "grad_norm": 0.13525809593065294, + "learning_rate": 8.255813517699107e-10, + "loss": 0.5143, + "step": 8055 + }, + { + "epoch": 3.982326041280435, + "grad_norm": 0.137015216756061, + "learning_rate": 7.76304865505173e-10, + "loss": 0.5329, + "step": 8056 + }, + { + "epoch": 3.9828204177481155, + "grad_norm": 0.13093193515425655, + "learning_rate": 7.28544501572559e-10, + "loss": 0.5193, + "step": 8057 + }, + { + "epoch": 3.9833147942157954, + "grad_norm": 0.13434080473597462, + "learning_rate": 6.823002672151635e-10, + "loss": 0.5437, + "step": 8058 + }, + { + "epoch": 3.9838091706834753, + "grad_norm": 0.1396387947800981, + "learning_rate": 6.375721694429349e-10, + "loss": 0.5526, + "step": 8059 + }, + { + "epoch": 3.9843035471511556, + "grad_norm": 0.1327862020812984, + "learning_rate": 5.943602150393357e-10, + "loss": 0.5185, + "step": 8060 + }, + { + "epoch": 3.984797923618836, + "grad_norm": 0.13296628947596348, + "learning_rate": 5.526644105557921e-10, + "loss": 0.5292, + "step": 8061 + }, + { + "epoch": 3.985292300086516, + "grad_norm": 0.13043057677197203, + "learning_rate": 5.124847623128038e-10, + "loss": 0.524, + "step": 8062 + }, + { + "epoch": 3.985786676554196, + "grad_norm": 0.17326549721467732, + "learning_rate": 4.738212764054951e-10, + "loss": 0.5101, + "step": 8063 + }, + { + "epoch": 3.986281053021876, + "grad_norm": 0.13697549492423505, + "learning_rate": 4.366739586936231e-10, + "loss": 0.5375, + "step": 8064 + }, + { + "epoch": 3.9867754294895565, + "grad_norm": 0.12743526258219276, + "learning_rate": 4.010428148115697e-10, + "loss": 0.5184, + "step": 8065 + }, + { + "epoch": 3.9872698059572365, + "grad_norm": 0.1332114265846714, + "learning_rate": 3.6692785016056995e-10, + "loss": 0.517, + "step": 8066 + }, + { + "epoch": 3.9877641824249164, + "grad_norm": 0.13504054674056906, + "learning_rate": 3.3432906991426316e-10, + "loss": 0.534, + "step": 8067 + }, + { + "epoch": 3.9882585588925967, + "grad_norm": 0.13367059592850594, + "learning_rate": 3.0324647901425196e-10, + "loss": 0.5307, + "step": 8068 + }, + { + "epoch": 3.988752935360277, + "grad_norm": 0.1294385530483882, + "learning_rate": 2.736800821734331e-10, + "loss": 0.5225, + "step": 8069 + }, + { + "epoch": 3.989247311827957, + "grad_norm": 0.13293822678589207, + "learning_rate": 2.4562988387599737e-10, + "loss": 0.5023, + "step": 8070 + }, + { + "epoch": 3.989741688295637, + "grad_norm": 0.13011531929453676, + "learning_rate": 2.19095888374099e-10, + "loss": 0.5314, + "step": 8071 + }, + { + "epoch": 3.9902360647633173, + "grad_norm": 0.12972152238577764, + "learning_rate": 1.9407809969229642e-10, + "loss": 0.4876, + "step": 8072 + }, + { + "epoch": 3.9907304412309976, + "grad_norm": 0.13072698653460593, + "learning_rate": 1.70576521620891e-10, + "loss": 0.5078, + "step": 8073 + }, + { + "epoch": 3.9912248176986775, + "grad_norm": 0.13244626735342516, + "learning_rate": 1.4859115772591914e-10, + "loss": 0.5195, + "step": 8074 + }, + { + "epoch": 3.9917191941663575, + "grad_norm": 0.13394192811234198, + "learning_rate": 1.281220113413806e-10, + "loss": 0.5304, + "step": 8075 + }, + { + "epoch": 3.992213570634038, + "grad_norm": 0.1372674314379318, + "learning_rate": 1.0916908556812823e-10, + "loss": 0.5392, + "step": 8076 + }, + { + "epoch": 3.992707947101718, + "grad_norm": 0.1380910774341325, + "learning_rate": 9.17323832816397e-11, + "loss": 0.5336, + "step": 8077 + }, + { + "epoch": 3.993202323569398, + "grad_norm": 0.13921636293163092, + "learning_rate": 7.581190712535602e-11, + "loss": 0.5523, + "step": 8078 + }, + { + "epoch": 3.993696700037078, + "grad_norm": 0.12901837160717902, + "learning_rate": 6.140765951401229e-11, + "loss": 0.5507, + "step": 8079 + }, + { + "epoch": 3.9941910765047584, + "grad_norm": 0.1358412815792773, + "learning_rate": 4.851964263030695e-11, + "loss": 0.5066, + "step": 8080 + }, + { + "epoch": 3.9946854529724387, + "grad_norm": 0.12937036594582588, + "learning_rate": 3.7147858429342766e-11, + "loss": 0.4962, + "step": 8081 + }, + { + "epoch": 3.9951798294401186, + "grad_norm": 0.13062866140425863, + "learning_rate": 2.7292308635296083e-11, + "loss": 0.5216, + "step": 8082 + }, + { + "epoch": 3.9956742059077985, + "grad_norm": 0.1314212637902976, + "learning_rate": 1.8952994742527097e-11, + "loss": 0.4967, + "step": 8083 + }, + { + "epoch": 3.996168582375479, + "grad_norm": 0.13338264222471402, + "learning_rate": 1.2129918014469611e-11, + "loss": 0.5184, + "step": 8084 + }, + { + "epoch": 3.9966629588431593, + "grad_norm": 0.13429010866569396, + "learning_rate": 6.823079486961703e-12, + "loss": 0.5145, + "step": 8085 + }, + { + "epoch": 3.997157335310839, + "grad_norm": 0.13104847193224492, + "learning_rate": 3.0324799638048463e-12, + "loss": 0.5261, + "step": 8086 + }, + { + "epoch": 3.997651711778519, + "grad_norm": 0.1325092516841485, + "learning_rate": 7.581200200945659e-13, + "loss": 0.5143, + "step": 8087 + }, + { + "epoch": 3.9981460882461994, + "grad_norm": 0.1308755366568664, + "learning_rate": 0.0, + "loss": 0.498, + "step": 8088 + } + ], + "logging_steps": 1, + "max_steps": 8088, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 2022, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1719303325876224.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}