{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.49362402303578773, "eval_steps": 375, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00032908268202385847, "grad_norm": 46.0558967590332, "learning_rate": 2e-05, "loss": 4.5639, "step": 1 }, { "epoch": 0.0006581653640477169, "grad_norm": 116.75971221923828, "learning_rate": 4e-05, "loss": 4.7399, "step": 2 }, { "epoch": 0.0009872480460715754, "grad_norm": 51.51197814941406, "learning_rate": 6e-05, "loss": 4.8224, "step": 3 }, { "epoch": 0.0013163307280954339, "grad_norm": 78.98606872558594, "learning_rate": 8e-05, "loss": 5.1893, "step": 4 }, { "epoch": 0.0016454134101192926, "grad_norm": 48.04075622558594, "learning_rate": 0.0001, "loss": 4.4059, "step": 5 }, { "epoch": 0.001974496092143151, "grad_norm": 38.80451202392578, "learning_rate": 0.00012, "loss": 4.0177, "step": 6 }, { "epoch": 0.0023035787741670093, "grad_norm": 39.55317687988281, "learning_rate": 0.00014, "loss": 4.291, "step": 7 }, { "epoch": 0.0026326614561908677, "grad_norm": 40.044979095458984, "learning_rate": 0.00016, "loss": 3.793, "step": 8 }, { "epoch": 0.0029617441382147267, "grad_norm": 58.86537170410156, "learning_rate": 0.00018, "loss": 4.1226, "step": 9 }, { "epoch": 0.003290826820238585, "grad_norm": 24.430255889892578, "learning_rate": 0.0002, "loss": 3.6641, "step": 10 }, { "epoch": 0.0036199095022624436, "grad_norm": 54.99418640136719, "learning_rate": 0.00019999977772170748, "loss": 3.8054, "step": 11 }, { "epoch": 0.003948992184286302, "grad_norm": 76.18338775634766, "learning_rate": 0.00019999911088781805, "loss": 3.4282, "step": 12 }, { "epoch": 0.0042780748663101605, "grad_norm": 38.46460723876953, "learning_rate": 0.0001999979995012962, "loss": 3.3058, "step": 13 }, { "epoch": 0.0046071575483340186, "grad_norm": 44.550357818603516, "learning_rate": 0.00019999644356708261, "loss": 3.4775, "step": 14 }, { "epoch": 0.0049362402303578775, "grad_norm": 34.525020599365234, "learning_rate": 0.00019999444309209432, "loss": 3.3947, "step": 15 }, { "epoch": 0.0052653229123817355, "grad_norm": 57.00188446044922, "learning_rate": 0.0001999919980852246, "loss": 3.3006, "step": 16 }, { "epoch": 0.005594405594405594, "grad_norm": 117.598876953125, "learning_rate": 0.00019998910855734288, "loss": 3.2207, "step": 17 }, { "epoch": 0.005923488276429453, "grad_norm": 37.49771499633789, "learning_rate": 0.0001999857745212947, "loss": 3.1492, "step": 18 }, { "epoch": 0.006252570958453311, "grad_norm": 30.6367244720459, "learning_rate": 0.00019998199599190178, "loss": 3.2436, "step": 19 }, { "epoch": 0.00658165364047717, "grad_norm": 39.12260055541992, "learning_rate": 0.0001999777729859618, "loss": 3.2412, "step": 20 }, { "epoch": 0.006910736322501028, "grad_norm": 172.7960968017578, "learning_rate": 0.00019997310552224846, "loss": 3.1229, "step": 21 }, { "epoch": 0.007239819004524887, "grad_norm": 39.700347900390625, "learning_rate": 0.00019996799362151122, "loss": 3.1227, "step": 22 }, { "epoch": 0.007568901686548745, "grad_norm": 72.12504577636719, "learning_rate": 0.00019996243730647538, "loss": 3.23, "step": 23 }, { "epoch": 0.007897984368572603, "grad_norm": 35.2486457824707, "learning_rate": 0.00019995643660184191, "loss": 3.1196, "step": 24 }, { "epoch": 0.008227067050596462, "grad_norm": 38.94593048095703, "learning_rate": 0.00019994999153428737, "loss": 3.1875, "step": 25 }, { "epoch": 0.008556149732620321, "grad_norm": 32.85285568237305, "learning_rate": 0.00019994310213246368, "loss": 3.0243, "step": 26 }, { "epoch": 0.00888523241464418, "grad_norm": 36.16541290283203, "learning_rate": 0.00019993576842699816, "loss": 2.9224, "step": 27 }, { "epoch": 0.009214315096668037, "grad_norm": 37.95417785644531, "learning_rate": 0.0001999279904504933, "loss": 3.1117, "step": 28 }, { "epoch": 0.009543397778691896, "grad_norm": 30.928470611572266, "learning_rate": 0.00019991976823752653, "loss": 3.0161, "step": 29 }, { "epoch": 0.009872480460715755, "grad_norm": 30.129854202270508, "learning_rate": 0.00019991110182465032, "loss": 2.9128, "step": 30 }, { "epoch": 0.010201563142739614, "grad_norm": 32.489471435546875, "learning_rate": 0.00019990199125039174, "loss": 2.8793, "step": 31 }, { "epoch": 0.010530645824763471, "grad_norm": 40.40331268310547, "learning_rate": 0.00019989243655525247, "loss": 3.0345, "step": 32 }, { "epoch": 0.01085972850678733, "grad_norm": 36.23051834106445, "learning_rate": 0.00019988243778170853, "loss": 2.9974, "step": 33 }, { "epoch": 0.011188811188811189, "grad_norm": 46.42790603637695, "learning_rate": 0.0001998719949742101, "loss": 3.0721, "step": 34 }, { "epoch": 0.011517893870835048, "grad_norm": 41.625057220458984, "learning_rate": 0.0001998611081791814, "loss": 2.9996, "step": 35 }, { "epoch": 0.011846976552858907, "grad_norm": 45.90873718261719, "learning_rate": 0.00019984977744502038, "loss": 2.9567, "step": 36 }, { "epoch": 0.012176059234882764, "grad_norm": 184.02110290527344, "learning_rate": 0.00019983800282209857, "loss": 3.202, "step": 37 }, { "epoch": 0.012505141916906623, "grad_norm": 39.56529998779297, "learning_rate": 0.00019982578436276082, "loss": 3.1411, "step": 38 }, { "epoch": 0.012834224598930482, "grad_norm": 146.53335571289062, "learning_rate": 0.00019981312212132512, "loss": 3.1131, "step": 39 }, { "epoch": 0.01316330728095434, "grad_norm": 150.16696166992188, "learning_rate": 0.00019980001615408228, "loss": 3.1807, "step": 40 }, { "epoch": 0.013492389962978198, "grad_norm": 55.29084014892578, "learning_rate": 0.00019978646651929572, "loss": 3.0728, "step": 41 }, { "epoch": 0.013821472645002057, "grad_norm": 67.03924560546875, "learning_rate": 0.00019977247327720128, "loss": 3.1384, "step": 42 }, { "epoch": 0.014150555327025915, "grad_norm": 52.389244079589844, "learning_rate": 0.0001997580364900068, "loss": 3.0188, "step": 43 }, { "epoch": 0.014479638009049774, "grad_norm": 52.6592903137207, "learning_rate": 0.000199743156221892, "loss": 3.2027, "step": 44 }, { "epoch": 0.014808720691073632, "grad_norm": 60.01515197753906, "learning_rate": 0.00019972783253900808, "loss": 3.3363, "step": 45 }, { "epoch": 0.01513780337309749, "grad_norm": 63.57032012939453, "learning_rate": 0.00019971206550947748, "loss": 3.4156, "step": 46 }, { "epoch": 0.01546688605512135, "grad_norm": 95.09534454345703, "learning_rate": 0.00019969585520339354, "loss": 3.019, "step": 47 }, { "epoch": 0.015795968737145206, "grad_norm": 75.03887939453125, "learning_rate": 0.0001996792016928203, "loss": 3.6353, "step": 48 }, { "epoch": 0.016125051419169065, "grad_norm": 208.92349243164062, "learning_rate": 0.00019966210505179197, "loss": 3.8851, "step": 49 }, { "epoch": 0.016454134101192924, "grad_norm": 96.15306091308594, "learning_rate": 0.00019964456535631286, "loss": 3.951, "step": 50 }, { "epoch": 0.016783216783216783, "grad_norm": 205.12437438964844, "learning_rate": 0.0001996265826843568, "loss": 2.9162, "step": 51 }, { "epoch": 0.017112299465240642, "grad_norm": 65.38352966308594, "learning_rate": 0.00019960815711586696, "loss": 2.8065, "step": 52 }, { "epoch": 0.0174413821472645, "grad_norm": 59.9429931640625, "learning_rate": 0.00019958928873275539, "loss": 2.646, "step": 53 }, { "epoch": 0.01777046482928836, "grad_norm": 40.976078033447266, "learning_rate": 0.00019956997761890277, "loss": 2.622, "step": 54 }, { "epoch": 0.01809954751131222, "grad_norm": 47.47517776489258, "learning_rate": 0.00019955022386015792, "loss": 2.799, "step": 55 }, { "epoch": 0.018428630193336074, "grad_norm": 47.42388153076172, "learning_rate": 0.00019953002754433743, "loss": 2.6488, "step": 56 }, { "epoch": 0.018757712875359933, "grad_norm": 25.498687744140625, "learning_rate": 0.00019950938876122542, "loss": 2.4878, "step": 57 }, { "epoch": 0.019086795557383792, "grad_norm": 23.83648681640625, "learning_rate": 0.00019948830760257291, "loss": 2.6812, "step": 58 }, { "epoch": 0.01941587823940765, "grad_norm": 37.21333694458008, "learning_rate": 0.0001994667841620976, "loss": 2.6438, "step": 59 }, { "epoch": 0.01974496092143151, "grad_norm": 56.65325927734375, "learning_rate": 0.00019944481853548335, "loss": 2.7186, "step": 60 }, { "epoch": 0.02007404360345537, "grad_norm": 25.59310531616211, "learning_rate": 0.00019942241082037982, "loss": 2.6601, "step": 61 }, { "epoch": 0.020403126285479228, "grad_norm": 36.67039108276367, "learning_rate": 0.00019939956111640197, "loss": 2.4964, "step": 62 }, { "epoch": 0.020732208967503087, "grad_norm": 25.524160385131836, "learning_rate": 0.00019937626952512964, "loss": 2.6319, "step": 63 }, { "epoch": 0.021061291649526942, "grad_norm": 103.86463165283203, "learning_rate": 0.0001993525361501072, "loss": 2.6137, "step": 64 }, { "epoch": 0.0213903743315508, "grad_norm": 28.601654052734375, "learning_rate": 0.00019932836109684286, "loss": 2.6402, "step": 65 }, { "epoch": 0.02171945701357466, "grad_norm": 45.02948760986328, "learning_rate": 0.00019930374447280845, "loss": 2.5556, "step": 66 }, { "epoch": 0.02204853969559852, "grad_norm": 48.84526062011719, "learning_rate": 0.00019927868638743875, "loss": 2.6032, "step": 67 }, { "epoch": 0.022377622377622378, "grad_norm": 29.451082229614258, "learning_rate": 0.0001992531869521312, "loss": 2.6241, "step": 68 }, { "epoch": 0.022706705059646237, "grad_norm": 66.02137756347656, "learning_rate": 0.00019922724628024515, "loss": 2.6018, "step": 69 }, { "epoch": 0.023035787741670095, "grad_norm": 30.177003860473633, "learning_rate": 0.0001992008644871016, "loss": 2.6833, "step": 70 }, { "epoch": 0.023364870423693954, "grad_norm": 57.67784118652344, "learning_rate": 0.00019917404168998256, "loss": 2.4953, "step": 71 }, { "epoch": 0.023693953105717813, "grad_norm": 29.414043426513672, "learning_rate": 0.0001991467780081305, "loss": 2.4697, "step": 72 }, { "epoch": 0.02402303578774167, "grad_norm": 30.574806213378906, "learning_rate": 0.00019911907356274795, "loss": 2.524, "step": 73 }, { "epoch": 0.024352118469765528, "grad_norm": 44.30821228027344, "learning_rate": 0.00019909092847699683, "loss": 2.5585, "step": 74 }, { "epoch": 0.024681201151789386, "grad_norm": 34.140777587890625, "learning_rate": 0.00019906234287599798, "loss": 2.4821, "step": 75 }, { "epoch": 0.025010283833813245, "grad_norm": 39.644832611083984, "learning_rate": 0.00019903331688683057, "loss": 2.7098, "step": 76 }, { "epoch": 0.025339366515837104, "grad_norm": 73.76107025146484, "learning_rate": 0.00019900385063853154, "loss": 2.6193, "step": 77 }, { "epoch": 0.025668449197860963, "grad_norm": 34.471954345703125, "learning_rate": 0.00019897394426209505, "loss": 2.3583, "step": 78 }, { "epoch": 0.025997531879884822, "grad_norm": 44.11329650878906, "learning_rate": 0.00019894359789047187, "loss": 2.5031, "step": 79 }, { "epoch": 0.02632661456190868, "grad_norm": 38.81841278076172, "learning_rate": 0.00019891281165856873, "loss": 2.7198, "step": 80 }, { "epoch": 0.026655697243932536, "grad_norm": 35.21909713745117, "learning_rate": 0.00019888158570324795, "loss": 2.5912, "step": 81 }, { "epoch": 0.026984779925956395, "grad_norm": 47.79863357543945, "learning_rate": 0.0001988499201633265, "loss": 2.562, "step": 82 }, { "epoch": 0.027313862607980254, "grad_norm": 50.25193786621094, "learning_rate": 0.00019881781517957562, "loss": 2.7047, "step": 83 }, { "epoch": 0.027642945290004113, "grad_norm": 36.6878776550293, "learning_rate": 0.0001987852708947202, "loss": 2.4972, "step": 84 }, { "epoch": 0.027972027972027972, "grad_norm": 42.61648941040039, "learning_rate": 0.00019875228745343794, "loss": 2.5156, "step": 85 }, { "epoch": 0.02830111065405183, "grad_norm": 81.98995208740234, "learning_rate": 0.0001987188650023589, "loss": 2.569, "step": 86 }, { "epoch": 0.02863019333607569, "grad_norm": 46.91239929199219, "learning_rate": 0.0001986850036900648, "loss": 2.7152, "step": 87 }, { "epoch": 0.02895927601809955, "grad_norm": 52.46195602416992, "learning_rate": 0.00019865070366708836, "loss": 2.7093, "step": 88 }, { "epoch": 0.029288358700123408, "grad_norm": 41.07624435424805, "learning_rate": 0.00019861596508591255, "loss": 2.7295, "step": 89 }, { "epoch": 0.029617441382147263, "grad_norm": 55.525753021240234, "learning_rate": 0.00019858078810097002, "loss": 2.6919, "step": 90 }, { "epoch": 0.029946524064171122, "grad_norm": 50.650325775146484, "learning_rate": 0.00019854517286864245, "loss": 2.7554, "step": 91 }, { "epoch": 0.03027560674619498, "grad_norm": 53.117122650146484, "learning_rate": 0.0001985091195472596, "loss": 2.7014, "step": 92 }, { "epoch": 0.03060468942821884, "grad_norm": 50.88463592529297, "learning_rate": 0.0001984726282970989, "loss": 2.9423, "step": 93 }, { "epoch": 0.0309337721102427, "grad_norm": 56.69411849975586, "learning_rate": 0.0001984356992803847, "loss": 2.9391, "step": 94 }, { "epoch": 0.03126285479226656, "grad_norm": 49.7108268737793, "learning_rate": 0.00019839833266128724, "loss": 2.806, "step": 95 }, { "epoch": 0.03159193747429041, "grad_norm": 55.716922760009766, "learning_rate": 0.00019836052860592237, "loss": 2.7375, "step": 96 }, { "epoch": 0.031921020156314275, "grad_norm": 63.348724365234375, "learning_rate": 0.0001983222872823505, "loss": 2.8574, "step": 97 }, { "epoch": 0.03225010283833813, "grad_norm": 61.3756103515625, "learning_rate": 0.00019828360886057594, "loss": 3.3003, "step": 98 }, { "epoch": 0.03257918552036199, "grad_norm": 102.8317642211914, "learning_rate": 0.00019824449351254616, "loss": 2.9397, "step": 99 }, { "epoch": 0.03290826820238585, "grad_norm": 85.07316589355469, "learning_rate": 0.00019820494141215104, "loss": 3.2812, "step": 100 }, { "epoch": 0.03323735088440971, "grad_norm": 44.999996185302734, "learning_rate": 0.000198164952735222, "loss": 2.4481, "step": 101 }, { "epoch": 0.033566433566433566, "grad_norm": 35.384925842285156, "learning_rate": 0.00019812452765953135, "loss": 2.2091, "step": 102 }, { "epoch": 0.03389551624845742, "grad_norm": 29.201196670532227, "learning_rate": 0.00019808366636479147, "loss": 2.4956, "step": 103 }, { "epoch": 0.034224598930481284, "grad_norm": 18.83892822265625, "learning_rate": 0.00019804236903265388, "loss": 2.3206, "step": 104 }, { "epoch": 0.03455368161250514, "grad_norm": 17.895370483398438, "learning_rate": 0.00019800063584670863, "loss": 2.3439, "step": 105 }, { "epoch": 0.034882764294529, "grad_norm": 17.173036575317383, "learning_rate": 0.00019795846699248332, "loss": 2.2369, "step": 106 }, { "epoch": 0.03521184697655286, "grad_norm": 20.797094345092773, "learning_rate": 0.00019791586265744237, "loss": 2.2587, "step": 107 }, { "epoch": 0.03554092965857672, "grad_norm": 20.905153274536133, "learning_rate": 0.00019787282303098617, "loss": 2.3331, "step": 108 }, { "epoch": 0.035870012340600575, "grad_norm": 24.041580200195312, "learning_rate": 0.0001978293483044502, "loss": 2.3722, "step": 109 }, { "epoch": 0.03619909502262444, "grad_norm": 21.71147918701172, "learning_rate": 0.00019778543867110426, "loss": 2.381, "step": 110 }, { "epoch": 0.03652817770464829, "grad_norm": 19.62755012512207, "learning_rate": 0.00019774109432615147, "loss": 2.2829, "step": 111 }, { "epoch": 0.03685726038667215, "grad_norm": 19.470176696777344, "learning_rate": 0.00019769631546672756, "loss": 2.3016, "step": 112 }, { "epoch": 0.03718634306869601, "grad_norm": 25.57579803466797, "learning_rate": 0.00019765110229189988, "loss": 2.2059, "step": 113 }, { "epoch": 0.037515425750719866, "grad_norm": 26.536754608154297, "learning_rate": 0.00019760545500266657, "loss": 2.3904, "step": 114 }, { "epoch": 0.03784450843274373, "grad_norm": 21.48318099975586, "learning_rate": 0.00019755937380195568, "loss": 2.3908, "step": 115 }, { "epoch": 0.038173591114767584, "grad_norm": 33.38320541381836, "learning_rate": 0.00019751285889462423, "loss": 2.4642, "step": 116 }, { "epoch": 0.038502673796791446, "grad_norm": 24.473419189453125, "learning_rate": 0.0001974659104874573, "loss": 2.3328, "step": 117 }, { "epoch": 0.0388317564788153, "grad_norm": 23.20199203491211, "learning_rate": 0.0001974185287891671, "loss": 2.2691, "step": 118 }, { "epoch": 0.039160839160839164, "grad_norm": 27.398014068603516, "learning_rate": 0.0001973707140103921, "loss": 2.3794, "step": 119 }, { "epoch": 0.03948992184286302, "grad_norm": 30.54142189025879, "learning_rate": 0.00019732246636369605, "loss": 2.3396, "step": 120 }, { "epoch": 0.039819004524886875, "grad_norm": 28.70248031616211, "learning_rate": 0.00019727378606356703, "loss": 2.3872, "step": 121 }, { "epoch": 0.04014808720691074, "grad_norm": 25.75336456298828, "learning_rate": 0.00019722467332641656, "loss": 2.2814, "step": 122 }, { "epoch": 0.04047716988893459, "grad_norm": 29.759145736694336, "learning_rate": 0.00019717512837057855, "loss": 2.3478, "step": 123 }, { "epoch": 0.040806252570958455, "grad_norm": 26.37862777709961, "learning_rate": 0.0001971251514163083, "loss": 2.3774, "step": 124 }, { "epoch": 0.04113533525298231, "grad_norm": 28.256498336791992, "learning_rate": 0.0001970747426857817, "loss": 2.2388, "step": 125 }, { "epoch": 0.04146441793500617, "grad_norm": 33.0186767578125, "learning_rate": 0.00019702390240309404, "loss": 2.3465, "step": 126 }, { "epoch": 0.04179350061703003, "grad_norm": 34.24951934814453, "learning_rate": 0.0001969726307942592, "loss": 2.4891, "step": 127 }, { "epoch": 0.042122583299053884, "grad_norm": 31.087078094482422, "learning_rate": 0.00019692092808720846, "loss": 2.4853, "step": 128 }, { "epoch": 0.042451665981077746, "grad_norm": 35.06305694580078, "learning_rate": 0.0001968687945117896, "loss": 2.3925, "step": 129 }, { "epoch": 0.0427807486631016, "grad_norm": 29.33173370361328, "learning_rate": 0.00019681623029976588, "loss": 2.4487, "step": 130 }, { "epoch": 0.043109831345125464, "grad_norm": 31.75111198425293, "learning_rate": 0.00019676323568481498, "loss": 2.3077, "step": 131 }, { "epoch": 0.04343891402714932, "grad_norm": 27.908418655395508, "learning_rate": 0.00019670981090252792, "loss": 2.4815, "step": 132 }, { "epoch": 0.04376799670917318, "grad_norm": 32.187171936035156, "learning_rate": 0.00019665595619040808, "loss": 2.4108, "step": 133 }, { "epoch": 0.04409707939119704, "grad_norm": 35.1542854309082, "learning_rate": 0.0001966016717878702, "loss": 2.441, "step": 134 }, { "epoch": 0.0444261620732209, "grad_norm": 33.850250244140625, "learning_rate": 0.00019654695793623907, "loss": 2.5023, "step": 135 }, { "epoch": 0.044755244755244755, "grad_norm": 38.28538513183594, "learning_rate": 0.0001964918148787488, "loss": 2.6209, "step": 136 }, { "epoch": 0.04508432743726861, "grad_norm": 45.59490203857422, "learning_rate": 0.00019643624286054144, "loss": 2.563, "step": 137 }, { "epoch": 0.04541341011929247, "grad_norm": 41.408103942871094, "learning_rate": 0.00019638024212866606, "loss": 2.5537, "step": 138 }, { "epoch": 0.04574249280131633, "grad_norm": 35.567420959472656, "learning_rate": 0.0001963238129320776, "loss": 2.5059, "step": 139 }, { "epoch": 0.04607157548334019, "grad_norm": 51.440792083740234, "learning_rate": 0.00019626695552163578, "loss": 2.5288, "step": 140 }, { "epoch": 0.046400658165364046, "grad_norm": 55.620033264160156, "learning_rate": 0.00019620967015010395, "loss": 2.7742, "step": 141 }, { "epoch": 0.04672974084738791, "grad_norm": 47.921180725097656, "learning_rate": 0.00019615195707214803, "loss": 2.545, "step": 142 }, { "epoch": 0.047058823529411764, "grad_norm": 52.1619758605957, "learning_rate": 0.0001960938165443353, "loss": 2.4808, "step": 143 }, { "epoch": 0.047387906211435626, "grad_norm": 52.686729431152344, "learning_rate": 0.00019603524882513327, "loss": 2.5127, "step": 144 }, { "epoch": 0.04771698889345948, "grad_norm": 48.75902557373047, "learning_rate": 0.0001959762541749086, "loss": 2.5492, "step": 145 }, { "epoch": 0.04804607157548334, "grad_norm": 57.62579345703125, "learning_rate": 0.00019591683285592593, "loss": 2.5136, "step": 146 }, { "epoch": 0.0483751542575072, "grad_norm": 66.0849380493164, "learning_rate": 0.00019585698513234663, "loss": 2.9436, "step": 147 }, { "epoch": 0.048704236939531055, "grad_norm": 65.15868377685547, "learning_rate": 0.0001957967112702277, "loss": 2.9614, "step": 148 }, { "epoch": 0.04903331962155492, "grad_norm": 61.37369155883789, "learning_rate": 0.00019573601153752052, "loss": 3.0038, "step": 149 }, { "epoch": 0.04936240230357877, "grad_norm": 132.2886505126953, "learning_rate": 0.00019567488620406983, "loss": 3.1973, "step": 150 }, { "epoch": 0.049691484985602635, "grad_norm": 35.6921272277832, "learning_rate": 0.00019561333554161224, "loss": 2.1981, "step": 151 }, { "epoch": 0.05002056766762649, "grad_norm": 32.4654541015625, "learning_rate": 0.0001955513598237753, "loss": 2.197, "step": 152 }, { "epoch": 0.05034965034965035, "grad_norm": 25.712648391723633, "learning_rate": 0.00019548895932607621, "loss": 2.338, "step": 153 }, { "epoch": 0.05067873303167421, "grad_norm": 19.411991119384766, "learning_rate": 0.00019542613432592038, "loss": 2.2655, "step": 154 }, { "epoch": 0.051007815713698064, "grad_norm": 13.076166152954102, "learning_rate": 0.00019536288510260056, "loss": 1.9767, "step": 155 }, { "epoch": 0.051336898395721926, "grad_norm": 15.647604942321777, "learning_rate": 0.00019529921193729534, "loss": 2.2871, "step": 156 }, { "epoch": 0.05166598107774578, "grad_norm": 15.999380111694336, "learning_rate": 0.00019523511511306793, "loss": 2.4586, "step": 157 }, { "epoch": 0.051995063759769644, "grad_norm": 16.724533081054688, "learning_rate": 0.000195170594914865, "loss": 2.1605, "step": 158 }, { "epoch": 0.0523241464417935, "grad_norm": 17.9171142578125, "learning_rate": 0.00019510565162951537, "loss": 2.2012, "step": 159 }, { "epoch": 0.05265322912381736, "grad_norm": 18.63538360595703, "learning_rate": 0.00019504028554572864, "loss": 2.2715, "step": 160 }, { "epoch": 0.05298231180584122, "grad_norm": 17.931528091430664, "learning_rate": 0.00019497449695409408, "loss": 2.2195, "step": 161 }, { "epoch": 0.05331139448786507, "grad_norm": 18.84501075744629, "learning_rate": 0.00019490828614707916, "loss": 2.2326, "step": 162 }, { "epoch": 0.053640477169888935, "grad_norm": 18.82663345336914, "learning_rate": 0.00019484165341902845, "loss": 2.3262, "step": 163 }, { "epoch": 0.05396955985191279, "grad_norm": 20.771976470947266, "learning_rate": 0.00019477459906616206, "loss": 2.3659, "step": 164 }, { "epoch": 0.05429864253393665, "grad_norm": 19.50181007385254, "learning_rate": 0.00019470712338657458, "loss": 2.2192, "step": 165 }, { "epoch": 0.05462772521596051, "grad_norm": 18.81004524230957, "learning_rate": 0.0001946392266802336, "loss": 2.1428, "step": 166 }, { "epoch": 0.05495680789798437, "grad_norm": 21.053260803222656, "learning_rate": 0.0001945709092489783, "loss": 2.2203, "step": 167 }, { "epoch": 0.055285890580008226, "grad_norm": 25.28620147705078, "learning_rate": 0.00019450217139651844, "loss": 2.1879, "step": 168 }, { "epoch": 0.05561497326203209, "grad_norm": 22.09459686279297, "learning_rate": 0.0001944330134284326, "loss": 2.1769, "step": 169 }, { "epoch": 0.055944055944055944, "grad_norm": 24.473697662353516, "learning_rate": 0.00019436343565216711, "loss": 2.415, "step": 170 }, { "epoch": 0.0562731386260798, "grad_norm": 25.860061645507812, "learning_rate": 0.00019429343837703455, "loss": 2.299, "step": 171 }, { "epoch": 0.05660222130810366, "grad_norm": 25.009765625, "learning_rate": 0.0001942230219142124, "loss": 2.1542, "step": 172 }, { "epoch": 0.05693130399012752, "grad_norm": 28.018394470214844, "learning_rate": 0.0001941521865767417, "loss": 2.4432, "step": 173 }, { "epoch": 0.05726038667215138, "grad_norm": 31.617511749267578, "learning_rate": 0.0001940809326795256, "loss": 2.3726, "step": 174 }, { "epoch": 0.057589469354175235, "grad_norm": 26.330232620239258, "learning_rate": 0.000194009260539328, "loss": 2.2941, "step": 175 }, { "epoch": 0.0579185520361991, "grad_norm": 28.39286994934082, "learning_rate": 0.0001939371704747721, "loss": 2.368, "step": 176 }, { "epoch": 0.05824763471822295, "grad_norm": 29.393531799316406, "learning_rate": 0.00019386466280633906, "loss": 2.3252, "step": 177 }, { "epoch": 0.058576717400246815, "grad_norm": 27.360153198242188, "learning_rate": 0.00019379173785636646, "loss": 2.1943, "step": 178 }, { "epoch": 0.05890580008227067, "grad_norm": 29.285520553588867, "learning_rate": 0.000193718395949047, "loss": 2.4499, "step": 179 }, { "epoch": 0.059234882764294526, "grad_norm": 29.243824005126953, "learning_rate": 0.00019364463741042694, "loss": 2.3499, "step": 180 }, { "epoch": 0.05956396544631839, "grad_norm": 32.861839294433594, "learning_rate": 0.00019357046256840473, "loss": 2.3157, "step": 181 }, { "epoch": 0.059893048128342244, "grad_norm": 40.80834197998047, "learning_rate": 0.00019349587175272948, "loss": 2.2473, "step": 182 }, { "epoch": 0.060222130810366106, "grad_norm": 33.760169982910156, "learning_rate": 0.0001934208652949996, "loss": 2.4274, "step": 183 }, { "epoch": 0.06055121349238996, "grad_norm": 34.87383270263672, "learning_rate": 0.00019334544352866127, "loss": 2.5144, "step": 184 }, { "epoch": 0.060880296174413824, "grad_norm": 36.6989631652832, "learning_rate": 0.00019326960678900688, "loss": 2.2844, "step": 185 }, { "epoch": 0.06120937885643768, "grad_norm": 36.20161056518555, "learning_rate": 0.00019319335541317361, "loss": 2.5463, "step": 186 }, { "epoch": 0.06153846153846154, "grad_norm": 37.2934455871582, "learning_rate": 0.00019311668974014208, "loss": 2.3696, "step": 187 }, { "epoch": 0.0618675442204854, "grad_norm": 39.52557373046875, "learning_rate": 0.00019303961011073447, "loss": 2.4879, "step": 188 }, { "epoch": 0.06219662690250925, "grad_norm": 38.334991455078125, "learning_rate": 0.00019296211686761346, "loss": 2.5505, "step": 189 }, { "epoch": 0.06252570958453312, "grad_norm": 43.470672607421875, "learning_rate": 0.00019288421035528028, "loss": 2.4267, "step": 190 }, { "epoch": 0.06285479226655698, "grad_norm": 38.25648498535156, "learning_rate": 0.00019280589092007352, "loss": 2.4197, "step": 191 }, { "epoch": 0.06318387494858083, "grad_norm": 45.45105743408203, "learning_rate": 0.00019272715891016735, "loss": 2.5399, "step": 192 }, { "epoch": 0.06351295763060469, "grad_norm": 46.69264221191406, "learning_rate": 0.00019264801467557007, "loss": 2.5085, "step": 193 }, { "epoch": 0.06384204031262855, "grad_norm": 51.463844299316406, "learning_rate": 0.00019256845856812266, "loss": 2.602, "step": 194 }, { "epoch": 0.06417112299465241, "grad_norm": 53.442283630371094, "learning_rate": 0.000192488490941497, "loss": 2.6754, "step": 195 }, { "epoch": 0.06450020567667626, "grad_norm": 64.25224304199219, "learning_rate": 0.00019240811215119448, "loss": 2.766, "step": 196 }, { "epoch": 0.06482928835870012, "grad_norm": 46.239723205566406, "learning_rate": 0.00019232732255454422, "loss": 2.4271, "step": 197 }, { "epoch": 0.06515837104072399, "grad_norm": 70.70040130615234, "learning_rate": 0.00019224612251070175, "loss": 2.6559, "step": 198 }, { "epoch": 0.06548745372274783, "grad_norm": 69.72952270507812, "learning_rate": 0.0001921645123806472, "loss": 2.8281, "step": 199 }, { "epoch": 0.0658165364047717, "grad_norm": 81.83056640625, "learning_rate": 0.0001920824925271838, "loss": 3.1637, "step": 200 }, { "epoch": 0.06614561908679556, "grad_norm": 26.758020401000977, "learning_rate": 0.0001920000633149362, "loss": 2.2422, "step": 201 }, { "epoch": 0.06647470176881942, "grad_norm": 24.241539001464844, "learning_rate": 0.00019191722511034884, "loss": 2.2236, "step": 202 }, { "epoch": 0.06680378445084327, "grad_norm": 19.917999267578125, "learning_rate": 0.00019183397828168448, "loss": 2.2469, "step": 203 }, { "epoch": 0.06713286713286713, "grad_norm": 14.072540283203125, "learning_rate": 0.00019175032319902234, "loss": 2.0199, "step": 204 }, { "epoch": 0.067461949814891, "grad_norm": 15.998994827270508, "learning_rate": 0.00019166626023425662, "loss": 2.1876, "step": 205 }, { "epoch": 0.06779103249691484, "grad_norm": 31.02007293701172, "learning_rate": 0.00019158178976109476, "loss": 2.0833, "step": 206 }, { "epoch": 0.0681201151789387, "grad_norm": 17.03249168395996, "learning_rate": 0.0001914969121550558, "loss": 2.1668, "step": 207 }, { "epoch": 0.06844919786096257, "grad_norm": 15.970086097717285, "learning_rate": 0.00019141162779346874, "loss": 2.0027, "step": 208 }, { "epoch": 0.06877828054298643, "grad_norm": 17.21071434020996, "learning_rate": 0.00019132593705547082, "loss": 2.1795, "step": 209 }, { "epoch": 0.06910736322501028, "grad_norm": 16.515851974487305, "learning_rate": 0.00019123984032200586, "loss": 2.1902, "step": 210 }, { "epoch": 0.06943644590703414, "grad_norm": 16.371320724487305, "learning_rate": 0.00019115333797582254, "loss": 2.2563, "step": 211 }, { "epoch": 0.069765528589058, "grad_norm": 17.641443252563477, "learning_rate": 0.00019106643040147278, "loss": 2.1812, "step": 212 }, { "epoch": 0.07009461127108185, "grad_norm": 34.028690338134766, "learning_rate": 0.00019097911798530987, "loss": 2.0955, "step": 213 }, { "epoch": 0.07042369395310571, "grad_norm": 18.238664627075195, "learning_rate": 0.00019089140111548696, "loss": 2.2354, "step": 214 }, { "epoch": 0.07075277663512958, "grad_norm": 19.597766876220703, "learning_rate": 0.00019080328018195513, "loss": 2.2604, "step": 215 }, { "epoch": 0.07108185931715344, "grad_norm": 22.564088821411133, "learning_rate": 0.0001907147555764618, "loss": 2.2941, "step": 216 }, { "epoch": 0.07141094199917729, "grad_norm": 19.086936950683594, "learning_rate": 0.00019062582769254895, "loss": 2.162, "step": 217 }, { "epoch": 0.07174002468120115, "grad_norm": 21.065017700195312, "learning_rate": 0.00019053649692555135, "loss": 1.9859, "step": 218 }, { "epoch": 0.07206910736322501, "grad_norm": 22.767284393310547, "learning_rate": 0.00019044676367259476, "loss": 2.358, "step": 219 }, { "epoch": 0.07239819004524888, "grad_norm": 22.147249221801758, "learning_rate": 0.00019035662833259432, "loss": 2.1264, "step": 220 }, { "epoch": 0.07272727272727272, "grad_norm": 21.24959945678711, "learning_rate": 0.00019026609130625257, "loss": 2.1611, "step": 221 }, { "epoch": 0.07305635540929659, "grad_norm": 24.62726593017578, "learning_rate": 0.00019017515299605788, "loss": 2.2199, "step": 222 }, { "epoch": 0.07338543809132045, "grad_norm": 22.732820510864258, "learning_rate": 0.00019008381380628247, "loss": 2.2954, "step": 223 }, { "epoch": 0.0737145207733443, "grad_norm": 22.863624572753906, "learning_rate": 0.00018999207414298067, "loss": 2.2531, "step": 224 }, { "epoch": 0.07404360345536816, "grad_norm": 22.743608474731445, "learning_rate": 0.00018989993441398726, "loss": 2.1744, "step": 225 }, { "epoch": 0.07437268613739202, "grad_norm": 25.53584861755371, "learning_rate": 0.00018980739502891546, "loss": 2.2578, "step": 226 }, { "epoch": 0.07470176881941588, "grad_norm": 24.606985092163086, "learning_rate": 0.0001897144563991552, "loss": 2.3099, "step": 227 }, { "epoch": 0.07503085150143973, "grad_norm": 28.77580451965332, "learning_rate": 0.00018962111893787128, "loss": 2.4734, "step": 228 }, { "epoch": 0.0753599341834636, "grad_norm": 24.38824462890625, "learning_rate": 0.00018952738306000151, "loss": 2.2832, "step": 229 }, { "epoch": 0.07568901686548746, "grad_norm": 26.79664421081543, "learning_rate": 0.00018943324918225494, "loss": 2.2934, "step": 230 }, { "epoch": 0.0760180995475113, "grad_norm": 26.263587951660156, "learning_rate": 0.0001893387177231099, "loss": 2.3581, "step": 231 }, { "epoch": 0.07634718222953517, "grad_norm": 31.044546127319336, "learning_rate": 0.0001892437891028122, "loss": 2.2172, "step": 232 }, { "epoch": 0.07667626491155903, "grad_norm": 30.85577392578125, "learning_rate": 0.0001891484637433733, "loss": 2.3933, "step": 233 }, { "epoch": 0.07700534759358289, "grad_norm": 30.781057357788086, "learning_rate": 0.00018905274206856837, "loss": 2.2013, "step": 234 }, { "epoch": 0.07733443027560674, "grad_norm": 27.969951629638672, "learning_rate": 0.00018895662450393438, "loss": 2.3257, "step": 235 }, { "epoch": 0.0776635129576306, "grad_norm": 34.26099395751953, "learning_rate": 0.00018886011147676833, "loss": 2.2869, "step": 236 }, { "epoch": 0.07799259563965447, "grad_norm": 30.937467575073242, "learning_rate": 0.00018876320341612522, "loss": 2.5343, "step": 237 }, { "epoch": 0.07832167832167833, "grad_norm": 35.388099670410156, "learning_rate": 0.00018866590075281624, "loss": 2.4132, "step": 238 }, { "epoch": 0.07865076100370218, "grad_norm": 32.884273529052734, "learning_rate": 0.00018856820391940674, "loss": 2.366, "step": 239 }, { "epoch": 0.07897984368572604, "grad_norm": 34.471805572509766, "learning_rate": 0.00018847011335021449, "loss": 2.4882, "step": 240 }, { "epoch": 0.0793089263677499, "grad_norm": 38.46910095214844, "learning_rate": 0.00018837162948130752, "loss": 2.468, "step": 241 }, { "epoch": 0.07963800904977375, "grad_norm": 45.747642517089844, "learning_rate": 0.00018827275275050233, "loss": 2.5533, "step": 242 }, { "epoch": 0.07996709173179761, "grad_norm": 42.68091583251953, "learning_rate": 0.00018817348359736203, "loss": 2.6073, "step": 243 }, { "epoch": 0.08029617441382148, "grad_norm": 42.13290786743164, "learning_rate": 0.00018807382246319412, "loss": 2.5101, "step": 244 }, { "epoch": 0.08062525709584534, "grad_norm": 44.64775466918945, "learning_rate": 0.00018797376979104872, "loss": 2.4074, "step": 245 }, { "epoch": 0.08095433977786919, "grad_norm": 44.45621871948242, "learning_rate": 0.00018787332602571662, "loss": 2.6026, "step": 246 }, { "epoch": 0.08128342245989305, "grad_norm": 46.53767013549805, "learning_rate": 0.00018777249161372713, "loss": 2.7316, "step": 247 }, { "epoch": 0.08161250514191691, "grad_norm": 66.93853759765625, "learning_rate": 0.00018767126700334634, "loss": 3.0533, "step": 248 }, { "epoch": 0.08194158782394076, "grad_norm": 70.93820190429688, "learning_rate": 0.0001875696526445749, "loss": 3.0685, "step": 249 }, { "epoch": 0.08227067050596462, "grad_norm": 79.4852294921875, "learning_rate": 0.0001874676489891461, "loss": 3.0647, "step": 250 }, { "epoch": 0.08259975318798848, "grad_norm": 32.315250396728516, "learning_rate": 0.00018736525649052394, "loss": 2.2311, "step": 251 }, { "epoch": 0.08292883587001235, "grad_norm": 30.23067855834961, "learning_rate": 0.00018726247560390099, "loss": 2.0774, "step": 252 }, { "epoch": 0.0832579185520362, "grad_norm": 26.982328414916992, "learning_rate": 0.00018715930678619644, "loss": 2.122, "step": 253 }, { "epoch": 0.08358700123406006, "grad_norm": 18.85164451599121, "learning_rate": 0.00018705575049605413, "loss": 2.2208, "step": 254 }, { "epoch": 0.08391608391608392, "grad_norm": 13.671011924743652, "learning_rate": 0.00018695180719384029, "loss": 2.0684, "step": 255 }, { "epoch": 0.08424516659810777, "grad_norm": 14.02846908569336, "learning_rate": 0.00018684747734164177, "loss": 1.9996, "step": 256 }, { "epoch": 0.08457424928013163, "grad_norm": 13.098374366760254, "learning_rate": 0.00018674276140326376, "loss": 2.0488, "step": 257 }, { "epoch": 0.08490333196215549, "grad_norm": 14.445398330688477, "learning_rate": 0.00018663765984422786, "loss": 2.1794, "step": 258 }, { "epoch": 0.08523241464417936, "grad_norm": 14.592230796813965, "learning_rate": 0.00018653217313177004, "loss": 2.0188, "step": 259 }, { "epoch": 0.0855614973262032, "grad_norm": 16.81397247314453, "learning_rate": 0.00018642630173483832, "loss": 2.1347, "step": 260 }, { "epoch": 0.08589058000822707, "grad_norm": 15.40119743347168, "learning_rate": 0.00018632004612409103, "loss": 2.1071, "step": 261 }, { "epoch": 0.08621966269025093, "grad_norm": 16.573291778564453, "learning_rate": 0.00018621340677189453, "loss": 2.0809, "step": 262 }, { "epoch": 0.08654874537227479, "grad_norm": 16.650859832763672, "learning_rate": 0.00018610638415232097, "loss": 2.0691, "step": 263 }, { "epoch": 0.08687782805429864, "grad_norm": 18.518003463745117, "learning_rate": 0.00018599897874114652, "loss": 2.1923, "step": 264 }, { "epoch": 0.0872069107363225, "grad_norm": 18.871150970458984, "learning_rate": 0.00018589119101584898, "loss": 2.1555, "step": 265 }, { "epoch": 0.08753599341834636, "grad_norm": 19.301851272583008, "learning_rate": 0.00018578302145560584, "loss": 2.1732, "step": 266 }, { "epoch": 0.08786507610037021, "grad_norm": 18.156076431274414, "learning_rate": 0.00018567447054129195, "loss": 2.1245, "step": 267 }, { "epoch": 0.08819415878239407, "grad_norm": 19.837614059448242, "learning_rate": 0.00018556553875547754, "loss": 2.2374, "step": 268 }, { "epoch": 0.08852324146441794, "grad_norm": 18.92027473449707, "learning_rate": 0.00018545622658242607, "loss": 2.2301, "step": 269 }, { "epoch": 0.0888523241464418, "grad_norm": 21.270837783813477, "learning_rate": 0.00018534653450809197, "loss": 2.2331, "step": 270 }, { "epoch": 0.08918140682846565, "grad_norm": 21.555301666259766, "learning_rate": 0.00018523646302011867, "loss": 2.128, "step": 271 }, { "epoch": 0.08951048951048951, "grad_norm": 23.148181915283203, "learning_rate": 0.00018512601260783606, "loss": 2.2258, "step": 272 }, { "epoch": 0.08983957219251337, "grad_norm": 20.981473922729492, "learning_rate": 0.00018501518376225887, "loss": 2.2262, "step": 273 }, { "epoch": 0.09016865487453722, "grad_norm": 23.206968307495117, "learning_rate": 0.00018490397697608395, "loss": 2.2757, "step": 274 }, { "epoch": 0.09049773755656108, "grad_norm": 22.09346580505371, "learning_rate": 0.0001847923927436884, "loss": 2.205, "step": 275 }, { "epoch": 0.09082682023858495, "grad_norm": 23.256229400634766, "learning_rate": 0.00018468043156112728, "loss": 2.1677, "step": 276 }, { "epoch": 0.09115590292060881, "grad_norm": 23.727642059326172, "learning_rate": 0.0001845680939261314, "loss": 2.2491, "step": 277 }, { "epoch": 0.09148498560263266, "grad_norm": 25.546274185180664, "learning_rate": 0.00018445538033810515, "loss": 2.3562, "step": 278 }, { "epoch": 0.09181406828465652, "grad_norm": 23.656875610351562, "learning_rate": 0.00018434229129812418, "loss": 2.109, "step": 279 }, { "epoch": 0.09214315096668038, "grad_norm": 28.220298767089844, "learning_rate": 0.0001842288273089332, "loss": 2.4128, "step": 280 }, { "epoch": 0.09247223364870423, "grad_norm": 28.209348678588867, "learning_rate": 0.00018411498887494396, "loss": 2.3428, "step": 281 }, { "epoch": 0.09280131633072809, "grad_norm": 28.554391860961914, "learning_rate": 0.00018400077650223263, "loss": 2.2634, "step": 282 }, { "epoch": 0.09313039901275195, "grad_norm": 30.214752197265625, "learning_rate": 0.0001838861906985379, "loss": 2.3671, "step": 283 }, { "epoch": 0.09345948169477582, "grad_norm": 31.313325881958008, "learning_rate": 0.00018377123197325842, "loss": 2.4922, "step": 284 }, { "epoch": 0.09378856437679967, "grad_norm": 31.089052200317383, "learning_rate": 0.00018365590083745085, "loss": 2.4211, "step": 285 }, { "epoch": 0.09411764705882353, "grad_norm": 32.58794403076172, "learning_rate": 0.00018354019780382735, "loss": 2.3834, "step": 286 }, { "epoch": 0.09444672974084739, "grad_norm": 34.73944854736328, "learning_rate": 0.0001834241233867533, "loss": 2.2465, "step": 287 }, { "epoch": 0.09477581242287125, "grad_norm": 38.76915740966797, "learning_rate": 0.00018330767810224524, "loss": 2.2918, "step": 288 }, { "epoch": 0.0951048951048951, "grad_norm": 37.93132781982422, "learning_rate": 0.0001831908624679683, "loss": 2.5079, "step": 289 }, { "epoch": 0.09543397778691896, "grad_norm": 37.57957458496094, "learning_rate": 0.0001830736770032341, "loss": 2.2431, "step": 290 }, { "epoch": 0.09576306046894283, "grad_norm": 36.72309112548828, "learning_rate": 0.0001829561222289984, "loss": 2.5937, "step": 291 }, { "epoch": 0.09609214315096667, "grad_norm": 43.910457611083984, "learning_rate": 0.00018283819866785853, "loss": 2.5796, "step": 292 }, { "epoch": 0.09642122583299054, "grad_norm": 37.60491943359375, "learning_rate": 0.0001827199068440516, "loss": 2.501, "step": 293 }, { "epoch": 0.0967503085150144, "grad_norm": 41.552066802978516, "learning_rate": 0.00018260124728345162, "loss": 2.5463, "step": 294 }, { "epoch": 0.09707939119703826, "grad_norm": 42.12718963623047, "learning_rate": 0.00018248222051356754, "loss": 2.5723, "step": 295 }, { "epoch": 0.09740847387906211, "grad_norm": 44.49871063232422, "learning_rate": 0.00018236282706354063, "loss": 2.6006, "step": 296 }, { "epoch": 0.09773755656108597, "grad_norm": 46.53413391113281, "learning_rate": 0.00018224306746414238, "loss": 2.5239, "step": 297 }, { "epoch": 0.09806663924310983, "grad_norm": 71.21157836914062, "learning_rate": 0.00018212294224777197, "loss": 2.8279, "step": 298 }, { "epoch": 0.09839572192513368, "grad_norm": 76.81084442138672, "learning_rate": 0.00018200245194845399, "loss": 3.0209, "step": 299 }, { "epoch": 0.09872480460715755, "grad_norm": 75.40888214111328, "learning_rate": 0.00018188159710183594, "loss": 2.9355, "step": 300 }, { "epoch": 0.09905388728918141, "grad_norm": 25.123001098632812, "learning_rate": 0.000181760378245186, "loss": 2.1246, "step": 301 }, { "epoch": 0.09938296997120527, "grad_norm": 24.127634048461914, "learning_rate": 0.00018163879591739067, "loss": 2.0098, "step": 302 }, { "epoch": 0.09971205265322912, "grad_norm": 19.23702049255371, "learning_rate": 0.0001815168506589521, "loss": 2.0683, "step": 303 }, { "epoch": 0.10004113533525298, "grad_norm": 15.723398208618164, "learning_rate": 0.000181394543011986, "loss": 2.1503, "step": 304 }, { "epoch": 0.10037021801727684, "grad_norm": 13.344893455505371, "learning_rate": 0.00018127187352021907, "loss": 2.099, "step": 305 }, { "epoch": 0.1006993006993007, "grad_norm": 12.449121475219727, "learning_rate": 0.0001811488427289866, "loss": 2.12, "step": 306 }, { "epoch": 0.10102838338132455, "grad_norm": 10.836393356323242, "learning_rate": 0.00018102545118523007, "loss": 1.9312, "step": 307 }, { "epoch": 0.10135746606334842, "grad_norm": 13.552892684936523, "learning_rate": 0.00018090169943749476, "loss": 2.1161, "step": 308 }, { "epoch": 0.10168654874537228, "grad_norm": 12.804911613464355, "learning_rate": 0.00018077758803592718, "loss": 1.9294, "step": 309 }, { "epoch": 0.10201563142739613, "grad_norm": 15.500523567199707, "learning_rate": 0.00018065311753227273, "loss": 2.1699, "step": 310 }, { "epoch": 0.10234471410941999, "grad_norm": 13.986373901367188, "learning_rate": 0.0001805282884798732, "loss": 2.1869, "step": 311 }, { "epoch": 0.10267379679144385, "grad_norm": 15.537351608276367, "learning_rate": 0.00018040310143366446, "loss": 2.0441, "step": 312 }, { "epoch": 0.10300287947346771, "grad_norm": 14.554361343383789, "learning_rate": 0.00018027755695017368, "loss": 2.1316, "step": 313 }, { "epoch": 0.10333196215549156, "grad_norm": 15.637019157409668, "learning_rate": 0.00018015165558751717, "loss": 2.0265, "step": 314 }, { "epoch": 0.10366104483751543, "grad_norm": 16.677814483642578, "learning_rate": 0.00018002539790539773, "loss": 2.1547, "step": 315 }, { "epoch": 0.10399012751953929, "grad_norm": 17.848800659179688, "learning_rate": 0.00017989878446510215, "loss": 2.0712, "step": 316 }, { "epoch": 0.10431921020156314, "grad_norm": 21.053678512573242, "learning_rate": 0.00017977181582949888, "loss": 2.1301, "step": 317 }, { "epoch": 0.104648292883587, "grad_norm": 18.58283042907715, "learning_rate": 0.0001796444925630353, "loss": 2.1719, "step": 318 }, { "epoch": 0.10497737556561086, "grad_norm": 18.665454864501953, "learning_rate": 0.00017951681523173542, "loss": 2.1489, "step": 319 }, { "epoch": 0.10530645824763472, "grad_norm": 17.518543243408203, "learning_rate": 0.0001793887844031972, "loss": 2.1305, "step": 320 }, { "epoch": 0.10563554092965857, "grad_norm": 19.706811904907227, "learning_rate": 0.00017926040064659014, "loss": 2.1922, "step": 321 }, { "epoch": 0.10596462361168243, "grad_norm": 19.07513427734375, "learning_rate": 0.0001791316645326526, "loss": 2.1125, "step": 322 }, { "epoch": 0.1062937062937063, "grad_norm": 21.25273323059082, "learning_rate": 0.00017900257663368963, "loss": 2.0967, "step": 323 }, { "epoch": 0.10662278897573015, "grad_norm": 23.022178649902344, "learning_rate": 0.0001788731375235698, "loss": 2.1737, "step": 324 }, { "epoch": 0.10695187165775401, "grad_norm": 24.29770851135254, "learning_rate": 0.00017874334777772327, "loss": 2.2003, "step": 325 }, { "epoch": 0.10728095433977787, "grad_norm": 24.43622398376465, "learning_rate": 0.00017861320797313892, "loss": 2.2152, "step": 326 }, { "epoch": 0.10761003702180173, "grad_norm": 28.540864944458008, "learning_rate": 0.0001784827186883618, "loss": 2.1726, "step": 327 }, { "epoch": 0.10793911970382558, "grad_norm": 22.597213745117188, "learning_rate": 0.00017835188050349064, "loss": 2.2314, "step": 328 }, { "epoch": 0.10826820238584944, "grad_norm": 25.8801326751709, "learning_rate": 0.00017822069400017516, "loss": 2.2515, "step": 329 }, { "epoch": 0.1085972850678733, "grad_norm": 21.48238754272461, "learning_rate": 0.00017808915976161362, "loss": 2.2769, "step": 330 }, { "epoch": 0.10892636774989717, "grad_norm": 26.163372039794922, "learning_rate": 0.00017795727837255015, "loss": 2.1905, "step": 331 }, { "epoch": 0.10925545043192102, "grad_norm": 26.860511779785156, "learning_rate": 0.00017782505041927216, "loss": 2.2843, "step": 332 }, { "epoch": 0.10958453311394488, "grad_norm": 28.667987823486328, "learning_rate": 0.00017769247648960774, "loss": 2.3807, "step": 333 }, { "epoch": 0.10991361579596874, "grad_norm": 37.908939361572266, "learning_rate": 0.00017755955717292296, "loss": 2.331, "step": 334 }, { "epoch": 0.11024269847799259, "grad_norm": 30.53937339782715, "learning_rate": 0.00017742629306011944, "loss": 2.4017, "step": 335 }, { "epoch": 0.11057178116001645, "grad_norm": 31.421762466430664, "learning_rate": 0.00017729268474363154, "loss": 2.421, "step": 336 }, { "epoch": 0.11090086384204031, "grad_norm": 28.493751525878906, "learning_rate": 0.0001771587328174239, "loss": 2.2773, "step": 337 }, { "epoch": 0.11122994652406418, "grad_norm": 28.117023468017578, "learning_rate": 0.0001770244378769885, "loss": 2.4702, "step": 338 }, { "epoch": 0.11155902920608803, "grad_norm": 36.35397720336914, "learning_rate": 0.0001768898005193425, "loss": 2.502, "step": 339 }, { "epoch": 0.11188811188811189, "grad_norm": 31.640914916992188, "learning_rate": 0.000176754821343025, "loss": 2.4533, "step": 340 }, { "epoch": 0.11221719457013575, "grad_norm": 33.728538513183594, "learning_rate": 0.0001766195009480949, "loss": 2.4158, "step": 341 }, { "epoch": 0.1125462772521596, "grad_norm": 30.469196319580078, "learning_rate": 0.0001764838399361279, "loss": 2.4001, "step": 342 }, { "epoch": 0.11287535993418346, "grad_norm": 39.749664306640625, "learning_rate": 0.00017634783891021393, "loss": 2.3815, "step": 343 }, { "epoch": 0.11320444261620732, "grad_norm": 41.524436950683594, "learning_rate": 0.00017621149847495458, "loss": 2.4092, "step": 344 }, { "epoch": 0.11353352529823119, "grad_norm": 55.04652404785156, "learning_rate": 0.00017607481923646016, "loss": 2.6198, "step": 345 }, { "epoch": 0.11386260798025503, "grad_norm": 41.766761779785156, "learning_rate": 0.0001759378018023473, "loss": 2.4331, "step": 346 }, { "epoch": 0.1141916906622789, "grad_norm": 73.7178955078125, "learning_rate": 0.00017580044678173592, "loss": 2.6612, "step": 347 }, { "epoch": 0.11452077334430276, "grad_norm": 58.45720672607422, "learning_rate": 0.00017566275478524693, "loss": 2.7114, "step": 348 }, { "epoch": 0.11484985602632661, "grad_norm": 62.9808349609375, "learning_rate": 0.0001755247264249991, "loss": 2.6575, "step": 349 }, { "epoch": 0.11517893870835047, "grad_norm": 65.06092071533203, "learning_rate": 0.0001753863623146066, "loss": 3.1385, "step": 350 }, { "epoch": 0.11550802139037433, "grad_norm": 21.07193946838379, "learning_rate": 0.00017524766306917618, "loss": 2.1127, "step": 351 }, { "epoch": 0.1158371040723982, "grad_norm": 22.23618507385254, "learning_rate": 0.0001751086293053045, "loss": 2.1514, "step": 352 }, { "epoch": 0.11616618675442204, "grad_norm": 20.072818756103516, "learning_rate": 0.0001749692616410753, "loss": 2.0668, "step": 353 }, { "epoch": 0.1164952694364459, "grad_norm": 16.212791442871094, "learning_rate": 0.00017482956069605668, "loss": 2.0342, "step": 354 }, { "epoch": 0.11682435211846977, "grad_norm": 12.142776489257812, "learning_rate": 0.00017468952709129846, "loss": 1.9391, "step": 355 }, { "epoch": 0.11715343480049363, "grad_norm": 13.167658805847168, "learning_rate": 0.00017454916144932922, "loss": 2.1191, "step": 356 }, { "epoch": 0.11748251748251748, "grad_norm": 12.89388656616211, "learning_rate": 0.0001744084643941536, "loss": 1.9692, "step": 357 }, { "epoch": 0.11781160016454134, "grad_norm": 12.43535327911377, "learning_rate": 0.00017426743655124974, "loss": 2.1307, "step": 358 }, { "epoch": 0.1181406828465652, "grad_norm": 13.791735649108887, "learning_rate": 0.0001741260785475661, "loss": 2.1729, "step": 359 }, { "epoch": 0.11846976552858905, "grad_norm": 13.562893867492676, "learning_rate": 0.00017398439101151905, "loss": 2.0874, "step": 360 }, { "epoch": 0.11879884821061291, "grad_norm": 14.418350219726562, "learning_rate": 0.00017384237457298987, "loss": 2.1214, "step": 361 }, { "epoch": 0.11912793089263678, "grad_norm": 14.90912914276123, "learning_rate": 0.00017370002986332193, "loss": 2.15, "step": 362 }, { "epoch": 0.11945701357466064, "grad_norm": 14.4798583984375, "learning_rate": 0.00017355735751531807, "loss": 2.104, "step": 363 }, { "epoch": 0.11978609625668449, "grad_norm": 14.9854154586792, "learning_rate": 0.00017341435816323756, "loss": 2.1634, "step": 364 }, { "epoch": 0.12011517893870835, "grad_norm": 15.484580039978027, "learning_rate": 0.00017327103244279348, "loss": 2.1227, "step": 365 }, { "epoch": 0.12044426162073221, "grad_norm": 16.974597930908203, "learning_rate": 0.00017312738099114973, "loss": 2.1332, "step": 366 }, { "epoch": 0.12077334430275606, "grad_norm": 17.421018600463867, "learning_rate": 0.00017298340444691835, "loss": 2.1759, "step": 367 }, { "epoch": 0.12110242698477992, "grad_norm": 16.27506446838379, "learning_rate": 0.00017283910345015647, "loss": 2.1096, "step": 368 }, { "epoch": 0.12143150966680379, "grad_norm": 17.663877487182617, "learning_rate": 0.0001726944786423637, "loss": 2.0878, "step": 369 }, { "epoch": 0.12176059234882765, "grad_norm": 18.855493545532227, "learning_rate": 0.00017254953066647913, "loss": 2.1556, "step": 370 }, { "epoch": 0.1220896750308515, "grad_norm": 20.37822151184082, "learning_rate": 0.00017240426016687863, "loss": 2.1551, "step": 371 }, { "epoch": 0.12241875771287536, "grad_norm": 18.425636291503906, "learning_rate": 0.00017225866778937165, "loss": 2.1598, "step": 372 }, { "epoch": 0.12274784039489922, "grad_norm": 18.747047424316406, "learning_rate": 0.00017211275418119876, "loss": 2.0371, "step": 373 }, { "epoch": 0.12307692307692308, "grad_norm": 20.014904022216797, "learning_rate": 0.0001719665199910285, "loss": 2.2492, "step": 374 }, { "epoch": 0.12340600575894693, "grad_norm": 21.430776596069336, "learning_rate": 0.00017181996586895454, "loss": 2.2077, "step": 375 }, { "epoch": 0.12340600575894693, "eval_loss": 1.9685778617858887, "eval_runtime": 163.1829, "eval_samples_per_second": 31.364, "eval_steps_per_second": 15.682, "step": 375 }, { "epoch": 0.1237350884409708, "grad_norm": 22.844011306762695, "learning_rate": 0.00017167309246649297, "loss": 2.1541, "step": 376 }, { "epoch": 0.12406417112299466, "grad_norm": 21.73259925842285, "learning_rate": 0.0001715259004365791, "loss": 2.2755, "step": 377 }, { "epoch": 0.1243932538050185, "grad_norm": 25.81591796875, "learning_rate": 0.00017137839043356484, "loss": 2.3155, "step": 378 }, { "epoch": 0.12472233648704237, "grad_norm": 24.79435157775879, "learning_rate": 0.00017123056311321562, "loss": 2.2261, "step": 379 }, { "epoch": 0.12505141916906623, "grad_norm": 26.0931339263916, "learning_rate": 0.0001710824191327075, "loss": 2.3436, "step": 380 }, { "epoch": 0.1253805018510901, "grad_norm": 25.1461181640625, "learning_rate": 0.00017093395915062428, "loss": 2.2754, "step": 381 }, { "epoch": 0.12570958453311395, "grad_norm": 25.35841941833496, "learning_rate": 0.00017078518382695465, "loss": 2.2749, "step": 382 }, { "epoch": 0.1260386672151378, "grad_norm": 27.528522491455078, "learning_rate": 0.00017063609382308908, "loss": 2.2594, "step": 383 }, { "epoch": 0.12636774989716165, "grad_norm": 29.911043167114258, "learning_rate": 0.00017048668980181698, "loss": 2.3037, "step": 384 }, { "epoch": 0.12669683257918551, "grad_norm": 28.31783103942871, "learning_rate": 0.00017033697242732377, "loss": 2.1738, "step": 385 }, { "epoch": 0.12702591526120938, "grad_norm": 32.08406448364258, "learning_rate": 0.0001701869423651879, "loss": 2.3866, "step": 386 }, { "epoch": 0.12735499794323324, "grad_norm": 27.648075103759766, "learning_rate": 0.00017003660028237793, "loss": 2.3798, "step": 387 }, { "epoch": 0.1276840806252571, "grad_norm": 37.912052154541016, "learning_rate": 0.00016988594684724947, "loss": 2.427, "step": 388 }, { "epoch": 0.12801316330728096, "grad_norm": 34.16417694091797, "learning_rate": 0.00016973498272954222, "loss": 2.3632, "step": 389 }, { "epoch": 0.12834224598930483, "grad_norm": 34.66891098022461, "learning_rate": 0.00016958370860037717, "loss": 2.2175, "step": 390 }, { "epoch": 0.12867132867132866, "grad_norm": 36.58308029174805, "learning_rate": 0.00016943212513225345, "loss": 2.4907, "step": 391 }, { "epoch": 0.12900041135335252, "grad_norm": 42.51967239379883, "learning_rate": 0.00016928023299904533, "loss": 2.4237, "step": 392 }, { "epoch": 0.12932949403537639, "grad_norm": 36.27920913696289, "learning_rate": 0.0001691280328759992, "loss": 2.4428, "step": 393 }, { "epoch": 0.12965857671740025, "grad_norm": 39.748634338378906, "learning_rate": 0.00016897552543973084, "loss": 2.3939, "step": 394 }, { "epoch": 0.1299876593994241, "grad_norm": 49.178367614746094, "learning_rate": 0.00016882271136822206, "loss": 2.51, "step": 395 }, { "epoch": 0.13031674208144797, "grad_norm": 45.95671463012695, "learning_rate": 0.0001686695913408179, "loss": 2.7345, "step": 396 }, { "epoch": 0.13064582476347184, "grad_norm": 47.4686279296875, "learning_rate": 0.0001685161660382235, "loss": 2.6422, "step": 397 }, { "epoch": 0.13097490744549567, "grad_norm": 53.40972137451172, "learning_rate": 0.00016836243614250113, "loss": 2.7456, "step": 398 }, { "epoch": 0.13130399012751953, "grad_norm": 60.63425064086914, "learning_rate": 0.00016820840233706719, "loss": 2.6192, "step": 399 }, { "epoch": 0.1316330728095434, "grad_norm": 88.8916015625, "learning_rate": 0.0001680540653066891, "loss": 2.993, "step": 400 }, { "epoch": 0.13196215549156726, "grad_norm": 20.78827667236328, "learning_rate": 0.00016789942573748232, "loss": 2.1662, "step": 401 }, { "epoch": 0.13229123817359112, "grad_norm": 20.47433090209961, "learning_rate": 0.0001677444843169072, "loss": 1.9775, "step": 402 }, { "epoch": 0.13262032085561498, "grad_norm": 17.47659683227539, "learning_rate": 0.00016758924173376603, "loss": 2.0234, "step": 403 }, { "epoch": 0.13294940353763884, "grad_norm": 13.700460433959961, "learning_rate": 0.0001674336986781999, "loss": 2.095, "step": 404 }, { "epoch": 0.13327848621966268, "grad_norm": 11.73477840423584, "learning_rate": 0.00016727785584168581, "loss": 2.1294, "step": 405 }, { "epoch": 0.13360756890168654, "grad_norm": 11.78417682647705, "learning_rate": 0.0001671217139170333, "loss": 1.9935, "step": 406 }, { "epoch": 0.1339366515837104, "grad_norm": 11.921859741210938, "learning_rate": 0.00016696527359838154, "loss": 2.071, "step": 407 }, { "epoch": 0.13426573426573427, "grad_norm": 12.540040969848633, "learning_rate": 0.00016680853558119632, "loss": 2.1218, "step": 408 }, { "epoch": 0.13459481694775813, "grad_norm": 13.562238693237305, "learning_rate": 0.0001666515005622668, "loss": 2.1177, "step": 409 }, { "epoch": 0.134923899629782, "grad_norm": 13.720065116882324, "learning_rate": 0.0001664941692397025, "loss": 2.0738, "step": 410 }, { "epoch": 0.13525298231180585, "grad_norm": 13.741817474365234, "learning_rate": 0.00016633654231293013, "loss": 2.0708, "step": 411 }, { "epoch": 0.1355820649938297, "grad_norm": 14.87505054473877, "learning_rate": 0.00016617862048269065, "loss": 2.0411, "step": 412 }, { "epoch": 0.13591114767585355, "grad_norm": 13.756852149963379, "learning_rate": 0.00016602040445103588, "loss": 2.0897, "step": 413 }, { "epoch": 0.1362402303578774, "grad_norm": 15.395737648010254, "learning_rate": 0.00016586189492132566, "loss": 2.1691, "step": 414 }, { "epoch": 0.13656931303990127, "grad_norm": 16.711942672729492, "learning_rate": 0.00016570309259822453, "loss": 2.0768, "step": 415 }, { "epoch": 0.13689839572192514, "grad_norm": 16.43849754333496, "learning_rate": 0.0001655439981876987, "loss": 2.1459, "step": 416 }, { "epoch": 0.137227478403949, "grad_norm": 15.554736137390137, "learning_rate": 0.00016538461239701277, "loss": 2.0951, "step": 417 }, { "epoch": 0.13755656108597286, "grad_norm": 18.45875358581543, "learning_rate": 0.00016522493593472683, "loss": 2.1005, "step": 418 }, { "epoch": 0.1378856437679967, "grad_norm": 19.053752899169922, "learning_rate": 0.0001650649695106931, "loss": 2.0698, "step": 419 }, { "epoch": 0.13821472645002056, "grad_norm": 18.527767181396484, "learning_rate": 0.00016490471383605288, "loss": 2.1617, "step": 420 }, { "epoch": 0.13854380913204442, "grad_norm": 17.477855682373047, "learning_rate": 0.00016474416962323325, "loss": 2.0955, "step": 421 }, { "epoch": 0.13887289181406828, "grad_norm": 20.116382598876953, "learning_rate": 0.00016458333758594414, "loss": 2.1986, "step": 422 }, { "epoch": 0.13920197449609215, "grad_norm": 19.28567886352539, "learning_rate": 0.00016442221843917496, "loss": 2.2186, "step": 423 }, { "epoch": 0.139531057178116, "grad_norm": 23.134096145629883, "learning_rate": 0.00016426081289919143, "loss": 2.1091, "step": 424 }, { "epoch": 0.13986013986013987, "grad_norm": 21.550539016723633, "learning_rate": 0.0001640991216835326, "loss": 2.2548, "step": 425 }, { "epoch": 0.1401892225421637, "grad_norm": 19.954418182373047, "learning_rate": 0.00016393714551100734, "loss": 2.0714, "step": 426 }, { "epoch": 0.14051830522418757, "grad_norm": 21.376625061035156, "learning_rate": 0.0001637748851016914, "loss": 2.1441, "step": 427 }, { "epoch": 0.14084738790621143, "grad_norm": 23.426603317260742, "learning_rate": 0.00016361234117692413, "loss": 2.1419, "step": 428 }, { "epoch": 0.1411764705882353, "grad_norm": 22.261470794677734, "learning_rate": 0.00016344951445930526, "loss": 2.1996, "step": 429 }, { "epoch": 0.14150555327025915, "grad_norm": 24.481969833374023, "learning_rate": 0.0001632864056726917, "loss": 2.1861, "step": 430 }, { "epoch": 0.14183463595228302, "grad_norm": 25.616226196289062, "learning_rate": 0.00016312301554219426, "loss": 2.3309, "step": 431 }, { "epoch": 0.14216371863430688, "grad_norm": 24.801036834716797, "learning_rate": 0.00016295934479417453, "loss": 2.4246, "step": 432 }, { "epoch": 0.14249280131633074, "grad_norm": 25.4134521484375, "learning_rate": 0.00016279539415624164, "loss": 2.3035, "step": 433 }, { "epoch": 0.14282188399835458, "grad_norm": 27.192171096801758, "learning_rate": 0.0001626311643572489, "loss": 2.2683, "step": 434 }, { "epoch": 0.14315096668037844, "grad_norm": 25.618234634399414, "learning_rate": 0.00016246665612729074, "loss": 2.167, "step": 435 }, { "epoch": 0.1434800493624023, "grad_norm": 28.360218048095703, "learning_rate": 0.00016230187019769928, "loss": 2.3093, "step": 436 }, { "epoch": 0.14380913204442616, "grad_norm": 29.186748504638672, "learning_rate": 0.00016213680730104124, "loss": 2.1995, "step": 437 }, { "epoch": 0.14413821472645003, "grad_norm": 29.221302032470703, "learning_rate": 0.0001619714681711146, "loss": 2.2473, "step": 438 }, { "epoch": 0.1444672974084739, "grad_norm": 32.56549835205078, "learning_rate": 0.00016180585354294536, "loss": 2.3759, "step": 439 }, { "epoch": 0.14479638009049775, "grad_norm": 37.25788116455078, "learning_rate": 0.00016163996415278424, "loss": 2.3653, "step": 440 }, { "epoch": 0.14512546277252159, "grad_norm": 33.03981399536133, "learning_rate": 0.00016147380073810346, "loss": 2.4879, "step": 441 }, { "epoch": 0.14545454545454545, "grad_norm": 34.7199821472168, "learning_rate": 0.0001613073640375934, "loss": 2.3231, "step": 442 }, { "epoch": 0.1457836281365693, "grad_norm": 32.635986328125, "learning_rate": 0.00016114065479115946, "loss": 2.2718, "step": 443 }, { "epoch": 0.14611271081859317, "grad_norm": 42.9588508605957, "learning_rate": 0.00016097367373991842, "loss": 2.4779, "step": 444 }, { "epoch": 0.14644179350061703, "grad_norm": 36.11931610107422, "learning_rate": 0.00016080642162619565, "loss": 2.0783, "step": 445 }, { "epoch": 0.1467708761826409, "grad_norm": 50.71851348876953, "learning_rate": 0.0001606388991935214, "loss": 2.3016, "step": 446 }, { "epoch": 0.14709995886466476, "grad_norm": 48.255367279052734, "learning_rate": 0.0001604711071866277, "loss": 2.4718, "step": 447 }, { "epoch": 0.1474290415466886, "grad_norm": 44.14384460449219, "learning_rate": 0.00016030304635144494, "loss": 2.2925, "step": 448 }, { "epoch": 0.14775812422871246, "grad_norm": 65.98005676269531, "learning_rate": 0.00016013471743509862, "loss": 2.8047, "step": 449 }, { "epoch": 0.14808720691073632, "grad_norm": 94.87761688232422, "learning_rate": 0.00015996612118590603, "loss": 3.1099, "step": 450 }, { "epoch": 0.14841628959276018, "grad_norm": 19.420984268188477, "learning_rate": 0.00015979725835337294, "loss": 2.0887, "step": 451 }, { "epoch": 0.14874537227478404, "grad_norm": 20.774240493774414, "learning_rate": 0.00015962812968819016, "loss": 2.084, "step": 452 }, { "epoch": 0.1490744549568079, "grad_norm": 17.931415557861328, "learning_rate": 0.0001594587359422303, "loss": 2.0841, "step": 453 }, { "epoch": 0.14940353763883177, "grad_norm": 15.078619003295898, "learning_rate": 0.0001592890778685444, "loss": 1.9613, "step": 454 }, { "epoch": 0.1497326203208556, "grad_norm": 14.589539527893066, "learning_rate": 0.00015911915622135862, "loss": 2.1347, "step": 455 }, { "epoch": 0.15006170300287947, "grad_norm": 12.253896713256836, "learning_rate": 0.00015894897175607086, "loss": 2.0515, "step": 456 }, { "epoch": 0.15039078568490333, "grad_norm": 12.456748008728027, "learning_rate": 0.00015877852522924732, "loss": 2.1171, "step": 457 }, { "epoch": 0.1507198683669272, "grad_norm": 11.61238956451416, "learning_rate": 0.00015860781739861928, "loss": 2.0313, "step": 458 }, { "epoch": 0.15104895104895105, "grad_norm": 12.655720710754395, "learning_rate": 0.00015843684902307962, "loss": 2.1129, "step": 459 }, { "epoch": 0.15137803373097491, "grad_norm": 13.252540588378906, "learning_rate": 0.00015826562086267956, "loss": 2.0902, "step": 460 }, { "epoch": 0.15170711641299878, "grad_norm": 16.699647903442383, "learning_rate": 0.00015809413367862512, "loss": 2.1071, "step": 461 }, { "epoch": 0.1520361990950226, "grad_norm": 13.88497543334961, "learning_rate": 0.00015792238823327388, "loss": 2.1244, "step": 462 }, { "epoch": 0.15236528177704647, "grad_norm": 14.418646812438965, "learning_rate": 0.00015775038529013152, "loss": 1.9705, "step": 463 }, { "epoch": 0.15269436445907034, "grad_norm": 15.111443519592285, "learning_rate": 0.0001575781256138485, "loss": 2.072, "step": 464 }, { "epoch": 0.1530234471410942, "grad_norm": 14.344696044921875, "learning_rate": 0.00015740560997021648, "loss": 2.0361, "step": 465 }, { "epoch": 0.15335252982311806, "grad_norm": 17.06289291381836, "learning_rate": 0.00015723283912616513, "loss": 2.1011, "step": 466 }, { "epoch": 0.15368161250514192, "grad_norm": 17.146263122558594, "learning_rate": 0.00015705981384975866, "loss": 2.0345, "step": 467 }, { "epoch": 0.15401069518716579, "grad_norm": 17.768329620361328, "learning_rate": 0.0001568865349101923, "loss": 2.1504, "step": 468 }, { "epoch": 0.15433977786918962, "grad_norm": 22.8175106048584, "learning_rate": 0.00015671300307778898, "loss": 2.1139, "step": 469 }, { "epoch": 0.15466886055121348, "grad_norm": 18.92057991027832, "learning_rate": 0.00015653921912399589, "loss": 2.1474, "step": 470 }, { "epoch": 0.15499794323323735, "grad_norm": 19.475536346435547, "learning_rate": 0.00015636518382138107, "loss": 2.1211, "step": 471 }, { "epoch": 0.1553270259152612, "grad_norm": 19.589750289916992, "learning_rate": 0.0001561908979436299, "loss": 2.1808, "step": 472 }, { "epoch": 0.15565610859728507, "grad_norm": 21.1541805267334, "learning_rate": 0.00015601636226554168, "loss": 2.1225, "step": 473 }, { "epoch": 0.15598519127930893, "grad_norm": 22.461345672607422, "learning_rate": 0.00015584157756302634, "loss": 2.0673, "step": 474 }, { "epoch": 0.1563142739613328, "grad_norm": 22.78941535949707, "learning_rate": 0.0001556665446131007, "loss": 2.1653, "step": 475 }, { "epoch": 0.15664335664335666, "grad_norm": 23.3026180267334, "learning_rate": 0.00015549126419388536, "loss": 2.224, "step": 476 }, { "epoch": 0.1569724393253805, "grad_norm": 24.918376922607422, "learning_rate": 0.0001553157370846009, "loss": 2.232, "step": 477 }, { "epoch": 0.15730152200740435, "grad_norm": 25.054052352905273, "learning_rate": 0.00015513996406556465, "loss": 2.2618, "step": 478 }, { "epoch": 0.15763060468942822, "grad_norm": 27.31525993347168, "learning_rate": 0.00015496394591818716, "loss": 2.2294, "step": 479 }, { "epoch": 0.15795968737145208, "grad_norm": 27.930578231811523, "learning_rate": 0.0001547876834249687, "loss": 2.1336, "step": 480 }, { "epoch": 0.15828877005347594, "grad_norm": 26.394222259521484, "learning_rate": 0.00015461117736949577, "loss": 2.0374, "step": 481 }, { "epoch": 0.1586178527354998, "grad_norm": 23.68645668029785, "learning_rate": 0.00015443442853643762, "loss": 2.2091, "step": 482 }, { "epoch": 0.15894693541752367, "grad_norm": 29.133563995361328, "learning_rate": 0.00015425743771154294, "loss": 2.2799, "step": 483 }, { "epoch": 0.1592760180995475, "grad_norm": 28.51854133605957, "learning_rate": 0.00015408020568163602, "loss": 2.3114, "step": 484 }, { "epoch": 0.15960510078157136, "grad_norm": 26.7332763671875, "learning_rate": 0.00015390273323461352, "loss": 2.3515, "step": 485 }, { "epoch": 0.15993418346359523, "grad_norm": 30.377696990966797, "learning_rate": 0.0001537250211594409, "loss": 2.1942, "step": 486 }, { "epoch": 0.1602632661456191, "grad_norm": 27.210660934448242, "learning_rate": 0.0001535470702461489, "loss": 2.2058, "step": 487 }, { "epoch": 0.16059234882764295, "grad_norm": 47.20982360839844, "learning_rate": 0.00015336888128583, "loss": 2.4245, "step": 488 }, { "epoch": 0.1609214315096668, "grad_norm": 42.282928466796875, "learning_rate": 0.000153190455070635, "loss": 2.3975, "step": 489 }, { "epoch": 0.16125051419169067, "grad_norm": 33.728939056396484, "learning_rate": 0.00015301179239376938, "loss": 2.2445, "step": 490 }, { "epoch": 0.1615795968737145, "grad_norm": 36.65166473388672, "learning_rate": 0.00015283289404948976, "loss": 2.4152, "step": 491 }, { "epoch": 0.16190867955573837, "grad_norm": 39.38380813598633, "learning_rate": 0.0001526537608331006, "loss": 2.3843, "step": 492 }, { "epoch": 0.16223776223776223, "grad_norm": 36.73198699951172, "learning_rate": 0.00015247439354095041, "loss": 2.5001, "step": 493 }, { "epoch": 0.1625668449197861, "grad_norm": 37.7665901184082, "learning_rate": 0.00015229479297042823, "loss": 2.4288, "step": 494 }, { "epoch": 0.16289592760180996, "grad_norm": 42.50233840942383, "learning_rate": 0.00015211495991996027, "loss": 2.2909, "step": 495 }, { "epoch": 0.16322501028383382, "grad_norm": 45.45063400268555, "learning_rate": 0.0001519348951890062, "loss": 2.4955, "step": 496 }, { "epoch": 0.16355409296585768, "grad_norm": 42.770782470703125, "learning_rate": 0.0001517545995780556, "loss": 2.5254, "step": 497 }, { "epoch": 0.16388317564788152, "grad_norm": 54.30143356323242, "learning_rate": 0.00015157407388862452, "loss": 2.5006, "step": 498 }, { "epoch": 0.16421225832990538, "grad_norm": 57.01381301879883, "learning_rate": 0.00015139331892325179, "loss": 2.5408, "step": 499 }, { "epoch": 0.16454134101192924, "grad_norm": 81.99574279785156, "learning_rate": 0.0001512123354854955, "loss": 2.8353, "step": 500 }, { "epoch": 0.1648704236939531, "grad_norm": 17.387454986572266, "learning_rate": 0.0001510311243799295, "loss": 2.0141, "step": 501 }, { "epoch": 0.16519950637597697, "grad_norm": 18.2310848236084, "learning_rate": 0.00015084968641213958, "loss": 2.065, "step": 502 }, { "epoch": 0.16552858905800083, "grad_norm": 16.473785400390625, "learning_rate": 0.00015066802238872023, "loss": 1.9796, "step": 503 }, { "epoch": 0.1658576717400247, "grad_norm": 14.520034790039062, "learning_rate": 0.0001504861331172709, "loss": 2.061, "step": 504 }, { "epoch": 0.16618675442204853, "grad_norm": 13.04027271270752, "learning_rate": 0.0001503040194063922, "loss": 2.0168, "step": 505 }, { "epoch": 0.1665158371040724, "grad_norm": 12.51745319366455, "learning_rate": 0.00015012168206568268, "loss": 2.0406, "step": 506 }, { "epoch": 0.16684491978609625, "grad_norm": 12.554720878601074, "learning_rate": 0.00014993912190573505, "loss": 2.1265, "step": 507 }, { "epoch": 0.16717400246812011, "grad_norm": 11.707505226135254, "learning_rate": 0.00014975633973813242, "loss": 2.1124, "step": 508 }, { "epoch": 0.16750308515014398, "grad_norm": 12.905622482299805, "learning_rate": 0.00014957333637544503, "loss": 1.9911, "step": 509 }, { "epoch": 0.16783216783216784, "grad_norm": 15.165553092956543, "learning_rate": 0.00014939011263122634, "loss": 2.0091, "step": 510 }, { "epoch": 0.1681612505141917, "grad_norm": 15.309354782104492, "learning_rate": 0.0001492066693200096, "loss": 2.0237, "step": 511 }, { "epoch": 0.16849033319621554, "grad_norm": 14.663786888122559, "learning_rate": 0.00014902300725730413, "loss": 2.0856, "step": 512 }, { "epoch": 0.1688194158782394, "grad_norm": 13.048662185668945, "learning_rate": 0.00014883912725959167, "loss": 1.9925, "step": 513 }, { "epoch": 0.16914849856026326, "grad_norm": 15.981327056884766, "learning_rate": 0.00014865503014432292, "loss": 2.1218, "step": 514 }, { "epoch": 0.16947758124228712, "grad_norm": 14.421281814575195, "learning_rate": 0.00014847071672991367, "loss": 2.1116, "step": 515 }, { "epoch": 0.16980666392431099, "grad_norm": 15.308381080627441, "learning_rate": 0.0001482861878357414, "loss": 2.1286, "step": 516 }, { "epoch": 0.17013574660633485, "grad_norm": 19.13900375366211, "learning_rate": 0.00014810144428214144, "loss": 2.1093, "step": 517 }, { "epoch": 0.1704648292883587, "grad_norm": 14.989489555358887, "learning_rate": 0.0001479164868904034, "loss": 2.0368, "step": 518 }, { "epoch": 0.17079391197038254, "grad_norm": 18.93499755859375, "learning_rate": 0.00014773131648276758, "loss": 2.0549, "step": 519 }, { "epoch": 0.1711229946524064, "grad_norm": 15.950383186340332, "learning_rate": 0.00014754593388242117, "loss": 1.9985, "step": 520 }, { "epoch": 0.17145207733443027, "grad_norm": 17.711463928222656, "learning_rate": 0.0001473603399134948, "loss": 2.1116, "step": 521 }, { "epoch": 0.17178116001645413, "grad_norm": 19.4927978515625, "learning_rate": 0.0001471745354010586, "loss": 2.1417, "step": 522 }, { "epoch": 0.172110242698478, "grad_norm": 20.421518325805664, "learning_rate": 0.00014698852117111884, "loss": 2.2147, "step": 523 }, { "epoch": 0.17243932538050186, "grad_norm": 18.737533569335938, "learning_rate": 0.000146802298050614, "loss": 2.0713, "step": 524 }, { "epoch": 0.17276840806252572, "grad_norm": 22.72432518005371, "learning_rate": 0.0001466158668674112, "loss": 2.1376, "step": 525 }, { "epoch": 0.17309749074454958, "grad_norm": 20.022228240966797, "learning_rate": 0.00014642922845030257, "loss": 2.0706, "step": 526 }, { "epoch": 0.17342657342657342, "grad_norm": 20.431339263916016, "learning_rate": 0.0001462423836290015, "loss": 2.0171, "step": 527 }, { "epoch": 0.17375565610859728, "grad_norm": 19.09589195251465, "learning_rate": 0.00014605533323413887, "loss": 2.2592, "step": 528 }, { "epoch": 0.17408473879062114, "grad_norm": 24.208362579345703, "learning_rate": 0.00014586807809725962, "loss": 2.2802, "step": 529 }, { "epoch": 0.174413821472645, "grad_norm": 21.66008758544922, "learning_rate": 0.00014568061905081875, "loss": 2.2369, "step": 530 }, { "epoch": 0.17474290415466887, "grad_norm": 24.156883239746094, "learning_rate": 0.00014549295692817778, "loss": 2.24, "step": 531 }, { "epoch": 0.17507198683669273, "grad_norm": 25.639503479003906, "learning_rate": 0.00014530509256360102, "loss": 2.4159, "step": 532 }, { "epoch": 0.1754010695187166, "grad_norm": 24.354341506958008, "learning_rate": 0.00014511702679225193, "loss": 2.2112, "step": 533 }, { "epoch": 0.17573015220074042, "grad_norm": 30.67277717590332, "learning_rate": 0.0001449287604501893, "loss": 2.377, "step": 534 }, { "epoch": 0.1760592348827643, "grad_norm": 25.33722686767578, "learning_rate": 0.00014474029437436348, "loss": 2.2305, "step": 535 }, { "epoch": 0.17638831756478815, "grad_norm": 28.127777099609375, "learning_rate": 0.00014455162940261285, "loss": 2.406, "step": 536 }, { "epoch": 0.176717400246812, "grad_norm": 28.615427017211914, "learning_rate": 0.0001443627663736599, "loss": 2.2733, "step": 537 }, { "epoch": 0.17704648292883587, "grad_norm": 28.554399490356445, "learning_rate": 0.00014417370612710778, "loss": 2.3758, "step": 538 }, { "epoch": 0.17737556561085974, "grad_norm": 29.043025970458984, "learning_rate": 0.00014398444950343623, "loss": 2.1905, "step": 539 }, { "epoch": 0.1777046482928836, "grad_norm": 36.561988830566406, "learning_rate": 0.00014379499734399798, "loss": 2.3668, "step": 540 }, { "epoch": 0.17803373097490743, "grad_norm": 38.97964859008789, "learning_rate": 0.0001436053504910151, "loss": 2.2409, "step": 541 }, { "epoch": 0.1783628136569313, "grad_norm": 34.46533203125, "learning_rate": 0.0001434155097875752, "loss": 2.2668, "step": 542 }, { "epoch": 0.17869189633895516, "grad_norm": 47.838592529296875, "learning_rate": 0.00014322547607762762, "loss": 2.5153, "step": 543 }, { "epoch": 0.17902097902097902, "grad_norm": 35.19904327392578, "learning_rate": 0.0001430352502059797, "loss": 2.3537, "step": 544 }, { "epoch": 0.17935006170300288, "grad_norm": 43.838836669921875, "learning_rate": 0.0001428448330182931, "loss": 2.5353, "step": 545 }, { "epoch": 0.17967914438502675, "grad_norm": 42.94895553588867, "learning_rate": 0.00014265422536107993, "loss": 2.4404, "step": 546 }, { "epoch": 0.1800082270670506, "grad_norm": 47.776153564453125, "learning_rate": 0.00014246342808169914, "loss": 2.6892, "step": 547 }, { "epoch": 0.18033730974907444, "grad_norm": 96.93193054199219, "learning_rate": 0.00014227244202835257, "loss": 2.7044, "step": 548 }, { "epoch": 0.1806663924310983, "grad_norm": 62.383399963378906, "learning_rate": 0.0001420812680500813, "loss": 2.9085, "step": 549 }, { "epoch": 0.18099547511312217, "grad_norm": 90.59630584716797, "learning_rate": 0.00014188990699676184, "loss": 3.0649, "step": 550 }, { "epoch": 0.18132455779514603, "grad_norm": 13.641534805297852, "learning_rate": 0.00014169835971910238, "loss": 1.9386, "step": 551 }, { "epoch": 0.1816536404771699, "grad_norm": 15.527789115905762, "learning_rate": 0.0001415066270686389, "loss": 1.9512, "step": 552 }, { "epoch": 0.18198272315919375, "grad_norm": 16.21123695373535, "learning_rate": 0.00014131470989773158, "loss": 2.0653, "step": 553 }, { "epoch": 0.18231180584121762, "grad_norm": 13.681528091430664, "learning_rate": 0.0001411226090595608, "loss": 2.0078, "step": 554 }, { "epoch": 0.18264088852324145, "grad_norm": 12.626108169555664, "learning_rate": 0.00014093032540812348, "loss": 1.9582, "step": 555 }, { "epoch": 0.1829699712052653, "grad_norm": 11.949996948242188, "learning_rate": 0.0001407378597982293, "loss": 1.9566, "step": 556 }, { "epoch": 0.18329905388728918, "grad_norm": 11.09714126586914, "learning_rate": 0.00014054521308549673, "loss": 1.9751, "step": 557 }, { "epoch": 0.18362813656931304, "grad_norm": 12.219293594360352, "learning_rate": 0.0001403523861263495, "loss": 2.0179, "step": 558 }, { "epoch": 0.1839572192513369, "grad_norm": 11.462242126464844, "learning_rate": 0.00014015937977801256, "loss": 1.9827, "step": 559 }, { "epoch": 0.18428630193336076, "grad_norm": 13.0610933303833, "learning_rate": 0.00013996619489850822, "loss": 1.9774, "step": 560 }, { "epoch": 0.18461538461538463, "grad_norm": 12.64937686920166, "learning_rate": 0.00013977283234665273, "loss": 2.1051, "step": 561 }, { "epoch": 0.18494446729740846, "grad_norm": 12.783965110778809, "learning_rate": 0.00013957929298205195, "loss": 2.0033, "step": 562 }, { "epoch": 0.18527354997943232, "grad_norm": 12.272329330444336, "learning_rate": 0.00013938557766509792, "loss": 2.0593, "step": 563 }, { "epoch": 0.18560263266145618, "grad_norm": 15.710333824157715, "learning_rate": 0.0001391916872569648, "loss": 2.1094, "step": 564 }, { "epoch": 0.18593171534348005, "grad_norm": 14.490073204040527, "learning_rate": 0.00013899762261960518, "loss": 2.1097, "step": 565 }, { "epoch": 0.1862607980255039, "grad_norm": 17.13546371459961, "learning_rate": 0.0001388033846157462, "loss": 2.0658, "step": 566 }, { "epoch": 0.18658988070752777, "grad_norm": 16.9393253326416, "learning_rate": 0.0001386089741088857, "loss": 1.9681, "step": 567 }, { "epoch": 0.18691896338955163, "grad_norm": 15.681133270263672, "learning_rate": 0.00013841439196328836, "loss": 2.1139, "step": 568 }, { "epoch": 0.1872480460715755, "grad_norm": 17.12744140625, "learning_rate": 0.00013821963904398193, "loss": 2.1178, "step": 569 }, { "epoch": 0.18757712875359933, "grad_norm": 15.954170227050781, "learning_rate": 0.00013802471621675338, "loss": 2.096, "step": 570 }, { "epoch": 0.1879062114356232, "grad_norm": 19.09977149963379, "learning_rate": 0.00013782962434814492, "loss": 2.0819, "step": 571 }, { "epoch": 0.18823529411764706, "grad_norm": 16.83810806274414, "learning_rate": 0.00013763436430545034, "loss": 2.0546, "step": 572 }, { "epoch": 0.18856437679967092, "grad_norm": 18.745885848999023, "learning_rate": 0.00013743893695671096, "loss": 2.1512, "step": 573 }, { "epoch": 0.18889345948169478, "grad_norm": 20.046236038208008, "learning_rate": 0.00013724334317071198, "loss": 2.0934, "step": 574 }, { "epoch": 0.18922254216371864, "grad_norm": 19.864009857177734, "learning_rate": 0.00013704758381697844, "loss": 2.1073, "step": 575 }, { "epoch": 0.1895516248457425, "grad_norm": 21.05132484436035, "learning_rate": 0.00013685165976577146, "loss": 2.26, "step": 576 }, { "epoch": 0.18988070752776634, "grad_norm": 20.978456497192383, "learning_rate": 0.0001366555718880843, "loss": 2.1846, "step": 577 }, { "epoch": 0.1902097902097902, "grad_norm": 22.513282775878906, "learning_rate": 0.00013645932105563844, "loss": 2.0587, "step": 578 }, { "epoch": 0.19053887289181407, "grad_norm": 21.265405654907227, "learning_rate": 0.00013626290814088005, "loss": 2.1256, "step": 579 }, { "epoch": 0.19086795557383793, "grad_norm": 26.82299041748047, "learning_rate": 0.00013606633401697557, "loss": 2.283, "step": 580 }, { "epoch": 0.1911970382558618, "grad_norm": 27.060853958129883, "learning_rate": 0.00013586959955780824, "loss": 2.1478, "step": 581 }, { "epoch": 0.19152612093788565, "grad_norm": 21.665401458740234, "learning_rate": 0.00013567270563797398, "loss": 2.0667, "step": 582 }, { "epoch": 0.19185520361990951, "grad_norm": 24.97159767150879, "learning_rate": 0.00013547565313277776, "loss": 2.1521, "step": 583 }, { "epoch": 0.19218428630193335, "grad_norm": 32.8044319152832, "learning_rate": 0.00013527844291822948, "loss": 2.0817, "step": 584 }, { "epoch": 0.1925133689839572, "grad_norm": 27.70381736755371, "learning_rate": 0.0001350810758710401, "loss": 2.3528, "step": 585 }, { "epoch": 0.19284245166598107, "grad_norm": 25.7266845703125, "learning_rate": 0.00013488355286861783, "loss": 2.2124, "step": 586 }, { "epoch": 0.19317153434800494, "grad_norm": 26.19965934753418, "learning_rate": 0.0001346858747890642, "loss": 2.4083, "step": 587 }, { "epoch": 0.1935006170300288, "grad_norm": 35.831382751464844, "learning_rate": 0.00013448804251117003, "loss": 2.3275, "step": 588 }, { "epoch": 0.19382969971205266, "grad_norm": 26.360593795776367, "learning_rate": 0.0001342900569144119, "loss": 2.3216, "step": 589 }, { "epoch": 0.19415878239407652, "grad_norm": 34.13278579711914, "learning_rate": 0.0001340919188789477, "loss": 2.3619, "step": 590 }, { "epoch": 0.19448786507610036, "grad_norm": 32.32003402709961, "learning_rate": 0.00013389362928561317, "loss": 2.3538, "step": 591 }, { "epoch": 0.19481694775812422, "grad_norm": 32.8123664855957, "learning_rate": 0.00013369518901591772, "loss": 2.3024, "step": 592 }, { "epoch": 0.19514603044014808, "grad_norm": 36.42515563964844, "learning_rate": 0.00013349659895204067, "loss": 2.4452, "step": 593 }, { "epoch": 0.19547511312217195, "grad_norm": 28.456432342529297, "learning_rate": 0.0001332978599768272, "loss": 2.15, "step": 594 }, { "epoch": 0.1958041958041958, "grad_norm": 45.40401077270508, "learning_rate": 0.00013309897297378455, "loss": 2.5472, "step": 595 }, { "epoch": 0.19613327848621967, "grad_norm": 35.83034133911133, "learning_rate": 0.00013289993882707797, "loss": 2.44, "step": 596 }, { "epoch": 0.19646236116824353, "grad_norm": 46.86837387084961, "learning_rate": 0.00013270075842152678, "loss": 2.3963, "step": 597 }, { "epoch": 0.19679144385026737, "grad_norm": 40.551326751708984, "learning_rate": 0.00013250143264260074, "loss": 2.4064, "step": 598 }, { "epoch": 0.19712052653229123, "grad_norm": 50.9204216003418, "learning_rate": 0.0001323019623764156, "loss": 2.6763, "step": 599 }, { "epoch": 0.1974496092143151, "grad_norm": 80.23843383789062, "learning_rate": 0.00013210234850972964, "loss": 2.8969, "step": 600 }, { "epoch": 0.19777869189633895, "grad_norm": 13.052701950073242, "learning_rate": 0.0001319025919299394, "loss": 2.0314, "step": 601 }, { "epoch": 0.19810777457836282, "grad_norm": 17.274169921875, "learning_rate": 0.00013170269352507597, "loss": 1.9936, "step": 602 }, { "epoch": 0.19843685726038668, "grad_norm": 14.206451416015625, "learning_rate": 0.0001315026541838008, "loss": 2.0876, "step": 603 }, { "epoch": 0.19876593994241054, "grad_norm": 13.755434036254883, "learning_rate": 0.00013130247479540202, "loss": 2.1398, "step": 604 }, { "epoch": 0.19909502262443438, "grad_norm": 12.4016752243042, "learning_rate": 0.00013110215624979025, "loss": 1.9961, "step": 605 }, { "epoch": 0.19942410530645824, "grad_norm": 11.712889671325684, "learning_rate": 0.00013090169943749476, "loss": 2.0138, "step": 606 }, { "epoch": 0.1997531879884821, "grad_norm": 12.178582191467285, "learning_rate": 0.00013070110524965954, "loss": 2.0096, "step": 607 }, { "epoch": 0.20008227067050596, "grad_norm": 12.263139724731445, "learning_rate": 0.00013050037457803924, "loss": 2.0671, "step": 608 }, { "epoch": 0.20041135335252983, "grad_norm": 13.043238639831543, "learning_rate": 0.0001302995083149953, "loss": 1.9947, "step": 609 }, { "epoch": 0.2007404360345537, "grad_norm": 13.848551750183105, "learning_rate": 0.0001300985073534919, "loss": 1.9089, "step": 610 }, { "epoch": 0.20106951871657755, "grad_norm": 12.28908920288086, "learning_rate": 0.00012989737258709203, "loss": 1.9493, "step": 611 }, { "epoch": 0.2013986013986014, "grad_norm": 20.78250503540039, "learning_rate": 0.00012969610490995358, "loss": 2.038, "step": 612 }, { "epoch": 0.20172768408062525, "grad_norm": 13.323689460754395, "learning_rate": 0.00012949470521682528, "loss": 2.0706, "step": 613 }, { "epoch": 0.2020567667626491, "grad_norm": 13.389912605285645, "learning_rate": 0.0001292931744030427, "loss": 2.0052, "step": 614 }, { "epoch": 0.20238584944467297, "grad_norm": 13.891485214233398, "learning_rate": 0.0001290915133645243, "loss": 1.9795, "step": 615 }, { "epoch": 0.20271493212669683, "grad_norm": 14.323101043701172, "learning_rate": 0.00012888972299776754, "loss": 2.0917, "step": 616 }, { "epoch": 0.2030440148087207, "grad_norm": 14.606636047363281, "learning_rate": 0.00012868780419984482, "loss": 2.1993, "step": 617 }, { "epoch": 0.20337309749074456, "grad_norm": 16.17580223083496, "learning_rate": 0.00012848575786839943, "loss": 2.0651, "step": 618 }, { "epoch": 0.20370218017276842, "grad_norm": 16.932405471801758, "learning_rate": 0.0001282835849016416, "loss": 2.0763, "step": 619 }, { "epoch": 0.20403126285479226, "grad_norm": 17.19739532470703, "learning_rate": 0.00012808128619834461, "loss": 1.9914, "step": 620 }, { "epoch": 0.20436034553681612, "grad_norm": 20.44463539123535, "learning_rate": 0.0001278788626578407, "loss": 2.1702, "step": 621 }, { "epoch": 0.20468942821883998, "grad_norm": 16.912687301635742, "learning_rate": 0.00012767631518001698, "loss": 1.9944, "step": 622 }, { "epoch": 0.20501851090086384, "grad_norm": 22.962688446044922, "learning_rate": 0.00012747364466531163, "loss": 2.1902, "step": 623 }, { "epoch": 0.2053475935828877, "grad_norm": 21.148052215576172, "learning_rate": 0.00012727085201470973, "loss": 2.1135, "step": 624 }, { "epoch": 0.20567667626491157, "grad_norm": 19.16214370727539, "learning_rate": 0.00012706793812973941, "loss": 2.2242, "step": 625 }, { "epoch": 0.20600575894693543, "grad_norm": 20.022171020507812, "learning_rate": 0.0001268649039124677, "loss": 2.0946, "step": 626 }, { "epoch": 0.20633484162895926, "grad_norm": 26.837522506713867, "learning_rate": 0.00012666175026549662, "loss": 2.0879, "step": 627 }, { "epoch": 0.20666392431098313, "grad_norm": 19.869239807128906, "learning_rate": 0.000126458478091959, "loss": 2.2091, "step": 628 }, { "epoch": 0.206993006993007, "grad_norm": 21.796751022338867, "learning_rate": 0.00012625508829551473, "loss": 2.134, "step": 629 }, { "epoch": 0.20732208967503085, "grad_norm": 22.81996726989746, "learning_rate": 0.00012605158178034654, "loss": 2.1529, "step": 630 }, { "epoch": 0.20765117235705471, "grad_norm": 41.82063674926758, "learning_rate": 0.00012584795945115603, "loss": 2.2019, "step": 631 }, { "epoch": 0.20798025503907858, "grad_norm": 24.8160457611084, "learning_rate": 0.0001256442222131597, "loss": 2.1741, "step": 632 }, { "epoch": 0.20830933772110244, "grad_norm": 26.7482852935791, "learning_rate": 0.0001254403709720848, "loss": 2.075, "step": 633 }, { "epoch": 0.20863842040312627, "grad_norm": 23.513917922973633, "learning_rate": 0.0001252364066341655, "loss": 2.2156, "step": 634 }, { "epoch": 0.20896750308515014, "grad_norm": 36.46263122558594, "learning_rate": 0.00012503233010613865, "loss": 2.288, "step": 635 }, { "epoch": 0.209296585767174, "grad_norm": 33.37617111206055, "learning_rate": 0.00012482814229523997, "loss": 2.2255, "step": 636 }, { "epoch": 0.20962566844919786, "grad_norm": 27.558984756469727, "learning_rate": 0.00012462384410919975, "loss": 2.3285, "step": 637 }, { "epoch": 0.20995475113122172, "grad_norm": 33.055416107177734, "learning_rate": 0.00012441943645623903, "loss": 2.2938, "step": 638 }, { "epoch": 0.21028383381324559, "grad_norm": 32.11117172241211, "learning_rate": 0.00012421492024506555, "loss": 2.2859, "step": 639 }, { "epoch": 0.21061291649526945, "grad_norm": 28.522022247314453, "learning_rate": 0.00012401029638486953, "loss": 2.2071, "step": 640 }, { "epoch": 0.21094199917729328, "grad_norm": 29.912519454956055, "learning_rate": 0.0001238055657853198, "loss": 2.1902, "step": 641 }, { "epoch": 0.21127108185931714, "grad_norm": 38.46026611328125, "learning_rate": 0.00012360072935655982, "loss": 2.5371, "step": 642 }, { "epoch": 0.211600164541341, "grad_norm": 31.007673263549805, "learning_rate": 0.00012339578800920332, "loss": 2.113, "step": 643 }, { "epoch": 0.21192924722336487, "grad_norm": 42.14206314086914, "learning_rate": 0.00012319074265433063, "loss": 2.6329, "step": 644 }, { "epoch": 0.21225832990538873, "grad_norm": 43.48518371582031, "learning_rate": 0.00012298559420348437, "loss": 2.6028, "step": 645 }, { "epoch": 0.2125874125874126, "grad_norm": 41.29949188232422, "learning_rate": 0.00012278034356866545, "loss": 2.3886, "step": 646 }, { "epoch": 0.21291649526943646, "grad_norm": 46.199947357177734, "learning_rate": 0.00012257499166232907, "loss": 2.556, "step": 647 }, { "epoch": 0.2132455779514603, "grad_norm": 58.474090576171875, "learning_rate": 0.0001223695393973807, "loss": 2.7324, "step": 648 }, { "epoch": 0.21357466063348415, "grad_norm": 56.12749099731445, "learning_rate": 0.0001221639876871719, "loss": 2.6067, "step": 649 }, { "epoch": 0.21390374331550802, "grad_norm": 64.321533203125, "learning_rate": 0.0001219583374454963, "loss": 2.8009, "step": 650 }, { "epoch": 0.21423282599753188, "grad_norm": 13.228840827941895, "learning_rate": 0.00012175258958658564, "loss": 1.9767, "step": 651 }, { "epoch": 0.21456190867955574, "grad_norm": 12.878254890441895, "learning_rate": 0.00012154674502510555, "loss": 1.8694, "step": 652 }, { "epoch": 0.2148909913615796, "grad_norm": 14.516738891601562, "learning_rate": 0.00012134080467615159, "loss": 2.076, "step": 653 }, { "epoch": 0.21522007404360347, "grad_norm": 12.499964714050293, "learning_rate": 0.00012113476945524513, "loss": 2.0092, "step": 654 }, { "epoch": 0.2155491567256273, "grad_norm": 12.54210090637207, "learning_rate": 0.00012092864027832933, "loss": 1.9639, "step": 655 }, { "epoch": 0.21587823940765116, "grad_norm": 12.161681175231934, "learning_rate": 0.000120722418061765, "loss": 2.0739, "step": 656 }, { "epoch": 0.21620732208967502, "grad_norm": 13.306363105773926, "learning_rate": 0.0001205161037223266, "loss": 2.1012, "step": 657 }, { "epoch": 0.2165364047716989, "grad_norm": 13.940315246582031, "learning_rate": 0.00012030969817719808, "loss": 1.9406, "step": 658 }, { "epoch": 0.21686548745372275, "grad_norm": 12.362981796264648, "learning_rate": 0.00012010320234396894, "loss": 2.0337, "step": 659 }, { "epoch": 0.2171945701357466, "grad_norm": 13.084452629089355, "learning_rate": 0.00011989661714062999, "loss": 1.9527, "step": 660 }, { "epoch": 0.21752365281777047, "grad_norm": 13.463068962097168, "learning_rate": 0.0001196899434855693, "loss": 1.992, "step": 661 }, { "epoch": 0.21785273549979434, "grad_norm": 12.966229438781738, "learning_rate": 0.00011948318229756827, "loss": 2.0754, "step": 662 }, { "epoch": 0.21818181818181817, "grad_norm": 16.607391357421875, "learning_rate": 0.00011927633449579735, "loss": 2.0541, "step": 663 }, { "epoch": 0.21851090086384203, "grad_norm": 14.404459953308105, "learning_rate": 0.0001190694009998121, "loss": 2.053, "step": 664 }, { "epoch": 0.2188399835458659, "grad_norm": 15.27001667022705, "learning_rate": 0.00011886238272954897, "loss": 2.0786, "step": 665 }, { "epoch": 0.21916906622788976, "grad_norm": 14.320028305053711, "learning_rate": 0.00011865528060532127, "loss": 2.1257, "step": 666 }, { "epoch": 0.21949814890991362, "grad_norm": 14.5005464553833, "learning_rate": 0.0001184480955478152, "loss": 2.0426, "step": 667 }, { "epoch": 0.21982723159193748, "grad_norm": 16.71283531188965, "learning_rate": 0.00011824082847808558, "loss": 2.0368, "step": 668 }, { "epoch": 0.22015631427396135, "grad_norm": 14.973550796508789, "learning_rate": 0.00011803348031755179, "loss": 2.0396, "step": 669 }, { "epoch": 0.22048539695598518, "grad_norm": 16.722694396972656, "learning_rate": 0.0001178260519879937, "loss": 2.0596, "step": 670 }, { "epoch": 0.22081447963800904, "grad_norm": 18.121156692504883, "learning_rate": 0.00011761854441154767, "loss": 2.0335, "step": 671 }, { "epoch": 0.2211435623200329, "grad_norm": 16.091184616088867, "learning_rate": 0.00011741095851070228, "loss": 2.0742, "step": 672 }, { "epoch": 0.22147264500205677, "grad_norm": 18.889684677124023, "learning_rate": 0.00011720329520829429, "loss": 2.0445, "step": 673 }, { "epoch": 0.22180172768408063, "grad_norm": 18.530851364135742, "learning_rate": 0.0001169955554275046, "loss": 2.061, "step": 674 }, { "epoch": 0.2221308103661045, "grad_norm": 18.098955154418945, "learning_rate": 0.0001167877400918541, "loss": 2.0886, "step": 675 }, { "epoch": 0.22245989304812835, "grad_norm": 18.40189552307129, "learning_rate": 0.00011657985012519952, "loss": 2.0826, "step": 676 }, { "epoch": 0.2227889757301522, "grad_norm": 19.61872673034668, "learning_rate": 0.00011637188645172944, "loss": 2.0708, "step": 677 }, { "epoch": 0.22311805841217605, "grad_norm": 19.37993812561035, "learning_rate": 0.00011616384999596006, "loss": 2.1604, "step": 678 }, { "epoch": 0.2234471410941999, "grad_norm": 20.826814651489258, "learning_rate": 0.00011595574168273111, "loss": 2.2506, "step": 679 }, { "epoch": 0.22377622377622378, "grad_norm": 23.484943389892578, "learning_rate": 0.0001157475624372018, "loss": 2.2822, "step": 680 }, { "epoch": 0.22410530645824764, "grad_norm": 21.028526306152344, "learning_rate": 0.0001155393131848467, "loss": 2.1224, "step": 681 }, { "epoch": 0.2244343891402715, "grad_norm": 24.73885154724121, "learning_rate": 0.00011533099485145155, "loss": 2.2279, "step": 682 }, { "epoch": 0.22476347182229536, "grad_norm": 25.653488159179688, "learning_rate": 0.00011512260836310924, "loss": 2.2368, "step": 683 }, { "epoch": 0.2250925545043192, "grad_norm": 23.84608268737793, "learning_rate": 0.00011491415464621562, "loss": 2.1462, "step": 684 }, { "epoch": 0.22542163718634306, "grad_norm": 26.653785705566406, "learning_rate": 0.00011470563462746541, "loss": 2.213, "step": 685 }, { "epoch": 0.22575071986836692, "grad_norm": 27.0756778717041, "learning_rate": 0.00011449704923384812, "loss": 2.28, "step": 686 }, { "epoch": 0.22607980255039078, "grad_norm": 37.719032287597656, "learning_rate": 0.00011428839939264382, "loss": 2.1781, "step": 687 }, { "epoch": 0.22640888523241465, "grad_norm": 31.60564613342285, "learning_rate": 0.0001140796860314191, "loss": 2.3294, "step": 688 }, { "epoch": 0.2267379679144385, "grad_norm": 27.467967987060547, "learning_rate": 0.00011387091007802297, "loss": 2.2626, "step": 689 }, { "epoch": 0.22706705059646237, "grad_norm": 33.567909240722656, "learning_rate": 0.0001136620724605827, "loss": 2.3544, "step": 690 }, { "epoch": 0.2273961332784862, "grad_norm": 36.141746520996094, "learning_rate": 0.00011345317410749964, "loss": 2.1965, "step": 691 }, { "epoch": 0.22772521596051007, "grad_norm": 36.09406280517578, "learning_rate": 0.00011324421594744516, "loss": 2.4381, "step": 692 }, { "epoch": 0.22805429864253393, "grad_norm": 36.952274322509766, "learning_rate": 0.00011303519890935656, "loss": 2.349, "step": 693 }, { "epoch": 0.2283833813245578, "grad_norm": 40.29912185668945, "learning_rate": 0.00011282612392243286, "loss": 2.53, "step": 694 }, { "epoch": 0.22871246400658166, "grad_norm": 41.57139587402344, "learning_rate": 0.00011261699191613066, "loss": 2.1912, "step": 695 }, { "epoch": 0.22904154668860552, "grad_norm": 35.29206085205078, "learning_rate": 0.00011240780382016005, "loss": 2.5422, "step": 696 }, { "epoch": 0.22937062937062938, "grad_norm": 52.51593780517578, "learning_rate": 0.00011219856056448051, "loss": 2.4349, "step": 697 }, { "epoch": 0.22969971205265322, "grad_norm": 57.522212982177734, "learning_rate": 0.00011198926307929664, "loss": 2.6504, "step": 698 }, { "epoch": 0.23002879473467708, "grad_norm": 64.0078353881836, "learning_rate": 0.00011177991229505431, "loss": 2.6034, "step": 699 }, { "epoch": 0.23035787741670094, "grad_norm": 72.65283966064453, "learning_rate": 0.00011157050914243614, "loss": 2.7078, "step": 700 }, { "epoch": 0.2306869600987248, "grad_norm": 10.07480525970459, "learning_rate": 0.00011136105455235766, "loss": 1.9436, "step": 701 }, { "epoch": 0.23101604278074866, "grad_norm": 11.241374969482422, "learning_rate": 0.00011115154945596305, "loss": 2.0251, "step": 702 }, { "epoch": 0.23134512546277253, "grad_norm": 10.862390518188477, "learning_rate": 0.00011094199478462095, "loss": 1.8279, "step": 703 }, { "epoch": 0.2316742081447964, "grad_norm": 11.869223594665527, "learning_rate": 0.00011073239146992054, "loss": 1.9652, "step": 704 }, { "epoch": 0.23200329082682025, "grad_norm": 11.989618301391602, "learning_rate": 0.00011052274044366711, "loss": 1.9758, "step": 705 }, { "epoch": 0.2323323735088441, "grad_norm": 12.00688648223877, "learning_rate": 0.00011031304263787812, "loss": 2.0687, "step": 706 }, { "epoch": 0.23266145619086795, "grad_norm": 11.302569389343262, "learning_rate": 0.00011010329898477891, "loss": 1.9037, "step": 707 }, { "epoch": 0.2329905388728918, "grad_norm": 12.563880920410156, "learning_rate": 0.0001098935104167988, "loss": 2.1042, "step": 708 }, { "epoch": 0.23331962155491567, "grad_norm": 13.164795875549316, "learning_rate": 0.00010968367786656663, "loss": 2.1303, "step": 709 }, { "epoch": 0.23364870423693954, "grad_norm": 13.53084659576416, "learning_rate": 0.00010947380226690684, "loss": 2.1355, "step": 710 }, { "epoch": 0.2339777869189634, "grad_norm": 13.575818061828613, "learning_rate": 0.00010926388455083522, "loss": 2.0146, "step": 711 }, { "epoch": 0.23430686960098726, "grad_norm": 15.078895568847656, "learning_rate": 0.00010905392565155477, "loss": 2.033, "step": 712 }, { "epoch": 0.2346359522830111, "grad_norm": 12.624197006225586, "learning_rate": 0.00010884392650245165, "loss": 2.0124, "step": 713 }, { "epoch": 0.23496503496503496, "grad_norm": 15.29921817779541, "learning_rate": 0.00010863388803709089, "loss": 1.9558, "step": 714 }, { "epoch": 0.23529411764705882, "grad_norm": 15.582419395446777, "learning_rate": 0.00010842381118921232, "loss": 2.0249, "step": 715 }, { "epoch": 0.23562320032908268, "grad_norm": 18.057111740112305, "learning_rate": 0.00010821369689272638, "loss": 2.1225, "step": 716 }, { "epoch": 0.23595228301110655, "grad_norm": 14.280094146728516, "learning_rate": 0.00010800354608171003, "loss": 2.0183, "step": 717 }, { "epoch": 0.2362813656931304, "grad_norm": 15.614278793334961, "learning_rate": 0.00010779335969040252, "loss": 2.2212, "step": 718 }, { "epoch": 0.23661044837515427, "grad_norm": 25.103755950927734, "learning_rate": 0.00010758313865320134, "loss": 2.1214, "step": 719 }, { "epoch": 0.2369395310571781, "grad_norm": 16.18218421936035, "learning_rate": 0.00010737288390465792, "loss": 2.0215, "step": 720 }, { "epoch": 0.23726861373920197, "grad_norm": 17.047441482543945, "learning_rate": 0.00010716259637947357, "loss": 2.1172, "step": 721 }, { "epoch": 0.23759769642122583, "grad_norm": 18.27086067199707, "learning_rate": 0.00010695227701249537, "loss": 2.0184, "step": 722 }, { "epoch": 0.2379267791032497, "grad_norm": 16.60795021057129, "learning_rate": 0.00010674192673871191, "loss": 2.132, "step": 723 }, { "epoch": 0.23825586178527355, "grad_norm": 15.888787269592285, "learning_rate": 0.00010653154649324917, "loss": 1.9284, "step": 724 }, { "epoch": 0.23858494446729742, "grad_norm": 18.759923934936523, "learning_rate": 0.00010632113721136636, "loss": 2.1032, "step": 725 }, { "epoch": 0.23891402714932128, "grad_norm": 18.31932830810547, "learning_rate": 0.00010611069982845183, "loss": 2.2255, "step": 726 }, { "epoch": 0.2392431098313451, "grad_norm": 22.61402702331543, "learning_rate": 0.00010590023528001884, "loss": 2.1991, "step": 727 }, { "epoch": 0.23957219251336898, "grad_norm": 22.977676391601562, "learning_rate": 0.00010568974450170139, "loss": 2.0856, "step": 728 }, { "epoch": 0.23990127519539284, "grad_norm": 20.042518615722656, "learning_rate": 0.00010547922842925008, "loss": 2.1873, "step": 729 }, { "epoch": 0.2402303578774167, "grad_norm": 21.759876251220703, "learning_rate": 0.00010526868799852796, "loss": 2.1242, "step": 730 }, { "epoch": 0.24055944055944056, "grad_norm": 22.52090072631836, "learning_rate": 0.0001050581241455064, "loss": 2.178, "step": 731 }, { "epoch": 0.24088852324146443, "grad_norm": 33.1634407043457, "learning_rate": 0.00010484753780626089, "loss": 2.198, "step": 732 }, { "epoch": 0.2412176059234883, "grad_norm": 24.193359375, "learning_rate": 0.00010463692991696685, "loss": 2.0522, "step": 733 }, { "epoch": 0.24154668860551212, "grad_norm": 28.362548828125, "learning_rate": 0.00010442630141389549, "loss": 2.3115, "step": 734 }, { "epoch": 0.24187577128753598, "grad_norm": 22.501371383666992, "learning_rate": 0.00010421565323340971, "loss": 2.239, "step": 735 }, { "epoch": 0.24220485396955985, "grad_norm": 28.872379302978516, "learning_rate": 0.00010400498631195992, "loss": 2.0856, "step": 736 }, { "epoch": 0.2425339366515837, "grad_norm": 23.524127960205078, "learning_rate": 0.00010379430158607975, "loss": 2.1149, "step": 737 }, { "epoch": 0.24286301933360757, "grad_norm": 30.967195510864258, "learning_rate": 0.000103583599992382, "loss": 2.2761, "step": 738 }, { "epoch": 0.24319210201563143, "grad_norm": 29.7477970123291, "learning_rate": 0.0001033728824675545, "loss": 2.1642, "step": 739 }, { "epoch": 0.2435211846976553, "grad_norm": 28.8867244720459, "learning_rate": 0.0001031621499483559, "loss": 2.2713, "step": 740 }, { "epoch": 0.24385026737967913, "grad_norm": 31.588350296020508, "learning_rate": 0.00010295140337161146, "loss": 2.3268, "step": 741 }, { "epoch": 0.244179350061703, "grad_norm": 41.23060989379883, "learning_rate": 0.00010274064367420897, "loss": 2.554, "step": 742 }, { "epoch": 0.24450843274372686, "grad_norm": 31.445274353027344, "learning_rate": 0.00010252987179309459, "loss": 2.3653, "step": 743 }, { "epoch": 0.24483751542575072, "grad_norm": 39.72010040283203, "learning_rate": 0.00010231908866526851, "loss": 2.6269, "step": 744 }, { "epoch": 0.24516659810777458, "grad_norm": 43.691707611083984, "learning_rate": 0.00010210829522778111, "loss": 2.4433, "step": 745 }, { "epoch": 0.24549568078979844, "grad_norm": 35.66619110107422, "learning_rate": 0.00010189749241772844, "loss": 2.2938, "step": 746 }, { "epoch": 0.2458247634718223, "grad_norm": 44.48923110961914, "learning_rate": 0.00010168668117224825, "loss": 2.4232, "step": 747 }, { "epoch": 0.24615384615384617, "grad_norm": 48.32442092895508, "learning_rate": 0.00010147586242851585, "loss": 2.4149, "step": 748 }, { "epoch": 0.24648292883587, "grad_norm": 67.80316162109375, "learning_rate": 0.00010126503712373982, "loss": 2.561, "step": 749 }, { "epoch": 0.24681201151789386, "grad_norm": 77.36808776855469, "learning_rate": 0.00010105420619515798, "loss": 3.0023, "step": 750 }, { "epoch": 0.24681201151789386, "eval_loss": 1.9403769969940186, "eval_runtime": 163.4214, "eval_samples_per_second": 31.318, "eval_steps_per_second": 15.659, "step": 750 }, { "epoch": 0.24714109419991773, "grad_norm": 8.972406387329102, "learning_rate": 0.00010084337058003303, "loss": 1.9459, "step": 751 }, { "epoch": 0.2474701768819416, "grad_norm": 10.900063514709473, "learning_rate": 0.00010063253121564868, "loss": 2.0303, "step": 752 }, { "epoch": 0.24779925956396545, "grad_norm": 10.38176441192627, "learning_rate": 0.00010042168903930514, "loss": 1.9903, "step": 753 }, { "epoch": 0.24812834224598931, "grad_norm": 11.382026672363281, "learning_rate": 0.00010021084498831522, "loss": 2.0121, "step": 754 }, { "epoch": 0.24845742492801318, "grad_norm": 11.841076850891113, "learning_rate": 0.0001, "loss": 1.9599, "step": 755 }, { "epoch": 0.248786507610037, "grad_norm": 11.867929458618164, "learning_rate": 9.97891550116848e-05, "loss": 2.0562, "step": 756 }, { "epoch": 0.24911559029206087, "grad_norm": 21.927106857299805, "learning_rate": 9.957831096069488e-05, "loss": 1.9636, "step": 757 }, { "epoch": 0.24944467297408474, "grad_norm": 12.391194343566895, "learning_rate": 9.936746878435136e-05, "loss": 2.1568, "step": 758 }, { "epoch": 0.2497737556561086, "grad_norm": 12.095949172973633, "learning_rate": 9.915662941996699e-05, "loss": 1.9753, "step": 759 }, { "epoch": 0.25010283833813246, "grad_norm": 13.29311752319336, "learning_rate": 9.894579380484204e-05, "loss": 1.9378, "step": 760 }, { "epoch": 0.2504319210201563, "grad_norm": 12.196897506713867, "learning_rate": 9.873496287626019e-05, "loss": 2.0743, "step": 761 }, { "epoch": 0.2507610037021802, "grad_norm": 14.626516342163086, "learning_rate": 9.852413757148417e-05, "loss": 1.9068, "step": 762 }, { "epoch": 0.25109008638420405, "grad_norm": 15.57929515838623, "learning_rate": 9.831331882775178e-05, "loss": 1.9605, "step": 763 }, { "epoch": 0.2514191690662279, "grad_norm": 15.098057746887207, "learning_rate": 9.81025075822716e-05, "loss": 2.0539, "step": 764 }, { "epoch": 0.2517482517482518, "grad_norm": 14.740304946899414, "learning_rate": 9.789170477221891e-05, "loss": 2.0943, "step": 765 }, { "epoch": 0.2520773344302756, "grad_norm": 14.944255828857422, "learning_rate": 9.76809113347315e-05, "loss": 2.0326, "step": 766 }, { "epoch": 0.25240641711229944, "grad_norm": 14.989266395568848, "learning_rate": 9.747012820690543e-05, "loss": 2.0643, "step": 767 }, { "epoch": 0.2527354997943233, "grad_norm": 15.422737121582031, "learning_rate": 9.725935632579104e-05, "loss": 2.0058, "step": 768 }, { "epoch": 0.25306458247634717, "grad_norm": 16.56816864013672, "learning_rate": 9.704859662838855e-05, "loss": 2.0015, "step": 769 }, { "epoch": 0.25339366515837103, "grad_norm": 15.776151657104492, "learning_rate": 9.683785005164411e-05, "loss": 1.9423, "step": 770 }, { "epoch": 0.2537227478403949, "grad_norm": 17.842538833618164, "learning_rate": 9.662711753244551e-05, "loss": 2.0377, "step": 771 }, { "epoch": 0.25405183052241875, "grad_norm": 16.78591537475586, "learning_rate": 9.641640000761802e-05, "loss": 2.1832, "step": 772 }, { "epoch": 0.2543809132044426, "grad_norm": 17.174484252929688, "learning_rate": 9.620569841392029e-05, "loss": 2.1229, "step": 773 }, { "epoch": 0.2547099958864665, "grad_norm": 17.69537353515625, "learning_rate": 9.59950136880401e-05, "loss": 2.0529, "step": 774 }, { "epoch": 0.25503907856849034, "grad_norm": 17.368465423583984, "learning_rate": 9.57843467665903e-05, "loss": 2.0553, "step": 775 }, { "epoch": 0.2553681612505142, "grad_norm": 23.612733840942383, "learning_rate": 9.557369858610453e-05, "loss": 2.082, "step": 776 }, { "epoch": 0.25569724393253807, "grad_norm": 19.465009689331055, "learning_rate": 9.53630700830332e-05, "loss": 2.0575, "step": 777 }, { "epoch": 0.2560263266145619, "grad_norm": 21.351573944091797, "learning_rate": 9.51524621937391e-05, "loss": 2.1903, "step": 778 }, { "epoch": 0.2563554092965858, "grad_norm": 22.40749740600586, "learning_rate": 9.494187585449358e-05, "loss": 2.0625, "step": 779 }, { "epoch": 0.25668449197860965, "grad_norm": 26.167518615722656, "learning_rate": 9.473131200147205e-05, "loss": 2.2082, "step": 780 }, { "epoch": 0.25701357466063346, "grad_norm": 24.237781524658203, "learning_rate": 9.452077157074994e-05, "loss": 2.2448, "step": 781 }, { "epoch": 0.2573426573426573, "grad_norm": 24.78247833251953, "learning_rate": 9.431025549829862e-05, "loss": 2.2677, "step": 782 }, { "epoch": 0.2576717400246812, "grad_norm": 22.84943962097168, "learning_rate": 9.409976471998118e-05, "loss": 2.1183, "step": 783 }, { "epoch": 0.25800082270670505, "grad_norm": 26.44524574279785, "learning_rate": 9.388930017154819e-05, "loss": 2.2817, "step": 784 }, { "epoch": 0.2583299053887289, "grad_norm": 24.45064926147461, "learning_rate": 9.367886278863366e-05, "loss": 2.2415, "step": 785 }, { "epoch": 0.25865898807075277, "grad_norm": 32.97056579589844, "learning_rate": 9.346845350675088e-05, "loss": 2.3186, "step": 786 }, { "epoch": 0.25898807075277663, "grad_norm": 27.71230125427246, "learning_rate": 9.325807326128814e-05, "loss": 2.1204, "step": 787 }, { "epoch": 0.2593171534348005, "grad_norm": 34.70222854614258, "learning_rate": 9.304772298750463e-05, "loss": 2.1478, "step": 788 }, { "epoch": 0.25964623611682436, "grad_norm": 28.618940353393555, "learning_rate": 9.283740362052642e-05, "loss": 2.223, "step": 789 }, { "epoch": 0.2599753187988482, "grad_norm": 40.13567352294922, "learning_rate": 9.26271160953421e-05, "loss": 2.3385, "step": 790 }, { "epoch": 0.2603044014808721, "grad_norm": 36.20771408081055, "learning_rate": 9.241686134679867e-05, "loss": 2.2689, "step": 791 }, { "epoch": 0.26063348416289595, "grad_norm": 42.00349044799805, "learning_rate": 9.220664030959749e-05, "loss": 2.546, "step": 792 }, { "epoch": 0.2609625668449198, "grad_norm": 34.87019348144531, "learning_rate": 9.199645391828999e-05, "loss": 2.314, "step": 793 }, { "epoch": 0.26129164952694367, "grad_norm": 42.31432342529297, "learning_rate": 9.178630310727365e-05, "loss": 2.4367, "step": 794 }, { "epoch": 0.2616207322089675, "grad_norm": 50.56330108642578, "learning_rate": 9.157618881078772e-05, "loss": 2.39, "step": 795 }, { "epoch": 0.26194981489099134, "grad_norm": 42.20820617675781, "learning_rate": 9.136611196290915e-05, "loss": 2.4738, "step": 796 }, { "epoch": 0.2622788975730152, "grad_norm": 42.01798629760742, "learning_rate": 9.115607349754834e-05, "loss": 2.4593, "step": 797 }, { "epoch": 0.26260798025503906, "grad_norm": 60.381988525390625, "learning_rate": 9.094607434844523e-05, "loss": 2.5749, "step": 798 }, { "epoch": 0.2629370629370629, "grad_norm": 67.22920989990234, "learning_rate": 9.07361154491648e-05, "loss": 2.9682, "step": 799 }, { "epoch": 0.2632661456190868, "grad_norm": 78.32366943359375, "learning_rate": 9.052619773309317e-05, "loss": 2.8467, "step": 800 }, { "epoch": 0.26359522830111065, "grad_norm": 8.615310668945312, "learning_rate": 9.031632213343339e-05, "loss": 1.803, "step": 801 }, { "epoch": 0.2639243109831345, "grad_norm": 10.402542114257812, "learning_rate": 9.01064895832012e-05, "loss": 1.9751, "step": 802 }, { "epoch": 0.2642533936651584, "grad_norm": 10.270675659179688, "learning_rate": 8.98967010152211e-05, "loss": 2.062, "step": 803 }, { "epoch": 0.26458247634718224, "grad_norm": 11.063749313354492, "learning_rate": 8.968695736212193e-05, "loss": 1.9716, "step": 804 }, { "epoch": 0.2649115590292061, "grad_norm": 12.584948539733887, "learning_rate": 8.947725955633294e-05, "loss": 2.0415, "step": 805 }, { "epoch": 0.26524064171122996, "grad_norm": 11.312567710876465, "learning_rate": 8.926760853007946e-05, "loss": 2.0419, "step": 806 }, { "epoch": 0.2655697243932538, "grad_norm": 12.20005989074707, "learning_rate": 8.905800521537905e-05, "loss": 1.9149, "step": 807 }, { "epoch": 0.2658988070752777, "grad_norm": 11.308486938476562, "learning_rate": 8.884845054403699e-05, "loss": 1.9422, "step": 808 }, { "epoch": 0.2662278897573015, "grad_norm": 11.632606506347656, "learning_rate": 8.863894544764236e-05, "loss": 2.0469, "step": 809 }, { "epoch": 0.26655697243932536, "grad_norm": 12.024511337280273, "learning_rate": 8.84294908575639e-05, "loss": 1.924, "step": 810 }, { "epoch": 0.2668860551213492, "grad_norm": 13.50638198852539, "learning_rate": 8.822008770494572e-05, "loss": 2.0112, "step": 811 }, { "epoch": 0.2672151378033731, "grad_norm": 12.49749755859375, "learning_rate": 8.801073692070337e-05, "loss": 2.0824, "step": 812 }, { "epoch": 0.26754422048539694, "grad_norm": 12.698909759521484, "learning_rate": 8.780143943551954e-05, "loss": 2.0246, "step": 813 }, { "epoch": 0.2678733031674208, "grad_norm": 15.196614265441895, "learning_rate": 8.759219617983999e-05, "loss": 2.148, "step": 814 }, { "epoch": 0.26820238584944467, "grad_norm": 14.986250877380371, "learning_rate": 8.738300808386935e-05, "loss": 2.1394, "step": 815 }, { "epoch": 0.26853146853146853, "grad_norm": 13.409408569335938, "learning_rate": 8.717387607756713e-05, "loss": 2.0542, "step": 816 }, { "epoch": 0.2688605512134924, "grad_norm": 16.65415382385254, "learning_rate": 8.696480109064342e-05, "loss": 1.9694, "step": 817 }, { "epoch": 0.26918963389551626, "grad_norm": 15.701547622680664, "learning_rate": 8.675578405255485e-05, "loss": 2.0541, "step": 818 }, { "epoch": 0.2695187165775401, "grad_norm": 15.767630577087402, "learning_rate": 8.654682589250038e-05, "loss": 2.083, "step": 819 }, { "epoch": 0.269847799259564, "grad_norm": 15.328268051147461, "learning_rate": 8.633792753941733e-05, "loss": 1.9698, "step": 820 }, { "epoch": 0.27017688194158784, "grad_norm": 16.38344955444336, "learning_rate": 8.612908992197705e-05, "loss": 2.0026, "step": 821 }, { "epoch": 0.2705059646236117, "grad_norm": 17.049196243286133, "learning_rate": 8.592031396858093e-05, "loss": 2.1503, "step": 822 }, { "epoch": 0.27083504730563557, "grad_norm": 21.292707443237305, "learning_rate": 8.571160060735624e-05, "loss": 2.1275, "step": 823 }, { "epoch": 0.2711641299876594, "grad_norm": 17.869112014770508, "learning_rate": 8.550295076615188e-05, "loss": 2.0375, "step": 824 }, { "epoch": 0.27149321266968324, "grad_norm": 21.55376625061035, "learning_rate": 8.529436537253458e-05, "loss": 2.2385, "step": 825 }, { "epoch": 0.2718222953517071, "grad_norm": 18.948772430419922, "learning_rate": 8.508584535378439e-05, "loss": 2.1376, "step": 826 }, { "epoch": 0.27215137803373096, "grad_norm": 24.21001434326172, "learning_rate": 8.487739163689079e-05, "loss": 2.1148, "step": 827 }, { "epoch": 0.2724804607157548, "grad_norm": 23.726116180419922, "learning_rate": 8.466900514854847e-05, "loss": 2.1794, "step": 828 }, { "epoch": 0.2728095433977787, "grad_norm": 20.92803382873535, "learning_rate": 8.446068681515334e-05, "loss": 2.2013, "step": 829 }, { "epoch": 0.27313862607980255, "grad_norm": 30.320751190185547, "learning_rate": 8.425243756279824e-05, "loss": 2.0499, "step": 830 }, { "epoch": 0.2734677087618264, "grad_norm": 25.24691390991211, "learning_rate": 8.404425831726894e-05, "loss": 2.0836, "step": 831 }, { "epoch": 0.2737967914438503, "grad_norm": 24.881235122680664, "learning_rate": 8.383615000404e-05, "loss": 2.0732, "step": 832 }, { "epoch": 0.27412587412587414, "grad_norm": 28.099210739135742, "learning_rate": 8.362811354827059e-05, "loss": 2.0513, "step": 833 }, { "epoch": 0.274454956807898, "grad_norm": 32.072174072265625, "learning_rate": 8.342014987480047e-05, "loss": 2.0472, "step": 834 }, { "epoch": 0.27478403948992186, "grad_norm": 27.141910552978516, "learning_rate": 8.321225990814591e-05, "loss": 2.1514, "step": 835 }, { "epoch": 0.2751131221719457, "grad_norm": 26.689529418945312, "learning_rate": 8.300444457249543e-05, "loss": 2.2488, "step": 836 }, { "epoch": 0.2754422048539696, "grad_norm": 25.770963668823242, "learning_rate": 8.279670479170573e-05, "loss": 2.0501, "step": 837 }, { "epoch": 0.2757712875359934, "grad_norm": 30.486297607421875, "learning_rate": 8.258904148929775e-05, "loss": 2.2652, "step": 838 }, { "epoch": 0.27610037021801725, "grad_norm": 30.005529403686523, "learning_rate": 8.238145558845235e-05, "loss": 2.2909, "step": 839 }, { "epoch": 0.2764294529000411, "grad_norm": 32.25809097290039, "learning_rate": 8.217394801200631e-05, "loss": 2.1714, "step": 840 }, { "epoch": 0.276758535582065, "grad_norm": 41.15380859375, "learning_rate": 8.196651968244826e-05, "loss": 2.3035, "step": 841 }, { "epoch": 0.27708761826408884, "grad_norm": 58.05665588378906, "learning_rate": 8.175917152191447e-05, "loss": 2.2623, "step": 842 }, { "epoch": 0.2774167009461127, "grad_norm": 37.57664108276367, "learning_rate": 8.15519044521848e-05, "loss": 2.4858, "step": 843 }, { "epoch": 0.27774578362813657, "grad_norm": 50.59832000732422, "learning_rate": 8.134471939467874e-05, "loss": 2.6186, "step": 844 }, { "epoch": 0.27807486631016043, "grad_norm": 40.98662185668945, "learning_rate": 8.113761727045105e-05, "loss": 2.2737, "step": 845 }, { "epoch": 0.2784039489921843, "grad_norm": 50.94327163696289, "learning_rate": 8.093059900018792e-05, "loss": 2.3689, "step": 846 }, { "epoch": 0.27873303167420815, "grad_norm": 46.65205383300781, "learning_rate": 8.072366550420266e-05, "loss": 2.5434, "step": 847 }, { "epoch": 0.279062114356232, "grad_norm": 51.99758529663086, "learning_rate": 8.051681770243175e-05, "loss": 2.4975, "step": 848 }, { "epoch": 0.2793911970382559, "grad_norm": 188.24462890625, "learning_rate": 8.031005651443073e-05, "loss": 2.4672, "step": 849 }, { "epoch": 0.27972027972027974, "grad_norm": 99.69184875488281, "learning_rate": 8.010338285937006e-05, "loss": 3.0825, "step": 850 }, { "epoch": 0.2800493624023036, "grad_norm": 7.931797981262207, "learning_rate": 7.989679765603108e-05, "loss": 2.0178, "step": 851 }, { "epoch": 0.2803784450843274, "grad_norm": 9.671011924743652, "learning_rate": 7.969030182280192e-05, "loss": 1.9835, "step": 852 }, { "epoch": 0.2807075277663513, "grad_norm": 9.853970527648926, "learning_rate": 7.948389627767343e-05, "loss": 2.044, "step": 853 }, { "epoch": 0.28103661044837513, "grad_norm": 11.105704307556152, "learning_rate": 7.927758193823501e-05, "loss": 1.9648, "step": 854 }, { "epoch": 0.281365693130399, "grad_norm": 10.888609886169434, "learning_rate": 7.907135972167069e-05, "loss": 1.9465, "step": 855 }, { "epoch": 0.28169477581242286, "grad_norm": 11.673775672912598, "learning_rate": 7.88652305447549e-05, "loss": 1.9616, "step": 856 }, { "epoch": 0.2820238584944467, "grad_norm": 12.271096229553223, "learning_rate": 7.865919532384844e-05, "loss": 2.0189, "step": 857 }, { "epoch": 0.2823529411764706, "grad_norm": 12.325889587402344, "learning_rate": 7.845325497489449e-05, "loss": 2.0828, "step": 858 }, { "epoch": 0.28268202385849445, "grad_norm": 12.788196563720703, "learning_rate": 7.82474104134144e-05, "loss": 1.9658, "step": 859 }, { "epoch": 0.2830111065405183, "grad_norm": 12.732770919799805, "learning_rate": 7.804166255450373e-05, "loss": 1.9867, "step": 860 }, { "epoch": 0.28334018922254217, "grad_norm": 13.357149124145508, "learning_rate": 7.783601231282812e-05, "loss": 2.011, "step": 861 }, { "epoch": 0.28366927190456603, "grad_norm": 13.59726619720459, "learning_rate": 7.763046060261932e-05, "loss": 1.9952, "step": 862 }, { "epoch": 0.2839983545865899, "grad_norm": 15.345632553100586, "learning_rate": 7.742500833767094e-05, "loss": 2.0161, "step": 863 }, { "epoch": 0.28432743726861376, "grad_norm": 14.791611671447754, "learning_rate": 7.721965643133458e-05, "loss": 1.9784, "step": 864 }, { "epoch": 0.2846565199506376, "grad_norm": 13.62166690826416, "learning_rate": 7.701440579651566e-05, "loss": 1.9325, "step": 865 }, { "epoch": 0.2849856026326615, "grad_norm": 14.333718299865723, "learning_rate": 7.680925734566937e-05, "loss": 2.0661, "step": 866 }, { "epoch": 0.2853146853146853, "grad_norm": 15.228720664978027, "learning_rate": 7.660421199079669e-05, "loss": 2.0601, "step": 867 }, { "epoch": 0.28564376799670915, "grad_norm": 17.964385986328125, "learning_rate": 7.639927064344022e-05, "loss": 2.0936, "step": 868 }, { "epoch": 0.285972850678733, "grad_norm": 17.966461181640625, "learning_rate": 7.619443421468021e-05, "loss": 1.9936, "step": 869 }, { "epoch": 0.2863019333607569, "grad_norm": 15.548964500427246, "learning_rate": 7.598970361513051e-05, "loss": 2.0485, "step": 870 }, { "epoch": 0.28663101604278074, "grad_norm": 20.900808334350586, "learning_rate": 7.578507975493448e-05, "loss": 2.0972, "step": 871 }, { "epoch": 0.2869600987248046, "grad_norm": 16.962770462036133, "learning_rate": 7.558056354376098e-05, "loss": 2.2122, "step": 872 }, { "epoch": 0.28728918140682846, "grad_norm": 19.86479949951172, "learning_rate": 7.537615589080027e-05, "loss": 1.9331, "step": 873 }, { "epoch": 0.2876182640888523, "grad_norm": 18.055803298950195, "learning_rate": 7.517185770476006e-05, "loss": 2.1374, "step": 874 }, { "epoch": 0.2879473467708762, "grad_norm": 20.225400924682617, "learning_rate": 7.496766989386136e-05, "loss": 2.0926, "step": 875 }, { "epoch": 0.28827642945290005, "grad_norm": 20.352558135986328, "learning_rate": 7.476359336583454e-05, "loss": 2.1468, "step": 876 }, { "epoch": 0.2886055121349239, "grad_norm": 28.181440353393555, "learning_rate": 7.455962902791522e-05, "loss": 2.1219, "step": 877 }, { "epoch": 0.2889345948169478, "grad_norm": 18.355966567993164, "learning_rate": 7.435577778684033e-05, "loss": 2.1164, "step": 878 }, { "epoch": 0.28926367749897164, "grad_norm": 21.07115364074707, "learning_rate": 7.415204054884399e-05, "loss": 2.2211, "step": 879 }, { "epoch": 0.2895927601809955, "grad_norm": 19.219778060913086, "learning_rate": 7.394841821965345e-05, "loss": 2.1377, "step": 880 }, { "epoch": 0.2899218428630193, "grad_norm": 22.46569061279297, "learning_rate": 7.374491170448525e-05, "loss": 2.3079, "step": 881 }, { "epoch": 0.29025092554504317, "grad_norm": 19.8582763671875, "learning_rate": 7.3541521908041e-05, "loss": 2.1775, "step": 882 }, { "epoch": 0.29058000822706703, "grad_norm": 25.4183349609375, "learning_rate": 7.33382497345034e-05, "loss": 2.1043, "step": 883 }, { "epoch": 0.2909090909090909, "grad_norm": 23.688602447509766, "learning_rate": 7.313509608753231e-05, "loss": 2.3192, "step": 884 }, { "epoch": 0.29123817359111476, "grad_norm": 23.912355422973633, "learning_rate": 7.293206187026061e-05, "loss": 2.0337, "step": 885 }, { "epoch": 0.2915672562731386, "grad_norm": 28.26968765258789, "learning_rate": 7.27291479852903e-05, "loss": 2.2142, "step": 886 }, { "epoch": 0.2918963389551625, "grad_norm": 25.0449275970459, "learning_rate": 7.252635533468843e-05, "loss": 2.3965, "step": 887 }, { "epoch": 0.29222542163718634, "grad_norm": 27.359485626220703, "learning_rate": 7.232368481998309e-05, "loss": 2.2302, "step": 888 }, { "epoch": 0.2925545043192102, "grad_norm": 39.02745056152344, "learning_rate": 7.212113734215932e-05, "loss": 2.2806, "step": 889 }, { "epoch": 0.29288358700123407, "grad_norm": 30.66390609741211, "learning_rate": 7.191871380165538e-05, "loss": 2.2677, "step": 890 }, { "epoch": 0.29321266968325793, "grad_norm": 27.493064880371094, "learning_rate": 7.17164150983584e-05, "loss": 2.4569, "step": 891 }, { "epoch": 0.2935417523652818, "grad_norm": 28.003833770751953, "learning_rate": 7.151424213160061e-05, "loss": 2.1724, "step": 892 }, { "epoch": 0.29387083504730566, "grad_norm": 40.272884368896484, "learning_rate": 7.131219580015521e-05, "loss": 2.3235, "step": 893 }, { "epoch": 0.2941999177293295, "grad_norm": 34.9064826965332, "learning_rate": 7.11102770022325e-05, "loss": 2.2342, "step": 894 }, { "epoch": 0.2945290004113533, "grad_norm": 41.64081954956055, "learning_rate": 7.090848663547574e-05, "loss": 2.407, "step": 895 }, { "epoch": 0.2948580830933772, "grad_norm": 39.975582122802734, "learning_rate": 7.070682559695736e-05, "loss": 2.7414, "step": 896 }, { "epoch": 0.29518716577540105, "grad_norm": 46.14976501464844, "learning_rate": 7.050529478317476e-05, "loss": 2.2675, "step": 897 }, { "epoch": 0.2955162484574249, "grad_norm": 96.83391571044922, "learning_rate": 7.03038950900464e-05, "loss": 2.7256, "step": 898 }, { "epoch": 0.2958453311394488, "grad_norm": 59.257015228271484, "learning_rate": 7.010262741290798e-05, "loss": 2.9748, "step": 899 }, { "epoch": 0.29617441382147264, "grad_norm": 102.45021057128906, "learning_rate": 6.990149264650814e-05, "loss": 2.9272, "step": 900 }, { "epoch": 0.2965034965034965, "grad_norm": 8.606595039367676, "learning_rate": 6.970049168500474e-05, "loss": 1.8981, "step": 901 }, { "epoch": 0.29683257918552036, "grad_norm": 8.06563949584961, "learning_rate": 6.94996254219608e-05, "loss": 1.8517, "step": 902 }, { "epoch": 0.2971616618675442, "grad_norm": 9.476439476013184, "learning_rate": 6.929889475034048e-05, "loss": 1.9587, "step": 903 }, { "epoch": 0.2974907445495681, "grad_norm": 9.9060697555542, "learning_rate": 6.909830056250527e-05, "loss": 1.889, "step": 904 }, { "epoch": 0.29781982723159195, "grad_norm": 11.162384033203125, "learning_rate": 6.889784375020978e-05, "loss": 1.9698, "step": 905 }, { "epoch": 0.2981489099136158, "grad_norm": 11.544063568115234, "learning_rate": 6.869752520459803e-05, "loss": 1.9447, "step": 906 }, { "epoch": 0.2984779925956397, "grad_norm": 10.749346733093262, "learning_rate": 6.849734581619918e-05, "loss": 1.9685, "step": 907 }, { "epoch": 0.29880707527766354, "grad_norm": 12.048572540283203, "learning_rate": 6.829730647492404e-05, "loss": 1.998, "step": 908 }, { "epoch": 0.2991361579596874, "grad_norm": 13.45187759399414, "learning_rate": 6.80974080700606e-05, "loss": 2.0359, "step": 909 }, { "epoch": 0.2994652406417112, "grad_norm": 12.923592567443848, "learning_rate": 6.789765149027039e-05, "loss": 1.8813, "step": 910 }, { "epoch": 0.29979432332373507, "grad_norm": 14.21700668334961, "learning_rate": 6.769803762358443e-05, "loss": 1.9824, "step": 911 }, { "epoch": 0.30012340600575893, "grad_norm": 14.06299877166748, "learning_rate": 6.749856735739928e-05, "loss": 2.0633, "step": 912 }, { "epoch": 0.3004524886877828, "grad_norm": 14.114026069641113, "learning_rate": 6.729924157847323e-05, "loss": 2.0217, "step": 913 }, { "epoch": 0.30078157136980666, "grad_norm": 14.693693161010742, "learning_rate": 6.710006117292209e-05, "loss": 1.9669, "step": 914 }, { "epoch": 0.3011106540518305, "grad_norm": 16.160860061645508, "learning_rate": 6.690102702621548e-05, "loss": 1.9935, "step": 915 }, { "epoch": 0.3014397367338544, "grad_norm": 15.228388786315918, "learning_rate": 6.670214002317278e-05, "loss": 2.0427, "step": 916 }, { "epoch": 0.30176881941587824, "grad_norm": 16.014341354370117, "learning_rate": 6.650340104795932e-05, "loss": 2.0298, "step": 917 }, { "epoch": 0.3020979020979021, "grad_norm": 15.433086395263672, "learning_rate": 6.630481098408228e-05, "loss": 2.1108, "step": 918 }, { "epoch": 0.30242698477992597, "grad_norm": 18.013729095458984, "learning_rate": 6.610637071438686e-05, "loss": 2.0939, "step": 919 }, { "epoch": 0.30275606746194983, "grad_norm": 18.422605514526367, "learning_rate": 6.590808112105232e-05, "loss": 2.0235, "step": 920 }, { "epoch": 0.3030851501439737, "grad_norm": 16.388046264648438, "learning_rate": 6.570994308558812e-05, "loss": 2.0216, "step": 921 }, { "epoch": 0.30341423282599755, "grad_norm": 17.965810775756836, "learning_rate": 6.551195748882997e-05, "loss": 2.0561, "step": 922 }, { "epoch": 0.3037433155080214, "grad_norm": 17.768281936645508, "learning_rate": 6.531412521093586e-05, "loss": 1.9709, "step": 923 }, { "epoch": 0.3040723981900452, "grad_norm": 20.556699752807617, "learning_rate": 6.51164471313822e-05, "loss": 2.212, "step": 924 }, { "epoch": 0.3044014808720691, "grad_norm": 17.003408432006836, "learning_rate": 6.491892412895995e-05, "loss": 1.9437, "step": 925 }, { "epoch": 0.30473056355409295, "grad_norm": 18.407974243164062, "learning_rate": 6.472155708177052e-05, "loss": 2.0596, "step": 926 }, { "epoch": 0.3050596462361168, "grad_norm": 20.38560676574707, "learning_rate": 6.452434686722224e-05, "loss": 2.0596, "step": 927 }, { "epoch": 0.3053887289181407, "grad_norm": 20.452512741088867, "learning_rate": 6.432729436202604e-05, "loss": 2.1395, "step": 928 }, { "epoch": 0.30571781160016454, "grad_norm": 21.988248825073242, "learning_rate": 6.41304004421918e-05, "loss": 2.1638, "step": 929 }, { "epoch": 0.3060468942821884, "grad_norm": 25.528146743774414, "learning_rate": 6.393366598302446e-05, "loss": 2.2066, "step": 930 }, { "epoch": 0.30637597696421226, "grad_norm": 22.592243194580078, "learning_rate": 6.373709185911998e-05, "loss": 2.2472, "step": 931 }, { "epoch": 0.3067050596462361, "grad_norm": 18.68706512451172, "learning_rate": 6.354067894436155e-05, "loss": 1.884, "step": 932 }, { "epoch": 0.30703414232826, "grad_norm": 28.759267807006836, "learning_rate": 6.334442811191576e-05, "loss": 2.0178, "step": 933 }, { "epoch": 0.30736322501028385, "grad_norm": 29.898094177246094, "learning_rate": 6.314834023422858e-05, "loss": 2.1995, "step": 934 }, { "epoch": 0.3076923076923077, "grad_norm": 25.61287498474121, "learning_rate": 6.295241618302156e-05, "loss": 2.0838, "step": 935 }, { "epoch": 0.30802139037433157, "grad_norm": 28.773006439208984, "learning_rate": 6.275665682928803e-05, "loss": 2.2459, "step": 936 }, { "epoch": 0.30835047305635543, "grad_norm": 29.669248580932617, "learning_rate": 6.256106304328905e-05, "loss": 2.3264, "step": 937 }, { "epoch": 0.30867955573837924, "grad_norm": 28.54385757446289, "learning_rate": 6.23656356945497e-05, "loss": 2.1641, "step": 938 }, { "epoch": 0.3090086384204031, "grad_norm": 32.43621826171875, "learning_rate": 6.21703756518551e-05, "loss": 2.1876, "step": 939 }, { "epoch": 0.30933772110242697, "grad_norm": 27.514690399169922, "learning_rate": 6.197528378324665e-05, "loss": 2.2677, "step": 940 }, { "epoch": 0.30966680378445083, "grad_norm": 32.22248458862305, "learning_rate": 6.17803609560181e-05, "loss": 2.1212, "step": 941 }, { "epoch": 0.3099958864664747, "grad_norm": 31.3935604095459, "learning_rate": 6.158560803671168e-05, "loss": 2.0918, "step": 942 }, { "epoch": 0.31032496914849855, "grad_norm": 35.551395416259766, "learning_rate": 6.139102589111435e-05, "loss": 2.4482, "step": 943 }, { "epoch": 0.3106540518305224, "grad_norm": 35.783023834228516, "learning_rate": 6.119661538425381e-05, "loss": 2.2436, "step": 944 }, { "epoch": 0.3109831345125463, "grad_norm": 43.18023681640625, "learning_rate": 6.100237738039484e-05, "loss": 2.182, "step": 945 }, { "epoch": 0.31131221719457014, "grad_norm": 49.416465759277344, "learning_rate": 6.0808312743035236e-05, "loss": 2.6734, "step": 946 }, { "epoch": 0.311641299876594, "grad_norm": 106.99371337890625, "learning_rate": 6.061442233490211e-05, "loss": 2.5521, "step": 947 }, { "epoch": 0.31197038255861786, "grad_norm": 48.063133239746094, "learning_rate": 6.042070701794806e-05, "loss": 2.5888, "step": 948 }, { "epoch": 0.3122994652406417, "grad_norm": 59.49570846557617, "learning_rate": 6.0227167653347305e-05, "loss": 2.6928, "step": 949 }, { "epoch": 0.3126285479226656, "grad_norm": 80.73985290527344, "learning_rate": 6.0033805101491794e-05, "loss": 2.9342, "step": 950 }, { "epoch": 0.31295763060468945, "grad_norm": 7.6070380210876465, "learning_rate": 5.98406202219875e-05, "loss": 1.9027, "step": 951 }, { "epoch": 0.3132867132867133, "grad_norm": 8.111433982849121, "learning_rate": 5.964761387365052e-05, "loss": 1.8913, "step": 952 }, { "epoch": 0.3136157959687371, "grad_norm": 9.067609786987305, "learning_rate": 5.9454786914503255e-05, "loss": 1.943, "step": 953 }, { "epoch": 0.313944878650761, "grad_norm": 9.627612113952637, "learning_rate": 5.926214020177074e-05, "loss": 1.8712, "step": 954 }, { "epoch": 0.31427396133278485, "grad_norm": 11.528953552246094, "learning_rate": 5.9069674591876534e-05, "loss": 2.0334, "step": 955 }, { "epoch": 0.3146030440148087, "grad_norm": 10.677018165588379, "learning_rate": 5.887739094043923e-05, "loss": 2.0192, "step": 956 }, { "epoch": 0.31493212669683257, "grad_norm": 12.317497253417969, "learning_rate": 5.868529010226845e-05, "loss": 2.0911, "step": 957 }, { "epoch": 0.31526120937885643, "grad_norm": 10.695416450500488, "learning_rate": 5.849337293136112e-05, "loss": 1.9064, "step": 958 }, { "epoch": 0.3155902920608803, "grad_norm": 11.668805122375488, "learning_rate": 5.830164028089766e-05, "loss": 1.9976, "step": 959 }, { "epoch": 0.31591937474290416, "grad_norm": 10.927106857299805, "learning_rate": 5.811009300323818e-05, "loss": 1.9324, "step": 960 }, { "epoch": 0.316248457424928, "grad_norm": 12.905808448791504, "learning_rate": 5.791873194991872e-05, "loss": 2.0708, "step": 961 }, { "epoch": 0.3165775401069519, "grad_norm": 12.852319717407227, "learning_rate": 5.7727557971647427e-05, "loss": 2.0454, "step": 962 }, { "epoch": 0.31690662278897574, "grad_norm": 12.606741905212402, "learning_rate": 5.7536571918300864e-05, "loss": 1.9414, "step": 963 }, { "epoch": 0.3172357054709996, "grad_norm": 12.670434951782227, "learning_rate": 5.734577463892008e-05, "loss": 1.9795, "step": 964 }, { "epoch": 0.31756478815302347, "grad_norm": 15.521458625793457, "learning_rate": 5.7155166981706956e-05, "loss": 2.0713, "step": 965 }, { "epoch": 0.31789387083504733, "grad_norm": 14.167004585266113, "learning_rate": 5.6964749794020354e-05, "loss": 1.9973, "step": 966 }, { "epoch": 0.31822295351707114, "grad_norm": 15.580550193786621, "learning_rate": 5.6774523922372394e-05, "loss": 1.9656, "step": 967 }, { "epoch": 0.318552036199095, "grad_norm": 15.85921573638916, "learning_rate": 5.6584490212424804e-05, "loss": 2.1541, "step": 968 }, { "epoch": 0.31888111888111886, "grad_norm": 17.21166229248047, "learning_rate": 5.639464950898491e-05, "loss": 2.0969, "step": 969 }, { "epoch": 0.3192102015631427, "grad_norm": 15.54463005065918, "learning_rate": 5.620500265600206e-05, "loss": 2.0791, "step": 970 }, { "epoch": 0.3195392842451666, "grad_norm": 16.726932525634766, "learning_rate": 5.601555049656382e-05, "loss": 1.9952, "step": 971 }, { "epoch": 0.31986836692719045, "grad_norm": 18.592525482177734, "learning_rate": 5.58262938728922e-05, "loss": 2.2251, "step": 972 }, { "epoch": 0.3201974496092143, "grad_norm": 18.98313331604004, "learning_rate": 5.563723362634008e-05, "loss": 2.0569, "step": 973 }, { "epoch": 0.3205265322912382, "grad_norm": 19.475948333740234, "learning_rate": 5.544837059738719e-05, "loss": 2.1375, "step": 974 }, { "epoch": 0.32085561497326204, "grad_norm": 19.036008834838867, "learning_rate": 5.525970562563656e-05, "loss": 2.1603, "step": 975 }, { "epoch": 0.3211846976552859, "grad_norm": 20.155803680419922, "learning_rate": 5.507123954981073e-05, "loss": 2.077, "step": 976 }, { "epoch": 0.32151378033730976, "grad_norm": 20.398292541503906, "learning_rate": 5.488297320774807e-05, "loss": 2.1764, "step": 977 }, { "epoch": 0.3218428630193336, "grad_norm": 18.43659782409668, "learning_rate": 5.4694907436399e-05, "loss": 2.1615, "step": 978 }, { "epoch": 0.3221719457013575, "grad_norm": 20.170846939086914, "learning_rate": 5.4507043071822284e-05, "loss": 2.1737, "step": 979 }, { "epoch": 0.32250102838338135, "grad_norm": 21.036090850830078, "learning_rate": 5.431938094918132e-05, "loss": 2.0855, "step": 980 }, { "epoch": 0.32283011106540516, "grad_norm": 26.383455276489258, "learning_rate": 5.41319219027404e-05, "loss": 2.2095, "step": 981 }, { "epoch": 0.323159193747429, "grad_norm": 23.362022399902344, "learning_rate": 5.394466676586114e-05, "loss": 2.2212, "step": 982 }, { "epoch": 0.3234882764294529, "grad_norm": 24.784385681152344, "learning_rate": 5.375761637099854e-05, "loss": 2.1978, "step": 983 }, { "epoch": 0.32381735911147674, "grad_norm": 24.057310104370117, "learning_rate": 5.357077154969742e-05, "loss": 2.1605, "step": 984 }, { "epoch": 0.3241464417935006, "grad_norm": 26.63469886779785, "learning_rate": 5.3384133132588784e-05, "loss": 2.232, "step": 985 }, { "epoch": 0.32447552447552447, "grad_norm": 26.517337799072266, "learning_rate": 5.3197701949386e-05, "loss": 2.1131, "step": 986 }, { "epoch": 0.32480460715754833, "grad_norm": 26.246366500854492, "learning_rate": 5.301147882888116e-05, "loss": 2.2181, "step": 987 }, { "epoch": 0.3251336898395722, "grad_norm": 27.842470169067383, "learning_rate": 5.28254645989414e-05, "loss": 2.1505, "step": 988 }, { "epoch": 0.32546277252159606, "grad_norm": 25.297283172607422, "learning_rate": 5.2639660086505226e-05, "loss": 2.1642, "step": 989 }, { "epoch": 0.3257918552036199, "grad_norm": 28.8968505859375, "learning_rate": 5.2454066117578815e-05, "loss": 2.2379, "step": 990 }, { "epoch": 0.3261209378856438, "grad_norm": 33.47592544555664, "learning_rate": 5.226868351723244e-05, "loss": 2.5188, "step": 991 }, { "epoch": 0.32645002056766764, "grad_norm": 42.386600494384766, "learning_rate": 5.2083513109596616e-05, "loss": 2.5441, "step": 992 }, { "epoch": 0.3267791032496915, "grad_norm": 42.62797164916992, "learning_rate": 5.189855571785859e-05, "loss": 2.2962, "step": 993 }, { "epoch": 0.32710818593171537, "grad_norm": 37.47578811645508, "learning_rate": 5.171381216425863e-05, "loss": 2.4511, "step": 994 }, { "epoch": 0.32743726861373923, "grad_norm": 34.23417663574219, "learning_rate": 5.152928327008635e-05, "loss": 2.2716, "step": 995 }, { "epoch": 0.32776635129576304, "grad_norm": 53.03375244140625, "learning_rate": 5.134496985567714e-05, "loss": 2.5088, "step": 996 }, { "epoch": 0.3280954339777869, "grad_norm": 39.68029022216797, "learning_rate": 5.116087274040837e-05, "loss": 2.5192, "step": 997 }, { "epoch": 0.32842451665981076, "grad_norm": 56.27915573120117, "learning_rate": 5.0976992742695925e-05, "loss": 2.3729, "step": 998 }, { "epoch": 0.3287535993418346, "grad_norm": 58.642738342285156, "learning_rate": 5.07933306799904e-05, "loss": 2.3853, "step": 999 }, { "epoch": 0.3290826820238585, "grad_norm": 80.52285766601562, "learning_rate": 5.060988736877366e-05, "loss": 2.7843, "step": 1000 }, { "epoch": 0.32941176470588235, "grad_norm": 7.423452854156494, "learning_rate": 5.042666362455498e-05, "loss": 1.7589, "step": 1001 }, { "epoch": 0.3297408473879062, "grad_norm": 7.808231353759766, "learning_rate": 5.024366026186755e-05, "loss": 1.8392, "step": 1002 }, { "epoch": 0.3300699300699301, "grad_norm": 7.777120113372803, "learning_rate": 5.006087809426496e-05, "loss": 1.791, "step": 1003 }, { "epoch": 0.33039901275195394, "grad_norm": 9.34269905090332, "learning_rate": 4.987831793431731e-05, "loss": 2.0456, "step": 1004 }, { "epoch": 0.3307280954339778, "grad_norm": 9.566130638122559, "learning_rate": 4.9695980593607817e-05, "loss": 1.9789, "step": 1005 }, { "epoch": 0.33105717811600166, "grad_norm": 10.933757781982422, "learning_rate": 4.9513866882729146e-05, "loss": 1.9437, "step": 1006 }, { "epoch": 0.3313862607980255, "grad_norm": 11.410195350646973, "learning_rate": 4.9331977611279777e-05, "loss": 2.0096, "step": 1007 }, { "epoch": 0.3317153434800494, "grad_norm": 11.923565864562988, "learning_rate": 4.9150313587860433e-05, "loss": 1.7981, "step": 1008 }, { "epoch": 0.33204442616207325, "grad_norm": 14.223522186279297, "learning_rate": 4.896887562007054e-05, "loss": 1.9875, "step": 1009 }, { "epoch": 0.33237350884409705, "grad_norm": 12.79909896850586, "learning_rate": 4.8787664514504504e-05, "loss": 2.1416, "step": 1010 }, { "epoch": 0.3327025915261209, "grad_norm": 15.30400276184082, "learning_rate": 4.860668107674823e-05, "loss": 2.039, "step": 1011 }, { "epoch": 0.3330316742081448, "grad_norm": 13.779805183410645, "learning_rate": 4.8425926111375506e-05, "loss": 2.0499, "step": 1012 }, { "epoch": 0.33336075689016864, "grad_norm": 14.045153617858887, "learning_rate": 4.824540042194443e-05, "loss": 2.0338, "step": 1013 }, { "epoch": 0.3336898395721925, "grad_norm": 13.342597007751465, "learning_rate": 4.8065104810993856e-05, "loss": 2.0549, "step": 1014 }, { "epoch": 0.33401892225421637, "grad_norm": 14.685636520385742, "learning_rate": 4.788504008003978e-05, "loss": 1.9895, "step": 1015 }, { "epoch": 0.33434800493624023, "grad_norm": 16.241355895996094, "learning_rate": 4.770520702957182e-05, "loss": 1.9656, "step": 1016 }, { "epoch": 0.3346770876182641, "grad_norm": 16.78399658203125, "learning_rate": 4.752560645904962e-05, "loss": 2.0993, "step": 1017 }, { "epoch": 0.33500617030028795, "grad_norm": 15.50197982788086, "learning_rate": 4.734623916689941e-05, "loss": 2.015, "step": 1018 }, { "epoch": 0.3353352529823118, "grad_norm": 16.54322052001953, "learning_rate": 4.716710595051022e-05, "loss": 2.0529, "step": 1019 }, { "epoch": 0.3356643356643357, "grad_norm": 15.670755386352539, "learning_rate": 4.698820760623064e-05, "loss": 2.1027, "step": 1020 }, { "epoch": 0.33599341834635954, "grad_norm": 16.396520614624023, "learning_rate": 4.6809544929365004e-05, "loss": 1.9745, "step": 1021 }, { "epoch": 0.3363225010283834, "grad_norm": 17.455230712890625, "learning_rate": 4.663111871417e-05, "loss": 2.1229, "step": 1022 }, { "epoch": 0.33665158371040727, "grad_norm": 16.742382049560547, "learning_rate": 4.645292975385111e-05, "loss": 2.1863, "step": 1023 }, { "epoch": 0.33698066639243107, "grad_norm": 19.29606056213379, "learning_rate": 4.627497884055912e-05, "loss": 2.0192, "step": 1024 }, { "epoch": 0.33730974907445493, "grad_norm": 19.857383728027344, "learning_rate": 4.609726676538652e-05, "loss": 2.1857, "step": 1025 }, { "epoch": 0.3376388317564788, "grad_norm": 18.494062423706055, "learning_rate": 4.591979431836402e-05, "loss": 2.0436, "step": 1026 }, { "epoch": 0.33796791443850266, "grad_norm": 23.841251373291016, "learning_rate": 4.574256228845706e-05, "loss": 2.0551, "step": 1027 }, { "epoch": 0.3382969971205265, "grad_norm": 20.27541160583496, "learning_rate": 4.5565571463562365e-05, "loss": 2.1141, "step": 1028 }, { "epoch": 0.3386260798025504, "grad_norm": 20.647497177124023, "learning_rate": 4.5388822630504256e-05, "loss": 2.0562, "step": 1029 }, { "epoch": 0.33895516248457425, "grad_norm": 24.1492977142334, "learning_rate": 4.521231657503132e-05, "loss": 2.0477, "step": 1030 }, { "epoch": 0.3392842451665981, "grad_norm": 24.78376007080078, "learning_rate": 4.503605408181286e-05, "loss": 2.1074, "step": 1031 }, { "epoch": 0.33961332784862197, "grad_norm": 22.048702239990234, "learning_rate": 4.486003593443537e-05, "loss": 2.1236, "step": 1032 }, { "epoch": 0.33994241053064583, "grad_norm": 22.491592407226562, "learning_rate": 4.468426291539914e-05, "loss": 2.2517, "step": 1033 }, { "epoch": 0.3402714932126697, "grad_norm": 31.3500919342041, "learning_rate": 4.4508735806114654e-05, "loss": 2.2431, "step": 1034 }, { "epoch": 0.34060057589469356, "grad_norm": 27.213743209838867, "learning_rate": 4.433345538689929e-05, "loss": 2.2952, "step": 1035 }, { "epoch": 0.3409296585767174, "grad_norm": 29.022459030151367, "learning_rate": 4.415842243697369e-05, "loss": 2.1188, "step": 1036 }, { "epoch": 0.3412587412587413, "grad_norm": 29.84700584411621, "learning_rate": 4.39836377344583e-05, "loss": 2.3474, "step": 1037 }, { "epoch": 0.3415878239407651, "grad_norm": 29.270647048950195, "learning_rate": 4.380910205637012e-05, "loss": 2.2511, "step": 1038 }, { "epoch": 0.34191690662278895, "grad_norm": 28.785202026367188, "learning_rate": 4.363481617861893e-05, "loss": 2.4374, "step": 1039 }, { "epoch": 0.3422459893048128, "grad_norm": 31.60390281677246, "learning_rate": 4.346078087600412e-05, "loss": 2.3761, "step": 1040 }, { "epoch": 0.3425750719868367, "grad_norm": 34.05215835571289, "learning_rate": 4.3286996922211034e-05, "loss": 2.3048, "step": 1041 }, { "epoch": 0.34290415466886054, "grad_norm": 32.6806640625, "learning_rate": 4.311346508980772e-05, "loss": 2.3638, "step": 1042 }, { "epoch": 0.3432332373508844, "grad_norm": 32.4832649230957, "learning_rate": 4.2940186150241365e-05, "loss": 2.331, "step": 1043 }, { "epoch": 0.34356232003290826, "grad_norm": 34.924476623535156, "learning_rate": 4.27671608738349e-05, "loss": 2.2383, "step": 1044 }, { "epoch": 0.3438914027149321, "grad_norm": 36.276885986328125, "learning_rate": 4.2594390029783534e-05, "loss": 2.3428, "step": 1045 }, { "epoch": 0.344220485396956, "grad_norm": 42.95642852783203, "learning_rate": 4.242187438615153e-05, "loss": 2.6901, "step": 1046 }, { "epoch": 0.34454956807897985, "grad_norm": 45.85472869873047, "learning_rate": 4.224961470986849e-05, "loss": 2.2695, "step": 1047 }, { "epoch": 0.3448786507610037, "grad_norm": 59.72822189331055, "learning_rate": 4.207761176672614e-05, "loss": 2.6945, "step": 1048 }, { "epoch": 0.3452077334430276, "grad_norm": 63.7497444152832, "learning_rate": 4.190586632137491e-05, "loss": 2.5494, "step": 1049 }, { "epoch": 0.34553681612505144, "grad_norm": 97.82295989990234, "learning_rate": 4.173437913732048e-05, "loss": 3.1521, "step": 1050 }, { "epoch": 0.3458658988070753, "grad_norm": 6.471660614013672, "learning_rate": 4.156315097692037e-05, "loss": 1.844, "step": 1051 }, { "epoch": 0.34619498148909916, "grad_norm": 7.613820552825928, "learning_rate": 4.139218260138074e-05, "loss": 1.8956, "step": 1052 }, { "epoch": 0.34652406417112297, "grad_norm": 8.178187370300293, "learning_rate": 4.12214747707527e-05, "loss": 1.9327, "step": 1053 }, { "epoch": 0.34685314685314683, "grad_norm": 9.461479187011719, "learning_rate": 4.1051028243929125e-05, "loss": 1.9154, "step": 1054 }, { "epoch": 0.3471822295351707, "grad_norm": 10.933238983154297, "learning_rate": 4.088084377864135e-05, "loss": 2.1418, "step": 1055 }, { "epoch": 0.34751131221719456, "grad_norm": 10.149748802185059, "learning_rate": 4.07109221314556e-05, "loss": 1.9596, "step": 1056 }, { "epoch": 0.3478403948992184, "grad_norm": 10.874551773071289, "learning_rate": 4.054126405776971e-05, "loss": 1.8712, "step": 1057 }, { "epoch": 0.3481694775812423, "grad_norm": 11.490303993225098, "learning_rate": 4.037187031180985e-05, "loss": 1.923, "step": 1058 }, { "epoch": 0.34849856026326614, "grad_norm": 12.213667869567871, "learning_rate": 4.020274164662707e-05, "loss": 1.836, "step": 1059 }, { "epoch": 0.34882764294529, "grad_norm": 11.098823547363281, "learning_rate": 4.003387881409397e-05, "loss": 1.8828, "step": 1060 }, { "epoch": 0.34915672562731387, "grad_norm": 16.802589416503906, "learning_rate": 3.986528256490141e-05, "loss": 1.9789, "step": 1061 }, { "epoch": 0.34948580830933773, "grad_norm": 12.012221336364746, "learning_rate": 3.969695364855511e-05, "loss": 1.9553, "step": 1062 }, { "epoch": 0.3498148909913616, "grad_norm": 14.2031831741333, "learning_rate": 3.952889281337235e-05, "loss": 2.0149, "step": 1063 }, { "epoch": 0.35014397367338546, "grad_norm": 12.735995292663574, "learning_rate": 3.93611008064786e-05, "loss": 2.0496, "step": 1064 }, { "epoch": 0.3504730563554093, "grad_norm": 15.24793529510498, "learning_rate": 3.9193578373804364e-05, "loss": 2.0217, "step": 1065 }, { "epoch": 0.3508021390374332, "grad_norm": 17.1986026763916, "learning_rate": 3.90263262600816e-05, "loss": 2.049, "step": 1066 }, { "epoch": 0.351131221719457, "grad_norm": 53.82430648803711, "learning_rate": 3.88593452088406e-05, "loss": 2.0008, "step": 1067 }, { "epoch": 0.35146030440148085, "grad_norm": 15.815654754638672, "learning_rate": 3.869263596240661e-05, "loss": 1.9584, "step": 1068 }, { "epoch": 0.3517893870835047, "grad_norm": 14.452220916748047, "learning_rate": 3.8526199261896544e-05, "loss": 2.0494, "step": 1069 }, { "epoch": 0.3521184697655286, "grad_norm": 16.41934585571289, "learning_rate": 3.836003584721577e-05, "loss": 2.0366, "step": 1070 }, { "epoch": 0.35244755244755244, "grad_norm": 15.771565437316895, "learning_rate": 3.8194146457054655e-05, "loss": 2.0431, "step": 1071 }, { "epoch": 0.3527766351295763, "grad_norm": 16.568445205688477, "learning_rate": 3.802853182888543e-05, "loss": 1.9882, "step": 1072 }, { "epoch": 0.35310571781160016, "grad_norm": 16.752239227294922, "learning_rate": 3.786319269895877e-05, "loss": 2.1004, "step": 1073 }, { "epoch": 0.353434800493624, "grad_norm": 18.60424041748047, "learning_rate": 3.769812980230074e-05, "loss": 2.1016, "step": 1074 }, { "epoch": 0.3537638831756479, "grad_norm": 19.42106819152832, "learning_rate": 3.7533343872709294e-05, "loss": 2.0782, "step": 1075 }, { "epoch": 0.35409296585767175, "grad_norm": 17.515928268432617, "learning_rate": 3.736883564275112e-05, "loss": 2.1685, "step": 1076 }, { "epoch": 0.3544220485396956, "grad_norm": 20.955604553222656, "learning_rate": 3.7204605843758386e-05, "loss": 2.2049, "step": 1077 }, { "epoch": 0.3547511312217195, "grad_norm": 20.868257522583008, "learning_rate": 3.704065520582549e-05, "loss": 2.0723, "step": 1078 }, { "epoch": 0.35508021390374334, "grad_norm": 31.952220916748047, "learning_rate": 3.6876984457805786e-05, "loss": 2.1723, "step": 1079 }, { "epoch": 0.3554092965857672, "grad_norm": 21.46729278564453, "learning_rate": 3.671359432730834e-05, "loss": 2.2066, "step": 1080 }, { "epoch": 0.355738379267791, "grad_norm": 24.422775268554688, "learning_rate": 3.655048554069478e-05, "loss": 2.3851, "step": 1081 }, { "epoch": 0.35606746194981487, "grad_norm": 21.09502601623535, "learning_rate": 3.638765882307589e-05, "loss": 1.9947, "step": 1082 }, { "epoch": 0.35639654463183873, "grad_norm": 24.67693519592285, "learning_rate": 3.6225114898308634e-05, "loss": 2.0587, "step": 1083 }, { "epoch": 0.3567256273138626, "grad_norm": 24.87090301513672, "learning_rate": 3.6062854488992714e-05, "loss": 2.2291, "step": 1084 }, { "epoch": 0.35705470999588645, "grad_norm": 25.945091247558594, "learning_rate": 3.5900878316467454e-05, "loss": 2.1683, "step": 1085 }, { "epoch": 0.3573837926779103, "grad_norm": 27.096158981323242, "learning_rate": 3.573918710080857e-05, "loss": 2.1915, "step": 1086 }, { "epoch": 0.3577128753599342, "grad_norm": 38.67768096923828, "learning_rate": 3.5577781560825066e-05, "loss": 2.4148, "step": 1087 }, { "epoch": 0.35804195804195804, "grad_norm": 30.857200622558594, "learning_rate": 3.541666241405588e-05, "loss": 2.1666, "step": 1088 }, { "epoch": 0.3583710407239819, "grad_norm": 27.743030548095703, "learning_rate": 3.5255830376766764e-05, "loss": 2.2761, "step": 1089 }, { "epoch": 0.35870012340600577, "grad_norm": 29.143587112426758, "learning_rate": 3.509528616394716e-05, "loss": 2.1797, "step": 1090 }, { "epoch": 0.35902920608802963, "grad_norm": 37.60934066772461, "learning_rate": 3.4935030489306883e-05, "loss": 2.2681, "step": 1091 }, { "epoch": 0.3593582887700535, "grad_norm": 33.6703987121582, "learning_rate": 3.4775064065273165e-05, "loss": 2.2283, "step": 1092 }, { "epoch": 0.35968737145207735, "grad_norm": 36.05352020263672, "learning_rate": 3.4615387602987236e-05, "loss": 2.197, "step": 1093 }, { "epoch": 0.3600164541341012, "grad_norm": 50.288875579833984, "learning_rate": 3.445600181230134e-05, "loss": 2.5384, "step": 1094 }, { "epoch": 0.3603455368161251, "grad_norm": 36.781105041503906, "learning_rate": 3.429690740177549e-05, "loss": 2.075, "step": 1095 }, { "epoch": 0.3606746194981489, "grad_norm": 48.31385040283203, "learning_rate": 3.413810507867436e-05, "loss": 2.4039, "step": 1096 }, { "epoch": 0.36100370218017275, "grad_norm": 54.922367095947266, "learning_rate": 3.397959554896415e-05, "loss": 2.674, "step": 1097 }, { "epoch": 0.3613327848621966, "grad_norm": 52.07112121582031, "learning_rate": 3.3821379517309405e-05, "loss": 2.5718, "step": 1098 }, { "epoch": 0.36166186754422047, "grad_norm": 64.56893920898438, "learning_rate": 3.3663457687069924e-05, "loss": 2.5957, "step": 1099 }, { "epoch": 0.36199095022624433, "grad_norm": 91.17286682128906, "learning_rate": 3.350583076029754e-05, "loss": 2.7574, "step": 1100 }, { "epoch": 0.3623200329082682, "grad_norm": 6.4083428382873535, "learning_rate": 3.334849943773323e-05, "loss": 1.8988, "step": 1101 }, { "epoch": 0.36264911559029206, "grad_norm": 7.724403381347656, "learning_rate": 3.319146441880371e-05, "loss": 1.8226, "step": 1102 }, { "epoch": 0.3629781982723159, "grad_norm": 7.997345447540283, "learning_rate": 3.3034726401618444e-05, "loss": 1.924, "step": 1103 }, { "epoch": 0.3633072809543398, "grad_norm": 8.64883041381836, "learning_rate": 3.28782860829667e-05, "loss": 2.0466, "step": 1104 }, { "epoch": 0.36363636363636365, "grad_norm": 9.921974182128906, "learning_rate": 3.272214415831418e-05, "loss": 2.0738, "step": 1105 }, { "epoch": 0.3639654463183875, "grad_norm": 10.615996360778809, "learning_rate": 3.2566301321800085e-05, "loss": 2.075, "step": 1106 }, { "epoch": 0.36429452900041137, "grad_norm": 34.82708740234375, "learning_rate": 3.241075826623401e-05, "loss": 2.0296, "step": 1107 }, { "epoch": 0.36462361168243523, "grad_norm": 10.764836311340332, "learning_rate": 3.225551568309284e-05, "loss": 1.9587, "step": 1108 }, { "epoch": 0.3649526943644591, "grad_norm": 11.545236587524414, "learning_rate": 3.210057426251773e-05, "loss": 1.93, "step": 1109 }, { "epoch": 0.3652817770464829, "grad_norm": 12.930041313171387, "learning_rate": 3.1945934693310896e-05, "loss": 2.0391, "step": 1110 }, { "epoch": 0.36561085972850677, "grad_norm": 12.978158950805664, "learning_rate": 3.179159766293282e-05, "loss": 1.9997, "step": 1111 }, { "epoch": 0.3659399424105306, "grad_norm": 12.713103294372559, "learning_rate": 3.163756385749889e-05, "loss": 2.0474, "step": 1112 }, { "epoch": 0.3662690250925545, "grad_norm": 13.143235206604004, "learning_rate": 3.148383396177653e-05, "loss": 1.9117, "step": 1113 }, { "epoch": 0.36659810777457835, "grad_norm": 14.726356506347656, "learning_rate": 3.133040865918213e-05, "loss": 1.9652, "step": 1114 }, { "epoch": 0.3669271904566022, "grad_norm": 15.944626808166504, "learning_rate": 3.117728863177796e-05, "loss": 1.9871, "step": 1115 }, { "epoch": 0.3672562731386261, "grad_norm": 14.794142723083496, "learning_rate": 3.102447456026919e-05, "loss": 2.067, "step": 1116 }, { "epoch": 0.36758535582064994, "grad_norm": 15.859716415405273, "learning_rate": 3.0871967124000834e-05, "loss": 2.0365, "step": 1117 }, { "epoch": 0.3679144385026738, "grad_norm": 16.27312469482422, "learning_rate": 3.0719767000954714e-05, "loss": 1.997, "step": 1118 }, { "epoch": 0.36824352118469766, "grad_norm": 15.993875503540039, "learning_rate": 3.056787486774656e-05, "loss": 2.156, "step": 1119 }, { "epoch": 0.3685726038667215, "grad_norm": 17.42777442932129, "learning_rate": 3.041629139962283e-05, "loss": 2.0718, "step": 1120 }, { "epoch": 0.3689016865487454, "grad_norm": 16.689842224121094, "learning_rate": 3.0265017270457775e-05, "loss": 1.987, "step": 1121 }, { "epoch": 0.36923076923076925, "grad_norm": 19.354761123657227, "learning_rate": 3.0114053152750556e-05, "loss": 2.111, "step": 1122 }, { "epoch": 0.3695598519127931, "grad_norm": 21.725404739379883, "learning_rate": 2.9963399717622077e-05, "loss": 2.2344, "step": 1123 }, { "epoch": 0.3698889345948169, "grad_norm": 17.07735824584961, "learning_rate": 2.98130576348121e-05, "loss": 2.0488, "step": 1124 }, { "epoch": 0.3702180172768408, "grad_norm": 19.4981689453125, "learning_rate": 2.966302757267625e-05, "loss": 1.9402, "step": 1125 }, { "epoch": 0.3702180172768408, "eval_loss": 1.9160404205322266, "eval_runtime": 163.4387, "eval_samples_per_second": 31.314, "eval_steps_per_second": 15.657, "step": 1125 }, { "epoch": 0.37054709995886465, "grad_norm": 18.196046829223633, "learning_rate": 2.9513310198183065e-05, "loss": 1.9625, "step": 1126 }, { "epoch": 0.3708761826408885, "grad_norm": 18.447654724121094, "learning_rate": 2.936390617691097e-05, "loss": 1.9211, "step": 1127 }, { "epoch": 0.37120526532291237, "grad_norm": 20.676822662353516, "learning_rate": 2.9214816173045356e-05, "loss": 1.9947, "step": 1128 }, { "epoch": 0.37153434800493623, "grad_norm": 21.159812927246094, "learning_rate": 2.906604084937572e-05, "loss": 1.9502, "step": 1129 }, { "epoch": 0.3718634306869601, "grad_norm": 24.089841842651367, "learning_rate": 2.8917580867292526e-05, "loss": 2.0995, "step": 1130 }, { "epoch": 0.37219251336898396, "grad_norm": 23.02242660522461, "learning_rate": 2.8769436886784408e-05, "loss": 2.0998, "step": 1131 }, { "epoch": 0.3725215960510078, "grad_norm": 23.296146392822266, "learning_rate": 2.862160956643517e-05, "loss": 2.1375, "step": 1132 }, { "epoch": 0.3728506787330317, "grad_norm": 24.225025177001953, "learning_rate": 2.847409956342092e-05, "loss": 2.0185, "step": 1133 }, { "epoch": 0.37317976141505554, "grad_norm": 24.618310928344727, "learning_rate": 2.8326907533507074e-05, "loss": 2.2716, "step": 1134 }, { "epoch": 0.3735088440970794, "grad_norm": 26.13825225830078, "learning_rate": 2.8180034131045464e-05, "loss": 2.191, "step": 1135 }, { "epoch": 0.37383792677910327, "grad_norm": 28.195363998413086, "learning_rate": 2.8033480008971546e-05, "loss": 2.3122, "step": 1136 }, { "epoch": 0.37416700946112713, "grad_norm": 38.0821418762207, "learning_rate": 2.7887245818801277e-05, "loss": 2.2972, "step": 1137 }, { "epoch": 0.374496092143151, "grad_norm": 28.91097068786621, "learning_rate": 2.7741332210628345e-05, "loss": 2.3068, "step": 1138 }, { "epoch": 0.3748251748251748, "grad_norm": 32.098114013671875, "learning_rate": 2.759573983312138e-05, "loss": 2.2755, "step": 1139 }, { "epoch": 0.37515425750719866, "grad_norm": 34.55158615112305, "learning_rate": 2.7450469333520855e-05, "loss": 2.4332, "step": 1140 }, { "epoch": 0.3754833401892225, "grad_norm": 34.65534591674805, "learning_rate": 2.730552135763632e-05, "loss": 2.4652, "step": 1141 }, { "epoch": 0.3758124228712464, "grad_norm": 44.816043853759766, "learning_rate": 2.7160896549843562e-05, "loss": 2.3643, "step": 1142 }, { "epoch": 0.37614150555327025, "grad_norm": 30.94624137878418, "learning_rate": 2.701659555308169e-05, "loss": 2.1585, "step": 1143 }, { "epoch": 0.3764705882352941, "grad_norm": 40.655635833740234, "learning_rate": 2.6872619008850274e-05, "loss": 2.4626, "step": 1144 }, { "epoch": 0.376799670917318, "grad_norm": 40.45767593383789, "learning_rate": 2.672896755720654e-05, "loss": 2.4258, "step": 1145 }, { "epoch": 0.37712875359934184, "grad_norm": 39.313873291015625, "learning_rate": 2.6585641836762433e-05, "loss": 2.4184, "step": 1146 }, { "epoch": 0.3774578362813657, "grad_norm": 44.33061218261719, "learning_rate": 2.6442642484681944e-05, "loss": 2.2651, "step": 1147 }, { "epoch": 0.37778691896338956, "grad_norm": 48.290775299072266, "learning_rate": 2.6299970136678077e-05, "loss": 2.5471, "step": 1148 }, { "epoch": 0.3781160016454134, "grad_norm": 72.39301300048828, "learning_rate": 2.6157625427010156e-05, "loss": 2.7164, "step": 1149 }, { "epoch": 0.3784450843274373, "grad_norm": 70.35841369628906, "learning_rate": 2.6015608988480955e-05, "loss": 2.8396, "step": 1150 }, { "epoch": 0.37877416700946115, "grad_norm": 6.425146102905273, "learning_rate": 2.5873921452433915e-05, "loss": 1.8474, "step": 1151 }, { "epoch": 0.379103249691485, "grad_norm": 7.3597564697265625, "learning_rate": 2.57325634487503e-05, "loss": 1.8327, "step": 1152 }, { "epoch": 0.3794323323735088, "grad_norm": 8.217094421386719, "learning_rate": 2.5591535605846383e-05, "loss": 1.8734, "step": 1153 }, { "epoch": 0.3797614150555327, "grad_norm": 9.918932914733887, "learning_rate": 2.5450838550670808e-05, "loss": 1.9017, "step": 1154 }, { "epoch": 0.38009049773755654, "grad_norm": 10.04442310333252, "learning_rate": 2.5310472908701555e-05, "loss": 2.0162, "step": 1155 }, { "epoch": 0.3804195804195804, "grad_norm": 9.98444938659668, "learning_rate": 2.5170439303943294e-05, "loss": 2.0051, "step": 1156 }, { "epoch": 0.38074866310160427, "grad_norm": 11.286638259887695, "learning_rate": 2.503073835892471e-05, "loss": 1.9226, "step": 1157 }, { "epoch": 0.38107774578362813, "grad_norm": 11.279165267944336, "learning_rate": 2.4891370694695517e-05, "loss": 1.9484, "step": 1158 }, { "epoch": 0.381406828465652, "grad_norm": 11.020593643188477, "learning_rate": 2.4752336930823837e-05, "loss": 1.9685, "step": 1159 }, { "epoch": 0.38173591114767585, "grad_norm": 12.053435325622559, "learning_rate": 2.4613637685393432e-05, "loss": 1.9985, "step": 1160 }, { "epoch": 0.3820649938296997, "grad_norm": 14.00677490234375, "learning_rate": 2.4475273575000936e-05, "loss": 2.059, "step": 1161 }, { "epoch": 0.3823940765117236, "grad_norm": 13.342594146728516, "learning_rate": 2.4337245214753103e-05, "loss": 1.9679, "step": 1162 }, { "epoch": 0.38272315919374744, "grad_norm": 13.582804679870605, "learning_rate": 2.4199553218264093e-05, "loss": 2.0315, "step": 1163 }, { "epoch": 0.3830522418757713, "grad_norm": 16.02691650390625, "learning_rate": 2.4062198197652752e-05, "loss": 1.9688, "step": 1164 }, { "epoch": 0.38338132455779517, "grad_norm": 14.144081115722656, "learning_rate": 2.3925180763539844e-05, "loss": 2.0025, "step": 1165 }, { "epoch": 0.38371040723981903, "grad_norm": 15.010599136352539, "learning_rate": 2.3788501525045438e-05, "loss": 2.0487, "step": 1166 }, { "epoch": 0.38403948992184284, "grad_norm": 13.84047794342041, "learning_rate": 2.3652161089786086e-05, "loss": 1.9639, "step": 1167 }, { "epoch": 0.3843685726038667, "grad_norm": 14.894204139709473, "learning_rate": 2.351616006387214e-05, "loss": 1.9797, "step": 1168 }, { "epoch": 0.38469765528589056, "grad_norm": 16.461366653442383, "learning_rate": 2.3380499051905137e-05, "loss": 2.0941, "step": 1169 }, { "epoch": 0.3850267379679144, "grad_norm": 17.398792266845703, "learning_rate": 2.324517865697501e-05, "loss": 2.059, "step": 1170 }, { "epoch": 0.3853558206499383, "grad_norm": 15.767130851745605, "learning_rate": 2.3110199480657525e-05, "loss": 2.0292, "step": 1171 }, { "epoch": 0.38568490333196215, "grad_norm": 18.126869201660156, "learning_rate": 2.2975562123011495e-05, "loss": 2.0484, "step": 1172 }, { "epoch": 0.386013986013986, "grad_norm": 17.78265380859375, "learning_rate": 2.2841267182576143e-05, "loss": 2.1811, "step": 1173 }, { "epoch": 0.3863430686960099, "grad_norm": 16.53473472595215, "learning_rate": 2.2707315256368433e-05, "loss": 2.1613, "step": 1174 }, { "epoch": 0.38667215137803373, "grad_norm": 19.978187561035156, "learning_rate": 2.2573706939880555e-05, "loss": 2.0661, "step": 1175 }, { "epoch": 0.3870012340600576, "grad_norm": 18.85763931274414, "learning_rate": 2.2440442827077045e-05, "loss": 2.1202, "step": 1176 }, { "epoch": 0.38733031674208146, "grad_norm": 19.964628219604492, "learning_rate": 2.230752351039228e-05, "loss": 2.1485, "step": 1177 }, { "epoch": 0.3876593994241053, "grad_norm": 21.469234466552734, "learning_rate": 2.2174949580727832e-05, "loss": 2.1133, "step": 1178 }, { "epoch": 0.3879884821061292, "grad_norm": 18.83524513244629, "learning_rate": 2.2042721627449846e-05, "loss": 1.9823, "step": 1179 }, { "epoch": 0.38831756478815305, "grad_norm": 23.10477638244629, "learning_rate": 2.1910840238386398e-05, "loss": 2.0482, "step": 1180 }, { "epoch": 0.3886466474701769, "grad_norm": 23.553312301635742, "learning_rate": 2.1779305999824884e-05, "loss": 2.192, "step": 1181 }, { "epoch": 0.3889757301522007, "grad_norm": 22.671098709106445, "learning_rate": 2.164811949650942e-05, "loss": 2.1453, "step": 1182 }, { "epoch": 0.3893048128342246, "grad_norm": 63.4785270690918, "learning_rate": 2.1517281311638217e-05, "loss": 2.1105, "step": 1183 }, { "epoch": 0.38963389551624844, "grad_norm": 31.181482315063477, "learning_rate": 2.1386792026861103e-05, "loss": 2.0824, "step": 1184 }, { "epoch": 0.3899629781982723, "grad_norm": 26.41512680053711, "learning_rate": 2.125665222227675e-05, "loss": 2.1978, "step": 1185 }, { "epoch": 0.39029206088029617, "grad_norm": 22.66438102722168, "learning_rate": 2.112686247643024e-05, "loss": 2.0896, "step": 1186 }, { "epoch": 0.39062114356232003, "grad_norm": 31.160655975341797, "learning_rate": 2.09974233663104e-05, "loss": 2.2079, "step": 1187 }, { "epoch": 0.3909502262443439, "grad_norm": 30.701261520385742, "learning_rate": 2.0868335467347366e-05, "loss": 2.2646, "step": 1188 }, { "epoch": 0.39127930892636775, "grad_norm": 26.341800689697266, "learning_rate": 2.073959935340988e-05, "loss": 2.3537, "step": 1189 }, { "epoch": 0.3916083916083916, "grad_norm": 30.7208309173584, "learning_rate": 2.06112155968028e-05, "loss": 2.2763, "step": 1190 }, { "epoch": 0.3919374742904155, "grad_norm": 32.32775115966797, "learning_rate": 2.0483184768264596e-05, "loss": 2.3207, "step": 1191 }, { "epoch": 0.39226655697243934, "grad_norm": 29.995956420898438, "learning_rate": 2.035550743696468e-05, "loss": 2.1887, "step": 1192 }, { "epoch": 0.3925956396544632, "grad_norm": 37.220909118652344, "learning_rate": 2.022818417050113e-05, "loss": 2.2471, "step": 1193 }, { "epoch": 0.39292472233648706, "grad_norm": 39.63582229614258, "learning_rate": 2.0101215534897855e-05, "loss": 2.3105, "step": 1194 }, { "epoch": 0.3932538050185109, "grad_norm": 36.33919906616211, "learning_rate": 1.99746020946023e-05, "loss": 2.3398, "step": 1195 }, { "epoch": 0.39358288770053473, "grad_norm": 44.34443283081055, "learning_rate": 1.9848344412482854e-05, "loss": 2.5124, "step": 1196 }, { "epoch": 0.3939119703825586, "grad_norm": 55.1679573059082, "learning_rate": 1.9722443049826344e-05, "loss": 2.6579, "step": 1197 }, { "epoch": 0.39424105306458246, "grad_norm": 61.29106903076172, "learning_rate": 1.9596898566335576e-05, "loss": 2.6228, "step": 1198 }, { "epoch": 0.3945701357466063, "grad_norm": 87.30812072753906, "learning_rate": 1.9471711520126824e-05, "loss": 2.4729, "step": 1199 }, { "epoch": 0.3948992184286302, "grad_norm": 102.99884033203125, "learning_rate": 1.9346882467727325e-05, "loss": 3.0272, "step": 1200 }, { "epoch": 0.39522830111065405, "grad_norm": 6.708251476287842, "learning_rate": 1.9222411964072884e-05, "loss": 1.9019, "step": 1201 }, { "epoch": 0.3955573837926779, "grad_norm": 7.768139839172363, "learning_rate": 1.9098300562505266e-05, "loss": 1.8629, "step": 1202 }, { "epoch": 0.39588646647470177, "grad_norm": 8.628134727478027, "learning_rate": 1.8974548814769944e-05, "loss": 1.8803, "step": 1203 }, { "epoch": 0.39621554915672563, "grad_norm": 8.367804527282715, "learning_rate": 1.8851157271013442e-05, "loss": 1.8866, "step": 1204 }, { "epoch": 0.3965446318387495, "grad_norm": 15.445680618286133, "learning_rate": 1.872812647978095e-05, "loss": 1.9183, "step": 1205 }, { "epoch": 0.39687371452077336, "grad_norm": 10.29015827178955, "learning_rate": 1.8605456988014015e-05, "loss": 1.9327, "step": 1206 }, { "epoch": 0.3972027972027972, "grad_norm": 9.444917678833008, "learning_rate": 1.8483149341047923e-05, "loss": 1.9523, "step": 1207 }, { "epoch": 0.3975318798848211, "grad_norm": 11.390829086303711, "learning_rate": 1.8361204082609352e-05, "loss": 1.9527, "step": 1208 }, { "epoch": 0.39786096256684494, "grad_norm": 12.91218090057373, "learning_rate": 1.8239621754813995e-05, "loss": 2.0042, "step": 1209 }, { "epoch": 0.39819004524886875, "grad_norm": 12.400900840759277, "learning_rate": 1.811840289816409e-05, "loss": 2.0492, "step": 1210 }, { "epoch": 0.3985191279308926, "grad_norm": 12.321341514587402, "learning_rate": 1.799754805154603e-05, "loss": 1.8665, "step": 1211 }, { "epoch": 0.3988482106129165, "grad_norm": 13.084604263305664, "learning_rate": 1.787705775222802e-05, "loss": 1.9392, "step": 1212 }, { "epoch": 0.39917729329494034, "grad_norm": 14.647891998291016, "learning_rate": 1.775693253585763e-05, "loss": 1.9775, "step": 1213 }, { "epoch": 0.3995063759769642, "grad_norm": 13.0132474899292, "learning_rate": 1.763717293645939e-05, "loss": 2.0115, "step": 1214 }, { "epoch": 0.39983545865898806, "grad_norm": 14.433551788330078, "learning_rate": 1.7517779486432495e-05, "loss": 1.9593, "step": 1215 }, { "epoch": 0.4001645413410119, "grad_norm": 14.049041748046875, "learning_rate": 1.7398752716548395e-05, "loss": 1.9478, "step": 1216 }, { "epoch": 0.4004936240230358, "grad_norm": 14.040032386779785, "learning_rate": 1.728009315594843e-05, "loss": 2.0611, "step": 1217 }, { "epoch": 0.40082270670505965, "grad_norm": 15.079927444458008, "learning_rate": 1.716180133214149e-05, "loss": 1.988, "step": 1218 }, { "epoch": 0.4011517893870835, "grad_norm": 16.497156143188477, "learning_rate": 1.704387777100165e-05, "loss": 2.0264, "step": 1219 }, { "epoch": 0.4014808720691074, "grad_norm": 16.60602569580078, "learning_rate": 1.6926322996765897e-05, "loss": 1.9747, "step": 1220 }, { "epoch": 0.40180995475113124, "grad_norm": 19.36663055419922, "learning_rate": 1.6809137532031704e-05, "loss": 1.9413, "step": 1221 }, { "epoch": 0.4021390374331551, "grad_norm": 17.419912338256836, "learning_rate": 1.6692321897754758e-05, "loss": 2.0995, "step": 1222 }, { "epoch": 0.40246812011517896, "grad_norm": 17.52667236328125, "learning_rate": 1.65758766132467e-05, "loss": 2.11, "step": 1223 }, { "epoch": 0.4027972027972028, "grad_norm": 17.958740234375, "learning_rate": 1.6459802196172668e-05, "loss": 2.0606, "step": 1224 }, { "epoch": 0.40312628547922663, "grad_norm": 17.743858337402344, "learning_rate": 1.634409916254914e-05, "loss": 2.0363, "step": 1225 }, { "epoch": 0.4034553681612505, "grad_norm": 18.795883178710938, "learning_rate": 1.622876802674158e-05, "loss": 2.0344, "step": 1226 }, { "epoch": 0.40378445084327436, "grad_norm": 20.47906494140625, "learning_rate": 1.6113809301462125e-05, "loss": 2.0609, "step": 1227 }, { "epoch": 0.4041135335252982, "grad_norm": 20.907217025756836, "learning_rate": 1.599922349776738e-05, "loss": 2.1522, "step": 1228 }, { "epoch": 0.4044426162073221, "grad_norm": 20.93229866027832, "learning_rate": 1.5885011125056047e-05, "loss": 2.0959, "step": 1229 }, { "epoch": 0.40477169888934594, "grad_norm": 21.93674659729004, "learning_rate": 1.5771172691066794e-05, "loss": 2.0773, "step": 1230 }, { "epoch": 0.4051007815713698, "grad_norm": 21.904081344604492, "learning_rate": 1.565770870187585e-05, "loss": 2.1557, "step": 1231 }, { "epoch": 0.40542986425339367, "grad_norm": 25.50005531311035, "learning_rate": 1.5544619661894864e-05, "loss": 1.9875, "step": 1232 }, { "epoch": 0.40575894693541753, "grad_norm": 25.01803207397461, "learning_rate": 1.543190607386861e-05, "loss": 2.2651, "step": 1233 }, { "epoch": 0.4060880296174414, "grad_norm": 25.391359329223633, "learning_rate": 1.5319568438872745e-05, "loss": 2.1336, "step": 1234 }, { "epoch": 0.40641711229946526, "grad_norm": 22.25278091430664, "learning_rate": 1.520760725631164e-05, "loss": 2.2059, "step": 1235 }, { "epoch": 0.4067461949814891, "grad_norm": 25.460494995117188, "learning_rate": 1.5096023023916094e-05, "loss": 2.156, "step": 1236 }, { "epoch": 0.407075277663513, "grad_norm": 28.21173095703125, "learning_rate": 1.498481623774115e-05, "loss": 2.3182, "step": 1237 }, { "epoch": 0.40740436034553684, "grad_norm": 26.0024356842041, "learning_rate": 1.4873987392163947e-05, "loss": 2.2136, "step": 1238 }, { "epoch": 0.40773344302756065, "grad_norm": 37.3074951171875, "learning_rate": 1.4763536979881354e-05, "loss": 2.2812, "step": 1239 }, { "epoch": 0.4080625257095845, "grad_norm": 36.39536666870117, "learning_rate": 1.4653465491908003e-05, "loss": 2.1709, "step": 1240 }, { "epoch": 0.4083916083916084, "grad_norm": 33.50535202026367, "learning_rate": 1.4543773417573925e-05, "loss": 2.1955, "step": 1241 }, { "epoch": 0.40872069107363224, "grad_norm": 31.410282135009766, "learning_rate": 1.4434461244522458e-05, "loss": 2.4936, "step": 1242 }, { "epoch": 0.4090497737556561, "grad_norm": 35.52507781982422, "learning_rate": 1.4325529458708065e-05, "loss": 2.3845, "step": 1243 }, { "epoch": 0.40937885643767996, "grad_norm": 46.448974609375, "learning_rate": 1.4216978544394177e-05, "loss": 2.4259, "step": 1244 }, { "epoch": 0.4097079391197038, "grad_norm": 42.85747528076172, "learning_rate": 1.4108808984151023e-05, "loss": 2.2648, "step": 1245 }, { "epoch": 0.4100370218017277, "grad_norm": 44.22357177734375, "learning_rate": 1.4001021258853509e-05, "loss": 2.5238, "step": 1246 }, { "epoch": 0.41036610448375155, "grad_norm": 50.365116119384766, "learning_rate": 1.3893615847679065e-05, "loss": 2.6135, "step": 1247 }, { "epoch": 0.4106951871657754, "grad_norm": 42.32537841796875, "learning_rate": 1.3786593228105494e-05, "loss": 2.4763, "step": 1248 }, { "epoch": 0.4110242698477993, "grad_norm": 68.29736328125, "learning_rate": 1.3679953875908957e-05, "loss": 3.0618, "step": 1249 }, { "epoch": 0.41135335252982314, "grad_norm": 89.57001495361328, "learning_rate": 1.3573698265161683e-05, "loss": 2.8523, "step": 1250 }, { "epoch": 0.411682435211847, "grad_norm": 6.144054889678955, "learning_rate": 1.3467826868229994e-05, "loss": 1.7732, "step": 1251 }, { "epoch": 0.41201151789387086, "grad_norm": 7.039280891418457, "learning_rate": 1.3362340155772146e-05, "loss": 1.8176, "step": 1252 }, { "epoch": 0.41234060057589467, "grad_norm": 8.08688735961914, "learning_rate": 1.3257238596736266e-05, "loss": 1.9427, "step": 1253 }, { "epoch": 0.41266968325791853, "grad_norm": 8.163884162902832, "learning_rate": 1.3152522658358245e-05, "loss": 1.8312, "step": 1254 }, { "epoch": 0.4129987659399424, "grad_norm": 9.327411651611328, "learning_rate": 1.3048192806159721e-05, "loss": 1.9367, "step": 1255 }, { "epoch": 0.41332784862196625, "grad_norm": 9.430188179016113, "learning_rate": 1.2944249503945894e-05, "loss": 1.7849, "step": 1256 }, { "epoch": 0.4136569313039901, "grad_norm": 10.883514404296875, "learning_rate": 1.2840693213803545e-05, "loss": 1.9261, "step": 1257 }, { "epoch": 0.413986013986014, "grad_norm": 10.201639175415039, "learning_rate": 1.2737524396099032e-05, "loss": 1.8927, "step": 1258 }, { "epoch": 0.41431509666803784, "grad_norm": 11.297063827514648, "learning_rate": 1.2634743509476088e-05, "loss": 2.0122, "step": 1259 }, { "epoch": 0.4146441793500617, "grad_norm": 10.938859939575195, "learning_rate": 1.2532351010853916e-05, "loss": 1.9315, "step": 1260 }, { "epoch": 0.41497326203208557, "grad_norm": 13.044384002685547, "learning_rate": 1.243034735542512e-05, "loss": 1.9602, "step": 1261 }, { "epoch": 0.41530234471410943, "grad_norm": 13.668478012084961, "learning_rate": 1.2328732996653669e-05, "loss": 1.9457, "step": 1262 }, { "epoch": 0.4156314273961333, "grad_norm": 12.585790634155273, "learning_rate": 1.2227508386272878e-05, "loss": 1.9089, "step": 1263 }, { "epoch": 0.41596051007815715, "grad_norm": 13.193551063537598, "learning_rate": 1.212667397428342e-05, "loss": 2.0675, "step": 1264 }, { "epoch": 0.416289592760181, "grad_norm": 13.966743469238281, "learning_rate": 1.2026230208951306e-05, "loss": 2.0223, "step": 1265 }, { "epoch": 0.4166186754422049, "grad_norm": 14.350855827331543, "learning_rate": 1.1926177536805905e-05, "loss": 2.02, "step": 1266 }, { "epoch": 0.41694775812422874, "grad_norm": 14.455948829650879, "learning_rate": 1.1826516402637989e-05, "loss": 1.9852, "step": 1267 }, { "epoch": 0.41727684080625255, "grad_norm": 14.103328704833984, "learning_rate": 1.1727247249497685e-05, "loss": 2.0047, "step": 1268 }, { "epoch": 0.4176059234882764, "grad_norm": 17.174379348754883, "learning_rate": 1.1628370518692533e-05, "loss": 2.1244, "step": 1269 }, { "epoch": 0.41793500617030027, "grad_norm": 13.484230995178223, "learning_rate": 1.152988664978556e-05, "loss": 1.9868, "step": 1270 }, { "epoch": 0.41826408885232413, "grad_norm": 16.617740631103516, "learning_rate": 1.1431796080593283e-05, "loss": 2.0345, "step": 1271 }, { "epoch": 0.418593171534348, "grad_norm": 16.100736618041992, "learning_rate": 1.1334099247183783e-05, "loss": 1.9894, "step": 1272 }, { "epoch": 0.41892225421637186, "grad_norm": 18.469749450683594, "learning_rate": 1.1236796583874787e-05, "loss": 1.9899, "step": 1273 }, { "epoch": 0.4192513368983957, "grad_norm": 16.56525993347168, "learning_rate": 1.1139888523231678e-05, "loss": 2.1198, "step": 1274 }, { "epoch": 0.4195804195804196, "grad_norm": 17.962722778320312, "learning_rate": 1.1043375496065611e-05, "loss": 2.1012, "step": 1275 }, { "epoch": 0.41990950226244345, "grad_norm": 22.29395294189453, "learning_rate": 1.0947257931431642e-05, "loss": 2.1776, "step": 1276 }, { "epoch": 0.4202385849444673, "grad_norm": 18.07130241394043, "learning_rate": 1.0851536256626705e-05, "loss": 2.0208, "step": 1277 }, { "epoch": 0.42056766762649117, "grad_norm": 20.278606414794922, "learning_rate": 1.0756210897187812e-05, "loss": 2.1435, "step": 1278 }, { "epoch": 0.42089675030851503, "grad_norm": 20.04753303527832, "learning_rate": 1.0661282276890127e-05, "loss": 2.1703, "step": 1279 }, { "epoch": 0.4212258329905389, "grad_norm": 23.56580924987793, "learning_rate": 1.0566750817745074e-05, "loss": 2.2329, "step": 1280 }, { "epoch": 0.42155491567256276, "grad_norm": 21.810321807861328, "learning_rate": 1.0472616939998492e-05, "loss": 2.1978, "step": 1281 }, { "epoch": 0.42188399835458656, "grad_norm": 22.978836059570312, "learning_rate": 1.0378881062128731e-05, "loss": 1.9538, "step": 1282 }, { "epoch": 0.4222130810366104, "grad_norm": 23.370534896850586, "learning_rate": 1.0285543600844804e-05, "loss": 2.1214, "step": 1283 }, { "epoch": 0.4225421637186343, "grad_norm": 24.7341365814209, "learning_rate": 1.019260497108453e-05, "loss": 2.1278, "step": 1284 }, { "epoch": 0.42287124640065815, "grad_norm": 31.217824935913086, "learning_rate": 1.010006558601274e-05, "loss": 2.3104, "step": 1285 }, { "epoch": 0.423200329082682, "grad_norm": 26.847412109375, "learning_rate": 1.000792585701934e-05, "loss": 2.2456, "step": 1286 }, { "epoch": 0.4235294117647059, "grad_norm": 32.30526351928711, "learning_rate": 9.91618619371757e-06, "loss": 2.1274, "step": 1287 }, { "epoch": 0.42385849444672974, "grad_norm": 33.923152923583984, "learning_rate": 9.82484700394215e-06, "loss": 2.1649, "step": 1288 }, { "epoch": 0.4241875771287536, "grad_norm": 30.774425506591797, "learning_rate": 9.73390869374743e-06, "loss": 2.2294, "step": 1289 }, { "epoch": 0.42451665981077746, "grad_norm": 30.465274810791016, "learning_rate": 9.643371667405698e-06, "loss": 2.3588, "step": 1290 }, { "epoch": 0.4248457424928013, "grad_norm": 41.59019470214844, "learning_rate": 9.553236327405246e-06, "loss": 2.391, "step": 1291 }, { "epoch": 0.4251748251748252, "grad_norm": 41.06251525878906, "learning_rate": 9.463503074448677e-06, "loss": 2.3706, "step": 1292 }, { "epoch": 0.42550390785684905, "grad_norm": 48.63645553588867, "learning_rate": 9.374172307451068e-06, "loss": 2.358, "step": 1293 }, { "epoch": 0.4258329905388729, "grad_norm": 45.9267463684082, "learning_rate": 9.285244423538197e-06, "loss": 2.3997, "step": 1294 }, { "epoch": 0.4261620732208968, "grad_norm": 49.919368743896484, "learning_rate": 9.196719818044886e-06, "loss": 2.5101, "step": 1295 }, { "epoch": 0.4264911559029206, "grad_norm": 48.8257942199707, "learning_rate": 9.108598884513053e-06, "loss": 2.5913, "step": 1296 }, { "epoch": 0.42682023858494444, "grad_norm": 46.26669692993164, "learning_rate": 9.020882014690136e-06, "loss": 2.74, "step": 1297 }, { "epoch": 0.4271493212669683, "grad_norm": 61.14147186279297, "learning_rate": 8.933569598527247e-06, "loss": 2.7426, "step": 1298 }, { "epoch": 0.42747840394899217, "grad_norm": 70.25901794433594, "learning_rate": 8.846662024177477e-06, "loss": 2.7166, "step": 1299 }, { "epoch": 0.42780748663101603, "grad_norm": 96.01586151123047, "learning_rate": 8.760159677994172e-06, "loss": 3.1068, "step": 1300 }, { "epoch": 0.4281365693130399, "grad_norm": 6.2090582847595215, "learning_rate": 8.674062944529216e-06, "loss": 1.8765, "step": 1301 }, { "epoch": 0.42846565199506376, "grad_norm": 6.953567981719971, "learning_rate": 8.588372206531292e-06, "loss": 1.8747, "step": 1302 }, { "epoch": 0.4287947346770876, "grad_norm": 7.804721832275391, "learning_rate": 8.503087844944213e-06, "loss": 1.9297, "step": 1303 }, { "epoch": 0.4291238173591115, "grad_norm": 8.776617050170898, "learning_rate": 8.418210238905256e-06, "loss": 1.9867, "step": 1304 }, { "epoch": 0.42945290004113534, "grad_norm": 9.205742835998535, "learning_rate": 8.333739765743398e-06, "loss": 1.9765, "step": 1305 }, { "epoch": 0.4297819827231592, "grad_norm": 9.957127571105957, "learning_rate": 8.249676800977658e-06, "loss": 2.0201, "step": 1306 }, { "epoch": 0.43011106540518307, "grad_norm": 10.34471321105957, "learning_rate": 8.16602171831553e-06, "loss": 2.0047, "step": 1307 }, { "epoch": 0.43044014808720693, "grad_norm": 11.813502311706543, "learning_rate": 8.082774889651168e-06, "loss": 1.9664, "step": 1308 }, { "epoch": 0.4307692307692308, "grad_norm": 11.64344310760498, "learning_rate": 7.999936685063835e-06, "loss": 1.9588, "step": 1309 }, { "epoch": 0.4310983134512546, "grad_norm": 13.429327011108398, "learning_rate": 7.91750747281621e-06, "loss": 1.993, "step": 1310 }, { "epoch": 0.43142739613327846, "grad_norm": 11.903853416442871, "learning_rate": 7.835487619352811e-06, "loss": 2.1067, "step": 1311 }, { "epoch": 0.4317564788153023, "grad_norm": 12.879226684570312, "learning_rate": 7.753877489298244e-06, "loss": 1.9805, "step": 1312 }, { "epoch": 0.4320855614973262, "grad_norm": 13.535259246826172, "learning_rate": 7.67267744545579e-06, "loss": 2.0013, "step": 1313 }, { "epoch": 0.43241464417935005, "grad_norm": 11.86251163482666, "learning_rate": 7.591887848805545e-06, "loss": 2.004, "step": 1314 }, { "epoch": 0.4327437268613739, "grad_norm": 13.795025825500488, "learning_rate": 7.5115090585029966e-06, "loss": 2.0591, "step": 1315 }, { "epoch": 0.4330728095433978, "grad_norm": 14.76348876953125, "learning_rate": 7.431541431877342e-06, "loss": 1.9661, "step": 1316 }, { "epoch": 0.43340189222542164, "grad_norm": 13.596538543701172, "learning_rate": 7.351985324429933e-06, "loss": 2.0315, "step": 1317 }, { "epoch": 0.4337309749074455, "grad_norm": 15.94446086883545, "learning_rate": 7.272841089832694e-06, "loss": 2.0748, "step": 1318 }, { "epoch": 0.43406005758946936, "grad_norm": 17.08642578125, "learning_rate": 7.194109079926514e-06, "loss": 2.0751, "step": 1319 }, { "epoch": 0.4343891402714932, "grad_norm": 15.234790802001953, "learning_rate": 7.115789644719728e-06, "loss": 2.181, "step": 1320 }, { "epoch": 0.4347182229535171, "grad_norm": 17.301694869995117, "learning_rate": 7.037883132386547e-06, "loss": 2.0382, "step": 1321 }, { "epoch": 0.43504730563554095, "grad_norm": 16.598833084106445, "learning_rate": 6.960389889265517e-06, "loss": 1.995, "step": 1322 }, { "epoch": 0.4353763883175648, "grad_norm": 16.732702255249023, "learning_rate": 6.883310259857944e-06, "loss": 2.0586, "step": 1323 }, { "epoch": 0.4357054709995887, "grad_norm": 19.75437355041504, "learning_rate": 6.806644586826383e-06, "loss": 1.9706, "step": 1324 }, { "epoch": 0.4360345536816125, "grad_norm": 18.437931060791016, "learning_rate": 6.730393210993147e-06, "loss": 2.1038, "step": 1325 }, { "epoch": 0.43636363636363634, "grad_norm": 21.67526626586914, "learning_rate": 6.654556471338746e-06, "loss": 2.0629, "step": 1326 }, { "epoch": 0.4366927190456602, "grad_norm": 18.956262588500977, "learning_rate": 6.579134705000412e-06, "loss": 2.0842, "step": 1327 }, { "epoch": 0.43702180172768407, "grad_norm": 22.002849578857422, "learning_rate": 6.504128247270546e-06, "loss": 2.0195, "step": 1328 }, { "epoch": 0.43735088440970793, "grad_norm": 17.55010414123535, "learning_rate": 6.429537431595312e-06, "loss": 2.1114, "step": 1329 }, { "epoch": 0.4376799670917318, "grad_norm": 22.961774826049805, "learning_rate": 6.355362589573077e-06, "loss": 2.128, "step": 1330 }, { "epoch": 0.43800904977375565, "grad_norm": 20.224414825439453, "learning_rate": 6.2816040509530165e-06, "loss": 2.043, "step": 1331 }, { "epoch": 0.4383381324557795, "grad_norm": 26.24925422668457, "learning_rate": 6.2082621436335475e-06, "loss": 2.1314, "step": 1332 }, { "epoch": 0.4386672151378034, "grad_norm": 24.064102172851562, "learning_rate": 6.135337193660962e-06, "loss": 2.0978, "step": 1333 }, { "epoch": 0.43899629781982724, "grad_norm": 30.092029571533203, "learning_rate": 6.062829525227909e-06, "loss": 2.321, "step": 1334 }, { "epoch": 0.4393253805018511, "grad_norm": 24.80964469909668, "learning_rate": 5.990739460672024e-06, "loss": 2.2227, "step": 1335 }, { "epoch": 0.43965446318387497, "grad_norm": 23.15215301513672, "learning_rate": 5.9190673204744255e-06, "loss": 2.239, "step": 1336 }, { "epoch": 0.43998354586589883, "grad_norm": 30.856779098510742, "learning_rate": 5.84781342325833e-06, "loss": 2.2035, "step": 1337 }, { "epoch": 0.4403126285479227, "grad_norm": 31.48607635498047, "learning_rate": 5.77697808578761e-06, "loss": 2.2536, "step": 1338 }, { "epoch": 0.4406417112299465, "grad_norm": 23.054996490478516, "learning_rate": 5.706561622965467e-06, "loss": 2.1596, "step": 1339 }, { "epoch": 0.44097079391197036, "grad_norm": 27.312984466552734, "learning_rate": 5.636564347832907e-06, "loss": 2.2025, "step": 1340 }, { "epoch": 0.4412998765939942, "grad_norm": 35.519962310791016, "learning_rate": 5.566986571567401e-06, "loss": 2.2423, "step": 1341 }, { "epoch": 0.4416289592760181, "grad_norm": 32.36489486694336, "learning_rate": 5.497828603481569e-06, "loss": 2.3924, "step": 1342 }, { "epoch": 0.44195804195804195, "grad_norm": 41.11808395385742, "learning_rate": 5.429090751021704e-06, "loss": 2.2952, "step": 1343 }, { "epoch": 0.4422871246400658, "grad_norm": 39.649845123291016, "learning_rate": 5.3607733197664436e-06, "loss": 2.3465, "step": 1344 }, { "epoch": 0.44261620732208967, "grad_norm": 41.914772033691406, "learning_rate": 5.2928766134254345e-06, "loss": 2.2136, "step": 1345 }, { "epoch": 0.44294529000411353, "grad_norm": 37.402427673339844, "learning_rate": 5.225400933837954e-06, "loss": 1.9684, "step": 1346 }, { "epoch": 0.4432743726861374, "grad_norm": 45.12228775024414, "learning_rate": 5.158346580971573e-06, "loss": 2.5793, "step": 1347 }, { "epoch": 0.44360345536816126, "grad_norm": 58.22795486450195, "learning_rate": 5.091713852920854e-06, "loss": 2.519, "step": 1348 }, { "epoch": 0.4439325380501851, "grad_norm": 78.44586944580078, "learning_rate": 5.025503045905933e-06, "loss": 2.7479, "step": 1349 }, { "epoch": 0.444261620732209, "grad_norm": 86.1622314453125, "learning_rate": 4.959714454271369e-06, "loss": 2.8948, "step": 1350 }, { "epoch": 0.44459070341423285, "grad_norm": 6.513820648193359, "learning_rate": 4.8943483704846475e-06, "loss": 1.9271, "step": 1351 }, { "epoch": 0.4449197860962567, "grad_norm": 7.70077657699585, "learning_rate": 4.829405085134997e-06, "loss": 1.9199, "step": 1352 }, { "epoch": 0.4452488687782805, "grad_norm": 8.517674446105957, "learning_rate": 4.764884886932086e-06, "loss": 1.9007, "step": 1353 }, { "epoch": 0.4455779514603044, "grad_norm": 9.393051147460938, "learning_rate": 4.700788062704687e-06, "loss": 1.9483, "step": 1354 }, { "epoch": 0.44590703414232824, "grad_norm": 10.011371612548828, "learning_rate": 4.6371148973994525e-06, "loss": 1.9036, "step": 1355 }, { "epoch": 0.4462361168243521, "grad_norm": 10.585219383239746, "learning_rate": 4.573865674079625e-06, "loss": 1.8642, "step": 1356 }, { "epoch": 0.44656519950637596, "grad_norm": 11.157539367675781, "learning_rate": 4.511040673923828e-06, "loss": 1.9377, "step": 1357 }, { "epoch": 0.4468942821883998, "grad_norm": 11.436899185180664, "learning_rate": 4.448640176224694e-06, "loss": 2.0311, "step": 1358 }, { "epoch": 0.4472233648704237, "grad_norm": 11.287192344665527, "learning_rate": 4.386664458387779e-06, "loss": 1.9436, "step": 1359 }, { "epoch": 0.44755244755244755, "grad_norm": 12.782743453979492, "learning_rate": 4.325113795930203e-06, "loss": 1.9954, "step": 1360 }, { "epoch": 0.4478815302344714, "grad_norm": 12.932801246643066, "learning_rate": 4.263988462479484e-06, "loss": 2.0857, "step": 1361 }, { "epoch": 0.4482106129164953, "grad_norm": 13.613186836242676, "learning_rate": 4.203288729772326e-06, "loss": 2.048, "step": 1362 }, { "epoch": 0.44853969559851914, "grad_norm": 14.11001205444336, "learning_rate": 4.143014867653383e-06, "loss": 1.9651, "step": 1363 }, { "epoch": 0.448868778280543, "grad_norm": 14.086909294128418, "learning_rate": 4.083167144074073e-06, "loss": 1.9946, "step": 1364 }, { "epoch": 0.44919786096256686, "grad_norm": 16.231159210205078, "learning_rate": 4.023745825091407e-06, "loss": 1.9411, "step": 1365 }, { "epoch": 0.4495269436445907, "grad_norm": 14.6588773727417, "learning_rate": 3.964751174866765e-06, "loss": 2.0319, "step": 1366 }, { "epoch": 0.4498560263266146, "grad_norm": 15.309178352355957, "learning_rate": 3.906183455664725e-06, "loss": 2.045, "step": 1367 }, { "epoch": 0.4501851090086384, "grad_norm": 15.547152519226074, "learning_rate": 3.84804292785198e-06, "loss": 2.0158, "step": 1368 }, { "epoch": 0.45051419169066226, "grad_norm": 14.245484352111816, "learning_rate": 3.7903298498960572e-06, "loss": 1.8691, "step": 1369 }, { "epoch": 0.4508432743726861, "grad_norm": 17.334442138671875, "learning_rate": 3.7330444783642338e-06, "loss": 2.104, "step": 1370 }, { "epoch": 0.45117235705471, "grad_norm": 17.768733978271484, "learning_rate": 3.676187067922421e-06, "loss": 2.0331, "step": 1371 }, { "epoch": 0.45150143973673385, "grad_norm": 16.022096633911133, "learning_rate": 3.619757871333973e-06, "loss": 2.0661, "step": 1372 }, { "epoch": 0.4518305224187577, "grad_norm": 17.654077529907227, "learning_rate": 3.563757139458579e-06, "loss": 2.0025, "step": 1373 }, { "epoch": 0.45215960510078157, "grad_norm": 18.448652267456055, "learning_rate": 3.5081851212512175e-06, "loss": 1.9865, "step": 1374 }, { "epoch": 0.45248868778280543, "grad_norm": 19.819725036621094, "learning_rate": 3.4530420637609363e-06, "loss": 2.1312, "step": 1375 }, { "epoch": 0.4528177704648293, "grad_norm": 18.95676612854004, "learning_rate": 3.3983282121298086e-06, "loss": 2.1466, "step": 1376 }, { "epoch": 0.45314685314685316, "grad_norm": 17.762617111206055, "learning_rate": 3.3440438095919126e-06, "loss": 2.1314, "step": 1377 }, { "epoch": 0.453475935828877, "grad_norm": 19.83434295654297, "learning_rate": 3.290189097472096e-06, "loss": 2.1422, "step": 1378 }, { "epoch": 0.4538050185109009, "grad_norm": 21.070661544799805, "learning_rate": 3.236764315185037e-06, "loss": 2.0593, "step": 1379 }, { "epoch": 0.45413410119292474, "grad_norm": 19.110713958740234, "learning_rate": 3.1837697002341293e-06, "loss": 2.1673, "step": 1380 }, { "epoch": 0.4544631838749486, "grad_norm": 24.10408592224121, "learning_rate": 3.131205488210409e-06, "loss": 2.098, "step": 1381 }, { "epoch": 0.4547922665569724, "grad_norm": 23.1433162689209, "learning_rate": 3.0790719127915646e-06, "loss": 2.1496, "step": 1382 }, { "epoch": 0.4551213492389963, "grad_norm": 22.71400260925293, "learning_rate": 3.0273692057408265e-06, "loss": 2.0937, "step": 1383 }, { "epoch": 0.45545043192102014, "grad_norm": 25.77712059020996, "learning_rate": 2.976097596905969e-06, "loss": 2.2775, "step": 1384 }, { "epoch": 0.455779514603044, "grad_norm": 25.014951705932617, "learning_rate": 2.9252573142183326e-06, "loss": 2.2485, "step": 1385 }, { "epoch": 0.45610859728506786, "grad_norm": 25.602083206176758, "learning_rate": 2.874848583691714e-06, "loss": 2.1574, "step": 1386 }, { "epoch": 0.4564376799670917, "grad_norm": 32.69834518432617, "learning_rate": 2.8248716294214774e-06, "loss": 2.2374, "step": 1387 }, { "epoch": 0.4567667626491156, "grad_norm": 26.588674545288086, "learning_rate": 2.7753266735834338e-06, "loss": 2.2378, "step": 1388 }, { "epoch": 0.45709584533113945, "grad_norm": 32.752559661865234, "learning_rate": 2.7262139364329643e-06, "loss": 2.3323, "step": 1389 }, { "epoch": 0.4574249280131633, "grad_norm": 29.74610137939453, "learning_rate": 2.677533636303964e-06, "loss": 2.2521, "step": 1390 }, { "epoch": 0.4577540106951872, "grad_norm": 38.49164581298828, "learning_rate": 2.6292859896079213e-06, "loss": 2.3274, "step": 1391 }, { "epoch": 0.45808309337721104, "grad_norm": 37.02663803100586, "learning_rate": 2.581471210832931e-06, "loss": 2.4391, "step": 1392 }, { "epoch": 0.4584121760592349, "grad_norm": 32.62033462524414, "learning_rate": 2.5340895125427364e-06, "loss": 2.2568, "step": 1393 }, { "epoch": 0.45874125874125876, "grad_norm": 38.69566345214844, "learning_rate": 2.4871411053757898e-06, "loss": 2.354, "step": 1394 }, { "epoch": 0.4590703414232826, "grad_norm": 37.76717758178711, "learning_rate": 2.440626198044327e-06, "loss": 2.2801, "step": 1395 }, { "epoch": 0.45939942410530643, "grad_norm": 38.9256477355957, "learning_rate": 2.394544997333437e-06, "loss": 2.5098, "step": 1396 }, { "epoch": 0.4597285067873303, "grad_norm": 81.20575714111328, "learning_rate": 2.3488977081001394e-06, "loss": 2.431, "step": 1397 }, { "epoch": 0.46005758946935416, "grad_norm": 52.67332458496094, "learning_rate": 2.3036845332724543e-06, "loss": 2.4047, "step": 1398 }, { "epoch": 0.460386672151378, "grad_norm": 65.97159576416016, "learning_rate": 2.2589056738485324e-06, "loss": 2.7279, "step": 1399 }, { "epoch": 0.4607157548334019, "grad_norm": 96.04291534423828, "learning_rate": 2.2145613288957478e-06, "loss": 2.7223, "step": 1400 }, { "epoch": 0.46104483751542574, "grad_norm": 6.366292476654053, "learning_rate": 2.170651695549786e-06, "loss": 1.9021, "step": 1401 }, { "epoch": 0.4613739201974496, "grad_norm": 8.07584285736084, "learning_rate": 2.1271769690138332e-06, "loss": 1.8857, "step": 1402 }, { "epoch": 0.46170300287947347, "grad_norm": 8.908464431762695, "learning_rate": 2.084137342557646e-06, "loss": 1.9948, "step": 1403 }, { "epoch": 0.46203208556149733, "grad_norm": 9.378193855285645, "learning_rate": 2.0415330075166937e-06, "loss": 2.0201, "step": 1404 }, { "epoch": 0.4623611682435212, "grad_norm": 10.962492942810059, "learning_rate": 1.9993641532913833e-06, "loss": 1.9552, "step": 1405 }, { "epoch": 0.46269025092554505, "grad_norm": 9.426522254943848, "learning_rate": 1.9576309673461357e-06, "loss": 1.8056, "step": 1406 }, { "epoch": 0.4630193336075689, "grad_norm": 10.506486892700195, "learning_rate": 1.916333635208556e-06, "loss": 1.8805, "step": 1407 }, { "epoch": 0.4633484162895928, "grad_norm": 11.55906867980957, "learning_rate": 1.8754723404686425e-06, "loss": 1.924, "step": 1408 }, { "epoch": 0.46367749897161664, "grad_norm": 10.870866775512695, "learning_rate": 1.8350472647780116e-06, "loss": 1.8162, "step": 1409 }, { "epoch": 0.4640065816536405, "grad_norm": 13.117429733276367, "learning_rate": 1.7950585878489856e-06, "loss": 1.9526, "step": 1410 }, { "epoch": 0.4643356643356643, "grad_norm": 13.66409969329834, "learning_rate": 1.7555064874538397e-06, "loss": 1.966, "step": 1411 }, { "epoch": 0.4646647470176882, "grad_norm": 12.832799911499023, "learning_rate": 1.7163911394240672e-06, "loss": 2.0769, "step": 1412 }, { "epoch": 0.46499382969971204, "grad_norm": 13.330915451049805, "learning_rate": 1.6777127176495043e-06, "loss": 2.0545, "step": 1413 }, { "epoch": 0.4653229123817359, "grad_norm": 14.45919418334961, "learning_rate": 1.6394713940776296e-06, "loss": 1.9912, "step": 1414 }, { "epoch": 0.46565199506375976, "grad_norm": 15.78681468963623, "learning_rate": 1.6016673387127646e-06, "loss": 2.0658, "step": 1415 }, { "epoch": 0.4659810777457836, "grad_norm": 16.10745620727539, "learning_rate": 1.5643007196153302e-06, "loss": 1.9882, "step": 1416 }, { "epoch": 0.4663101604278075, "grad_norm": 15.115486145019531, "learning_rate": 1.5273717029010925e-06, "loss": 2.0086, "step": 1417 }, { "epoch": 0.46663924310983135, "grad_norm": 17.12721061706543, "learning_rate": 1.4908804527404286e-06, "loss": 2.1053, "step": 1418 }, { "epoch": 0.4669683257918552, "grad_norm": 16.89619255065918, "learning_rate": 1.4548271313575835e-06, "loss": 1.9053, "step": 1419 }, { "epoch": 0.4672974084738791, "grad_norm": 16.430625915527344, "learning_rate": 1.4192118990299707e-06, "loss": 2.044, "step": 1420 }, { "epoch": 0.46762649115590293, "grad_norm": 17.478805541992188, "learning_rate": 1.3840349140874619e-06, "loss": 2.0037, "step": 1421 }, { "epoch": 0.4679555738379268, "grad_norm": 16.054033279418945, "learning_rate": 1.3492963329116537e-06, "loss": 1.8964, "step": 1422 }, { "epoch": 0.46828465651995066, "grad_norm": 18.172758102416992, "learning_rate": 1.3149963099352014e-06, "loss": 2.1236, "step": 1423 }, { "epoch": 0.4686137392019745, "grad_norm": 20.382368087768555, "learning_rate": 1.2811349976411202e-06, "loss": 2.072, "step": 1424 }, { "epoch": 0.46894282188399833, "grad_norm": 20.75324058532715, "learning_rate": 1.2477125465620853e-06, "loss": 2.0739, "step": 1425 }, { "epoch": 0.4692719045660222, "grad_norm": 21.989503860473633, "learning_rate": 1.2147291052798216e-06, "loss": 2.1497, "step": 1426 }, { "epoch": 0.46960098724804605, "grad_norm": 21.784547805786133, "learning_rate": 1.1821848204243814e-06, "loss": 2.1946, "step": 1427 }, { "epoch": 0.4699300699300699, "grad_norm": 20.777917861938477, "learning_rate": 1.1500798366735233e-06, "loss": 2.12, "step": 1428 }, { "epoch": 0.4702591526120938, "grad_norm": 23.172271728515625, "learning_rate": 1.1184142967520794e-06, "loss": 2.1546, "step": 1429 }, { "epoch": 0.47058823529411764, "grad_norm": 22.436458587646484, "learning_rate": 1.0871883414312777e-06, "loss": 2.1029, "step": 1430 }, { "epoch": 0.4709173179761415, "grad_norm": 25.20066261291504, "learning_rate": 1.0564021095281652e-06, "loss": 2.2805, "step": 1431 }, { "epoch": 0.47124640065816537, "grad_norm": 22.462982177734375, "learning_rate": 1.0260557379049519e-06, "loss": 2.1897, "step": 1432 }, { "epoch": 0.4715754833401892, "grad_norm": 23.205955505371094, "learning_rate": 9.96149361468457e-07, "loss": 2.0839, "step": 1433 }, { "epoch": 0.4719045660222131, "grad_norm": 27.205961227416992, "learning_rate": 9.66683113169431e-07, "loss": 2.3129, "step": 1434 }, { "epoch": 0.47223364870423695, "grad_norm": 27.13913345336914, "learning_rate": 9.376571240020227e-07, "loss": 2.109, "step": 1435 }, { "epoch": 0.4725627313862608, "grad_norm": 28.557546615600586, "learning_rate": 9.090715230031688e-07, "loss": 2.1914, "step": 1436 }, { "epoch": 0.4728918140682847, "grad_norm": 25.52642822265625, "learning_rate": 8.809264372520609e-07, "loss": 2.2227, "step": 1437 }, { "epoch": 0.47322089675030854, "grad_norm": 29.200714111328125, "learning_rate": 8.532219918695128e-07, "loss": 2.1908, "step": 1438 }, { "epoch": 0.47354997943233235, "grad_norm": 29.401416778564453, "learning_rate": 8.259583100174606e-07, "loss": 2.2153, "step": 1439 }, { "epoch": 0.4738790621143562, "grad_norm": 31.104408264160156, "learning_rate": 7.991355128984079e-07, "loss": 2.2264, "step": 1440 }, { "epoch": 0.47420814479638007, "grad_norm": 45.96186828613281, "learning_rate": 7.727537197548707e-07, "loss": 2.2206, "step": 1441 }, { "epoch": 0.47453722747840393, "grad_norm": 37.7703971862793, "learning_rate": 7.468130478688218e-07, "loss": 2.2341, "step": 1442 }, { "epoch": 0.4748663101604278, "grad_norm": 39.4819221496582, "learning_rate": 7.213136125612586e-07, "loss": 2.4188, "step": 1443 }, { "epoch": 0.47519539284245166, "grad_norm": 37.77880096435547, "learning_rate": 6.962555271915805e-07, "loss": 2.5001, "step": 1444 }, { "epoch": 0.4755244755244755, "grad_norm": 35.299259185791016, "learning_rate": 6.716389031571568e-07, "loss": 2.3435, "step": 1445 }, { "epoch": 0.4758535582064994, "grad_norm": 41.10722732543945, "learning_rate": 6.474638498928265e-07, "loss": 2.4146, "step": 1446 }, { "epoch": 0.47618264088852325, "grad_norm": 48.131595611572266, "learning_rate": 6.237304748703543e-07, "loss": 2.6015, "step": 1447 }, { "epoch": 0.4765117235705471, "grad_norm": 52.694244384765625, "learning_rate": 6.004388835980423e-07, "loss": 2.6642, "step": 1448 }, { "epoch": 0.47684080625257097, "grad_norm": 71.38502502441406, "learning_rate": 5.77589179620186e-07, "loss": 2.5632, "step": 1449 }, { "epoch": 0.47716988893459483, "grad_norm": 75.39311218261719, "learning_rate": 5.55181464516652e-07, "loss": 2.8077, "step": 1450 }, { "epoch": 0.4774989716166187, "grad_norm": 6.674432277679443, "learning_rate": 5.332158379024122e-07, "loss": 1.8294, "step": 1451 }, { "epoch": 0.47782805429864256, "grad_norm": 7.07524299621582, "learning_rate": 5.116923974270993e-07, "loss": 1.9334, "step": 1452 }, { "epoch": 0.4781571369806664, "grad_norm": 7.546227931976318, "learning_rate": 4.906112387745965e-07, "loss": 1.8997, "step": 1453 }, { "epoch": 0.4784862196626902, "grad_norm": 8.624175071716309, "learning_rate": 4.6997245566257064e-07, "loss": 2.0949, "step": 1454 }, { "epoch": 0.4788153023447141, "grad_norm": 8.930048942565918, "learning_rate": 4.497761398421063e-07, "loss": 1.9802, "step": 1455 }, { "epoch": 0.47914438502673795, "grad_norm": 8.84058952331543, "learning_rate": 4.3002238109723927e-07, "loss": 1.8811, "step": 1456 }, { "epoch": 0.4794734677087618, "grad_norm": 10.311079025268555, "learning_rate": 4.107112672446123e-07, "loss": 1.8308, "step": 1457 }, { "epoch": 0.4798025503907857, "grad_norm": 12.471019744873047, "learning_rate": 3.9184288413306456e-07, "loss": 2.0286, "step": 1458 }, { "epoch": 0.48013163307280954, "grad_norm": 11.416449546813965, "learning_rate": 3.734173156432208e-07, "loss": 1.9373, "step": 1459 }, { "epoch": 0.4804607157548334, "grad_norm": 11.909920692443848, "learning_rate": 3.554346436871581e-07, "loss": 1.9966, "step": 1460 }, { "epoch": 0.48078979843685726, "grad_norm": 14.254650115966797, "learning_rate": 3.3789494820803957e-07, "loss": 1.8936, "step": 1461 }, { "epoch": 0.4811188811188811, "grad_norm": 12.74559497833252, "learning_rate": 3.2079830717972606e-07, "loss": 1.9565, "step": 1462 }, { "epoch": 0.481447963800905, "grad_norm": 16.555574417114258, "learning_rate": 3.041447966064648e-07, "loss": 2.0124, "step": 1463 }, { "epoch": 0.48177704648292885, "grad_norm": 15.371599197387695, "learning_rate": 2.8793449052254563e-07, "loss": 1.9626, "step": 1464 }, { "epoch": 0.4821061291649527, "grad_norm": 13.81943130493164, "learning_rate": 2.721674609919345e-07, "loss": 1.9951, "step": 1465 }, { "epoch": 0.4824352118469766, "grad_norm": 14.830611228942871, "learning_rate": 2.568437781080069e-07, "loss": 2.0027, "step": 1466 }, { "epoch": 0.48276429452900044, "grad_norm": 14.844118118286133, "learning_rate": 2.4196350999320384e-07, "loss": 2.0695, "step": 1467 }, { "epoch": 0.48309337721102424, "grad_norm": 17.546384811401367, "learning_rate": 2.275267227987321e-07, "loss": 2.1974, "step": 1468 }, { "epoch": 0.4834224598930481, "grad_norm": 15.326736450195312, "learning_rate": 2.135334807042866e-07, "loss": 2.0453, "step": 1469 }, { "epoch": 0.48375154257507197, "grad_norm": 17.10259437561035, "learning_rate": 1.9998384591773944e-07, "loss": 2.0812, "step": 1470 }, { "epoch": 0.48408062525709583, "grad_norm": 21.315500259399414, "learning_rate": 1.8687787867489592e-07, "loss": 2.1536, "step": 1471 }, { "epoch": 0.4844097079391197, "grad_norm": 17.803470611572266, "learning_rate": 1.7421563723919454e-07, "loss": 2.0989, "step": 1472 }, { "epoch": 0.48473879062114356, "grad_norm": 17.617263793945312, "learning_rate": 1.6199717790145174e-07, "loss": 2.1418, "step": 1473 }, { "epoch": 0.4850678733031674, "grad_norm": 22.622848510742188, "learning_rate": 1.5022255497962879e-07, "loss": 2.1478, "step": 1474 }, { "epoch": 0.4853969559851913, "grad_norm": 20.80709457397461, "learning_rate": 1.3889182081860962e-07, "loss": 2.1255, "step": 1475 }, { "epoch": 0.48572603866721514, "grad_norm": 19.131553649902344, "learning_rate": 1.2800502578991235e-07, "loss": 2.0341, "step": 1476 }, { "epoch": 0.486055121349239, "grad_norm": 20.654287338256836, "learning_rate": 1.1756221829148928e-07, "loss": 2.2056, "step": 1477 }, { "epoch": 0.48638420403126287, "grad_norm": 20.79045867919922, "learning_rate": 1.0756344474753821e-07, "loss": 2.2221, "step": 1478 }, { "epoch": 0.48671328671328673, "grad_norm": 22.550579071044922, "learning_rate": 9.800874960826933e-08, "loss": 2.1065, "step": 1479 }, { "epoch": 0.4870423693953106, "grad_norm": 22.112821578979492, "learning_rate": 8.889817534969425e-08, "loss": 2.0922, "step": 1480 }, { "epoch": 0.48737145207733445, "grad_norm": 24.239492416381836, "learning_rate": 8.023176247348163e-08, "loss": 2.1649, "step": 1481 }, { "epoch": 0.48770053475935826, "grad_norm": 20.545568466186523, "learning_rate": 7.200954950673522e-08, "loss": 2.1906, "step": 1482 }, { "epoch": 0.4880296174413821, "grad_norm": 23.218069076538086, "learning_rate": 6.423157300184946e-08, "loss": 2.022, "step": 1483 }, { "epoch": 0.488358700123406, "grad_norm": 23.711633682250977, "learning_rate": 5.6897867536331864e-08, "loss": 2.1665, "step": 1484 }, { "epoch": 0.48868778280542985, "grad_norm": 27.987411499023438, "learning_rate": 5.000846571264761e-08, "loss": 2.1722, "step": 1485 }, { "epoch": 0.4890168654874537, "grad_norm": 24.78904151916504, "learning_rate": 4.35633981580974e-08, "loss": 2.154, "step": 1486 }, { "epoch": 0.4893459481694776, "grad_norm": 28.390851974487305, "learning_rate": 3.756269352462871e-08, "loss": 2.3073, "step": 1487 }, { "epoch": 0.48967503085150144, "grad_norm": 32.638423919677734, "learning_rate": 3.20063784888025e-08, "loss": 2.1804, "step": 1488 }, { "epoch": 0.4900041135335253, "grad_norm": 35.395267486572266, "learning_rate": 2.6894477751548964e-08, "loss": 2.1746, "step": 1489 }, { "epoch": 0.49033319621554916, "grad_norm": 35.180233001708984, "learning_rate": 2.222701403818972e-08, "loss": 2.3482, "step": 1490 }, { "epoch": 0.490662278897573, "grad_norm": 30.182146072387695, "learning_rate": 1.8004008098226887e-08, "loss": 2.1675, "step": 1491 }, { "epoch": 0.4909913615795969, "grad_norm": 32.82539749145508, "learning_rate": 1.4225478705309769e-08, "loss": 2.2513, "step": 1492 }, { "epoch": 0.49132044426162075, "grad_norm": 32.21061706542969, "learning_rate": 1.0891442657134932e-08, "loss": 2.184, "step": 1493 }, { "epoch": 0.4916495269436446, "grad_norm": 44.68870162963867, "learning_rate": 8.001914775401798e-09, "loss": 2.3492, "step": 1494 }, { "epoch": 0.4919786096256685, "grad_norm": 43.02174377441406, "learning_rate": 5.5569079056794206e-09, "loss": 2.4599, "step": 1495 }, { "epoch": 0.49230769230769234, "grad_norm": 50.33421325683594, "learning_rate": 3.5564329174064824e-09, "loss": 2.3369, "step": 1496 }, { "epoch": 0.49263677498971614, "grad_norm": 53.78901672363281, "learning_rate": 2.0004987038246824e-09, "loss": 2.615, "step": 1497 }, { "epoch": 0.49296585767174, "grad_norm": 49.755619049072266, "learning_rate": 8.891121819565306e-10, "loss": 2.2884, "step": 1498 }, { "epoch": 0.49329494035376387, "grad_norm": 63.387603759765625, "learning_rate": 2.2227829252763344e-10, "loss": 2.4959, "step": 1499 }, { "epoch": 0.49362402303578773, "grad_norm": 88.65837860107422, "learning_rate": 0.0, "loss": 3.0448, "step": 1500 }, { "epoch": 0.49362402303578773, "eval_loss": 1.9142155647277832, "eval_runtime": 163.0928, "eval_samples_per_second": 31.381, "eval_steps_per_second": 15.69, "step": 1500 } ], "logging_steps": 1, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 375, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.6443765859038e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }