{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.752688172043011, "eval_steps": 1000, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005973715651135006, "grad_norm": 8.9375, "learning_rate": 2e-06, "loss": 1.3168, "step": 1 }, { "epoch": 0.05973715651135006, "grad_norm": 0.61328125, "learning_rate": 0.0002, "loss": 0.5503, "step": 100 }, { "epoch": 0.11947431302270012, "grad_norm": 0.314453125, "learning_rate": 0.0004, "loss": 0.3885, "step": 200 }, { "epoch": 0.17921146953405018, "grad_norm": 0.298828125, "learning_rate": 0.0006, "loss": 0.3758, "step": 300 }, { "epoch": 0.23894862604540024, "grad_norm": 0.12890625, "learning_rate": 0.0008, "loss": 0.363, "step": 400 }, { "epoch": 0.2986857825567503, "grad_norm": 0.1943359375, "learning_rate": 0.001, "loss": 0.3567, "step": 500 }, { "epoch": 0.35842293906810035, "grad_norm": 0.11181640625, "learning_rate": 0.0012, "loss": 0.3435, "step": 600 }, { "epoch": 0.41816009557945044, "grad_norm": 0.12109375, "learning_rate": 0.0014, "loss": 0.3415, "step": 700 }, { "epoch": 0.4778972520908005, "grad_norm": 0.09130859375, "learning_rate": 0.0016, "loss": 0.3394, "step": 800 }, { "epoch": 0.5376344086021505, "grad_norm": 0.11572265625, "learning_rate": 0.0018000000000000002, "loss": 0.345, "step": 900 }, { "epoch": 0.5973715651135006, "grad_norm": 0.07421875, "learning_rate": 0.002, "loss": 0.3177, "step": 1000 }, { "epoch": 0.6571087216248507, "grad_norm": 0.06494140625, "learning_rate": 0.0019998292504580526, "loss": 0.3318, "step": 1100 }, { "epoch": 0.7168458781362007, "grad_norm": 0.1455078125, "learning_rate": 0.001999317060143023, "loss": 0.3229, "step": 1200 }, { "epoch": 0.7765830346475507, "grad_norm": 0.1298828125, "learning_rate": 0.001998463603967434, "loss": 0.3145, "step": 1300 }, { "epoch": 0.8363201911589009, "grad_norm": 0.0732421875, "learning_rate": 0.0019972691733857882, "loss": 0.3158, "step": 1400 }, { "epoch": 0.8960573476702509, "grad_norm": 0.13671875, "learning_rate": 0.0019957341762950344, "loss": 0.3132, "step": 1500 }, { "epoch": 0.955794504181601, "grad_norm": 0.11572265625, "learning_rate": 0.001993859136895274, "loss": 0.3011, "step": 1600 }, { "epoch": 1.015531660692951, "grad_norm": 0.125, "learning_rate": 0.0019916446955107426, "loss": 0.3014, "step": 1700 }, { "epoch": 1.075268817204301, "grad_norm": 0.11279296875, "learning_rate": 0.001989091608371146, "loss": 0.3, "step": 1800 }, { "epoch": 1.135005973715651, "grad_norm": 0.08544921875, "learning_rate": 0.0019862007473534027, "loss": 0.3016, "step": 1900 }, { "epoch": 1.1947431302270013, "grad_norm": 0.10791015625, "learning_rate": 0.001982973099683902, "loss": 0.2991, "step": 2000 }, { "epoch": 1.2544802867383513, "grad_norm": 0.103515625, "learning_rate": 0.001979409767601366, "loss": 0.2974, "step": 2100 }, { "epoch": 1.3142174432497014, "grad_norm": 0.08056640625, "learning_rate": 0.001975511967980437, "loss": 0.2903, "step": 2200 }, { "epoch": 1.3739545997610514, "grad_norm": 0.109375, "learning_rate": 0.001971281031916114, "loss": 0.2796, "step": 2300 }, { "epoch": 1.4336917562724014, "grad_norm": 0.125, "learning_rate": 0.0019667184042691877, "loss": 0.2746, "step": 2400 }, { "epoch": 1.4934289127837514, "grad_norm": 0.10400390625, "learning_rate": 0.001961825643172819, "loss": 0.2766, "step": 2500 }, { "epoch": 1.5531660692951015, "grad_norm": 0.06494140625, "learning_rate": 0.0019566044195004407, "loss": 0.2739, "step": 2600 }, { "epoch": 1.6129032258064515, "grad_norm": 0.12255859375, "learning_rate": 0.0019510565162951536, "loss": 0.2677, "step": 2700 }, { "epoch": 1.6726403823178018, "grad_norm": 0.0869140625, "learning_rate": 0.0019451838281608197, "loss": 0.2605, "step": 2800 }, { "epoch": 1.7323775388291518, "grad_norm": 0.08837890625, "learning_rate": 0.0019389883606150567, "loss": 0.259, "step": 2900 }, { "epoch": 1.7921146953405018, "grad_norm": 0.1279296875, "learning_rate": 0.0019324722294043557, "loss": 0.256, "step": 3000 }, { "epoch": 1.8518518518518519, "grad_norm": 0.1279296875, "learning_rate": 0.0019256376597815564, "loss": 0.246, "step": 3100 }, { "epoch": 1.911589008363202, "grad_norm": 0.12890625, "learning_rate": 0.001918486985745923, "loss": 0.2431, "step": 3200 }, { "epoch": 1.971326164874552, "grad_norm": 0.083984375, "learning_rate": 0.0019110226492460884, "loss": 0.2364, "step": 3300 }, { "epoch": 2.031063321385902, "grad_norm": 0.0927734375, "learning_rate": 0.0019032471993461289, "loss": 0.2306, "step": 3400 }, { "epoch": 2.090800477897252, "grad_norm": 0.087890625, "learning_rate": 0.0018951632913550625, "loss": 0.2331, "step": 3500 }, { "epoch": 2.150537634408602, "grad_norm": 0.091796875, "learning_rate": 0.0018867736859200619, "loss": 0.2291, "step": 3600 }, { "epoch": 2.2102747909199523, "grad_norm": 0.0849609375, "learning_rate": 0.0018780812480836979, "loss": 0.227, "step": 3700 }, { "epoch": 2.270011947431302, "grad_norm": 0.1015625, "learning_rate": 0.0018690889463055284, "loss": 0.2255, "step": 3800 }, { "epoch": 2.3297491039426523, "grad_norm": 0.0703125, "learning_rate": 0.0018597998514483724, "loss": 0.2165, "step": 3900 }, { "epoch": 2.3894862604540026, "grad_norm": 0.091796875, "learning_rate": 0.0018502171357296143, "loss": 0.2157, "step": 4000 }, { "epoch": 2.4492234169653524, "grad_norm": 0.08740234375, "learning_rate": 0.0018403440716378927, "loss": 0.2086, "step": 4100 }, { "epoch": 2.5089605734767026, "grad_norm": 0.111328125, "learning_rate": 0.0018301840308155505, "loss": 0.2077, "step": 4200 }, { "epoch": 2.5686977299880525, "grad_norm": 0.09619140625, "learning_rate": 0.0018197404829072212, "loss": 0.201, "step": 4300 }, { "epoch": 2.6284348864994027, "grad_norm": 0.11083984375, "learning_rate": 0.0018090169943749475, "loss": 0.1991, "step": 4400 }, { "epoch": 2.6881720430107525, "grad_norm": 0.0830078125, "learning_rate": 0.0017980172272802398, "loss": 0.1961, "step": 4500 }, { "epoch": 2.7479091995221028, "grad_norm": 0.10107421875, "learning_rate": 0.0017867449380334832, "loss": 0.1934, "step": 4600 }, { "epoch": 2.807646356033453, "grad_norm": 0.1064453125, "learning_rate": 0.0017752039761111298, "loss": 0.1862, "step": 4700 }, { "epoch": 2.867383512544803, "grad_norm": 0.08154296875, "learning_rate": 0.001763398282741103, "loss": 0.1849, "step": 4800 }, { "epoch": 2.927120669056153, "grad_norm": 0.0869140625, "learning_rate": 0.0017513318895568735, "loss": 0.1828, "step": 4900 }, { "epoch": 2.986857825567503, "grad_norm": 0.09619140625, "learning_rate": 0.001739008917220659, "loss": 0.1769, "step": 5000 }, { "epoch": 3.046594982078853, "grad_norm": 0.107421875, "learning_rate": 0.0017264335740162242, "loss": 0.1729, "step": 5100 }, { "epoch": 3.106332138590203, "grad_norm": 0.0859375, "learning_rate": 0.0017136101544117524, "loss": 0.1745, "step": 5200 }, { "epoch": 3.166069295101553, "grad_norm": 0.107421875, "learning_rate": 0.0017005430375932908, "loss": 0.1742, "step": 5300 }, { "epoch": 3.225806451612903, "grad_norm": 0.08837890625, "learning_rate": 0.0016872366859692627, "loss": 0.175, "step": 5400 }, { "epoch": 3.2855436081242533, "grad_norm": 0.126953125, "learning_rate": 0.0016736956436465573, "loss": 0.1643, "step": 5500 }, { "epoch": 3.3452807646356035, "grad_norm": 0.07666015625, "learning_rate": 0.0016599245348787228, "loss": 0.1651, "step": 5600 }, { "epoch": 3.4050179211469533, "grad_norm": 0.12158203125, "learning_rate": 0.0016459280624867873, "loss": 0.1613, "step": 5700 }, { "epoch": 3.4647550776583036, "grad_norm": 0.1396484375, "learning_rate": 0.001631711006253251, "loss": 0.1586, "step": 5800 }, { "epoch": 3.5244922341696534, "grad_norm": 0.09228515625, "learning_rate": 0.001617278221289793, "loss": 0.1601, "step": 5900 }, { "epoch": 3.5842293906810037, "grad_norm": 0.0888671875, "learning_rate": 0.0016026346363792565, "loss": 0.1515, "step": 6000 }, { "epoch": 3.6439665471923535, "grad_norm": 0.1259765625, "learning_rate": 0.0015877852522924731, "loss": 0.1492, "step": 6100 }, { "epoch": 3.7037037037037037, "grad_norm": 0.0771484375, "learning_rate": 0.0015727351400805052, "loss": 0.1514, "step": 6200 }, { "epoch": 3.763440860215054, "grad_norm": 0.0771484375, "learning_rate": 0.0015574894393428856, "loss": 0.1462, "step": 6300 }, { "epoch": 3.823178016726404, "grad_norm": 0.078125, "learning_rate": 0.0015420533564724495, "loss": 0.143, "step": 6400 }, { "epoch": 3.882915173237754, "grad_norm": 0.0673828125, "learning_rate": 0.0015264321628773558, "loss": 0.1426, "step": 6500 }, { "epoch": 3.942652329749104, "grad_norm": 0.07421875, "learning_rate": 0.001510631193180907, "loss": 0.1391, "step": 6600 }, { "epoch": 4.002389486260454, "grad_norm": 0.10107421875, "learning_rate": 0.001494655843399779, "loss": 0.1355, "step": 6700 }, { "epoch": 4.062126642771804, "grad_norm": 0.0791015625, "learning_rate": 0.0014785115691012866, "loss": 0.1335, "step": 6800 }, { "epoch": 4.121863799283154, "grad_norm": 0.09375, "learning_rate": 0.0014622038835403132, "loss": 0.1398, "step": 6900 }, { "epoch": 4.181600955794504, "grad_norm": 0.08203125, "learning_rate": 0.0014457383557765385, "loss": 0.1342, "step": 7000 }, { "epoch": 4.241338112305854, "grad_norm": 0.1064453125, "learning_rate": 0.001429120608772609, "loss": 0.135, "step": 7100 }, { "epoch": 4.301075268817204, "grad_norm": 0.07958984375, "learning_rate": 0.0014123563174739035, "loss": 0.1297, "step": 7200 }, { "epoch": 4.360812425328555, "grad_norm": 0.0927734375, "learning_rate": 0.0013954512068705424, "loss": 0.1294, "step": 7300 }, { "epoch": 4.4205495818399045, "grad_norm": 0.09423828125, "learning_rate": 0.0013784110500423103, "loss": 0.1257, "step": 7400 }, { "epoch": 4.480286738351254, "grad_norm": 0.076171875, "learning_rate": 0.0013612416661871532, "loss": 0.1249, "step": 7500 }, { "epoch": 4.540023894862604, "grad_norm": 0.064453125, "learning_rate": 0.0013439489186339282, "loss": 0.1243, "step": 7600 }, { "epoch": 4.599761051373955, "grad_norm": 0.07275390625, "learning_rate": 0.0013265387128400831, "loss": 0.1191, "step": 7700 }, { "epoch": 4.659498207885305, "grad_norm": 0.06591796875, "learning_rate": 0.0013090169943749475, "loss": 0.1188, "step": 7800 }, { "epoch": 4.7192353643966545, "grad_norm": 0.0703125, "learning_rate": 0.0012913897468893247, "loss": 0.1174, "step": 7900 }, { "epoch": 4.778972520908005, "grad_norm": 0.061279296875, "learning_rate": 0.0012736629900720832, "loss": 0.1168, "step": 8000 }, { "epoch": 4.838709677419355, "grad_norm": 0.06640625, "learning_rate": 0.0012558427775944357, "loss": 0.1135, "step": 8100 }, { "epoch": 4.898446833930705, "grad_norm": 0.09912109375, "learning_rate": 0.0012379351950426187, "loss": 0.1117, "step": 8200 }, { "epoch": 4.958183990442055, "grad_norm": 0.05712890625, "learning_rate": 0.0012199463578396689, "loss": 0.1092, "step": 8300 }, { "epoch": 5.017921146953405, "grad_norm": 0.07861328125, "learning_rate": 0.0012018824091570102, "loss": 0.1091, "step": 8400 }, { "epoch": 5.077658303464755, "grad_norm": 0.0859375, "learning_rate": 0.0011837495178165704, "loss": 0.1086, "step": 8500 }, { "epoch": 5.137395459976105, "grad_norm": 0.06103515625, "learning_rate": 0.00116555387618413, "loss": 0.109, "step": 8600 }, { "epoch": 5.197132616487456, "grad_norm": 0.0869140625, "learning_rate": 0.0011473016980546376, "loss": 0.1071, "step": 8700 }, { "epoch": 5.256869772998805, "grad_norm": 0.0703125, "learning_rate": 0.0011289992165302034, "loss": 0.1081, "step": 8800 }, { "epoch": 5.316606929510155, "grad_norm": 0.0869140625, "learning_rate": 0.001110652681891501, "loss": 0.1061, "step": 8900 }, { "epoch": 5.376344086021505, "grad_norm": 0.123046875, "learning_rate": 0.001092268359463302, "loss": 0.1044, "step": 9000 }, { "epoch": 5.436081242532856, "grad_norm": 0.07177734375, "learning_rate": 0.001073852527474874, "loss": 0.1004, "step": 9100 }, { "epoch": 5.4958183990442055, "grad_norm": 0.09130859375, "learning_rate": 0.00105541147491597, "loss": 0.1004, "step": 9200 }, { "epoch": 5.555555555555555, "grad_norm": 0.058837890625, "learning_rate": 0.0010369514993891452, "loss": 0.098, "step": 9300 }, { "epoch": 5.615292712066906, "grad_norm": 0.057861328125, "learning_rate": 0.00101847890495913, "loss": 0.0989, "step": 9400 }, { "epoch": 5.675029868578256, "grad_norm": 0.07421875, "learning_rate": 0.001, "loss": 0.0954, "step": 9500 }, { "epoch": 5.734767025089606, "grad_norm": 0.09375, "learning_rate": 0.0009815210950408703, "loss": 0.0955, "step": 9600 }, { "epoch": 5.7945041816009555, "grad_norm": 0.0654296875, "learning_rate": 0.0009630485006108553, "loss": 0.094, "step": 9700 }, { "epoch": 5.854241338112306, "grad_norm": 0.06787109375, "learning_rate": 0.0009445885250840301, "loss": 0.0924, "step": 9800 }, { "epoch": 5.913978494623656, "grad_norm": 0.06298828125, "learning_rate": 0.0009261474725251261, "loss": 0.0911, "step": 9900 }, { "epoch": 5.973715651135006, "grad_norm": 0.054443359375, "learning_rate": 0.0009077316405366981, "loss": 0.0902, "step": 10000 }, { "epoch": 6.033452807646356, "grad_norm": 0.0673828125, "learning_rate": 0.0008893473181084994, "loss": 0.0891, "step": 10100 }, { "epoch": 6.093189964157706, "grad_norm": 0.06103515625, "learning_rate": 0.000871000783469797, "loss": 0.0914, "step": 10200 }, { "epoch": 6.152927120669056, "grad_norm": 0.053466796875, "learning_rate": 0.0008526983019453623, "loss": 0.0872, "step": 10300 }, { "epoch": 6.212664277180406, "grad_norm": 0.08740234375, "learning_rate": 0.00083444612381587, "loss": 0.0903, "step": 10400 }, { "epoch": 6.272401433691757, "grad_norm": 0.0712890625, "learning_rate": 0.0008162504821834296, "loss": 0.0858, "step": 10500 }, { "epoch": 6.332138590203106, "grad_norm": 0.051513671875, "learning_rate": 0.00079811759084299, "loss": 0.0876, "step": 10600 }, { "epoch": 6.391875746714456, "grad_norm": 0.05810546875, "learning_rate": 0.0007800536421603317, "loss": 0.0874, "step": 10700 }, { "epoch": 6.451612903225806, "grad_norm": 0.060546875, "learning_rate": 0.0007620648049573815, "loss": 0.0834, "step": 10800 }, { "epoch": 6.511350059737157, "grad_norm": 0.05859375, "learning_rate": 0.0007441572224055644, "loss": 0.0814, "step": 10900 }, { "epoch": 6.571087216248507, "grad_norm": 0.06494140625, "learning_rate": 0.0007263370099279172, "loss": 0.0822, "step": 11000 }, { "epoch": 6.630824372759856, "grad_norm": 0.04833984375, "learning_rate": 0.0007086102531106754, "loss": 0.0814, "step": 11100 }, { "epoch": 6.690561529271207, "grad_norm": 0.072265625, "learning_rate": 0.0006909830056250527, "loss": 0.0803, "step": 11200 }, { "epoch": 6.750298685782557, "grad_norm": 0.052734375, "learning_rate": 0.0006734612871599168, "loss": 0.0802, "step": 11300 }, { "epoch": 6.810035842293907, "grad_norm": 0.053955078125, "learning_rate": 0.0006560510813660718, "loss": 0.0781, "step": 11400 }, { "epoch": 6.8697729988052565, "grad_norm": 0.051025390625, "learning_rate": 0.0006387583338128471, "loss": 0.0779, "step": 11500 }, { "epoch": 6.929510155316607, "grad_norm": 0.08544921875, "learning_rate": 0.0006215889499576897, "loss": 0.0783, "step": 11600 }, { "epoch": 6.989247311827957, "grad_norm": 0.0732421875, "learning_rate": 0.0006045487931294575, "loss": 0.0765, "step": 11700 }, { "epoch": 7.048984468339307, "grad_norm": 0.062255859375, "learning_rate": 0.0005876436825260967, "loss": 0.0764, "step": 11800 }, { "epoch": 7.1087216248506575, "grad_norm": 0.06298828125, "learning_rate": 0.000570879391227391, "loss": 0.0773, "step": 11900 }, { "epoch": 7.168458781362007, "grad_norm": 0.056640625, "learning_rate": 0.0005542616442234618, "loss": 0.0759, "step": 12000 }, { "epoch": 7.228195937873357, "grad_norm": 0.042236328125, "learning_rate": 0.0005377961164596869, "loss": 0.0783, "step": 12100 }, { "epoch": 7.287933094384707, "grad_norm": 0.053955078125, "learning_rate": 0.0005214884308987136, "loss": 0.0757, "step": 12200 }, { "epoch": 7.347670250896058, "grad_norm": 0.053955078125, "learning_rate": 0.0005053441566002214, "loss": 0.0768, "step": 12300 }, { "epoch": 7.407407407407407, "grad_norm": 0.0556640625, "learning_rate": 0.0004893688068190932, "loss": 0.0746, "step": 12400 }, { "epoch": 7.467144563918757, "grad_norm": 0.0537109375, "learning_rate": 0.0004735678371226441, "loss": 0.0749, "step": 12500 }, { "epoch": 7.526881720430108, "grad_norm": 0.057861328125, "learning_rate": 0.00045794664352755057, "loss": 0.0728, "step": 12600 }, { "epoch": 7.586618876941458, "grad_norm": 0.05419921875, "learning_rate": 0.0004425105606571145, "loss": 0.0732, "step": 12700 }, { "epoch": 7.646356033452808, "grad_norm": 0.0498046875, "learning_rate": 0.00042726485991949483, "loss": 0.0731, "step": 12800 }, { "epoch": 7.706093189964157, "grad_norm": 0.05859375, "learning_rate": 0.00041221474770752696, "loss": 0.0728, "step": 12900 }, { "epoch": 7.765830346475508, "grad_norm": 0.046630859375, "learning_rate": 0.0003973653636207437, "loss": 0.072, "step": 13000 }, { "epoch": 7.825567502986858, "grad_norm": 0.043212890625, "learning_rate": 0.0003827217787102072, "loss": 0.0718, "step": 13100 }, { "epoch": 7.885304659498208, "grad_norm": 0.0400390625, "learning_rate": 0.0003682889937467493, "loss": 0.0721, "step": 13200 }, { "epoch": 7.945041816009558, "grad_norm": 0.047607421875, "learning_rate": 0.00035407193751321286, "loss": 0.0703, "step": 13300 }, { "epoch": 1.004778972520908, "grad_norm": 0.057861328125, "learning_rate": 0.0003400754651212776, "loss": 0.0716, "step": 13400 }, { "epoch": 1.064516129032258, "grad_norm": 0.049072265625, "learning_rate": 0.0003263043563534428, "loss": 0.0711, "step": 13500 }, { "epoch": 1.124253285543608, "grad_norm": 0.039794921875, "learning_rate": 0.0003127633140307373, "loss": 0.0714, "step": 13600 }, { "epoch": 1.183990442054958, "grad_norm": 0.046630859375, "learning_rate": 0.00029945696240670904, "loss": 0.0728, "step": 13700 }, { "epoch": 1.2437275985663083, "grad_norm": 0.06005859375, "learning_rate": 0.00028638984558824776, "loss": 0.0733, "step": 13800 }, { "epoch": 1.3034647550776584, "grad_norm": 0.04833984375, "learning_rate": 0.000273566425983776, "loss": 0.071, "step": 13900 }, { "epoch": 1.3632019115890084, "grad_norm": 0.042724609375, "learning_rate": 0.000260991082779341, "loss": 0.0729, "step": 14000 }, { "epoch": 1.4229390681003584, "grad_norm": 0.0380859375, "learning_rate": 0.00024866811044312666, "loss": 0.0711, "step": 14100 }, { "epoch": 1.4826762246117084, "grad_norm": 0.043701171875, "learning_rate": 0.00023660171725889702, "loss": 0.0704, "step": 14200 }, { "epoch": 1.5424133811230587, "grad_norm": 0.04931640625, "learning_rate": 0.0002247960238888701, "loss": 0.0705, "step": 14300 }, { "epoch": 1.6021505376344085, "grad_norm": 0.04638671875, "learning_rate": 0.00021325506196651677, "loss": 0.0709, "step": 14400 }, { "epoch": 1.6618876941457588, "grad_norm": 0.034423828125, "learning_rate": 0.0002019827727197605, "loss": 0.0708, "step": 14500 }, { "epoch": 1.7216248506571086, "grad_norm": 0.04541015625, "learning_rate": 0.00019098300562505265, "loss": 0.0692, "step": 14600 }, { "epoch": 1.7813620071684588, "grad_norm": 0.056396484375, "learning_rate": 0.000180259517092779, "loss": 0.0702, "step": 14700 }, { "epoch": 1.8410991636798089, "grad_norm": 0.050048828125, "learning_rate": 0.00016981596918444952, "loss": 0.0701, "step": 14800 }, { "epoch": 1.900836320191159, "grad_norm": 0.049560546875, "learning_rate": 0.0001596559283621074, "loss": 0.0695, "step": 14900 }, { "epoch": 1.960573476702509, "grad_norm": 0.05224609375, "learning_rate": 0.00014978286427038602, "loss": 0.0701, "step": 15000 }, { "epoch": 2.020310633213859, "grad_norm": 0.05419921875, "learning_rate": 0.00014020014855162756, "loss": 0.0684, "step": 15100 }, { "epoch": 2.080047789725209, "grad_norm": 0.040283203125, "learning_rate": 0.00013091105369447166, "loss": 0.072, "step": 15200 }, { "epoch": 2.139784946236559, "grad_norm": 0.040283203125, "learning_rate": 0.00012191875191630208, "loss": 0.0695, "step": 15300 }, { "epoch": 2.1995221027479093, "grad_norm": 0.051025390625, "learning_rate": 0.00011322631407993811, "loss": 0.0717, "step": 15400 }, { "epoch": 2.2592592592592595, "grad_norm": 0.046142578125, "learning_rate": 0.00010483670864493777, "loss": 0.0721, "step": 15500 }, { "epoch": 2.3189964157706093, "grad_norm": 0.036865234375, "learning_rate": 9.675280065387115e-05, "loss": 0.0704, "step": 15600 }, { "epoch": 2.378733572281959, "grad_norm": 0.040771484375, "learning_rate": 8.897735075391155e-05, "loss": 0.0711, "step": 15700 }, { "epoch": 2.4384707287933094, "grad_norm": 0.0322265625, "learning_rate": 8.151301425407698e-05, "loss": 0.0698, "step": 15800 }, { "epoch": 2.4982078853046596, "grad_norm": 0.043701171875, "learning_rate": 7.43623402184438e-05, "loss": 0.07, "step": 15900 }, { "epoch": 2.5579450418160095, "grad_norm": 0.041015625, "learning_rate": 6.75277705956443e-05, "loss": 0.0694, "step": 16000 }, { "epoch": 2.6176821983273597, "grad_norm": 0.046630859375, "learning_rate": 6.1011639384943586e-05, "loss": 0.0713, "step": 16100 }, { "epoch": 2.67741935483871, "grad_norm": 0.037353515625, "learning_rate": 5.481617183918053e-05, "loss": 0.069, "step": 16200 }, { "epoch": 2.7371565113500598, "grad_norm": 0.0380859375, "learning_rate": 4.894348370484647e-05, "loss": 0.0701, "step": 16300 }, { "epoch": 2.7968936678614096, "grad_norm": 0.04296875, "learning_rate": 4.339558049955927e-05, "loss": 0.0692, "step": 16400 }, { "epoch": 2.85663082437276, "grad_norm": 0.043212890625, "learning_rate": 3.817435682718096e-05, "loss": 0.0692, "step": 16500 }, { "epoch": 2.91636798088411, "grad_norm": 0.05078125, "learning_rate": 3.3281595730812576e-05, "loss": 0.0696, "step": 16600 }, { "epoch": 2.97610513739546, "grad_norm": 0.041015625, "learning_rate": 2.8718968083886077e-05, "loss": 0.0695, "step": 16700 }, { "epoch": 3.03584229390681, "grad_norm": 0.053955078125, "learning_rate": 2.44880320195634e-05, "loss": 0.0686, "step": 16800 }, { "epoch": 3.09557945041816, "grad_norm": 0.041015625, "learning_rate": 2.059023239863411e-05, "loss": 0.0713, "step": 16900 }, { "epoch": 3.15531660692951, "grad_norm": 0.04248046875, "learning_rate": 1.7026900316098216e-05, "loss": 0.069, "step": 17000 }, { "epoch": 3.21505376344086, "grad_norm": 0.0498046875, "learning_rate": 1.3799252646597427e-05, "loss": 0.0726, "step": 17100 }, { "epoch": 3.2747909199522103, "grad_norm": 0.04248046875, "learning_rate": 1.0908391628854042e-05, "loss": 0.0703, "step": 17200 }, { "epoch": 3.3345280764635605, "grad_norm": 0.049560546875, "learning_rate": 8.355304489257254e-06, "loss": 0.0711, "step": 17300 }, { "epoch": 3.3942652329749103, "grad_norm": 0.059814453125, "learning_rate": 6.140863104726391e-06, "loss": 0.072, "step": 17400 }, { "epoch": 3.4540023894862606, "grad_norm": 0.046142578125, "learning_rate": 4.265823704965532e-06, "loss": 0.0696, "step": 17500 }, { "epoch": 3.5137395459976104, "grad_norm": 0.0693359375, "learning_rate": 2.730826614211979e-06, "loss": 0.0685, "step": 17600 }, { "epoch": 3.5734767025089607, "grad_norm": 0.055419921875, "learning_rate": 1.5363960325660565e-06, "loss": 0.07, "step": 17700 }, { "epoch": 3.6332138590203105, "grad_norm": 0.046630859375, "learning_rate": 6.829398569770939e-07, "loss": 0.0708, "step": 17800 }, { "epoch": 3.6929510155316607, "grad_norm": 0.049560546875, "learning_rate": 1.7074954194729043e-07, "loss": 0.0694, "step": 17900 }, { "epoch": 3.752688172043011, "grad_norm": 0.036865234375, "learning_rate": 0.0, "loss": 0.0694, "step": 18000 } ], "logging_steps": 100, "max_steps": 18000, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.467487355285709e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }