{ "best_metric": 78.0784, "best_model_checkpoint": "marianMT_hin_eng_cs/checkpoint-22360", "epoch": 20.0, "eval_steps": 500, "global_step": 22360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008944543828264758, "grad_norm": 568.685302734375, "learning_rate": 3.3333333333333334e-08, "loss": 45.8279, "step": 10 }, { "epoch": 0.017889087656529516, "grad_norm": 569.685546875, "learning_rate": 6.666666666666667e-08, "loss": 45.7174, "step": 20 }, { "epoch": 0.026833631484794274, "grad_norm": 574.8485717773438, "learning_rate": 1e-07, "loss": 45.079, "step": 30 }, { "epoch": 0.03577817531305903, "grad_norm": 567.7886962890625, "learning_rate": 1.3333333333333334e-07, "loss": 43.9196, "step": 40 }, { "epoch": 0.044722719141323794, "grad_norm": 576.7008056640625, "learning_rate": 1.6666666666666665e-07, "loss": 43.168, "step": 50 }, { "epoch": 0.05366726296958855, "grad_norm": 562.3006591796875, "learning_rate": 2e-07, "loss": 41.2498, "step": 60 }, { "epoch": 0.0626118067978533, "grad_norm": 570.4864501953125, "learning_rate": 2.3333333333333333e-07, "loss": 39.3382, "step": 70 }, { "epoch": 0.07155635062611806, "grad_norm": 574.6784057617188, "learning_rate": 2.6666666666666667e-07, "loss": 37.5161, "step": 80 }, { "epoch": 0.08050089445438283, "grad_norm": 569.1917114257812, "learning_rate": 3e-07, "loss": 34.6571, "step": 90 }, { "epoch": 0.08944543828264759, "grad_norm": 575.496826171875, "learning_rate": 3.333333333333333e-07, "loss": 31.8409, "step": 100 }, { "epoch": 0.09838998211091235, "grad_norm": 559.3043212890625, "learning_rate": 3.666666666666666e-07, "loss": 28.8879, "step": 110 }, { "epoch": 0.1073345259391771, "grad_norm": 549.9949951171875, "learning_rate": 4e-07, "loss": 25.2081, "step": 120 }, { "epoch": 0.11627906976744186, "grad_norm": 503.8363037109375, "learning_rate": 4.3333333333333335e-07, "loss": 21.1121, "step": 130 }, { "epoch": 0.1252236135957066, "grad_norm": 403.3987121582031, "learning_rate": 4.6666666666666666e-07, "loss": 17.6223, "step": 140 }, { "epoch": 0.13416815742397137, "grad_norm": 300.0085754394531, "learning_rate": 5e-07, "loss": 14.2539, "step": 150 }, { "epoch": 0.14311270125223613, "grad_norm": 184.18865966796875, "learning_rate": 5.333333333333333e-07, "loss": 11.7277, "step": 160 }, { "epoch": 0.1520572450805009, "grad_norm": 87.85770416259766, "learning_rate": 5.666666666666666e-07, "loss": 9.9414, "step": 170 }, { "epoch": 0.16100178890876565, "grad_norm": 60.96449279785156, "learning_rate": 6e-07, "loss": 9.0615, "step": 180 }, { "epoch": 0.16994633273703041, "grad_norm": 48.676456451416016, "learning_rate": 6.333333333333332e-07, "loss": 8.4672, "step": 190 }, { "epoch": 0.17889087656529518, "grad_norm": 43.717464447021484, "learning_rate": 6.666666666666666e-07, "loss": 7.8418, "step": 200 }, { "epoch": 0.18783542039355994, "grad_norm": 38.93547439575195, "learning_rate": 7e-07, "loss": 7.3526, "step": 210 }, { "epoch": 0.1967799642218247, "grad_norm": 34.23944854736328, "learning_rate": 7.333333333333332e-07, "loss": 6.9987, "step": 220 }, { "epoch": 0.20572450805008943, "grad_norm": 31.877201080322266, "learning_rate": 7.666666666666667e-07, "loss": 6.4741, "step": 230 }, { "epoch": 0.2146690518783542, "grad_norm": 26.220779418945312, "learning_rate": 8e-07, "loss": 6.1131, "step": 240 }, { "epoch": 0.22361359570661896, "grad_norm": 24.312641143798828, "learning_rate": 8.333333333333333e-07, "loss": 5.864, "step": 250 }, { "epoch": 0.23255813953488372, "grad_norm": 21.310455322265625, "learning_rate": 8.666666666666667e-07, "loss": 5.6485, "step": 260 }, { "epoch": 0.24150268336314848, "grad_norm": 20.5892276763916, "learning_rate": 9e-07, "loss": 5.4319, "step": 270 }, { "epoch": 0.2504472271914132, "grad_norm": 16.048423767089844, "learning_rate": 9.333333333333333e-07, "loss": 5.2566, "step": 280 }, { "epoch": 0.259391771019678, "grad_norm": 14.29564094543457, "learning_rate": 9.666666666666666e-07, "loss": 5.0271, "step": 290 }, { "epoch": 0.26833631484794274, "grad_norm": 12.640876770019531, "learning_rate": 1e-06, "loss": 4.9135, "step": 300 }, { "epoch": 0.2772808586762075, "grad_norm": 11.171175003051758, "learning_rate": 9.99699157641396e-07, "loss": 4.7181, "step": 310 }, { "epoch": 0.28622540250447226, "grad_norm": 11.868614196777344, "learning_rate": 9.993983152827917e-07, "loss": 4.6156, "step": 320 }, { "epoch": 0.295169946332737, "grad_norm": 9.595853805541992, "learning_rate": 9.990974729241877e-07, "loss": 4.5295, "step": 330 }, { "epoch": 0.3041144901610018, "grad_norm": 8.965413093566895, "learning_rate": 9.987966305655835e-07, "loss": 4.3876, "step": 340 }, { "epoch": 0.31305903398926654, "grad_norm": 8.445356369018555, "learning_rate": 9.984957882069795e-07, "loss": 4.3003, "step": 350 }, { "epoch": 0.3220035778175313, "grad_norm": 7.859620094299316, "learning_rate": 9.981949458483753e-07, "loss": 4.1741, "step": 360 }, { "epoch": 0.33094812164579607, "grad_norm": 7.521224498748779, "learning_rate": 9.978941034897713e-07, "loss": 4.1204, "step": 370 }, { "epoch": 0.33989266547406083, "grad_norm": 7.16605281829834, "learning_rate": 9.975932611311673e-07, "loss": 4.025, "step": 380 }, { "epoch": 0.3488372093023256, "grad_norm": 7.4897990226745605, "learning_rate": 9.97292418772563e-07, "loss": 3.9424, "step": 390 }, { "epoch": 0.35778175313059035, "grad_norm": 6.378204345703125, "learning_rate": 9.96991576413959e-07, "loss": 3.9045, "step": 400 }, { "epoch": 0.3667262969588551, "grad_norm": 7.619123935699463, "learning_rate": 9.96690734055355e-07, "loss": 3.8186, "step": 410 }, { "epoch": 0.3756708407871199, "grad_norm": 6.114331245422363, "learning_rate": 9.963898916967508e-07, "loss": 3.7494, "step": 420 }, { "epoch": 0.38461538461538464, "grad_norm": 6.384728908538818, "learning_rate": 9.960890493381468e-07, "loss": 3.7179, "step": 430 }, { "epoch": 0.3935599284436494, "grad_norm": 5.996768951416016, "learning_rate": 9.957882069795426e-07, "loss": 3.6116, "step": 440 }, { "epoch": 0.40250447227191416, "grad_norm": 6.2389140129089355, "learning_rate": 9.954873646209386e-07, "loss": 3.5626, "step": 450 }, { "epoch": 0.41144901610017887, "grad_norm": 6.4500651359558105, "learning_rate": 9.951865222623344e-07, "loss": 3.4897, "step": 460 }, { "epoch": 0.4203935599284436, "grad_norm": 6.330971717834473, "learning_rate": 9.948856799037304e-07, "loss": 3.4178, "step": 470 }, { "epoch": 0.4293381037567084, "grad_norm": 5.717202663421631, "learning_rate": 9.945848375451264e-07, "loss": 3.3517, "step": 480 }, { "epoch": 0.43828264758497315, "grad_norm": 5.807801723480225, "learning_rate": 9.942839951865222e-07, "loss": 3.2986, "step": 490 }, { "epoch": 0.4472271914132379, "grad_norm": 5.935306072235107, "learning_rate": 9.939831528279182e-07, "loss": 3.2396, "step": 500 }, { "epoch": 0.4561717352415027, "grad_norm": 6.021726608276367, "learning_rate": 9.93682310469314e-07, "loss": 3.1649, "step": 510 }, { "epoch": 0.46511627906976744, "grad_norm": 5.559403419494629, "learning_rate": 9.9338146811071e-07, "loss": 3.1074, "step": 520 }, { "epoch": 0.4740608228980322, "grad_norm": 5.631515026092529, "learning_rate": 9.930806257521057e-07, "loss": 3.049, "step": 530 }, { "epoch": 0.48300536672629696, "grad_norm": 5.0299391746521, "learning_rate": 9.927797833935017e-07, "loss": 2.999, "step": 540 }, { "epoch": 0.4919499105545617, "grad_norm": 5.350959777832031, "learning_rate": 9.924789410348977e-07, "loss": 2.9523, "step": 550 }, { "epoch": 0.5008944543828264, "grad_norm": 5.76137638092041, "learning_rate": 9.921780986762935e-07, "loss": 2.8882, "step": 560 }, { "epoch": 0.5098389982110912, "grad_norm": 5.530500411987305, "learning_rate": 9.918772563176895e-07, "loss": 2.8392, "step": 570 }, { "epoch": 0.518783542039356, "grad_norm": 5.285860538482666, "learning_rate": 9.915764139590855e-07, "loss": 2.8097, "step": 580 }, { "epoch": 0.5277280858676208, "grad_norm": 5.185799598693848, "learning_rate": 9.912755716004813e-07, "loss": 2.7741, "step": 590 }, { "epoch": 0.5366726296958855, "grad_norm": 5.190760135650635, "learning_rate": 9.909747292418773e-07, "loss": 2.7067, "step": 600 }, { "epoch": 0.5456171735241503, "grad_norm": 5.321168899536133, "learning_rate": 9.90673886883273e-07, "loss": 2.6636, "step": 610 }, { "epoch": 0.554561717352415, "grad_norm": 4.955347537994385, "learning_rate": 9.90373044524669e-07, "loss": 2.6456, "step": 620 }, { "epoch": 0.5635062611806798, "grad_norm": 5.024599552154541, "learning_rate": 9.900722021660649e-07, "loss": 2.5973, "step": 630 }, { "epoch": 0.5724508050089445, "grad_norm": 4.905087471008301, "learning_rate": 9.897713598074608e-07, "loss": 2.5483, "step": 640 }, { "epoch": 0.5813953488372093, "grad_norm": 4.616291522979736, "learning_rate": 9.894705174488568e-07, "loss": 2.5395, "step": 650 }, { "epoch": 0.590339892665474, "grad_norm": 5.065720081329346, "learning_rate": 9.891696750902526e-07, "loss": 2.4616, "step": 660 }, { "epoch": 0.5992844364937389, "grad_norm": 4.8041558265686035, "learning_rate": 9.888688327316486e-07, "loss": 2.4739, "step": 670 }, { "epoch": 0.6082289803220036, "grad_norm": 5.215109348297119, "learning_rate": 9.885679903730444e-07, "loss": 2.4249, "step": 680 }, { "epoch": 0.6171735241502684, "grad_norm": 4.656722545623779, "learning_rate": 9.882671480144404e-07, "loss": 2.3901, "step": 690 }, { "epoch": 0.6261180679785331, "grad_norm": 4.943691730499268, "learning_rate": 9.879663056558362e-07, "loss": 2.3465, "step": 700 }, { "epoch": 0.6350626118067979, "grad_norm": 4.540484428405762, "learning_rate": 9.876654632972322e-07, "loss": 2.3274, "step": 710 }, { "epoch": 0.6440071556350626, "grad_norm": 4.563231468200684, "learning_rate": 9.873646209386282e-07, "loss": 2.2895, "step": 720 }, { "epoch": 0.6529516994633273, "grad_norm": 4.655714988708496, "learning_rate": 9.87063778580024e-07, "loss": 2.2459, "step": 730 }, { "epoch": 0.6618962432915921, "grad_norm": 4.837522029876709, "learning_rate": 9.8676293622142e-07, "loss": 2.225, "step": 740 }, { "epoch": 0.6708407871198568, "grad_norm": 4.510364532470703, "learning_rate": 9.86462093862816e-07, "loss": 2.2263, "step": 750 }, { "epoch": 0.6797853309481217, "grad_norm": 4.513600826263428, "learning_rate": 9.861612515042117e-07, "loss": 2.2018, "step": 760 }, { "epoch": 0.6887298747763864, "grad_norm": 4.477957248687744, "learning_rate": 9.858604091456077e-07, "loss": 2.161, "step": 770 }, { "epoch": 0.6976744186046512, "grad_norm": 4.512912750244141, "learning_rate": 9.855595667870035e-07, "loss": 2.151, "step": 780 }, { "epoch": 0.7066189624329159, "grad_norm": 5.71266508102417, "learning_rate": 9.852587244283995e-07, "loss": 2.0973, "step": 790 }, { "epoch": 0.7155635062611807, "grad_norm": 5.014021873474121, "learning_rate": 9.849578820697953e-07, "loss": 2.1292, "step": 800 }, { "epoch": 0.7245080500894454, "grad_norm": 4.280351638793945, "learning_rate": 9.846570397111913e-07, "loss": 2.061, "step": 810 }, { "epoch": 0.7334525939177102, "grad_norm": 4.5558929443359375, "learning_rate": 9.843561973525873e-07, "loss": 2.0387, "step": 820 }, { "epoch": 0.7423971377459749, "grad_norm": 4.458014011383057, "learning_rate": 9.84055354993983e-07, "loss": 2.0474, "step": 830 }, { "epoch": 0.7513416815742398, "grad_norm": 4.254577159881592, "learning_rate": 9.83754512635379e-07, "loss": 2.0304, "step": 840 }, { "epoch": 0.7602862254025045, "grad_norm": 4.3591742515563965, "learning_rate": 9.834536702767749e-07, "loss": 2.0006, "step": 850 }, { "epoch": 0.7692307692307693, "grad_norm": 4.3223042488098145, "learning_rate": 9.831528279181708e-07, "loss": 1.973, "step": 860 }, { "epoch": 0.778175313059034, "grad_norm": 4.374037742614746, "learning_rate": 9.828519855595666e-07, "loss": 1.9684, "step": 870 }, { "epoch": 0.7871198568872988, "grad_norm": 4.324105739593506, "learning_rate": 9.825511432009626e-07, "loss": 1.9514, "step": 880 }, { "epoch": 0.7960644007155635, "grad_norm": 4.300051212310791, "learning_rate": 9.822503008423586e-07, "loss": 1.9275, "step": 890 }, { "epoch": 0.8050089445438283, "grad_norm": 4.329366207122803, "learning_rate": 9.819494584837544e-07, "loss": 1.8943, "step": 900 }, { "epoch": 0.813953488372093, "grad_norm": 4.005756855010986, "learning_rate": 9.816486161251504e-07, "loss": 1.8852, "step": 910 }, { "epoch": 0.8228980322003577, "grad_norm": 4.359870910644531, "learning_rate": 9.813477737665464e-07, "loss": 1.87, "step": 920 }, { "epoch": 0.8318425760286225, "grad_norm": 4.084425926208496, "learning_rate": 9.810469314079422e-07, "loss": 1.8368, "step": 930 }, { "epoch": 0.8407871198568873, "grad_norm": 3.967466354370117, "learning_rate": 9.807460890493382e-07, "loss": 1.8078, "step": 940 }, { "epoch": 0.8497316636851521, "grad_norm": 4.0641021728515625, "learning_rate": 9.80445246690734e-07, "loss": 1.8084, "step": 950 }, { "epoch": 0.8586762075134168, "grad_norm": 4.061304092407227, "learning_rate": 9.8014440433213e-07, "loss": 1.801, "step": 960 }, { "epoch": 0.8676207513416816, "grad_norm": 4.145083427429199, "learning_rate": 9.798435619735257e-07, "loss": 1.7596, "step": 970 }, { "epoch": 0.8765652951699463, "grad_norm": 4.002536296844482, "learning_rate": 9.795427196149217e-07, "loss": 1.7762, "step": 980 }, { "epoch": 0.8855098389982111, "grad_norm": 3.7484164237976074, "learning_rate": 9.792418772563177e-07, "loss": 1.7532, "step": 990 }, { "epoch": 0.8944543828264758, "grad_norm": 4.02016544342041, "learning_rate": 9.789410348977135e-07, "loss": 1.729, "step": 1000 }, { "epoch": 0.9033989266547406, "grad_norm": 4.240043640136719, "learning_rate": 9.786401925391095e-07, "loss": 1.71, "step": 1010 }, { "epoch": 0.9123434704830053, "grad_norm": 3.7680490016937256, "learning_rate": 9.783393501805053e-07, "loss": 1.6782, "step": 1020 }, { "epoch": 0.9212880143112702, "grad_norm": 3.9167776107788086, "learning_rate": 9.780385078219013e-07, "loss": 1.6722, "step": 1030 }, { "epoch": 0.9302325581395349, "grad_norm": 4.05009126663208, "learning_rate": 9.77737665463297e-07, "loss": 1.6392, "step": 1040 }, { "epoch": 0.9391771019677997, "grad_norm": 4.031562805175781, "learning_rate": 9.77436823104693e-07, "loss": 1.6431, "step": 1050 }, { "epoch": 0.9481216457960644, "grad_norm": 4.088625431060791, "learning_rate": 9.77135980746089e-07, "loss": 1.6517, "step": 1060 }, { "epoch": 0.9570661896243292, "grad_norm": 4.250485420227051, "learning_rate": 9.768351383874849e-07, "loss": 1.6238, "step": 1070 }, { "epoch": 0.9660107334525939, "grad_norm": 4.007058143615723, "learning_rate": 9.765342960288808e-07, "loss": 1.6368, "step": 1080 }, { "epoch": 0.9749552772808586, "grad_norm": 4.101932525634766, "learning_rate": 9.762334536702768e-07, "loss": 1.6112, "step": 1090 }, { "epoch": 0.9838998211091234, "grad_norm": 3.9927737712860107, "learning_rate": 9.759326113116726e-07, "loss": 1.5906, "step": 1100 }, { "epoch": 0.9928443649373881, "grad_norm": 3.737098455429077, "learning_rate": 9.756317689530686e-07, "loss": 1.5823, "step": 1110 }, { "epoch": 1.0, "eval_bleu": 11.6257, "eval_gen_len": 77.1622, "eval_loss": 1.177813172340393, "eval_runtime": 59.2168, "eval_samples_per_second": 17.596, "eval_steps_per_second": 0.186, "step": 1118 }, { "epoch": 1.0017889087656529, "grad_norm": 4.285216808319092, "learning_rate": 9.753309265944644e-07, "loss": 1.5971, "step": 1120 }, { "epoch": 1.0107334525939178, "grad_norm": 4.146667003631592, "learning_rate": 9.750300842358604e-07, "loss": 1.5671, "step": 1130 }, { "epoch": 1.0196779964221825, "grad_norm": 4.014759063720703, "learning_rate": 9.747292418772562e-07, "loss": 1.5578, "step": 1140 }, { "epoch": 1.0286225402504472, "grad_norm": 3.8846042156219482, "learning_rate": 9.744283995186522e-07, "loss": 1.5398, "step": 1150 }, { "epoch": 1.037567084078712, "grad_norm": 3.8807787895202637, "learning_rate": 9.741275571600482e-07, "loss": 1.5235, "step": 1160 }, { "epoch": 1.0465116279069768, "grad_norm": 3.5784103870391846, "learning_rate": 9.73826714801444e-07, "loss": 1.5122, "step": 1170 }, { "epoch": 1.0554561717352415, "grad_norm": 3.704495668411255, "learning_rate": 9.7352587244284e-07, "loss": 1.4861, "step": 1180 }, { "epoch": 1.0644007155635062, "grad_norm": 3.9804067611694336, "learning_rate": 9.732250300842357e-07, "loss": 1.4979, "step": 1190 }, { "epoch": 1.073345259391771, "grad_norm": 3.8511621952056885, "learning_rate": 9.729241877256317e-07, "loss": 1.4808, "step": 1200 }, { "epoch": 1.0822898032200359, "grad_norm": 3.7490718364715576, "learning_rate": 9.726233453670275e-07, "loss": 1.4304, "step": 1210 }, { "epoch": 1.0912343470483006, "grad_norm": 3.821725845336914, "learning_rate": 9.723225030084235e-07, "loss": 1.4809, "step": 1220 }, { "epoch": 1.1001788908765653, "grad_norm": 3.8172216415405273, "learning_rate": 9.720216606498195e-07, "loss": 1.4608, "step": 1230 }, { "epoch": 1.10912343470483, "grad_norm": 4.13401460647583, "learning_rate": 9.717208182912153e-07, "loss": 1.461, "step": 1240 }, { "epoch": 1.118067978533095, "grad_norm": 3.9874379634857178, "learning_rate": 9.714199759326113e-07, "loss": 1.4375, "step": 1250 }, { "epoch": 1.1270125223613596, "grad_norm": 3.4529519081115723, "learning_rate": 9.711191335740073e-07, "loss": 1.4295, "step": 1260 }, { "epoch": 1.1359570661896243, "grad_norm": 3.6411478519439697, "learning_rate": 9.70818291215403e-07, "loss": 1.3979, "step": 1270 }, { "epoch": 1.144901610017889, "grad_norm": 3.712270736694336, "learning_rate": 9.70517448856799e-07, "loss": 1.4054, "step": 1280 }, { "epoch": 1.1538461538461537, "grad_norm": 3.6629765033721924, "learning_rate": 9.702166064981949e-07, "loss": 1.4118, "step": 1290 }, { "epoch": 1.1627906976744187, "grad_norm": 3.4929187297821045, "learning_rate": 9.699157641395908e-07, "loss": 1.3981, "step": 1300 }, { "epoch": 1.1717352415026834, "grad_norm": 3.4843080043792725, "learning_rate": 9.696149217809866e-07, "loss": 1.3595, "step": 1310 }, { "epoch": 1.180679785330948, "grad_norm": 3.445066452026367, "learning_rate": 9.693140794223826e-07, "loss": 1.4054, "step": 1320 }, { "epoch": 1.1896243291592128, "grad_norm": 3.7293801307678223, "learning_rate": 9.690132370637786e-07, "loss": 1.3854, "step": 1330 }, { "epoch": 1.1985688729874777, "grad_norm": 3.4736623764038086, "learning_rate": 9.687123947051744e-07, "loss": 1.3661, "step": 1340 }, { "epoch": 1.2075134168157424, "grad_norm": 3.5197012424468994, "learning_rate": 9.684115523465704e-07, "loss": 1.3532, "step": 1350 }, { "epoch": 1.2164579606440071, "grad_norm": 3.8631348609924316, "learning_rate": 9.681107099879662e-07, "loss": 1.3467, "step": 1360 }, { "epoch": 1.2254025044722718, "grad_norm": 3.636857509613037, "learning_rate": 9.678098676293622e-07, "loss": 1.3086, "step": 1370 }, { "epoch": 1.2343470483005368, "grad_norm": 3.636439561843872, "learning_rate": 9.67509025270758e-07, "loss": 1.3196, "step": 1380 }, { "epoch": 1.2432915921288015, "grad_norm": 3.594397783279419, "learning_rate": 9.67208182912154e-07, "loss": 1.3261, "step": 1390 }, { "epoch": 1.2522361359570662, "grad_norm": 3.3458898067474365, "learning_rate": 9.6690734055355e-07, "loss": 1.3092, "step": 1400 }, { "epoch": 1.2611806797853309, "grad_norm": 3.4665396213531494, "learning_rate": 9.666064981949457e-07, "loss": 1.2903, "step": 1410 }, { "epoch": 1.2701252236135958, "grad_norm": 3.383054733276367, "learning_rate": 9.663056558363417e-07, "loss": 1.2863, "step": 1420 }, { "epoch": 1.2790697674418605, "grad_norm": 3.463866949081421, "learning_rate": 9.660048134777377e-07, "loss": 1.28, "step": 1430 }, { "epoch": 1.2880143112701252, "grad_norm": 3.318000078201294, "learning_rate": 9.657039711191335e-07, "loss": 1.2701, "step": 1440 }, { "epoch": 1.29695885509839, "grad_norm": 3.5626983642578125, "learning_rate": 9.654031287605295e-07, "loss": 1.2798, "step": 1450 }, { "epoch": 1.3059033989266546, "grad_norm": 3.5325281620025635, "learning_rate": 9.651022864019253e-07, "loss": 1.2848, "step": 1460 }, { "epoch": 1.3148479427549196, "grad_norm": 3.1660006046295166, "learning_rate": 9.648014440433213e-07, "loss": 1.2856, "step": 1470 }, { "epoch": 1.3237924865831843, "grad_norm": 3.7746942043304443, "learning_rate": 9.64500601684717e-07, "loss": 1.2511, "step": 1480 }, { "epoch": 1.332737030411449, "grad_norm": 3.2455437183380127, "learning_rate": 9.64199759326113e-07, "loss": 1.2239, "step": 1490 }, { "epoch": 1.341681574239714, "grad_norm": 3.6265106201171875, "learning_rate": 9.63898916967509e-07, "loss": 1.2483, "step": 1500 }, { "epoch": 1.3506261180679786, "grad_norm": 3.450028419494629, "learning_rate": 9.635980746089049e-07, "loss": 1.2594, "step": 1510 }, { "epoch": 1.3595706618962433, "grad_norm": 3.248667001724243, "learning_rate": 9.632972322503009e-07, "loss": 1.2371, "step": 1520 }, { "epoch": 1.368515205724508, "grad_norm": 3.4102725982666016, "learning_rate": 9.629963898916966e-07, "loss": 1.2297, "step": 1530 }, { "epoch": 1.3774597495527727, "grad_norm": 3.563889265060425, "learning_rate": 9.626955475330926e-07, "loss": 1.2094, "step": 1540 }, { "epoch": 1.3864042933810374, "grad_norm": 3.4570846557617188, "learning_rate": 9.623947051744884e-07, "loss": 1.1988, "step": 1550 }, { "epoch": 1.3953488372093024, "grad_norm": 3.3971352577209473, "learning_rate": 9.620938628158844e-07, "loss": 1.214, "step": 1560 }, { "epoch": 1.404293381037567, "grad_norm": 3.2068612575531006, "learning_rate": 9.617930204572804e-07, "loss": 1.1866, "step": 1570 }, { "epoch": 1.4132379248658318, "grad_norm": 3.515761137008667, "learning_rate": 9.614921780986762e-07, "loss": 1.1937, "step": 1580 }, { "epoch": 1.4221824686940967, "grad_norm": 3.3083653450012207, "learning_rate": 9.611913357400722e-07, "loss": 1.1842, "step": 1590 }, { "epoch": 1.4311270125223614, "grad_norm": 3.1191821098327637, "learning_rate": 9.608904933814682e-07, "loss": 1.1759, "step": 1600 }, { "epoch": 1.4400715563506261, "grad_norm": 3.5757179260253906, "learning_rate": 9.60589651022864e-07, "loss": 1.1686, "step": 1610 }, { "epoch": 1.4490161001788908, "grad_norm": 3.2756786346435547, "learning_rate": 9.6028880866426e-07, "loss": 1.1844, "step": 1620 }, { "epoch": 1.4579606440071555, "grad_norm": 3.5108566284179688, "learning_rate": 9.599879663056557e-07, "loss": 1.1476, "step": 1630 }, { "epoch": 1.4669051878354205, "grad_norm": 3.323366641998291, "learning_rate": 9.596871239470517e-07, "loss": 1.1609, "step": 1640 }, { "epoch": 1.4758497316636852, "grad_norm": 3.339520215988159, "learning_rate": 9.593862815884475e-07, "loss": 1.1642, "step": 1650 }, { "epoch": 1.4847942754919499, "grad_norm": 3.155186176300049, "learning_rate": 9.590854392298435e-07, "loss": 1.1638, "step": 1660 }, { "epoch": 1.4937388193202148, "grad_norm": 3.370572805404663, "learning_rate": 9.587845968712395e-07, "loss": 1.1436, "step": 1670 }, { "epoch": 1.5026833631484795, "grad_norm": 3.346024751663208, "learning_rate": 9.584837545126353e-07, "loss": 1.1417, "step": 1680 }, { "epoch": 1.5116279069767442, "grad_norm": 3.285853862762451, "learning_rate": 9.581829121540313e-07, "loss": 1.1113, "step": 1690 }, { "epoch": 1.520572450805009, "grad_norm": 3.1128010749816895, "learning_rate": 9.57882069795427e-07, "loss": 1.1205, "step": 1700 }, { "epoch": 1.5295169946332736, "grad_norm": 3.3890109062194824, "learning_rate": 9.57581227436823e-07, "loss": 1.13, "step": 1710 }, { "epoch": 1.5384615384615383, "grad_norm": 3.10841703414917, "learning_rate": 9.572803850782189e-07, "loss": 1.105, "step": 1720 }, { "epoch": 1.5474060822898033, "grad_norm": 3.3679182529449463, "learning_rate": 9.569795427196149e-07, "loss": 1.1032, "step": 1730 }, { "epoch": 1.556350626118068, "grad_norm": 3.1372768878936768, "learning_rate": 9.566787003610109e-07, "loss": 1.0836, "step": 1740 }, { "epoch": 1.5652951699463329, "grad_norm": 3.139641284942627, "learning_rate": 9.563778580024066e-07, "loss": 1.1069, "step": 1750 }, { "epoch": 1.5742397137745976, "grad_norm": 3.0833637714385986, "learning_rate": 9.560770156438026e-07, "loss": 1.0943, "step": 1760 }, { "epoch": 1.5831842576028623, "grad_norm": 3.8753104209899902, "learning_rate": 9.557761732851986e-07, "loss": 1.1077, "step": 1770 }, { "epoch": 1.592128801431127, "grad_norm": 3.6157336235046387, "learning_rate": 9.554753309265944e-07, "loss": 1.0878, "step": 1780 }, { "epoch": 1.6010733452593917, "grad_norm": 3.359645366668701, "learning_rate": 9.551744885679904e-07, "loss": 1.0951, "step": 1790 }, { "epoch": 1.6100178890876564, "grad_norm": 3.478275775909424, "learning_rate": 9.548736462093862e-07, "loss": 1.0714, "step": 1800 }, { "epoch": 1.6189624329159211, "grad_norm": 3.354357957839966, "learning_rate": 9.545728038507822e-07, "loss": 1.0665, "step": 1810 }, { "epoch": 1.627906976744186, "grad_norm": 3.028160572052002, "learning_rate": 9.54271961492178e-07, "loss": 1.0745, "step": 1820 }, { "epoch": 1.6368515205724508, "grad_norm": 3.5806148052215576, "learning_rate": 9.53971119133574e-07, "loss": 1.0503, "step": 1830 }, { "epoch": 1.6457960644007157, "grad_norm": 3.154953718185425, "learning_rate": 9.5367027677497e-07, "loss": 1.0341, "step": 1840 }, { "epoch": 1.6547406082289804, "grad_norm": 3.376316785812378, "learning_rate": 9.533694344163657e-07, "loss": 1.0852, "step": 1850 }, { "epoch": 1.663685152057245, "grad_norm": 3.3376107215881348, "learning_rate": 9.530685920577616e-07, "loss": 1.0677, "step": 1860 }, { "epoch": 1.6726296958855098, "grad_norm": 3.161376714706421, "learning_rate": 9.527677496991576e-07, "loss": 1.0487, "step": 1870 }, { "epoch": 1.6815742397137745, "grad_norm": 3.0562355518341064, "learning_rate": 9.524669073405534e-07, "loss": 1.0354, "step": 1880 }, { "epoch": 1.6905187835420392, "grad_norm": 3.007615804672241, "learning_rate": 9.521660649819494e-07, "loss": 1.0253, "step": 1890 }, { "epoch": 1.6994633273703041, "grad_norm": 3.5059332847595215, "learning_rate": 9.518652226233453e-07, "loss": 1.0258, "step": 1900 }, { "epoch": 1.7084078711985689, "grad_norm": 2.9691965579986572, "learning_rate": 9.515643802647413e-07, "loss": 1.024, "step": 1910 }, { "epoch": 1.7173524150268338, "grad_norm": 3.405560255050659, "learning_rate": 9.512635379061371e-07, "loss": 1.027, "step": 1920 }, { "epoch": 1.7262969588550985, "grad_norm": 3.0473272800445557, "learning_rate": 9.509626955475331e-07, "loss": 1.0313, "step": 1930 }, { "epoch": 1.7352415026833632, "grad_norm": 3.096930980682373, "learning_rate": 9.50661853188929e-07, "loss": 0.9979, "step": 1940 }, { "epoch": 1.744186046511628, "grad_norm": 3.17958402633667, "learning_rate": 9.503610108303249e-07, "loss": 1.0146, "step": 1950 }, { "epoch": 1.7531305903398926, "grad_norm": 2.9525468349456787, "learning_rate": 9.500601684717207e-07, "loss": 1.0005, "step": 1960 }, { "epoch": 1.7620751341681573, "grad_norm": 3.1976871490478516, "learning_rate": 9.497593261131167e-07, "loss": 1.0108, "step": 1970 }, { "epoch": 1.7710196779964222, "grad_norm": 3.2845773696899414, "learning_rate": 9.494584837545125e-07, "loss": 0.9818, "step": 1980 }, { "epoch": 1.779964221824687, "grad_norm": 2.9621360301971436, "learning_rate": 9.491576413959085e-07, "loss": 0.9846, "step": 1990 }, { "epoch": 1.7889087656529516, "grad_norm": 3.1748251914978027, "learning_rate": 9.488567990373044e-07, "loss": 0.993, "step": 2000 }, { "epoch": 1.7978533094812166, "grad_norm": 3.0182857513427734, "learning_rate": 9.485559566787004e-07, "loss": 0.9828, "step": 2010 }, { "epoch": 1.8067978533094813, "grad_norm": 2.983999490737915, "learning_rate": 9.482551143200962e-07, "loss": 0.9596, "step": 2020 }, { "epoch": 1.815742397137746, "grad_norm": 3.285773277282715, "learning_rate": 9.479542719614922e-07, "loss": 0.99, "step": 2030 }, { "epoch": 1.8246869409660107, "grad_norm": 2.963573455810547, "learning_rate": 9.476534296028881e-07, "loss": 0.9562, "step": 2040 }, { "epoch": 1.8336314847942754, "grad_norm": 3.3950607776641846, "learning_rate": 9.473525872442839e-07, "loss": 0.9805, "step": 2050 }, { "epoch": 1.84257602862254, "grad_norm": 2.9940521717071533, "learning_rate": 9.470517448856799e-07, "loss": 0.9535, "step": 2060 }, { "epoch": 1.851520572450805, "grad_norm": 3.064953565597534, "learning_rate": 9.467509025270757e-07, "loss": 0.9832, "step": 2070 }, { "epoch": 1.8604651162790697, "grad_norm": 3.1656112670898438, "learning_rate": 9.464500601684717e-07, "loss": 0.9431, "step": 2080 }, { "epoch": 1.8694096601073347, "grad_norm": 3.262320041656494, "learning_rate": 9.461492178098675e-07, "loss": 0.9641, "step": 2090 }, { "epoch": 1.8783542039355994, "grad_norm": 3.1577858924865723, "learning_rate": 9.458483754512635e-07, "loss": 0.9686, "step": 2100 }, { "epoch": 1.887298747763864, "grad_norm": 3.1922857761383057, "learning_rate": 9.455475330926594e-07, "loss": 0.9449, "step": 2110 }, { "epoch": 1.8962432915921288, "grad_norm": 2.9973959922790527, "learning_rate": 9.452466907340553e-07, "loss": 0.9772, "step": 2120 }, { "epoch": 1.9051878354203935, "grad_norm": 3.2709221839904785, "learning_rate": 9.449458483754512e-07, "loss": 0.9299, "step": 2130 }, { "epoch": 1.9141323792486582, "grad_norm": 2.9278268814086914, "learning_rate": 9.446450060168472e-07, "loss": 0.9355, "step": 2140 }, { "epoch": 1.9230769230769231, "grad_norm": 3.182664632797241, "learning_rate": 9.44344163658243e-07, "loss": 0.9213, "step": 2150 }, { "epoch": 1.9320214669051878, "grad_norm": 2.9108495712280273, "learning_rate": 9.44043321299639e-07, "loss": 0.9293, "step": 2160 }, { "epoch": 1.9409660107334525, "grad_norm": 3.257746934890747, "learning_rate": 9.437424789410349e-07, "loss": 0.9202, "step": 2170 }, { "epoch": 1.9499105545617175, "grad_norm": 2.7252960205078125, "learning_rate": 9.434416365824309e-07, "loss": 0.9369, "step": 2180 }, { "epoch": 1.9588550983899822, "grad_norm": 3.083718776702881, "learning_rate": 9.431407942238266e-07, "loss": 0.9066, "step": 2190 }, { "epoch": 1.9677996422182469, "grad_norm": 2.9476046562194824, "learning_rate": 9.428399518652226e-07, "loss": 0.9189, "step": 2200 }, { "epoch": 1.9767441860465116, "grad_norm": 2.823620080947876, "learning_rate": 9.425391095066185e-07, "loss": 0.9092, "step": 2210 }, { "epoch": 1.9856887298747763, "grad_norm": 3.1929540634155273, "learning_rate": 9.422382671480143e-07, "loss": 0.919, "step": 2220 }, { "epoch": 1.994633273703041, "grad_norm": 3.178370475769043, "learning_rate": 9.419374247894103e-07, "loss": 0.921, "step": 2230 }, { "epoch": 2.0, "eval_bleu": 33.2917, "eval_gen_len": 76.1459, "eval_loss": 0.6356510519981384, "eval_runtime": 55.3064, "eval_samples_per_second": 18.841, "eval_steps_per_second": 0.199, "step": 2236 }, { "epoch": 2.0035778175313057, "grad_norm": 3.014850616455078, "learning_rate": 9.416365824308062e-07, "loss": 0.9086, "step": 2240 }, { "epoch": 2.012522361359571, "grad_norm": 2.8700523376464844, "learning_rate": 9.413357400722022e-07, "loss": 0.8985, "step": 2250 }, { "epoch": 2.0214669051878356, "grad_norm": 3.0652620792388916, "learning_rate": 9.41034897713598e-07, "loss": 0.9062, "step": 2260 }, { "epoch": 2.0304114490161003, "grad_norm": 3.1319339275360107, "learning_rate": 9.40734055354994e-07, "loss": 0.8954, "step": 2270 }, { "epoch": 2.039355992844365, "grad_norm": 3.253756284713745, "learning_rate": 9.404332129963899e-07, "loss": 0.9073, "step": 2280 }, { "epoch": 2.0483005366726297, "grad_norm": 2.780285120010376, "learning_rate": 9.401323706377857e-07, "loss": 0.8782, "step": 2290 }, { "epoch": 2.0572450805008944, "grad_norm": 2.7521615028381348, "learning_rate": 9.398315282791816e-07, "loss": 0.8654, "step": 2300 }, { "epoch": 2.066189624329159, "grad_norm": 3.2367069721221924, "learning_rate": 9.395306859205776e-07, "loss": 0.8866, "step": 2310 }, { "epoch": 2.075134168157424, "grad_norm": 2.7654380798339844, "learning_rate": 9.392298435619734e-07, "loss": 0.8966, "step": 2320 }, { "epoch": 2.084078711985689, "grad_norm": 2.697866916656494, "learning_rate": 9.389290012033694e-07, "loss": 0.871, "step": 2330 }, { "epoch": 2.0930232558139537, "grad_norm": 3.0392096042633057, "learning_rate": 9.386281588447653e-07, "loss": 0.8726, "step": 2340 }, { "epoch": 2.1019677996422184, "grad_norm": 2.8354122638702393, "learning_rate": 9.383273164861613e-07, "loss": 0.8677, "step": 2350 }, { "epoch": 2.110912343470483, "grad_norm": 2.7471110820770264, "learning_rate": 9.380264741275571e-07, "loss": 0.8711, "step": 2360 }, { "epoch": 2.1198568872987478, "grad_norm": 3.0733835697174072, "learning_rate": 9.377256317689531e-07, "loss": 0.8396, "step": 2370 }, { "epoch": 2.1288014311270125, "grad_norm": 2.961397647857666, "learning_rate": 9.37424789410349e-07, "loss": 0.8514, "step": 2380 }, { "epoch": 2.137745974955277, "grad_norm": 2.858783006668091, "learning_rate": 9.371239470517448e-07, "loss": 0.8549, "step": 2390 }, { "epoch": 2.146690518783542, "grad_norm": 2.8222157955169678, "learning_rate": 9.368231046931407e-07, "loss": 0.8596, "step": 2400 }, { "epoch": 2.1556350626118066, "grad_norm": 2.9093058109283447, "learning_rate": 9.365222623345366e-07, "loss": 0.8729, "step": 2410 }, { "epoch": 2.1645796064400717, "grad_norm": 2.8252718448638916, "learning_rate": 9.362214199759325e-07, "loss": 0.8523, "step": 2420 }, { "epoch": 2.1735241502683365, "grad_norm": 2.9063096046447754, "learning_rate": 9.359205776173284e-07, "loss": 0.852, "step": 2430 }, { "epoch": 2.182468694096601, "grad_norm": 2.9497509002685547, "learning_rate": 9.356197352587244e-07, "loss": 0.8409, "step": 2440 }, { "epoch": 2.191413237924866, "grad_norm": 3.069898843765259, "learning_rate": 9.353188929001203e-07, "loss": 0.8372, "step": 2450 }, { "epoch": 2.2003577817531306, "grad_norm": 2.689397096633911, "learning_rate": 9.350180505415162e-07, "loss": 0.8282, "step": 2460 }, { "epoch": 2.2093023255813953, "grad_norm": 2.8414719104766846, "learning_rate": 9.347172081829121e-07, "loss": 0.8352, "step": 2470 }, { "epoch": 2.21824686940966, "grad_norm": 2.834557056427002, "learning_rate": 9.344163658243081e-07, "loss": 0.8425, "step": 2480 }, { "epoch": 2.2271914132379247, "grad_norm": 3.1694188117980957, "learning_rate": 9.341155234657039e-07, "loss": 0.8362, "step": 2490 }, { "epoch": 2.23613595706619, "grad_norm": 2.7628824710845947, "learning_rate": 9.338146811070999e-07, "loss": 0.8291, "step": 2500 }, { "epoch": 2.2450805008944545, "grad_norm": 2.9400546550750732, "learning_rate": 9.335138387484958e-07, "loss": 0.8252, "step": 2510 }, { "epoch": 2.2540250447227193, "grad_norm": 2.8414359092712402, "learning_rate": 9.332129963898917e-07, "loss": 0.8165, "step": 2520 }, { "epoch": 2.262969588550984, "grad_norm": 2.7981302738189697, "learning_rate": 9.329121540312875e-07, "loss": 0.8054, "step": 2530 }, { "epoch": 2.2719141323792487, "grad_norm": 3.369706630706787, "learning_rate": 9.326113116726835e-07, "loss": 0.8079, "step": 2540 }, { "epoch": 2.2808586762075134, "grad_norm": 3.226773738861084, "learning_rate": 9.323104693140794e-07, "loss": 0.8184, "step": 2550 }, { "epoch": 2.289803220035778, "grad_norm": 2.6831185817718506, "learning_rate": 9.320096269554752e-07, "loss": 0.8149, "step": 2560 }, { "epoch": 2.298747763864043, "grad_norm": 2.8104960918426514, "learning_rate": 9.317087845968712e-07, "loss": 0.809, "step": 2570 }, { "epoch": 2.3076923076923075, "grad_norm": 2.8982748985290527, "learning_rate": 9.314079422382671e-07, "loss": 0.828, "step": 2580 }, { "epoch": 2.3166368515205726, "grad_norm": 2.8262553215026855, "learning_rate": 9.31107099879663e-07, "loss": 0.8061, "step": 2590 }, { "epoch": 2.3255813953488373, "grad_norm": 2.9029324054718018, "learning_rate": 9.308062575210589e-07, "loss": 0.7949, "step": 2600 }, { "epoch": 2.334525939177102, "grad_norm": 2.753194570541382, "learning_rate": 9.305054151624549e-07, "loss": 0.7933, "step": 2610 }, { "epoch": 2.3434704830053668, "grad_norm": 2.6942641735076904, "learning_rate": 9.302045728038508e-07, "loss": 0.7904, "step": 2620 }, { "epoch": 2.3524150268336315, "grad_norm": 2.987013816833496, "learning_rate": 9.299037304452466e-07, "loss": 0.8139, "step": 2630 }, { "epoch": 2.361359570661896, "grad_norm": 3.0176124572753906, "learning_rate": 9.296028880866425e-07, "loss": 0.814, "step": 2640 }, { "epoch": 2.370304114490161, "grad_norm": 2.544062614440918, "learning_rate": 9.293020457280385e-07, "loss": 0.7866, "step": 2650 }, { "epoch": 2.3792486583184256, "grad_norm": 3.0904483795166016, "learning_rate": 9.290012033694343e-07, "loss": 0.7841, "step": 2660 }, { "epoch": 2.3881932021466907, "grad_norm": 2.8507888317108154, "learning_rate": 9.287003610108303e-07, "loss": 0.7828, "step": 2670 }, { "epoch": 2.3971377459749554, "grad_norm": 2.9731242656707764, "learning_rate": 9.283995186522262e-07, "loss": 0.7884, "step": 2680 }, { "epoch": 2.40608228980322, "grad_norm": 2.6749331951141357, "learning_rate": 9.280986762936222e-07, "loss": 0.78, "step": 2690 }, { "epoch": 2.415026833631485, "grad_norm": 2.694559335708618, "learning_rate": 9.27797833935018e-07, "loss": 0.7832, "step": 2700 }, { "epoch": 2.4239713774597496, "grad_norm": 2.7893500328063965, "learning_rate": 9.27496991576414e-07, "loss": 0.7872, "step": 2710 }, { "epoch": 2.4329159212880143, "grad_norm": 3.1287553310394287, "learning_rate": 9.271961492178099e-07, "loss": 0.7593, "step": 2720 }, { "epoch": 2.441860465116279, "grad_norm": 3.007920503616333, "learning_rate": 9.268953068592056e-07, "loss": 0.7803, "step": 2730 }, { "epoch": 2.4508050089445437, "grad_norm": 2.706397533416748, "learning_rate": 9.265944645006016e-07, "loss": 0.7833, "step": 2740 }, { "epoch": 2.4597495527728084, "grad_norm": 2.878617763519287, "learning_rate": 9.262936221419975e-07, "loss": 0.756, "step": 2750 }, { "epoch": 2.4686940966010735, "grad_norm": 3.0355637073516846, "learning_rate": 9.259927797833934e-07, "loss": 0.7572, "step": 2760 }, { "epoch": 2.4776386404293382, "grad_norm": 2.7085120677948, "learning_rate": 9.256919374247893e-07, "loss": 0.7513, "step": 2770 }, { "epoch": 2.486583184257603, "grad_norm": 3.1326651573181152, "learning_rate": 9.253910950661853e-07, "loss": 0.7592, "step": 2780 }, { "epoch": 2.4955277280858676, "grad_norm": 3.014554500579834, "learning_rate": 9.250902527075812e-07, "loss": 0.7562, "step": 2790 }, { "epoch": 2.5044722719141324, "grad_norm": 2.5138909816741943, "learning_rate": 9.247894103489771e-07, "loss": 0.7435, "step": 2800 }, { "epoch": 2.513416815742397, "grad_norm": 2.6745359897613525, "learning_rate": 9.24488567990373e-07, "loss": 0.7361, "step": 2810 }, { "epoch": 2.5223613595706618, "grad_norm": 3.270789623260498, "learning_rate": 9.24187725631769e-07, "loss": 0.7563, "step": 2820 }, { "epoch": 2.531305903398927, "grad_norm": 2.8245232105255127, "learning_rate": 9.238868832731648e-07, "loss": 0.7571, "step": 2830 }, { "epoch": 2.5402504472271916, "grad_norm": 2.7508704662323, "learning_rate": 9.235860409145608e-07, "loss": 0.7691, "step": 2840 }, { "epoch": 2.5491949910554563, "grad_norm": 2.965639114379883, "learning_rate": 9.232851985559566e-07, "loss": 0.7438, "step": 2850 }, { "epoch": 2.558139534883721, "grad_norm": 2.903453826904297, "learning_rate": 9.229843561973526e-07, "loss": 0.7612, "step": 2860 }, { "epoch": 2.5670840787119857, "grad_norm": 2.4326062202453613, "learning_rate": 9.226835138387484e-07, "loss": 0.7373, "step": 2870 }, { "epoch": 2.5760286225402504, "grad_norm": 2.5255181789398193, "learning_rate": 9.223826714801444e-07, "loss": 0.703, "step": 2880 }, { "epoch": 2.584973166368515, "grad_norm": 2.712981700897217, "learning_rate": 9.220818291215403e-07, "loss": 0.7386, "step": 2890 }, { "epoch": 2.59391771019678, "grad_norm": 2.6336443424224854, "learning_rate": 9.217809867629361e-07, "loss": 0.7088, "step": 2900 }, { "epoch": 2.6028622540250446, "grad_norm": 2.936994791030884, "learning_rate": 9.214801444043321e-07, "loss": 0.7431, "step": 2910 }, { "epoch": 2.6118067978533093, "grad_norm": 2.911325693130493, "learning_rate": 9.21179302045728e-07, "loss": 0.741, "step": 2920 }, { "epoch": 2.620751341681574, "grad_norm": 2.7983999252319336, "learning_rate": 9.208784596871239e-07, "loss": 0.7363, "step": 2930 }, { "epoch": 2.629695885509839, "grad_norm": 2.58998703956604, "learning_rate": 9.205776173285198e-07, "loss": 0.7193, "step": 2940 }, { "epoch": 2.638640429338104, "grad_norm": 2.603731632232666, "learning_rate": 9.202767749699158e-07, "loss": 0.7366, "step": 2950 }, { "epoch": 2.6475849731663685, "grad_norm": 3.180845022201538, "learning_rate": 9.199759326113116e-07, "loss": 0.7156, "step": 2960 }, { "epoch": 2.6565295169946332, "grad_norm": 2.47441029548645, "learning_rate": 9.196750902527075e-07, "loss": 0.7323, "step": 2970 }, { "epoch": 2.665474060822898, "grad_norm": 2.681979179382324, "learning_rate": 9.193742478941034e-07, "loss": 0.7238, "step": 2980 }, { "epoch": 2.6744186046511627, "grad_norm": 2.7500979900360107, "learning_rate": 9.190734055354994e-07, "loss": 0.7205, "step": 2990 }, { "epoch": 2.683363148479428, "grad_norm": 2.9268977642059326, "learning_rate": 9.187725631768952e-07, "loss": 0.7104, "step": 3000 }, { "epoch": 2.6923076923076925, "grad_norm": 2.865962028503418, "learning_rate": 9.184717208182912e-07, "loss": 0.7189, "step": 3010 }, { "epoch": 2.701252236135957, "grad_norm": 3.1300830841064453, "learning_rate": 9.181708784596871e-07, "loss": 0.7118, "step": 3020 }, { "epoch": 2.710196779964222, "grad_norm": 2.472964286804199, "learning_rate": 9.178700361010831e-07, "loss": 0.7168, "step": 3030 }, { "epoch": 2.7191413237924866, "grad_norm": 2.9153640270233154, "learning_rate": 9.175691937424789e-07, "loss": 0.7052, "step": 3040 }, { "epoch": 2.7280858676207513, "grad_norm": 2.578078031539917, "learning_rate": 9.172683513838749e-07, "loss": 0.7028, "step": 3050 }, { "epoch": 2.737030411449016, "grad_norm": 2.813464879989624, "learning_rate": 9.169675090252708e-07, "loss": 0.682, "step": 3060 }, { "epoch": 2.7459749552772807, "grad_norm": 2.818431854248047, "learning_rate": 9.166666666666665e-07, "loss": 0.6974, "step": 3070 }, { "epoch": 2.7549194991055455, "grad_norm": 2.902752161026001, "learning_rate": 9.163658243080625e-07, "loss": 0.6881, "step": 3080 }, { "epoch": 2.76386404293381, "grad_norm": 3.032303810119629, "learning_rate": 9.160649819494584e-07, "loss": 0.695, "step": 3090 }, { "epoch": 2.772808586762075, "grad_norm": 2.627807140350342, "learning_rate": 9.157641395908543e-07, "loss": 0.6846, "step": 3100 }, { "epoch": 2.78175313059034, "grad_norm": 2.711284875869751, "learning_rate": 9.154632972322502e-07, "loss": 0.6836, "step": 3110 }, { "epoch": 2.7906976744186047, "grad_norm": 3.2660164833068848, "learning_rate": 9.151624548736462e-07, "loss": 0.703, "step": 3120 }, { "epoch": 2.7996422182468694, "grad_norm": 2.5693178176879883, "learning_rate": 9.148616125150421e-07, "loss": 0.6885, "step": 3130 }, { "epoch": 2.808586762075134, "grad_norm": 2.5643057823181152, "learning_rate": 9.14560770156438e-07, "loss": 0.6825, "step": 3140 }, { "epoch": 2.817531305903399, "grad_norm": 2.8102810382843018, "learning_rate": 9.142599277978339e-07, "loss": 0.6685, "step": 3150 }, { "epoch": 2.8264758497316635, "grad_norm": 2.8618929386138916, "learning_rate": 9.139590854392299e-07, "loss": 0.6847, "step": 3160 }, { "epoch": 2.8354203935599287, "grad_norm": 2.610710382461548, "learning_rate": 9.136582430806256e-07, "loss": 0.6774, "step": 3170 }, { "epoch": 2.8443649373881934, "grad_norm": 3.4627251625061035, "learning_rate": 9.133574007220216e-07, "loss": 0.6865, "step": 3180 }, { "epoch": 2.853309481216458, "grad_norm": 2.6292290687561035, "learning_rate": 9.130565583634175e-07, "loss": 0.7044, "step": 3190 }, { "epoch": 2.862254025044723, "grad_norm": 2.454831600189209, "learning_rate": 9.127557160048135e-07, "loss": 0.6791, "step": 3200 }, { "epoch": 2.8711985688729875, "grad_norm": 2.51297926902771, "learning_rate": 9.124548736462093e-07, "loss": 0.6838, "step": 3210 }, { "epoch": 2.8801431127012522, "grad_norm": 2.57023549079895, "learning_rate": 9.121540312876053e-07, "loss": 0.6691, "step": 3220 }, { "epoch": 2.889087656529517, "grad_norm": 2.8879973888397217, "learning_rate": 9.118531889290012e-07, "loss": 0.6612, "step": 3230 }, { "epoch": 2.8980322003577816, "grad_norm": 2.8545825481414795, "learning_rate": 9.11552346570397e-07, "loss": 0.6701, "step": 3240 }, { "epoch": 2.9069767441860463, "grad_norm": 2.746073007583618, "learning_rate": 9.11251504211793e-07, "loss": 0.6867, "step": 3250 }, { "epoch": 2.915921288014311, "grad_norm": 2.964799404144287, "learning_rate": 9.109506618531889e-07, "loss": 0.6676, "step": 3260 }, { "epoch": 2.9248658318425758, "grad_norm": 2.561464548110962, "learning_rate": 9.106498194945848e-07, "loss": 0.6847, "step": 3270 }, { "epoch": 2.933810375670841, "grad_norm": 2.822511672973633, "learning_rate": 9.103489771359806e-07, "loss": 0.6591, "step": 3280 }, { "epoch": 2.9427549194991056, "grad_norm": 2.471494197845459, "learning_rate": 9.100481347773766e-07, "loss": 0.6356, "step": 3290 }, { "epoch": 2.9516994633273703, "grad_norm": 2.75529146194458, "learning_rate": 9.097472924187725e-07, "loss": 0.6628, "step": 3300 }, { "epoch": 2.960644007155635, "grad_norm": 2.634551525115967, "learning_rate": 9.094464500601684e-07, "loss": 0.6445, "step": 3310 }, { "epoch": 2.9695885509838997, "grad_norm": 2.6194956302642822, "learning_rate": 9.091456077015643e-07, "loss": 0.6657, "step": 3320 }, { "epoch": 2.9785330948121644, "grad_norm": 2.409846305847168, "learning_rate": 9.088447653429603e-07, "loss": 0.6449, "step": 3330 }, { "epoch": 2.9874776386404296, "grad_norm": 2.7916181087493896, "learning_rate": 9.085439229843561e-07, "loss": 0.6567, "step": 3340 }, { "epoch": 2.9964221824686943, "grad_norm": 2.529426336288452, "learning_rate": 9.082430806257521e-07, "loss": 0.6472, "step": 3350 }, { "epoch": 3.0, "eval_bleu": 47.3533, "eval_gen_len": 75.9194, "eval_loss": 0.45044583082199097, "eval_runtime": 57.4786, "eval_samples_per_second": 18.128, "eval_steps_per_second": 0.191, "step": 3354 }, { "epoch": 3.005366726296959, "grad_norm": 2.2102513313293457, "learning_rate": 9.07942238267148e-07, "loss": 0.6442, "step": 3360 }, { "epoch": 3.0143112701252237, "grad_norm": 2.70259428024292, "learning_rate": 9.076413959085439e-07, "loss": 0.6565, "step": 3370 }, { "epoch": 3.0232558139534884, "grad_norm": 2.7066004276275635, "learning_rate": 9.073405535499398e-07, "loss": 0.656, "step": 3380 }, { "epoch": 3.032200357781753, "grad_norm": 2.758183717727661, "learning_rate": 9.070397111913358e-07, "loss": 0.645, "step": 3390 }, { "epoch": 3.041144901610018, "grad_norm": 2.4055726528167725, "learning_rate": 9.067388688327316e-07, "loss": 0.6314, "step": 3400 }, { "epoch": 3.0500894454382825, "grad_norm": 2.4877970218658447, "learning_rate": 9.064380264741275e-07, "loss": 0.6397, "step": 3410 }, { "epoch": 3.0590339892665472, "grad_norm": 2.6725428104400635, "learning_rate": 9.061371841155234e-07, "loss": 0.6429, "step": 3420 }, { "epoch": 3.067978533094812, "grad_norm": 2.4992988109588623, "learning_rate": 9.058363417569193e-07, "loss": 0.6472, "step": 3430 }, { "epoch": 3.076923076923077, "grad_norm": 2.4662342071533203, "learning_rate": 9.055354993983152e-07, "loss": 0.6309, "step": 3440 }, { "epoch": 3.085867620751342, "grad_norm": 2.4642457962036133, "learning_rate": 9.052346570397111e-07, "loss": 0.6161, "step": 3450 }, { "epoch": 3.0948121645796065, "grad_norm": 2.5901832580566406, "learning_rate": 9.049338146811071e-07, "loss": 0.6439, "step": 3460 }, { "epoch": 3.103756708407871, "grad_norm": 2.6220297813415527, "learning_rate": 9.04632972322503e-07, "loss": 0.6496, "step": 3470 }, { "epoch": 3.112701252236136, "grad_norm": 2.4917731285095215, "learning_rate": 9.043321299638989e-07, "loss": 0.6178, "step": 3480 }, { "epoch": 3.1216457960644006, "grad_norm": 2.4936294555664062, "learning_rate": 9.040312876052948e-07, "loss": 0.6482, "step": 3490 }, { "epoch": 3.1305903398926653, "grad_norm": 2.4493210315704346, "learning_rate": 9.037304452466908e-07, "loss": 0.6177, "step": 3500 }, { "epoch": 3.13953488372093, "grad_norm": 2.941797971725464, "learning_rate": 9.034296028880865e-07, "loss": 0.6231, "step": 3510 }, { "epoch": 3.148479427549195, "grad_norm": 2.5311951637268066, "learning_rate": 9.031287605294825e-07, "loss": 0.6191, "step": 3520 }, { "epoch": 3.15742397137746, "grad_norm": 2.6150107383728027, "learning_rate": 9.028279181708784e-07, "loss": 0.6243, "step": 3530 }, { "epoch": 3.1663685152057246, "grad_norm": 2.4990551471710205, "learning_rate": 9.025270758122743e-07, "loss": 0.616, "step": 3540 }, { "epoch": 3.1753130590339893, "grad_norm": 2.4558522701263428, "learning_rate": 9.022262334536702e-07, "loss": 0.6286, "step": 3550 }, { "epoch": 3.184257602862254, "grad_norm": 2.2986326217651367, "learning_rate": 9.019253910950662e-07, "loss": 0.6191, "step": 3560 }, { "epoch": 3.1932021466905187, "grad_norm": 2.9514191150665283, "learning_rate": 9.016245487364621e-07, "loss": 0.5976, "step": 3570 }, { "epoch": 3.2021466905187834, "grad_norm": 2.2443857192993164, "learning_rate": 9.01323706377858e-07, "loss": 0.6335, "step": 3580 }, { "epoch": 3.211091234347048, "grad_norm": 2.7957968711853027, "learning_rate": 9.010228640192539e-07, "loss": 0.6137, "step": 3590 }, { "epoch": 3.220035778175313, "grad_norm": 2.374537944793701, "learning_rate": 9.007220216606498e-07, "loss": 0.6211, "step": 3600 }, { "epoch": 3.228980322003578, "grad_norm": 2.501329183578491, "learning_rate": 9.004211793020456e-07, "loss": 0.6182, "step": 3610 }, { "epoch": 3.2379248658318427, "grad_norm": 2.5942885875701904, "learning_rate": 9.001203369434415e-07, "loss": 0.6014, "step": 3620 }, { "epoch": 3.2468694096601074, "grad_norm": 2.34264874458313, "learning_rate": 8.998194945848375e-07, "loss": 0.6056, "step": 3630 }, { "epoch": 3.255813953488372, "grad_norm": 2.3675620555877686, "learning_rate": 8.995186522262334e-07, "loss": 0.6075, "step": 3640 }, { "epoch": 3.264758497316637, "grad_norm": 2.7011513710021973, "learning_rate": 8.992178098676293e-07, "loss": 0.6236, "step": 3650 }, { "epoch": 3.2737030411449015, "grad_norm": 2.562945604324341, "learning_rate": 8.989169675090252e-07, "loss": 0.6193, "step": 3660 }, { "epoch": 3.282647584973166, "grad_norm": 2.497159957885742, "learning_rate": 8.986161251504212e-07, "loss": 0.5963, "step": 3670 }, { "epoch": 3.2915921288014314, "grad_norm": 2.405364513397217, "learning_rate": 8.98315282791817e-07, "loss": 0.6122, "step": 3680 }, { "epoch": 3.300536672629696, "grad_norm": 2.8417489528656006, "learning_rate": 8.98014440433213e-07, "loss": 0.6098, "step": 3690 }, { "epoch": 3.309481216457961, "grad_norm": 2.4375507831573486, "learning_rate": 8.977135980746089e-07, "loss": 0.6107, "step": 3700 }, { "epoch": 3.3184257602862255, "grad_norm": 2.3820364475250244, "learning_rate": 8.974127557160048e-07, "loss": 0.6088, "step": 3710 }, { "epoch": 3.32737030411449, "grad_norm": 2.655949592590332, "learning_rate": 8.971119133574007e-07, "loss": 0.5994, "step": 3720 }, { "epoch": 3.336314847942755, "grad_norm": 2.63189435005188, "learning_rate": 8.968110709987966e-07, "loss": 0.5925, "step": 3730 }, { "epoch": 3.3452593917710196, "grad_norm": 2.4572486877441406, "learning_rate": 8.965102286401925e-07, "loss": 0.5931, "step": 3740 }, { "epoch": 3.3542039355992843, "grad_norm": 2.2869107723236084, "learning_rate": 8.962093862815884e-07, "loss": 0.5864, "step": 3750 }, { "epoch": 3.363148479427549, "grad_norm": 2.425100326538086, "learning_rate": 8.959085439229843e-07, "loss": 0.5963, "step": 3760 }, { "epoch": 3.3720930232558137, "grad_norm": 2.5817906856536865, "learning_rate": 8.956077015643802e-07, "loss": 0.6021, "step": 3770 }, { "epoch": 3.381037567084079, "grad_norm": 2.679655075073242, "learning_rate": 8.953068592057761e-07, "loss": 0.5926, "step": 3780 }, { "epoch": 3.3899821109123436, "grad_norm": 2.54133677482605, "learning_rate": 8.95006016847172e-07, "loss": 0.5984, "step": 3790 }, { "epoch": 3.3989266547406083, "grad_norm": 2.5012824535369873, "learning_rate": 8.94705174488568e-07, "loss": 0.5903, "step": 3800 }, { "epoch": 3.407871198568873, "grad_norm": 2.6578915119171143, "learning_rate": 8.944043321299639e-07, "loss": 0.6046, "step": 3810 }, { "epoch": 3.4168157423971377, "grad_norm": 2.964258909225464, "learning_rate": 8.941034897713598e-07, "loss": 0.6001, "step": 3820 }, { "epoch": 3.4257602862254024, "grad_norm": 2.4920527935028076, "learning_rate": 8.938026474127557e-07, "loss": 0.5796, "step": 3830 }, { "epoch": 3.434704830053667, "grad_norm": 2.3578991889953613, "learning_rate": 8.935018050541516e-07, "loss": 0.5828, "step": 3840 }, { "epoch": 3.4436493738819323, "grad_norm": 2.511328935623169, "learning_rate": 8.932009626955474e-07, "loss": 0.5674, "step": 3850 }, { "epoch": 3.452593917710197, "grad_norm": 2.626751184463501, "learning_rate": 8.929001203369434e-07, "loss": 0.5862, "step": 3860 }, { "epoch": 3.4615384615384617, "grad_norm": 2.8260960578918457, "learning_rate": 8.925992779783393e-07, "loss": 0.5911, "step": 3870 }, { "epoch": 3.4704830053667264, "grad_norm": 2.422102928161621, "learning_rate": 8.922984356197352e-07, "loss": 0.5726, "step": 3880 }, { "epoch": 3.479427549194991, "grad_norm": 2.27225923538208, "learning_rate": 8.919975932611311e-07, "loss": 0.5553, "step": 3890 }, { "epoch": 3.488372093023256, "grad_norm": 2.5205020904541016, "learning_rate": 8.916967509025271e-07, "loss": 0.5918, "step": 3900 }, { "epoch": 3.4973166368515205, "grad_norm": 2.428128242492676, "learning_rate": 8.91395908543923e-07, "loss": 0.5759, "step": 3910 }, { "epoch": 3.506261180679785, "grad_norm": 2.4170215129852295, "learning_rate": 8.910950661853189e-07, "loss": 0.5597, "step": 3920 }, { "epoch": 3.51520572450805, "grad_norm": 2.1635830402374268, "learning_rate": 8.907942238267148e-07, "loss": 0.5546, "step": 3930 }, { "epoch": 3.5241502683363146, "grad_norm": 3.3241753578186035, "learning_rate": 8.904933814681108e-07, "loss": 0.5603, "step": 3940 }, { "epoch": 3.5330948121645798, "grad_norm": 2.1622519493103027, "learning_rate": 8.901925391095065e-07, "loss": 0.5674, "step": 3950 }, { "epoch": 3.5420393559928445, "grad_norm": 2.3452277183532715, "learning_rate": 8.898916967509024e-07, "loss": 0.5768, "step": 3960 }, { "epoch": 3.550983899821109, "grad_norm": 2.950314998626709, "learning_rate": 8.895908543922984e-07, "loss": 0.571, "step": 3970 }, { "epoch": 3.559928443649374, "grad_norm": 2.2387633323669434, "learning_rate": 8.892900120336943e-07, "loss": 0.5706, "step": 3980 }, { "epoch": 3.5688729874776386, "grad_norm": 2.417853832244873, "learning_rate": 8.889891696750902e-07, "loss": 0.56, "step": 3990 }, { "epoch": 3.5778175313059033, "grad_norm": 2.549022674560547, "learning_rate": 8.886883273164861e-07, "loss": 0.5686, "step": 4000 }, { "epoch": 3.586762075134168, "grad_norm": 2.2357287406921387, "learning_rate": 8.883874849578821e-07, "loss": 0.5694, "step": 4010 }, { "epoch": 3.595706618962433, "grad_norm": 2.5129199028015137, "learning_rate": 8.880866425992779e-07, "loss": 0.565, "step": 4020 }, { "epoch": 3.604651162790698, "grad_norm": 2.858186721801758, "learning_rate": 8.877858002406739e-07, "loss": 0.5604, "step": 4030 }, { "epoch": 3.6135957066189626, "grad_norm": 2.274585247039795, "learning_rate": 8.874849578820698e-07, "loss": 0.5549, "step": 4040 }, { "epoch": 3.6225402504472273, "grad_norm": 2.324376106262207, "learning_rate": 8.871841155234657e-07, "loss": 0.5607, "step": 4050 }, { "epoch": 3.631484794275492, "grad_norm": 2.541766881942749, "learning_rate": 8.868832731648615e-07, "loss": 0.5387, "step": 4060 }, { "epoch": 3.6404293381037567, "grad_norm": 2.4490244388580322, "learning_rate": 8.865824308062575e-07, "loss": 0.5613, "step": 4070 }, { "epoch": 3.6493738819320214, "grad_norm": 2.665994167327881, "learning_rate": 8.862815884476534e-07, "loss": 0.5636, "step": 4080 }, { "epoch": 3.658318425760286, "grad_norm": 2.3954038619995117, "learning_rate": 8.859807460890493e-07, "loss": 0.5684, "step": 4090 }, { "epoch": 3.667262969588551, "grad_norm": 2.3968029022216797, "learning_rate": 8.856799037304452e-07, "loss": 0.5503, "step": 4100 }, { "epoch": 3.6762075134168155, "grad_norm": 2.2309978008270264, "learning_rate": 8.853790613718412e-07, "loss": 0.5553, "step": 4110 }, { "epoch": 3.6851520572450807, "grad_norm": 2.3942465782165527, "learning_rate": 8.85078219013237e-07, "loss": 0.5405, "step": 4120 }, { "epoch": 3.6940966010733454, "grad_norm": 2.1299009323120117, "learning_rate": 8.847773766546329e-07, "loss": 0.5501, "step": 4130 }, { "epoch": 3.70304114490161, "grad_norm": 2.2872629165649414, "learning_rate": 8.844765342960289e-07, "loss": 0.5402, "step": 4140 }, { "epoch": 3.7119856887298748, "grad_norm": 2.65865159034729, "learning_rate": 8.841756919374247e-07, "loss": 0.5467, "step": 4150 }, { "epoch": 3.7209302325581395, "grad_norm": 3.056772232055664, "learning_rate": 8.838748495788207e-07, "loss": 0.5516, "step": 4160 }, { "epoch": 3.729874776386404, "grad_norm": 2.4120004177093506, "learning_rate": 8.835740072202165e-07, "loss": 0.5606, "step": 4170 }, { "epoch": 3.738819320214669, "grad_norm": 2.7942564487457275, "learning_rate": 8.832731648616125e-07, "loss": 0.547, "step": 4180 }, { "epoch": 3.747763864042934, "grad_norm": 2.5700485706329346, "learning_rate": 8.829723225030083e-07, "loss": 0.5318, "step": 4190 }, { "epoch": 3.7567084078711988, "grad_norm": 2.232795476913452, "learning_rate": 8.826714801444043e-07, "loss": 0.5529, "step": 4200 }, { "epoch": 3.7656529516994635, "grad_norm": 2.5302608013153076, "learning_rate": 8.823706377858002e-07, "loss": 0.5422, "step": 4210 }, { "epoch": 3.774597495527728, "grad_norm": 2.1510462760925293, "learning_rate": 8.820697954271961e-07, "loss": 0.5389, "step": 4220 }, { "epoch": 3.783542039355993, "grad_norm": 2.637227773666382, "learning_rate": 8.81768953068592e-07, "loss": 0.5516, "step": 4230 }, { "epoch": 3.7924865831842576, "grad_norm": 2.3238768577575684, "learning_rate": 8.81468110709988e-07, "loss": 0.5304, "step": 4240 }, { "epoch": 3.8014311270125223, "grad_norm": 2.2426376342773438, "learning_rate": 8.811672683513839e-07, "loss": 0.5337, "step": 4250 }, { "epoch": 3.810375670840787, "grad_norm": 2.6865127086639404, "learning_rate": 8.808664259927798e-07, "loss": 0.5536, "step": 4260 }, { "epoch": 3.8193202146690517, "grad_norm": 2.312762975692749, "learning_rate": 8.805655836341757e-07, "loss": 0.5353, "step": 4270 }, { "epoch": 3.8282647584973164, "grad_norm": 2.282130241394043, "learning_rate": 8.802647412755716e-07, "loss": 0.5492, "step": 4280 }, { "epoch": 3.8372093023255816, "grad_norm": 2.3553049564361572, "learning_rate": 8.799638989169674e-07, "loss": 0.5362, "step": 4290 }, { "epoch": 3.8461538461538463, "grad_norm": 2.6251494884490967, "learning_rate": 8.796630565583633e-07, "loss": 0.5387, "step": 4300 }, { "epoch": 3.855098389982111, "grad_norm": 2.353665590286255, "learning_rate": 8.793622141997593e-07, "loss": 0.5447, "step": 4310 }, { "epoch": 3.8640429338103757, "grad_norm": 3.079887628555298, "learning_rate": 8.790613718411551e-07, "loss": 0.5483, "step": 4320 }, { "epoch": 3.8729874776386404, "grad_norm": 2.3928916454315186, "learning_rate": 8.787605294825511e-07, "loss": 0.5224, "step": 4330 }, { "epoch": 3.881932021466905, "grad_norm": 2.356884717941284, "learning_rate": 8.78459687123947e-07, "loss": 0.5263, "step": 4340 }, { "epoch": 3.89087656529517, "grad_norm": 2.5218331813812256, "learning_rate": 8.78158844765343e-07, "loss": 0.527, "step": 4350 }, { "epoch": 3.899821109123435, "grad_norm": 2.455214023590088, "learning_rate": 8.778580024067388e-07, "loss": 0.5356, "step": 4360 }, { "epoch": 3.9087656529516996, "grad_norm": 2.339240312576294, "learning_rate": 8.775571600481348e-07, "loss": 0.5239, "step": 4370 }, { "epoch": 3.9177101967799643, "grad_norm": 2.3760018348693848, "learning_rate": 8.772563176895307e-07, "loss": 0.5143, "step": 4380 }, { "epoch": 3.926654740608229, "grad_norm": 2.7996761798858643, "learning_rate": 8.769554753309265e-07, "loss": 0.5442, "step": 4390 }, { "epoch": 3.9355992844364938, "grad_norm": 2.385220527648926, "learning_rate": 8.766546329723224e-07, "loss": 0.5248, "step": 4400 }, { "epoch": 3.9445438282647585, "grad_norm": 2.228893518447876, "learning_rate": 8.763537906137184e-07, "loss": 0.5097, "step": 4410 }, { "epoch": 3.953488372093023, "grad_norm": 2.426032066345215, "learning_rate": 8.760529482551143e-07, "loss": 0.5188, "step": 4420 }, { "epoch": 3.962432915921288, "grad_norm": 2.2290916442871094, "learning_rate": 8.757521058965102e-07, "loss": 0.5198, "step": 4430 }, { "epoch": 3.9713774597495526, "grad_norm": 2.2398645877838135, "learning_rate": 8.754512635379061e-07, "loss": 0.5115, "step": 4440 }, { "epoch": 3.9803220035778173, "grad_norm": 2.507640838623047, "learning_rate": 8.751504211793021e-07, "loss": 0.527, "step": 4450 }, { "epoch": 3.9892665474060824, "grad_norm": 2.547366142272949, "learning_rate": 8.748495788206979e-07, "loss": 0.5188, "step": 4460 }, { "epoch": 3.998211091234347, "grad_norm": 2.1927096843719482, "learning_rate": 8.745487364620938e-07, "loss": 0.5246, "step": 4470 }, { "epoch": 4.0, "eval_bleu": 55.2169, "eval_gen_len": 75.6871, "eval_loss": 0.3579396903514862, "eval_runtime": 56.6446, "eval_samples_per_second": 18.395, "eval_steps_per_second": 0.194, "step": 4472 }, { "epoch": 4.007155635062611, "grad_norm": 2.197103977203369, "learning_rate": 8.742478941034898e-07, "loss": 0.5227, "step": 4480 }, { "epoch": 4.016100178890877, "grad_norm": 2.287051200866699, "learning_rate": 8.739470517448855e-07, "loss": 0.5079, "step": 4490 }, { "epoch": 4.025044722719142, "grad_norm": 2.386626958847046, "learning_rate": 8.736462093862815e-07, "loss": 0.5162, "step": 4500 }, { "epoch": 4.033989266547406, "grad_norm": 2.674652099609375, "learning_rate": 8.733453670276774e-07, "loss": 0.5346, "step": 4510 }, { "epoch": 4.042933810375671, "grad_norm": 2.3477110862731934, "learning_rate": 8.730445246690734e-07, "loss": 0.5254, "step": 4520 }, { "epoch": 4.051878354203936, "grad_norm": 2.2642228603363037, "learning_rate": 8.727436823104692e-07, "loss": 0.5201, "step": 4530 }, { "epoch": 4.0608228980322005, "grad_norm": 2.0289313793182373, "learning_rate": 8.724428399518652e-07, "loss": 0.5189, "step": 4540 }, { "epoch": 4.069767441860465, "grad_norm": 2.5789413452148438, "learning_rate": 8.721419975932611e-07, "loss": 0.5165, "step": 4550 }, { "epoch": 4.07871198568873, "grad_norm": 2.535637617111206, "learning_rate": 8.71841155234657e-07, "loss": 0.4972, "step": 4560 }, { "epoch": 4.087656529516995, "grad_norm": 2.328303098678589, "learning_rate": 8.715403128760529e-07, "loss": 0.501, "step": 4570 }, { "epoch": 4.096601073345259, "grad_norm": 2.030282735824585, "learning_rate": 8.712394705174489e-07, "loss": 0.491, "step": 4580 }, { "epoch": 4.105545617173524, "grad_norm": 2.2180206775665283, "learning_rate": 8.709386281588448e-07, "loss": 0.5042, "step": 4590 }, { "epoch": 4.114490161001789, "grad_norm": 2.104442596435547, "learning_rate": 8.706377858002407e-07, "loss": 0.5095, "step": 4600 }, { "epoch": 4.1234347048300535, "grad_norm": 2.341421127319336, "learning_rate": 8.703369434416365e-07, "loss": 0.5096, "step": 4610 }, { "epoch": 4.132379248658318, "grad_norm": 2.276421070098877, "learning_rate": 8.700361010830325e-07, "loss": 0.51, "step": 4620 }, { "epoch": 4.141323792486583, "grad_norm": 2.3223495483398438, "learning_rate": 8.697352587244283e-07, "loss": 0.5036, "step": 4630 }, { "epoch": 4.150268336314848, "grad_norm": 2.262240171432495, "learning_rate": 8.694344163658242e-07, "loss": 0.4992, "step": 4640 }, { "epoch": 4.159212880143112, "grad_norm": 2.533426284790039, "learning_rate": 8.691335740072202e-07, "loss": 0.5042, "step": 4650 }, { "epoch": 4.168157423971378, "grad_norm": 2.2887630462646484, "learning_rate": 8.68832731648616e-07, "loss": 0.5022, "step": 4660 }, { "epoch": 4.177101967799643, "grad_norm": 1.967930793762207, "learning_rate": 8.68531889290012e-07, "loss": 0.4739, "step": 4670 }, { "epoch": 4.186046511627907, "grad_norm": 2.440462589263916, "learning_rate": 8.682310469314079e-07, "loss": 0.4958, "step": 4680 }, { "epoch": 4.194991055456172, "grad_norm": 2.404984712600708, "learning_rate": 8.679302045728039e-07, "loss": 0.494, "step": 4690 }, { "epoch": 4.203935599284437, "grad_norm": 2.4689669609069824, "learning_rate": 8.676293622141997e-07, "loss": 0.4923, "step": 4700 }, { "epoch": 4.212880143112701, "grad_norm": 2.2958297729492188, "learning_rate": 8.673285198555957e-07, "loss": 0.5075, "step": 4710 }, { "epoch": 4.221824686940966, "grad_norm": 2.1192386150360107, "learning_rate": 8.670276774969915e-07, "loss": 0.4984, "step": 4720 }, { "epoch": 4.230769230769231, "grad_norm": 2.3063201904296875, "learning_rate": 8.667268351383874e-07, "loss": 0.5009, "step": 4730 }, { "epoch": 4.2397137745974955, "grad_norm": 2.4694032669067383, "learning_rate": 8.664259927797833e-07, "loss": 0.5009, "step": 4740 }, { "epoch": 4.24865831842576, "grad_norm": 2.2757201194763184, "learning_rate": 8.661251504211793e-07, "loss": 0.5108, "step": 4750 }, { "epoch": 4.257602862254025, "grad_norm": 2.134617567062378, "learning_rate": 8.658243080625752e-07, "loss": 0.496, "step": 4760 }, { "epoch": 4.26654740608229, "grad_norm": 2.357692003250122, "learning_rate": 8.655234657039711e-07, "loss": 0.4936, "step": 4770 }, { "epoch": 4.275491949910554, "grad_norm": 2.2824032306671143, "learning_rate": 8.65222623345367e-07, "loss": 0.4818, "step": 4780 }, { "epoch": 4.284436493738819, "grad_norm": 2.100003719329834, "learning_rate": 8.64921780986763e-07, "loss": 0.4943, "step": 4790 }, { "epoch": 4.293381037567084, "grad_norm": 2.776961326599121, "learning_rate": 8.646209386281588e-07, "loss": 0.4815, "step": 4800 }, { "epoch": 4.3023255813953485, "grad_norm": 2.3573222160339355, "learning_rate": 8.643200962695547e-07, "loss": 0.5031, "step": 4810 }, { "epoch": 4.311270125223613, "grad_norm": 2.1648378372192383, "learning_rate": 8.640192539109507e-07, "loss": 0.4951, "step": 4820 }, { "epoch": 4.320214669051879, "grad_norm": 2.6196367740631104, "learning_rate": 8.637184115523464e-07, "loss": 0.4832, "step": 4830 }, { "epoch": 4.3291592128801435, "grad_norm": 2.0457088947296143, "learning_rate": 8.634175691937424e-07, "loss": 0.4645, "step": 4840 }, { "epoch": 4.338103756708408, "grad_norm": 2.3532934188842773, "learning_rate": 8.631167268351383e-07, "loss": 0.4832, "step": 4850 }, { "epoch": 4.347048300536673, "grad_norm": 2.3529703617095947, "learning_rate": 8.628158844765343e-07, "loss": 0.4777, "step": 4860 }, { "epoch": 4.355992844364938, "grad_norm": 2.3178508281707764, "learning_rate": 8.625150421179301e-07, "loss": 0.4952, "step": 4870 }, { "epoch": 4.364937388193202, "grad_norm": 2.1043753623962402, "learning_rate": 8.622141997593261e-07, "loss": 0.4872, "step": 4880 }, { "epoch": 4.373881932021467, "grad_norm": 2.3565258979797363, "learning_rate": 8.61913357400722e-07, "loss": 0.4641, "step": 4890 }, { "epoch": 4.382826475849732, "grad_norm": 2.2326138019561768, "learning_rate": 8.616125150421179e-07, "loss": 0.4719, "step": 4900 }, { "epoch": 4.391771019677996, "grad_norm": 2.4230077266693115, "learning_rate": 8.613116726835138e-07, "loss": 0.4814, "step": 4910 }, { "epoch": 4.400715563506261, "grad_norm": 2.375678300857544, "learning_rate": 8.610108303249098e-07, "loss": 0.4746, "step": 4920 }, { "epoch": 4.409660107334526, "grad_norm": 2.1337859630584717, "learning_rate": 8.607099879663056e-07, "loss": 0.4658, "step": 4930 }, { "epoch": 4.4186046511627906, "grad_norm": 2.2945430278778076, "learning_rate": 8.604091456077015e-07, "loss": 0.4875, "step": 4940 }, { "epoch": 4.427549194991055, "grad_norm": 2.20735502243042, "learning_rate": 8.601083032490974e-07, "loss": 0.4671, "step": 4950 }, { "epoch": 4.43649373881932, "grad_norm": 2.89607834815979, "learning_rate": 8.598074608904934e-07, "loss": 0.4757, "step": 4960 }, { "epoch": 4.445438282647585, "grad_norm": 2.1082570552825928, "learning_rate": 8.595066185318892e-07, "loss": 0.4722, "step": 4970 }, { "epoch": 4.454382826475849, "grad_norm": 2.1505637168884277, "learning_rate": 8.592057761732851e-07, "loss": 0.4911, "step": 4980 }, { "epoch": 4.463327370304114, "grad_norm": 2.17527174949646, "learning_rate": 8.589049338146811e-07, "loss": 0.4774, "step": 4990 }, { "epoch": 4.47227191413238, "grad_norm": 2.530137777328491, "learning_rate": 8.586040914560769e-07, "loss": 0.478, "step": 5000 }, { "epoch": 4.481216457960644, "grad_norm": 2.1128530502319336, "learning_rate": 8.583032490974729e-07, "loss": 0.4885, "step": 5010 }, { "epoch": 4.490161001788909, "grad_norm": 2.4407949447631836, "learning_rate": 8.580024067388688e-07, "loss": 0.4733, "step": 5020 }, { "epoch": 4.499105545617174, "grad_norm": 2.1629414558410645, "learning_rate": 8.577015643802648e-07, "loss": 0.4683, "step": 5030 }, { "epoch": 4.5080500894454385, "grad_norm": 2.161496639251709, "learning_rate": 8.574007220216606e-07, "loss": 0.4607, "step": 5040 }, { "epoch": 4.516994633273703, "grad_norm": 2.093541383743286, "learning_rate": 8.570998796630565e-07, "loss": 0.4683, "step": 5050 }, { "epoch": 4.525939177101968, "grad_norm": 2.1078083515167236, "learning_rate": 8.567990373044524e-07, "loss": 0.4698, "step": 5060 }, { "epoch": 4.534883720930233, "grad_norm": 2.2364354133605957, "learning_rate": 8.564981949458483e-07, "loss": 0.4843, "step": 5070 }, { "epoch": 4.543828264758497, "grad_norm": 2.204305648803711, "learning_rate": 8.561973525872442e-07, "loss": 0.4785, "step": 5080 }, { "epoch": 4.552772808586762, "grad_norm": 2.314359426498413, "learning_rate": 8.558965102286402e-07, "loss": 0.4814, "step": 5090 }, { "epoch": 4.561717352415027, "grad_norm": 2.3714308738708496, "learning_rate": 8.55595667870036e-07, "loss": 0.4677, "step": 5100 }, { "epoch": 4.5706618962432914, "grad_norm": 2.1810977458953857, "learning_rate": 8.55294825511432e-07, "loss": 0.4567, "step": 5110 }, { "epoch": 4.579606440071556, "grad_norm": 2.3970401287078857, "learning_rate": 8.549939831528279e-07, "loss": 0.4854, "step": 5120 }, { "epoch": 4.588550983899821, "grad_norm": 2.307947874069214, "learning_rate": 8.546931407942239e-07, "loss": 0.4575, "step": 5130 }, { "epoch": 4.597495527728086, "grad_norm": 2.3009941577911377, "learning_rate": 8.543922984356197e-07, "loss": 0.4662, "step": 5140 }, { "epoch": 4.60644007155635, "grad_norm": 2.2120587825775146, "learning_rate": 8.540914560770156e-07, "loss": 0.4637, "step": 5150 }, { "epoch": 4.615384615384615, "grad_norm": 1.8983241319656372, "learning_rate": 8.537906137184115e-07, "loss": 0.4662, "step": 5160 }, { "epoch": 4.624329159212881, "grad_norm": 2.24377703666687, "learning_rate": 8.534897713598073e-07, "loss": 0.4628, "step": 5170 }, { "epoch": 4.633273703041145, "grad_norm": 2.33957839012146, "learning_rate": 8.531889290012033e-07, "loss": 0.4778, "step": 5180 }, { "epoch": 4.64221824686941, "grad_norm": 2.2532317638397217, "learning_rate": 8.528880866425992e-07, "loss": 0.4579, "step": 5190 }, { "epoch": 4.651162790697675, "grad_norm": 2.1577749252319336, "learning_rate": 8.525872442839952e-07, "loss": 0.4625, "step": 5200 }, { "epoch": 4.660107334525939, "grad_norm": 2.20407772064209, "learning_rate": 8.52286401925391e-07, "loss": 0.4669, "step": 5210 }, { "epoch": 4.669051878354204, "grad_norm": 2.2237601280212402, "learning_rate": 8.51985559566787e-07, "loss": 0.4705, "step": 5220 }, { "epoch": 4.677996422182469, "grad_norm": 2.367025852203369, "learning_rate": 8.516847172081829e-07, "loss": 0.4538, "step": 5230 }, { "epoch": 4.6869409660107335, "grad_norm": 2.0631942749023438, "learning_rate": 8.513838748495788e-07, "loss": 0.4572, "step": 5240 }, { "epoch": 4.695885509838998, "grad_norm": 2.1658992767333984, "learning_rate": 8.510830324909747e-07, "loss": 0.4475, "step": 5250 }, { "epoch": 4.704830053667263, "grad_norm": 4.232874870300293, "learning_rate": 8.507821901323707e-07, "loss": 0.4613, "step": 5260 }, { "epoch": 4.713774597495528, "grad_norm": 2.224188804626465, "learning_rate": 8.504813477737664e-07, "loss": 0.4424, "step": 5270 }, { "epoch": 4.722719141323792, "grad_norm": 2.1103410720825195, "learning_rate": 8.501805054151624e-07, "loss": 0.4552, "step": 5280 }, { "epoch": 4.731663685152057, "grad_norm": 1.8934681415557861, "learning_rate": 8.498796630565583e-07, "loss": 0.4497, "step": 5290 }, { "epoch": 4.740608228980322, "grad_norm": 2.366163492202759, "learning_rate": 8.495788206979543e-07, "loss": 0.4416, "step": 5300 }, { "epoch": 4.7495527728085865, "grad_norm": 2.462538003921509, "learning_rate": 8.492779783393501e-07, "loss": 0.4612, "step": 5310 }, { "epoch": 4.758497316636851, "grad_norm": 2.191615104675293, "learning_rate": 8.489771359807461e-07, "loss": 0.4596, "step": 5320 }, { "epoch": 4.767441860465116, "grad_norm": 2.1773529052734375, "learning_rate": 8.48676293622142e-07, "loss": 0.4657, "step": 5330 }, { "epoch": 4.7763864042933815, "grad_norm": 2.416468858718872, "learning_rate": 8.483754512635378e-07, "loss": 0.4553, "step": 5340 }, { "epoch": 4.785330948121646, "grad_norm": 2.129228115081787, "learning_rate": 8.480746089049338e-07, "loss": 0.4545, "step": 5350 }, { "epoch": 4.794275491949911, "grad_norm": 2.597944498062134, "learning_rate": 8.477737665463297e-07, "loss": 0.4424, "step": 5360 }, { "epoch": 4.803220035778176, "grad_norm": 2.125438928604126, "learning_rate": 8.474729241877257e-07, "loss": 0.4565, "step": 5370 }, { "epoch": 4.81216457960644, "grad_norm": 2.2695999145507812, "learning_rate": 8.471720818291214e-07, "loss": 0.4642, "step": 5380 }, { "epoch": 4.821109123434705, "grad_norm": 2.325247049331665, "learning_rate": 8.468712394705174e-07, "loss": 0.4528, "step": 5390 }, { "epoch": 4.83005366726297, "grad_norm": 2.11484432220459, "learning_rate": 8.465703971119133e-07, "loss": 0.4494, "step": 5400 }, { "epoch": 4.838998211091234, "grad_norm": 2.2997634410858154, "learning_rate": 8.462695547533092e-07, "loss": 0.4426, "step": 5410 }, { "epoch": 4.847942754919499, "grad_norm": 2.39806866645813, "learning_rate": 8.459687123947051e-07, "loss": 0.4298, "step": 5420 }, { "epoch": 4.856887298747764, "grad_norm": 2.1731514930725098, "learning_rate": 8.456678700361011e-07, "loss": 0.4535, "step": 5430 }, { "epoch": 4.8658318425760285, "grad_norm": 1.9812053442001343, "learning_rate": 8.453670276774969e-07, "loss": 0.4431, "step": 5440 }, { "epoch": 4.874776386404293, "grad_norm": 2.2461302280426025, "learning_rate": 8.450661853188929e-07, "loss": 0.4432, "step": 5450 }, { "epoch": 4.883720930232558, "grad_norm": 2.3431684970855713, "learning_rate": 8.447653429602888e-07, "loss": 0.4323, "step": 5460 }, { "epoch": 4.892665474060823, "grad_norm": 2.1888179779052734, "learning_rate": 8.444645006016848e-07, "loss": 0.454, "step": 5470 }, { "epoch": 4.901610017889087, "grad_norm": 2.4536116123199463, "learning_rate": 8.441636582430806e-07, "loss": 0.4468, "step": 5480 }, { "epoch": 4.910554561717352, "grad_norm": 2.444634199142456, "learning_rate": 8.438628158844765e-07, "loss": 0.4511, "step": 5490 }, { "epoch": 4.919499105545617, "grad_norm": 2.3427412509918213, "learning_rate": 8.435619735258724e-07, "loss": 0.427, "step": 5500 }, { "epoch": 4.928443649373882, "grad_norm": 2.6630396842956543, "learning_rate": 8.432611311672682e-07, "loss": 0.4286, "step": 5510 }, { "epoch": 4.937388193202147, "grad_norm": 2.0534238815307617, "learning_rate": 8.429602888086642e-07, "loss": 0.4479, "step": 5520 }, { "epoch": 4.946332737030412, "grad_norm": 2.080934762954712, "learning_rate": 8.426594464500601e-07, "loss": 0.4274, "step": 5530 }, { "epoch": 4.9552772808586765, "grad_norm": 2.3056702613830566, "learning_rate": 8.423586040914561e-07, "loss": 0.4229, "step": 5540 }, { "epoch": 4.964221824686941, "grad_norm": 2.275747060775757, "learning_rate": 8.420577617328519e-07, "loss": 0.4371, "step": 5550 }, { "epoch": 4.973166368515206, "grad_norm": 2.3384668827056885, "learning_rate": 8.417569193742479e-07, "loss": 0.4237, "step": 5560 }, { "epoch": 4.982110912343471, "grad_norm": 2.177608013153076, "learning_rate": 8.414560770156438e-07, "loss": 0.4334, "step": 5570 }, { "epoch": 4.991055456171735, "grad_norm": 1.9598808288574219, "learning_rate": 8.411552346570397e-07, "loss": 0.4296, "step": 5580 }, { "epoch": 5.0, "grad_norm": 3.7194361686706543, "learning_rate": 8.408543922984356e-07, "loss": 0.4228, "step": 5590 }, { "epoch": 5.0, "eval_bleu": 60.8262, "eval_gen_len": 75.5777, "eval_loss": 0.30407437682151794, "eval_runtime": 57.0241, "eval_samples_per_second": 18.273, "eval_steps_per_second": 0.193, "step": 5590 }, { "epoch": 5.008944543828265, "grad_norm": 2.257715940475464, "learning_rate": 8.405535499398315e-07, "loss": 0.4484, "step": 5600 }, { "epoch": 5.017889087656529, "grad_norm": 2.1407155990600586, "learning_rate": 8.402527075812273e-07, "loss": 0.4399, "step": 5610 }, { "epoch": 5.026833631484794, "grad_norm": 1.955741047859192, "learning_rate": 8.399518652226233e-07, "loss": 0.4216, "step": 5620 }, { "epoch": 5.035778175313059, "grad_norm": 2.0968446731567383, "learning_rate": 8.396510228640192e-07, "loss": 0.441, "step": 5630 }, { "epoch": 5.0447227191413235, "grad_norm": 2.1058216094970703, "learning_rate": 8.393501805054152e-07, "loss": 0.4197, "step": 5640 }, { "epoch": 5.053667262969588, "grad_norm": 2.249011516571045, "learning_rate": 8.39049338146811e-07, "loss": 0.4409, "step": 5650 }, { "epoch": 5.062611806797853, "grad_norm": 2.140320062637329, "learning_rate": 8.38748495788207e-07, "loss": 0.4305, "step": 5660 }, { "epoch": 5.071556350626118, "grad_norm": 2.0407919883728027, "learning_rate": 8.384476534296029e-07, "loss": 0.4351, "step": 5670 }, { "epoch": 5.080500894454383, "grad_norm": 2.0861878395080566, "learning_rate": 8.381468110709987e-07, "loss": 0.4366, "step": 5680 }, { "epoch": 5.089445438282648, "grad_norm": 2.259369373321533, "learning_rate": 8.378459687123947e-07, "loss": 0.4188, "step": 5690 }, { "epoch": 5.098389982110913, "grad_norm": 2.647836923599243, "learning_rate": 8.375451263537906e-07, "loss": 0.4298, "step": 5700 }, { "epoch": 5.107334525939177, "grad_norm": 2.4857370853424072, "learning_rate": 8.372442839951864e-07, "loss": 0.4314, "step": 5710 }, { "epoch": 5.116279069767442, "grad_norm": 2.1227731704711914, "learning_rate": 8.369434416365823e-07, "loss": 0.4378, "step": 5720 }, { "epoch": 5.125223613595707, "grad_norm": 2.1125993728637695, "learning_rate": 8.366425992779783e-07, "loss": 0.4339, "step": 5730 }, { "epoch": 5.1341681574239715, "grad_norm": 2.170203685760498, "learning_rate": 8.363417569193742e-07, "loss": 0.4329, "step": 5740 }, { "epoch": 5.143112701252236, "grad_norm": 2.1479272842407227, "learning_rate": 8.360409145607701e-07, "loss": 0.4423, "step": 5750 }, { "epoch": 5.152057245080501, "grad_norm": 2.229536771774292, "learning_rate": 8.35740072202166e-07, "loss": 0.4332, "step": 5760 }, { "epoch": 5.161001788908766, "grad_norm": 2.070498466491699, "learning_rate": 8.35439229843562e-07, "loss": 0.435, "step": 5770 }, { "epoch": 5.16994633273703, "grad_norm": 2.2134475708007812, "learning_rate": 8.351383874849578e-07, "loss": 0.4407, "step": 5780 }, { "epoch": 5.178890876565295, "grad_norm": 2.3678598403930664, "learning_rate": 8.348375451263538e-07, "loss": 0.41, "step": 5790 }, { "epoch": 5.18783542039356, "grad_norm": 2.2990410327911377, "learning_rate": 8.345367027677497e-07, "loss": 0.4115, "step": 5800 }, { "epoch": 5.196779964221824, "grad_norm": 2.1436893939971924, "learning_rate": 8.342358604091457e-07, "loss": 0.4172, "step": 5810 }, { "epoch": 5.205724508050089, "grad_norm": 1.9223712682724, "learning_rate": 8.339350180505414e-07, "loss": 0.4096, "step": 5820 }, { "epoch": 5.214669051878354, "grad_norm": 1.97529935836792, "learning_rate": 8.336341756919374e-07, "loss": 0.4282, "step": 5830 }, { "epoch": 5.2236135957066185, "grad_norm": 2.008768320083618, "learning_rate": 8.333333333333333e-07, "loss": 0.4185, "step": 5840 }, { "epoch": 5.232558139534884, "grad_norm": 2.5357260704040527, "learning_rate": 8.330324909747291e-07, "loss": 0.4185, "step": 5850 }, { "epoch": 5.241502683363149, "grad_norm": 1.980273962020874, "learning_rate": 8.327316486161251e-07, "loss": 0.4169, "step": 5860 }, { "epoch": 5.2504472271914135, "grad_norm": 2.093468427658081, "learning_rate": 8.32430806257521e-07, "loss": 0.411, "step": 5870 }, { "epoch": 5.259391771019678, "grad_norm": 2.3774306774139404, "learning_rate": 8.321299638989169e-07, "loss": 0.4457, "step": 5880 }, { "epoch": 5.268336314847943, "grad_norm": 2.2056474685668945, "learning_rate": 8.318291215403128e-07, "loss": 0.4186, "step": 5890 }, { "epoch": 5.277280858676208, "grad_norm": 1.9724749326705933, "learning_rate": 8.315282791817088e-07, "loss": 0.4209, "step": 5900 }, { "epoch": 5.286225402504472, "grad_norm": 2.140307664871216, "learning_rate": 8.312274368231047e-07, "loss": 0.4132, "step": 5910 }, { "epoch": 5.295169946332737, "grad_norm": 1.9825023412704468, "learning_rate": 8.309265944645006e-07, "loss": 0.4241, "step": 5920 }, { "epoch": 5.304114490161002, "grad_norm": 2.3204171657562256, "learning_rate": 8.306257521058964e-07, "loss": 0.4332, "step": 5930 }, { "epoch": 5.3130590339892665, "grad_norm": 1.8255951404571533, "learning_rate": 8.303249097472924e-07, "loss": 0.4166, "step": 5940 }, { "epoch": 5.322003577817531, "grad_norm": 2.5971059799194336, "learning_rate": 8.300240673886882e-07, "loss": 0.4203, "step": 5950 }, { "epoch": 5.330948121645796, "grad_norm": 2.185863494873047, "learning_rate": 8.297232250300842e-07, "loss": 0.414, "step": 5960 }, { "epoch": 5.339892665474061, "grad_norm": 2.592667818069458, "learning_rate": 8.294223826714801e-07, "loss": 0.4288, "step": 5970 }, { "epoch": 5.348837209302325, "grad_norm": 1.9179635047912598, "learning_rate": 8.291215403128761e-07, "loss": 0.4083, "step": 5980 }, { "epoch": 5.35778175313059, "grad_norm": 2.0136308670043945, "learning_rate": 8.288206979542719e-07, "loss": 0.4153, "step": 5990 }, { "epoch": 5.366726296958855, "grad_norm": 1.8011486530303955, "learning_rate": 8.285198555956679e-07, "loss": 0.4106, "step": 6000 }, { "epoch": 5.375670840787119, "grad_norm": 1.690496802330017, "learning_rate": 8.282190132370638e-07, "loss": 0.4148, "step": 6010 }, { "epoch": 5.384615384615385, "grad_norm": 1.9623056650161743, "learning_rate": 8.279181708784596e-07, "loss": 0.4152, "step": 6020 }, { "epoch": 5.39355992844365, "grad_norm": 2.0957489013671875, "learning_rate": 8.276173285198556e-07, "loss": 0.4076, "step": 6030 }, { "epoch": 5.402504472271914, "grad_norm": 2.2126786708831787, "learning_rate": 8.273164861612514e-07, "loss": 0.4134, "step": 6040 }, { "epoch": 5.411449016100179, "grad_norm": 2.522958517074585, "learning_rate": 8.270156438026473e-07, "loss": 0.4087, "step": 6050 }, { "epoch": 5.420393559928444, "grad_norm": 2.0177793502807617, "learning_rate": 8.267148014440432e-07, "loss": 0.4058, "step": 6060 }, { "epoch": 5.4293381037567086, "grad_norm": 2.025320291519165, "learning_rate": 8.264139590854392e-07, "loss": 0.4136, "step": 6070 }, { "epoch": 5.438282647584973, "grad_norm": 2.357020378112793, "learning_rate": 8.261131167268351e-07, "loss": 0.3919, "step": 6080 }, { "epoch": 5.447227191413238, "grad_norm": 2.289153575897217, "learning_rate": 8.25812274368231e-07, "loss": 0.3974, "step": 6090 }, { "epoch": 5.456171735241503, "grad_norm": 2.366131067276001, "learning_rate": 8.255114320096269e-07, "loss": 0.4129, "step": 6100 }, { "epoch": 5.465116279069767, "grad_norm": 2.0910308361053467, "learning_rate": 8.252105896510229e-07, "loss": 0.4023, "step": 6110 }, { "epoch": 5.474060822898032, "grad_norm": 1.8637781143188477, "learning_rate": 8.249097472924187e-07, "loss": 0.3988, "step": 6120 }, { "epoch": 5.483005366726297, "grad_norm": 2.1386542320251465, "learning_rate": 8.246089049338147e-07, "loss": 0.3972, "step": 6130 }, { "epoch": 5.4919499105545615, "grad_norm": 2.4520576000213623, "learning_rate": 8.243080625752106e-07, "loss": 0.3992, "step": 6140 }, { "epoch": 5.500894454382826, "grad_norm": 2.33306884765625, "learning_rate": 8.240072202166066e-07, "loss": 0.4174, "step": 6150 }, { "epoch": 5.509838998211091, "grad_norm": 2.198904514312744, "learning_rate": 8.237063778580023e-07, "loss": 0.391, "step": 6160 }, { "epoch": 5.518783542039356, "grad_norm": 2.1780428886413574, "learning_rate": 8.234055354993983e-07, "loss": 0.3996, "step": 6170 }, { "epoch": 5.52772808586762, "grad_norm": 2.1225438117980957, "learning_rate": 8.231046931407942e-07, "loss": 0.4143, "step": 6180 }, { "epoch": 5.536672629695886, "grad_norm": 2.3154380321502686, "learning_rate": 8.2280385078219e-07, "loss": 0.406, "step": 6190 }, { "epoch": 5.545617173524151, "grad_norm": 2.2880001068115234, "learning_rate": 8.22503008423586e-07, "loss": 0.4119, "step": 6200 }, { "epoch": 5.554561717352415, "grad_norm": 1.9084720611572266, "learning_rate": 8.222021660649819e-07, "loss": 0.402, "step": 6210 }, { "epoch": 5.56350626118068, "grad_norm": 1.8375004529953003, "learning_rate": 8.219013237063778e-07, "loss": 0.4025, "step": 6220 }, { "epoch": 5.572450805008945, "grad_norm": 1.825980544090271, "learning_rate": 8.216004813477737e-07, "loss": 0.3976, "step": 6230 }, { "epoch": 5.5813953488372094, "grad_norm": 1.917263388633728, "learning_rate": 8.212996389891697e-07, "loss": 0.3974, "step": 6240 }, { "epoch": 5.590339892665474, "grad_norm": 1.8471791744232178, "learning_rate": 8.209987966305656e-07, "loss": 0.3988, "step": 6250 }, { "epoch": 5.599284436493739, "grad_norm": 2.0131492614746094, "learning_rate": 8.206979542719614e-07, "loss": 0.3971, "step": 6260 }, { "epoch": 5.608228980322004, "grad_norm": 1.9742953777313232, "learning_rate": 8.203971119133573e-07, "loss": 0.4083, "step": 6270 }, { "epoch": 5.617173524150268, "grad_norm": 1.6688611507415771, "learning_rate": 8.200962695547533e-07, "loss": 0.3905, "step": 6280 }, { "epoch": 5.626118067978533, "grad_norm": 2.111806631088257, "learning_rate": 8.197954271961491e-07, "loss": 0.3913, "step": 6290 }, { "epoch": 5.635062611806798, "grad_norm": 2.0097107887268066, "learning_rate": 8.194945848375451e-07, "loss": 0.3915, "step": 6300 }, { "epoch": 5.644007155635062, "grad_norm": 2.3371779918670654, "learning_rate": 8.19193742478941e-07, "loss": 0.4012, "step": 6310 }, { "epoch": 5.652951699463327, "grad_norm": 2.0339999198913574, "learning_rate": 8.18892900120337e-07, "loss": 0.4054, "step": 6320 }, { "epoch": 5.661896243291592, "grad_norm": 2.0710511207580566, "learning_rate": 8.185920577617328e-07, "loss": 0.4002, "step": 6330 }, { "epoch": 5.6708407871198565, "grad_norm": 1.9887924194335938, "learning_rate": 8.182912154031288e-07, "loss": 0.3952, "step": 6340 }, { "epoch": 5.679785330948121, "grad_norm": 2.114912986755371, "learning_rate": 8.179903730445247e-07, "loss": 0.3923, "step": 6350 }, { "epoch": 5.688729874776387, "grad_norm": 1.895389437675476, "learning_rate": 8.176895306859205e-07, "loss": 0.4021, "step": 6360 }, { "epoch": 5.6976744186046515, "grad_norm": 2.2383832931518555, "learning_rate": 8.173886883273164e-07, "loss": 0.3911, "step": 6370 }, { "epoch": 5.706618962432916, "grad_norm": 1.9833958148956299, "learning_rate": 8.170878459687123e-07, "loss": 0.3896, "step": 6380 }, { "epoch": 5.715563506261181, "grad_norm": 1.88776695728302, "learning_rate": 8.167870036101082e-07, "loss": 0.3753, "step": 6390 }, { "epoch": 5.724508050089446, "grad_norm": 1.8492844104766846, "learning_rate": 8.164861612515041e-07, "loss": 0.3803, "step": 6400 }, { "epoch": 5.73345259391771, "grad_norm": 2.098515033721924, "learning_rate": 8.161853188929001e-07, "loss": 0.3929, "step": 6410 }, { "epoch": 5.742397137745975, "grad_norm": 2.0026450157165527, "learning_rate": 8.15884476534296e-07, "loss": 0.4026, "step": 6420 }, { "epoch": 5.75134168157424, "grad_norm": 2.0872244834899902, "learning_rate": 8.155836341756919e-07, "loss": 0.3999, "step": 6430 }, { "epoch": 5.7602862254025045, "grad_norm": 2.047790765762329, "learning_rate": 8.152827918170878e-07, "loss": 0.4006, "step": 6440 }, { "epoch": 5.769230769230769, "grad_norm": 2.2762393951416016, "learning_rate": 8.149819494584838e-07, "loss": 0.3964, "step": 6450 }, { "epoch": 5.778175313059034, "grad_norm": 2.1704812049865723, "learning_rate": 8.146811070998796e-07, "loss": 0.387, "step": 6460 }, { "epoch": 5.787119856887299, "grad_norm": 2.2697205543518066, "learning_rate": 8.143802647412756e-07, "loss": 0.3891, "step": 6470 }, { "epoch": 5.796064400715563, "grad_norm": 2.1582305431365967, "learning_rate": 8.140794223826714e-07, "loss": 0.3932, "step": 6480 }, { "epoch": 5.805008944543828, "grad_norm": 2.3086400032043457, "learning_rate": 8.137785800240674e-07, "loss": 0.3955, "step": 6490 }, { "epoch": 5.813953488372093, "grad_norm": 1.9696953296661377, "learning_rate": 8.134777376654632e-07, "loss": 0.3912, "step": 6500 }, { "epoch": 5.822898032200357, "grad_norm": 2.045171022415161, "learning_rate": 8.131768953068592e-07, "loss": 0.3898, "step": 6510 }, { "epoch": 5.831842576028622, "grad_norm": 1.8143988847732544, "learning_rate": 8.128760529482551e-07, "loss": 0.3799, "step": 6520 }, { "epoch": 5.840787119856888, "grad_norm": 2.1355209350585938, "learning_rate": 8.125752105896509e-07, "loss": 0.3877, "step": 6530 }, { "epoch": 5.849731663685152, "grad_norm": 1.9495118856430054, "learning_rate": 8.122743682310469e-07, "loss": 0.3993, "step": 6540 }, { "epoch": 5.858676207513417, "grad_norm": 2.00313401222229, "learning_rate": 8.119735258724428e-07, "loss": 0.3859, "step": 6550 }, { "epoch": 5.867620751341682, "grad_norm": 2.0196080207824707, "learning_rate": 8.116726835138387e-07, "loss": 0.3979, "step": 6560 }, { "epoch": 5.8765652951699465, "grad_norm": 2.0028293132781982, "learning_rate": 8.113718411552346e-07, "loss": 0.3915, "step": 6570 }, { "epoch": 5.885509838998211, "grad_norm": 1.7704436779022217, "learning_rate": 8.110709987966306e-07, "loss": 0.3942, "step": 6580 }, { "epoch": 5.894454382826476, "grad_norm": 1.9910413026809692, "learning_rate": 8.107701564380264e-07, "loss": 0.3909, "step": 6590 }, { "epoch": 5.903398926654741, "grad_norm": 2.428554058074951, "learning_rate": 8.104693140794223e-07, "loss": 0.363, "step": 6600 }, { "epoch": 5.912343470483005, "grad_norm": 2.052687406539917, "learning_rate": 8.101684717208182e-07, "loss": 0.3876, "step": 6610 }, { "epoch": 5.92128801431127, "grad_norm": 2.212726354598999, "learning_rate": 8.098676293622142e-07, "loss": 0.3824, "step": 6620 }, { "epoch": 5.930232558139535, "grad_norm": 1.9802848100662231, "learning_rate": 8.0956678700361e-07, "loss": 0.3696, "step": 6630 }, { "epoch": 5.9391771019677995, "grad_norm": 2.106544256210327, "learning_rate": 8.09265944645006e-07, "loss": 0.3884, "step": 6640 }, { "epoch": 5.948121645796064, "grad_norm": 1.7887368202209473, "learning_rate": 8.089651022864019e-07, "loss": 0.3764, "step": 6650 }, { "epoch": 5.957066189624329, "grad_norm": 1.9476968050003052, "learning_rate": 8.086642599277978e-07, "loss": 0.3758, "step": 6660 }, { "epoch": 5.966010733452594, "grad_norm": 1.7526689767837524, "learning_rate": 8.083634175691937e-07, "loss": 0.3862, "step": 6670 }, { "epoch": 5.974955277280858, "grad_norm": 1.9941354990005493, "learning_rate": 8.080625752105897e-07, "loss": 0.3759, "step": 6680 }, { "epoch": 5.983899821109123, "grad_norm": 2.159471273422241, "learning_rate": 8.077617328519856e-07, "loss": 0.3838, "step": 6690 }, { "epoch": 5.992844364937389, "grad_norm": 2.2671890258789062, "learning_rate": 8.074608904933814e-07, "loss": 0.3745, "step": 6700 }, { "epoch": 6.0, "eval_bleu": 64.8987, "eval_gen_len": 75.4424, "eval_loss": 0.26931023597717285, "eval_runtime": 57.2125, "eval_samples_per_second": 18.213, "eval_steps_per_second": 0.192, "step": 6708 }, { "epoch": 6.001788908765653, "grad_norm": 2.142197847366333, "learning_rate": 8.071600481347773e-07, "loss": 0.3731, "step": 6710 }, { "epoch": 6.010733452593918, "grad_norm": 2.0640597343444824, "learning_rate": 8.068592057761732e-07, "loss": 0.3712, "step": 6720 }, { "epoch": 6.019677996422183, "grad_norm": 2.0293760299682617, "learning_rate": 8.065583634175691e-07, "loss": 0.3621, "step": 6730 }, { "epoch": 6.028622540250447, "grad_norm": 2.1380388736724854, "learning_rate": 8.06257521058965e-07, "loss": 0.3859, "step": 6740 }, { "epoch": 6.037567084078712, "grad_norm": 2.00776743888855, "learning_rate": 8.05956678700361e-07, "loss": 0.372, "step": 6750 }, { "epoch": 6.046511627906977, "grad_norm": 2.1911516189575195, "learning_rate": 8.056558363417569e-07, "loss": 0.3605, "step": 6760 }, { "epoch": 6.0554561717352415, "grad_norm": 1.8570131063461304, "learning_rate": 8.053549939831528e-07, "loss": 0.3731, "step": 6770 }, { "epoch": 6.064400715563506, "grad_norm": 2.439568519592285, "learning_rate": 8.050541516245487e-07, "loss": 0.3819, "step": 6780 }, { "epoch": 6.073345259391771, "grad_norm": 2.1658380031585693, "learning_rate": 8.047533092659447e-07, "loss": 0.3862, "step": 6790 }, { "epoch": 6.082289803220036, "grad_norm": 1.8695380687713623, "learning_rate": 8.044524669073405e-07, "loss": 0.3829, "step": 6800 }, { "epoch": 6.0912343470483, "grad_norm": 1.9415037631988525, "learning_rate": 8.041516245487365e-07, "loss": 0.3825, "step": 6810 }, { "epoch": 6.100178890876565, "grad_norm": 1.9677858352661133, "learning_rate": 8.038507821901323e-07, "loss": 0.3645, "step": 6820 }, { "epoch": 6.10912343470483, "grad_norm": 2.04807710647583, "learning_rate": 8.035499398315282e-07, "loss": 0.3718, "step": 6830 }, { "epoch": 6.1180679785330945, "grad_norm": 2.2417209148406982, "learning_rate": 8.032490974729241e-07, "loss": 0.3703, "step": 6840 }, { "epoch": 6.127012522361359, "grad_norm": 2.086850881576538, "learning_rate": 8.029482551143201e-07, "loss": 0.3684, "step": 6850 }, { "epoch": 6.135957066189624, "grad_norm": 2.112938165664673, "learning_rate": 8.02647412755716e-07, "loss": 0.383, "step": 6860 }, { "epoch": 6.1449016100178895, "grad_norm": 2.30755615234375, "learning_rate": 8.023465703971119e-07, "loss": 0.3692, "step": 6870 }, { "epoch": 6.153846153846154, "grad_norm": 1.943410873413086, "learning_rate": 8.020457280385078e-07, "loss": 0.3741, "step": 6880 }, { "epoch": 6.162790697674419, "grad_norm": 1.8077304363250732, "learning_rate": 8.017448856799037e-07, "loss": 0.3713, "step": 6890 }, { "epoch": 6.171735241502684, "grad_norm": 2.3878605365753174, "learning_rate": 8.014440433212996e-07, "loss": 0.3672, "step": 6900 }, { "epoch": 6.180679785330948, "grad_norm": 1.8652573823928833, "learning_rate": 8.011432009626955e-07, "loss": 0.3611, "step": 6910 }, { "epoch": 6.189624329159213, "grad_norm": 1.9883804321289062, "learning_rate": 8.008423586040915e-07, "loss": 0.3632, "step": 6920 }, { "epoch": 6.198568872987478, "grad_norm": 1.811181664466858, "learning_rate": 8.005415162454873e-07, "loss": 0.3643, "step": 6930 }, { "epoch": 6.207513416815742, "grad_norm": 2.0005087852478027, "learning_rate": 8.002406738868832e-07, "loss": 0.38, "step": 6940 }, { "epoch": 6.216457960644007, "grad_norm": 2.004239082336426, "learning_rate": 7.999398315282791e-07, "loss": 0.3682, "step": 6950 }, { "epoch": 6.225402504472272, "grad_norm": 2.189047336578369, "learning_rate": 7.996389891696751e-07, "loss": 0.3688, "step": 6960 }, { "epoch": 6.2343470483005365, "grad_norm": 1.7951610088348389, "learning_rate": 7.993381468110709e-07, "loss": 0.363, "step": 6970 }, { "epoch": 6.243291592128801, "grad_norm": 2.3958542346954346, "learning_rate": 7.990373044524669e-07, "loss": 0.3753, "step": 6980 }, { "epoch": 6.252236135957066, "grad_norm": 1.7929232120513916, "learning_rate": 7.987364620938628e-07, "loss": 0.3627, "step": 6990 }, { "epoch": 6.261180679785331, "grad_norm": 2.1548123359680176, "learning_rate": 7.984356197352587e-07, "loss": 0.3801, "step": 7000 }, { "epoch": 6.270125223613595, "grad_norm": 2.0325024127960205, "learning_rate": 7.981347773766546e-07, "loss": 0.3719, "step": 7010 }, { "epoch": 6.27906976744186, "grad_norm": 1.8743184804916382, "learning_rate": 7.978339350180506e-07, "loss": 0.3743, "step": 7020 }, { "epoch": 6.288014311270125, "grad_norm": 2.211442708969116, "learning_rate": 7.975330926594465e-07, "loss": 0.3474, "step": 7030 }, { "epoch": 6.29695885509839, "grad_norm": 2.0763611793518066, "learning_rate": 7.972322503008423e-07, "loss": 0.366, "step": 7040 }, { "epoch": 6.305903398926655, "grad_norm": 2.271003484725952, "learning_rate": 7.969314079422382e-07, "loss": 0.3742, "step": 7050 }, { "epoch": 6.31484794275492, "grad_norm": 2.1758437156677246, "learning_rate": 7.966305655836341e-07, "loss": 0.3637, "step": 7060 }, { "epoch": 6.3237924865831845, "grad_norm": 2.1391825675964355, "learning_rate": 7.9632972322503e-07, "loss": 0.3567, "step": 7070 }, { "epoch": 6.332737030411449, "grad_norm": 1.9387537240982056, "learning_rate": 7.960288808664259e-07, "loss": 0.3645, "step": 7080 }, { "epoch": 6.341681574239714, "grad_norm": 2.2183737754821777, "learning_rate": 7.957280385078219e-07, "loss": 0.3575, "step": 7090 }, { "epoch": 6.350626118067979, "grad_norm": 1.9483188390731812, "learning_rate": 7.954271961492178e-07, "loss": 0.3747, "step": 7100 }, { "epoch": 6.359570661896243, "grad_norm": 1.929976463317871, "learning_rate": 7.951263537906137e-07, "loss": 0.3479, "step": 7110 }, { "epoch": 6.368515205724508, "grad_norm": 1.893912434577942, "learning_rate": 7.948255114320096e-07, "loss": 0.3543, "step": 7120 }, { "epoch": 6.377459749552773, "grad_norm": 2.3432559967041016, "learning_rate": 7.945246690734056e-07, "loss": 0.3651, "step": 7130 }, { "epoch": 6.386404293381037, "grad_norm": 2.40236759185791, "learning_rate": 7.942238267148013e-07, "loss": 0.3757, "step": 7140 }, { "epoch": 6.395348837209302, "grad_norm": 1.7062771320343018, "learning_rate": 7.939229843561973e-07, "loss": 0.3597, "step": 7150 }, { "epoch": 6.404293381037567, "grad_norm": 1.9003205299377441, "learning_rate": 7.936221419975932e-07, "loss": 0.3616, "step": 7160 }, { "epoch": 6.4132379248658316, "grad_norm": 1.929549217224121, "learning_rate": 7.933212996389891e-07, "loss": 0.37, "step": 7170 }, { "epoch": 6.422182468694096, "grad_norm": 2.0662312507629395, "learning_rate": 7.93020457280385e-07, "loss": 0.3654, "step": 7180 }, { "epoch": 6.431127012522361, "grad_norm": 2.080864191055298, "learning_rate": 7.92719614921781e-07, "loss": 0.361, "step": 7190 }, { "epoch": 6.440071556350626, "grad_norm": 2.1335203647613525, "learning_rate": 7.924187725631769e-07, "loss": 0.3767, "step": 7200 }, { "epoch": 6.449016100178891, "grad_norm": 2.0065715312957764, "learning_rate": 7.921179302045728e-07, "loss": 0.3654, "step": 7210 }, { "epoch": 6.457960644007156, "grad_norm": 2.2601537704467773, "learning_rate": 7.918170878459687e-07, "loss": 0.3665, "step": 7220 }, { "epoch": 6.466905187835421, "grad_norm": 1.9954750537872314, "learning_rate": 7.915162454873647e-07, "loss": 0.3744, "step": 7230 }, { "epoch": 6.475849731663685, "grad_norm": 2.185459852218628, "learning_rate": 7.912154031287605e-07, "loss": 0.3667, "step": 7240 }, { "epoch": 6.48479427549195, "grad_norm": 1.9032912254333496, "learning_rate": 7.909145607701563e-07, "loss": 0.3529, "step": 7250 }, { "epoch": 6.493738819320215, "grad_norm": 1.798946738243103, "learning_rate": 7.906137184115523e-07, "loss": 0.3671, "step": 7260 }, { "epoch": 6.5026833631484795, "grad_norm": 2.072037935256958, "learning_rate": 7.903128760529482e-07, "loss": 0.3748, "step": 7270 }, { "epoch": 6.511627906976744, "grad_norm": 2.0674805641174316, "learning_rate": 7.900120336943441e-07, "loss": 0.3539, "step": 7280 }, { "epoch": 6.520572450805009, "grad_norm": 1.8857333660125732, "learning_rate": 7.8971119133574e-07, "loss": 0.3476, "step": 7290 }, { "epoch": 6.529516994633274, "grad_norm": 1.8023921251296997, "learning_rate": 7.89410348977136e-07, "loss": 0.3654, "step": 7300 }, { "epoch": 6.538461538461538, "grad_norm": 1.7650887966156006, "learning_rate": 7.891095066185318e-07, "loss": 0.3548, "step": 7310 }, { "epoch": 6.547406082289803, "grad_norm": 1.9362927675247192, "learning_rate": 7.888086642599278e-07, "loss": 0.3561, "step": 7320 }, { "epoch": 6.556350626118068, "grad_norm": 1.8597862720489502, "learning_rate": 7.885078219013237e-07, "loss": 0.3621, "step": 7330 }, { "epoch": 6.565295169946332, "grad_norm": 2.072204351425171, "learning_rate": 7.882069795427196e-07, "loss": 0.3599, "step": 7340 }, { "epoch": 6.574239713774597, "grad_norm": 2.5796756744384766, "learning_rate": 7.879061371841155e-07, "loss": 0.3505, "step": 7350 }, { "epoch": 6.583184257602863, "grad_norm": 2.2466342449188232, "learning_rate": 7.876052948255115e-07, "loss": 0.3551, "step": 7360 }, { "epoch": 6.592128801431127, "grad_norm": 1.9930235147476196, "learning_rate": 7.873044524669073e-07, "loss": 0.3643, "step": 7370 }, { "epoch": 6.601073345259392, "grad_norm": 2.447427988052368, "learning_rate": 7.870036101083032e-07, "loss": 0.3649, "step": 7380 }, { "epoch": 6.610017889087657, "grad_norm": 1.9434752464294434, "learning_rate": 7.867027677496991e-07, "loss": 0.3573, "step": 7390 }, { "epoch": 6.618962432915922, "grad_norm": 1.8593007326126099, "learning_rate": 7.864019253910951e-07, "loss": 0.3528, "step": 7400 }, { "epoch": 6.627906976744186, "grad_norm": 1.7246472835540771, "learning_rate": 7.861010830324909e-07, "loss": 0.3397, "step": 7410 }, { "epoch": 6.636851520572451, "grad_norm": 1.7897623777389526, "learning_rate": 7.858002406738868e-07, "loss": 0.3483, "step": 7420 }, { "epoch": 6.645796064400716, "grad_norm": 1.6254141330718994, "learning_rate": 7.854993983152828e-07, "loss": 0.3543, "step": 7430 }, { "epoch": 6.65474060822898, "grad_norm": 2.3027360439300537, "learning_rate": 7.851985559566786e-07, "loss": 0.3663, "step": 7440 }, { "epoch": 6.663685152057245, "grad_norm": 1.8483282327651978, "learning_rate": 7.848977135980746e-07, "loss": 0.352, "step": 7450 }, { "epoch": 6.67262969588551, "grad_norm": 1.87179696559906, "learning_rate": 7.845968712394705e-07, "loss": 0.3652, "step": 7460 }, { "epoch": 6.6815742397137745, "grad_norm": 1.859720230102539, "learning_rate": 7.842960288808665e-07, "loss": 0.3572, "step": 7470 }, { "epoch": 6.690518783542039, "grad_norm": 1.9132184982299805, "learning_rate": 7.839951865222622e-07, "loss": 0.3506, "step": 7480 }, { "epoch": 6.699463327370304, "grad_norm": 2.097236394882202, "learning_rate": 7.836943441636582e-07, "loss": 0.3484, "step": 7490 }, { "epoch": 6.708407871198569, "grad_norm": 1.7888951301574707, "learning_rate": 7.833935018050541e-07, "loss": 0.3555, "step": 7500 }, { "epoch": 6.717352415026833, "grad_norm": 2.1230521202087402, "learning_rate": 7.8309265944645e-07, "loss": 0.345, "step": 7510 }, { "epoch": 6.726296958855098, "grad_norm": 2.173988103866577, "learning_rate": 7.827918170878459e-07, "loss": 0.3505, "step": 7520 }, { "epoch": 6.735241502683364, "grad_norm": 2.3823533058166504, "learning_rate": 7.824909747292419e-07, "loss": 0.3517, "step": 7530 }, { "epoch": 6.7441860465116275, "grad_norm": 1.8840171098709106, "learning_rate": 7.821901323706378e-07, "loss": 0.3518, "step": 7540 }, { "epoch": 6.753130590339893, "grad_norm": 1.7635117769241333, "learning_rate": 7.818892900120337e-07, "loss": 0.355, "step": 7550 }, { "epoch": 6.762075134168158, "grad_norm": 2.197007656097412, "learning_rate": 7.815884476534296e-07, "loss": 0.3512, "step": 7560 }, { "epoch": 6.7710196779964225, "grad_norm": 1.796606421470642, "learning_rate": 7.812876052948256e-07, "loss": 0.3483, "step": 7570 }, { "epoch": 6.779964221824687, "grad_norm": 1.990676760673523, "learning_rate": 7.809867629362213e-07, "loss": 0.3528, "step": 7580 }, { "epoch": 6.788908765652952, "grad_norm": 1.8666518926620483, "learning_rate": 7.806859205776172e-07, "loss": 0.3449, "step": 7590 }, { "epoch": 6.797853309481217, "grad_norm": 1.7525831460952759, "learning_rate": 7.803850782190132e-07, "loss": 0.3534, "step": 7600 }, { "epoch": 6.806797853309481, "grad_norm": 2.3835935592651367, "learning_rate": 7.80084235860409e-07, "loss": 0.3486, "step": 7610 }, { "epoch": 6.815742397137746, "grad_norm": 1.987086296081543, "learning_rate": 7.79783393501805e-07, "loss": 0.3554, "step": 7620 }, { "epoch": 6.824686940966011, "grad_norm": 1.9066219329833984, "learning_rate": 7.794825511432009e-07, "loss": 0.3469, "step": 7630 }, { "epoch": 6.833631484794275, "grad_norm": 1.7373526096343994, "learning_rate": 7.791817087845969e-07, "loss": 0.3421, "step": 7640 }, { "epoch": 6.84257602862254, "grad_norm": 1.9056932926177979, "learning_rate": 7.788808664259927e-07, "loss": 0.3489, "step": 7650 }, { "epoch": 6.851520572450805, "grad_norm": 1.7385718822479248, "learning_rate": 7.785800240673887e-07, "loss": 0.3528, "step": 7660 }, { "epoch": 6.8604651162790695, "grad_norm": 2.358637571334839, "learning_rate": 7.782791817087846e-07, "loss": 0.3438, "step": 7670 }, { "epoch": 6.869409660107334, "grad_norm": 2.082883834838867, "learning_rate": 7.779783393501805e-07, "loss": 0.3635, "step": 7680 }, { "epoch": 6.878354203935599, "grad_norm": 1.8540581464767456, "learning_rate": 7.776774969915763e-07, "loss": 0.3412, "step": 7690 }, { "epoch": 6.8872987477638645, "grad_norm": 2.0504400730133057, "learning_rate": 7.773766546329723e-07, "loss": 0.3476, "step": 7700 }, { "epoch": 6.896243291592128, "grad_norm": 1.7489218711853027, "learning_rate": 7.770758122743682e-07, "loss": 0.3459, "step": 7710 }, { "epoch": 6.905187835420394, "grad_norm": 1.8986119031906128, "learning_rate": 7.767749699157641e-07, "loss": 0.3532, "step": 7720 }, { "epoch": 6.914132379248659, "grad_norm": 2.127493143081665, "learning_rate": 7.7647412755716e-07, "loss": 0.3532, "step": 7730 }, { "epoch": 6.923076923076923, "grad_norm": 1.7227003574371338, "learning_rate": 7.76173285198556e-07, "loss": 0.3277, "step": 7740 }, { "epoch": 6.932021466905188, "grad_norm": 1.9514144659042358, "learning_rate": 7.758724428399518e-07, "loss": 0.3448, "step": 7750 }, { "epoch": 6.940966010733453, "grad_norm": 2.035494089126587, "learning_rate": 7.755716004813477e-07, "loss": 0.3462, "step": 7760 }, { "epoch": 6.9499105545617175, "grad_norm": 1.9275267124176025, "learning_rate": 7.752707581227437e-07, "loss": 0.3435, "step": 7770 }, { "epoch": 6.958855098389982, "grad_norm": 1.8266222476959229, "learning_rate": 7.749699157641395e-07, "loss": 0.3354, "step": 7780 }, { "epoch": 6.967799642218247, "grad_norm": 2.1054952144622803, "learning_rate": 7.746690734055355e-07, "loss": 0.348, "step": 7790 }, { "epoch": 6.976744186046512, "grad_norm": 1.929856300354004, "learning_rate": 7.743682310469313e-07, "loss": 0.3368, "step": 7800 }, { "epoch": 6.985688729874776, "grad_norm": 1.9243886470794678, "learning_rate": 7.740673886883273e-07, "loss": 0.339, "step": 7810 }, { "epoch": 6.994633273703041, "grad_norm": 1.7127997875213623, "learning_rate": 7.737665463297231e-07, "loss": 0.3552, "step": 7820 }, { "epoch": 7.0, "eval_bleu": 67.7607, "eval_gen_len": 75.2438, "eval_loss": 0.24548278748989105, "eval_runtime": 55.6065, "eval_samples_per_second": 18.739, "eval_steps_per_second": 0.198, "step": 7826 }, { "epoch": 7.003577817531306, "grad_norm": 1.4987674951553345, "learning_rate": 7.734657039711191e-07, "loss": 0.3365, "step": 7830 }, { "epoch": 7.01252236135957, "grad_norm": 1.9626741409301758, "learning_rate": 7.73164861612515e-07, "loss": 0.3284, "step": 7840 }, { "epoch": 7.021466905187835, "grad_norm": 2.43878436088562, "learning_rate": 7.728640192539109e-07, "loss": 0.3435, "step": 7850 }, { "epoch": 7.0304114490161, "grad_norm": 2.184654712677002, "learning_rate": 7.725631768953068e-07, "loss": 0.3573, "step": 7860 }, { "epoch": 7.0393559928443645, "grad_norm": 2.2012953758239746, "learning_rate": 7.722623345367028e-07, "loss": 0.3417, "step": 7870 }, { "epoch": 7.04830053667263, "grad_norm": 2.001495599746704, "learning_rate": 7.719614921780987e-07, "loss": 0.3223, "step": 7880 }, { "epoch": 7.057245080500895, "grad_norm": 1.6574831008911133, "learning_rate": 7.716606498194946e-07, "loss": 0.3331, "step": 7890 }, { "epoch": 7.0661896243291595, "grad_norm": 1.8421671390533447, "learning_rate": 7.713598074608905e-07, "loss": 0.3402, "step": 7900 }, { "epoch": 7.075134168157424, "grad_norm": 2.073538303375244, "learning_rate": 7.710589651022865e-07, "loss": 0.3427, "step": 7910 }, { "epoch": 7.084078711985689, "grad_norm": 1.7038462162017822, "learning_rate": 7.707581227436822e-07, "loss": 0.3423, "step": 7920 }, { "epoch": 7.093023255813954, "grad_norm": 1.9820359945297241, "learning_rate": 7.704572803850781e-07, "loss": 0.3418, "step": 7930 }, { "epoch": 7.101967799642218, "grad_norm": 1.8462133407592773, "learning_rate": 7.701564380264741e-07, "loss": 0.3441, "step": 7940 }, { "epoch": 7.110912343470483, "grad_norm": 1.9584110975265503, "learning_rate": 7.698555956678699e-07, "loss": 0.3414, "step": 7950 }, { "epoch": 7.119856887298748, "grad_norm": 2.0095632076263428, "learning_rate": 7.695547533092659e-07, "loss": 0.3334, "step": 7960 }, { "epoch": 7.1288014311270125, "grad_norm": 2.040109395980835, "learning_rate": 7.692539109506618e-07, "loss": 0.3381, "step": 7970 }, { "epoch": 7.137745974955277, "grad_norm": 1.7459338903427124, "learning_rate": 7.689530685920578e-07, "loss": 0.3286, "step": 7980 }, { "epoch": 7.146690518783542, "grad_norm": 1.7973277568817139, "learning_rate": 7.686522262334536e-07, "loss": 0.3241, "step": 7990 }, { "epoch": 7.155635062611807, "grad_norm": 2.1315131187438965, "learning_rate": 7.683513838748496e-07, "loss": 0.3131, "step": 8000 }, { "epoch": 7.164579606440071, "grad_norm": 2.5606272220611572, "learning_rate": 7.680505415162455e-07, "loss": 0.3306, "step": 8010 }, { "epoch": 7.173524150268336, "grad_norm": 1.9568837881088257, "learning_rate": 7.677496991576414e-07, "loss": 0.3286, "step": 8020 }, { "epoch": 7.182468694096601, "grad_norm": 1.9463409185409546, "learning_rate": 7.674488567990372e-07, "loss": 0.3464, "step": 8030 }, { "epoch": 7.191413237924865, "grad_norm": 1.9276138544082642, "learning_rate": 7.671480144404332e-07, "loss": 0.3437, "step": 8040 }, { "epoch": 7.200357781753131, "grad_norm": 1.7747093439102173, "learning_rate": 7.668471720818291e-07, "loss": 0.3372, "step": 8050 }, { "epoch": 7.209302325581396, "grad_norm": 1.7246421575546265, "learning_rate": 7.66546329723225e-07, "loss": 0.3437, "step": 8060 }, { "epoch": 7.21824686940966, "grad_norm": 2.0087859630584717, "learning_rate": 7.662454873646209e-07, "loss": 0.3437, "step": 8070 }, { "epoch": 7.227191413237925, "grad_norm": 1.9562363624572754, "learning_rate": 7.659446450060169e-07, "loss": 0.3148, "step": 8080 }, { "epoch": 7.23613595706619, "grad_norm": 1.9914944171905518, "learning_rate": 7.656438026474127e-07, "loss": 0.3327, "step": 8090 }, { "epoch": 7.2450805008944545, "grad_norm": 2.276630401611328, "learning_rate": 7.653429602888086e-07, "loss": 0.3285, "step": 8100 }, { "epoch": 7.254025044722719, "grad_norm": 1.6960512399673462, "learning_rate": 7.650421179302046e-07, "loss": 0.3303, "step": 8110 }, { "epoch": 7.262969588550984, "grad_norm": 1.9524924755096436, "learning_rate": 7.647412755716004e-07, "loss": 0.3328, "step": 8120 }, { "epoch": 7.271914132379249, "grad_norm": 1.7584964036941528, "learning_rate": 7.644404332129964e-07, "loss": 0.3264, "step": 8130 }, { "epoch": 7.280858676207513, "grad_norm": 1.9152427911758423, "learning_rate": 7.641395908543922e-07, "loss": 0.333, "step": 8140 }, { "epoch": 7.289803220035778, "grad_norm": 1.6606615781784058, "learning_rate": 7.638387484957882e-07, "loss": 0.328, "step": 8150 }, { "epoch": 7.298747763864043, "grad_norm": 1.8712750673294067, "learning_rate": 7.63537906137184e-07, "loss": 0.3203, "step": 8160 }, { "epoch": 7.3076923076923075, "grad_norm": 1.7427736520767212, "learning_rate": 7.6323706377858e-07, "loss": 0.3284, "step": 8170 }, { "epoch": 7.316636851520572, "grad_norm": 2.324427366256714, "learning_rate": 7.629362214199759e-07, "loss": 0.3354, "step": 8180 }, { "epoch": 7.325581395348837, "grad_norm": 1.9044541120529175, "learning_rate": 7.626353790613718e-07, "loss": 0.3379, "step": 8190 }, { "epoch": 7.334525939177102, "grad_norm": 2.359497547149658, "learning_rate": 7.623345367027677e-07, "loss": 0.3252, "step": 8200 }, { "epoch": 7.343470483005366, "grad_norm": 2.01823091506958, "learning_rate": 7.620336943441637e-07, "loss": 0.3203, "step": 8210 }, { "epoch": 7.352415026833631, "grad_norm": 2.5603623390197754, "learning_rate": 7.617328519855595e-07, "loss": 0.3251, "step": 8220 }, { "epoch": 7.361359570661897, "grad_norm": 2.2850921154022217, "learning_rate": 7.614320096269555e-07, "loss": 0.3468, "step": 8230 }, { "epoch": 7.370304114490161, "grad_norm": 1.7620161771774292, "learning_rate": 7.611311672683514e-07, "loss": 0.3267, "step": 8240 }, { "epoch": 7.379248658318426, "grad_norm": 1.9929213523864746, "learning_rate": 7.608303249097473e-07, "loss": 0.3309, "step": 8250 }, { "epoch": 7.388193202146691, "grad_norm": 1.846187710762024, "learning_rate": 7.605294825511431e-07, "loss": 0.3391, "step": 8260 }, { "epoch": 7.397137745974955, "grad_norm": 1.825868844985962, "learning_rate": 7.60228640192539e-07, "loss": 0.3348, "step": 8270 }, { "epoch": 7.40608228980322, "grad_norm": 1.8973084688186646, "learning_rate": 7.59927797833935e-07, "loss": 0.3282, "step": 8280 }, { "epoch": 7.415026833631485, "grad_norm": 2.0674967765808105, "learning_rate": 7.596269554753308e-07, "loss": 0.322, "step": 8290 }, { "epoch": 7.4239713774597496, "grad_norm": 1.8808948993682861, "learning_rate": 7.593261131167268e-07, "loss": 0.3143, "step": 8300 }, { "epoch": 7.432915921288014, "grad_norm": 2.2094178199768066, "learning_rate": 7.590252707581227e-07, "loss": 0.3319, "step": 8310 }, { "epoch": 7.441860465116279, "grad_norm": 1.8964875936508179, "learning_rate": 7.587244283995187e-07, "loss": 0.3245, "step": 8320 }, { "epoch": 7.450805008944544, "grad_norm": 1.961187481880188, "learning_rate": 7.584235860409145e-07, "loss": 0.3192, "step": 8330 }, { "epoch": 7.459749552772808, "grad_norm": 2.1825661659240723, "learning_rate": 7.581227436823105e-07, "loss": 0.3427, "step": 8340 }, { "epoch": 7.468694096601073, "grad_norm": 1.9353907108306885, "learning_rate": 7.578219013237064e-07, "loss": 0.315, "step": 8350 }, { "epoch": 7.477638640429338, "grad_norm": 1.852070927619934, "learning_rate": 7.575210589651022e-07, "loss": 0.3275, "step": 8360 }, { "epoch": 7.4865831842576025, "grad_norm": 1.8378336429595947, "learning_rate": 7.572202166064981e-07, "loss": 0.323, "step": 8370 }, { "epoch": 7.495527728085868, "grad_norm": 1.6793603897094727, "learning_rate": 7.569193742478941e-07, "loss": 0.3284, "step": 8380 }, { "epoch": 7.504472271914132, "grad_norm": 1.8029990196228027, "learning_rate": 7.566185318892899e-07, "loss": 0.3216, "step": 8390 }, { "epoch": 7.5134168157423975, "grad_norm": 1.9036970138549805, "learning_rate": 7.563176895306859e-07, "loss": 0.324, "step": 8400 }, { "epoch": 7.522361359570662, "grad_norm": 1.8701826333999634, "learning_rate": 7.560168471720818e-07, "loss": 0.3232, "step": 8410 }, { "epoch": 7.531305903398927, "grad_norm": 1.6452906131744385, "learning_rate": 7.557160048134778e-07, "loss": 0.3327, "step": 8420 }, { "epoch": 7.540250447227192, "grad_norm": 1.9045932292938232, "learning_rate": 7.554151624548736e-07, "loss": 0.3254, "step": 8430 }, { "epoch": 7.549194991055456, "grad_norm": 1.8441321849822998, "learning_rate": 7.551143200962695e-07, "loss": 0.3255, "step": 8440 }, { "epoch": 7.558139534883721, "grad_norm": 1.7707712650299072, "learning_rate": 7.548134777376655e-07, "loss": 0.33, "step": 8450 }, { "epoch": 7.567084078711986, "grad_norm": 1.815173625946045, "learning_rate": 7.545126353790612e-07, "loss": 0.3307, "step": 8460 }, { "epoch": 7.5760286225402504, "grad_norm": 1.857528805732727, "learning_rate": 7.542117930204572e-07, "loss": 0.3196, "step": 8470 }, { "epoch": 7.584973166368515, "grad_norm": 2.055410623550415, "learning_rate": 7.539109506618531e-07, "loss": 0.3183, "step": 8480 }, { "epoch": 7.59391771019678, "grad_norm": 1.8891446590423584, "learning_rate": 7.536101083032491e-07, "loss": 0.3357, "step": 8490 }, { "epoch": 7.602862254025045, "grad_norm": 1.4552106857299805, "learning_rate": 7.533092659446449e-07, "loss": 0.314, "step": 8500 }, { "epoch": 7.611806797853309, "grad_norm": 1.7067893743515015, "learning_rate": 7.530084235860409e-07, "loss": 0.3181, "step": 8510 }, { "epoch": 7.620751341681574, "grad_norm": 1.9766528606414795, "learning_rate": 7.527075812274368e-07, "loss": 0.3173, "step": 8520 }, { "epoch": 7.629695885509839, "grad_norm": 1.8885570764541626, "learning_rate": 7.524067388688327e-07, "loss": 0.32, "step": 8530 }, { "epoch": 7.638640429338103, "grad_norm": 2.078587055206299, "learning_rate": 7.521058965102286e-07, "loss": 0.307, "step": 8540 }, { "epoch": 7.647584973166369, "grad_norm": 1.9613964557647705, "learning_rate": 7.518050541516246e-07, "loss": 0.3182, "step": 8550 }, { "epoch": 7.656529516994633, "grad_norm": 1.9764385223388672, "learning_rate": 7.515042117930204e-07, "loss": 0.3352, "step": 8560 }, { "epoch": 7.665474060822898, "grad_norm": 2.0007693767547607, "learning_rate": 7.512033694344164e-07, "loss": 0.3244, "step": 8570 }, { "epoch": 7.674418604651163, "grad_norm": 1.6230690479278564, "learning_rate": 7.509025270758122e-07, "loss": 0.3219, "step": 8580 }, { "epoch": 7.683363148479428, "grad_norm": 1.8112707138061523, "learning_rate": 7.506016847172082e-07, "loss": 0.3196, "step": 8590 }, { "epoch": 7.6923076923076925, "grad_norm": 1.8433512449264526, "learning_rate": 7.50300842358604e-07, "loss": 0.3201, "step": 8600 }, { "epoch": 7.701252236135957, "grad_norm": 2.180025100708008, "learning_rate": 7.5e-07, "loss": 0.3189, "step": 8610 }, { "epoch": 7.710196779964222, "grad_norm": 1.7990797758102417, "learning_rate": 7.496991576413959e-07, "loss": 0.3161, "step": 8620 }, { "epoch": 7.719141323792487, "grad_norm": 1.9342771768569946, "learning_rate": 7.493983152827917e-07, "loss": 0.3198, "step": 8630 }, { "epoch": 7.728085867620751, "grad_norm": 1.7878928184509277, "learning_rate": 7.490974729241877e-07, "loss": 0.3136, "step": 8640 }, { "epoch": 7.737030411449016, "grad_norm": 1.9334170818328857, "learning_rate": 7.487966305655836e-07, "loss": 0.3101, "step": 8650 }, { "epoch": 7.745974955277281, "grad_norm": 1.6836000680923462, "learning_rate": 7.484957882069796e-07, "loss": 0.3102, "step": 8660 }, { "epoch": 7.7549194991055455, "grad_norm": 2.0518503189086914, "learning_rate": 7.481949458483754e-07, "loss": 0.3089, "step": 8670 }, { "epoch": 7.76386404293381, "grad_norm": 1.9015884399414062, "learning_rate": 7.478941034897714e-07, "loss": 0.3116, "step": 8680 }, { "epoch": 7.772808586762075, "grad_norm": 1.910198450088501, "learning_rate": 7.475932611311672e-07, "loss": 0.3133, "step": 8690 }, { "epoch": 7.78175313059034, "grad_norm": 1.6957149505615234, "learning_rate": 7.472924187725631e-07, "loss": 0.3079, "step": 8700 }, { "epoch": 7.790697674418604, "grad_norm": 1.673759937286377, "learning_rate": 7.46991576413959e-07, "loss": 0.3109, "step": 8710 }, { "epoch": 7.79964221824687, "grad_norm": 1.676411509513855, "learning_rate": 7.46690734055355e-07, "loss": 0.3271, "step": 8720 }, { "epoch": 7.808586762075134, "grad_norm": 1.729395866394043, "learning_rate": 7.463898916967508e-07, "loss": 0.3222, "step": 8730 }, { "epoch": 7.817531305903399, "grad_norm": 1.7788453102111816, "learning_rate": 7.460890493381468e-07, "loss": 0.3099, "step": 8740 }, { "epoch": 7.826475849731664, "grad_norm": 1.9072816371917725, "learning_rate": 7.457882069795427e-07, "loss": 0.3078, "step": 8750 }, { "epoch": 7.835420393559929, "grad_norm": 1.8437711000442505, "learning_rate": 7.454873646209387e-07, "loss": 0.3151, "step": 8760 }, { "epoch": 7.844364937388193, "grad_norm": 1.8351961374282837, "learning_rate": 7.451865222623345e-07, "loss": 0.3233, "step": 8770 }, { "epoch": 7.853309481216458, "grad_norm": 1.892714023590088, "learning_rate": 7.448856799037305e-07, "loss": 0.3242, "step": 8780 }, { "epoch": 7.862254025044723, "grad_norm": 1.8229130506515503, "learning_rate": 7.445848375451264e-07, "loss": 0.3077, "step": 8790 }, { "epoch": 7.8711985688729875, "grad_norm": 1.7857228517532349, "learning_rate": 7.442839951865221e-07, "loss": 0.3269, "step": 8800 }, { "epoch": 7.880143112701252, "grad_norm": 1.8515849113464355, "learning_rate": 7.439831528279181e-07, "loss": 0.3163, "step": 8810 }, { "epoch": 7.889087656529517, "grad_norm": 1.964338779449463, "learning_rate": 7.43682310469314e-07, "loss": 0.3207, "step": 8820 }, { "epoch": 7.898032200357782, "grad_norm": 1.8012237548828125, "learning_rate": 7.4338146811071e-07, "loss": 0.3204, "step": 8830 }, { "epoch": 7.906976744186046, "grad_norm": 1.8915505409240723, "learning_rate": 7.430806257521058e-07, "loss": 0.3157, "step": 8840 }, { "epoch": 7.915921288014311, "grad_norm": 1.9770463705062866, "learning_rate": 7.427797833935018e-07, "loss": 0.302, "step": 8850 }, { "epoch": 7.924865831842576, "grad_norm": 1.6322442293167114, "learning_rate": 7.424789410348977e-07, "loss": 0.3229, "step": 8860 }, { "epoch": 7.9338103756708405, "grad_norm": 2.0155563354492188, "learning_rate": 7.421780986762936e-07, "loss": 0.3021, "step": 8870 }, { "epoch": 7.942754919499105, "grad_norm": 1.6682124137878418, "learning_rate": 7.418772563176895e-07, "loss": 0.3259, "step": 8880 }, { "epoch": 7.951699463327371, "grad_norm": 1.604107141494751, "learning_rate": 7.415764139590855e-07, "loss": 0.3006, "step": 8890 }, { "epoch": 7.960644007155635, "grad_norm": 2.204362630844116, "learning_rate": 7.412755716004812e-07, "loss": 0.31, "step": 8900 }, { "epoch": 7.9695885509839, "grad_norm": 1.7704609632492065, "learning_rate": 7.409747292418772e-07, "loss": 0.3284, "step": 8910 }, { "epoch": 7.978533094812165, "grad_norm": 2.1972293853759766, "learning_rate": 7.406738868832731e-07, "loss": 0.32, "step": 8920 }, { "epoch": 7.98747763864043, "grad_norm": 1.8060202598571777, "learning_rate": 7.403730445246691e-07, "loss": 0.3107, "step": 8930 }, { "epoch": 7.996422182468694, "grad_norm": 1.6772303581237793, "learning_rate": 7.400722021660649e-07, "loss": 0.3324, "step": 8940 }, { "epoch": 8.0, "eval_bleu": 69.635, "eval_gen_len": 75.1036, "eval_loss": 0.22736571729183197, "eval_runtime": 56.5288, "eval_samples_per_second": 18.433, "eval_steps_per_second": 0.195, "step": 8944 }, { "epoch": 8.005366726296959, "grad_norm": 1.878150463104248, "learning_rate": 7.397713598074609e-07, "loss": 0.3096, "step": 8950 }, { "epoch": 8.014311270125223, "grad_norm": 1.708453893661499, "learning_rate": 7.394705174488568e-07, "loss": 0.3129, "step": 8960 }, { "epoch": 8.023255813953488, "grad_norm": 2.111327886581421, "learning_rate": 7.391696750902526e-07, "loss": 0.3115, "step": 8970 }, { "epoch": 8.032200357781754, "grad_norm": 1.8746176958084106, "learning_rate": 7.388688327316486e-07, "loss": 0.3133, "step": 8980 }, { "epoch": 8.041144901610018, "grad_norm": 1.7758466005325317, "learning_rate": 7.385679903730445e-07, "loss": 0.3204, "step": 8990 }, { "epoch": 8.050089445438283, "grad_norm": 1.8004777431488037, "learning_rate": 7.382671480144405e-07, "loss": 0.3175, "step": 9000 }, { "epoch": 8.059033989266547, "grad_norm": 1.8340625762939453, "learning_rate": 7.379663056558362e-07, "loss": 0.3095, "step": 9010 }, { "epoch": 8.067978533094813, "grad_norm": 1.6976382732391357, "learning_rate": 7.376654632972322e-07, "loss": 0.3202, "step": 9020 }, { "epoch": 8.076923076923077, "grad_norm": 1.9153364896774292, "learning_rate": 7.373646209386281e-07, "loss": 0.3142, "step": 9030 }, { "epoch": 8.085867620751342, "grad_norm": 1.8551111221313477, "learning_rate": 7.37063778580024e-07, "loss": 0.2903, "step": 9040 }, { "epoch": 8.094812164579606, "grad_norm": 1.7493977546691895, "learning_rate": 7.367629362214199e-07, "loss": 0.3034, "step": 9050 }, { "epoch": 8.103756708407872, "grad_norm": 1.7226780652999878, "learning_rate": 7.364620938628159e-07, "loss": 0.309, "step": 9060 }, { "epoch": 8.112701252236135, "grad_norm": 2.0469791889190674, "learning_rate": 7.361612515042117e-07, "loss": 0.3073, "step": 9070 }, { "epoch": 8.121645796064401, "grad_norm": 1.8778507709503174, "learning_rate": 7.358604091456077e-07, "loss": 0.3106, "step": 9080 }, { "epoch": 8.130590339892665, "grad_norm": 2.180492639541626, "learning_rate": 7.355595667870036e-07, "loss": 0.3061, "step": 9090 }, { "epoch": 8.13953488372093, "grad_norm": 1.9905855655670166, "learning_rate": 7.352587244283996e-07, "loss": 0.315, "step": 9100 }, { "epoch": 8.148479427549194, "grad_norm": 1.7467095851898193, "learning_rate": 7.349578820697954e-07, "loss": 0.2965, "step": 9110 }, { "epoch": 8.15742397137746, "grad_norm": 1.6741291284561157, "learning_rate": 7.346570397111914e-07, "loss": 0.2965, "step": 9120 }, { "epoch": 8.166368515205724, "grad_norm": 1.6304727792739868, "learning_rate": 7.343561973525872e-07, "loss": 0.3133, "step": 9130 }, { "epoch": 8.17531305903399, "grad_norm": 1.9069669246673584, "learning_rate": 7.34055354993983e-07, "loss": 0.3063, "step": 9140 }, { "epoch": 8.184257602862255, "grad_norm": 1.7819583415985107, "learning_rate": 7.33754512635379e-07, "loss": 0.3107, "step": 9150 }, { "epoch": 8.193202146690519, "grad_norm": 2.1937334537506104, "learning_rate": 7.334536702767749e-07, "loss": 0.312, "step": 9160 }, { "epoch": 8.202146690518784, "grad_norm": 2.098236083984375, "learning_rate": 7.331528279181708e-07, "loss": 0.3091, "step": 9170 }, { "epoch": 8.211091234347048, "grad_norm": 1.8808610439300537, "learning_rate": 7.328519855595667e-07, "loss": 0.3014, "step": 9180 }, { "epoch": 8.220035778175314, "grad_norm": 2.0765531063079834, "learning_rate": 7.325511432009627e-07, "loss": 0.3057, "step": 9190 }, { "epoch": 8.228980322003578, "grad_norm": 1.7418488264083862, "learning_rate": 7.322503008423586e-07, "loss": 0.2948, "step": 9200 }, { "epoch": 8.237924865831843, "grad_norm": 2.491260528564453, "learning_rate": 7.319494584837545e-07, "loss": 0.2992, "step": 9210 }, { "epoch": 8.246869409660107, "grad_norm": 1.9340630769729614, "learning_rate": 7.316486161251504e-07, "loss": 0.3065, "step": 9220 }, { "epoch": 8.255813953488373, "grad_norm": 1.5319713354110718, "learning_rate": 7.313477737665464e-07, "loss": 0.2944, "step": 9230 }, { "epoch": 8.264758497316636, "grad_norm": 1.8187763690948486, "learning_rate": 7.310469314079421e-07, "loss": 0.3155, "step": 9240 }, { "epoch": 8.273703041144902, "grad_norm": 2.0588202476501465, "learning_rate": 7.307460890493381e-07, "loss": 0.3024, "step": 9250 }, { "epoch": 8.282647584973166, "grad_norm": 1.6918816566467285, "learning_rate": 7.30445246690734e-07, "loss": 0.2959, "step": 9260 }, { "epoch": 8.291592128801431, "grad_norm": 1.8093841075897217, "learning_rate": 7.3014440433213e-07, "loss": 0.3049, "step": 9270 }, { "epoch": 8.300536672629695, "grad_norm": 1.782524824142456, "learning_rate": 7.298435619735258e-07, "loss": 0.2912, "step": 9280 }, { "epoch": 8.30948121645796, "grad_norm": 1.842565894126892, "learning_rate": 7.295427196149218e-07, "loss": 0.2942, "step": 9290 }, { "epoch": 8.318425760286225, "grad_norm": 1.786942958831787, "learning_rate": 7.292418772563177e-07, "loss": 0.2944, "step": 9300 }, { "epoch": 8.32737030411449, "grad_norm": 2.370697021484375, "learning_rate": 7.289410348977135e-07, "loss": 0.3061, "step": 9310 }, { "epoch": 8.336314847942756, "grad_norm": 1.8887977600097656, "learning_rate": 7.286401925391095e-07, "loss": 0.2966, "step": 9320 }, { "epoch": 8.34525939177102, "grad_norm": 1.9103881120681763, "learning_rate": 7.283393501805054e-07, "loss": 0.3035, "step": 9330 }, { "epoch": 8.354203935599285, "grad_norm": 1.6420018672943115, "learning_rate": 7.280385078219013e-07, "loss": 0.302, "step": 9340 }, { "epoch": 8.363148479427549, "grad_norm": 1.8546104431152344, "learning_rate": 7.277376654632971e-07, "loss": 0.2954, "step": 9350 }, { "epoch": 8.372093023255815, "grad_norm": 1.7940912246704102, "learning_rate": 7.274368231046931e-07, "loss": 0.307, "step": 9360 }, { "epoch": 8.381037567084078, "grad_norm": 2.2668471336364746, "learning_rate": 7.27135980746089e-07, "loss": 0.301, "step": 9370 }, { "epoch": 8.389982110912344, "grad_norm": 1.848625898361206, "learning_rate": 7.268351383874849e-07, "loss": 0.3041, "step": 9380 }, { "epoch": 8.398926654740608, "grad_norm": 1.559826135635376, "learning_rate": 7.265342960288808e-07, "loss": 0.292, "step": 9390 }, { "epoch": 8.407871198568873, "grad_norm": 1.899900197982788, "learning_rate": 7.262334536702768e-07, "loss": 0.2948, "step": 9400 }, { "epoch": 8.416815742397137, "grad_norm": 1.8511258363723755, "learning_rate": 7.259326113116726e-07, "loss": 0.2937, "step": 9410 }, { "epoch": 8.425760286225403, "grad_norm": 1.9412320852279663, "learning_rate": 7.256317689530686e-07, "loss": 0.303, "step": 9420 }, { "epoch": 8.434704830053667, "grad_norm": 1.8842699527740479, "learning_rate": 7.253309265944645e-07, "loss": 0.3054, "step": 9430 }, { "epoch": 8.443649373881932, "grad_norm": 2.0506179332733154, "learning_rate": 7.250300842358605e-07, "loss": 0.304, "step": 9440 }, { "epoch": 8.452593917710196, "grad_norm": 1.71364164352417, "learning_rate": 7.247292418772563e-07, "loss": 0.3071, "step": 9450 }, { "epoch": 8.461538461538462, "grad_norm": 1.8477452993392944, "learning_rate": 7.244283995186522e-07, "loss": 0.3042, "step": 9460 }, { "epoch": 8.470483005366725, "grad_norm": 2.0586087703704834, "learning_rate": 7.241275571600481e-07, "loss": 0.3098, "step": 9470 }, { "epoch": 8.479427549194991, "grad_norm": 1.9386125802993774, "learning_rate": 7.238267148014439e-07, "loss": 0.3002, "step": 9480 }, { "epoch": 8.488372093023255, "grad_norm": 1.7622777223587036, "learning_rate": 7.235258724428399e-07, "loss": 0.3056, "step": 9490 }, { "epoch": 8.49731663685152, "grad_norm": 1.7191228866577148, "learning_rate": 7.232250300842358e-07, "loss": 0.2963, "step": 9500 }, { "epoch": 8.506261180679786, "grad_norm": 1.622677206993103, "learning_rate": 7.229241877256317e-07, "loss": 0.305, "step": 9510 }, { "epoch": 8.51520572450805, "grad_norm": 1.6148167848587036, "learning_rate": 7.226233453670276e-07, "loss": 0.3046, "step": 9520 }, { "epoch": 8.524150268336316, "grad_norm": 1.6732158660888672, "learning_rate": 7.223225030084236e-07, "loss": 0.3014, "step": 9530 }, { "epoch": 8.53309481216458, "grad_norm": 1.8138659000396729, "learning_rate": 7.220216606498195e-07, "loss": 0.3041, "step": 9540 }, { "epoch": 8.542039355992845, "grad_norm": 1.5868489742279053, "learning_rate": 7.217208182912154e-07, "loss": 0.2994, "step": 9550 }, { "epoch": 8.550983899821109, "grad_norm": 1.57341468334198, "learning_rate": 7.214199759326113e-07, "loss": 0.2986, "step": 9560 }, { "epoch": 8.559928443649374, "grad_norm": 2.0454437732696533, "learning_rate": 7.211191335740072e-07, "loss": 0.3019, "step": 9570 }, { "epoch": 8.568872987477638, "grad_norm": 1.9171931743621826, "learning_rate": 7.20818291215403e-07, "loss": 0.2963, "step": 9580 }, { "epoch": 8.577817531305904, "grad_norm": 1.9101694822311401, "learning_rate": 7.20517448856799e-07, "loss": 0.3049, "step": 9590 }, { "epoch": 8.586762075134168, "grad_norm": 2.052638292312622, "learning_rate": 7.202166064981949e-07, "loss": 0.3065, "step": 9600 }, { "epoch": 8.595706618962433, "grad_norm": 1.6304017305374146, "learning_rate": 7.199157641395909e-07, "loss": 0.2959, "step": 9610 }, { "epoch": 8.604651162790697, "grad_norm": 1.8963510990142822, "learning_rate": 7.196149217809867e-07, "loss": 0.3002, "step": 9620 }, { "epoch": 8.613595706618963, "grad_norm": 1.8579572439193726, "learning_rate": 7.193140794223827e-07, "loss": 0.3044, "step": 9630 }, { "epoch": 8.622540250447226, "grad_norm": 2.3206236362457275, "learning_rate": 7.190132370637786e-07, "loss": 0.2881, "step": 9640 }, { "epoch": 8.631484794275492, "grad_norm": 2.7964982986450195, "learning_rate": 7.187123947051744e-07, "loss": 0.298, "step": 9650 }, { "epoch": 8.640429338103758, "grad_norm": 1.6848253011703491, "learning_rate": 7.184115523465704e-07, "loss": 0.3032, "step": 9660 }, { "epoch": 8.649373881932021, "grad_norm": 2.1918492317199707, "learning_rate": 7.181107099879663e-07, "loss": 0.2981, "step": 9670 }, { "epoch": 8.658318425760287, "grad_norm": 1.746699571609497, "learning_rate": 7.178098676293621e-07, "loss": 0.2893, "step": 9680 }, { "epoch": 8.66726296958855, "grad_norm": 1.6501363515853882, "learning_rate": 7.17509025270758e-07, "loss": 0.3029, "step": 9690 }, { "epoch": 8.676207513416816, "grad_norm": 1.6390386819839478, "learning_rate": 7.17208182912154e-07, "loss": 0.2977, "step": 9700 }, { "epoch": 8.68515205724508, "grad_norm": 1.9444026947021484, "learning_rate": 7.169073405535499e-07, "loss": 0.2797, "step": 9710 }, { "epoch": 8.694096601073346, "grad_norm": 1.679573893547058, "learning_rate": 7.166064981949458e-07, "loss": 0.2978, "step": 9720 }, { "epoch": 8.70304114490161, "grad_norm": 1.9855968952178955, "learning_rate": 7.163056558363417e-07, "loss": 0.2979, "step": 9730 }, { "epoch": 8.711985688729875, "grad_norm": 1.834344506263733, "learning_rate": 7.160048134777377e-07, "loss": 0.2891, "step": 9740 }, { "epoch": 8.720930232558139, "grad_norm": 1.6731619834899902, "learning_rate": 7.157039711191335e-07, "loss": 0.2886, "step": 9750 }, { "epoch": 8.729874776386405, "grad_norm": 1.90044105052948, "learning_rate": 7.154031287605295e-07, "loss": 0.2836, "step": 9760 }, { "epoch": 8.738819320214668, "grad_norm": 1.754361867904663, "learning_rate": 7.151022864019254e-07, "loss": 0.2967, "step": 9770 }, { "epoch": 8.747763864042934, "grad_norm": 1.7115799188613892, "learning_rate": 7.148014440433214e-07, "loss": 0.2941, "step": 9780 }, { "epoch": 8.756708407871198, "grad_norm": 1.5401322841644287, "learning_rate": 7.145006016847171e-07, "loss": 0.2962, "step": 9790 }, { "epoch": 8.765652951699463, "grad_norm": 1.596837043762207, "learning_rate": 7.141997593261131e-07, "loss": 0.2987, "step": 9800 }, { "epoch": 8.774597495527727, "grad_norm": 1.6483724117279053, "learning_rate": 7.13898916967509e-07, "loss": 0.292, "step": 9810 }, { "epoch": 8.783542039355993, "grad_norm": 2.597574472427368, "learning_rate": 7.135980746089048e-07, "loss": 0.2954, "step": 9820 }, { "epoch": 8.792486583184257, "grad_norm": 1.6123573780059814, "learning_rate": 7.132972322503008e-07, "loss": 0.291, "step": 9830 }, { "epoch": 8.801431127012522, "grad_norm": 1.6893271207809448, "learning_rate": 7.129963898916967e-07, "loss": 0.2701, "step": 9840 }, { "epoch": 8.810375670840788, "grad_norm": 2.026670217514038, "learning_rate": 7.126955475330926e-07, "loss": 0.2884, "step": 9850 }, { "epoch": 8.819320214669052, "grad_norm": 1.7611801624298096, "learning_rate": 7.123947051744885e-07, "loss": 0.2849, "step": 9860 }, { "epoch": 8.828264758497317, "grad_norm": 1.6061441898345947, "learning_rate": 7.120938628158845e-07, "loss": 0.2921, "step": 9870 }, { "epoch": 8.837209302325581, "grad_norm": 2.0565128326416016, "learning_rate": 7.117930204572804e-07, "loss": 0.2856, "step": 9880 }, { "epoch": 8.846153846153847, "grad_norm": 2.602135181427002, "learning_rate": 7.114921780986763e-07, "loss": 0.2967, "step": 9890 }, { "epoch": 8.85509838998211, "grad_norm": 1.7691055536270142, "learning_rate": 7.111913357400721e-07, "loss": 0.2996, "step": 9900 }, { "epoch": 8.864042933810376, "grad_norm": 1.7833753824234009, "learning_rate": 7.108904933814681e-07, "loss": 0.2951, "step": 9910 }, { "epoch": 8.87298747763864, "grad_norm": 1.9108906984329224, "learning_rate": 7.105896510228639e-07, "loss": 0.2925, "step": 9920 }, { "epoch": 8.881932021466906, "grad_norm": 36.32560729980469, "learning_rate": 7.102888086642599e-07, "loss": 0.2947, "step": 9930 }, { "epoch": 8.89087656529517, "grad_norm": 1.918143630027771, "learning_rate": 7.099879663056558e-07, "loss": 0.2847, "step": 9940 }, { "epoch": 8.899821109123435, "grad_norm": 2.5026514530181885, "learning_rate": 7.096871239470517e-07, "loss": 0.2871, "step": 9950 }, { "epoch": 8.908765652951699, "grad_norm": 1.8258767127990723, "learning_rate": 7.093862815884476e-07, "loss": 0.2853, "step": 9960 }, { "epoch": 8.917710196779964, "grad_norm": 1.6664685010910034, "learning_rate": 7.090854392298436e-07, "loss": 0.2846, "step": 9970 }, { "epoch": 8.926654740608228, "grad_norm": 1.9599624872207642, "learning_rate": 7.087845968712395e-07, "loss": 0.2807, "step": 9980 }, { "epoch": 8.935599284436494, "grad_norm": 1.8693242073059082, "learning_rate": 7.084837545126353e-07, "loss": 0.2931, "step": 9990 }, { "epoch": 8.94454382826476, "grad_norm": 2.025860548019409, "learning_rate": 7.081829121540313e-07, "loss": 0.2916, "step": 10000 }, { "epoch": 8.953488372093023, "grad_norm": 1.6296007633209229, "learning_rate": 7.078820697954271e-07, "loss": 0.2901, "step": 10010 }, { "epoch": 8.962432915921289, "grad_norm": 1.8131719827651978, "learning_rate": 7.07581227436823e-07, "loss": 0.2935, "step": 10020 }, { "epoch": 8.971377459749553, "grad_norm": 1.7503496408462524, "learning_rate": 7.072803850782189e-07, "loss": 0.2843, "step": 10030 }, { "epoch": 8.980322003577818, "grad_norm": 2.0243091583251953, "learning_rate": 7.069795427196149e-07, "loss": 0.2877, "step": 10040 }, { "epoch": 8.989266547406082, "grad_norm": 2.1338999271392822, "learning_rate": 7.066787003610108e-07, "loss": 0.2816, "step": 10050 }, { "epoch": 8.998211091234348, "grad_norm": 1.9088892936706543, "learning_rate": 7.063778580024067e-07, "loss": 0.2912, "step": 10060 }, { "epoch": 9.0, "eval_bleu": 71.3086, "eval_gen_len": 75.0326, "eval_loss": 0.21165511012077332, "eval_runtime": 57.4802, "eval_samples_per_second": 18.128, "eval_steps_per_second": 0.191, "step": 10062 }, { "epoch": 9.007155635062611, "grad_norm": 1.994418978691101, "learning_rate": 7.060770156438026e-07, "loss": 0.2897, "step": 10070 }, { "epoch": 9.016100178890877, "grad_norm": 1.5814578533172607, "learning_rate": 7.057761732851986e-07, "loss": 0.2765, "step": 10080 }, { "epoch": 9.02504472271914, "grad_norm": 1.6096930503845215, "learning_rate": 7.054753309265944e-07, "loss": 0.2732, "step": 10090 }, { "epoch": 9.033989266547406, "grad_norm": 1.7393819093704224, "learning_rate": 7.051744885679904e-07, "loss": 0.3009, "step": 10100 }, { "epoch": 9.04293381037567, "grad_norm": 1.8216224908828735, "learning_rate": 7.048736462093863e-07, "loss": 0.2829, "step": 10110 }, { "epoch": 9.051878354203936, "grad_norm": 1.5788861513137817, "learning_rate": 7.045728038507821e-07, "loss": 0.2716, "step": 10120 }, { "epoch": 9.0608228980322, "grad_norm": 1.5919556617736816, "learning_rate": 7.04271961492178e-07, "loss": 0.2838, "step": 10130 }, { "epoch": 9.069767441860465, "grad_norm": 1.6876708269119263, "learning_rate": 7.03971119133574e-07, "loss": 0.2832, "step": 10140 }, { "epoch": 9.078711985688729, "grad_norm": 1.8378523588180542, "learning_rate": 7.036702767749699e-07, "loss": 0.2942, "step": 10150 }, { "epoch": 9.087656529516995, "grad_norm": 1.6949840784072876, "learning_rate": 7.033694344163658e-07, "loss": 0.2952, "step": 10160 }, { "epoch": 9.09660107334526, "grad_norm": 1.837931513786316, "learning_rate": 7.030685920577617e-07, "loss": 0.2837, "step": 10170 }, { "epoch": 9.105545617173524, "grad_norm": 1.8467128276824951, "learning_rate": 7.027677496991576e-07, "loss": 0.2939, "step": 10180 }, { "epoch": 9.11449016100179, "grad_norm": 1.966645359992981, "learning_rate": 7.024669073405535e-07, "loss": 0.2838, "step": 10190 }, { "epoch": 9.123434704830053, "grad_norm": 1.8841787576675415, "learning_rate": 7.021660649819494e-07, "loss": 0.2896, "step": 10200 }, { "epoch": 9.132379248658319, "grad_norm": 1.8857272863388062, "learning_rate": 7.018652226233454e-07, "loss": 0.2955, "step": 10210 }, { "epoch": 9.141323792486583, "grad_norm": 1.6923903226852417, "learning_rate": 7.015643802647413e-07, "loss": 0.2851, "step": 10220 }, { "epoch": 9.150268336314848, "grad_norm": 1.8548059463500977, "learning_rate": 7.012635379061371e-07, "loss": 0.2843, "step": 10230 }, { "epoch": 9.159212880143112, "grad_norm": 2.048212766647339, "learning_rate": 7.00962695547533e-07, "loss": 0.2927, "step": 10240 }, { "epoch": 9.168157423971378, "grad_norm": 1.835397481918335, "learning_rate": 7.00661853188929e-07, "loss": 0.2801, "step": 10250 }, { "epoch": 9.177101967799642, "grad_norm": 1.7286008596420288, "learning_rate": 7.003610108303248e-07, "loss": 0.2852, "step": 10260 }, { "epoch": 9.186046511627907, "grad_norm": 1.7264772653579712, "learning_rate": 7.000601684717208e-07, "loss": 0.2932, "step": 10270 }, { "epoch": 9.194991055456171, "grad_norm": 2.0432634353637695, "learning_rate": 6.997593261131167e-07, "loss": 0.2696, "step": 10280 }, { "epoch": 9.203935599284437, "grad_norm": 1.849487543106079, "learning_rate": 6.994584837545126e-07, "loss": 0.2886, "step": 10290 }, { "epoch": 9.2128801431127, "grad_norm": 1.8255362510681152, "learning_rate": 6.991576413959085e-07, "loss": 0.2889, "step": 10300 }, { "epoch": 9.221824686940966, "grad_norm": 1.9005097150802612, "learning_rate": 6.988567990373045e-07, "loss": 0.2798, "step": 10310 }, { "epoch": 9.23076923076923, "grad_norm": 1.788237452507019, "learning_rate": 6.985559566787004e-07, "loss": 0.2821, "step": 10320 }, { "epoch": 9.239713774597496, "grad_norm": 1.9736496210098267, "learning_rate": 6.982551143200963e-07, "loss": 0.2824, "step": 10330 }, { "epoch": 9.248658318425761, "grad_norm": 1.855490803718567, "learning_rate": 6.979542719614921e-07, "loss": 0.2907, "step": 10340 }, { "epoch": 9.257602862254025, "grad_norm": 1.6318082809448242, "learning_rate": 6.97653429602888e-07, "loss": 0.2845, "step": 10350 }, { "epoch": 9.26654740608229, "grad_norm": 1.9707635641098022, "learning_rate": 6.973525872442839e-07, "loss": 0.2981, "step": 10360 }, { "epoch": 9.275491949910554, "grad_norm": 1.567692756652832, "learning_rate": 6.970517448856798e-07, "loss": 0.2837, "step": 10370 }, { "epoch": 9.28443649373882, "grad_norm": 1.9208908081054688, "learning_rate": 6.967509025270758e-07, "loss": 0.2824, "step": 10380 }, { "epoch": 9.293381037567084, "grad_norm": 1.8427250385284424, "learning_rate": 6.964500601684717e-07, "loss": 0.2848, "step": 10390 }, { "epoch": 9.30232558139535, "grad_norm": 2.1784746646881104, "learning_rate": 6.961492178098676e-07, "loss": 0.2847, "step": 10400 }, { "epoch": 9.311270125223613, "grad_norm": 1.7605880498886108, "learning_rate": 6.958483754512635e-07, "loss": 0.2803, "step": 10410 }, { "epoch": 9.320214669051879, "grad_norm": 1.820336103439331, "learning_rate": 6.955475330926595e-07, "loss": 0.2756, "step": 10420 }, { "epoch": 9.329159212880143, "grad_norm": 1.5924071073532104, "learning_rate": 6.952466907340553e-07, "loss": 0.2767, "step": 10430 }, { "epoch": 9.338103756708408, "grad_norm": 1.563889741897583, "learning_rate": 6.949458483754513e-07, "loss": 0.2748, "step": 10440 }, { "epoch": 9.347048300536672, "grad_norm": 1.9999738931655884, "learning_rate": 6.946450060168471e-07, "loss": 0.2816, "step": 10450 }, { "epoch": 9.355992844364938, "grad_norm": 1.93051016330719, "learning_rate": 6.94344163658243e-07, "loss": 0.2838, "step": 10460 }, { "epoch": 9.364937388193201, "grad_norm": 1.7076514959335327, "learning_rate": 6.940433212996389e-07, "loss": 0.2773, "step": 10470 }, { "epoch": 9.373881932021467, "grad_norm": 1.8156001567840576, "learning_rate": 6.937424789410349e-07, "loss": 0.2745, "step": 10480 }, { "epoch": 9.38282647584973, "grad_norm": 1.8215938806533813, "learning_rate": 6.934416365824308e-07, "loss": 0.2851, "step": 10490 }, { "epoch": 9.391771019677996, "grad_norm": 2.2098872661590576, "learning_rate": 6.931407942238267e-07, "loss": 0.281, "step": 10500 }, { "epoch": 9.400715563506262, "grad_norm": 1.8551905155181885, "learning_rate": 6.928399518652226e-07, "loss": 0.2785, "step": 10510 }, { "epoch": 9.409660107334526, "grad_norm": 1.5589836835861206, "learning_rate": 6.925391095066185e-07, "loss": 0.2728, "step": 10520 }, { "epoch": 9.418604651162791, "grad_norm": 1.5664674043655396, "learning_rate": 6.922382671480144e-07, "loss": 0.2886, "step": 10530 }, { "epoch": 9.427549194991055, "grad_norm": 1.721440076828003, "learning_rate": 6.919374247894103e-07, "loss": 0.2747, "step": 10540 }, { "epoch": 9.43649373881932, "grad_norm": 1.601876974105835, "learning_rate": 6.916365824308063e-07, "loss": 0.2685, "step": 10550 }, { "epoch": 9.445438282647585, "grad_norm": 1.8169065713882446, "learning_rate": 6.913357400722021e-07, "loss": 0.2787, "step": 10560 }, { "epoch": 9.45438282647585, "grad_norm": 1.734645128250122, "learning_rate": 6.91034897713598e-07, "loss": 0.2892, "step": 10570 }, { "epoch": 9.463327370304114, "grad_norm": 1.8845486640930176, "learning_rate": 6.907340553549939e-07, "loss": 0.2785, "step": 10580 }, { "epoch": 9.47227191413238, "grad_norm": 1.6264091730117798, "learning_rate": 6.904332129963899e-07, "loss": 0.2712, "step": 10590 }, { "epoch": 9.481216457960643, "grad_norm": 1.8591593503952026, "learning_rate": 6.901323706377857e-07, "loss": 0.2745, "step": 10600 }, { "epoch": 9.490161001788909, "grad_norm": 1.8113240003585815, "learning_rate": 6.898315282791817e-07, "loss": 0.2773, "step": 10610 }, { "epoch": 9.499105545617173, "grad_norm": 2.180983066558838, "learning_rate": 6.895306859205776e-07, "loss": 0.2841, "step": 10620 }, { "epoch": 9.508050089445439, "grad_norm": 1.5472530126571655, "learning_rate": 6.892298435619735e-07, "loss": 0.2784, "step": 10630 }, { "epoch": 9.516994633273702, "grad_norm": 1.6539241075515747, "learning_rate": 6.889290012033694e-07, "loss": 0.2803, "step": 10640 }, { "epoch": 9.525939177101968, "grad_norm": 2.5152432918548584, "learning_rate": 6.886281588447654e-07, "loss": 0.2754, "step": 10650 }, { "epoch": 9.534883720930232, "grad_norm": 1.7078677415847778, "learning_rate": 6.883273164861613e-07, "loss": 0.2816, "step": 10660 }, { "epoch": 9.543828264758497, "grad_norm": 1.7446966171264648, "learning_rate": 6.880264741275571e-07, "loss": 0.2845, "step": 10670 }, { "epoch": 9.552772808586763, "grad_norm": 1.8630328178405762, "learning_rate": 6.87725631768953e-07, "loss": 0.2716, "step": 10680 }, { "epoch": 9.561717352415027, "grad_norm": 1.6282811164855957, "learning_rate": 6.87424789410349e-07, "loss": 0.2888, "step": 10690 }, { "epoch": 9.570661896243292, "grad_norm": 1.8140019178390503, "learning_rate": 6.871239470517448e-07, "loss": 0.284, "step": 10700 }, { "epoch": 9.579606440071556, "grad_norm": 1.568709373474121, "learning_rate": 6.868231046931407e-07, "loss": 0.2728, "step": 10710 }, { "epoch": 9.588550983899822, "grad_norm": 2.294747829437256, "learning_rate": 6.865222623345367e-07, "loss": 0.2732, "step": 10720 }, { "epoch": 9.597495527728086, "grad_norm": 1.5327823162078857, "learning_rate": 6.862214199759325e-07, "loss": 0.2704, "step": 10730 }, { "epoch": 9.606440071556351, "grad_norm": 1.429221272468567, "learning_rate": 6.859205776173285e-07, "loss": 0.2758, "step": 10740 }, { "epoch": 9.615384615384615, "grad_norm": 2.2276451587677, "learning_rate": 6.856197352587244e-07, "loss": 0.2649, "step": 10750 }, { "epoch": 9.62432915921288, "grad_norm": 2.1165475845336914, "learning_rate": 6.853188929001204e-07, "loss": 0.2848, "step": 10760 }, { "epoch": 9.633273703041144, "grad_norm": 1.5652105808258057, "learning_rate": 6.850180505415162e-07, "loss": 0.2683, "step": 10770 }, { "epoch": 9.64221824686941, "grad_norm": 1.8563920259475708, "learning_rate": 6.847172081829121e-07, "loss": 0.2701, "step": 10780 }, { "epoch": 9.651162790697674, "grad_norm": 1.9464433193206787, "learning_rate": 6.84416365824308e-07, "loss": 0.2643, "step": 10790 }, { "epoch": 9.66010733452594, "grad_norm": 1.7110228538513184, "learning_rate": 6.841155234657039e-07, "loss": 0.2678, "step": 10800 }, { "epoch": 9.669051878354203, "grad_norm": 1.559122920036316, "learning_rate": 6.838146811070998e-07, "loss": 0.2769, "step": 10810 }, { "epoch": 9.677996422182469, "grad_norm": 1.5416113138198853, "learning_rate": 6.835138387484958e-07, "loss": 0.2847, "step": 10820 }, { "epoch": 9.686940966010733, "grad_norm": 1.944888710975647, "learning_rate": 6.832129963898917e-07, "loss": 0.285, "step": 10830 }, { "epoch": 9.695885509838998, "grad_norm": 1.6021989583969116, "learning_rate": 6.829121540312876e-07, "loss": 0.2639, "step": 10840 }, { "epoch": 9.704830053667262, "grad_norm": 1.9095149040222168, "learning_rate": 6.826113116726835e-07, "loss": 0.2783, "step": 10850 }, { "epoch": 9.713774597495528, "grad_norm": 1.979996919631958, "learning_rate": 6.823104693140795e-07, "loss": 0.27, "step": 10860 }, { "epoch": 9.722719141323793, "grad_norm": 1.568677544593811, "learning_rate": 6.820096269554753e-07, "loss": 0.2717, "step": 10870 }, { "epoch": 9.731663685152057, "grad_norm": 1.8690828084945679, "learning_rate": 6.817087845968712e-07, "loss": 0.2799, "step": 10880 }, { "epoch": 9.740608228980323, "grad_norm": 2.1000804901123047, "learning_rate": 6.814079422382671e-07, "loss": 0.2891, "step": 10890 }, { "epoch": 9.749552772808586, "grad_norm": 1.8976290225982666, "learning_rate": 6.811070998796629e-07, "loss": 0.2687, "step": 10900 }, { "epoch": 9.758497316636852, "grad_norm": 1.5658888816833496, "learning_rate": 6.808062575210589e-07, "loss": 0.2728, "step": 10910 }, { "epoch": 9.767441860465116, "grad_norm": 2.6436548233032227, "learning_rate": 6.805054151624548e-07, "loss": 0.2645, "step": 10920 }, { "epoch": 9.776386404293381, "grad_norm": 1.801951289176941, "learning_rate": 6.802045728038508e-07, "loss": 0.2759, "step": 10930 }, { "epoch": 9.785330948121645, "grad_norm": 1.6843072175979614, "learning_rate": 6.799037304452466e-07, "loss": 0.2751, "step": 10940 }, { "epoch": 9.79427549194991, "grad_norm": 1.7897447347640991, "learning_rate": 6.796028880866426e-07, "loss": 0.2716, "step": 10950 }, { "epoch": 9.803220035778175, "grad_norm": 1.4712356328964233, "learning_rate": 6.793020457280385e-07, "loss": 0.271, "step": 10960 }, { "epoch": 9.81216457960644, "grad_norm": 1.9668405055999756, "learning_rate": 6.790012033694344e-07, "loss": 0.2825, "step": 10970 }, { "epoch": 9.821109123434704, "grad_norm": 1.4257514476776123, "learning_rate": 6.787003610108303e-07, "loss": 0.2695, "step": 10980 }, { "epoch": 9.83005366726297, "grad_norm": 2.357689380645752, "learning_rate": 6.783995186522263e-07, "loss": 0.285, "step": 10990 }, { "epoch": 9.838998211091234, "grad_norm": 1.616826057434082, "learning_rate": 6.780986762936222e-07, "loss": 0.2639, "step": 11000 }, { "epoch": 9.847942754919499, "grad_norm": 1.521409034729004, "learning_rate": 6.77797833935018e-07, "loss": 0.2713, "step": 11010 }, { "epoch": 9.856887298747765, "grad_norm": 1.5054371356964111, "learning_rate": 6.774969915764139e-07, "loss": 0.2704, "step": 11020 }, { "epoch": 9.865831842576029, "grad_norm": 1.6104198694229126, "learning_rate": 6.771961492178099e-07, "loss": 0.2672, "step": 11030 }, { "epoch": 9.874776386404294, "grad_norm": 1.7688450813293457, "learning_rate": 6.768953068592057e-07, "loss": 0.2667, "step": 11040 }, { "epoch": 9.883720930232558, "grad_norm": 1.5123931169509888, "learning_rate": 6.765944645006016e-07, "loss": 0.2616, "step": 11050 }, { "epoch": 9.892665474060824, "grad_norm": 1.5442701578140259, "learning_rate": 6.762936221419976e-07, "loss": 0.2797, "step": 11060 }, { "epoch": 9.901610017889087, "grad_norm": 1.9512841701507568, "learning_rate": 6.759927797833934e-07, "loss": 0.2676, "step": 11070 }, { "epoch": 9.910554561717353, "grad_norm": 1.6428849697113037, "learning_rate": 6.756919374247894e-07, "loss": 0.2735, "step": 11080 }, { "epoch": 9.919499105545617, "grad_norm": 2.7944211959838867, "learning_rate": 6.753910950661853e-07, "loss": 0.2712, "step": 11090 }, { "epoch": 9.928443649373882, "grad_norm": 1.6916003227233887, "learning_rate": 6.750902527075813e-07, "loss": 0.272, "step": 11100 }, { "epoch": 9.937388193202146, "grad_norm": 1.6299495697021484, "learning_rate": 6.74789410348977e-07, "loss": 0.2795, "step": 11110 }, { "epoch": 9.946332737030412, "grad_norm": 1.7412554025650024, "learning_rate": 6.74488567990373e-07, "loss": 0.2697, "step": 11120 }, { "epoch": 9.955277280858676, "grad_norm": 1.537885308265686, "learning_rate": 6.741877256317689e-07, "loss": 0.2593, "step": 11130 }, { "epoch": 9.964221824686941, "grad_norm": 1.4216618537902832, "learning_rate": 6.738868832731648e-07, "loss": 0.2626, "step": 11140 }, { "epoch": 9.973166368515205, "grad_norm": 1.3773558139801025, "learning_rate": 6.735860409145607e-07, "loss": 0.2648, "step": 11150 }, { "epoch": 9.98211091234347, "grad_norm": 1.9161559343338013, "learning_rate": 6.732851985559567e-07, "loss": 0.2675, "step": 11160 }, { "epoch": 9.991055456171736, "grad_norm": 1.692733645439148, "learning_rate": 6.729843561973526e-07, "loss": 0.2601, "step": 11170 }, { "epoch": 10.0, "grad_norm": 2.6017653942108154, "learning_rate": 6.726835138387485e-07, "loss": 0.2591, "step": 11180 }, { "epoch": 10.0, "eval_bleu": 72.392, "eval_gen_len": 74.9607, "eval_loss": 0.20007498562335968, "eval_runtime": 57.3761, "eval_samples_per_second": 18.161, "eval_steps_per_second": 0.192, "step": 11180 }, { "epoch": 10.008944543828266, "grad_norm": 1.955108642578125, "learning_rate": 6.723826714801444e-07, "loss": 0.2603, "step": 11190 }, { "epoch": 10.01788908765653, "grad_norm": 1.7971755266189575, "learning_rate": 6.720818291215404e-07, "loss": 0.2574, "step": 11200 }, { "epoch": 10.026833631484795, "grad_norm": 1.5595004558563232, "learning_rate": 6.717809867629362e-07, "loss": 0.2729, "step": 11210 }, { "epoch": 10.035778175313059, "grad_norm": 1.6440141201019287, "learning_rate": 6.71480144404332e-07, "loss": 0.2706, "step": 11220 }, { "epoch": 10.044722719141324, "grad_norm": 1.6471924781799316, "learning_rate": 6.71179302045728e-07, "loss": 0.2662, "step": 11230 }, { "epoch": 10.053667262969588, "grad_norm": 1.8682457208633423, "learning_rate": 6.708784596871238e-07, "loss": 0.2615, "step": 11240 }, { "epoch": 10.062611806797854, "grad_norm": 1.9249464273452759, "learning_rate": 6.705776173285198e-07, "loss": 0.2718, "step": 11250 }, { "epoch": 10.071556350626118, "grad_norm": 1.7548773288726807, "learning_rate": 6.702767749699157e-07, "loss": 0.2604, "step": 11260 }, { "epoch": 10.080500894454383, "grad_norm": 1.8712800741195679, "learning_rate": 6.699759326113117e-07, "loss": 0.2587, "step": 11270 }, { "epoch": 10.089445438282647, "grad_norm": 1.5508815050125122, "learning_rate": 6.696750902527075e-07, "loss": 0.2692, "step": 11280 }, { "epoch": 10.098389982110913, "grad_norm": 1.7437515258789062, "learning_rate": 6.693742478941035e-07, "loss": 0.2724, "step": 11290 }, { "epoch": 10.107334525939176, "grad_norm": 1.8249415159225464, "learning_rate": 6.690734055354994e-07, "loss": 0.2701, "step": 11300 }, { "epoch": 10.116279069767442, "grad_norm": 2.235722064971924, "learning_rate": 6.687725631768953e-07, "loss": 0.2718, "step": 11310 }, { "epoch": 10.125223613595706, "grad_norm": 1.6692746877670288, "learning_rate": 6.684717208182912e-07, "loss": 0.2679, "step": 11320 }, { "epoch": 10.134168157423971, "grad_norm": 1.651930332183838, "learning_rate": 6.681708784596872e-07, "loss": 0.2573, "step": 11330 }, { "epoch": 10.143112701252235, "grad_norm": 2.019298791885376, "learning_rate": 6.67870036101083e-07, "loss": 0.2617, "step": 11340 }, { "epoch": 10.152057245080501, "grad_norm": 1.7158833742141724, "learning_rate": 6.675691937424789e-07, "loss": 0.2806, "step": 11350 }, { "epoch": 10.161001788908766, "grad_norm": 1.6784923076629639, "learning_rate": 6.672683513838748e-07, "loss": 0.2717, "step": 11360 }, { "epoch": 10.16994633273703, "grad_norm": 1.6140066385269165, "learning_rate": 6.669675090252708e-07, "loss": 0.2557, "step": 11370 }, { "epoch": 10.178890876565296, "grad_norm": 1.7521976232528687, "learning_rate": 6.666666666666666e-07, "loss": 0.2697, "step": 11380 }, { "epoch": 10.18783542039356, "grad_norm": 1.640120267868042, "learning_rate": 6.663658243080625e-07, "loss": 0.268, "step": 11390 }, { "epoch": 10.196779964221825, "grad_norm": 1.9078781604766846, "learning_rate": 6.660649819494585e-07, "loss": 0.2661, "step": 11400 }, { "epoch": 10.20572450805009, "grad_norm": 1.6086190938949585, "learning_rate": 6.657641395908543e-07, "loss": 0.2656, "step": 11410 }, { "epoch": 10.214669051878355, "grad_norm": 1.8679802417755127, "learning_rate": 6.654632972322503e-07, "loss": 0.2875, "step": 11420 }, { "epoch": 10.223613595706619, "grad_norm": 1.6009517908096313, "learning_rate": 6.651624548736462e-07, "loss": 0.2691, "step": 11430 }, { "epoch": 10.232558139534884, "grad_norm": 1.8012363910675049, "learning_rate": 6.648616125150422e-07, "loss": 0.2695, "step": 11440 }, { "epoch": 10.241502683363148, "grad_norm": 2.0842690467834473, "learning_rate": 6.645607701564379e-07, "loss": 0.275, "step": 11450 }, { "epoch": 10.250447227191414, "grad_norm": 1.7462635040283203, "learning_rate": 6.642599277978339e-07, "loss": 0.2578, "step": 11460 }, { "epoch": 10.259391771019677, "grad_norm": 1.591439127922058, "learning_rate": 6.639590854392298e-07, "loss": 0.2627, "step": 11470 }, { "epoch": 10.268336314847943, "grad_norm": 1.7550685405731201, "learning_rate": 6.636582430806257e-07, "loss": 0.2597, "step": 11480 }, { "epoch": 10.277280858676207, "grad_norm": 1.7708865404129028, "learning_rate": 6.633574007220216e-07, "loss": 0.2737, "step": 11490 }, { "epoch": 10.286225402504472, "grad_norm": 1.8559082746505737, "learning_rate": 6.630565583634176e-07, "loss": 0.2778, "step": 11500 }, { "epoch": 10.295169946332736, "grad_norm": 1.6985397338867188, "learning_rate": 6.627557160048135e-07, "loss": 0.2616, "step": 11510 }, { "epoch": 10.304114490161002, "grad_norm": 1.6595687866210938, "learning_rate": 6.624548736462094e-07, "loss": 0.2626, "step": 11520 }, { "epoch": 10.313059033989267, "grad_norm": 1.7601011991500854, "learning_rate": 6.621540312876053e-07, "loss": 0.2625, "step": 11530 }, { "epoch": 10.322003577817531, "grad_norm": 1.7115074396133423, "learning_rate": 6.618531889290013e-07, "loss": 0.2632, "step": 11540 }, { "epoch": 10.330948121645797, "grad_norm": 1.7978862524032593, "learning_rate": 6.61552346570397e-07, "loss": 0.2591, "step": 11550 }, { "epoch": 10.33989266547406, "grad_norm": 1.7057119607925415, "learning_rate": 6.612515042117929e-07, "loss": 0.2651, "step": 11560 }, { "epoch": 10.348837209302326, "grad_norm": 1.6174237728118896, "learning_rate": 6.609506618531889e-07, "loss": 0.2562, "step": 11570 }, { "epoch": 10.35778175313059, "grad_norm": 1.4859338998794556, "learning_rate": 6.606498194945847e-07, "loss": 0.2542, "step": 11580 }, { "epoch": 10.366726296958856, "grad_norm": 1.6329357624053955, "learning_rate": 6.603489771359807e-07, "loss": 0.2652, "step": 11590 }, { "epoch": 10.37567084078712, "grad_norm": 1.7987093925476074, "learning_rate": 6.600481347773766e-07, "loss": 0.2693, "step": 11600 }, { "epoch": 10.384615384615385, "grad_norm": 2.0187013149261475, "learning_rate": 6.597472924187726e-07, "loss": 0.2544, "step": 11610 }, { "epoch": 10.393559928443649, "grad_norm": 1.8823727369308472, "learning_rate": 6.594464500601684e-07, "loss": 0.2667, "step": 11620 }, { "epoch": 10.402504472271914, "grad_norm": 2.0941925048828125, "learning_rate": 6.591456077015644e-07, "loss": 0.2643, "step": 11630 }, { "epoch": 10.411449016100178, "grad_norm": 1.7243622541427612, "learning_rate": 6.588447653429603e-07, "loss": 0.257, "step": 11640 }, { "epoch": 10.420393559928444, "grad_norm": 1.586456537246704, "learning_rate": 6.585439229843562e-07, "loss": 0.2544, "step": 11650 }, { "epoch": 10.429338103756708, "grad_norm": 1.5672041177749634, "learning_rate": 6.58243080625752e-07, "loss": 0.2714, "step": 11660 }, { "epoch": 10.438282647584973, "grad_norm": 1.4649882316589355, "learning_rate": 6.57942238267148e-07, "loss": 0.2523, "step": 11670 }, { "epoch": 10.447227191413237, "grad_norm": 1.5319983959197998, "learning_rate": 6.576413959085438e-07, "loss": 0.2574, "step": 11680 }, { "epoch": 10.456171735241503, "grad_norm": 1.672580599784851, "learning_rate": 6.573405535499398e-07, "loss": 0.253, "step": 11690 }, { "epoch": 10.465116279069768, "grad_norm": 1.5990978479385376, "learning_rate": 6.570397111913357e-07, "loss": 0.2598, "step": 11700 }, { "epoch": 10.474060822898032, "grad_norm": 1.6208809614181519, "learning_rate": 6.567388688327317e-07, "loss": 0.2591, "step": 11710 }, { "epoch": 10.483005366726298, "grad_norm": 1.6113932132720947, "learning_rate": 6.564380264741275e-07, "loss": 0.2595, "step": 11720 }, { "epoch": 10.491949910554561, "grad_norm": 1.9188364744186401, "learning_rate": 6.561371841155234e-07, "loss": 0.2634, "step": 11730 }, { "epoch": 10.500894454382827, "grad_norm": 1.6541216373443604, "learning_rate": 6.558363417569194e-07, "loss": 0.2655, "step": 11740 }, { "epoch": 10.509838998211091, "grad_norm": 1.631947636604309, "learning_rate": 6.555354993983152e-07, "loss": 0.262, "step": 11750 }, { "epoch": 10.518783542039357, "grad_norm": 1.7661869525909424, "learning_rate": 6.552346570397112e-07, "loss": 0.2632, "step": 11760 }, { "epoch": 10.52772808586762, "grad_norm": 1.4233602285385132, "learning_rate": 6.54933814681107e-07, "loss": 0.2599, "step": 11770 }, { "epoch": 10.536672629695886, "grad_norm": 1.8507752418518066, "learning_rate": 6.54632972322503e-07, "loss": 0.2615, "step": 11780 }, { "epoch": 10.54561717352415, "grad_norm": 1.598519206047058, "learning_rate": 6.543321299638988e-07, "loss": 0.2568, "step": 11790 }, { "epoch": 10.554561717352415, "grad_norm": 1.6336430311203003, "learning_rate": 6.540312876052948e-07, "loss": 0.2549, "step": 11800 }, { "epoch": 10.56350626118068, "grad_norm": 1.557985782623291, "learning_rate": 6.537304452466907e-07, "loss": 0.2638, "step": 11810 }, { "epoch": 10.572450805008945, "grad_norm": 1.6330904960632324, "learning_rate": 6.534296028880866e-07, "loss": 0.256, "step": 11820 }, { "epoch": 10.581395348837209, "grad_norm": 1.4486061334609985, "learning_rate": 6.531287605294825e-07, "loss": 0.2527, "step": 11830 }, { "epoch": 10.590339892665474, "grad_norm": 1.7647820711135864, "learning_rate": 6.528279181708785e-07, "loss": 0.255, "step": 11840 }, { "epoch": 10.59928443649374, "grad_norm": 1.6095329523086548, "learning_rate": 6.525270758122743e-07, "loss": 0.2484, "step": 11850 }, { "epoch": 10.608228980322004, "grad_norm": 1.395642876625061, "learning_rate": 6.522262334536703e-07, "loss": 0.2507, "step": 11860 }, { "epoch": 10.61717352415027, "grad_norm": 1.7519396543502808, "learning_rate": 6.519253910950662e-07, "loss": 0.2574, "step": 11870 }, { "epoch": 10.626118067978533, "grad_norm": 1.5717977285385132, "learning_rate": 6.516245487364622e-07, "loss": 0.253, "step": 11880 }, { "epoch": 10.635062611806799, "grad_norm": 1.6412906646728516, "learning_rate": 6.513237063778579e-07, "loss": 0.2586, "step": 11890 }, { "epoch": 10.644007155635062, "grad_norm": 1.8597332239151, "learning_rate": 6.510228640192538e-07, "loss": 0.2592, "step": 11900 }, { "epoch": 10.652951699463328, "grad_norm": 1.8751498460769653, "learning_rate": 6.507220216606498e-07, "loss": 0.27, "step": 11910 }, { "epoch": 10.661896243291592, "grad_norm": 1.6349623203277588, "learning_rate": 6.504211793020456e-07, "loss": 0.2566, "step": 11920 }, { "epoch": 10.670840787119857, "grad_norm": 1.5924897193908691, "learning_rate": 6.501203369434416e-07, "loss": 0.2518, "step": 11930 }, { "epoch": 10.679785330948121, "grad_norm": 1.5484246015548706, "learning_rate": 6.498194945848375e-07, "loss": 0.25, "step": 11940 }, { "epoch": 10.688729874776387, "grad_norm": 1.6486756801605225, "learning_rate": 6.495186522262335e-07, "loss": 0.2617, "step": 11950 }, { "epoch": 10.69767441860465, "grad_norm": 1.730144739151001, "learning_rate": 6.492178098676293e-07, "loss": 0.2569, "step": 11960 }, { "epoch": 10.706618962432916, "grad_norm": 2.03178334236145, "learning_rate": 6.489169675090253e-07, "loss": 0.2526, "step": 11970 }, { "epoch": 10.71556350626118, "grad_norm": 1.855293869972229, "learning_rate": 6.486161251504212e-07, "loss": 0.2685, "step": 11980 }, { "epoch": 10.724508050089446, "grad_norm": 1.7201274633407593, "learning_rate": 6.48315282791817e-07, "loss": 0.2529, "step": 11990 }, { "epoch": 10.73345259391771, "grad_norm": 1.6383235454559326, "learning_rate": 6.480144404332129e-07, "loss": 0.267, "step": 12000 }, { "epoch": 10.742397137745975, "grad_norm": 1.538323998451233, "learning_rate": 6.477135980746089e-07, "loss": 0.2543, "step": 12010 }, { "epoch": 10.751341681574239, "grad_norm": 1.6423795223236084, "learning_rate": 6.474127557160047e-07, "loss": 0.2586, "step": 12020 }, { "epoch": 10.760286225402504, "grad_norm": 1.6282501220703125, "learning_rate": 6.471119133574007e-07, "loss": 0.2552, "step": 12030 }, { "epoch": 10.76923076923077, "grad_norm": 1.472403645515442, "learning_rate": 6.468110709987966e-07, "loss": 0.2591, "step": 12040 }, { "epoch": 10.778175313059034, "grad_norm": 1.8465627431869507, "learning_rate": 6.465102286401926e-07, "loss": 0.2547, "step": 12050 }, { "epoch": 10.7871198568873, "grad_norm": 1.6513354778289795, "learning_rate": 6.462093862815884e-07, "loss": 0.2611, "step": 12060 }, { "epoch": 10.796064400715563, "grad_norm": 1.4019272327423096, "learning_rate": 6.459085439229844e-07, "loss": 0.2526, "step": 12070 }, { "epoch": 10.805008944543829, "grad_norm": 1.6927614212036133, "learning_rate": 6.456077015643803e-07, "loss": 0.2659, "step": 12080 }, { "epoch": 10.813953488372093, "grad_norm": 1.825679063796997, "learning_rate": 6.453068592057761e-07, "loss": 0.2677, "step": 12090 }, { "epoch": 10.822898032200358, "grad_norm": 1.6658412218093872, "learning_rate": 6.45006016847172e-07, "loss": 0.2548, "step": 12100 }, { "epoch": 10.831842576028622, "grad_norm": 1.9499558210372925, "learning_rate": 6.447051744885679e-07, "loss": 0.2597, "step": 12110 }, { "epoch": 10.840787119856888, "grad_norm": 1.8598222732543945, "learning_rate": 6.444043321299639e-07, "loss": 0.2574, "step": 12120 }, { "epoch": 10.849731663685152, "grad_norm": 1.6625630855560303, "learning_rate": 6.441034897713597e-07, "loss": 0.2526, "step": 12130 }, { "epoch": 10.858676207513417, "grad_norm": 1.5452898740768433, "learning_rate": 6.438026474127557e-07, "loss": 0.2515, "step": 12140 }, { "epoch": 10.867620751341681, "grad_norm": 1.7677903175354004, "learning_rate": 6.435018050541516e-07, "loss": 0.2607, "step": 12150 }, { "epoch": 10.876565295169947, "grad_norm": 1.5394271612167358, "learning_rate": 6.432009626955475e-07, "loss": 0.2502, "step": 12160 }, { "epoch": 10.88550983899821, "grad_norm": 1.6044206619262695, "learning_rate": 6.429001203369434e-07, "loss": 0.246, "step": 12170 }, { "epoch": 10.894454382826476, "grad_norm": 1.5506819486618042, "learning_rate": 6.425992779783394e-07, "loss": 0.2637, "step": 12180 }, { "epoch": 10.903398926654742, "grad_norm": 1.797220230102539, "learning_rate": 6.422984356197352e-07, "loss": 0.2535, "step": 12190 }, { "epoch": 10.912343470483005, "grad_norm": 1.6893784999847412, "learning_rate": 6.419975932611312e-07, "loss": 0.2554, "step": 12200 }, { "epoch": 10.921288014311271, "grad_norm": 2.001776695251465, "learning_rate": 6.41696750902527e-07, "loss": 0.2591, "step": 12210 }, { "epoch": 10.930232558139535, "grad_norm": 1.6749728918075562, "learning_rate": 6.41395908543923e-07, "loss": 0.2387, "step": 12220 }, { "epoch": 10.9391771019678, "grad_norm": 1.7582108974456787, "learning_rate": 6.410950661853188e-07, "loss": 0.2542, "step": 12230 }, { "epoch": 10.948121645796064, "grad_norm": 1.4224308729171753, "learning_rate": 6.407942238267148e-07, "loss": 0.2597, "step": 12240 }, { "epoch": 10.95706618962433, "grad_norm": 1.700706958770752, "learning_rate": 6.404933814681107e-07, "loss": 0.256, "step": 12250 }, { "epoch": 10.966010733452594, "grad_norm": 1.681830883026123, "learning_rate": 6.401925391095065e-07, "loss": 0.2516, "step": 12260 }, { "epoch": 10.97495527728086, "grad_norm": 1.5968338251113892, "learning_rate": 6.398916967509025e-07, "loss": 0.2514, "step": 12270 }, { "epoch": 10.983899821109123, "grad_norm": 1.705511450767517, "learning_rate": 6.395908543922984e-07, "loss": 0.2532, "step": 12280 }, { "epoch": 10.992844364937389, "grad_norm": 1.6551766395568848, "learning_rate": 6.392900120336944e-07, "loss": 0.2471, "step": 12290 }, { "epoch": 11.0, "eval_bleu": 73.4758, "eval_gen_len": 74.9251, "eval_loss": 0.18992315232753754, "eval_runtime": 56.392, "eval_samples_per_second": 18.478, "eval_steps_per_second": 0.195, "step": 12298 }, { "epoch": 11.001788908765652, "grad_norm": 1.6203548908233643, "learning_rate": 6.389891696750902e-07, "loss": 0.2487, "step": 12300 }, { "epoch": 11.010733452593918, "grad_norm": 1.4733091592788696, "learning_rate": 6.386883273164862e-07, "loss": 0.2623, "step": 12310 }, { "epoch": 11.019677996422182, "grad_norm": 1.8115324974060059, "learning_rate": 6.38387484957882e-07, "loss": 0.2616, "step": 12320 }, { "epoch": 11.028622540250447, "grad_norm": 1.9844839572906494, "learning_rate": 6.380866425992779e-07, "loss": 0.2561, "step": 12330 }, { "epoch": 11.037567084078711, "grad_norm": 3.2231359481811523, "learning_rate": 6.377858002406738e-07, "loss": 0.2553, "step": 12340 }, { "epoch": 11.046511627906977, "grad_norm": 1.4005216360092163, "learning_rate": 6.374849578820698e-07, "loss": 0.2399, "step": 12350 }, { "epoch": 11.05545617173524, "grad_norm": 1.6509549617767334, "learning_rate": 6.371841155234656e-07, "loss": 0.251, "step": 12360 }, { "epoch": 11.064400715563506, "grad_norm": 1.507264494895935, "learning_rate": 6.368832731648616e-07, "loss": 0.2449, "step": 12370 }, { "epoch": 11.073345259391772, "grad_norm": 1.9406224489212036, "learning_rate": 6.365824308062575e-07, "loss": 0.25, "step": 12380 }, { "epoch": 11.082289803220036, "grad_norm": 1.834517002105713, "learning_rate": 6.362815884476535e-07, "loss": 0.2619, "step": 12390 }, { "epoch": 11.091234347048301, "grad_norm": 1.7223912477493286, "learning_rate": 6.359807460890493e-07, "loss": 0.2576, "step": 12400 }, { "epoch": 11.100178890876565, "grad_norm": 1.5235416889190674, "learning_rate": 6.356799037304453e-07, "loss": 0.24, "step": 12410 }, { "epoch": 11.10912343470483, "grad_norm": 1.5340937376022339, "learning_rate": 6.353790613718412e-07, "loss": 0.2567, "step": 12420 }, { "epoch": 11.118067978533094, "grad_norm": 1.5485321283340454, "learning_rate": 6.350782190132369e-07, "loss": 0.2501, "step": 12430 }, { "epoch": 11.12701252236136, "grad_norm": 1.5416953563690186, "learning_rate": 6.347773766546329e-07, "loss": 0.2493, "step": 12440 }, { "epoch": 11.135957066189624, "grad_norm": 1.7156434059143066, "learning_rate": 6.344765342960288e-07, "loss": 0.2451, "step": 12450 }, { "epoch": 11.14490161001789, "grad_norm": 1.7634798288345337, "learning_rate": 6.341756919374247e-07, "loss": 0.2525, "step": 12460 }, { "epoch": 11.153846153846153, "grad_norm": 1.6508985757827759, "learning_rate": 6.338748495788206e-07, "loss": 0.2492, "step": 12470 }, { "epoch": 11.162790697674419, "grad_norm": 2.021599769592285, "learning_rate": 6.335740072202166e-07, "loss": 0.2534, "step": 12480 }, { "epoch": 11.171735241502683, "grad_norm": 1.6555845737457275, "learning_rate": 6.332731648616125e-07, "loss": 0.2561, "step": 12490 }, { "epoch": 11.180679785330948, "grad_norm": 1.904301404953003, "learning_rate": 6.329723225030084e-07, "loss": 0.243, "step": 12500 }, { "epoch": 11.189624329159212, "grad_norm": 1.5915064811706543, "learning_rate": 6.326714801444043e-07, "loss": 0.2526, "step": 12510 }, { "epoch": 11.198568872987478, "grad_norm": 1.4680665731430054, "learning_rate": 6.323706377858003e-07, "loss": 0.2336, "step": 12520 }, { "epoch": 11.207513416815742, "grad_norm": 1.414535641670227, "learning_rate": 6.320697954271961e-07, "loss": 0.2491, "step": 12530 }, { "epoch": 11.216457960644007, "grad_norm": 1.5655280351638794, "learning_rate": 6.31768953068592e-07, "loss": 0.2554, "step": 12540 }, { "epoch": 11.225402504472273, "grad_norm": 1.8156970739364624, "learning_rate": 6.314681107099879e-07, "loss": 0.2549, "step": 12550 }, { "epoch": 11.234347048300537, "grad_norm": 1.654660940170288, "learning_rate": 6.311672683513839e-07, "loss": 0.2481, "step": 12560 }, { "epoch": 11.243291592128802, "grad_norm": 1.5700427293777466, "learning_rate": 6.308664259927797e-07, "loss": 0.2534, "step": 12570 }, { "epoch": 11.252236135957066, "grad_norm": 1.6425267457962036, "learning_rate": 6.305655836341757e-07, "loss": 0.257, "step": 12580 }, { "epoch": 11.261180679785332, "grad_norm": 1.6263014078140259, "learning_rate": 6.302647412755716e-07, "loss": 0.2413, "step": 12590 }, { "epoch": 11.270125223613595, "grad_norm": 1.5932801961898804, "learning_rate": 6.299638989169674e-07, "loss": 0.239, "step": 12600 }, { "epoch": 11.279069767441861, "grad_norm": 1.8274989128112793, "learning_rate": 6.296630565583634e-07, "loss": 0.2491, "step": 12610 }, { "epoch": 11.288014311270125, "grad_norm": 1.538745641708374, "learning_rate": 6.293622141997593e-07, "loss": 0.243, "step": 12620 }, { "epoch": 11.29695885509839, "grad_norm": 1.4591760635375977, "learning_rate": 6.290613718411552e-07, "loss": 0.2395, "step": 12630 }, { "epoch": 11.305903398926654, "grad_norm": 1.4508098363876343, "learning_rate": 6.287605294825511e-07, "loss": 0.236, "step": 12640 }, { "epoch": 11.31484794275492, "grad_norm": 1.6408956050872803, "learning_rate": 6.28459687123947e-07, "loss": 0.2486, "step": 12650 }, { "epoch": 11.323792486583184, "grad_norm": 1.584268569946289, "learning_rate": 6.281588447653429e-07, "loss": 0.2416, "step": 12660 }, { "epoch": 11.33273703041145, "grad_norm": 1.9997392892837524, "learning_rate": 6.278580024067388e-07, "loss": 0.2643, "step": 12670 }, { "epoch": 11.341681574239713, "grad_norm": 1.5162452459335327, "learning_rate": 6.275571600481347e-07, "loss": 0.2404, "step": 12680 }, { "epoch": 11.350626118067979, "grad_norm": 1.5581248998641968, "learning_rate": 6.272563176895307e-07, "loss": 0.2503, "step": 12690 }, { "epoch": 11.359570661896242, "grad_norm": 1.477333903312683, "learning_rate": 6.269554753309265e-07, "loss": 0.2379, "step": 12700 }, { "epoch": 11.368515205724508, "grad_norm": 1.7269163131713867, "learning_rate": 6.266546329723225e-07, "loss": 0.2526, "step": 12710 }, { "epoch": 11.377459749552774, "grad_norm": 1.6344338655471802, "learning_rate": 6.263537906137184e-07, "loss": 0.2481, "step": 12720 }, { "epoch": 11.386404293381037, "grad_norm": 1.5223592519760132, "learning_rate": 6.260529482551144e-07, "loss": 0.2525, "step": 12730 }, { "epoch": 11.395348837209303, "grad_norm": 1.904078722000122, "learning_rate": 6.257521058965102e-07, "loss": 0.2427, "step": 12740 }, { "epoch": 11.404293381037567, "grad_norm": 1.5667195320129395, "learning_rate": 6.254512635379062e-07, "loss": 0.2402, "step": 12750 }, { "epoch": 11.413237924865832, "grad_norm": 1.8779903650283813, "learning_rate": 6.251504211793021e-07, "loss": 0.251, "step": 12760 }, { "epoch": 11.422182468694096, "grad_norm": 1.4085509777069092, "learning_rate": 6.248495788206978e-07, "loss": 0.2532, "step": 12770 }, { "epoch": 11.431127012522362, "grad_norm": 1.5099104642868042, "learning_rate": 6.245487364620938e-07, "loss": 0.2482, "step": 12780 }, { "epoch": 11.440071556350626, "grad_norm": 1.6909841299057007, "learning_rate": 6.242478941034897e-07, "loss": 0.2376, "step": 12790 }, { "epoch": 11.449016100178891, "grad_norm": 1.5439828634262085, "learning_rate": 6.239470517448856e-07, "loss": 0.2375, "step": 12800 }, { "epoch": 11.457960644007155, "grad_norm": 1.8462985754013062, "learning_rate": 6.236462093862815e-07, "loss": 0.2528, "step": 12810 }, { "epoch": 11.46690518783542, "grad_norm": 1.5367786884307861, "learning_rate": 6.233453670276775e-07, "loss": 0.2409, "step": 12820 }, { "epoch": 11.475849731663684, "grad_norm": 1.5457161664962769, "learning_rate": 6.230445246690734e-07, "loss": 0.2508, "step": 12830 }, { "epoch": 11.48479427549195, "grad_norm": 1.5234055519104004, "learning_rate": 6.227436823104693e-07, "loss": 0.2439, "step": 12840 }, { "epoch": 11.493738819320214, "grad_norm": 1.925322413444519, "learning_rate": 6.224428399518652e-07, "loss": 0.2497, "step": 12850 }, { "epoch": 11.50268336314848, "grad_norm": 1.5820813179016113, "learning_rate": 6.221419975932612e-07, "loss": 0.2616, "step": 12860 }, { "epoch": 11.511627906976745, "grad_norm": 1.547390341758728, "learning_rate": 6.21841155234657e-07, "loss": 0.2432, "step": 12870 }, { "epoch": 11.520572450805009, "grad_norm": 1.5614700317382812, "learning_rate": 6.215403128760529e-07, "loss": 0.2466, "step": 12880 }, { "epoch": 11.529516994633275, "grad_norm": 1.9572466611862183, "learning_rate": 6.212394705174488e-07, "loss": 0.253, "step": 12890 }, { "epoch": 11.538461538461538, "grad_norm": 1.6293258666992188, "learning_rate": 6.209386281588448e-07, "loss": 0.2541, "step": 12900 }, { "epoch": 11.547406082289804, "grad_norm": 1.781266212463379, "learning_rate": 6.206377858002406e-07, "loss": 0.2503, "step": 12910 }, { "epoch": 11.556350626118068, "grad_norm": 1.7980941534042358, "learning_rate": 6.203369434416366e-07, "loss": 0.2477, "step": 12920 }, { "epoch": 11.565295169946333, "grad_norm": 1.8397564888000488, "learning_rate": 6.200361010830325e-07, "loss": 0.2472, "step": 12930 }, { "epoch": 11.574239713774597, "grad_norm": 1.603770136833191, "learning_rate": 6.197352587244283e-07, "loss": 0.2457, "step": 12940 }, { "epoch": 11.583184257602863, "grad_norm": 1.601441740989685, "learning_rate": 6.194344163658243e-07, "loss": 0.2462, "step": 12950 }, { "epoch": 11.592128801431127, "grad_norm": 1.6930180788040161, "learning_rate": 6.191335740072202e-07, "loss": 0.2463, "step": 12960 }, { "epoch": 11.601073345259392, "grad_norm": 2.2629811763763428, "learning_rate": 6.188327316486161e-07, "loss": 0.2463, "step": 12970 }, { "epoch": 11.610017889087656, "grad_norm": 1.9006010293960571, "learning_rate": 6.18531889290012e-07, "loss": 0.2487, "step": 12980 }, { "epoch": 11.618962432915922, "grad_norm": 1.6862515211105347, "learning_rate": 6.182310469314079e-07, "loss": 0.2367, "step": 12990 }, { "epoch": 11.627906976744185, "grad_norm": 1.5999614000320435, "learning_rate": 6.179302045728038e-07, "loss": 0.2415, "step": 13000 }, { "epoch": 11.636851520572451, "grad_norm": 2.216756582260132, "learning_rate": 6.176293622141997e-07, "loss": 0.2416, "step": 13010 }, { "epoch": 11.645796064400715, "grad_norm": 1.8712037801742554, "learning_rate": 6.173285198555956e-07, "loss": 0.2471, "step": 13020 }, { "epoch": 11.65474060822898, "grad_norm": 1.4762934446334839, "learning_rate": 6.170276774969916e-07, "loss": 0.2512, "step": 13030 }, { "epoch": 11.663685152057244, "grad_norm": 1.8089286088943481, "learning_rate": 6.167268351383874e-07, "loss": 0.2418, "step": 13040 }, { "epoch": 11.67262969588551, "grad_norm": 1.5072790384292603, "learning_rate": 6.164259927797834e-07, "loss": 0.2431, "step": 13050 }, { "epoch": 11.681574239713775, "grad_norm": 1.778215765953064, "learning_rate": 6.161251504211793e-07, "loss": 0.2433, "step": 13060 }, { "epoch": 11.69051878354204, "grad_norm": 2.7389516830444336, "learning_rate": 6.158243080625753e-07, "loss": 0.2508, "step": 13070 }, { "epoch": 11.699463327370305, "grad_norm": 1.4133468866348267, "learning_rate": 6.155234657039711e-07, "loss": 0.2408, "step": 13080 }, { "epoch": 11.708407871198569, "grad_norm": 1.6936426162719727, "learning_rate": 6.152226233453671e-07, "loss": 0.2542, "step": 13090 }, { "epoch": 11.717352415026834, "grad_norm": 1.8718106746673584, "learning_rate": 6.149217809867629e-07, "loss": 0.2432, "step": 13100 }, { "epoch": 11.726296958855098, "grad_norm": 1.8084053993225098, "learning_rate": 6.146209386281587e-07, "loss": 0.2412, "step": 13110 }, { "epoch": 11.735241502683364, "grad_norm": 1.9868453741073608, "learning_rate": 6.143200962695547e-07, "loss": 0.2508, "step": 13120 }, { "epoch": 11.744186046511627, "grad_norm": 1.9329386949539185, "learning_rate": 6.140192539109506e-07, "loss": 0.2501, "step": 13130 }, { "epoch": 11.753130590339893, "grad_norm": 1.5499323606491089, "learning_rate": 6.137184115523465e-07, "loss": 0.2395, "step": 13140 }, { "epoch": 11.762075134168157, "grad_norm": 1.7129054069519043, "learning_rate": 6.134175691937424e-07, "loss": 0.247, "step": 13150 }, { "epoch": 11.771019677996422, "grad_norm": 1.7108467817306519, "learning_rate": 6.131167268351384e-07, "loss": 0.2462, "step": 13160 }, { "epoch": 11.779964221824686, "grad_norm": 1.5333002805709839, "learning_rate": 6.128158844765343e-07, "loss": 0.2364, "step": 13170 }, { "epoch": 11.788908765652952, "grad_norm": 1.4564987421035767, "learning_rate": 6.125150421179302e-07, "loss": 0.2477, "step": 13180 }, { "epoch": 11.797853309481216, "grad_norm": 1.80374276638031, "learning_rate": 6.122141997593261e-07, "loss": 0.2373, "step": 13190 }, { "epoch": 11.806797853309481, "grad_norm": 1.8643357753753662, "learning_rate": 6.119133574007221e-07, "loss": 0.2483, "step": 13200 }, { "epoch": 11.815742397137747, "grad_norm": 1.589076042175293, "learning_rate": 6.116125150421178e-07, "loss": 0.2448, "step": 13210 }, { "epoch": 11.82468694096601, "grad_norm": 1.6423367261886597, "learning_rate": 6.113116726835138e-07, "loss": 0.2399, "step": 13220 }, { "epoch": 11.833631484794276, "grad_norm": 1.806188702583313, "learning_rate": 6.110108303249097e-07, "loss": 0.2454, "step": 13230 }, { "epoch": 11.84257602862254, "grad_norm": 1.4810848236083984, "learning_rate": 6.107099879663056e-07, "loss": 0.2393, "step": 13240 }, { "epoch": 11.851520572450806, "grad_norm": 1.4600383043289185, "learning_rate": 6.104091456077015e-07, "loss": 0.2497, "step": 13250 }, { "epoch": 11.86046511627907, "grad_norm": 1.5684937238693237, "learning_rate": 6.101083032490975e-07, "loss": 0.236, "step": 13260 }, { "epoch": 11.869409660107335, "grad_norm": 1.8886797428131104, "learning_rate": 6.098074608904934e-07, "loss": 0.2515, "step": 13270 }, { "epoch": 11.878354203935599, "grad_norm": 1.8850244283676147, "learning_rate": 6.095066185318892e-07, "loss": 0.2448, "step": 13280 }, { "epoch": 11.887298747763865, "grad_norm": 1.6738568544387817, "learning_rate": 6.092057761732852e-07, "loss": 0.2504, "step": 13290 }, { "epoch": 11.896243291592128, "grad_norm": 2.228408098220825, "learning_rate": 6.089049338146811e-07, "loss": 0.2402, "step": 13300 }, { "epoch": 11.905187835420394, "grad_norm": 1.6836262941360474, "learning_rate": 6.08604091456077e-07, "loss": 0.2387, "step": 13310 }, { "epoch": 11.914132379248658, "grad_norm": 1.552213430404663, "learning_rate": 6.083032490974728e-07, "loss": 0.2374, "step": 13320 }, { "epoch": 11.923076923076923, "grad_norm": 1.9206942319869995, "learning_rate": 6.080024067388688e-07, "loss": 0.25, "step": 13330 }, { "epoch": 11.932021466905187, "grad_norm": 1.483750343322754, "learning_rate": 6.077015643802647e-07, "loss": 0.2366, "step": 13340 }, { "epoch": 11.940966010733453, "grad_norm": 1.670534372329712, "learning_rate": 6.074007220216606e-07, "loss": 0.2496, "step": 13350 }, { "epoch": 11.949910554561717, "grad_norm": 1.7745046615600586, "learning_rate": 6.070998796630565e-07, "loss": 0.2434, "step": 13360 }, { "epoch": 11.958855098389982, "grad_norm": 1.43484365940094, "learning_rate": 6.067990373044525e-07, "loss": 0.2424, "step": 13370 }, { "epoch": 11.967799642218246, "grad_norm": 2.0954678058624268, "learning_rate": 6.064981949458483e-07, "loss": 0.2462, "step": 13380 }, { "epoch": 11.976744186046512, "grad_norm": 1.776699185371399, "learning_rate": 6.061973525872443e-07, "loss": 0.2324, "step": 13390 }, { "epoch": 11.985688729874777, "grad_norm": 1.5925743579864502, "learning_rate": 6.058965102286402e-07, "loss": 0.2372, "step": 13400 }, { "epoch": 11.994633273703041, "grad_norm": 1.5993913412094116, "learning_rate": 6.055956678700361e-07, "loss": 0.236, "step": 13410 }, { "epoch": 12.0, "eval_bleu": 74.4219, "eval_gen_len": 74.833, "eval_loss": 0.18223372101783752, "eval_runtime": 57.8879, "eval_samples_per_second": 18.0, "eval_steps_per_second": 0.19, "step": 13416 }, { "epoch": 12.003577817531307, "grad_norm": 1.4651010036468506, "learning_rate": 6.05294825511432e-07, "loss": 0.2337, "step": 13420 }, { "epoch": 12.01252236135957, "grad_norm": 1.615624189376831, "learning_rate": 6.049939831528279e-07, "loss": 0.2309, "step": 13430 }, { "epoch": 12.021466905187836, "grad_norm": 1.7083666324615479, "learning_rate": 6.046931407942238e-07, "loss": 0.2513, "step": 13440 }, { "epoch": 12.0304114490161, "grad_norm": 1.6717640161514282, "learning_rate": 6.043922984356197e-07, "loss": 0.2406, "step": 13450 }, { "epoch": 12.039355992844365, "grad_norm": 1.376678228378296, "learning_rate": 6.040914560770156e-07, "loss": 0.2402, "step": 13460 }, { "epoch": 12.04830053667263, "grad_norm": 1.4994889497756958, "learning_rate": 6.037906137184115e-07, "loss": 0.2328, "step": 13470 }, { "epoch": 12.057245080500895, "grad_norm": 1.5415583848953247, "learning_rate": 6.034897713598074e-07, "loss": 0.2435, "step": 13480 }, { "epoch": 12.066189624329159, "grad_norm": 1.8038772344589233, "learning_rate": 6.031889290012033e-07, "loss": 0.2404, "step": 13490 }, { "epoch": 12.075134168157424, "grad_norm": 1.442786693572998, "learning_rate": 6.028880866425993e-07, "loss": 0.2309, "step": 13500 }, { "epoch": 12.084078711985688, "grad_norm": 1.993149757385254, "learning_rate": 6.025872442839952e-07, "loss": 0.2324, "step": 13510 }, { "epoch": 12.093023255813954, "grad_norm": 1.6655985116958618, "learning_rate": 6.022864019253911e-07, "loss": 0.2397, "step": 13520 }, { "epoch": 12.101967799642217, "grad_norm": 1.6622962951660156, "learning_rate": 6.01985559566787e-07, "loss": 0.2349, "step": 13530 }, { "epoch": 12.110912343470483, "grad_norm": 1.6364377737045288, "learning_rate": 6.01684717208183e-07, "loss": 0.2277, "step": 13540 }, { "epoch": 12.119856887298747, "grad_norm": 1.8937984704971313, "learning_rate": 6.013838748495787e-07, "loss": 0.244, "step": 13550 }, { "epoch": 12.128801431127012, "grad_norm": 1.9228928089141846, "learning_rate": 6.010830324909747e-07, "loss": 0.2334, "step": 13560 }, { "epoch": 12.137745974955278, "grad_norm": 1.6912461519241333, "learning_rate": 6.007821901323706e-07, "loss": 0.2524, "step": 13570 }, { "epoch": 12.146690518783542, "grad_norm": 1.6879265308380127, "learning_rate": 6.004813477737665e-07, "loss": 0.2405, "step": 13580 }, { "epoch": 12.155635062611807, "grad_norm": 1.6367559432983398, "learning_rate": 6.001805054151624e-07, "loss": 0.2489, "step": 13590 }, { "epoch": 12.164579606440071, "grad_norm": 1.880110502243042, "learning_rate": 5.998796630565584e-07, "loss": 0.239, "step": 13600 }, { "epoch": 12.173524150268337, "grad_norm": 1.7080553770065308, "learning_rate": 5.995788206979543e-07, "loss": 0.2482, "step": 13610 }, { "epoch": 12.1824686940966, "grad_norm": 1.6933702230453491, "learning_rate": 5.992779783393502e-07, "loss": 0.2411, "step": 13620 }, { "epoch": 12.191413237924866, "grad_norm": 1.5030890703201294, "learning_rate": 5.989771359807461e-07, "loss": 0.2363, "step": 13630 }, { "epoch": 12.20035778175313, "grad_norm": 2.3736958503723145, "learning_rate": 5.98676293622142e-07, "loss": 0.2323, "step": 13640 }, { "epoch": 12.209302325581396, "grad_norm": 1.4384323358535767, "learning_rate": 5.983754512635378e-07, "loss": 0.2327, "step": 13650 }, { "epoch": 12.21824686940966, "grad_norm": 1.5894116163253784, "learning_rate": 5.980746089049337e-07, "loss": 0.2536, "step": 13660 }, { "epoch": 12.227191413237925, "grad_norm": 1.9905613660812378, "learning_rate": 5.977737665463297e-07, "loss": 0.2452, "step": 13670 }, { "epoch": 12.236135957066189, "grad_norm": 1.5962880849838257, "learning_rate": 5.974729241877256e-07, "loss": 0.2286, "step": 13680 }, { "epoch": 12.245080500894455, "grad_norm": 1.4475836753845215, "learning_rate": 5.971720818291215e-07, "loss": 0.2287, "step": 13690 }, { "epoch": 12.254025044722718, "grad_norm": 1.6959360837936401, "learning_rate": 5.968712394705174e-07, "loss": 0.2525, "step": 13700 }, { "epoch": 12.262969588550984, "grad_norm": 1.7910102605819702, "learning_rate": 5.965703971119134e-07, "loss": 0.235, "step": 13710 }, { "epoch": 12.271914132379248, "grad_norm": 1.5741043090820312, "learning_rate": 5.962695547533092e-07, "loss": 0.2418, "step": 13720 }, { "epoch": 12.280858676207513, "grad_norm": 1.4918581247329712, "learning_rate": 5.959687123947052e-07, "loss": 0.2271, "step": 13730 }, { "epoch": 12.289803220035779, "grad_norm": 1.4841476678848267, "learning_rate": 5.956678700361011e-07, "loss": 0.2324, "step": 13740 }, { "epoch": 12.298747763864043, "grad_norm": 1.6960680484771729, "learning_rate": 5.95367027677497e-07, "loss": 0.2242, "step": 13750 }, { "epoch": 12.307692307692308, "grad_norm": 1.4989248514175415, "learning_rate": 5.950661853188928e-07, "loss": 0.2342, "step": 13760 }, { "epoch": 12.316636851520572, "grad_norm": 1.4370133876800537, "learning_rate": 5.947653429602888e-07, "loss": 0.2478, "step": 13770 }, { "epoch": 12.325581395348838, "grad_norm": 1.4599870443344116, "learning_rate": 5.944645006016847e-07, "loss": 0.2353, "step": 13780 }, { "epoch": 12.334525939177102, "grad_norm": 1.5223127603530884, "learning_rate": 5.941636582430806e-07, "loss": 0.2349, "step": 13790 }, { "epoch": 12.343470483005367, "grad_norm": 1.4748775959014893, "learning_rate": 5.938628158844765e-07, "loss": 0.2312, "step": 13800 }, { "epoch": 12.352415026833631, "grad_norm": 1.7307710647583008, "learning_rate": 5.935619735258724e-07, "loss": 0.2412, "step": 13810 }, { "epoch": 12.361359570661897, "grad_norm": 1.6681541204452515, "learning_rate": 5.932611311672683e-07, "loss": 0.243, "step": 13820 }, { "epoch": 12.37030411449016, "grad_norm": 1.7818663120269775, "learning_rate": 5.929602888086642e-07, "loss": 0.2326, "step": 13830 }, { "epoch": 12.379248658318426, "grad_norm": 1.616071343421936, "learning_rate": 5.926594464500602e-07, "loss": 0.2371, "step": 13840 }, { "epoch": 12.38819320214669, "grad_norm": 1.447346568107605, "learning_rate": 5.923586040914561e-07, "loss": 0.2448, "step": 13850 }, { "epoch": 12.397137745974955, "grad_norm": 1.6985827684402466, "learning_rate": 5.92057761732852e-07, "loss": 0.2446, "step": 13860 }, { "epoch": 12.40608228980322, "grad_norm": 2.0520944595336914, "learning_rate": 5.917569193742478e-07, "loss": 0.2266, "step": 13870 }, { "epoch": 12.415026833631485, "grad_norm": 1.4311727285385132, "learning_rate": 5.914560770156438e-07, "loss": 0.2322, "step": 13880 }, { "epoch": 12.42397137745975, "grad_norm": 1.4834709167480469, "learning_rate": 5.911552346570396e-07, "loss": 0.233, "step": 13890 }, { "epoch": 12.432915921288014, "grad_norm": 1.5307376384735107, "learning_rate": 5.908543922984356e-07, "loss": 0.2346, "step": 13900 }, { "epoch": 12.44186046511628, "grad_norm": 1.6108955144882202, "learning_rate": 5.905535499398315e-07, "loss": 0.2394, "step": 13910 }, { "epoch": 12.450805008944544, "grad_norm": 1.6023128032684326, "learning_rate": 5.902527075812274e-07, "loss": 0.24, "step": 13920 }, { "epoch": 12.45974955277281, "grad_norm": 1.350578784942627, "learning_rate": 5.899518652226233e-07, "loss": 0.2354, "step": 13930 }, { "epoch": 12.468694096601073, "grad_norm": 1.5131261348724365, "learning_rate": 5.896510228640193e-07, "loss": 0.2245, "step": 13940 }, { "epoch": 12.477638640429339, "grad_norm": 2.112520456314087, "learning_rate": 5.893501805054152e-07, "loss": 0.2441, "step": 13950 }, { "epoch": 12.486583184257602, "grad_norm": 1.790661096572876, "learning_rate": 5.890493381468111e-07, "loss": 0.2312, "step": 13960 }, { "epoch": 12.495527728085868, "grad_norm": 1.4098833799362183, "learning_rate": 5.88748495788207e-07, "loss": 0.2353, "step": 13970 }, { "epoch": 12.504472271914132, "grad_norm": 1.5231249332427979, "learning_rate": 5.88447653429603e-07, "loss": 0.235, "step": 13980 }, { "epoch": 12.513416815742398, "grad_norm": 1.5738141536712646, "learning_rate": 5.881468110709987e-07, "loss": 0.2339, "step": 13990 }, { "epoch": 12.522361359570661, "grad_norm": 1.6588445901870728, "learning_rate": 5.878459687123946e-07, "loss": 0.2472, "step": 14000 }, { "epoch": 12.531305903398927, "grad_norm": 1.5778062343597412, "learning_rate": 5.875451263537906e-07, "loss": 0.2351, "step": 14010 }, { "epoch": 12.54025044722719, "grad_norm": 1.7402443885803223, "learning_rate": 5.872442839951864e-07, "loss": 0.2331, "step": 14020 }, { "epoch": 12.549194991055456, "grad_norm": 1.3807802200317383, "learning_rate": 5.869434416365824e-07, "loss": 0.2381, "step": 14030 }, { "epoch": 12.55813953488372, "grad_norm": 1.5534865856170654, "learning_rate": 5.866425992779783e-07, "loss": 0.2285, "step": 14040 }, { "epoch": 12.567084078711986, "grad_norm": 1.992496132850647, "learning_rate": 5.863417569193743e-07, "loss": 0.2476, "step": 14050 }, { "epoch": 12.57602862254025, "grad_norm": 1.708743691444397, "learning_rate": 5.860409145607701e-07, "loss": 0.2262, "step": 14060 }, { "epoch": 12.584973166368515, "grad_norm": 1.4764150381088257, "learning_rate": 5.857400722021661e-07, "loss": 0.2248, "step": 14070 }, { "epoch": 12.59391771019678, "grad_norm": 1.5710704326629639, "learning_rate": 5.85439229843562e-07, "loss": 0.2392, "step": 14080 }, { "epoch": 12.602862254025045, "grad_norm": 1.4406896829605103, "learning_rate": 5.851383874849578e-07, "loss": 0.233, "step": 14090 }, { "epoch": 12.61180679785331, "grad_norm": 1.868032455444336, "learning_rate": 5.848375451263537e-07, "loss": 0.2333, "step": 14100 }, { "epoch": 12.620751341681574, "grad_norm": 1.7434061765670776, "learning_rate": 5.845367027677497e-07, "loss": 0.2436, "step": 14110 }, { "epoch": 12.62969588550984, "grad_norm": 1.3921831846237183, "learning_rate": 5.842358604091456e-07, "loss": 0.226, "step": 14120 }, { "epoch": 12.638640429338103, "grad_norm": 1.5419607162475586, "learning_rate": 5.839350180505415e-07, "loss": 0.2331, "step": 14130 }, { "epoch": 12.647584973166369, "grad_norm": 1.5571625232696533, "learning_rate": 5.836341756919374e-07, "loss": 0.2235, "step": 14140 }, { "epoch": 12.656529516994633, "grad_norm": 1.3981750011444092, "learning_rate": 5.833333333333334e-07, "loss": 0.2311, "step": 14150 }, { "epoch": 12.665474060822898, "grad_norm": 1.5649229288101196, "learning_rate": 5.830324909747292e-07, "loss": 0.2495, "step": 14160 }, { "epoch": 12.674418604651162, "grad_norm": 1.7458295822143555, "learning_rate": 5.827316486161251e-07, "loss": 0.2369, "step": 14170 }, { "epoch": 12.683363148479428, "grad_norm": 1.5172150135040283, "learning_rate": 5.824308062575211e-07, "loss": 0.2297, "step": 14180 }, { "epoch": 12.692307692307692, "grad_norm": 1.622676968574524, "learning_rate": 5.821299638989168e-07, "loss": 0.2294, "step": 14190 }, { "epoch": 12.701252236135957, "grad_norm": 1.3619602918624878, "learning_rate": 5.818291215403128e-07, "loss": 0.2456, "step": 14200 }, { "epoch": 12.710196779964221, "grad_norm": 1.630332589149475, "learning_rate": 5.815282791817087e-07, "loss": 0.229, "step": 14210 }, { "epoch": 12.719141323792487, "grad_norm": 1.5032150745391846, "learning_rate": 5.812274368231047e-07, "loss": 0.2365, "step": 14220 }, { "epoch": 12.728085867620752, "grad_norm": 1.4477381706237793, "learning_rate": 5.809265944645005e-07, "loss": 0.2284, "step": 14230 }, { "epoch": 12.737030411449016, "grad_norm": 1.5683680772781372, "learning_rate": 5.806257521058965e-07, "loss": 0.2225, "step": 14240 }, { "epoch": 12.745974955277282, "grad_norm": 1.756728172302246, "learning_rate": 5.803249097472924e-07, "loss": 0.2287, "step": 14250 }, { "epoch": 12.754919499105545, "grad_norm": 1.389137864112854, "learning_rate": 5.800240673886883e-07, "loss": 0.2249, "step": 14260 }, { "epoch": 12.763864042933811, "grad_norm": 1.4172710180282593, "learning_rate": 5.797232250300842e-07, "loss": 0.2336, "step": 14270 }, { "epoch": 12.772808586762075, "grad_norm": 1.6355493068695068, "learning_rate": 5.794223826714802e-07, "loss": 0.2387, "step": 14280 }, { "epoch": 12.78175313059034, "grad_norm": 1.4162112474441528, "learning_rate": 5.791215403128761e-07, "loss": 0.2295, "step": 14290 }, { "epoch": 12.790697674418604, "grad_norm": 1.554172396659851, "learning_rate": 5.78820697954272e-07, "loss": 0.2247, "step": 14300 }, { "epoch": 12.79964221824687, "grad_norm": 1.6874934434890747, "learning_rate": 5.785198555956678e-07, "loss": 0.2308, "step": 14310 }, { "epoch": 12.808586762075134, "grad_norm": 1.5708050727844238, "learning_rate": 5.782190132370638e-07, "loss": 0.2256, "step": 14320 }, { "epoch": 12.8175313059034, "grad_norm": 1.644698977470398, "learning_rate": 5.779181708784596e-07, "loss": 0.2274, "step": 14330 }, { "epoch": 12.826475849731663, "grad_norm": 1.6415210962295532, "learning_rate": 5.776173285198555e-07, "loss": 0.2334, "step": 14340 }, { "epoch": 12.835420393559929, "grad_norm": 1.6948776245117188, "learning_rate": 5.773164861612515e-07, "loss": 0.2389, "step": 14350 }, { "epoch": 12.844364937388193, "grad_norm": 1.391992449760437, "learning_rate": 5.770156438026473e-07, "loss": 0.2262, "step": 14360 }, { "epoch": 12.853309481216458, "grad_norm": 1.5542418956756592, "learning_rate": 5.767148014440433e-07, "loss": 0.2391, "step": 14370 }, { "epoch": 12.862254025044722, "grad_norm": 1.5921099185943604, "learning_rate": 5.764139590854392e-07, "loss": 0.2271, "step": 14380 }, { "epoch": 12.871198568872988, "grad_norm": 1.3908007144927979, "learning_rate": 5.761131167268352e-07, "loss": 0.226, "step": 14390 }, { "epoch": 12.880143112701251, "grad_norm": 1.539855718612671, "learning_rate": 5.75812274368231e-07, "loss": 0.2336, "step": 14400 }, { "epoch": 12.889087656529517, "grad_norm": 1.658566951751709, "learning_rate": 5.75511432009627e-07, "loss": 0.2375, "step": 14410 }, { "epoch": 12.898032200357783, "grad_norm": 1.6505448818206787, "learning_rate": 5.752105896510228e-07, "loss": 0.2309, "step": 14420 }, { "epoch": 12.906976744186046, "grad_norm": 1.6353178024291992, "learning_rate": 5.749097472924187e-07, "loss": 0.2266, "step": 14430 }, { "epoch": 12.915921288014312, "grad_norm": 1.521298885345459, "learning_rate": 5.746089049338146e-07, "loss": 0.2257, "step": 14440 }, { "epoch": 12.924865831842576, "grad_norm": 1.3476217985153198, "learning_rate": 5.743080625752106e-07, "loss": 0.2366, "step": 14450 }, { "epoch": 12.933810375670841, "grad_norm": 1.5443305969238281, "learning_rate": 5.740072202166065e-07, "loss": 0.2222, "step": 14460 }, { "epoch": 12.942754919499105, "grad_norm": 1.4951808452606201, "learning_rate": 5.737063778580024e-07, "loss": 0.2311, "step": 14470 }, { "epoch": 12.95169946332737, "grad_norm": 1.7990375757217407, "learning_rate": 5.734055354993983e-07, "loss": 0.2422, "step": 14480 }, { "epoch": 12.960644007155635, "grad_norm": 1.728073000907898, "learning_rate": 5.731046931407943e-07, "loss": 0.2179, "step": 14490 }, { "epoch": 12.9695885509839, "grad_norm": 2.882551908493042, "learning_rate": 5.728038507821901e-07, "loss": 0.2271, "step": 14500 }, { "epoch": 12.978533094812164, "grad_norm": 1.6010873317718506, "learning_rate": 5.72503008423586e-07, "loss": 0.2243, "step": 14510 }, { "epoch": 12.98747763864043, "grad_norm": 1.5712891817092896, "learning_rate": 5.72202166064982e-07, "loss": 0.2225, "step": 14520 }, { "epoch": 12.996422182468693, "grad_norm": 1.4817034006118774, "learning_rate": 5.719013237063777e-07, "loss": 0.2265, "step": 14530 }, { "epoch": 13.0, "eval_bleu": 75.1435, "eval_gen_len": 74.9069, "eval_loss": 0.17450717091560364, "eval_runtime": 56.9118, "eval_samples_per_second": 18.309, "eval_steps_per_second": 0.193, "step": 14534 }, { "epoch": 13.005366726296959, "grad_norm": 1.9442760944366455, "learning_rate": 5.716004813477737e-07, "loss": 0.2372, "step": 14540 }, { "epoch": 13.014311270125223, "grad_norm": 1.7336536645889282, "learning_rate": 5.712996389891696e-07, "loss": 0.2355, "step": 14550 }, { "epoch": 13.023255813953488, "grad_norm": 1.4192476272583008, "learning_rate": 5.709987966305656e-07, "loss": 0.2268, "step": 14560 }, { "epoch": 13.032200357781754, "grad_norm": 1.541877269744873, "learning_rate": 5.706979542719614e-07, "loss": 0.2381, "step": 14570 }, { "epoch": 13.041144901610018, "grad_norm": 1.6108826398849487, "learning_rate": 5.703971119133574e-07, "loss": 0.2286, "step": 14580 }, { "epoch": 13.050089445438283, "grad_norm": 1.3249037265777588, "learning_rate": 5.700962695547533e-07, "loss": 0.2288, "step": 14590 }, { "epoch": 13.059033989266547, "grad_norm": 1.6004801988601685, "learning_rate": 5.697954271961492e-07, "loss": 0.2288, "step": 14600 }, { "epoch": 13.067978533094813, "grad_norm": 1.8404788970947266, "learning_rate": 5.694945848375451e-07, "loss": 0.2288, "step": 14610 }, { "epoch": 13.076923076923077, "grad_norm": 1.4611536264419556, "learning_rate": 5.691937424789411e-07, "loss": 0.2241, "step": 14620 }, { "epoch": 13.085867620751342, "grad_norm": 1.7837131023406982, "learning_rate": 5.68892900120337e-07, "loss": 0.2309, "step": 14630 }, { "epoch": 13.094812164579606, "grad_norm": 1.7805612087249756, "learning_rate": 5.685920577617328e-07, "loss": 0.2299, "step": 14640 }, { "epoch": 13.103756708407872, "grad_norm": 1.5109468698501587, "learning_rate": 5.682912154031287e-07, "loss": 0.2272, "step": 14650 }, { "epoch": 13.112701252236135, "grad_norm": 1.4673620462417603, "learning_rate": 5.679903730445247e-07, "loss": 0.2296, "step": 14660 }, { "epoch": 13.121645796064401, "grad_norm": 1.500759243965149, "learning_rate": 5.676895306859205e-07, "loss": 0.2218, "step": 14670 }, { "epoch": 13.130590339892665, "grad_norm": 1.8135573863983154, "learning_rate": 5.673886883273164e-07, "loss": 0.2228, "step": 14680 }, { "epoch": 13.13953488372093, "grad_norm": 1.4665645360946655, "learning_rate": 5.670878459687124e-07, "loss": 0.2258, "step": 14690 }, { "epoch": 13.148479427549194, "grad_norm": 1.346288800239563, "learning_rate": 5.667870036101082e-07, "loss": 0.2231, "step": 14700 }, { "epoch": 13.15742397137746, "grad_norm": 1.46870756149292, "learning_rate": 5.664861612515042e-07, "loss": 0.2237, "step": 14710 }, { "epoch": 13.166368515205724, "grad_norm": 1.597507357597351, "learning_rate": 5.661853188929001e-07, "loss": 0.22, "step": 14720 }, { "epoch": 13.17531305903399, "grad_norm": 1.7851216793060303, "learning_rate": 5.658844765342961e-07, "loss": 0.2346, "step": 14730 }, { "epoch": 13.184257602862255, "grad_norm": 1.528058409690857, "learning_rate": 5.655836341756919e-07, "loss": 0.2262, "step": 14740 }, { "epoch": 13.193202146690519, "grad_norm": 1.7863088846206665, "learning_rate": 5.652827918170878e-07, "loss": 0.2216, "step": 14750 }, { "epoch": 13.202146690518784, "grad_norm": 1.3855292797088623, "learning_rate": 5.649819494584837e-07, "loss": 0.2202, "step": 14760 }, { "epoch": 13.211091234347048, "grad_norm": 1.404234528541565, "learning_rate": 5.646811070998796e-07, "loss": 0.229, "step": 14770 }, { "epoch": 13.220035778175314, "grad_norm": 1.9877007007598877, "learning_rate": 5.643802647412755e-07, "loss": 0.2174, "step": 14780 }, { "epoch": 13.228980322003578, "grad_norm": 1.4733182191848755, "learning_rate": 5.640794223826715e-07, "loss": 0.2357, "step": 14790 }, { "epoch": 13.237924865831843, "grad_norm": 1.4954806566238403, "learning_rate": 5.637785800240674e-07, "loss": 0.2372, "step": 14800 }, { "epoch": 13.246869409660107, "grad_norm": 1.637521505355835, "learning_rate": 5.634777376654633e-07, "loss": 0.2226, "step": 14810 }, { "epoch": 13.255813953488373, "grad_norm": 1.5723708868026733, "learning_rate": 5.631768953068592e-07, "loss": 0.2235, "step": 14820 }, { "epoch": 13.264758497316636, "grad_norm": 1.7516061067581177, "learning_rate": 5.628760529482552e-07, "loss": 0.2335, "step": 14830 }, { "epoch": 13.273703041144902, "grad_norm": 1.581664800643921, "learning_rate": 5.62575210589651e-07, "loss": 0.2274, "step": 14840 }, { "epoch": 13.282647584973166, "grad_norm": 1.4926707744598389, "learning_rate": 5.622743682310469e-07, "loss": 0.2219, "step": 14850 }, { "epoch": 13.291592128801431, "grad_norm": 1.4483482837677002, "learning_rate": 5.619735258724428e-07, "loss": 0.2319, "step": 14860 }, { "epoch": 13.300536672629695, "grad_norm": 1.5603644847869873, "learning_rate": 5.616726835138386e-07, "loss": 0.2221, "step": 14870 }, { "epoch": 13.30948121645796, "grad_norm": 1.8198426961898804, "learning_rate": 5.613718411552346e-07, "loss": 0.2286, "step": 14880 }, { "epoch": 13.318425760286225, "grad_norm": 1.4116787910461426, "learning_rate": 5.610709987966305e-07, "loss": 0.2146, "step": 14890 }, { "epoch": 13.32737030411449, "grad_norm": 1.5447922945022583, "learning_rate": 5.607701564380265e-07, "loss": 0.2247, "step": 14900 }, { "epoch": 13.336314847942756, "grad_norm": 1.5551434755325317, "learning_rate": 5.604693140794223e-07, "loss": 0.231, "step": 14910 }, { "epoch": 13.34525939177102, "grad_norm": 1.7243269681930542, "learning_rate": 5.601684717208183e-07, "loss": 0.2283, "step": 14920 }, { "epoch": 13.354203935599285, "grad_norm": 1.6431998014450073, "learning_rate": 5.598676293622142e-07, "loss": 0.2137, "step": 14930 }, { "epoch": 13.363148479427549, "grad_norm": 1.7388650178909302, "learning_rate": 5.595667870036101e-07, "loss": 0.2377, "step": 14940 }, { "epoch": 13.372093023255815, "grad_norm": 1.2878618240356445, "learning_rate": 5.59265944645006e-07, "loss": 0.2393, "step": 14950 }, { "epoch": 13.381037567084078, "grad_norm": 1.5231084823608398, "learning_rate": 5.58965102286402e-07, "loss": 0.2316, "step": 14960 }, { "epoch": 13.389982110912344, "grad_norm": 1.5280569791793823, "learning_rate": 5.586642599277977e-07, "loss": 0.2186, "step": 14970 }, { "epoch": 13.398926654740608, "grad_norm": 1.4062917232513428, "learning_rate": 5.583634175691937e-07, "loss": 0.2317, "step": 14980 }, { "epoch": 13.407871198568873, "grad_norm": 1.6176033020019531, "learning_rate": 5.580625752105896e-07, "loss": 0.2296, "step": 14990 }, { "epoch": 13.416815742397137, "grad_norm": 1.3912999629974365, "learning_rate": 5.577617328519856e-07, "loss": 0.2316, "step": 15000 }, { "epoch": 13.425760286225403, "grad_norm": 1.6246581077575684, "learning_rate": 5.574608904933814e-07, "loss": 0.2355, "step": 15010 }, { "epoch": 13.434704830053667, "grad_norm": 1.3514788150787354, "learning_rate": 5.571600481347773e-07, "loss": 0.2274, "step": 15020 }, { "epoch": 13.443649373881932, "grad_norm": 1.55750572681427, "learning_rate": 5.568592057761733e-07, "loss": 0.2262, "step": 15030 }, { "epoch": 13.452593917710196, "grad_norm": 1.4673680067062378, "learning_rate": 5.565583634175691e-07, "loss": 0.2322, "step": 15040 }, { "epoch": 13.461538461538462, "grad_norm": 1.9114556312561035, "learning_rate": 5.562575210589651e-07, "loss": 0.2168, "step": 15050 }, { "epoch": 13.470483005366725, "grad_norm": 1.3632735013961792, "learning_rate": 5.55956678700361e-07, "loss": 0.2199, "step": 15060 }, { "epoch": 13.479427549194991, "grad_norm": 1.4581695795059204, "learning_rate": 5.55655836341757e-07, "loss": 0.225, "step": 15070 }, { "epoch": 13.488372093023255, "grad_norm": 1.6625158786773682, "learning_rate": 5.553549939831527e-07, "loss": 0.2325, "step": 15080 }, { "epoch": 13.49731663685152, "grad_norm": 1.43048894405365, "learning_rate": 5.550541516245487e-07, "loss": 0.2253, "step": 15090 }, { "epoch": 13.506261180679786, "grad_norm": 1.6861284971237183, "learning_rate": 5.547533092659446e-07, "loss": 0.222, "step": 15100 }, { "epoch": 13.51520572450805, "grad_norm": 1.7863863706588745, "learning_rate": 5.544524669073405e-07, "loss": 0.2173, "step": 15110 }, { "epoch": 13.524150268336316, "grad_norm": 1.679726481437683, "learning_rate": 5.541516245487364e-07, "loss": 0.2273, "step": 15120 }, { "epoch": 13.53309481216458, "grad_norm": 1.5793452262878418, "learning_rate": 5.538507821901324e-07, "loss": 0.2241, "step": 15130 }, { "epoch": 13.542039355992845, "grad_norm": 1.9857158660888672, "learning_rate": 5.535499398315282e-07, "loss": 0.2256, "step": 15140 }, { "epoch": 13.550983899821109, "grad_norm": 1.6583348512649536, "learning_rate": 5.532490974729242e-07, "loss": 0.2263, "step": 15150 }, { "epoch": 13.559928443649374, "grad_norm": 1.7588074207305908, "learning_rate": 5.529482551143201e-07, "loss": 0.229, "step": 15160 }, { "epoch": 13.568872987477638, "grad_norm": 1.573966145515442, "learning_rate": 5.526474127557161e-07, "loss": 0.2261, "step": 15170 }, { "epoch": 13.577817531305904, "grad_norm": 1.895883321762085, "learning_rate": 5.523465703971119e-07, "loss": 0.222, "step": 15180 }, { "epoch": 13.586762075134168, "grad_norm": 1.5346202850341797, "learning_rate": 5.520457280385077e-07, "loss": 0.2263, "step": 15190 }, { "epoch": 13.595706618962433, "grad_norm": 1.743774175643921, "learning_rate": 5.517448856799037e-07, "loss": 0.2198, "step": 15200 }, { "epoch": 13.604651162790697, "grad_norm": 1.6069917678833008, "learning_rate": 5.514440433212995e-07, "loss": 0.2261, "step": 15210 }, { "epoch": 13.613595706618963, "grad_norm": 1.4142626523971558, "learning_rate": 5.511432009626955e-07, "loss": 0.2206, "step": 15220 }, { "epoch": 13.622540250447226, "grad_norm": 1.695739984512329, "learning_rate": 5.508423586040914e-07, "loss": 0.2306, "step": 15230 }, { "epoch": 13.631484794275492, "grad_norm": 1.6444977521896362, "learning_rate": 5.505415162454874e-07, "loss": 0.229, "step": 15240 }, { "epoch": 13.640429338103758, "grad_norm": 1.3577944040298462, "learning_rate": 5.502406738868832e-07, "loss": 0.2212, "step": 15250 }, { "epoch": 13.649373881932021, "grad_norm": 1.6013015508651733, "learning_rate": 5.499398315282792e-07, "loss": 0.2244, "step": 15260 }, { "epoch": 13.658318425760287, "grad_norm": 1.581383228302002, "learning_rate": 5.496389891696751e-07, "loss": 0.2318, "step": 15270 }, { "epoch": 13.66726296958855, "grad_norm": 1.4835501909255981, "learning_rate": 5.49338146811071e-07, "loss": 0.2114, "step": 15280 }, { "epoch": 13.676207513416816, "grad_norm": 1.5943286418914795, "learning_rate": 5.490373044524669e-07, "loss": 0.2187, "step": 15290 }, { "epoch": 13.68515205724508, "grad_norm": 2.1119771003723145, "learning_rate": 5.487364620938629e-07, "loss": 0.2243, "step": 15300 }, { "epoch": 13.694096601073346, "grad_norm": 1.923747181892395, "learning_rate": 5.484356197352586e-07, "loss": 0.2276, "step": 15310 }, { "epoch": 13.70304114490161, "grad_norm": 1.8697950839996338, "learning_rate": 5.481347773766546e-07, "loss": 0.2199, "step": 15320 }, { "epoch": 13.711985688729875, "grad_norm": 1.5428868532180786, "learning_rate": 5.478339350180505e-07, "loss": 0.2234, "step": 15330 }, { "epoch": 13.720930232558139, "grad_norm": 1.7273482084274292, "learning_rate": 5.475330926594465e-07, "loss": 0.2277, "step": 15340 }, { "epoch": 13.729874776386405, "grad_norm": 1.4095451831817627, "learning_rate": 5.472322503008423e-07, "loss": 0.2252, "step": 15350 }, { "epoch": 13.738819320214668, "grad_norm": 1.2859090566635132, "learning_rate": 5.469314079422383e-07, "loss": 0.2246, "step": 15360 }, { "epoch": 13.747763864042934, "grad_norm": 2.154752254486084, "learning_rate": 5.466305655836342e-07, "loss": 0.2145, "step": 15370 }, { "epoch": 13.756708407871198, "grad_norm": 1.4549967050552368, "learning_rate": 5.4632972322503e-07, "loss": 0.2267, "step": 15380 }, { "epoch": 13.765652951699463, "grad_norm": 1.5076959133148193, "learning_rate": 5.46028880866426e-07, "loss": 0.2286, "step": 15390 }, { "epoch": 13.774597495527727, "grad_norm": 1.331883430480957, "learning_rate": 5.457280385078219e-07, "loss": 0.2102, "step": 15400 }, { "epoch": 13.783542039355993, "grad_norm": 1.76063871383667, "learning_rate": 5.454271961492179e-07, "loss": 0.2262, "step": 15410 }, { "epoch": 13.792486583184257, "grad_norm": 1.3071162700653076, "learning_rate": 5.451263537906136e-07, "loss": 0.2228, "step": 15420 }, { "epoch": 13.801431127012522, "grad_norm": 1.6594287157058716, "learning_rate": 5.448255114320096e-07, "loss": 0.2219, "step": 15430 }, { "epoch": 13.810375670840788, "grad_norm": 1.4981613159179688, "learning_rate": 5.445246690734055e-07, "loss": 0.2246, "step": 15440 }, { "epoch": 13.819320214669052, "grad_norm": 2.1496357917785645, "learning_rate": 5.442238267148014e-07, "loss": 0.2329, "step": 15450 }, { "epoch": 13.828264758497317, "grad_norm": 1.5639127492904663, "learning_rate": 5.439229843561973e-07, "loss": 0.2211, "step": 15460 }, { "epoch": 13.837209302325581, "grad_norm": 1.3146132230758667, "learning_rate": 5.436221419975933e-07, "loss": 0.2098, "step": 15470 }, { "epoch": 13.846153846153847, "grad_norm": 1.4802268743515015, "learning_rate": 5.433212996389891e-07, "loss": 0.2156, "step": 15480 }, { "epoch": 13.85509838998211, "grad_norm": 1.6789542436599731, "learning_rate": 5.430204572803851e-07, "loss": 0.2182, "step": 15490 }, { "epoch": 13.864042933810376, "grad_norm": 1.58533775806427, "learning_rate": 5.42719614921781e-07, "loss": 0.2153, "step": 15500 }, { "epoch": 13.87298747763864, "grad_norm": 1.6321181058883667, "learning_rate": 5.42418772563177e-07, "loss": 0.2185, "step": 15510 }, { "epoch": 13.881932021466906, "grad_norm": 1.8955652713775635, "learning_rate": 5.421179302045727e-07, "loss": 0.223, "step": 15520 }, { "epoch": 13.89087656529517, "grad_norm": 1.695584774017334, "learning_rate": 5.418170878459687e-07, "loss": 0.2254, "step": 15530 }, { "epoch": 13.899821109123435, "grad_norm": 2.4142327308654785, "learning_rate": 5.415162454873646e-07, "loss": 0.2355, "step": 15540 }, { "epoch": 13.908765652951699, "grad_norm": 1.4618829488754272, "learning_rate": 5.412154031287604e-07, "loss": 0.2249, "step": 15550 }, { "epoch": 13.917710196779964, "grad_norm": 2.5095059871673584, "learning_rate": 5.409145607701564e-07, "loss": 0.2154, "step": 15560 }, { "epoch": 13.926654740608228, "grad_norm": 1.5938920974731445, "learning_rate": 5.406137184115523e-07, "loss": 0.2304, "step": 15570 }, { "epoch": 13.935599284436494, "grad_norm": 1.4232935905456543, "learning_rate": 5.403128760529483e-07, "loss": 0.2188, "step": 15580 }, { "epoch": 13.94454382826476, "grad_norm": 1.2760659456253052, "learning_rate": 5.400120336943441e-07, "loss": 0.2173, "step": 15590 }, { "epoch": 13.953488372093023, "grad_norm": 1.420570731163025, "learning_rate": 5.397111913357401e-07, "loss": 0.2174, "step": 15600 }, { "epoch": 13.962432915921289, "grad_norm": 1.5642368793487549, "learning_rate": 5.39410348977136e-07, "loss": 0.2208, "step": 15610 }, { "epoch": 13.971377459749553, "grad_norm": 1.8670076131820679, "learning_rate": 5.391095066185319e-07, "loss": 0.2174, "step": 15620 }, { "epoch": 13.980322003577818, "grad_norm": 1.752993106842041, "learning_rate": 5.388086642599277e-07, "loss": 0.2235, "step": 15630 }, { "epoch": 13.989266547406082, "grad_norm": 1.523247480392456, "learning_rate": 5.385078219013237e-07, "loss": 0.2222, "step": 15640 }, { "epoch": 13.998211091234348, "grad_norm": 1.7766495943069458, "learning_rate": 5.382069795427195e-07, "loss": 0.2152, "step": 15650 }, { "epoch": 14.0, "eval_bleu": 75.7614, "eval_gen_len": 74.7409, "eval_loss": 0.16948764026165009, "eval_runtime": 58.1609, "eval_samples_per_second": 17.916, "eval_steps_per_second": 0.189, "step": 15652 }, { "epoch": 14.007155635062611, "grad_norm": 1.6034523248672485, "learning_rate": 5.379061371841155e-07, "loss": 0.1954, "step": 15660 }, { "epoch": 14.016100178890877, "grad_norm": 1.594677209854126, "learning_rate": 5.376052948255114e-07, "loss": 0.2191, "step": 15670 }, { "epoch": 14.02504472271914, "grad_norm": 1.6265920400619507, "learning_rate": 5.373044524669074e-07, "loss": 0.2293, "step": 15680 }, { "epoch": 14.033989266547406, "grad_norm": 1.4514870643615723, "learning_rate": 5.370036101083032e-07, "loss": 0.2187, "step": 15690 }, { "epoch": 14.04293381037567, "grad_norm": 1.4903329610824585, "learning_rate": 5.367027677496992e-07, "loss": 0.222, "step": 15700 }, { "epoch": 14.051878354203936, "grad_norm": 1.57286536693573, "learning_rate": 5.364019253910951e-07, "loss": 0.2299, "step": 15710 }, { "epoch": 14.0608228980322, "grad_norm": 1.7294665575027466, "learning_rate": 5.361010830324909e-07, "loss": 0.2163, "step": 15720 }, { "epoch": 14.069767441860465, "grad_norm": 1.366902232170105, "learning_rate": 5.358002406738869e-07, "loss": 0.2179, "step": 15730 }, { "epoch": 14.078711985688729, "grad_norm": 1.6594551801681519, "learning_rate": 5.354993983152827e-07, "loss": 0.2259, "step": 15740 }, { "epoch": 14.087656529516995, "grad_norm": 1.7197548151016235, "learning_rate": 5.351985559566786e-07, "loss": 0.2286, "step": 15750 }, { "epoch": 14.09660107334526, "grad_norm": 1.67202627658844, "learning_rate": 5.348977135980745e-07, "loss": 0.2189, "step": 15760 }, { "epoch": 14.105545617173524, "grad_norm": 1.654624342918396, "learning_rate": 5.345968712394705e-07, "loss": 0.2223, "step": 15770 }, { "epoch": 14.11449016100179, "grad_norm": 1.4540494680404663, "learning_rate": 5.342960288808664e-07, "loss": 0.2154, "step": 15780 }, { "epoch": 14.123434704830053, "grad_norm": 1.5942274332046509, "learning_rate": 5.339951865222623e-07, "loss": 0.2292, "step": 15790 }, { "epoch": 14.132379248658319, "grad_norm": 1.457651972770691, "learning_rate": 5.336943441636582e-07, "loss": 0.2127, "step": 15800 }, { "epoch": 14.141323792486583, "grad_norm": 1.7262177467346191, "learning_rate": 5.333935018050542e-07, "loss": 0.2226, "step": 15810 }, { "epoch": 14.150268336314848, "grad_norm": 1.687378168106079, "learning_rate": 5.3309265944645e-07, "loss": 0.2161, "step": 15820 }, { "epoch": 14.159212880143112, "grad_norm": 1.799683928489685, "learning_rate": 5.32791817087846e-07, "loss": 0.2109, "step": 15830 }, { "epoch": 14.168157423971378, "grad_norm": 1.6736897230148315, "learning_rate": 5.324909747292419e-07, "loss": 0.2216, "step": 15840 }, { "epoch": 14.177101967799642, "grad_norm": 1.7621829509735107, "learning_rate": 5.321901323706379e-07, "loss": 0.2123, "step": 15850 }, { "epoch": 14.186046511627907, "grad_norm": 1.7803364992141724, "learning_rate": 5.318892900120336e-07, "loss": 0.2205, "step": 15860 }, { "epoch": 14.194991055456171, "grad_norm": 1.577661395072937, "learning_rate": 5.315884476534296e-07, "loss": 0.2099, "step": 15870 }, { "epoch": 14.203935599284437, "grad_norm": 1.684728980064392, "learning_rate": 5.312876052948255e-07, "loss": 0.2161, "step": 15880 }, { "epoch": 14.2128801431127, "grad_norm": 1.5282610654830933, "learning_rate": 5.309867629362213e-07, "loss": 0.2246, "step": 15890 }, { "epoch": 14.221824686940966, "grad_norm": 1.5472359657287598, "learning_rate": 5.306859205776173e-07, "loss": 0.216, "step": 15900 }, { "epoch": 14.23076923076923, "grad_norm": 1.4690499305725098, "learning_rate": 5.303850782190132e-07, "loss": 0.2126, "step": 15910 }, { "epoch": 14.239713774597496, "grad_norm": 1.7932279109954834, "learning_rate": 5.300842358604091e-07, "loss": 0.2184, "step": 15920 }, { "epoch": 14.248658318425761, "grad_norm": 1.4595445394515991, "learning_rate": 5.29783393501805e-07, "loss": 0.2213, "step": 15930 }, { "epoch": 14.257602862254025, "grad_norm": 2.5874969959259033, "learning_rate": 5.29482551143201e-07, "loss": 0.2221, "step": 15940 }, { "epoch": 14.26654740608229, "grad_norm": 1.4573915004730225, "learning_rate": 5.291817087845969e-07, "loss": 0.2122, "step": 15950 }, { "epoch": 14.275491949910554, "grad_norm": 1.4587106704711914, "learning_rate": 5.288808664259927e-07, "loss": 0.2107, "step": 15960 }, { "epoch": 14.28443649373882, "grad_norm": 2.2001304626464844, "learning_rate": 5.285800240673886e-07, "loss": 0.2255, "step": 15970 }, { "epoch": 14.293381037567084, "grad_norm": 1.332115650177002, "learning_rate": 5.282791817087846e-07, "loss": 0.219, "step": 15980 }, { "epoch": 14.30232558139535, "grad_norm": 1.5230737924575806, "learning_rate": 5.279783393501804e-07, "loss": 0.2204, "step": 15990 }, { "epoch": 14.311270125223613, "grad_norm": 1.4917973279953003, "learning_rate": 5.276774969915764e-07, "loss": 0.2182, "step": 16000 }, { "epoch": 14.320214669051879, "grad_norm": 1.415745496749878, "learning_rate": 5.273766546329723e-07, "loss": 0.2123, "step": 16010 }, { "epoch": 14.329159212880143, "grad_norm": 1.3475311994552612, "learning_rate": 5.270758122743683e-07, "loss": 0.2265, "step": 16020 }, { "epoch": 14.338103756708408, "grad_norm": 1.4397448301315308, "learning_rate": 5.267749699157641e-07, "loss": 0.2181, "step": 16030 }, { "epoch": 14.347048300536672, "grad_norm": 1.4729828834533691, "learning_rate": 5.264741275571601e-07, "loss": 0.2118, "step": 16040 }, { "epoch": 14.355992844364938, "grad_norm": 1.548867106437683, "learning_rate": 5.26173285198556e-07, "loss": 0.2201, "step": 16050 }, { "epoch": 14.364937388193201, "grad_norm": 1.7978005409240723, "learning_rate": 5.258724428399518e-07, "loss": 0.216, "step": 16060 }, { "epoch": 14.373881932021467, "grad_norm": 1.3418782949447632, "learning_rate": 5.255716004813477e-07, "loss": 0.2106, "step": 16070 }, { "epoch": 14.38282647584973, "grad_norm": 1.7105077505111694, "learning_rate": 5.252707581227436e-07, "loss": 0.2216, "step": 16080 }, { "epoch": 14.391771019677996, "grad_norm": 1.3112951517105103, "learning_rate": 5.249699157641395e-07, "loss": 0.2117, "step": 16090 }, { "epoch": 14.400715563506262, "grad_norm": 1.4080175161361694, "learning_rate": 5.246690734055354e-07, "loss": 0.2246, "step": 16100 }, { "epoch": 14.409660107334526, "grad_norm": 1.5949844121932983, "learning_rate": 5.243682310469314e-07, "loss": 0.2108, "step": 16110 }, { "epoch": 14.418604651162791, "grad_norm": 1.4474152326583862, "learning_rate": 5.240673886883273e-07, "loss": 0.2204, "step": 16120 }, { "epoch": 14.427549194991055, "grad_norm": 1.3992246389389038, "learning_rate": 5.237665463297232e-07, "loss": 0.2125, "step": 16130 }, { "epoch": 14.43649373881932, "grad_norm": 1.6255208253860474, "learning_rate": 5.234657039711191e-07, "loss": 0.2149, "step": 16140 }, { "epoch": 14.445438282647585, "grad_norm": 1.5684301853179932, "learning_rate": 5.231648616125151e-07, "loss": 0.2147, "step": 16150 }, { "epoch": 14.45438282647585, "grad_norm": 1.4556910991668701, "learning_rate": 5.228640192539109e-07, "loss": 0.2263, "step": 16160 }, { "epoch": 14.463327370304114, "grad_norm": 1.5484651327133179, "learning_rate": 5.225631768953069e-07, "loss": 0.2193, "step": 16170 }, { "epoch": 14.47227191413238, "grad_norm": 1.8516258001327515, "learning_rate": 5.222623345367027e-07, "loss": 0.2235, "step": 16180 }, { "epoch": 14.481216457960643, "grad_norm": 1.7158583402633667, "learning_rate": 5.219614921780987e-07, "loss": 0.22, "step": 16190 }, { "epoch": 14.490161001788909, "grad_norm": 1.575059413909912, "learning_rate": 5.216606498194945e-07, "loss": 0.2194, "step": 16200 }, { "epoch": 14.499105545617173, "grad_norm": 1.519271731376648, "learning_rate": 5.213598074608905e-07, "loss": 0.2137, "step": 16210 }, { "epoch": 14.508050089445439, "grad_norm": 1.4205260276794434, "learning_rate": 5.210589651022864e-07, "loss": 0.2228, "step": 16220 }, { "epoch": 14.516994633273702, "grad_norm": 1.2977843284606934, "learning_rate": 5.207581227436822e-07, "loss": 0.2162, "step": 16230 }, { "epoch": 14.525939177101968, "grad_norm": 1.5938493013381958, "learning_rate": 5.204572803850782e-07, "loss": 0.2092, "step": 16240 }, { "epoch": 14.534883720930232, "grad_norm": 1.624735713005066, "learning_rate": 5.201564380264741e-07, "loss": 0.2267, "step": 16250 }, { "epoch": 14.543828264758497, "grad_norm": 1.8016725778579712, "learning_rate": 5.1985559566787e-07, "loss": 0.2169, "step": 16260 }, { "epoch": 14.552772808586763, "grad_norm": 1.4361350536346436, "learning_rate": 5.195547533092659e-07, "loss": 0.2109, "step": 16270 }, { "epoch": 14.561717352415027, "grad_norm": 1.4204323291778564, "learning_rate": 5.192539109506619e-07, "loss": 0.2088, "step": 16280 }, { "epoch": 14.570661896243292, "grad_norm": 1.6258000135421753, "learning_rate": 5.189530685920577e-07, "loss": 0.2079, "step": 16290 }, { "epoch": 14.579606440071556, "grad_norm": 1.4437638521194458, "learning_rate": 5.186522262334536e-07, "loss": 0.221, "step": 16300 }, { "epoch": 14.588550983899822, "grad_norm": 1.4674285650253296, "learning_rate": 5.183513838748495e-07, "loss": 0.2204, "step": 16310 }, { "epoch": 14.597495527728086, "grad_norm": 1.5488989353179932, "learning_rate": 5.180505415162455e-07, "loss": 0.2269, "step": 16320 }, { "epoch": 14.606440071556351, "grad_norm": 1.4164832830429077, "learning_rate": 5.177496991576413e-07, "loss": 0.2116, "step": 16330 }, { "epoch": 14.615384615384615, "grad_norm": 1.3262627124786377, "learning_rate": 5.174488567990373e-07, "loss": 0.2127, "step": 16340 }, { "epoch": 14.62432915921288, "grad_norm": 1.4488004446029663, "learning_rate": 5.171480144404332e-07, "loss": 0.2246, "step": 16350 }, { "epoch": 14.633273703041144, "grad_norm": 1.6791784763336182, "learning_rate": 5.168471720818292e-07, "loss": 0.2244, "step": 16360 }, { "epoch": 14.64221824686941, "grad_norm": 1.7339402437210083, "learning_rate": 5.16546329723225e-07, "loss": 0.2066, "step": 16370 }, { "epoch": 14.651162790697674, "grad_norm": 1.5072243213653564, "learning_rate": 5.16245487364621e-07, "loss": 0.2042, "step": 16380 }, { "epoch": 14.66010733452594, "grad_norm": 1.5121406316757202, "learning_rate": 5.159446450060169e-07, "loss": 0.2077, "step": 16390 }, { "epoch": 14.669051878354203, "grad_norm": 1.6104305982589722, "learning_rate": 5.156438026474126e-07, "loss": 0.216, "step": 16400 }, { "epoch": 14.677996422182469, "grad_norm": 1.7308858633041382, "learning_rate": 5.153429602888086e-07, "loss": 0.2214, "step": 16410 }, { "epoch": 14.686940966010733, "grad_norm": 1.368651270866394, "learning_rate": 5.150421179302045e-07, "loss": 0.2125, "step": 16420 }, { "epoch": 14.695885509838998, "grad_norm": 1.5493015050888062, "learning_rate": 5.147412755716004e-07, "loss": 0.2132, "step": 16430 }, { "epoch": 14.704830053667262, "grad_norm": 1.538008451461792, "learning_rate": 5.144404332129963e-07, "loss": 0.2087, "step": 16440 }, { "epoch": 14.713774597495528, "grad_norm": 1.774583101272583, "learning_rate": 5.141395908543923e-07, "loss": 0.2137, "step": 16450 }, { "epoch": 14.722719141323793, "grad_norm": 1.5929735898971558, "learning_rate": 5.138387484957882e-07, "loss": 0.2266, "step": 16460 }, { "epoch": 14.731663685152057, "grad_norm": 1.4200128316879272, "learning_rate": 5.135379061371841e-07, "loss": 0.2239, "step": 16470 }, { "epoch": 14.740608228980323, "grad_norm": 2.4553580284118652, "learning_rate": 5.1323706377858e-07, "loss": 0.2186, "step": 16480 }, { "epoch": 14.749552772808586, "grad_norm": 1.5825417041778564, "learning_rate": 5.12936221419976e-07, "loss": 0.2183, "step": 16490 }, { "epoch": 14.758497316636852, "grad_norm": 2.1846418380737305, "learning_rate": 5.126353790613718e-07, "loss": 0.2109, "step": 16500 }, { "epoch": 14.767441860465116, "grad_norm": 1.337199091911316, "learning_rate": 5.123345367027678e-07, "loss": 0.2029, "step": 16510 }, { "epoch": 14.776386404293381, "grad_norm": 1.4627991914749146, "learning_rate": 5.120336943441636e-07, "loss": 0.213, "step": 16520 }, { "epoch": 14.785330948121645, "grad_norm": 1.2725661993026733, "learning_rate": 5.117328519855595e-07, "loss": 0.2104, "step": 16530 }, { "epoch": 14.79427549194991, "grad_norm": 1.5113191604614258, "learning_rate": 5.114320096269554e-07, "loss": 0.2164, "step": 16540 }, { "epoch": 14.803220035778175, "grad_norm": 1.4444341659545898, "learning_rate": 5.111311672683514e-07, "loss": 0.218, "step": 16550 }, { "epoch": 14.81216457960644, "grad_norm": 1.361047625541687, "learning_rate": 5.108303249097473e-07, "loss": 0.2149, "step": 16560 }, { "epoch": 14.821109123434704, "grad_norm": 1.663375973701477, "learning_rate": 5.105294825511431e-07, "loss": 0.2115, "step": 16570 }, { "epoch": 14.83005366726297, "grad_norm": 1.6099789142608643, "learning_rate": 5.102286401925391e-07, "loss": 0.2151, "step": 16580 }, { "epoch": 14.838998211091234, "grad_norm": 1.769354224205017, "learning_rate": 5.09927797833935e-07, "loss": 0.2201, "step": 16590 }, { "epoch": 14.847942754919499, "grad_norm": 1.6463778018951416, "learning_rate": 5.096269554753309e-07, "loss": 0.2144, "step": 16600 }, { "epoch": 14.856887298747765, "grad_norm": 1.7376254796981812, "learning_rate": 5.093261131167268e-07, "loss": 0.2196, "step": 16610 }, { "epoch": 14.865831842576029, "grad_norm": 2.0024073123931885, "learning_rate": 5.090252707581228e-07, "loss": 0.2124, "step": 16620 }, { "epoch": 14.874776386404294, "grad_norm": 1.7807821035385132, "learning_rate": 5.087244283995186e-07, "loss": 0.2132, "step": 16630 }, { "epoch": 14.883720930232558, "grad_norm": 1.5845977067947388, "learning_rate": 5.084235860409145e-07, "loss": 0.224, "step": 16640 }, { "epoch": 14.892665474060824, "grad_norm": 1.4428369998931885, "learning_rate": 5.081227436823104e-07, "loss": 0.2047, "step": 16650 }, { "epoch": 14.901610017889087, "grad_norm": 1.8869410753250122, "learning_rate": 5.078219013237064e-07, "loss": 0.2116, "step": 16660 }, { "epoch": 14.910554561717353, "grad_norm": 1.3606252670288086, "learning_rate": 5.075210589651022e-07, "loss": 0.2137, "step": 16670 }, { "epoch": 14.919499105545617, "grad_norm": 1.5670171976089478, "learning_rate": 5.072202166064982e-07, "loss": 0.205, "step": 16680 }, { "epoch": 14.928443649373882, "grad_norm": 2.256553888320923, "learning_rate": 5.069193742478941e-07, "loss": 0.2021, "step": 16690 }, { "epoch": 14.937388193202146, "grad_norm": 1.7542644739151, "learning_rate": 5.0661853188929e-07, "loss": 0.2003, "step": 16700 }, { "epoch": 14.946332737030412, "grad_norm": 1.4539682865142822, "learning_rate": 5.063176895306859e-07, "loss": 0.2116, "step": 16710 }, { "epoch": 14.955277280858676, "grad_norm": 1.3423739671707153, "learning_rate": 5.060168471720819e-07, "loss": 0.2261, "step": 16720 }, { "epoch": 14.964221824686941, "grad_norm": 1.2797144651412964, "learning_rate": 5.057160048134778e-07, "loss": 0.2113, "step": 16730 }, { "epoch": 14.973166368515205, "grad_norm": 1.4044950008392334, "learning_rate": 5.054151624548736e-07, "loss": 0.2081, "step": 16740 }, { "epoch": 14.98211091234347, "grad_norm": 1.540293574333191, "learning_rate": 5.051143200962695e-07, "loss": 0.212, "step": 16750 }, { "epoch": 14.991055456171736, "grad_norm": 1.5579257011413574, "learning_rate": 5.048134777376654e-07, "loss": 0.2068, "step": 16760 }, { "epoch": 15.0, "grad_norm": 2.3970136642456055, "learning_rate": 5.045126353790613e-07, "loss": 0.2078, "step": 16770 }, { "epoch": 15.0, "eval_bleu": 76.2353, "eval_gen_len": 74.7092, "eval_loss": 0.1641465276479721, "eval_runtime": 55.922, "eval_samples_per_second": 18.633, "eval_steps_per_second": 0.197, "step": 16770 }, { "epoch": 15.008944543828266, "grad_norm": 1.358554720878601, "learning_rate": 5.042117930204572e-07, "loss": 0.208, "step": 16780 }, { "epoch": 15.01788908765653, "grad_norm": 1.2903168201446533, "learning_rate": 5.039109506618532e-07, "loss": 0.208, "step": 16790 }, { "epoch": 15.026833631484795, "grad_norm": 1.5758551359176636, "learning_rate": 5.036101083032491e-07, "loss": 0.209, "step": 16800 }, { "epoch": 15.035778175313059, "grad_norm": 1.582900047302246, "learning_rate": 5.03309265944645e-07, "loss": 0.2205, "step": 16810 }, { "epoch": 15.044722719141324, "grad_norm": 1.4756799936294556, "learning_rate": 5.030084235860409e-07, "loss": 0.2061, "step": 16820 }, { "epoch": 15.053667262969588, "grad_norm": 1.690066933631897, "learning_rate": 5.027075812274369e-07, "loss": 0.2104, "step": 16830 }, { "epoch": 15.062611806797854, "grad_norm": 1.670901894569397, "learning_rate": 5.024067388688326e-07, "loss": 0.2067, "step": 16840 }, { "epoch": 15.071556350626118, "grad_norm": 1.7854502201080322, "learning_rate": 5.021058965102286e-07, "loss": 0.2014, "step": 16850 }, { "epoch": 15.080500894454383, "grad_norm": 1.5761840343475342, "learning_rate": 5.018050541516245e-07, "loss": 0.2229, "step": 16860 }, { "epoch": 15.089445438282647, "grad_norm": 1.319814920425415, "learning_rate": 5.015042117930204e-07, "loss": 0.2138, "step": 16870 }, { "epoch": 15.098389982110913, "grad_norm": 1.3874000310897827, "learning_rate": 5.012033694344163e-07, "loss": 0.2049, "step": 16880 }, { "epoch": 15.107334525939176, "grad_norm": 1.3627005815505981, "learning_rate": 5.009025270758123e-07, "loss": 0.2059, "step": 16890 }, { "epoch": 15.116279069767442, "grad_norm": 1.7481482028961182, "learning_rate": 5.006016847172082e-07, "loss": 0.2112, "step": 16900 }, { "epoch": 15.125223613595706, "grad_norm": 1.55675208568573, "learning_rate": 5.003008423586041e-07, "loss": 0.2068, "step": 16910 }, { "epoch": 15.134168157423971, "grad_norm": 1.743762731552124, "learning_rate": 5e-07, "loss": 0.211, "step": 16920 }, { "epoch": 15.143112701252235, "grad_norm": 1.395524501800537, "learning_rate": 4.996991576413959e-07, "loss": 0.2188, "step": 16930 }, { "epoch": 15.152057245080501, "grad_norm": 1.6198045015335083, "learning_rate": 4.993983152827918e-07, "loss": 0.2048, "step": 16940 }, { "epoch": 15.161001788908766, "grad_norm": 1.6451237201690674, "learning_rate": 4.990974729241876e-07, "loss": 0.2193, "step": 16950 }, { "epoch": 15.16994633273703, "grad_norm": 2.0471298694610596, "learning_rate": 4.987966305655836e-07, "loss": 0.2117, "step": 16960 }, { "epoch": 15.178890876565296, "grad_norm": 1.4981588125228882, "learning_rate": 4.984957882069795e-07, "loss": 0.2121, "step": 16970 }, { "epoch": 15.18783542039356, "grad_norm": 1.6169943809509277, "learning_rate": 4.981949458483754e-07, "loss": 0.2198, "step": 16980 }, { "epoch": 15.196779964221825, "grad_norm": 1.5530139207839966, "learning_rate": 4.978941034897713e-07, "loss": 0.2116, "step": 16990 }, { "epoch": 15.20572450805009, "grad_norm": 1.7657912969589233, "learning_rate": 4.975932611311672e-07, "loss": 0.2164, "step": 17000 }, { "epoch": 15.214669051878355, "grad_norm": 1.5685510635375977, "learning_rate": 4.972924187725632e-07, "loss": 0.2272, "step": 17010 }, { "epoch": 15.223613595706619, "grad_norm": 1.2543871402740479, "learning_rate": 4.969915764139591e-07, "loss": 0.2058, "step": 17020 }, { "epoch": 15.232558139534884, "grad_norm": 1.641847848892212, "learning_rate": 4.96690734055355e-07, "loss": 0.2097, "step": 17030 }, { "epoch": 15.241502683363148, "grad_norm": 2.023759365081787, "learning_rate": 4.963898916967509e-07, "loss": 0.2152, "step": 17040 }, { "epoch": 15.250447227191414, "grad_norm": 1.667617917060852, "learning_rate": 4.960890493381468e-07, "loss": 0.2129, "step": 17050 }, { "epoch": 15.259391771019677, "grad_norm": 3.800983428955078, "learning_rate": 4.957882069795428e-07, "loss": 0.2103, "step": 17060 }, { "epoch": 15.268336314847943, "grad_norm": 1.4913359880447388, "learning_rate": 4.954873646209386e-07, "loss": 0.2056, "step": 17070 }, { "epoch": 15.277280858676207, "grad_norm": 1.730721116065979, "learning_rate": 4.951865222623345e-07, "loss": 0.2079, "step": 17080 }, { "epoch": 15.286225402504472, "grad_norm": 1.3525733947753906, "learning_rate": 4.948856799037304e-07, "loss": 0.2154, "step": 17090 }, { "epoch": 15.295169946332736, "grad_norm": 1.395639181137085, "learning_rate": 4.945848375451263e-07, "loss": 0.2089, "step": 17100 }, { "epoch": 15.304114490161002, "grad_norm": 1.5808838605880737, "learning_rate": 4.942839951865222e-07, "loss": 0.2113, "step": 17110 }, { "epoch": 15.313059033989267, "grad_norm": 1.6018389463424683, "learning_rate": 4.939831528279181e-07, "loss": 0.2196, "step": 17120 }, { "epoch": 15.322003577817531, "grad_norm": 1.5155175924301147, "learning_rate": 4.936823104693141e-07, "loss": 0.2124, "step": 17130 }, { "epoch": 15.330948121645797, "grad_norm": 1.5393834114074707, "learning_rate": 4.9338146811071e-07, "loss": 0.2149, "step": 17140 }, { "epoch": 15.33989266547406, "grad_norm": 1.7581160068511963, "learning_rate": 4.930806257521059e-07, "loss": 0.2066, "step": 17150 }, { "epoch": 15.348837209302326, "grad_norm": 1.5426652431488037, "learning_rate": 4.927797833935018e-07, "loss": 0.2133, "step": 17160 }, { "epoch": 15.35778175313059, "grad_norm": 1.506504774093628, "learning_rate": 4.924789410348976e-07, "loss": 0.2201, "step": 17170 }, { "epoch": 15.366726296958856, "grad_norm": 1.3995765447616577, "learning_rate": 4.921780986762936e-07, "loss": 0.2069, "step": 17180 }, { "epoch": 15.37567084078712, "grad_norm": 1.7154018878936768, "learning_rate": 4.918772563176895e-07, "loss": 0.2123, "step": 17190 }, { "epoch": 15.384615384615385, "grad_norm": 1.425632119178772, "learning_rate": 4.915764139590854e-07, "loss": 0.2014, "step": 17200 }, { "epoch": 15.393559928443649, "grad_norm": 1.711829423904419, "learning_rate": 4.912755716004813e-07, "loss": 0.2097, "step": 17210 }, { "epoch": 15.402504472271914, "grad_norm": 1.620375394821167, "learning_rate": 4.909747292418772e-07, "loss": 0.2028, "step": 17220 }, { "epoch": 15.411449016100178, "grad_norm": 1.46085786819458, "learning_rate": 4.906738868832732e-07, "loss": 0.2123, "step": 17230 }, { "epoch": 15.420393559928444, "grad_norm": 1.2808018922805786, "learning_rate": 4.903730445246691e-07, "loss": 0.2187, "step": 17240 }, { "epoch": 15.429338103756708, "grad_norm": 1.4686278104782104, "learning_rate": 4.90072202166065e-07, "loss": 0.205, "step": 17250 }, { "epoch": 15.438282647584973, "grad_norm": 1.3774062395095825, "learning_rate": 4.897713598074609e-07, "loss": 0.2052, "step": 17260 }, { "epoch": 15.447227191413237, "grad_norm": 1.7220516204833984, "learning_rate": 4.894705174488568e-07, "loss": 0.2088, "step": 17270 }, { "epoch": 15.456171735241503, "grad_norm": 1.4161098003387451, "learning_rate": 4.891696750902526e-07, "loss": 0.2077, "step": 17280 }, { "epoch": 15.465116279069768, "grad_norm": 1.5691733360290527, "learning_rate": 4.888688327316485e-07, "loss": 0.2059, "step": 17290 }, { "epoch": 15.474060822898032, "grad_norm": 1.6341298818588257, "learning_rate": 4.885679903730445e-07, "loss": 0.2114, "step": 17300 }, { "epoch": 15.483005366726298, "grad_norm": 1.7802695035934448, "learning_rate": 4.882671480144404e-07, "loss": 0.2071, "step": 17310 }, { "epoch": 15.491949910554561, "grad_norm": 1.389773964881897, "learning_rate": 4.879663056558363e-07, "loss": 0.2093, "step": 17320 }, { "epoch": 15.500894454382827, "grad_norm": 1.3430007696151733, "learning_rate": 4.876654632972322e-07, "loss": 0.2018, "step": 17330 }, { "epoch": 15.509838998211091, "grad_norm": 1.5305284261703491, "learning_rate": 4.873646209386281e-07, "loss": 0.2018, "step": 17340 }, { "epoch": 15.518783542039357, "grad_norm": 1.3044167757034302, "learning_rate": 4.870637785800241e-07, "loss": 0.2153, "step": 17350 }, { "epoch": 15.52772808586762, "grad_norm": 1.5357669591903687, "learning_rate": 4.8676293622142e-07, "loss": 0.2068, "step": 17360 }, { "epoch": 15.536672629695886, "grad_norm": 1.3363947868347168, "learning_rate": 4.864620938628159e-07, "loss": 0.1996, "step": 17370 }, { "epoch": 15.54561717352415, "grad_norm": 1.579582929611206, "learning_rate": 4.861612515042118e-07, "loss": 0.2071, "step": 17380 }, { "epoch": 15.554561717352415, "grad_norm": 1.605961799621582, "learning_rate": 4.858604091456076e-07, "loss": 0.2046, "step": 17390 }, { "epoch": 15.56350626118068, "grad_norm": 1.5769977569580078, "learning_rate": 4.855595667870036e-07, "loss": 0.2158, "step": 17400 }, { "epoch": 15.572450805008945, "grad_norm": 1.3805499076843262, "learning_rate": 4.852587244283995e-07, "loss": 0.2031, "step": 17410 }, { "epoch": 15.581395348837209, "grad_norm": 1.209307312965393, "learning_rate": 4.849578820697954e-07, "loss": 0.2177, "step": 17420 }, { "epoch": 15.590339892665474, "grad_norm": 1.6182724237442017, "learning_rate": 4.846570397111913e-07, "loss": 0.2149, "step": 17430 }, { "epoch": 15.59928443649374, "grad_norm": 1.6751196384429932, "learning_rate": 4.843561973525872e-07, "loss": 0.206, "step": 17440 }, { "epoch": 15.608228980322004, "grad_norm": 1.4189718961715698, "learning_rate": 4.840553549939831e-07, "loss": 0.21, "step": 17450 }, { "epoch": 15.61717352415027, "grad_norm": 1.3712016344070435, "learning_rate": 4.83754512635379e-07, "loss": 0.2078, "step": 17460 }, { "epoch": 15.626118067978533, "grad_norm": 1.6174589395523071, "learning_rate": 4.83453670276775e-07, "loss": 0.2038, "step": 17470 }, { "epoch": 15.635062611806799, "grad_norm": 1.3036216497421265, "learning_rate": 4.831528279181709e-07, "loss": 0.2121, "step": 17480 }, { "epoch": 15.644007155635062, "grad_norm": 1.3612110614776611, "learning_rate": 4.828519855595668e-07, "loss": 0.2214, "step": 17490 }, { "epoch": 15.652951699463328, "grad_norm": 1.6744191646575928, "learning_rate": 4.825511432009626e-07, "loss": 0.2136, "step": 17500 }, { "epoch": 15.661896243291592, "grad_norm": 1.761144757270813, "learning_rate": 4.822503008423585e-07, "loss": 0.2136, "step": 17510 }, { "epoch": 15.670840787119857, "grad_norm": 1.4766287803649902, "learning_rate": 4.819494584837545e-07, "loss": 0.2097, "step": 17520 }, { "epoch": 15.679785330948121, "grad_norm": 1.473301887512207, "learning_rate": 4.816486161251504e-07, "loss": 0.2055, "step": 17530 }, { "epoch": 15.688729874776387, "grad_norm": 1.5784001350402832, "learning_rate": 4.813477737665463e-07, "loss": 0.215, "step": 17540 }, { "epoch": 15.69767441860465, "grad_norm": 1.6430200338363647, "learning_rate": 4.810469314079422e-07, "loss": 0.1925, "step": 17550 }, { "epoch": 15.706618962432916, "grad_norm": 1.4598944187164307, "learning_rate": 4.807460890493381e-07, "loss": 0.2028, "step": 17560 }, { "epoch": 15.71556350626118, "grad_norm": 1.5248929262161255, "learning_rate": 4.804452466907341e-07, "loss": 0.2104, "step": 17570 }, { "epoch": 15.724508050089446, "grad_norm": 1.4362632036209106, "learning_rate": 4.8014440433213e-07, "loss": 0.2094, "step": 17580 }, { "epoch": 15.73345259391771, "grad_norm": 1.6463420391082764, "learning_rate": 4.798435619735259e-07, "loss": 0.2037, "step": 17590 }, { "epoch": 15.742397137745975, "grad_norm": 1.5646806955337524, "learning_rate": 4.795427196149218e-07, "loss": 0.2107, "step": 17600 }, { "epoch": 15.751341681574239, "grad_norm": 1.5375139713287354, "learning_rate": 4.792418772563177e-07, "loss": 0.2101, "step": 17610 }, { "epoch": 15.760286225402504, "grad_norm": 1.3467087745666504, "learning_rate": 4.789410348977135e-07, "loss": 0.2035, "step": 17620 }, { "epoch": 15.76923076923077, "grad_norm": 1.3728797435760498, "learning_rate": 4.786401925391094e-07, "loss": 0.2005, "step": 17630 }, { "epoch": 15.778175313059034, "grad_norm": 1.9395620822906494, "learning_rate": 4.783393501805054e-07, "loss": 0.2093, "step": 17640 }, { "epoch": 15.7871198568873, "grad_norm": 1.927126407623291, "learning_rate": 4.780385078219013e-07, "loss": 0.1979, "step": 17650 }, { "epoch": 15.796064400715563, "grad_norm": 1.429348349571228, "learning_rate": 4.777376654632972e-07, "loss": 0.212, "step": 17660 }, { "epoch": 15.805008944543829, "grad_norm": 1.7249642610549927, "learning_rate": 4.774368231046931e-07, "loss": 0.2155, "step": 17670 }, { "epoch": 15.813953488372093, "grad_norm": 1.4198932647705078, "learning_rate": 4.77135980746089e-07, "loss": 0.2043, "step": 17680 }, { "epoch": 15.822898032200358, "grad_norm": 1.4026259183883667, "learning_rate": 4.76835138387485e-07, "loss": 0.2094, "step": 17690 }, { "epoch": 15.831842576028622, "grad_norm": 1.456063151359558, "learning_rate": 4.765342960288808e-07, "loss": 0.2019, "step": 17700 }, { "epoch": 15.840787119856888, "grad_norm": 1.901902437210083, "learning_rate": 4.762334536702767e-07, "loss": 0.2112, "step": 17710 }, { "epoch": 15.849731663685152, "grad_norm": 1.650774359703064, "learning_rate": 4.7593261131167265e-07, "loss": 0.2015, "step": 17720 }, { "epoch": 15.858676207513417, "grad_norm": 1.569238543510437, "learning_rate": 4.7563176895306854e-07, "loss": 0.2073, "step": 17730 }, { "epoch": 15.867620751341681, "grad_norm": 1.3177183866500854, "learning_rate": 4.753309265944645e-07, "loss": 0.2041, "step": 17740 }, { "epoch": 15.876565295169947, "grad_norm": 1.7612563371658325, "learning_rate": 4.750300842358604e-07, "loss": 0.2172, "step": 17750 }, { "epoch": 15.88550983899821, "grad_norm": 1.9814246892929077, "learning_rate": 4.7472924187725626e-07, "loss": 0.2067, "step": 17760 }, { "epoch": 15.894454382826476, "grad_norm": 1.4732369184494019, "learning_rate": 4.744283995186522e-07, "loss": 0.2022, "step": 17770 }, { "epoch": 15.903398926654742, "grad_norm": 1.6204142570495605, "learning_rate": 4.741275571600481e-07, "loss": 0.1983, "step": 17780 }, { "epoch": 15.912343470483005, "grad_norm": 1.36124849319458, "learning_rate": 4.7382671480144404e-07, "loss": 0.1996, "step": 17790 }, { "epoch": 15.921288014311271, "grad_norm": 1.7553470134735107, "learning_rate": 4.7352587244283993e-07, "loss": 0.2141, "step": 17800 }, { "epoch": 15.930232558139535, "grad_norm": 1.593579888343811, "learning_rate": 4.7322503008423587e-07, "loss": 0.204, "step": 17810 }, { "epoch": 15.9391771019678, "grad_norm": 1.376990556716919, "learning_rate": 4.7292418772563176e-07, "loss": 0.2125, "step": 17820 }, { "epoch": 15.948121645796064, "grad_norm": 1.6332093477249146, "learning_rate": 4.7262334536702765e-07, "loss": 0.2081, "step": 17830 }, { "epoch": 15.95706618962433, "grad_norm": 1.4526787996292114, "learning_rate": 4.723225030084236e-07, "loss": 0.2089, "step": 17840 }, { "epoch": 15.966010733452594, "grad_norm": 1.6975692510604858, "learning_rate": 4.720216606498195e-07, "loss": 0.2032, "step": 17850 }, { "epoch": 15.97495527728086, "grad_norm": 1.3301687240600586, "learning_rate": 4.7172081829121543e-07, "loss": 0.2068, "step": 17860 }, { "epoch": 15.983899821109123, "grad_norm": 1.5423060655593872, "learning_rate": 4.714199759326113e-07, "loss": 0.2013, "step": 17870 }, { "epoch": 15.992844364937389, "grad_norm": 1.4738348722457886, "learning_rate": 4.7111913357400715e-07, "loss": 0.2048, "step": 17880 }, { "epoch": 16.0, "eval_bleu": 76.7381, "eval_gen_len": 74.7274, "eval_loss": 0.15930308401584625, "eval_runtime": 58.2522, "eval_samples_per_second": 17.888, "eval_steps_per_second": 0.189, "step": 17888 }, { "epoch": 16.001788908765654, "grad_norm": 1.6148046255111694, "learning_rate": 4.708182912154031e-07, "loss": 0.2143, "step": 17890 }, { "epoch": 16.010733452593918, "grad_norm": 1.4588137865066528, "learning_rate": 4.70517448856799e-07, "loss": 0.2069, "step": 17900 }, { "epoch": 16.019677996422182, "grad_norm": 1.6100811958312988, "learning_rate": 4.7021660649819493e-07, "loss": 0.2009, "step": 17910 }, { "epoch": 16.028622540250446, "grad_norm": 1.336479902267456, "learning_rate": 4.699157641395908e-07, "loss": 0.2167, "step": 17920 }, { "epoch": 16.037567084078713, "grad_norm": 1.51796293258667, "learning_rate": 4.696149217809867e-07, "loss": 0.2081, "step": 17930 }, { "epoch": 16.046511627906977, "grad_norm": 1.6503642797470093, "learning_rate": 4.6931407942238265e-07, "loss": 0.1997, "step": 17940 }, { "epoch": 16.05545617173524, "grad_norm": 1.253343105316162, "learning_rate": 4.6901323706377854e-07, "loss": 0.201, "step": 17950 }, { "epoch": 16.064400715563508, "grad_norm": 1.4972025156021118, "learning_rate": 4.687123947051745e-07, "loss": 0.2015, "step": 17960 }, { "epoch": 16.073345259391772, "grad_norm": 1.5771032571792603, "learning_rate": 4.684115523465704e-07, "loss": 0.1969, "step": 17970 }, { "epoch": 16.082289803220036, "grad_norm": 1.274448275566101, "learning_rate": 4.6811070998796626e-07, "loss": 0.2054, "step": 17980 }, { "epoch": 16.0912343470483, "grad_norm": 1.4598562717437744, "learning_rate": 4.678098676293622e-07, "loss": 0.2019, "step": 17990 }, { "epoch": 16.100178890876567, "grad_norm": 1.2179709672927856, "learning_rate": 4.675090252707581e-07, "loss": 0.2069, "step": 18000 }, { "epoch": 16.10912343470483, "grad_norm": 1.5141541957855225, "learning_rate": 4.6720818291215404e-07, "loss": 0.2114, "step": 18010 }, { "epoch": 16.118067978533094, "grad_norm": 1.5173187255859375, "learning_rate": 4.6690734055354993e-07, "loss": 0.2063, "step": 18020 }, { "epoch": 16.12701252236136, "grad_norm": 1.3374969959259033, "learning_rate": 4.6660649819494587e-07, "loss": 0.2056, "step": 18030 }, { "epoch": 16.135957066189626, "grad_norm": 1.6040619611740112, "learning_rate": 4.6630565583634176e-07, "loss": 0.2061, "step": 18040 }, { "epoch": 16.14490161001789, "grad_norm": 1.3951270580291748, "learning_rate": 4.660048134777376e-07, "loss": 0.1981, "step": 18050 }, { "epoch": 16.153846153846153, "grad_norm": 1.493987798690796, "learning_rate": 4.6570397111913354e-07, "loss": 0.2026, "step": 18060 }, { "epoch": 16.162790697674417, "grad_norm": 1.9750702381134033, "learning_rate": 4.6540312876052943e-07, "loss": 0.2035, "step": 18070 }, { "epoch": 16.171735241502684, "grad_norm": 1.3505460023880005, "learning_rate": 4.651022864019254e-07, "loss": 0.2043, "step": 18080 }, { "epoch": 16.18067978533095, "grad_norm": 1.8226122856140137, "learning_rate": 4.6480144404332127e-07, "loss": 0.2089, "step": 18090 }, { "epoch": 16.189624329159212, "grad_norm": 1.454171895980835, "learning_rate": 4.6450060168471716e-07, "loss": 0.1979, "step": 18100 }, { "epoch": 16.198568872987476, "grad_norm": 1.459355115890503, "learning_rate": 4.641997593261131e-07, "loss": 0.2078, "step": 18110 }, { "epoch": 16.207513416815743, "grad_norm": 1.3417216539382935, "learning_rate": 4.63898916967509e-07, "loss": 0.2161, "step": 18120 }, { "epoch": 16.216457960644007, "grad_norm": 1.5160000324249268, "learning_rate": 4.6359807460890493e-07, "loss": 0.2021, "step": 18130 }, { "epoch": 16.22540250447227, "grad_norm": 1.6167906522750854, "learning_rate": 4.632972322503008e-07, "loss": 0.2005, "step": 18140 }, { "epoch": 16.23434704830054, "grad_norm": 1.8305431604385376, "learning_rate": 4.629963898916967e-07, "loss": 0.2182, "step": 18150 }, { "epoch": 16.243291592128802, "grad_norm": 1.4843783378601074, "learning_rate": 4.6269554753309265e-07, "loss": 0.2062, "step": 18160 }, { "epoch": 16.252236135957066, "grad_norm": 1.459753155708313, "learning_rate": 4.6239470517448854e-07, "loss": 0.2046, "step": 18170 }, { "epoch": 16.26118067978533, "grad_norm": 1.5825345516204834, "learning_rate": 4.620938628158845e-07, "loss": 0.2117, "step": 18180 }, { "epoch": 16.270125223613597, "grad_norm": 1.9152156114578247, "learning_rate": 4.617930204572804e-07, "loss": 0.2061, "step": 18190 }, { "epoch": 16.27906976744186, "grad_norm": 1.5024219751358032, "learning_rate": 4.614921780986763e-07, "loss": 0.1946, "step": 18200 }, { "epoch": 16.288014311270125, "grad_norm": 1.3793495893478394, "learning_rate": 4.611913357400722e-07, "loss": 0.2117, "step": 18210 }, { "epoch": 16.29695885509839, "grad_norm": 1.4050053358078003, "learning_rate": 4.6089049338146805e-07, "loss": 0.1973, "step": 18220 }, { "epoch": 16.305903398926656, "grad_norm": 1.4622000455856323, "learning_rate": 4.60589651022864e-07, "loss": 0.1954, "step": 18230 }, { "epoch": 16.31484794275492, "grad_norm": 1.5552752017974854, "learning_rate": 4.602888086642599e-07, "loss": 0.2019, "step": 18240 }, { "epoch": 16.323792486583184, "grad_norm": 1.2910131216049194, "learning_rate": 4.599879663056558e-07, "loss": 0.1979, "step": 18250 }, { "epoch": 16.332737030411447, "grad_norm": 1.3379510641098022, "learning_rate": 4.596871239470517e-07, "loss": 0.2044, "step": 18260 }, { "epoch": 16.341681574239715, "grad_norm": 1.603898048400879, "learning_rate": 4.593862815884476e-07, "loss": 0.2084, "step": 18270 }, { "epoch": 16.35062611806798, "grad_norm": 1.4816044569015503, "learning_rate": 4.5908543922984354e-07, "loss": 0.2011, "step": 18280 }, { "epoch": 16.359570661896242, "grad_norm": 1.563955307006836, "learning_rate": 4.5878459687123943e-07, "loss": 0.2062, "step": 18290 }, { "epoch": 16.36851520572451, "grad_norm": 1.677878975868225, "learning_rate": 4.584837545126354e-07, "loss": 0.2114, "step": 18300 }, { "epoch": 16.377459749552774, "grad_norm": 1.716578483581543, "learning_rate": 4.5818291215403127e-07, "loss": 0.2047, "step": 18310 }, { "epoch": 16.386404293381037, "grad_norm": 1.7913365364074707, "learning_rate": 4.5788206979542716e-07, "loss": 0.1992, "step": 18320 }, { "epoch": 16.3953488372093, "grad_norm": 1.5321120023727417, "learning_rate": 4.575812274368231e-07, "loss": 0.1967, "step": 18330 }, { "epoch": 16.40429338103757, "grad_norm": 2.0473198890686035, "learning_rate": 4.57280385078219e-07, "loss": 0.2125, "step": 18340 }, { "epoch": 16.413237924865832, "grad_norm": 1.355984091758728, "learning_rate": 4.5697954271961493e-07, "loss": 0.2019, "step": 18350 }, { "epoch": 16.422182468694096, "grad_norm": 1.967839002609253, "learning_rate": 4.566787003610108e-07, "loss": 0.2122, "step": 18360 }, { "epoch": 16.43112701252236, "grad_norm": 1.7413747310638428, "learning_rate": 4.5637785800240676e-07, "loss": 0.2055, "step": 18370 }, { "epoch": 16.440071556350627, "grad_norm": 1.432185173034668, "learning_rate": 4.5607701564380265e-07, "loss": 0.2027, "step": 18380 }, { "epoch": 16.44901610017889, "grad_norm": 1.5016663074493408, "learning_rate": 4.557761732851985e-07, "loss": 0.2012, "step": 18390 }, { "epoch": 16.457960644007155, "grad_norm": 1.3006632328033447, "learning_rate": 4.5547533092659443e-07, "loss": 0.204, "step": 18400 }, { "epoch": 16.46690518783542, "grad_norm": 1.7726274728775024, "learning_rate": 4.551744885679903e-07, "loss": 0.2034, "step": 18410 }, { "epoch": 16.475849731663686, "grad_norm": 1.264997959136963, "learning_rate": 4.5487364620938627e-07, "loss": 0.1984, "step": 18420 }, { "epoch": 16.48479427549195, "grad_norm": 1.3891260623931885, "learning_rate": 4.5457280385078216e-07, "loss": 0.197, "step": 18430 }, { "epoch": 16.493738819320214, "grad_norm": 1.2195409536361694, "learning_rate": 4.5427196149217805e-07, "loss": 0.1939, "step": 18440 }, { "epoch": 16.502683363148478, "grad_norm": 1.6551311016082764, "learning_rate": 4.53971119133574e-07, "loss": 0.2021, "step": 18450 }, { "epoch": 16.511627906976745, "grad_norm": 1.1608327627182007, "learning_rate": 4.536702767749699e-07, "loss": 0.2095, "step": 18460 }, { "epoch": 16.52057245080501, "grad_norm": 1.585875391960144, "learning_rate": 4.533694344163658e-07, "loss": 0.2021, "step": 18470 }, { "epoch": 16.529516994633273, "grad_norm": 1.4391627311706543, "learning_rate": 4.530685920577617e-07, "loss": 0.2048, "step": 18480 }, { "epoch": 16.53846153846154, "grad_norm": 1.7681891918182373, "learning_rate": 4.527677496991576e-07, "loss": 0.2034, "step": 18490 }, { "epoch": 16.547406082289804, "grad_norm": 1.448768138885498, "learning_rate": 4.5246690734055354e-07, "loss": 0.2056, "step": 18500 }, { "epoch": 16.556350626118068, "grad_norm": 1.363098382949829, "learning_rate": 4.5216606498194943e-07, "loss": 0.205, "step": 18510 }, { "epoch": 16.56529516994633, "grad_norm": 1.4220198392868042, "learning_rate": 4.518652226233454e-07, "loss": 0.2126, "step": 18520 }, { "epoch": 16.5742397137746, "grad_norm": 1.384798288345337, "learning_rate": 4.5156438026474127e-07, "loss": 0.2057, "step": 18530 }, { "epoch": 16.583184257602863, "grad_norm": 1.5988340377807617, "learning_rate": 4.5126353790613716e-07, "loss": 0.1958, "step": 18540 }, { "epoch": 16.592128801431127, "grad_norm": 1.342162013053894, "learning_rate": 4.509626955475331e-07, "loss": 0.1933, "step": 18550 }, { "epoch": 16.60107334525939, "grad_norm": 1.457943320274353, "learning_rate": 4.50661853188929e-07, "loss": 0.1974, "step": 18560 }, { "epoch": 16.610017889087658, "grad_norm": 1.8528119325637817, "learning_rate": 4.503610108303249e-07, "loss": 0.2047, "step": 18570 }, { "epoch": 16.61896243291592, "grad_norm": 1.2887921333312988, "learning_rate": 4.5006016847172077e-07, "loss": 0.1988, "step": 18580 }, { "epoch": 16.627906976744185, "grad_norm": 1.4914727210998535, "learning_rate": 4.497593261131167e-07, "loss": 0.1973, "step": 18590 }, { "epoch": 16.63685152057245, "grad_norm": 1.537087082862854, "learning_rate": 4.494584837545126e-07, "loss": 0.1994, "step": 18600 }, { "epoch": 16.645796064400717, "grad_norm": 1.5479522943496704, "learning_rate": 4.491576413959085e-07, "loss": 0.2152, "step": 18610 }, { "epoch": 16.65474060822898, "grad_norm": 1.733384370803833, "learning_rate": 4.4885679903730444e-07, "loss": 0.1986, "step": 18620 }, { "epoch": 16.663685152057244, "grad_norm": 1.5670526027679443, "learning_rate": 4.485559566787003e-07, "loss": 0.1961, "step": 18630 }, { "epoch": 16.67262969588551, "grad_norm": 1.5489834547042847, "learning_rate": 4.4825511432009627e-07, "loss": 0.207, "step": 18640 }, { "epoch": 16.681574239713775, "grad_norm": 1.5704835653305054, "learning_rate": 4.4795427196149216e-07, "loss": 0.1984, "step": 18650 }, { "epoch": 16.69051878354204, "grad_norm": 2.105590581893921, "learning_rate": 4.4765342960288805e-07, "loss": 0.2051, "step": 18660 }, { "epoch": 16.699463327370303, "grad_norm": 1.5057789087295532, "learning_rate": 4.47352587244284e-07, "loss": 0.2106, "step": 18670 }, { "epoch": 16.70840787119857, "grad_norm": 1.5319921970367432, "learning_rate": 4.470517448856799e-07, "loss": 0.2026, "step": 18680 }, { "epoch": 16.717352415026834, "grad_norm": 1.3663493394851685, "learning_rate": 4.467509025270758e-07, "loss": 0.1981, "step": 18690 }, { "epoch": 16.726296958855098, "grad_norm": 1.5751093626022339, "learning_rate": 4.464500601684717e-07, "loss": 0.2094, "step": 18700 }, { "epoch": 16.735241502683362, "grad_norm": 1.359885573387146, "learning_rate": 4.461492178098676e-07, "loss": 0.2007, "step": 18710 }, { "epoch": 16.74418604651163, "grad_norm": 1.4744230508804321, "learning_rate": 4.4584837545126355e-07, "loss": 0.2027, "step": 18720 }, { "epoch": 16.753130590339893, "grad_norm": 1.998220443725586, "learning_rate": 4.4554753309265944e-07, "loss": 0.1959, "step": 18730 }, { "epoch": 16.762075134168157, "grad_norm": 1.3327491283416748, "learning_rate": 4.452466907340554e-07, "loss": 0.2062, "step": 18740 }, { "epoch": 16.77101967799642, "grad_norm": 1.3492833375930786, "learning_rate": 4.449458483754512e-07, "loss": 0.2018, "step": 18750 }, { "epoch": 16.779964221824688, "grad_norm": 1.4005316495895386, "learning_rate": 4.4464500601684716e-07, "loss": 0.2044, "step": 18760 }, { "epoch": 16.788908765652952, "grad_norm": 1.7190395593643188, "learning_rate": 4.4434416365824305e-07, "loss": 0.2003, "step": 18770 }, { "epoch": 16.797853309481216, "grad_norm": 1.4776062965393066, "learning_rate": 4.4404332129963894e-07, "loss": 0.1939, "step": 18780 }, { "epoch": 16.80679785330948, "grad_norm": 1.5264639854431152, "learning_rate": 4.437424789410349e-07, "loss": 0.1953, "step": 18790 }, { "epoch": 16.815742397137747, "grad_norm": 1.4779977798461914, "learning_rate": 4.4344163658243077e-07, "loss": 0.2013, "step": 18800 }, { "epoch": 16.82468694096601, "grad_norm": 1.224633812904358, "learning_rate": 4.431407942238267e-07, "loss": 0.2037, "step": 18810 }, { "epoch": 16.833631484794275, "grad_norm": 1.5299253463745117, "learning_rate": 4.428399518652226e-07, "loss": 0.199, "step": 18820 }, { "epoch": 16.842576028622542, "grad_norm": 1.5062905550003052, "learning_rate": 4.425391095066185e-07, "loss": 0.194, "step": 18830 }, { "epoch": 16.851520572450806, "grad_norm": 1.432369351387024, "learning_rate": 4.4223826714801444e-07, "loss": 0.2071, "step": 18840 }, { "epoch": 16.86046511627907, "grad_norm": 1.5931068658828735, "learning_rate": 4.419374247894103e-07, "loss": 0.2024, "step": 18850 }, { "epoch": 16.869409660107333, "grad_norm": 1.5152790546417236, "learning_rate": 4.4163658243080627e-07, "loss": 0.202, "step": 18860 }, { "epoch": 16.8783542039356, "grad_norm": 1.5928902626037598, "learning_rate": 4.4133574007220216e-07, "loss": 0.1947, "step": 18870 }, { "epoch": 16.887298747763865, "grad_norm": 1.2694294452667236, "learning_rate": 4.4103489771359805e-07, "loss": 0.197, "step": 18880 }, { "epoch": 16.89624329159213, "grad_norm": 1.5884836912155151, "learning_rate": 4.40734055354994e-07, "loss": 0.2014, "step": 18890 }, { "epoch": 16.905187835420392, "grad_norm": 1.3282958269119263, "learning_rate": 4.404332129963899e-07, "loss": 0.1959, "step": 18900 }, { "epoch": 16.91413237924866, "grad_norm": 1.966212511062622, "learning_rate": 4.401323706377858e-07, "loss": 0.2147, "step": 18910 }, { "epoch": 16.923076923076923, "grad_norm": 1.6146342754364014, "learning_rate": 4.3983152827918166e-07, "loss": 0.1984, "step": 18920 }, { "epoch": 16.932021466905187, "grad_norm": 1.3908056020736694, "learning_rate": 4.3953068592057755e-07, "loss": 0.2016, "step": 18930 }, { "epoch": 16.94096601073345, "grad_norm": 1.724147915840149, "learning_rate": 4.392298435619735e-07, "loss": 0.2035, "step": 18940 }, { "epoch": 16.94991055456172, "grad_norm": 1.3435845375061035, "learning_rate": 4.389290012033694e-07, "loss": 0.2024, "step": 18950 }, { "epoch": 16.958855098389982, "grad_norm": 1.3221325874328613, "learning_rate": 4.3862815884476533e-07, "loss": 0.1984, "step": 18960 }, { "epoch": 16.967799642218246, "grad_norm": 1.3773683309555054, "learning_rate": 4.383273164861612e-07, "loss": 0.198, "step": 18970 }, { "epoch": 16.97674418604651, "grad_norm": 1.4278448820114136, "learning_rate": 4.3802647412755716e-07, "loss": 0.1952, "step": 18980 }, { "epoch": 16.985688729874777, "grad_norm": 1.6305217742919922, "learning_rate": 4.3772563176895305e-07, "loss": 0.1977, "step": 18990 }, { "epoch": 16.99463327370304, "grad_norm": 1.5182921886444092, "learning_rate": 4.3742478941034894e-07, "loss": 0.1975, "step": 19000 }, { "epoch": 17.0, "eval_bleu": 76.9954, "eval_gen_len": 74.7217, "eval_loss": 0.1558741331100464, "eval_runtime": 57.6067, "eval_samples_per_second": 18.088, "eval_steps_per_second": 0.191, "step": 19006 }, { "epoch": 17.003577817531305, "grad_norm": 1.6046218872070312, "learning_rate": 4.371239470517449e-07, "loss": 0.2037, "step": 19010 }, { "epoch": 17.012522361359572, "grad_norm": 1.2948042154312134, "learning_rate": 4.3682310469314077e-07, "loss": 0.2049, "step": 19020 }, { "epoch": 17.021466905187836, "grad_norm": 1.4588265419006348, "learning_rate": 4.365222623345367e-07, "loss": 0.2005, "step": 19030 }, { "epoch": 17.0304114490161, "grad_norm": 1.3486738204956055, "learning_rate": 4.362214199759326e-07, "loss": 0.1961, "step": 19040 }, { "epoch": 17.039355992844364, "grad_norm": 1.3999135494232178, "learning_rate": 4.359205776173285e-07, "loss": 0.1944, "step": 19050 }, { "epoch": 17.04830053667263, "grad_norm": 1.3097974061965942, "learning_rate": 4.3561973525872444e-07, "loss": 0.2025, "step": 19060 }, { "epoch": 17.057245080500895, "grad_norm": 1.586107611656189, "learning_rate": 4.3531889290012033e-07, "loss": 0.2042, "step": 19070 }, { "epoch": 17.06618962432916, "grad_norm": 1.2752659320831299, "learning_rate": 4.3501805054151627e-07, "loss": 0.1849, "step": 19080 }, { "epoch": 17.075134168157422, "grad_norm": 1.2279033660888672, "learning_rate": 4.347172081829121e-07, "loss": 0.1934, "step": 19090 }, { "epoch": 17.08407871198569, "grad_norm": 1.269182801246643, "learning_rate": 4.34416365824308e-07, "loss": 0.1994, "step": 19100 }, { "epoch": 17.093023255813954, "grad_norm": 1.5643492937088013, "learning_rate": 4.3411552346570394e-07, "loss": 0.2004, "step": 19110 }, { "epoch": 17.101967799642217, "grad_norm": 1.1503673791885376, "learning_rate": 4.3381468110709983e-07, "loss": 0.1979, "step": 19120 }, { "epoch": 17.11091234347048, "grad_norm": 1.837729573249817, "learning_rate": 4.3351383874849577e-07, "loss": 0.2037, "step": 19130 }, { "epoch": 17.11985688729875, "grad_norm": 1.4600396156311035, "learning_rate": 4.3321299638989166e-07, "loss": 0.2067, "step": 19140 }, { "epoch": 17.128801431127012, "grad_norm": 1.4574378728866577, "learning_rate": 4.329121540312876e-07, "loss": 0.1919, "step": 19150 }, { "epoch": 17.137745974955276, "grad_norm": 1.415816307067871, "learning_rate": 4.326113116726835e-07, "loss": 0.1992, "step": 19160 }, { "epoch": 17.146690518783544, "grad_norm": 1.5313915014266968, "learning_rate": 4.323104693140794e-07, "loss": 0.1917, "step": 19170 }, { "epoch": 17.155635062611807, "grad_norm": 1.6685739755630493, "learning_rate": 4.3200962695547533e-07, "loss": 0.1975, "step": 19180 }, { "epoch": 17.16457960644007, "grad_norm": 1.4296932220458984, "learning_rate": 4.317087845968712e-07, "loss": 0.1997, "step": 19190 }, { "epoch": 17.173524150268335, "grad_norm": 1.90603768825531, "learning_rate": 4.3140794223826716e-07, "loss": 0.2024, "step": 19200 }, { "epoch": 17.182468694096602, "grad_norm": 1.4273818731307983, "learning_rate": 4.3110709987966305e-07, "loss": 0.1979, "step": 19210 }, { "epoch": 17.191413237924866, "grad_norm": 1.6286354064941406, "learning_rate": 4.3080625752105894e-07, "loss": 0.2025, "step": 19220 }, { "epoch": 17.20035778175313, "grad_norm": 1.6004372835159302, "learning_rate": 4.305054151624549e-07, "loss": 0.1967, "step": 19230 }, { "epoch": 17.209302325581394, "grad_norm": 1.3184689283370972, "learning_rate": 4.3020457280385077e-07, "loss": 0.2022, "step": 19240 }, { "epoch": 17.21824686940966, "grad_norm": 1.2840800285339355, "learning_rate": 4.299037304452467e-07, "loss": 0.1959, "step": 19250 }, { "epoch": 17.227191413237925, "grad_norm": 1.484135389328003, "learning_rate": 4.2960288808664255e-07, "loss": 0.1954, "step": 19260 }, { "epoch": 17.23613595706619, "grad_norm": 1.282882809638977, "learning_rate": 4.2930204572803844e-07, "loss": 0.2026, "step": 19270 }, { "epoch": 17.245080500894453, "grad_norm": 1.2856765985488892, "learning_rate": 4.290012033694344e-07, "loss": 0.2099, "step": 19280 }, { "epoch": 17.25402504472272, "grad_norm": 1.3113852739334106, "learning_rate": 4.287003610108303e-07, "loss": 0.1912, "step": 19290 }, { "epoch": 17.262969588550984, "grad_norm": 1.4740384817123413, "learning_rate": 4.283995186522262e-07, "loss": 0.1943, "step": 19300 }, { "epoch": 17.271914132379248, "grad_norm": 1.3779065608978271, "learning_rate": 4.280986762936221e-07, "loss": 0.1923, "step": 19310 }, { "epoch": 17.280858676207515, "grad_norm": 1.5705339908599854, "learning_rate": 4.27797833935018e-07, "loss": 0.2067, "step": 19320 }, { "epoch": 17.28980322003578, "grad_norm": 1.4014439582824707, "learning_rate": 4.2749699157641394e-07, "loss": 0.1893, "step": 19330 }, { "epoch": 17.298747763864043, "grad_norm": 1.3906726837158203, "learning_rate": 4.2719614921780983e-07, "loss": 0.1976, "step": 19340 }, { "epoch": 17.307692307692307, "grad_norm": 1.3222224712371826, "learning_rate": 4.268953068592058e-07, "loss": 0.1928, "step": 19350 }, { "epoch": 17.316636851520574, "grad_norm": 1.4415886402130127, "learning_rate": 4.2659446450060166e-07, "loss": 0.1999, "step": 19360 }, { "epoch": 17.325581395348838, "grad_norm": 1.462741494178772, "learning_rate": 4.262936221419976e-07, "loss": 0.1904, "step": 19370 }, { "epoch": 17.3345259391771, "grad_norm": 1.4457674026489258, "learning_rate": 4.259927797833935e-07, "loss": 0.1976, "step": 19380 }, { "epoch": 17.343470483005365, "grad_norm": 1.386067271232605, "learning_rate": 4.256919374247894e-07, "loss": 0.1916, "step": 19390 }, { "epoch": 17.352415026833633, "grad_norm": 1.6763598918914795, "learning_rate": 4.2539109506618533e-07, "loss": 0.2001, "step": 19400 }, { "epoch": 17.361359570661897, "grad_norm": 1.5524550676345825, "learning_rate": 4.250902527075812e-07, "loss": 0.2001, "step": 19410 }, { "epoch": 17.37030411449016, "grad_norm": 1.7988560199737549, "learning_rate": 4.2478941034897716e-07, "loss": 0.1914, "step": 19420 }, { "epoch": 17.379248658318424, "grad_norm": 1.5703943967819214, "learning_rate": 4.2448856799037305e-07, "loss": 0.194, "step": 19430 }, { "epoch": 17.38819320214669, "grad_norm": 1.5991483926773071, "learning_rate": 4.241877256317689e-07, "loss": 0.194, "step": 19440 }, { "epoch": 17.397137745974955, "grad_norm": 1.4655667543411255, "learning_rate": 4.2388688327316483e-07, "loss": 0.2006, "step": 19450 }, { "epoch": 17.40608228980322, "grad_norm": 1.4785929918289185, "learning_rate": 4.235860409145607e-07, "loss": 0.1812, "step": 19460 }, { "epoch": 17.415026833631483, "grad_norm": 1.3344368934631348, "learning_rate": 4.2328519855595666e-07, "loss": 0.1981, "step": 19470 }, { "epoch": 17.42397137745975, "grad_norm": 1.6425262689590454, "learning_rate": 4.2298435619735255e-07, "loss": 0.2029, "step": 19480 }, { "epoch": 17.432915921288014, "grad_norm": 1.941648006439209, "learning_rate": 4.2268351383874844e-07, "loss": 0.1979, "step": 19490 }, { "epoch": 17.441860465116278, "grad_norm": 1.6927695274353027, "learning_rate": 4.223826714801444e-07, "loss": 0.2023, "step": 19500 }, { "epoch": 17.450805008944545, "grad_norm": 1.4542479515075684, "learning_rate": 4.220818291215403e-07, "loss": 0.1982, "step": 19510 }, { "epoch": 17.45974955277281, "grad_norm": 1.3546243906021118, "learning_rate": 4.217809867629362e-07, "loss": 0.1879, "step": 19520 }, { "epoch": 17.468694096601073, "grad_norm": 1.59962797164917, "learning_rate": 4.214801444043321e-07, "loss": 0.2071, "step": 19530 }, { "epoch": 17.477638640429337, "grad_norm": 1.761206030845642, "learning_rate": 4.2117930204572805e-07, "loss": 0.1929, "step": 19540 }, { "epoch": 17.486583184257604, "grad_norm": 1.3420017957687378, "learning_rate": 4.2087845968712394e-07, "loss": 0.1885, "step": 19550 }, { "epoch": 17.495527728085868, "grad_norm": 1.486976981163025, "learning_rate": 4.2057761732851983e-07, "loss": 0.1939, "step": 19560 }, { "epoch": 17.504472271914132, "grad_norm": 1.3547134399414062, "learning_rate": 4.202767749699158e-07, "loss": 0.1894, "step": 19570 }, { "epoch": 17.513416815742396, "grad_norm": 1.7011101245880127, "learning_rate": 4.1997593261131166e-07, "loss": 0.1934, "step": 19580 }, { "epoch": 17.522361359570663, "grad_norm": 1.407077431678772, "learning_rate": 4.196750902527076e-07, "loss": 0.1917, "step": 19590 }, { "epoch": 17.531305903398927, "grad_norm": 1.2789831161499023, "learning_rate": 4.193742478941035e-07, "loss": 0.1967, "step": 19600 }, { "epoch": 17.54025044722719, "grad_norm": 1.906341314315796, "learning_rate": 4.1907340553549933e-07, "loss": 0.1965, "step": 19610 }, { "epoch": 17.549194991055455, "grad_norm": 1.4115309715270996, "learning_rate": 4.187725631768953e-07, "loss": 0.1883, "step": 19620 }, { "epoch": 17.558139534883722, "grad_norm": 1.4893302917480469, "learning_rate": 4.1847172081829117e-07, "loss": 0.2001, "step": 19630 }, { "epoch": 17.567084078711986, "grad_norm": 1.6963932514190674, "learning_rate": 4.181708784596871e-07, "loss": 0.2085, "step": 19640 }, { "epoch": 17.57602862254025, "grad_norm": 1.3404951095581055, "learning_rate": 4.17870036101083e-07, "loss": 0.2071, "step": 19650 }, { "epoch": 17.584973166368513, "grad_norm": 1.547065019607544, "learning_rate": 4.175691937424789e-07, "loss": 0.198, "step": 19660 }, { "epoch": 17.59391771019678, "grad_norm": 1.356144666671753, "learning_rate": 4.1726835138387483e-07, "loss": 0.2053, "step": 19670 }, { "epoch": 17.602862254025045, "grad_norm": 1.4185278415679932, "learning_rate": 4.169675090252707e-07, "loss": 0.196, "step": 19680 }, { "epoch": 17.61180679785331, "grad_norm": 1.4725350141525269, "learning_rate": 4.1666666666666667e-07, "loss": 0.2051, "step": 19690 }, { "epoch": 17.620751341681576, "grad_norm": 1.3277311325073242, "learning_rate": 4.1636582430806256e-07, "loss": 0.2059, "step": 19700 }, { "epoch": 17.62969588550984, "grad_norm": 1.6677687168121338, "learning_rate": 4.1606498194945845e-07, "loss": 0.1958, "step": 19710 }, { "epoch": 17.638640429338103, "grad_norm": 1.5741796493530273, "learning_rate": 4.157641395908544e-07, "loss": 0.1913, "step": 19720 }, { "epoch": 17.647584973166367, "grad_norm": 1.4955782890319824, "learning_rate": 4.154632972322503e-07, "loss": 0.1906, "step": 19730 }, { "epoch": 17.656529516994635, "grad_norm": 1.5030593872070312, "learning_rate": 4.151624548736462e-07, "loss": 0.1899, "step": 19740 }, { "epoch": 17.6654740608229, "grad_norm": 1.590512752532959, "learning_rate": 4.148616125150421e-07, "loss": 0.2079, "step": 19750 }, { "epoch": 17.674418604651162, "grad_norm": 1.3719285726547241, "learning_rate": 4.1456077015643805e-07, "loss": 0.2053, "step": 19760 }, { "epoch": 17.683363148479426, "grad_norm": 1.3236194849014282, "learning_rate": 4.1425992779783394e-07, "loss": 0.188, "step": 19770 }, { "epoch": 17.692307692307693, "grad_norm": 1.441819429397583, "learning_rate": 4.139590854392298e-07, "loss": 0.1843, "step": 19780 }, { "epoch": 17.701252236135957, "grad_norm": 1.6539798974990845, "learning_rate": 4.136582430806257e-07, "loss": 0.1974, "step": 19790 }, { "epoch": 17.71019677996422, "grad_norm": 1.5544081926345825, "learning_rate": 4.133574007220216e-07, "loss": 0.1954, "step": 19800 }, { "epoch": 17.719141323792485, "grad_norm": 1.3864456415176392, "learning_rate": 4.1305655836341756e-07, "loss": 0.1954, "step": 19810 }, { "epoch": 17.728085867620752, "grad_norm": 1.770103096961975, "learning_rate": 4.1275571600481345e-07, "loss": 0.2061, "step": 19820 }, { "epoch": 17.737030411449016, "grad_norm": 1.5458662509918213, "learning_rate": 4.1245487364620934e-07, "loss": 0.1995, "step": 19830 }, { "epoch": 17.74597495527728, "grad_norm": 1.2489945888519287, "learning_rate": 4.121540312876053e-07, "loss": 0.2056, "step": 19840 }, { "epoch": 17.754919499105547, "grad_norm": 1.4535328149795532, "learning_rate": 4.1185318892900117e-07, "loss": 0.1998, "step": 19850 }, { "epoch": 17.76386404293381, "grad_norm": 1.4486058950424194, "learning_rate": 4.115523465703971e-07, "loss": 0.1887, "step": 19860 }, { "epoch": 17.772808586762075, "grad_norm": 1.2764486074447632, "learning_rate": 4.11251504211793e-07, "loss": 0.1917, "step": 19870 }, { "epoch": 17.78175313059034, "grad_norm": 1.711034893989563, "learning_rate": 4.109506618531889e-07, "loss": 0.1966, "step": 19880 }, { "epoch": 17.790697674418606, "grad_norm": 1.7573375701904297, "learning_rate": 4.1064981949458483e-07, "loss": 0.1929, "step": 19890 }, { "epoch": 17.79964221824687, "grad_norm": 1.5140432119369507, "learning_rate": 4.103489771359807e-07, "loss": 0.1876, "step": 19900 }, { "epoch": 17.808586762075134, "grad_norm": 1.3529492616653442, "learning_rate": 4.1004813477737667e-07, "loss": 0.1927, "step": 19910 }, { "epoch": 17.817531305903398, "grad_norm": 1.3087401390075684, "learning_rate": 4.0974729241877256e-07, "loss": 0.2023, "step": 19920 }, { "epoch": 17.826475849731665, "grad_norm": 1.36638343334198, "learning_rate": 4.094464500601685e-07, "loss": 0.2056, "step": 19930 }, { "epoch": 17.83542039355993, "grad_norm": 1.648963212966919, "learning_rate": 4.091456077015644e-07, "loss": 0.1988, "step": 19940 }, { "epoch": 17.844364937388193, "grad_norm": 1.9200732707977295, "learning_rate": 4.088447653429602e-07, "loss": 0.2034, "step": 19950 }, { "epoch": 17.853309481216456, "grad_norm": 1.5758631229400635, "learning_rate": 4.0854392298435617e-07, "loss": 0.195, "step": 19960 }, { "epoch": 17.862254025044724, "grad_norm": 1.334425449371338, "learning_rate": 4.0824308062575206e-07, "loss": 0.2079, "step": 19970 }, { "epoch": 17.871198568872988, "grad_norm": 1.6201728582382202, "learning_rate": 4.07942238267148e-07, "loss": 0.1999, "step": 19980 }, { "epoch": 17.88014311270125, "grad_norm": 1.4320831298828125, "learning_rate": 4.076413959085439e-07, "loss": 0.1943, "step": 19990 }, { "epoch": 17.88908765652952, "grad_norm": 1.2403602600097656, "learning_rate": 4.073405535499398e-07, "loss": 0.1925, "step": 20000 }, { "epoch": 17.898032200357783, "grad_norm": 1.2947001457214355, "learning_rate": 4.070397111913357e-07, "loss": 0.2029, "step": 20010 }, { "epoch": 17.906976744186046, "grad_norm": 1.276208758354187, "learning_rate": 4.067388688327316e-07, "loss": 0.193, "step": 20020 }, { "epoch": 17.91592128801431, "grad_norm": 1.4883465766906738, "learning_rate": 4.0643802647412756e-07, "loss": 0.201, "step": 20030 }, { "epoch": 17.924865831842578, "grad_norm": 1.36917245388031, "learning_rate": 4.0613718411552345e-07, "loss": 0.2017, "step": 20040 }, { "epoch": 17.93381037567084, "grad_norm": 1.5583710670471191, "learning_rate": 4.0583634175691934e-07, "loss": 0.198, "step": 20050 }, { "epoch": 17.942754919499105, "grad_norm": 1.357555866241455, "learning_rate": 4.055354993983153e-07, "loss": 0.1875, "step": 20060 }, { "epoch": 17.95169946332737, "grad_norm": 1.5523446798324585, "learning_rate": 4.0523465703971117e-07, "loss": 0.1954, "step": 20070 }, { "epoch": 17.960644007155636, "grad_norm": 1.2269097566604614, "learning_rate": 4.049338146811071e-07, "loss": 0.1992, "step": 20080 }, { "epoch": 17.9695885509839, "grad_norm": 1.7482787370681763, "learning_rate": 4.04632972322503e-07, "loss": 0.1998, "step": 20090 }, { "epoch": 17.978533094812164, "grad_norm": 1.9512193202972412, "learning_rate": 4.043321299638989e-07, "loss": 0.2004, "step": 20100 }, { "epoch": 17.987477638640428, "grad_norm": 1.7851731777191162, "learning_rate": 4.0403128760529484e-07, "loss": 0.1885, "step": 20110 }, { "epoch": 17.996422182468695, "grad_norm": 1.3705447912216187, "learning_rate": 4.037304452466907e-07, "loss": 0.1943, "step": 20120 }, { "epoch": 18.0, "eval_bleu": 77.421, "eval_gen_len": 74.6641, "eval_loss": 0.15242676436901093, "eval_runtime": 57.5495, "eval_samples_per_second": 18.106, "eval_steps_per_second": 0.191, "step": 20124 }, { "epoch": 18.00536672629696, "grad_norm": 1.3600645065307617, "learning_rate": 4.034296028880866e-07, "loss": 0.1936, "step": 20130 }, { "epoch": 18.014311270125223, "grad_norm": 1.514121174812317, "learning_rate": 4.031287605294825e-07, "loss": 0.1874, "step": 20140 }, { "epoch": 18.023255813953487, "grad_norm": 1.8172216415405273, "learning_rate": 4.0282791817087845e-07, "loss": 0.1923, "step": 20150 }, { "epoch": 18.032200357781754, "grad_norm": 1.2660489082336426, "learning_rate": 4.0252707581227434e-07, "loss": 0.1933, "step": 20160 }, { "epoch": 18.041144901610018, "grad_norm": 2.1290335655212402, "learning_rate": 4.0222623345367023e-07, "loss": 0.1979, "step": 20170 }, { "epoch": 18.05008944543828, "grad_norm": 1.3781754970550537, "learning_rate": 4.0192539109506617e-07, "loss": 0.1975, "step": 20180 }, { "epoch": 18.05903398926655, "grad_norm": 1.3132025003433228, "learning_rate": 4.0162454873646206e-07, "loss": 0.1993, "step": 20190 }, { "epoch": 18.067978533094813, "grad_norm": 1.3299691677093506, "learning_rate": 4.01323706377858e-07, "loss": 0.1916, "step": 20200 }, { "epoch": 18.076923076923077, "grad_norm": 1.939542293548584, "learning_rate": 4.010228640192539e-07, "loss": 0.1988, "step": 20210 }, { "epoch": 18.08586762075134, "grad_norm": 1.4302418231964111, "learning_rate": 4.007220216606498e-07, "loss": 0.1948, "step": 20220 }, { "epoch": 18.094812164579608, "grad_norm": 1.2856136560440063, "learning_rate": 4.004211793020457e-07, "loss": 0.1948, "step": 20230 }, { "epoch": 18.10375670840787, "grad_norm": 1.3175781965255737, "learning_rate": 4.001203369434416e-07, "loss": 0.1963, "step": 20240 }, { "epoch": 18.112701252236135, "grad_norm": 1.9059381484985352, "learning_rate": 3.9981949458483756e-07, "loss": 0.1893, "step": 20250 }, { "epoch": 18.1216457960644, "grad_norm": 1.9008736610412598, "learning_rate": 3.9951865222623345e-07, "loss": 0.1902, "step": 20260 }, { "epoch": 18.130590339892667, "grad_norm": 1.2346973419189453, "learning_rate": 3.9921780986762934e-07, "loss": 0.1934, "step": 20270 }, { "epoch": 18.13953488372093, "grad_norm": 1.430349349975586, "learning_rate": 3.989169675090253e-07, "loss": 0.2063, "step": 20280 }, { "epoch": 18.148479427549194, "grad_norm": 1.5357002019882202, "learning_rate": 3.9861612515042117e-07, "loss": 0.1951, "step": 20290 }, { "epoch": 18.157423971377458, "grad_norm": 1.565921425819397, "learning_rate": 3.9831528279181706e-07, "loss": 0.2069, "step": 20300 }, { "epoch": 18.166368515205725, "grad_norm": 1.4086253643035889, "learning_rate": 3.9801444043321295e-07, "loss": 0.1901, "step": 20310 }, { "epoch": 18.17531305903399, "grad_norm": 1.5787662267684937, "learning_rate": 3.977135980746089e-07, "loss": 0.1959, "step": 20320 }, { "epoch": 18.184257602862253, "grad_norm": 1.3202236890792847, "learning_rate": 3.974127557160048e-07, "loss": 0.1905, "step": 20330 }, { "epoch": 18.19320214669052, "grad_norm": 1.4912652969360352, "learning_rate": 3.9711191335740067e-07, "loss": 0.197, "step": 20340 }, { "epoch": 18.202146690518784, "grad_norm": 1.5808525085449219, "learning_rate": 3.968110709987966e-07, "loss": 0.2094, "step": 20350 }, { "epoch": 18.211091234347048, "grad_norm": 1.6998366117477417, "learning_rate": 3.965102286401925e-07, "loss": 0.1879, "step": 20360 }, { "epoch": 18.220035778175312, "grad_norm": 1.2697880268096924, "learning_rate": 3.9620938628158845e-07, "loss": 0.1995, "step": 20370 }, { "epoch": 18.22898032200358, "grad_norm": 1.6824156045913696, "learning_rate": 3.9590854392298434e-07, "loss": 0.1952, "step": 20380 }, { "epoch": 18.237924865831843, "grad_norm": 1.4301773309707642, "learning_rate": 3.9560770156438023e-07, "loss": 0.2053, "step": 20390 }, { "epoch": 18.246869409660107, "grad_norm": 1.4164366722106934, "learning_rate": 3.9530685920577617e-07, "loss": 0.1886, "step": 20400 }, { "epoch": 18.25581395348837, "grad_norm": 1.2958494424819946, "learning_rate": 3.9500601684717206e-07, "loss": 0.1816, "step": 20410 }, { "epoch": 18.264758497316638, "grad_norm": 1.4988905191421509, "learning_rate": 3.94705174488568e-07, "loss": 0.1913, "step": 20420 }, { "epoch": 18.273703041144902, "grad_norm": 1.38814115524292, "learning_rate": 3.944043321299639e-07, "loss": 0.1902, "step": 20430 }, { "epoch": 18.282647584973166, "grad_norm": 1.3260778188705444, "learning_rate": 3.941034897713598e-07, "loss": 0.1885, "step": 20440 }, { "epoch": 18.29159212880143, "grad_norm": 2.0683860778808594, "learning_rate": 3.9380264741275573e-07, "loss": 0.1929, "step": 20450 }, { "epoch": 18.300536672629697, "grad_norm": 1.2236251831054688, "learning_rate": 3.935018050541516e-07, "loss": 0.1837, "step": 20460 }, { "epoch": 18.30948121645796, "grad_norm": 1.4860965013504028, "learning_rate": 3.9320096269554756e-07, "loss": 0.2023, "step": 20470 }, { "epoch": 18.318425760286225, "grad_norm": 1.5370088815689087, "learning_rate": 3.929001203369434e-07, "loss": 0.1958, "step": 20480 }, { "epoch": 18.32737030411449, "grad_norm": 1.4012691974639893, "learning_rate": 3.925992779783393e-07, "loss": 0.1801, "step": 20490 }, { "epoch": 18.336314847942756, "grad_norm": 1.4192802906036377, "learning_rate": 3.9229843561973523e-07, "loss": 0.1889, "step": 20500 }, { "epoch": 18.34525939177102, "grad_norm": 1.4101192951202393, "learning_rate": 3.919975932611311e-07, "loss": 0.189, "step": 20510 }, { "epoch": 18.354203935599283, "grad_norm": 1.4301940202713013, "learning_rate": 3.9169675090252706e-07, "loss": 0.2039, "step": 20520 }, { "epoch": 18.36314847942755, "grad_norm": 1.3676986694335938, "learning_rate": 3.9139590854392295e-07, "loss": 0.1956, "step": 20530 }, { "epoch": 18.372093023255815, "grad_norm": 1.1791801452636719, "learning_rate": 3.910950661853189e-07, "loss": 0.1897, "step": 20540 }, { "epoch": 18.38103756708408, "grad_norm": 1.5008537769317627, "learning_rate": 3.907942238267148e-07, "loss": 0.198, "step": 20550 }, { "epoch": 18.389982110912342, "grad_norm": 1.3062366247177124, "learning_rate": 3.904933814681107e-07, "loss": 0.1938, "step": 20560 }, { "epoch": 18.39892665474061, "grad_norm": 1.6328556537628174, "learning_rate": 3.901925391095066e-07, "loss": 0.1902, "step": 20570 }, { "epoch": 18.407871198568873, "grad_norm": 1.6620492935180664, "learning_rate": 3.898916967509025e-07, "loss": 0.1849, "step": 20580 }, { "epoch": 18.416815742397137, "grad_norm": 1.5257842540740967, "learning_rate": 3.8959085439229845e-07, "loss": 0.1999, "step": 20590 }, { "epoch": 18.4257602862254, "grad_norm": 1.7352404594421387, "learning_rate": 3.8929001203369434e-07, "loss": 0.203, "step": 20600 }, { "epoch": 18.43470483005367, "grad_norm": 1.398389220237732, "learning_rate": 3.8898916967509023e-07, "loss": 0.1869, "step": 20610 }, { "epoch": 18.443649373881932, "grad_norm": 1.4931424856185913, "learning_rate": 3.8868832731648617e-07, "loss": 0.1927, "step": 20620 }, { "epoch": 18.452593917710196, "grad_norm": 1.3729106187820435, "learning_rate": 3.8838748495788206e-07, "loss": 0.191, "step": 20630 }, { "epoch": 18.46153846153846, "grad_norm": 1.6463043689727783, "learning_rate": 3.88086642599278e-07, "loss": 0.1883, "step": 20640 }, { "epoch": 18.470483005366727, "grad_norm": 1.7752493619918823, "learning_rate": 3.8778580024067384e-07, "loss": 0.1994, "step": 20650 }, { "epoch": 18.47942754919499, "grad_norm": 1.4545904397964478, "learning_rate": 3.8748495788206973e-07, "loss": 0.1951, "step": 20660 }, { "epoch": 18.488372093023255, "grad_norm": 1.4974523782730103, "learning_rate": 3.871841155234657e-07, "loss": 0.1925, "step": 20670 }, { "epoch": 18.497316636851522, "grad_norm": 1.3493813276290894, "learning_rate": 3.8688327316486156e-07, "loss": 0.1861, "step": 20680 }, { "epoch": 18.506261180679786, "grad_norm": 1.1932425498962402, "learning_rate": 3.865824308062575e-07, "loss": 0.1933, "step": 20690 }, { "epoch": 18.51520572450805, "grad_norm": 1.5326428413391113, "learning_rate": 3.862815884476534e-07, "loss": 0.1859, "step": 20700 }, { "epoch": 18.524150268336314, "grad_norm": 1.3875932693481445, "learning_rate": 3.8598074608904934e-07, "loss": 0.1851, "step": 20710 }, { "epoch": 18.53309481216458, "grad_norm": 1.3919353485107422, "learning_rate": 3.8567990373044523e-07, "loss": 0.2017, "step": 20720 }, { "epoch": 18.542039355992845, "grad_norm": 1.5686875581741333, "learning_rate": 3.853790613718411e-07, "loss": 0.1971, "step": 20730 }, { "epoch": 18.55098389982111, "grad_norm": 1.4372426271438599, "learning_rate": 3.8507821901323706e-07, "loss": 0.1991, "step": 20740 }, { "epoch": 18.559928443649373, "grad_norm": 1.2772883176803589, "learning_rate": 3.8477737665463295e-07, "loss": 0.2028, "step": 20750 }, { "epoch": 18.56887298747764, "grad_norm": 1.4629807472229004, "learning_rate": 3.844765342960289e-07, "loss": 0.1931, "step": 20760 }, { "epoch": 18.577817531305904, "grad_norm": 1.483835220336914, "learning_rate": 3.841756919374248e-07, "loss": 0.1839, "step": 20770 }, { "epoch": 18.586762075134168, "grad_norm": 1.3380502462387085, "learning_rate": 3.838748495788207e-07, "loss": 0.1942, "step": 20780 }, { "epoch": 18.59570661896243, "grad_norm": 1.3125088214874268, "learning_rate": 3.835740072202166e-07, "loss": 0.1998, "step": 20790 }, { "epoch": 18.6046511627907, "grad_norm": 1.4294259548187256, "learning_rate": 3.832731648616125e-07, "loss": 0.1932, "step": 20800 }, { "epoch": 18.613595706618963, "grad_norm": 1.4665868282318115, "learning_rate": 3.8297232250300845e-07, "loss": 0.1929, "step": 20810 }, { "epoch": 18.622540250447226, "grad_norm": 1.2699508666992188, "learning_rate": 3.826714801444043e-07, "loss": 0.1859, "step": 20820 }, { "epoch": 18.631484794275494, "grad_norm": 1.6539772748947144, "learning_rate": 3.823706377858002e-07, "loss": 0.1875, "step": 20830 }, { "epoch": 18.640429338103758, "grad_norm": 1.6890268325805664, "learning_rate": 3.820697954271961e-07, "loss": 0.1987, "step": 20840 }, { "epoch": 18.64937388193202, "grad_norm": 1.480602741241455, "learning_rate": 3.81768953068592e-07, "loss": 0.1923, "step": 20850 }, { "epoch": 18.658318425760285, "grad_norm": 1.5095171928405762, "learning_rate": 3.8146811070998795e-07, "loss": 0.1915, "step": 20860 }, { "epoch": 18.667262969588553, "grad_norm": 1.3432358503341675, "learning_rate": 3.8116726835138384e-07, "loss": 0.1903, "step": 20870 }, { "epoch": 18.676207513416816, "grad_norm": 1.7593339681625366, "learning_rate": 3.8086642599277973e-07, "loss": 0.1968, "step": 20880 }, { "epoch": 18.68515205724508, "grad_norm": 1.431167721748352, "learning_rate": 3.805655836341757e-07, "loss": 0.195, "step": 20890 }, { "epoch": 18.694096601073344, "grad_norm": 1.451204538345337, "learning_rate": 3.8026474127557157e-07, "loss": 0.1934, "step": 20900 }, { "epoch": 18.70304114490161, "grad_norm": 1.2916609048843384, "learning_rate": 3.799638989169675e-07, "loss": 0.1894, "step": 20910 }, { "epoch": 18.711985688729875, "grad_norm": 1.6644831895828247, "learning_rate": 3.796630565583634e-07, "loss": 0.1839, "step": 20920 }, { "epoch": 18.72093023255814, "grad_norm": 1.2776670455932617, "learning_rate": 3.7936221419975934e-07, "loss": 0.1877, "step": 20930 }, { "epoch": 18.729874776386403, "grad_norm": 1.3584338426589966, "learning_rate": 3.7906137184115523e-07, "loss": 0.1946, "step": 20940 }, { "epoch": 18.73881932021467, "grad_norm": 1.5859999656677246, "learning_rate": 3.787605294825511e-07, "loss": 0.1917, "step": 20950 }, { "epoch": 18.747763864042934, "grad_norm": 1.5412558317184448, "learning_rate": 3.7845968712394706e-07, "loss": 0.1902, "step": 20960 }, { "epoch": 18.756708407871198, "grad_norm": 1.4061285257339478, "learning_rate": 3.7815884476534295e-07, "loss": 0.1922, "step": 20970 }, { "epoch": 18.76565295169946, "grad_norm": 1.3371307849884033, "learning_rate": 3.778580024067389e-07, "loss": 0.1851, "step": 20980 }, { "epoch": 18.77459749552773, "grad_norm": 2.0113933086395264, "learning_rate": 3.7755716004813473e-07, "loss": 0.1969, "step": 20990 }, { "epoch": 18.783542039355993, "grad_norm": 1.2509182691574097, "learning_rate": 3.772563176895306e-07, "loss": 0.1946, "step": 21000 }, { "epoch": 18.792486583184257, "grad_norm": 1.297754168510437, "learning_rate": 3.7695547533092657e-07, "loss": 0.2006, "step": 21010 }, { "epoch": 18.801431127012524, "grad_norm": 1.376678705215454, "learning_rate": 3.7665463297232246e-07, "loss": 0.1799, "step": 21020 }, { "epoch": 18.810375670840788, "grad_norm": 1.495245099067688, "learning_rate": 3.763537906137184e-07, "loss": 0.1919, "step": 21030 }, { "epoch": 18.81932021466905, "grad_norm": 1.2178292274475098, "learning_rate": 3.760529482551143e-07, "loss": 0.1972, "step": 21040 }, { "epoch": 18.828264758497316, "grad_norm": 1.226501703262329, "learning_rate": 3.757521058965102e-07, "loss": 0.1968, "step": 21050 }, { "epoch": 18.837209302325583, "grad_norm": 1.2760741710662842, "learning_rate": 3.754512635379061e-07, "loss": 0.1909, "step": 21060 }, { "epoch": 18.846153846153847, "grad_norm": 1.7594233751296997, "learning_rate": 3.75150421179302e-07, "loss": 0.193, "step": 21070 }, { "epoch": 18.85509838998211, "grad_norm": 1.4522331953048706, "learning_rate": 3.7484957882069795e-07, "loss": 0.1961, "step": 21080 }, { "epoch": 18.864042933810374, "grad_norm": 1.6493743658065796, "learning_rate": 3.7454873646209384e-07, "loss": 0.1892, "step": 21090 }, { "epoch": 18.87298747763864, "grad_norm": 1.2935068607330322, "learning_rate": 3.742478941034898e-07, "loss": 0.1832, "step": 21100 }, { "epoch": 18.881932021466906, "grad_norm": 1.1401342153549194, "learning_rate": 3.739470517448857e-07, "loss": 0.1793, "step": 21110 }, { "epoch": 18.89087656529517, "grad_norm": 1.3705753087997437, "learning_rate": 3.7364620938628157e-07, "loss": 0.1945, "step": 21120 }, { "epoch": 18.899821109123433, "grad_norm": 1.6233444213867188, "learning_rate": 3.733453670276775e-07, "loss": 0.1961, "step": 21130 }, { "epoch": 18.9087656529517, "grad_norm": 1.460810661315918, "learning_rate": 3.730445246690734e-07, "loss": 0.1934, "step": 21140 }, { "epoch": 18.917710196779964, "grad_norm": 1.409800410270691, "learning_rate": 3.7274368231046934e-07, "loss": 0.1817, "step": 21150 }, { "epoch": 18.926654740608228, "grad_norm": 1.3818022012710571, "learning_rate": 3.7244283995186523e-07, "loss": 0.1972, "step": 21160 }, { "epoch": 18.935599284436492, "grad_norm": 1.642561912536621, "learning_rate": 3.7214199759326107e-07, "loss": 0.1905, "step": 21170 }, { "epoch": 18.94454382826476, "grad_norm": 1.5107719898223877, "learning_rate": 3.71841155234657e-07, "loss": 0.1916, "step": 21180 }, { "epoch": 18.953488372093023, "grad_norm": 1.3990639448165894, "learning_rate": 3.715403128760529e-07, "loss": 0.196, "step": 21190 }, { "epoch": 18.962432915921287, "grad_norm": 1.2956585884094238, "learning_rate": 3.7123947051744884e-07, "loss": 0.1875, "step": 21200 }, { "epoch": 18.971377459749554, "grad_norm": 1.4675569534301758, "learning_rate": 3.7093862815884473e-07, "loss": 0.183, "step": 21210 }, { "epoch": 18.980322003577818, "grad_norm": 1.4663444757461548, "learning_rate": 3.706377858002406e-07, "loss": 0.1845, "step": 21220 }, { "epoch": 18.989266547406082, "grad_norm": 1.0990031957626343, "learning_rate": 3.7033694344163657e-07, "loss": 0.1918, "step": 21230 }, { "epoch": 18.998211091234346, "grad_norm": 1.2096024751663208, "learning_rate": 3.7003610108303246e-07, "loss": 0.1987, "step": 21240 }, { "epoch": 19.0, "eval_bleu": 77.8231, "eval_gen_len": 74.6833, "eval_loss": 0.14946186542510986, "eval_runtime": 56.7661, "eval_samples_per_second": 18.356, "eval_steps_per_second": 0.194, "step": 21242 }, { "epoch": 19.007155635062613, "grad_norm": 1.557692050933838, "learning_rate": 3.697352587244284e-07, "loss": 0.1898, "step": 21250 }, { "epoch": 19.016100178890877, "grad_norm": 1.4740185737609863, "learning_rate": 3.694344163658243e-07, "loss": 0.187, "step": 21260 }, { "epoch": 19.02504472271914, "grad_norm": 1.4456875324249268, "learning_rate": 3.6913357400722023e-07, "loss": 0.1924, "step": 21270 }, { "epoch": 19.033989266547405, "grad_norm": 1.546985149383545, "learning_rate": 3.688327316486161e-07, "loss": 0.1892, "step": 21280 }, { "epoch": 19.042933810375672, "grad_norm": 1.4747453927993774, "learning_rate": 3.68531889290012e-07, "loss": 0.1934, "step": 21290 }, { "epoch": 19.051878354203936, "grad_norm": 1.3266552686691284, "learning_rate": 3.6823104693140796e-07, "loss": 0.2022, "step": 21300 }, { "epoch": 19.0608228980322, "grad_norm": 1.3492190837860107, "learning_rate": 3.6793020457280385e-07, "loss": 0.1921, "step": 21310 }, { "epoch": 19.069767441860463, "grad_norm": 1.6898155212402344, "learning_rate": 3.676293622141998e-07, "loss": 0.1947, "step": 21320 }, { "epoch": 19.07871198568873, "grad_norm": 1.310855507850647, "learning_rate": 3.673285198555957e-07, "loss": 0.1927, "step": 21330 }, { "epoch": 19.087656529516995, "grad_norm": 1.9882363080978394, "learning_rate": 3.670276774969915e-07, "loss": 0.1936, "step": 21340 }, { "epoch": 19.09660107334526, "grad_norm": 1.7846614122390747, "learning_rate": 3.6672683513838746e-07, "loss": 0.1872, "step": 21350 }, { "epoch": 19.105545617173526, "grad_norm": 1.561082124710083, "learning_rate": 3.6642599277978335e-07, "loss": 0.1938, "step": 21360 }, { "epoch": 19.11449016100179, "grad_norm": 1.4885318279266357, "learning_rate": 3.661251504211793e-07, "loss": 0.1832, "step": 21370 }, { "epoch": 19.123434704830053, "grad_norm": 1.2708408832550049, "learning_rate": 3.658243080625752e-07, "loss": 0.1891, "step": 21380 }, { "epoch": 19.132379248658317, "grad_norm": 1.5973031520843506, "learning_rate": 3.6552346570397107e-07, "loss": 0.1862, "step": 21390 }, { "epoch": 19.141323792486585, "grad_norm": 1.3048038482666016, "learning_rate": 3.65222623345367e-07, "loss": 0.1994, "step": 21400 }, { "epoch": 19.15026833631485, "grad_norm": 1.4048810005187988, "learning_rate": 3.649217809867629e-07, "loss": 0.194, "step": 21410 }, { "epoch": 19.159212880143112, "grad_norm": 1.5730185508728027, "learning_rate": 3.6462093862815885e-07, "loss": 0.189, "step": 21420 }, { "epoch": 19.168157423971376, "grad_norm": 1.3440287113189697, "learning_rate": 3.6432009626955474e-07, "loss": 0.1819, "step": 21430 }, { "epoch": 19.177101967799643, "grad_norm": 1.4186955690383911, "learning_rate": 3.640192539109506e-07, "loss": 0.1939, "step": 21440 }, { "epoch": 19.186046511627907, "grad_norm": 1.454852819442749, "learning_rate": 3.6371841155234657e-07, "loss": 0.199, "step": 21450 }, { "epoch": 19.19499105545617, "grad_norm": 1.2862147092819214, "learning_rate": 3.6341756919374246e-07, "loss": 0.1878, "step": 21460 }, { "epoch": 19.203935599284435, "grad_norm": 1.3074766397476196, "learning_rate": 3.631167268351384e-07, "loss": 0.1871, "step": 21470 }, { "epoch": 19.212880143112702, "grad_norm": 1.3759242296218872, "learning_rate": 3.628158844765343e-07, "loss": 0.1939, "step": 21480 }, { "epoch": 19.221824686940966, "grad_norm": 1.3743611574172974, "learning_rate": 3.6251504211793023e-07, "loss": 0.1854, "step": 21490 }, { "epoch": 19.23076923076923, "grad_norm": 1.6088826656341553, "learning_rate": 3.622141997593261e-07, "loss": 0.1998, "step": 21500 }, { "epoch": 19.239713774597494, "grad_norm": 1.5923534631729126, "learning_rate": 3.6191335740072196e-07, "loss": 0.195, "step": 21510 }, { "epoch": 19.24865831842576, "grad_norm": 1.2471575736999512, "learning_rate": 3.616125150421179e-07, "loss": 0.185, "step": 21520 }, { "epoch": 19.257602862254025, "grad_norm": 1.300024390220642, "learning_rate": 3.613116726835138e-07, "loss": 0.1857, "step": 21530 }, { "epoch": 19.26654740608229, "grad_norm": 1.4788153171539307, "learning_rate": 3.6101083032490974e-07, "loss": 0.1841, "step": 21540 }, { "epoch": 19.275491949910556, "grad_norm": 1.4675010442733765, "learning_rate": 3.607099879663056e-07, "loss": 0.1875, "step": 21550 }, { "epoch": 19.28443649373882, "grad_norm": 1.3291878700256348, "learning_rate": 3.604091456077015e-07, "loss": 0.185, "step": 21560 }, { "epoch": 19.293381037567084, "grad_norm": 1.4330105781555176, "learning_rate": 3.6010830324909746e-07, "loss": 0.1791, "step": 21570 }, { "epoch": 19.302325581395348, "grad_norm": 1.32407808303833, "learning_rate": 3.5980746089049335e-07, "loss": 0.1924, "step": 21580 }, { "epoch": 19.311270125223615, "grad_norm": 1.3751063346862793, "learning_rate": 3.595066185318893e-07, "loss": 0.191, "step": 21590 }, { "epoch": 19.32021466905188, "grad_norm": 1.3922836780548096, "learning_rate": 3.592057761732852e-07, "loss": 0.1774, "step": 21600 }, { "epoch": 19.329159212880143, "grad_norm": 1.4643456935882568, "learning_rate": 3.5890493381468107e-07, "loss": 0.1879, "step": 21610 }, { "epoch": 19.338103756708406, "grad_norm": 1.7122917175292969, "learning_rate": 3.58604091456077e-07, "loss": 0.1908, "step": 21620 }, { "epoch": 19.347048300536674, "grad_norm": 1.4380452632904053, "learning_rate": 3.583032490974729e-07, "loss": 0.1887, "step": 21630 }, { "epoch": 19.355992844364938, "grad_norm": 1.6291236877441406, "learning_rate": 3.5800240673886885e-07, "loss": 0.1882, "step": 21640 }, { "epoch": 19.3649373881932, "grad_norm": 1.417327642440796, "learning_rate": 3.5770156438026474e-07, "loss": 0.1911, "step": 21650 }, { "epoch": 19.373881932021465, "grad_norm": 1.411293864250183, "learning_rate": 3.574007220216607e-07, "loss": 0.1828, "step": 21660 }, { "epoch": 19.382826475849733, "grad_norm": 1.677729606628418, "learning_rate": 3.5709987966305657e-07, "loss": 0.1905, "step": 21670 }, { "epoch": 19.391771019677996, "grad_norm": 1.2971285581588745, "learning_rate": 3.567990373044524e-07, "loss": 0.1822, "step": 21680 }, { "epoch": 19.40071556350626, "grad_norm": 1.531691312789917, "learning_rate": 3.5649819494584835e-07, "loss": 0.1938, "step": 21690 }, { "epoch": 19.409660107334528, "grad_norm": 1.55259108543396, "learning_rate": 3.5619735258724424e-07, "loss": 0.1893, "step": 21700 }, { "epoch": 19.41860465116279, "grad_norm": 1.436160683631897, "learning_rate": 3.558965102286402e-07, "loss": 0.193, "step": 21710 }, { "epoch": 19.427549194991055, "grad_norm": 1.3020164966583252, "learning_rate": 3.5559566787003607e-07, "loss": 0.1882, "step": 21720 }, { "epoch": 19.43649373881932, "grad_norm": 1.7475745677947998, "learning_rate": 3.5529482551143196e-07, "loss": 0.1977, "step": 21730 }, { "epoch": 19.445438282647586, "grad_norm": 1.4146769046783447, "learning_rate": 3.549939831528279e-07, "loss": 0.1994, "step": 21740 }, { "epoch": 19.45438282647585, "grad_norm": 1.409281849861145, "learning_rate": 3.546931407942238e-07, "loss": 0.1884, "step": 21750 }, { "epoch": 19.463327370304114, "grad_norm": 1.7853525876998901, "learning_rate": 3.5439229843561974e-07, "loss": 0.1893, "step": 21760 }, { "epoch": 19.472271914132378, "grad_norm": 1.4283171892166138, "learning_rate": 3.5409145607701563e-07, "loss": 0.1919, "step": 21770 }, { "epoch": 19.481216457960645, "grad_norm": 1.1920448541641235, "learning_rate": 3.537906137184115e-07, "loss": 0.1875, "step": 21780 }, { "epoch": 19.49016100178891, "grad_norm": 1.3144375085830688, "learning_rate": 3.5348977135980746e-07, "loss": 0.1907, "step": 21790 }, { "epoch": 19.499105545617173, "grad_norm": 1.510340929031372, "learning_rate": 3.5318892900120335e-07, "loss": 0.1969, "step": 21800 }, { "epoch": 19.508050089445437, "grad_norm": 1.3244644403457642, "learning_rate": 3.528880866425993e-07, "loss": 0.1915, "step": 21810 }, { "epoch": 19.516994633273704, "grad_norm": 1.3958264589309692, "learning_rate": 3.525872442839952e-07, "loss": 0.1891, "step": 21820 }, { "epoch": 19.525939177101968, "grad_norm": 1.2823486328125, "learning_rate": 3.5228640192539107e-07, "loss": 0.1744, "step": 21830 }, { "epoch": 19.53488372093023, "grad_norm": 1.3641853332519531, "learning_rate": 3.51985559566787e-07, "loss": 0.1906, "step": 21840 }, { "epoch": 19.543828264758496, "grad_norm": 1.4786450862884521, "learning_rate": 3.516847172081829e-07, "loss": 0.1857, "step": 21850 }, { "epoch": 19.552772808586763, "grad_norm": 1.5867925882339478, "learning_rate": 3.513838748495788e-07, "loss": 0.1959, "step": 21860 }, { "epoch": 19.561717352415027, "grad_norm": 1.313033938407898, "learning_rate": 3.510830324909747e-07, "loss": 0.1939, "step": 21870 }, { "epoch": 19.57066189624329, "grad_norm": 1.3654066324234009, "learning_rate": 3.5078219013237063e-07, "loss": 0.1905, "step": 21880 }, { "epoch": 19.579606440071558, "grad_norm": 1.2623218297958374, "learning_rate": 3.504813477737665e-07, "loss": 0.1893, "step": 21890 }, { "epoch": 19.58855098389982, "grad_norm": 1.386467456817627, "learning_rate": 3.501805054151624e-07, "loss": 0.1815, "step": 21900 }, { "epoch": 19.597495527728086, "grad_norm": 1.5557708740234375, "learning_rate": 3.4987966305655835e-07, "loss": 0.1917, "step": 21910 }, { "epoch": 19.60644007155635, "grad_norm": 1.7414817810058594, "learning_rate": 3.4957882069795424e-07, "loss": 0.1895, "step": 21920 }, { "epoch": 19.615384615384617, "grad_norm": 1.3845630884170532, "learning_rate": 3.492779783393502e-07, "loss": 0.1854, "step": 21930 }, { "epoch": 19.62432915921288, "grad_norm": 1.4406414031982422, "learning_rate": 3.4897713598074607e-07, "loss": 0.194, "step": 21940 }, { "epoch": 19.633273703041144, "grad_norm": 1.2725870609283447, "learning_rate": 3.4867629362214196e-07, "loss": 0.1874, "step": 21950 }, { "epoch": 19.642218246869408, "grad_norm": 1.4587557315826416, "learning_rate": 3.483754512635379e-07, "loss": 0.181, "step": 21960 }, { "epoch": 19.651162790697676, "grad_norm": 1.490944743156433, "learning_rate": 3.480746089049338e-07, "loss": 0.1975, "step": 21970 }, { "epoch": 19.66010733452594, "grad_norm": 1.3130711317062378, "learning_rate": 3.4777376654632974e-07, "loss": 0.1851, "step": 21980 }, { "epoch": 19.669051878354203, "grad_norm": 1.3030730485916138, "learning_rate": 3.4747292418772563e-07, "loss": 0.189, "step": 21990 }, { "epoch": 19.677996422182467, "grad_norm": 1.2224133014678955, "learning_rate": 3.471720818291215e-07, "loss": 0.1839, "step": 22000 }, { "epoch": 19.686940966010734, "grad_norm": 1.2977553606033325, "learning_rate": 3.4687123947051746e-07, "loss": 0.1919, "step": 22010 }, { "epoch": 19.695885509838998, "grad_norm": 1.4645330905914307, "learning_rate": 3.4657039711191335e-07, "loss": 0.1899, "step": 22020 }, { "epoch": 19.704830053667262, "grad_norm": 1.3017830848693848, "learning_rate": 3.4626955475330924e-07, "loss": 0.1785, "step": 22030 }, { "epoch": 19.71377459749553, "grad_norm": 1.2776682376861572, "learning_rate": 3.4596871239470513e-07, "loss": 0.1886, "step": 22040 }, { "epoch": 19.722719141323793, "grad_norm": 1.4559818506240845, "learning_rate": 3.456678700361011e-07, "loss": 0.1801, "step": 22050 }, { "epoch": 19.731663685152057, "grad_norm": 1.3212449550628662, "learning_rate": 3.4536702767749696e-07, "loss": 0.188, "step": 22060 }, { "epoch": 19.74060822898032, "grad_norm": 1.4003950357437134, "learning_rate": 3.4506618531889285e-07, "loss": 0.1869, "step": 22070 }, { "epoch": 19.74955277280859, "grad_norm": 1.3492873907089233, "learning_rate": 3.447653429602888e-07, "loss": 0.1887, "step": 22080 }, { "epoch": 19.758497316636852, "grad_norm": 1.3928955793380737, "learning_rate": 3.444645006016847e-07, "loss": 0.1918, "step": 22090 }, { "epoch": 19.767441860465116, "grad_norm": 1.2398922443389893, "learning_rate": 3.4416365824308063e-07, "loss": 0.1765, "step": 22100 }, { "epoch": 19.77638640429338, "grad_norm": 1.2678223848342896, "learning_rate": 3.438628158844765e-07, "loss": 0.1883, "step": 22110 }, { "epoch": 19.785330948121647, "grad_norm": 1.7187891006469727, "learning_rate": 3.435619735258724e-07, "loss": 0.19, "step": 22120 }, { "epoch": 19.79427549194991, "grad_norm": 1.3717964887619019, "learning_rate": 3.4326113116726835e-07, "loss": 0.1804, "step": 22130 }, { "epoch": 19.803220035778175, "grad_norm": 1.3578808307647705, "learning_rate": 3.4296028880866424e-07, "loss": 0.1908, "step": 22140 }, { "epoch": 19.81216457960644, "grad_norm": 1.4975496530532837, "learning_rate": 3.426594464500602e-07, "loss": 0.194, "step": 22150 }, { "epoch": 19.821109123434706, "grad_norm": 1.504811406135559, "learning_rate": 3.423586040914561e-07, "loss": 0.1988, "step": 22160 }, { "epoch": 19.83005366726297, "grad_norm": 1.393159031867981, "learning_rate": 3.4205776173285196e-07, "loss": 0.1858, "step": 22170 }, { "epoch": 19.838998211091234, "grad_norm": 1.8559398651123047, "learning_rate": 3.417569193742479e-07, "loss": 0.1897, "step": 22180 }, { "epoch": 19.8479427549195, "grad_norm": 1.650848627090454, "learning_rate": 3.414560770156438e-07, "loss": 0.1931, "step": 22190 }, { "epoch": 19.856887298747765, "grad_norm": 1.3729771375656128, "learning_rate": 3.4115523465703974e-07, "loss": 0.1888, "step": 22200 }, { "epoch": 19.86583184257603, "grad_norm": 1.2925593852996826, "learning_rate": 3.408543922984356e-07, "loss": 0.1862, "step": 22210 }, { "epoch": 19.874776386404292, "grad_norm": 1.4130479097366333, "learning_rate": 3.4055354993983147e-07, "loss": 0.1857, "step": 22220 }, { "epoch": 19.88372093023256, "grad_norm": 1.4012774229049683, "learning_rate": 3.402527075812274e-07, "loss": 0.1892, "step": 22230 }, { "epoch": 19.892665474060824, "grad_norm": 1.2611286640167236, "learning_rate": 3.399518652226233e-07, "loss": 0.1854, "step": 22240 }, { "epoch": 19.901610017889087, "grad_norm": 1.2847506999969482, "learning_rate": 3.3965102286401924e-07, "loss": 0.1821, "step": 22250 }, { "epoch": 19.91055456171735, "grad_norm": 1.5255744457244873, "learning_rate": 3.3935018050541513e-07, "loss": 0.1842, "step": 22260 }, { "epoch": 19.91949910554562, "grad_norm": 1.539740800857544, "learning_rate": 3.390493381468111e-07, "loss": 0.1875, "step": 22270 }, { "epoch": 19.928443649373882, "grad_norm": 1.4575883150100708, "learning_rate": 3.3874849578820696e-07, "loss": 0.187, "step": 22280 }, { "epoch": 19.937388193202146, "grad_norm": 1.5852409601211548, "learning_rate": 3.3844765342960285e-07, "loss": 0.1976, "step": 22290 }, { "epoch": 19.94633273703041, "grad_norm": 1.4154672622680664, "learning_rate": 3.381468110709988e-07, "loss": 0.1851, "step": 22300 }, { "epoch": 19.955277280858677, "grad_norm": 1.5170589685440063, "learning_rate": 3.378459687123947e-07, "loss": 0.1902, "step": 22310 }, { "epoch": 19.96422182468694, "grad_norm": 1.44078528881073, "learning_rate": 3.3754512635379063e-07, "loss": 0.1863, "step": 22320 }, { "epoch": 19.973166368515205, "grad_norm": 1.4237490892410278, "learning_rate": 3.372442839951865e-07, "loss": 0.1852, "step": 22330 }, { "epoch": 19.98211091234347, "grad_norm": 1.545115351676941, "learning_rate": 3.369434416365824e-07, "loss": 0.1785, "step": 22340 }, { "epoch": 19.991055456171736, "grad_norm": 1.4572222232818604, "learning_rate": 3.3664259927797835e-07, "loss": 0.1818, "step": 22350 }, { "epoch": 20.0, "grad_norm": 2.268296241760254, "learning_rate": 3.3634175691937424e-07, "loss": 0.1855, "step": 22360 }, { "epoch": 20.0, "eval_bleu": 78.0784, "eval_gen_len": 74.6804, "eval_loss": 0.14719858765602112, "eval_runtime": 59.9182, "eval_samples_per_second": 17.39, "eval_steps_per_second": 0.184, "step": 22360 }, { "epoch": 20.0, "step": 22360, "total_flos": 1.544729654989947e+17, "train_loss": 0.0, "train_runtime": 0.2648, "train_samples_per_second": 12659125.519, "train_steps_per_second": 63331.882 } ], "logging_steps": 10, "max_steps": 16770, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.544729654989947e+17, "train_batch_size": 100, "trial_name": null, "trial_params": null }