{
"best_metric": 1.578300952911377,
"best_model_checkpoint": "en-to-lg-ufal/checkpoint-19400",
"epoch": 4.999355753124597,
"eval_steps": 500,
"global_step": 19400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012884937508053087,
"grad_norm": 0.9295567870140076,
"learning_rate": 1.9949484536082476e-05,
"loss": 3.2843,
"step": 50
},
{
"epoch": 0.025769875016106173,
"grad_norm": 1.0073925256729126,
"learning_rate": 1.9897938144329896e-05,
"loss": 3.1584,
"step": 100
},
{
"epoch": 0.038654812524159254,
"grad_norm": 1.3255535364151,
"learning_rate": 1.984639175257732e-05,
"loss": 3.0963,
"step": 150
},
{
"epoch": 0.051539750032212346,
"grad_norm": 1.086804747581482,
"learning_rate": 1.9794845360824745e-05,
"loss": 2.9216,
"step": 200
},
{
"epoch": 0.06442468754026542,
"grad_norm": 0.6704310178756714,
"learning_rate": 1.9743298969072166e-05,
"loss": 2.8337,
"step": 250
},
{
"epoch": 0.07730962504831851,
"grad_norm": 0.837518572807312,
"learning_rate": 1.969278350515464e-05,
"loss": 2.7777,
"step": 300
},
{
"epoch": 0.09019456255637161,
"grad_norm": 0.7694929242134094,
"learning_rate": 1.9641237113402064e-05,
"loss": 2.7939,
"step": 350
},
{
"epoch": 0.10307950006442469,
"grad_norm": 1.0657893419265747,
"learning_rate": 1.9589690721649485e-05,
"loss": 2.7371,
"step": 400
},
{
"epoch": 0.11596443757247778,
"grad_norm": 0.7758269906044006,
"learning_rate": 1.953814432989691e-05,
"loss": 2.6623,
"step": 450
},
{
"epoch": 0.12884937508053085,
"grad_norm": 0.746475100517273,
"learning_rate": 1.948659793814433e-05,
"loss": 2.6365,
"step": 500
},
{
"epoch": 0.14173431258858393,
"grad_norm": 0.8395822048187256,
"learning_rate": 1.9435051546391754e-05,
"loss": 2.6025,
"step": 550
},
{
"epoch": 0.15461925009663702,
"grad_norm": 1.061213493347168,
"learning_rate": 1.938350515463918e-05,
"loss": 2.5525,
"step": 600
},
{
"epoch": 0.16750418760469013,
"grad_norm": 0.8460017442703247,
"learning_rate": 1.93319587628866e-05,
"loss": 2.554,
"step": 650
},
{
"epoch": 0.18038912511274321,
"grad_norm": 0.8066027164459229,
"learning_rate": 1.9280412371134024e-05,
"loss": 2.4868,
"step": 700
},
{
"epoch": 0.1932740626207963,
"grad_norm": 0.8909623622894287,
"learning_rate": 1.9228865979381445e-05,
"loss": 2.5062,
"step": 750
},
{
"epoch": 0.20615900012884938,
"grad_norm": 0.9352700114250183,
"learning_rate": 1.9177319587628865e-05,
"loss": 2.4822,
"step": 800
},
{
"epoch": 0.21904393763690247,
"grad_norm": 1.2132598161697388,
"learning_rate": 1.912577319587629e-05,
"loss": 2.408,
"step": 850
},
{
"epoch": 0.23192887514495555,
"grad_norm": 1.0032589435577393,
"learning_rate": 1.907422680412371e-05,
"loss": 2.436,
"step": 900
},
{
"epoch": 0.24481381265300864,
"grad_norm": 1.0249050855636597,
"learning_rate": 1.9022680412371135e-05,
"loss": 2.4474,
"step": 950
},
{
"epoch": 0.2576987501610617,
"grad_norm": 0.8265942335128784,
"learning_rate": 1.897113402061856e-05,
"loss": 2.4005,
"step": 1000
},
{
"epoch": 0.2705836876691148,
"grad_norm": 0.9384586215019226,
"learning_rate": 1.891958762886598e-05,
"loss": 2.2971,
"step": 1050
},
{
"epoch": 0.28346862517716787,
"grad_norm": 0.9439546465873718,
"learning_rate": 1.8868041237113404e-05,
"loss": 2.3563,
"step": 1100
},
{
"epoch": 0.29635356268522095,
"grad_norm": 0.9652894139289856,
"learning_rate": 1.8816494845360825e-05,
"loss": 2.2672,
"step": 1150
},
{
"epoch": 0.30923850019327404,
"grad_norm": 0.8074690103530884,
"learning_rate": 1.876494845360825e-05,
"loss": 2.319,
"step": 1200
},
{
"epoch": 0.3221234377013272,
"grad_norm": 0.8441233038902283,
"learning_rate": 1.8713402061855674e-05,
"loss": 2.3201,
"step": 1250
},
{
"epoch": 0.33500837520938026,
"grad_norm": 1.0416673421859741,
"learning_rate": 1.8661855670103094e-05,
"loss": 2.2416,
"step": 1300
},
{
"epoch": 0.34789331271743335,
"grad_norm": 1.100706696510315,
"learning_rate": 1.861030927835052e-05,
"loss": 2.2427,
"step": 1350
},
{
"epoch": 0.36077825022548643,
"grad_norm": 0.9824424386024475,
"learning_rate": 1.855876288659794e-05,
"loss": 2.196,
"step": 1400
},
{
"epoch": 0.3736631877335395,
"grad_norm": 0.9210222363471985,
"learning_rate": 1.850721649484536e-05,
"loss": 2.2371,
"step": 1450
},
{
"epoch": 0.3865481252415926,
"grad_norm": 1.0536917448043823,
"learning_rate": 1.8455670103092785e-05,
"loss": 2.2008,
"step": 1500
},
{
"epoch": 0.3994330627496457,
"grad_norm": 1.0008552074432373,
"learning_rate": 1.8404123711340206e-05,
"loss": 2.1586,
"step": 1550
},
{
"epoch": 0.41231800025769877,
"grad_norm": 0.8722209334373474,
"learning_rate": 1.835257731958763e-05,
"loss": 2.1753,
"step": 1600
},
{
"epoch": 0.42520293776575185,
"grad_norm": 0.7571916580200195,
"learning_rate": 1.8301030927835054e-05,
"loss": 2.1598,
"step": 1650
},
{
"epoch": 0.43808787527380494,
"grad_norm": 1.054757833480835,
"learning_rate": 1.8249484536082475e-05,
"loss": 2.18,
"step": 1700
},
{
"epoch": 0.450972812781858,
"grad_norm": 0.8249649405479431,
"learning_rate": 1.81979381443299e-05,
"loss": 2.1078,
"step": 1750
},
{
"epoch": 0.4638577502899111,
"grad_norm": 1.7045085430145264,
"learning_rate": 1.814639175257732e-05,
"loss": 2.211,
"step": 1800
},
{
"epoch": 0.4767426877979642,
"grad_norm": 1.0341459512710571,
"learning_rate": 1.8094845360824744e-05,
"loss": 2.1393,
"step": 1850
},
{
"epoch": 0.4896276253060173,
"grad_norm": 0.9365245699882507,
"learning_rate": 1.804329896907217e-05,
"loss": 2.1602,
"step": 1900
},
{
"epoch": 0.5025125628140703,
"grad_norm": 1.2039780616760254,
"learning_rate": 1.799175257731959e-05,
"loss": 2.1431,
"step": 1950
},
{
"epoch": 0.5153975003221234,
"grad_norm": 0.7472810745239258,
"learning_rate": 1.7940206185567014e-05,
"loss": 2.1242,
"step": 2000
},
{
"epoch": 0.5282824378301765,
"grad_norm": 0.8963159918785095,
"learning_rate": 1.7888659793814435e-05,
"loss": 2.1757,
"step": 2050
},
{
"epoch": 0.5411673753382296,
"grad_norm": 1.0002330541610718,
"learning_rate": 1.7837113402061855e-05,
"loss": 2.1329,
"step": 2100
},
{
"epoch": 0.5540523128462826,
"grad_norm": 0.944322943687439,
"learning_rate": 1.778556701030928e-05,
"loss": 2.1254,
"step": 2150
},
{
"epoch": 0.5669372503543357,
"grad_norm": 1.0756226778030396,
"learning_rate": 1.77340206185567e-05,
"loss": 2.1479,
"step": 2200
},
{
"epoch": 0.5798221878623888,
"grad_norm": 0.9357224106788635,
"learning_rate": 1.7682474226804125e-05,
"loss": 2.142,
"step": 2250
},
{
"epoch": 0.5927071253704419,
"grad_norm": 0.9683809876441956,
"learning_rate": 1.763092783505155e-05,
"loss": 2.1744,
"step": 2300
},
{
"epoch": 0.605592062878495,
"grad_norm": 0.9993259310722351,
"learning_rate": 1.757938144329897e-05,
"loss": 2.0672,
"step": 2350
},
{
"epoch": 0.6184770003865481,
"grad_norm": 1.168818473815918,
"learning_rate": 1.7527835051546394e-05,
"loss": 2.1412,
"step": 2400
},
{
"epoch": 0.6313619378946013,
"grad_norm": 1.0189549922943115,
"learning_rate": 1.7476288659793815e-05,
"loss": 2.079,
"step": 2450
},
{
"epoch": 0.6442468754026544,
"grad_norm": 0.935614824295044,
"learning_rate": 1.742474226804124e-05,
"loss": 2.0944,
"step": 2500
},
{
"epoch": 0.6571318129107074,
"grad_norm": 0.9308194518089294,
"learning_rate": 1.7373195876288664e-05,
"loss": 2.0767,
"step": 2550
},
{
"epoch": 0.6700167504187605,
"grad_norm": 0.9042763113975525,
"learning_rate": 1.7321649484536084e-05,
"loss": 2.0909,
"step": 2600
},
{
"epoch": 0.6829016879268136,
"grad_norm": 0.9609789252281189,
"learning_rate": 1.7270103092783505e-05,
"loss": 2.1105,
"step": 2650
},
{
"epoch": 0.6957866254348667,
"grad_norm": 1.844524621963501,
"learning_rate": 1.721855670103093e-05,
"loss": 2.1664,
"step": 2700
},
{
"epoch": 0.7086715629429198,
"grad_norm": 1.3245840072631836,
"learning_rate": 1.716701030927835e-05,
"loss": 2.0887,
"step": 2750
},
{
"epoch": 0.7215565004509729,
"grad_norm": 0.9674375057220459,
"learning_rate": 1.7115463917525775e-05,
"loss": 2.0579,
"step": 2800
},
{
"epoch": 0.734441437959026,
"grad_norm": 1.1117796897888184,
"learning_rate": 1.7063917525773196e-05,
"loss": 2.0793,
"step": 2850
},
{
"epoch": 0.747326375467079,
"grad_norm": 1.099692702293396,
"learning_rate": 1.701237113402062e-05,
"loss": 2.0568,
"step": 2900
},
{
"epoch": 0.7602113129751321,
"grad_norm": 1.1181882619857788,
"learning_rate": 1.6961855670103094e-05,
"loss": 2.1385,
"step": 2950
},
{
"epoch": 0.7730962504831852,
"grad_norm": 1.0414690971374512,
"learning_rate": 1.6911340206185568e-05,
"loss": 2.0686,
"step": 3000
},
{
"epoch": 0.7859811879912383,
"grad_norm": 0.9875026345252991,
"learning_rate": 1.6859793814432992e-05,
"loss": 2.0503,
"step": 3050
},
{
"epoch": 0.7988661254992914,
"grad_norm": 1.0894653797149658,
"learning_rate": 1.6808247422680413e-05,
"loss": 2.0283,
"step": 3100
},
{
"epoch": 0.8117510630073445,
"grad_norm": 0.9688855409622192,
"learning_rate": 1.6756701030927837e-05,
"loss": 2.0442,
"step": 3150
},
{
"epoch": 0.8246360005153975,
"grad_norm": 0.8581517338752747,
"learning_rate": 1.6705154639175258e-05,
"loss": 2.0938,
"step": 3200
},
{
"epoch": 0.8375209380234506,
"grad_norm": 1.0975722074508667,
"learning_rate": 1.6653608247422682e-05,
"loss": 2.0493,
"step": 3250
},
{
"epoch": 0.8504058755315037,
"grad_norm": 0.9611416459083557,
"learning_rate": 1.6602061855670103e-05,
"loss": 2.0434,
"step": 3300
},
{
"epoch": 0.8632908130395568,
"grad_norm": 0.9956973195075989,
"learning_rate": 1.6550515463917527e-05,
"loss": 2.0556,
"step": 3350
},
{
"epoch": 0.8761757505476099,
"grad_norm": 1.0307831764221191,
"learning_rate": 1.6498969072164948e-05,
"loss": 2.0353,
"step": 3400
},
{
"epoch": 0.889060688055663,
"grad_norm": 0.9811009168624878,
"learning_rate": 1.6447422680412372e-05,
"loss": 2.0574,
"step": 3450
},
{
"epoch": 0.901945625563716,
"grad_norm": 1.0394349098205566,
"learning_rate": 1.6395876288659797e-05,
"loss": 2.0544,
"step": 3500
},
{
"epoch": 0.9148305630717691,
"grad_norm": 0.9915798902511597,
"learning_rate": 1.6344329896907218e-05,
"loss": 2.0685,
"step": 3550
},
{
"epoch": 0.9277155005798222,
"grad_norm": 0.8833404183387756,
"learning_rate": 1.6292783505154642e-05,
"loss": 2.0734,
"step": 3600
},
{
"epoch": 0.9406004380878753,
"grad_norm": 0.9089716672897339,
"learning_rate": 1.6241237113402063e-05,
"loss": 2.0531,
"step": 3650
},
{
"epoch": 0.9534853755959284,
"grad_norm": 0.9168672561645508,
"learning_rate": 1.6189690721649487e-05,
"loss": 2.008,
"step": 3700
},
{
"epoch": 0.9663703131039815,
"grad_norm": 0.9824495911598206,
"learning_rate": 1.6138144329896908e-05,
"loss": 2.0488,
"step": 3750
},
{
"epoch": 0.9792552506120346,
"grad_norm": 1.2295233011245728,
"learning_rate": 1.6086597938144332e-05,
"loss": 2.0184,
"step": 3800
},
{
"epoch": 0.9921401881200876,
"grad_norm": 0.9734981656074524,
"learning_rate": 1.6035051546391753e-05,
"loss": 2.0659,
"step": 3850
},
{
"epoch": 0.9998711506249195,
"eval_bleu": 18.1461,
"eval_gen_len": 45.4751,
"eval_loss": 1.7681266069412231,
"eval_runtime": 2364.104,
"eval_samples_per_second": 6.565,
"eval_steps_per_second": 0.41,
"step": 3880
},
{
"epoch": 1.0050251256281406,
"grad_norm": 1.2103891372680664,
"learning_rate": 1.5983505154639177e-05,
"loss": 2.0579,
"step": 3900
},
{
"epoch": 1.0179100631361937,
"grad_norm": 0.983440637588501,
"learning_rate": 1.5931958762886598e-05,
"loss": 1.9984,
"step": 3950
},
{
"epoch": 1.0307950006442468,
"grad_norm": 1.0848294496536255,
"learning_rate": 1.5880412371134022e-05,
"loss": 2.0428,
"step": 4000
},
{
"epoch": 1.0436799381522999,
"grad_norm": 1.143467903137207,
"learning_rate": 1.5828865979381443e-05,
"loss": 2.0236,
"step": 4050
},
{
"epoch": 1.056564875660353,
"grad_norm": 0.9652382731437683,
"learning_rate": 1.5777319587628867e-05,
"loss": 2.0219,
"step": 4100
},
{
"epoch": 1.069449813168406,
"grad_norm": 1.0381783246994019,
"learning_rate": 1.5725773195876292e-05,
"loss": 2.0466,
"step": 4150
},
{
"epoch": 1.0823347506764591,
"grad_norm": 1.0150736570358276,
"learning_rate": 1.5674226804123713e-05,
"loss": 1.9878,
"step": 4200
},
{
"epoch": 1.0952196881845122,
"grad_norm": 1.0291893482208252,
"learning_rate": 1.5622680412371137e-05,
"loss": 2.0637,
"step": 4250
},
{
"epoch": 1.1081046256925653,
"grad_norm": 1.5909788608551025,
"learning_rate": 1.5571134020618558e-05,
"loss": 1.9356,
"step": 4300
},
{
"epoch": 1.1209895632006184,
"grad_norm": 1.1417341232299805,
"learning_rate": 1.551958762886598e-05,
"loss": 2.0089,
"step": 4350
},
{
"epoch": 1.1338745007086715,
"grad_norm": 1.0295405387878418,
"learning_rate": 1.5468041237113403e-05,
"loss": 2.0486,
"step": 4400
},
{
"epoch": 1.1467594382167245,
"grad_norm": 1.1314431428909302,
"learning_rate": 1.5416494845360827e-05,
"loss": 1.9785,
"step": 4450
},
{
"epoch": 1.1596443757247776,
"grad_norm": 0.9702105522155762,
"learning_rate": 1.5364948453608248e-05,
"loss": 1.9474,
"step": 4500
},
{
"epoch": 1.1725293132328307,
"grad_norm": 1.9038368463516235,
"learning_rate": 1.5313402061855672e-05,
"loss": 2.0264,
"step": 4550
},
{
"epoch": 1.1854142507408838,
"grad_norm": 1.0110225677490234,
"learning_rate": 1.5261855670103093e-05,
"loss": 2.0689,
"step": 4600
},
{
"epoch": 1.1982991882489369,
"grad_norm": 0.9490695595741272,
"learning_rate": 1.5210309278350517e-05,
"loss": 1.9998,
"step": 4650
},
{
"epoch": 1.21118412575699,
"grad_norm": 1.0398602485656738,
"learning_rate": 1.515876288659794e-05,
"loss": 1.9622,
"step": 4700
},
{
"epoch": 1.224069063265043,
"grad_norm": 1.088680624961853,
"learning_rate": 1.5107216494845362e-05,
"loss": 2.0096,
"step": 4750
},
{
"epoch": 1.2369540007730961,
"grad_norm": 1.0555858612060547,
"learning_rate": 1.5055670103092785e-05,
"loss": 1.9494,
"step": 4800
},
{
"epoch": 1.2498389382811492,
"grad_norm": 1.1039170026779175,
"learning_rate": 1.5004123711340208e-05,
"loss": 2.0132,
"step": 4850
},
{
"epoch": 1.2627238757892023,
"grad_norm": 1.069201946258545,
"learning_rate": 1.4952577319587632e-05,
"loss": 2.043,
"step": 4900
},
{
"epoch": 1.2756088132972554,
"grad_norm": 17.700525283813477,
"learning_rate": 1.4901030927835051e-05,
"loss": 1.9629,
"step": 4950
},
{
"epoch": 1.2884937508053085,
"grad_norm": 0.9558641314506531,
"learning_rate": 1.4849484536082475e-05,
"loss": 1.9488,
"step": 5000
},
{
"epoch": 1.3013786883133616,
"grad_norm": 1.0301564931869507,
"learning_rate": 1.4797938144329898e-05,
"loss": 2.0172,
"step": 5050
},
{
"epoch": 1.3142636258214146,
"grad_norm": 1.017805814743042,
"learning_rate": 1.474639175257732e-05,
"loss": 2.04,
"step": 5100
},
{
"epoch": 1.3271485633294677,
"grad_norm": 0.8880634903907776,
"learning_rate": 1.4694845360824743e-05,
"loss": 2.0109,
"step": 5150
},
{
"epoch": 1.3400335008375208,
"grad_norm": 0.9009851813316345,
"learning_rate": 1.4643298969072166e-05,
"loss": 1.9195,
"step": 5200
},
{
"epoch": 1.352918438345574,
"grad_norm": 0.9800803661346436,
"learning_rate": 1.459175257731959e-05,
"loss": 2.006,
"step": 5250
},
{
"epoch": 1.365803375853627,
"grad_norm": 0.9194086194038391,
"learning_rate": 1.4540206185567012e-05,
"loss": 1.927,
"step": 5300
},
{
"epoch": 1.37868831336168,
"grad_norm": 1.1191316843032837,
"learning_rate": 1.4488659793814435e-05,
"loss": 2.0157,
"step": 5350
},
{
"epoch": 1.3915732508697332,
"grad_norm": 1.159419298171997,
"learning_rate": 1.4437113402061857e-05,
"loss": 1.9806,
"step": 5400
},
{
"epoch": 1.4044581883777862,
"grad_norm": 1.0512197017669678,
"learning_rate": 1.438556701030928e-05,
"loss": 1.9496,
"step": 5450
},
{
"epoch": 1.4173431258858393,
"grad_norm": 1.125361442565918,
"learning_rate": 1.4334020618556703e-05,
"loss": 1.9632,
"step": 5500
},
{
"epoch": 1.4302280633938924,
"grad_norm": 1.0887775421142578,
"learning_rate": 1.4282474226804123e-05,
"loss": 1.928,
"step": 5550
},
{
"epoch": 1.4431130009019455,
"grad_norm": 1.0122510194778442,
"learning_rate": 1.4230927835051546e-05,
"loss": 1.9677,
"step": 5600
},
{
"epoch": 1.4559979384099986,
"grad_norm": 0.9938000440597534,
"learning_rate": 1.417938144329897e-05,
"loss": 1.9492,
"step": 5650
},
{
"epoch": 1.4688828759180517,
"grad_norm": 1.1632938385009766,
"learning_rate": 1.4127835051546393e-05,
"loss": 1.9629,
"step": 5700
},
{
"epoch": 1.4817678134261048,
"grad_norm": 0.9199478626251221,
"learning_rate": 1.4076288659793815e-05,
"loss": 1.999,
"step": 5750
},
{
"epoch": 1.4946527509341578,
"grad_norm": 1.1851013898849487,
"learning_rate": 1.4024742268041238e-05,
"loss": 1.964,
"step": 5800
},
{
"epoch": 1.507537688442211,
"grad_norm": 1.541742205619812,
"learning_rate": 1.397319587628866e-05,
"loss": 2.0018,
"step": 5850
},
{
"epoch": 1.520422625950264,
"grad_norm": 1.3189605474472046,
"learning_rate": 1.3921649484536083e-05,
"loss": 1.9245,
"step": 5900
},
{
"epoch": 1.533307563458317,
"grad_norm": 0.995586097240448,
"learning_rate": 1.3870103092783507e-05,
"loss": 2.0303,
"step": 5950
},
{
"epoch": 1.5461925009663702,
"grad_norm": 1.0431631803512573,
"learning_rate": 1.381855670103093e-05,
"loss": 1.9044,
"step": 6000
},
{
"epoch": 1.5590774384744233,
"grad_norm": 1.0870169401168823,
"learning_rate": 1.3767010309278352e-05,
"loss": 2.0023,
"step": 6050
},
{
"epoch": 1.5719623759824763,
"grad_norm": 0.921909511089325,
"learning_rate": 1.3715463917525775e-05,
"loss": 1.9186,
"step": 6100
},
{
"epoch": 1.5848473134905294,
"grad_norm": 1.1961994171142578,
"learning_rate": 1.3663917525773196e-05,
"loss": 1.9109,
"step": 6150
},
{
"epoch": 1.5977322509985825,
"grad_norm": 1.0308939218521118,
"learning_rate": 1.3612371134020618e-05,
"loss": 1.9146,
"step": 6200
},
{
"epoch": 1.6106171885066356,
"grad_norm": 2.139348030090332,
"learning_rate": 1.3560824742268041e-05,
"loss": 1.9541,
"step": 6250
},
{
"epoch": 1.6235021260146887,
"grad_norm": 1.0335361957550049,
"learning_rate": 1.3509278350515465e-05,
"loss": 2.0021,
"step": 6300
},
{
"epoch": 1.6363870635227418,
"grad_norm": 1.0377309322357178,
"learning_rate": 1.3457731958762888e-05,
"loss": 1.9546,
"step": 6350
},
{
"epoch": 1.6492720010307949,
"grad_norm": 0.9605665802955627,
"learning_rate": 1.340618556701031e-05,
"loss": 1.9477,
"step": 6400
},
{
"epoch": 1.662156938538848,
"grad_norm": 1.1977301836013794,
"learning_rate": 1.3354639175257733e-05,
"loss": 1.928,
"step": 6450
},
{
"epoch": 1.675041876046901,
"grad_norm": 0.9851065874099731,
"learning_rate": 1.3303092783505156e-05,
"loss": 1.9224,
"step": 6500
},
{
"epoch": 1.687926813554954,
"grad_norm": 0.8746098875999451,
"learning_rate": 1.3251546391752578e-05,
"loss": 1.9798,
"step": 6550
},
{
"epoch": 1.7008117510630072,
"grad_norm": 0.9900497794151306,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.9244,
"step": 6600
},
{
"epoch": 1.7136966885710603,
"grad_norm": 0.9657949805259705,
"learning_rate": 1.3148453608247425e-05,
"loss": 1.9565,
"step": 6650
},
{
"epoch": 1.7265816260791134,
"grad_norm": 1.1366913318634033,
"learning_rate": 1.3096907216494847e-05,
"loss": 1.9724,
"step": 6700
},
{
"epoch": 1.7394665635871664,
"grad_norm": 1.15602445602417,
"learning_rate": 1.3045360824742268e-05,
"loss": 1.927,
"step": 6750
},
{
"epoch": 1.7523515010952195,
"grad_norm": 0.9413411021232605,
"learning_rate": 1.2993814432989691e-05,
"loss": 1.9347,
"step": 6800
},
{
"epoch": 1.7652364386032726,
"grad_norm": 0.9607951641082764,
"learning_rate": 1.2942268041237113e-05,
"loss": 1.9495,
"step": 6850
},
{
"epoch": 1.7781213761113257,
"grad_norm": 1.155685305595398,
"learning_rate": 1.2890721649484536e-05,
"loss": 1.9498,
"step": 6900
},
{
"epoch": 1.7910063136193788,
"grad_norm": 1.8821039199829102,
"learning_rate": 1.283917525773196e-05,
"loss": 1.9411,
"step": 6950
},
{
"epoch": 1.8038912511274319,
"grad_norm": 1.2264201641082764,
"learning_rate": 1.2787628865979383e-05,
"loss": 1.9488,
"step": 7000
},
{
"epoch": 1.816776188635485,
"grad_norm": 0.9997029304504395,
"learning_rate": 1.2736082474226805e-05,
"loss": 1.9162,
"step": 7050
},
{
"epoch": 1.829661126143538,
"grad_norm": 1.1943738460540771,
"learning_rate": 1.2684536082474228e-05,
"loss": 1.961,
"step": 7100
},
{
"epoch": 1.8425460636515911,
"grad_norm": 1.1875113248825073,
"learning_rate": 1.263298969072165e-05,
"loss": 1.9256,
"step": 7150
},
{
"epoch": 1.8554310011596442,
"grad_norm": 1.0550329685211182,
"learning_rate": 1.2581443298969073e-05,
"loss": 1.9519,
"step": 7200
},
{
"epoch": 1.8683159386676973,
"grad_norm": 1.3292375802993774,
"learning_rate": 1.2529896907216497e-05,
"loss": 1.9341,
"step": 7250
},
{
"epoch": 1.8812008761757506,
"grad_norm": 1.0914188623428345,
"learning_rate": 1.247835051546392e-05,
"loss": 1.894,
"step": 7300
},
{
"epoch": 1.8940858136838037,
"grad_norm": 1.1687994003295898,
"learning_rate": 1.242680412371134e-05,
"loss": 1.9044,
"step": 7350
},
{
"epoch": 1.9069707511918568,
"grad_norm": 1.0040736198425293,
"learning_rate": 1.2376288659793816e-05,
"loss": 1.919,
"step": 7400
},
{
"epoch": 1.9198556886999099,
"grad_norm": 1.0108208656311035,
"learning_rate": 1.2324742268041239e-05,
"loss": 1.8785,
"step": 7450
},
{
"epoch": 1.932740626207963,
"grad_norm": 1.0039801597595215,
"learning_rate": 1.2273195876288662e-05,
"loss": 1.8761,
"step": 7500
},
{
"epoch": 1.945625563716016,
"grad_norm": 1.0580838918685913,
"learning_rate": 1.2221649484536084e-05,
"loss": 1.8699,
"step": 7550
},
{
"epoch": 1.9585105012240691,
"grad_norm": 1.1629561185836792,
"learning_rate": 1.2170103092783505e-05,
"loss": 1.927,
"step": 7600
},
{
"epoch": 1.9713954387321222,
"grad_norm": 0.908470094203949,
"learning_rate": 1.2118556701030928e-05,
"loss": 1.9778,
"step": 7650
},
{
"epoch": 1.9842803762401753,
"grad_norm": 1.1411256790161133,
"learning_rate": 1.206701030927835e-05,
"loss": 1.9035,
"step": 7700
},
{
"epoch": 1.9971653137482284,
"grad_norm": 0.9729508757591248,
"learning_rate": 1.2015463917525774e-05,
"loss": 1.9071,
"step": 7750
},
{
"epoch": 2.0,
"eval_bleu": 19.4026,
"eval_gen_len": 45.2194,
"eval_loss": 1.662958025932312,
"eval_runtime": 2310.4182,
"eval_samples_per_second": 6.717,
"eval_steps_per_second": 0.42,
"step": 7761
},
{
"epoch": 2.0100502512562812,
"grad_norm": 1.2045557498931885,
"learning_rate": 1.1963917525773197e-05,
"loss": 1.8937,
"step": 7800
},
{
"epoch": 2.0229351887643343,
"grad_norm": 0.9809572696685791,
"learning_rate": 1.191237113402062e-05,
"loss": 1.9757,
"step": 7850
},
{
"epoch": 2.0358201262723874,
"grad_norm": 1.0871580839157104,
"learning_rate": 1.1860824742268042e-05,
"loss": 1.9366,
"step": 7900
},
{
"epoch": 2.0487050637804405,
"grad_norm": 1.1801034212112427,
"learning_rate": 1.1809278350515465e-05,
"loss": 1.92,
"step": 7950
},
{
"epoch": 2.0615900012884936,
"grad_norm": 1.118897557258606,
"learning_rate": 1.1757731958762887e-05,
"loss": 1.9593,
"step": 8000
},
{
"epoch": 2.0744749387965467,
"grad_norm": 1.1403993368148804,
"learning_rate": 1.1706185567010311e-05,
"loss": 1.8699,
"step": 8050
},
{
"epoch": 2.0873598763045997,
"grad_norm": 3.7084872722625732,
"learning_rate": 1.1654639175257734e-05,
"loss": 1.8686,
"step": 8100
},
{
"epoch": 2.100244813812653,
"grad_norm": 1.135362982749939,
"learning_rate": 1.1603092783505157e-05,
"loss": 1.9068,
"step": 8150
},
{
"epoch": 2.113129751320706,
"grad_norm": 1.5067939758300781,
"learning_rate": 1.1551546391752577e-05,
"loss": 1.9196,
"step": 8200
},
{
"epoch": 2.126014688828759,
"grad_norm": 1.0155831575393677,
"learning_rate": 1.15e-05,
"loss": 1.8711,
"step": 8250
},
{
"epoch": 2.138899626336812,
"grad_norm": 1.2201752662658691,
"learning_rate": 1.1448453608247423e-05,
"loss": 1.8985,
"step": 8300
},
{
"epoch": 2.151784563844865,
"grad_norm": 1.0999071598052979,
"learning_rate": 1.1396907216494845e-05,
"loss": 1.8978,
"step": 8350
},
{
"epoch": 2.1646695013529182,
"grad_norm": 1.1638729572296143,
"learning_rate": 1.134536082474227e-05,
"loss": 1.8593,
"step": 8400
},
{
"epoch": 2.1775544388609713,
"grad_norm": 1.1784203052520752,
"learning_rate": 1.1293814432989692e-05,
"loss": 1.8663,
"step": 8450
},
{
"epoch": 2.1904393763690244,
"grad_norm": 1.026315450668335,
"learning_rate": 1.1242268041237115e-05,
"loss": 1.8285,
"step": 8500
},
{
"epoch": 2.2033243138770775,
"grad_norm": 0.9852938652038574,
"learning_rate": 1.1190721649484537e-05,
"loss": 1.9048,
"step": 8550
},
{
"epoch": 2.2162092513851306,
"grad_norm": 1.052303671836853,
"learning_rate": 1.113917525773196e-05,
"loss": 1.8808,
"step": 8600
},
{
"epoch": 2.2290941888931837,
"grad_norm": 5.098678112030029,
"learning_rate": 1.1087628865979382e-05,
"loss": 1.8679,
"step": 8650
},
{
"epoch": 2.2419791264012368,
"grad_norm": 1.1427991390228271,
"learning_rate": 1.1036082474226806e-05,
"loss": 1.9065,
"step": 8700
},
{
"epoch": 2.25486406390929,
"grad_norm": 3.338353395462036,
"learning_rate": 1.0984536082474229e-05,
"loss": 1.8915,
"step": 8750
},
{
"epoch": 2.267749001417343,
"grad_norm": 1.2049264907836914,
"learning_rate": 1.093298969072165e-05,
"loss": 1.9018,
"step": 8800
},
{
"epoch": 2.280633938925396,
"grad_norm": 1.0751088857650757,
"learning_rate": 1.0881443298969072e-05,
"loss": 1.8914,
"step": 8850
},
{
"epoch": 2.293518876433449,
"grad_norm": 0.9149059653282166,
"learning_rate": 1.0829896907216495e-05,
"loss": 1.8717,
"step": 8900
},
{
"epoch": 2.306403813941502,
"grad_norm": 1.0154598951339722,
"learning_rate": 1.0778350515463918e-05,
"loss": 1.8979,
"step": 8950
},
{
"epoch": 2.3192887514495553,
"grad_norm": 1.198451042175293,
"learning_rate": 1.072680412371134e-05,
"loss": 1.8987,
"step": 9000
},
{
"epoch": 2.3321736889576083,
"grad_norm": 1.0957529544830322,
"learning_rate": 1.0675257731958764e-05,
"loss": 1.892,
"step": 9050
},
{
"epoch": 2.3450586264656614,
"grad_norm": 1.055180311203003,
"learning_rate": 1.0623711340206187e-05,
"loss": 1.8694,
"step": 9100
},
{
"epoch": 2.3579435639737145,
"grad_norm": 1.094000220298767,
"learning_rate": 1.057216494845361e-05,
"loss": 1.9105,
"step": 9150
},
{
"epoch": 2.3708285014817676,
"grad_norm": 1.0135473012924194,
"learning_rate": 1.0520618556701032e-05,
"loss": 1.892,
"step": 9200
},
{
"epoch": 2.3837134389898207,
"grad_norm": 1.2554734945297241,
"learning_rate": 1.0469072164948455e-05,
"loss": 1.9458,
"step": 9250
},
{
"epoch": 2.3965983764978738,
"grad_norm": 1.312153697013855,
"learning_rate": 1.0417525773195877e-05,
"loss": 1.9138,
"step": 9300
},
{
"epoch": 2.409483314005927,
"grad_norm": 1.0656461715698242,
"learning_rate": 1.0367010309278351e-05,
"loss": 1.931,
"step": 9350
},
{
"epoch": 2.42236825151398,
"grad_norm": 1.4363470077514648,
"learning_rate": 1.0315463917525774e-05,
"loss": 1.8815,
"step": 9400
},
{
"epoch": 2.435253189022033,
"grad_norm": 1.2165566682815552,
"learning_rate": 1.0263917525773196e-05,
"loss": 1.8774,
"step": 9450
},
{
"epoch": 2.448138126530086,
"grad_norm": 1.2879478931427002,
"learning_rate": 1.021237113402062e-05,
"loss": 1.9324,
"step": 9500
},
{
"epoch": 2.461023064038139,
"grad_norm": 1.1694004535675049,
"learning_rate": 1.0160824742268043e-05,
"loss": 1.8595,
"step": 9550
},
{
"epoch": 2.4739080015461923,
"grad_norm": 1.1013145446777344,
"learning_rate": 1.0109278350515466e-05,
"loss": 1.8523,
"step": 9600
},
{
"epoch": 2.4867929390542454,
"grad_norm": 1.0102790594100952,
"learning_rate": 1.0057731958762887e-05,
"loss": 1.8239,
"step": 9650
},
{
"epoch": 2.4996778765622985,
"grad_norm": 2.130802869796753,
"learning_rate": 1.0006185567010309e-05,
"loss": 1.9528,
"step": 9700
},
{
"epoch": 2.5125628140703515,
"grad_norm": 1.3339593410491943,
"learning_rate": 9.954639175257733e-06,
"loss": 1.8553,
"step": 9750
},
{
"epoch": 2.5254477515784046,
"grad_norm": 1.0675934553146362,
"learning_rate": 9.903092783505154e-06,
"loss": 1.8547,
"step": 9800
},
{
"epoch": 2.5383326890864577,
"grad_norm": 1.111695408821106,
"learning_rate": 9.851546391752578e-06,
"loss": 1.81,
"step": 9850
},
{
"epoch": 2.551217626594511,
"grad_norm": 1.3534823656082153,
"learning_rate": 9.800000000000001e-06,
"loss": 1.8789,
"step": 9900
},
{
"epoch": 2.564102564102564,
"grad_norm": 1.138906717300415,
"learning_rate": 9.748453608247424e-06,
"loss": 1.8355,
"step": 9950
},
{
"epoch": 2.576987501610617,
"grad_norm": 1.183947205543518,
"learning_rate": 9.696907216494846e-06,
"loss": 1.8563,
"step": 10000
},
{
"epoch": 2.58987243911867,
"grad_norm": 0.9370452165603638,
"learning_rate": 9.645360824742269e-06,
"loss": 1.8845,
"step": 10050
},
{
"epoch": 2.602757376626723,
"grad_norm": 1.1878234148025513,
"learning_rate": 9.593814432989691e-06,
"loss": 1.9203,
"step": 10100
},
{
"epoch": 2.615642314134776,
"grad_norm": 1.0847914218902588,
"learning_rate": 9.542268041237114e-06,
"loss": 1.8816,
"step": 10150
},
{
"epoch": 2.6285272516428293,
"grad_norm": 1.2127825021743774,
"learning_rate": 9.490721649484536e-06,
"loss": 1.899,
"step": 10200
},
{
"epoch": 2.6414121891508824,
"grad_norm": 1.0567888021469116,
"learning_rate": 9.439175257731959e-06,
"loss": 1.8433,
"step": 10250
},
{
"epoch": 2.6542971266589355,
"grad_norm": 1.1838716268539429,
"learning_rate": 9.387628865979383e-06,
"loss": 1.8931,
"step": 10300
},
{
"epoch": 2.6671820641669886,
"grad_norm": 1.1113821268081665,
"learning_rate": 9.336082474226806e-06,
"loss": 1.9041,
"step": 10350
},
{
"epoch": 2.6800670016750416,
"grad_norm": 1.538613200187683,
"learning_rate": 9.284536082474228e-06,
"loss": 1.9085,
"step": 10400
},
{
"epoch": 2.6929519391830947,
"grad_norm": 1.1364761590957642,
"learning_rate": 9.23298969072165e-06,
"loss": 1.8925,
"step": 10450
},
{
"epoch": 2.705836876691148,
"grad_norm": 1.1954172849655151,
"learning_rate": 9.181443298969073e-06,
"loss": 1.8608,
"step": 10500
},
{
"epoch": 2.718721814199201,
"grad_norm": 0.8984624147415161,
"learning_rate": 9.129896907216496e-06,
"loss": 1.823,
"step": 10550
},
{
"epoch": 2.731606751707254,
"grad_norm": 1.0665663480758667,
"learning_rate": 9.078350515463919e-06,
"loss": 1.9052,
"step": 10600
},
{
"epoch": 2.744491689215307,
"grad_norm": 1.2751344442367554,
"learning_rate": 9.026804123711341e-06,
"loss": 1.8573,
"step": 10650
},
{
"epoch": 2.75737662672336,
"grad_norm": 0.964619517326355,
"learning_rate": 8.975257731958764e-06,
"loss": 1.9581,
"step": 10700
},
{
"epoch": 2.7702615642314132,
"grad_norm": 1.1248286962509155,
"learning_rate": 8.923711340206186e-06,
"loss": 1.9004,
"step": 10750
},
{
"epoch": 2.7831465017394663,
"grad_norm": 1.2471715211868286,
"learning_rate": 8.872164948453609e-06,
"loss": 1.8969,
"step": 10800
},
{
"epoch": 2.7960314392475194,
"grad_norm": 1.3639956712722778,
"learning_rate": 8.820618556701031e-06,
"loss": 1.8493,
"step": 10850
},
{
"epoch": 2.8089163767555725,
"grad_norm": 1.1183199882507324,
"learning_rate": 8.769072164948454e-06,
"loss": 1.8888,
"step": 10900
},
{
"epoch": 2.8218013142636256,
"grad_norm": 1.3162132501602173,
"learning_rate": 8.717525773195877e-06,
"loss": 1.8567,
"step": 10950
},
{
"epoch": 2.8346862517716787,
"grad_norm": 1.2056224346160889,
"learning_rate": 8.6659793814433e-06,
"loss": 1.8687,
"step": 11000
},
{
"epoch": 2.8475711892797317,
"grad_norm": 1.285947322845459,
"learning_rate": 8.614432989690722e-06,
"loss": 1.8864,
"step": 11050
},
{
"epoch": 2.860456126787785,
"grad_norm": 1.292939305305481,
"learning_rate": 8.562886597938144e-06,
"loss": 1.9056,
"step": 11100
},
{
"epoch": 2.873341064295838,
"grad_norm": 1.2155085802078247,
"learning_rate": 8.511340206185568e-06,
"loss": 1.8699,
"step": 11150
},
{
"epoch": 2.886226001803891,
"grad_norm": 1.4173967838287354,
"learning_rate": 8.459793814432991e-06,
"loss": 1.8581,
"step": 11200
},
{
"epoch": 2.899110939311944,
"grad_norm": 1.0226136445999146,
"learning_rate": 8.408247422680414e-06,
"loss": 1.8491,
"step": 11250
},
{
"epoch": 2.911995876819997,
"grad_norm": 1.2074532508850098,
"learning_rate": 8.356701030927836e-06,
"loss": 1.8493,
"step": 11300
},
{
"epoch": 2.9248808143280502,
"grad_norm": 1.0812984704971313,
"learning_rate": 8.305154639175259e-06,
"loss": 1.8839,
"step": 11350
},
{
"epoch": 2.9377657518361033,
"grad_norm": 1.3052395582199097,
"learning_rate": 8.253608247422681e-06,
"loss": 1.869,
"step": 11400
},
{
"epoch": 2.9506506893441564,
"grad_norm": 1.0708857774734497,
"learning_rate": 8.202061855670104e-06,
"loss": 1.855,
"step": 11450
},
{
"epoch": 2.9635356268522095,
"grad_norm": 0.9860512614250183,
"learning_rate": 8.150515463917526e-06,
"loss": 1.8535,
"step": 11500
},
{
"epoch": 2.9764205643602626,
"grad_norm": 0.9245162010192871,
"learning_rate": 8.098969072164949e-06,
"loss": 1.8647,
"step": 11550
},
{
"epoch": 2.9893055018683157,
"grad_norm": 1.1266101598739624,
"learning_rate": 8.047422680412372e-06,
"loss": 1.8646,
"step": 11600
},
{
"epoch": 2.9998711506249194,
"eval_bleu": 19.9649,
"eval_gen_len": 45.274,
"eval_loss": 1.6131339073181152,
"eval_runtime": 2307.2556,
"eval_samples_per_second": 6.727,
"eval_steps_per_second": 0.42,
"step": 11641
},
{
"epoch": 3.002190439376369,
"grad_norm": 0.924880862236023,
"learning_rate": 7.995876288659794e-06,
"loss": 1.8485,
"step": 11650
},
{
"epoch": 3.0150753768844223,
"grad_norm": 1.0113394260406494,
"learning_rate": 7.944329896907217e-06,
"loss": 1.8349,
"step": 11700
},
{
"epoch": 3.0279603143924754,
"grad_norm": 1.1226181983947754,
"learning_rate": 7.89278350515464e-06,
"loss": 1.8707,
"step": 11750
},
{
"epoch": 3.0408452519005285,
"grad_norm": 1.0973234176635742,
"learning_rate": 7.841237113402062e-06,
"loss": 1.8104,
"step": 11800
},
{
"epoch": 3.0537301894085815,
"grad_norm": 1.2233961820602417,
"learning_rate": 7.789690721649486e-06,
"loss": 1.8668,
"step": 11850
},
{
"epoch": 3.0666151269166346,
"grad_norm": 1.1300643682479858,
"learning_rate": 7.738144329896909e-06,
"loss": 1.8661,
"step": 11900
},
{
"epoch": 3.0795000644246877,
"grad_norm": 1.1732138395309448,
"learning_rate": 7.68659793814433e-06,
"loss": 1.8213,
"step": 11950
},
{
"epoch": 3.092385001932741,
"grad_norm": 1.459231972694397,
"learning_rate": 7.635051546391754e-06,
"loss": 1.8533,
"step": 12000
},
{
"epoch": 3.105269939440794,
"grad_norm": 1.353126049041748,
"learning_rate": 7.5835051546391755e-06,
"loss": 1.8838,
"step": 12050
},
{
"epoch": 3.118154876948847,
"grad_norm": 0.9796210527420044,
"learning_rate": 7.531958762886599e-06,
"loss": 1.8079,
"step": 12100
},
{
"epoch": 3.1310398144569,
"grad_norm": 1.1230041980743408,
"learning_rate": 7.4804123711340214e-06,
"loss": 1.9103,
"step": 12150
},
{
"epoch": 3.143924751964953,
"grad_norm": 1.3261069059371948,
"learning_rate": 7.428865979381444e-06,
"loss": 1.845,
"step": 12200
},
{
"epoch": 3.156809689473006,
"grad_norm": 1.0127289295196533,
"learning_rate": 7.377319587628866e-06,
"loss": 1.8408,
"step": 12250
},
{
"epoch": 3.1696946269810593,
"grad_norm": 1.1761748790740967,
"learning_rate": 7.325773195876289e-06,
"loss": 1.855,
"step": 12300
},
{
"epoch": 3.1825795644891124,
"grad_norm": 1.1443302631378174,
"learning_rate": 7.274226804123712e-06,
"loss": 1.8234,
"step": 12350
},
{
"epoch": 3.1954645019971655,
"grad_norm": 1.1420938968658447,
"learning_rate": 7.222680412371135e-06,
"loss": 1.9063,
"step": 12400
},
{
"epoch": 3.2083494395052186,
"grad_norm": 0.9729594588279724,
"learning_rate": 7.171134020618558e-06,
"loss": 1.9181,
"step": 12450
},
{
"epoch": 3.2212343770132716,
"grad_norm": 1.2192091941833496,
"learning_rate": 7.11958762886598e-06,
"loss": 1.9125,
"step": 12500
},
{
"epoch": 3.2341193145213247,
"grad_norm": 1.335284948348999,
"learning_rate": 7.068041237113402e-06,
"loss": 1.8348,
"step": 12550
},
{
"epoch": 3.247004252029378,
"grad_norm": 1.5923230648040771,
"learning_rate": 7.016494845360825e-06,
"loss": 1.8523,
"step": 12600
},
{
"epoch": 3.259889189537431,
"grad_norm": 1.3718382120132446,
"learning_rate": 6.964948453608248e-06,
"loss": 1.8404,
"step": 12650
},
{
"epoch": 3.272774127045484,
"grad_norm": 1.3347010612487793,
"learning_rate": 6.9134020618556705e-06,
"loss": 1.852,
"step": 12700
},
{
"epoch": 3.285659064553537,
"grad_norm": 1.3411351442337036,
"learning_rate": 6.861855670103094e-06,
"loss": 1.8505,
"step": 12750
},
{
"epoch": 3.29854400206159,
"grad_norm": 1.2156487703323364,
"learning_rate": 6.8103092783505165e-06,
"loss": 1.8589,
"step": 12800
},
{
"epoch": 3.3114289395696432,
"grad_norm": 1.1836252212524414,
"learning_rate": 6.758762886597938e-06,
"loss": 1.8518,
"step": 12850
},
{
"epoch": 3.3243138770776963,
"grad_norm": 1.3949558734893799,
"learning_rate": 6.707216494845361e-06,
"loss": 1.8459,
"step": 12900
},
{
"epoch": 3.3371988145857494,
"grad_norm": 1.0333205461502075,
"learning_rate": 6.655670103092784e-06,
"loss": 1.86,
"step": 12950
},
{
"epoch": 3.3500837520938025,
"grad_norm": 1.0828937292099,
"learning_rate": 6.604123711340207e-06,
"loss": 1.8463,
"step": 13000
},
{
"epoch": 3.3629686896018556,
"grad_norm": 1.1059962511062622,
"learning_rate": 6.552577319587629e-06,
"loss": 1.8608,
"step": 13050
},
{
"epoch": 3.3758536271099087,
"grad_norm": 1.025884747505188,
"learning_rate": 6.501030927835053e-06,
"loss": 1.8371,
"step": 13100
},
{
"epoch": 3.3887385646179617,
"grad_norm": 1.0845879316329956,
"learning_rate": 6.449484536082474e-06,
"loss": 1.8205,
"step": 13150
},
{
"epoch": 3.401623502126015,
"grad_norm": 0.9505090713500977,
"learning_rate": 6.397938144329897e-06,
"loss": 1.8467,
"step": 13200
},
{
"epoch": 3.414508439634068,
"grad_norm": 1.1278256177902222,
"learning_rate": 6.34639175257732e-06,
"loss": 1.8406,
"step": 13250
},
{
"epoch": 3.427393377142121,
"grad_norm": 1.0838017463684082,
"learning_rate": 6.294845360824743e-06,
"loss": 1.8284,
"step": 13300
},
{
"epoch": 3.440278314650174,
"grad_norm": 1.212337851524353,
"learning_rate": 6.2432989690721655e-06,
"loss": 1.8202,
"step": 13350
},
{
"epoch": 3.453163252158227,
"grad_norm": 1.0882956981658936,
"learning_rate": 6.191752577319589e-06,
"loss": 1.8493,
"step": 13400
},
{
"epoch": 3.4660481896662803,
"grad_norm": 1.213768482208252,
"learning_rate": 6.140206185567011e-06,
"loss": 1.8518,
"step": 13450
},
{
"epoch": 3.4789331271743333,
"grad_norm": 1.2855191230773926,
"learning_rate": 6.088659793814433e-06,
"loss": 1.8574,
"step": 13500
},
{
"epoch": 3.4918180646823864,
"grad_norm": 1.0810940265655518,
"learning_rate": 6.037113402061856e-06,
"loss": 1.8766,
"step": 13550
},
{
"epoch": 3.5047030021904395,
"grad_norm": 1.15432608127594,
"learning_rate": 5.985567010309279e-06,
"loss": 1.836,
"step": 13600
},
{
"epoch": 3.5175879396984926,
"grad_norm": 1.468928337097168,
"learning_rate": 5.934020618556702e-06,
"loss": 1.8165,
"step": 13650
},
{
"epoch": 3.5304728772065457,
"grad_norm": 1.6314187049865723,
"learning_rate": 5.882474226804124e-06,
"loss": 1.8725,
"step": 13700
},
{
"epoch": 3.5433578147145988,
"grad_norm": 1.1987876892089844,
"learning_rate": 5.830927835051546e-06,
"loss": 1.8486,
"step": 13750
},
{
"epoch": 3.556242752222652,
"grad_norm": 1.1263744831085205,
"learning_rate": 5.779381443298969e-06,
"loss": 1.8739,
"step": 13800
},
{
"epoch": 3.569127689730705,
"grad_norm": 1.2357795238494873,
"learning_rate": 5.727835051546392e-06,
"loss": 1.848,
"step": 13850
},
{
"epoch": 3.582012627238758,
"grad_norm": 1.228352427482605,
"learning_rate": 5.6762886597938145e-06,
"loss": 1.8785,
"step": 13900
},
{
"epoch": 3.594897564746811,
"grad_norm": 1.0710021257400513,
"learning_rate": 5.624742268041238e-06,
"loss": 1.8108,
"step": 13950
},
{
"epoch": 3.607782502254864,
"grad_norm": 0.9839572906494141,
"learning_rate": 5.5731958762886605e-06,
"loss": 1.8779,
"step": 14000
},
{
"epoch": 3.6206674397629173,
"grad_norm": 1.1732807159423828,
"learning_rate": 5.521649484536082e-06,
"loss": 1.8692,
"step": 14050
},
{
"epoch": 3.6335523772709704,
"grad_norm": 1.0930730104446411,
"learning_rate": 5.470103092783506e-06,
"loss": 1.7912,
"step": 14100
},
{
"epoch": 3.6464373147790234,
"grad_norm": 1.1306408643722534,
"learning_rate": 5.418556701030928e-06,
"loss": 1.7992,
"step": 14150
},
{
"epoch": 3.6593222522870765,
"grad_norm": 1.171573281288147,
"learning_rate": 5.367010309278351e-06,
"loss": 1.8286,
"step": 14200
},
{
"epoch": 3.6722071897951296,
"grad_norm": 1.033572793006897,
"learning_rate": 5.315463917525774e-06,
"loss": 1.8168,
"step": 14250
},
{
"epoch": 3.6850921273031827,
"grad_norm": 1.109149694442749,
"learning_rate": 5.263917525773197e-06,
"loss": 1.8229,
"step": 14300
},
{
"epoch": 3.697977064811236,
"grad_norm": 1.085472822189331,
"learning_rate": 5.2123711340206184e-06,
"loss": 1.7883,
"step": 14350
},
{
"epoch": 3.710862002319289,
"grad_norm": 1.0914117097854614,
"learning_rate": 5.160824742268041e-06,
"loss": 1.8375,
"step": 14400
},
{
"epoch": 3.723746939827342,
"grad_norm": 1.3772042989730835,
"learning_rate": 5.110309278350516e-06,
"loss": 1.8259,
"step": 14450
},
{
"epoch": 3.736631877335395,
"grad_norm": 0.9119631052017212,
"learning_rate": 5.058762886597939e-06,
"loss": 1.8679,
"step": 14500
},
{
"epoch": 3.749516814843448,
"grad_norm": 1.1717164516448975,
"learning_rate": 5.007216494845362e-06,
"loss": 1.8524,
"step": 14550
},
{
"epoch": 3.762401752351501,
"grad_norm": 1.131783127784729,
"learning_rate": 4.955670103092784e-06,
"loss": 1.8081,
"step": 14600
},
{
"epoch": 3.7752866898595543,
"grad_norm": 1.1898800134658813,
"learning_rate": 4.904123711340207e-06,
"loss": 1.8172,
"step": 14650
},
{
"epoch": 3.7881716273676074,
"grad_norm": 1.0781954526901245,
"learning_rate": 4.8525773195876294e-06,
"loss": 1.8365,
"step": 14700
},
{
"epoch": 3.8010565648756605,
"grad_norm": 1.1128448247909546,
"learning_rate": 4.801030927835052e-06,
"loss": 1.8904,
"step": 14750
},
{
"epoch": 3.8139415023837135,
"grad_norm": 1.0720164775848389,
"learning_rate": 4.7494845360824746e-06,
"loss": 1.8521,
"step": 14800
},
{
"epoch": 3.8268264398917666,
"grad_norm": 1.0853550434112549,
"learning_rate": 4.697938144329897e-06,
"loss": 1.7686,
"step": 14850
},
{
"epoch": 3.8397113773998197,
"grad_norm": 0.9527387619018555,
"learning_rate": 4.64639175257732e-06,
"loss": 1.8529,
"step": 14900
},
{
"epoch": 3.852596314907873,
"grad_norm": 1.3065271377563477,
"learning_rate": 4.594845360824743e-06,
"loss": 1.8569,
"step": 14950
},
{
"epoch": 3.865481252415926,
"grad_norm": 1.2607804536819458,
"learning_rate": 4.543298969072165e-06,
"loss": 1.8417,
"step": 15000
},
{
"epoch": 3.878366189923979,
"grad_norm": 1.089626669883728,
"learning_rate": 4.491752577319588e-06,
"loss": 1.8491,
"step": 15050
},
{
"epoch": 3.891251127432032,
"grad_norm": 1.2275793552398682,
"learning_rate": 4.440206185567011e-06,
"loss": 1.8475,
"step": 15100
},
{
"epoch": 3.904136064940085,
"grad_norm": 1.2494066953659058,
"learning_rate": 4.388659793814433e-06,
"loss": 1.8095,
"step": 15150
},
{
"epoch": 3.9170210024481382,
"grad_norm": 1.0344122648239136,
"learning_rate": 4.337113402061856e-06,
"loss": 1.8177,
"step": 15200
},
{
"epoch": 3.9299059399561913,
"grad_norm": 1.20706307888031,
"learning_rate": 4.285567010309279e-06,
"loss": 1.7988,
"step": 15250
},
{
"epoch": 3.9427908774642444,
"grad_norm": 1.0796010494232178,
"learning_rate": 4.234020618556701e-06,
"loss": 1.8232,
"step": 15300
},
{
"epoch": 3.9556758149722975,
"grad_norm": 1.2336502075195312,
"learning_rate": 4.1824742268041245e-06,
"loss": 1.8759,
"step": 15350
},
{
"epoch": 3.9685607524803506,
"grad_norm": 1.0533246994018555,
"learning_rate": 4.130927835051547e-06,
"loss": 1.8084,
"step": 15400
},
{
"epoch": 3.9814456899884036,
"grad_norm": 1.1837642192840576,
"learning_rate": 4.07938144329897e-06,
"loss": 1.8939,
"step": 15450
},
{
"epoch": 3.9943306274964567,
"grad_norm": 1.1679872274398804,
"learning_rate": 4.027835051546392e-06,
"loss": 1.8759,
"step": 15500
},
{
"epoch": 4.0,
"eval_bleu": 20.3549,
"eval_gen_len": 45.3627,
"eval_loss": 1.5858944654464722,
"eval_runtime": 2286.6294,
"eval_samples_per_second": 6.787,
"eval_steps_per_second": 0.424,
"step": 15522
},
{
"epoch": 4.007215565004509,
"grad_norm": 1.6103421449661255,
"learning_rate": 3.976288659793815e-06,
"loss": 1.8672,
"step": 15550
},
{
"epoch": 4.0201005025125625,
"grad_norm": 1.074686050415039,
"learning_rate": 3.924742268041237e-06,
"loss": 1.8157,
"step": 15600
},
{
"epoch": 4.0329854400206155,
"grad_norm": 1.11213219165802,
"learning_rate": 3.87319587628866e-06,
"loss": 1.8017,
"step": 15650
},
{
"epoch": 4.045870377528669,
"grad_norm": 1.1408663988113403,
"learning_rate": 3.821649484536083e-06,
"loss": 1.7926,
"step": 15700
},
{
"epoch": 4.058755315036722,
"grad_norm": 1.0430666208267212,
"learning_rate": 3.7701030927835054e-06,
"loss": 1.7889,
"step": 15750
},
{
"epoch": 4.071640252544775,
"grad_norm": 1.1866077184677124,
"learning_rate": 3.718556701030928e-06,
"loss": 1.7969,
"step": 15800
},
{
"epoch": 4.084525190052828,
"grad_norm": 1.362838625907898,
"learning_rate": 3.667010309278351e-06,
"loss": 1.8444,
"step": 15850
},
{
"epoch": 4.097410127560881,
"grad_norm": 1.0319502353668213,
"learning_rate": 3.6154639175257735e-06,
"loss": 1.8415,
"step": 15900
},
{
"epoch": 4.110295065068934,
"grad_norm": 1.0608545541763306,
"learning_rate": 3.563917525773196e-06,
"loss": 1.8614,
"step": 15950
},
{
"epoch": 4.123180002576987,
"grad_norm": 1.0609222650527954,
"learning_rate": 3.512371134020619e-06,
"loss": 1.8395,
"step": 16000
},
{
"epoch": 4.13606494008504,
"grad_norm": 1.0247282981872559,
"learning_rate": 3.460824742268041e-06,
"loss": 1.8515,
"step": 16050
},
{
"epoch": 4.148949877593093,
"grad_norm": 1.1988474130630493,
"learning_rate": 3.409278350515464e-06,
"loss": 1.8241,
"step": 16100
},
{
"epoch": 4.161834815101146,
"grad_norm": 1.164461374282837,
"learning_rate": 3.357731958762887e-06,
"loss": 1.8444,
"step": 16150
},
{
"epoch": 4.1747197526091995,
"grad_norm": 1.090458631515503,
"learning_rate": 3.3061855670103093e-06,
"loss": 1.844,
"step": 16200
},
{
"epoch": 4.187604690117253,
"grad_norm": 1.4075111150741577,
"learning_rate": 3.2546391752577323e-06,
"loss": 1.8042,
"step": 16250
},
{
"epoch": 4.200489627625306,
"grad_norm": 1.1348927021026611,
"learning_rate": 3.2030927835051553e-06,
"loss": 1.7964,
"step": 16300
},
{
"epoch": 4.213374565133359,
"grad_norm": 1.3190233707427979,
"learning_rate": 3.1515463917525774e-06,
"loss": 1.8265,
"step": 16350
},
{
"epoch": 4.226259502641412,
"grad_norm": 1.0522139072418213,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.8129,
"step": 16400
},
{
"epoch": 4.239144440149465,
"grad_norm": 1.0048468112945557,
"learning_rate": 3.048453608247423e-06,
"loss": 1.8452,
"step": 16450
},
{
"epoch": 4.252029377657518,
"grad_norm": 1.180903434753418,
"learning_rate": 2.9969072164948455e-06,
"loss": 1.8357,
"step": 16500
},
{
"epoch": 4.264914315165571,
"grad_norm": 1.297938585281372,
"learning_rate": 2.945360824742268e-06,
"loss": 1.8436,
"step": 16550
},
{
"epoch": 4.277799252673624,
"grad_norm": 1.0574970245361328,
"learning_rate": 2.893814432989691e-06,
"loss": 1.8493,
"step": 16600
},
{
"epoch": 4.290684190181677,
"grad_norm": 1.082375168800354,
"learning_rate": 2.8422680412371136e-06,
"loss": 1.8424,
"step": 16650
},
{
"epoch": 4.30356912768973,
"grad_norm": 0.9978700280189514,
"learning_rate": 2.790721649484536e-06,
"loss": 1.865,
"step": 16700
},
{
"epoch": 4.316454065197783,
"grad_norm": 1.1757255792617798,
"learning_rate": 2.739175257731959e-06,
"loss": 1.8743,
"step": 16750
},
{
"epoch": 4.3293390027058365,
"grad_norm": 1.1812617778778076,
"learning_rate": 2.6876288659793813e-06,
"loss": 1.8218,
"step": 16800
},
{
"epoch": 4.34222394021389,
"grad_norm": 1.126605749130249,
"learning_rate": 2.6360824742268043e-06,
"loss": 1.8278,
"step": 16850
},
{
"epoch": 4.355108877721943,
"grad_norm": 1.1985175609588623,
"learning_rate": 2.5845360824742273e-06,
"loss": 1.8033,
"step": 16900
},
{
"epoch": 4.367993815229996,
"grad_norm": 1.1603890657424927,
"learning_rate": 2.5329896907216494e-06,
"loss": 1.8361,
"step": 16950
},
{
"epoch": 4.380878752738049,
"grad_norm": 1.209686517715454,
"learning_rate": 2.4814432989690724e-06,
"loss": 1.8762,
"step": 17000
},
{
"epoch": 4.393763690246102,
"grad_norm": 1.105088233947754,
"learning_rate": 2.429896907216495e-06,
"loss": 1.8506,
"step": 17050
},
{
"epoch": 4.406648627754155,
"grad_norm": 1.1008309125900269,
"learning_rate": 2.378350515463918e-06,
"loss": 1.8256,
"step": 17100
},
{
"epoch": 4.419533565262208,
"grad_norm": 1.3243298530578613,
"learning_rate": 2.3268041237113405e-06,
"loss": 1.8009,
"step": 17150
},
{
"epoch": 4.432418502770261,
"grad_norm": 1.0257518291473389,
"learning_rate": 2.275257731958763e-06,
"loss": 1.8524,
"step": 17200
},
{
"epoch": 4.445303440278314,
"grad_norm": 0.9655742645263672,
"learning_rate": 2.2237113402061856e-06,
"loss": 1.8044,
"step": 17250
},
{
"epoch": 4.458188377786367,
"grad_norm": 1.0721204280853271,
"learning_rate": 2.172164948453608e-06,
"loss": 1.8242,
"step": 17300
},
{
"epoch": 4.47107331529442,
"grad_norm": 1.227541208267212,
"learning_rate": 2.120618556701031e-06,
"loss": 1.8186,
"step": 17350
},
{
"epoch": 4.4839582528024735,
"grad_norm": 1.0894291400909424,
"learning_rate": 2.070103092783505e-06,
"loss": 1.7768,
"step": 17400
},
{
"epoch": 4.496843190310527,
"grad_norm": 1.004269003868103,
"learning_rate": 2.0185567010309277e-06,
"loss": 1.8019,
"step": 17450
},
{
"epoch": 4.50972812781858,
"grad_norm": 1.1534968614578247,
"learning_rate": 1.9670103092783507e-06,
"loss": 1.8244,
"step": 17500
},
{
"epoch": 4.522613065326633,
"grad_norm": 1.3757740259170532,
"learning_rate": 1.9154639175257733e-06,
"loss": 1.8458,
"step": 17550
},
{
"epoch": 4.535498002834686,
"grad_norm": 1.184401035308838,
"learning_rate": 1.8639175257731958e-06,
"loss": 1.829,
"step": 17600
},
{
"epoch": 4.548382940342739,
"grad_norm": 1.2132309675216675,
"learning_rate": 1.8123711340206188e-06,
"loss": 1.8529,
"step": 17650
},
{
"epoch": 4.561267877850792,
"grad_norm": 1.0804411172866821,
"learning_rate": 1.7608247422680414e-06,
"loss": 1.8534,
"step": 17700
},
{
"epoch": 4.574152815358845,
"grad_norm": 1.5346250534057617,
"learning_rate": 1.709278350515464e-06,
"loss": 1.8277,
"step": 17750
},
{
"epoch": 4.587037752866898,
"grad_norm": 1.1482338905334473,
"learning_rate": 1.6577319587628867e-06,
"loss": 1.803,
"step": 17800
},
{
"epoch": 4.599922690374951,
"grad_norm": 1.171758770942688,
"learning_rate": 1.6061855670103093e-06,
"loss": 1.83,
"step": 17850
},
{
"epoch": 4.612807627883004,
"grad_norm": 1.3336864709854126,
"learning_rate": 1.554639175257732e-06,
"loss": 1.8386,
"step": 17900
},
{
"epoch": 4.625692565391057,
"grad_norm": 1.0265740156173706,
"learning_rate": 1.5030927835051548e-06,
"loss": 1.8072,
"step": 17950
},
{
"epoch": 4.6385775028991105,
"grad_norm": 0.9137164950370789,
"learning_rate": 1.4515463917525774e-06,
"loss": 1.8189,
"step": 18000
},
{
"epoch": 4.651462440407164,
"grad_norm": 1.2193052768707275,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.8489,
"step": 18050
},
{
"epoch": 4.664347377915217,
"grad_norm": 1.1527855396270752,
"learning_rate": 1.348453608247423e-06,
"loss": 1.7922,
"step": 18100
},
{
"epoch": 4.67723231542327,
"grad_norm": 0.9622436761856079,
"learning_rate": 1.2969072164948455e-06,
"loss": 1.81,
"step": 18150
},
{
"epoch": 4.690117252931323,
"grad_norm": 1.173771858215332,
"learning_rate": 1.245360824742268e-06,
"loss": 1.8375,
"step": 18200
},
{
"epoch": 4.703002190439376,
"grad_norm": 1.1019172668457031,
"learning_rate": 1.1938144329896908e-06,
"loss": 1.836,
"step": 18250
},
{
"epoch": 4.715887127947429,
"grad_norm": 1.419956088066101,
"learning_rate": 1.1422680412371134e-06,
"loss": 1.7774,
"step": 18300
},
{
"epoch": 4.728772065455482,
"grad_norm": 0.9891812205314636,
"learning_rate": 1.0907216494845362e-06,
"loss": 1.7989,
"step": 18350
},
{
"epoch": 4.741657002963535,
"grad_norm": 1.0100419521331787,
"learning_rate": 1.039175257731959e-06,
"loss": 1.8439,
"step": 18400
},
{
"epoch": 4.754541940471588,
"grad_norm": 1.1432263851165771,
"learning_rate": 9.876288659793815e-07,
"loss": 1.8154,
"step": 18450
},
{
"epoch": 4.767426877979641,
"grad_norm": 1.1132447719573975,
"learning_rate": 9.360824742268042e-07,
"loss": 1.8817,
"step": 18500
},
{
"epoch": 4.7803118154876945,
"grad_norm": 1.086591362953186,
"learning_rate": 8.845360824742269e-07,
"loss": 1.8116,
"step": 18550
},
{
"epoch": 4.7931967529957475,
"grad_norm": 1.0462530851364136,
"learning_rate": 8.329896907216496e-07,
"loss": 1.8432,
"step": 18600
},
{
"epoch": 4.806081690503801,
"grad_norm": 1.0310077667236328,
"learning_rate": 7.814432989690722e-07,
"loss": 1.825,
"step": 18650
},
{
"epoch": 4.818966628011854,
"grad_norm": 1.0306345224380493,
"learning_rate": 7.298969072164949e-07,
"loss": 1.8085,
"step": 18700
},
{
"epoch": 4.831851565519907,
"grad_norm": 1.4541102647781372,
"learning_rate": 6.783505154639176e-07,
"loss": 1.8276,
"step": 18750
},
{
"epoch": 4.84473650302796,
"grad_norm": 1.2799372673034668,
"learning_rate": 6.268041237113402e-07,
"loss": 1.8876,
"step": 18800
},
{
"epoch": 4.857621440536013,
"grad_norm": 1.0106414556503296,
"learning_rate": 5.75257731958763e-07,
"loss": 1.7739,
"step": 18850
},
{
"epoch": 4.870506378044066,
"grad_norm": 1.11018967628479,
"learning_rate": 5.237113402061856e-07,
"loss": 1.8272,
"step": 18900
},
{
"epoch": 4.883391315552119,
"grad_norm": 1.587908387184143,
"learning_rate": 4.7216494845360834e-07,
"loss": 1.8069,
"step": 18950
},
{
"epoch": 4.896276253060172,
"grad_norm": 1.1059330701828003,
"learning_rate": 4.2061855670103096e-07,
"loss": 1.85,
"step": 19000
},
{
"epoch": 4.909161190568225,
"grad_norm": 1.2126922607421875,
"learning_rate": 3.690721649484536e-07,
"loss": 1.8446,
"step": 19050
},
{
"epoch": 4.922046128076278,
"grad_norm": 1.0558154582977295,
"learning_rate": 3.1752577319587635e-07,
"loss": 1.8169,
"step": 19100
},
{
"epoch": 4.9349310655843315,
"grad_norm": 1.0007387399673462,
"learning_rate": 2.65979381443299e-07,
"loss": 1.813,
"step": 19150
},
{
"epoch": 4.947816003092385,
"grad_norm": 1.1500272750854492,
"learning_rate": 2.1443298969072168e-07,
"loss": 1.8094,
"step": 19200
},
{
"epoch": 4.960700940600438,
"grad_norm": 1.0796419382095337,
"learning_rate": 1.6288659793814433e-07,
"loss": 1.7628,
"step": 19250
},
{
"epoch": 4.973585878108491,
"grad_norm": 1.1715208292007446,
"learning_rate": 1.1134020618556701e-07,
"loss": 1.8339,
"step": 19300
},
{
"epoch": 4.986470815616544,
"grad_norm": 1.0896481275558472,
"learning_rate": 5.97938144329897e-08,
"loss": 1.8318,
"step": 19350
},
{
"epoch": 4.999355753124597,
"grad_norm": 1.2462085485458374,
"learning_rate": 8.247422680412371e-09,
"loss": 1.8287,
"step": 19400
},
{
"epoch": 4.999355753124597,
"eval_bleu": 20.4616,
"eval_gen_len": 45.2528,
"eval_loss": 1.578300952911377,
"eval_runtime": 2280.7257,
"eval_samples_per_second": 6.805,
"eval_steps_per_second": 0.425,
"step": 19400
}
],
"logging_steps": 50,
"max_steps": 19400,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.497673924411392e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}