mamba2-700m-c1.1 / trainer_state.json
rwitz2 · Upload folder using huggingface_hub · commit 3ce0a72 (verified)
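The log_history array below interleaves training records (every 25 steps: loss, grad_norm, learning_rate) with evaluation records (every 200 steps, per eval_steps: eval_loss and perplexity). A minimal Python sketch, assuming the file is saved locally as trainer_state.json, for loading it and checking that each logged perplexity is exp(eval_loss) — e.g. exp(4.852784) ≈ 128.0965 at step 200:

import json
import math

# Load the trainer state dumped by the HF Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation records carry eval_loss; training records carry loss instead.
for record in state["log_history"]:
    if "eval_loss" in record:
        # Sanity check: the logged perplexity is the exponential of eval_loss.
        assert math.isclose(record["perplexity"],
                            math.exp(record["eval_loss"]), rel_tol=1e-4)
        print(f"step {record['step']:>6}  eval_loss={record['eval_loss']:.4f}  "
              f"ppl={record['perplexity']:.2f}")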
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11786454262354065,
"eval_steps": 200,
"global_step": 19000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00015508492450465875,
"grad_norm": 0.12764382362365723,
"learning_rate": 0.0015,
"loss": 3.062,
"step": 25
},
{
"epoch": 0.0003101698490093175,
"grad_norm": 0.08861421793699265,
"learning_rate": 0.0015,
"loss": 3.0523,
"step": 50
},
{
"epoch": 0.00046525477351397625,
"grad_norm": 0.10059793293476105,
"learning_rate": 0.0015,
"loss": 3.0271,
"step": 75
},
{
"epoch": 0.000620339698018635,
"grad_norm": 0.09730365872383118,
"learning_rate": 0.0015,
"loss": 3.0421,
"step": 100
},
{
"epoch": 0.0007754246225232938,
"grad_norm": 0.15407200157642365,
"learning_rate": 0.0015,
"loss": 2.9894,
"step": 125
},
{
"epoch": 0.0009305095470279525,
"grad_norm": 0.12250959873199463,
"learning_rate": 0.0015,
"loss": 3.0055,
"step": 150
},
{
"epoch": 0.0010855944715326112,
"grad_norm": 0.08540652692317963,
"learning_rate": 0.0015,
"loss": 3.0025,
"step": 175
},
{
"epoch": 0.00124067939603727,
"grad_norm": 0.1479829102754593,
"learning_rate": 0.0015,
"loss": 2.9881,
"step": 200
},
{
"epoch": 0.00124067939603727,
"eval_loss": 4.852784156799316,
"perplexity": 128.09652709960938,
"step": 200
},
{
"epoch": 0.0013957643205419288,
"grad_norm": 0.1036139577627182,
"learning_rate": 0.0015,
"loss": 2.9609,
"step": 225
},
{
"epoch": 0.0015508492450465876,
"grad_norm": 0.10382606089115143,
"learning_rate": 0.0015,
"loss": 2.9771,
"step": 250
},
{
"epoch": 0.0017059341695512462,
"grad_norm": 0.08648105710744858,
"learning_rate": 0.0015,
"loss": 2.9522,
"step": 275
},
{
"epoch": 0.001861019094055905,
"grad_norm": 0.08675844967365265,
"learning_rate": 0.0015,
"loss": 2.9833,
"step": 300
},
{
"epoch": 0.0020161040185605636,
"grad_norm": 0.1417882740497589,
"learning_rate": 0.0015,
"loss": 2.9626,
"step": 325
},
{
"epoch": 0.0021711889430652224,
"grad_norm": 0.09860406816005707,
"learning_rate": 0.0015,
"loss": 2.9515,
"step": 350
},
{
"epoch": 0.002326273867569881,
"grad_norm": 0.11757214367389679,
"learning_rate": 0.0015,
"loss": 2.9523,
"step": 375
},
{
"epoch": 0.00248135879207454,
"grad_norm": 0.11415340006351471,
"learning_rate": 0.0015,
"loss": 2.9579,
"step": 400
},
{
"epoch": 0.00248135879207454,
"eval_loss": 4.8426313400268555,
"perplexity": 126.80257415771484,
"step": 400
},
{
"epoch": 0.002636443716579199,
"grad_norm": 0.10692940652370453,
"learning_rate": 0.0015,
"loss": 2.9273,
"step": 425
},
{
"epoch": 0.0027915286410838576,
"grad_norm": 0.12780559062957764,
"learning_rate": 0.0015,
"loss": 2.9577,
"step": 450
},
{
"epoch": 0.0029466135655885164,
"grad_norm": 0.21147418022155762,
"learning_rate": 0.0015,
"loss": 2.9118,
"step": 475
},
{
"epoch": 0.003101698490093175,
"grad_norm": 0.13209331035614014,
"learning_rate": 0.0015,
"loss": 2.9584,
"step": 500
},
{
"epoch": 0.0032567834145978336,
"grad_norm": 0.13230836391448975,
"learning_rate": 0.0015,
"loss": 2.9621,
"step": 525
},
{
"epoch": 0.0034118683391024924,
"grad_norm": 0.11265246570110321,
"learning_rate": 0.0015,
"loss": 2.941,
"step": 550
},
{
"epoch": 0.003566953263607151,
"grad_norm": 0.10484226047992706,
"learning_rate": 0.0015,
"loss": 2.9311,
"step": 575
},
{
"epoch": 0.00372203818811181,
"grad_norm": 0.13941314816474915,
"learning_rate": 0.0015,
"loss": 2.9741,
"step": 600
},
{
"epoch": 0.00372203818811181,
"eval_loss": 4.831629276275635,
"perplexity": 125.41513061523438,
"step": 600
},
{
"epoch": 0.0038771231126164688,
"grad_norm": 0.0885343998670578,
"learning_rate": 0.0015,
"loss": 2.944,
"step": 625
},
{
"epoch": 0.004032208037121127,
"grad_norm": 0.093564473092556,
"learning_rate": 0.0015,
"loss": 2.9673,
"step": 650
},
{
"epoch": 0.004187292961625786,
"grad_norm": 0.15350665152072906,
"learning_rate": 0.0015,
"loss": 2.9314,
"step": 675
},
{
"epoch": 0.004342377886130445,
"grad_norm": 0.11337901651859283,
"learning_rate": 0.0015,
"loss": 2.97,
"step": 700
},
{
"epoch": 0.004497462810635104,
"grad_norm": 0.13508272171020508,
"learning_rate": 0.0015,
"loss": 2.9121,
"step": 725
},
{
"epoch": 0.004652547735139762,
"grad_norm": 0.10049441456794739,
"learning_rate": 0.0015,
"loss": 2.9572,
"step": 750
},
{
"epoch": 0.004807632659644422,
"grad_norm": 0.1017594188451767,
"learning_rate": 0.0015,
"loss": 2.9207,
"step": 775
},
{
"epoch": 0.00496271758414908,
"grad_norm": 0.09874167293310165,
"learning_rate": 0.0015,
"loss": 2.9258,
"step": 800
},
{
"epoch": 0.00496271758414908,
"eval_loss": 4.783432960510254,
"perplexity": 119.51393127441406,
"step": 800
},
{
"epoch": 0.005117802508653739,
"grad_norm": 0.09769408404827118,
"learning_rate": 0.0015,
"loss": 2.9606,
"step": 825
},
{
"epoch": 0.005272887433158398,
"grad_norm": 0.11946038156747818,
"learning_rate": 0.0015,
"loss": 2.889,
"step": 850
},
{
"epoch": 0.005427972357663056,
"grad_norm": 0.12191672623157501,
"learning_rate": 0.0015,
"loss": 2.9094,
"step": 875
},
{
"epoch": 0.005583057282167715,
"grad_norm": 0.09349209070205688,
"learning_rate": 0.0015,
"loss": 2.9242,
"step": 900
},
{
"epoch": 0.0057381422066723736,
"grad_norm": 0.07793531566858292,
"learning_rate": 0.0015,
"loss": 2.9692,
"step": 925
},
{
"epoch": 0.005893227131177033,
"grad_norm": 0.1276599019765854,
"learning_rate": 0.0015,
"loss": 2.9339,
"step": 950
},
{
"epoch": 0.006048312055681691,
"grad_norm": 0.11083021759986877,
"learning_rate": 0.0015,
"loss": 2.9251,
"step": 975
},
{
"epoch": 0.00620339698018635,
"grad_norm": 0.13207702338695526,
"learning_rate": 0.0015,
"loss": 2.8567,
"step": 1000
},
{
"epoch": 0.00620339698018635,
"eval_loss": 4.790068626403809,
"perplexity": 120.30962371826172,
"step": 1000
},
{
"epoch": 0.006358481904691009,
"grad_norm": 0.20453479886054993,
"learning_rate": 0.0015,
"loss": 2.9127,
"step": 1025
},
{
"epoch": 0.006513566829195667,
"grad_norm": 0.12530989944934845,
"learning_rate": 0.0015,
"loss": 2.9147,
"step": 1050
},
{
"epoch": 0.006668651753700326,
"grad_norm": 0.11520997434854507,
"learning_rate": 0.0015,
"loss": 2.936,
"step": 1075
},
{
"epoch": 0.006823736678204985,
"grad_norm": 0.09191219508647919,
"learning_rate": 0.0015,
"loss": 2.9115,
"step": 1100
},
{
"epoch": 0.006978821602709644,
"grad_norm": 0.07251202315092087,
"learning_rate": 0.0015,
"loss": 2.9154,
"step": 1125
},
{
"epoch": 0.007133906527214302,
"grad_norm": 0.10054546594619751,
"learning_rate": 0.0015,
"loss": 2.8924,
"step": 1150
},
{
"epoch": 0.007288991451718962,
"grad_norm": 0.1192697063088417,
"learning_rate": 0.0015,
"loss": 2.957,
"step": 1175
},
{
"epoch": 0.00744407637622362,
"grad_norm": 0.14840476214885712,
"learning_rate": 0.0015,
"loss": 2.895,
"step": 1200
},
{
"epoch": 0.00744407637622362,
"eval_loss": 4.770949363708496,
"perplexity": 118.03124237060547,
"step": 1200
},
{
"epoch": 0.007599161300728279,
"grad_norm": 0.11221906542778015,
"learning_rate": 0.0015,
"loss": 2.9131,
"step": 1225
},
{
"epoch": 0.0077542462252329376,
"grad_norm": 0.11528974026441574,
"learning_rate": 0.0015,
"loss": 2.8783,
"step": 1250
},
{
"epoch": 0.007909331149737596,
"grad_norm": 0.0807015597820282,
"learning_rate": 0.0015,
"loss": 2.91,
"step": 1275
},
{
"epoch": 0.008064416074242254,
"grad_norm": 0.1435490846633911,
"learning_rate": 0.0015,
"loss": 2.9198,
"step": 1300
},
{
"epoch": 0.008219500998746914,
"grad_norm": 0.11956608295440674,
"learning_rate": 0.0015,
"loss": 2.8771,
"step": 1325
},
{
"epoch": 0.008374585923251573,
"grad_norm": 0.10362117737531662,
"learning_rate": 0.0015,
"loss": 2.8913,
"step": 1350
},
{
"epoch": 0.008529670847756231,
"grad_norm": 0.07132004201412201,
"learning_rate": 0.0015,
"loss": 2.946,
"step": 1375
},
{
"epoch": 0.00868475577226089,
"grad_norm": 0.08756817877292633,
"learning_rate": 0.0015,
"loss": 2.9015,
"step": 1400
},
{
"epoch": 0.00868475577226089,
"eval_loss": 4.769084453582764,
"perplexity": 117.81133270263672,
"step": 1400
},
{
"epoch": 0.00883984069676555,
"grad_norm": 0.18067917227745056,
"learning_rate": 0.0015,
"loss": 2.8887,
"step": 1425
},
{
"epoch": 0.008994925621270208,
"grad_norm": 0.09742950648069382,
"learning_rate": 0.0015,
"loss": 2.8834,
"step": 1450
},
{
"epoch": 0.009150010545774866,
"grad_norm": 0.09857803583145142,
"learning_rate": 0.0015,
"loss": 2.8856,
"step": 1475
},
{
"epoch": 0.009305095470279525,
"grad_norm": 0.17605328559875488,
"learning_rate": 0.0015,
"loss": 2.9238,
"step": 1500
},
{
"epoch": 0.009460180394784183,
"grad_norm": 0.08441105484962463,
"learning_rate": 0.0015,
"loss": 2.8605,
"step": 1525
},
{
"epoch": 0.009615265319288843,
"grad_norm": 0.15339621901512146,
"learning_rate": 0.0015,
"loss": 2.9421,
"step": 1550
},
{
"epoch": 0.009770350243793502,
"grad_norm": 0.21426236629486084,
"learning_rate": 0.0015,
"loss": 2.8899,
"step": 1575
},
{
"epoch": 0.00992543516829816,
"grad_norm": 0.16503557562828064,
"learning_rate": 0.0015,
"loss": 2.878,
"step": 1600
},
{
"epoch": 0.00992543516829816,
"eval_loss": 4.774999618530273,
"perplexity": 118.51026916503906,
"step": 1600
},
{
"epoch": 0.010080520092802818,
"grad_norm": 0.11398541182279587,
"learning_rate": 0.0015,
"loss": 2.866,
"step": 1625
},
{
"epoch": 0.010235605017307478,
"grad_norm": 0.16510234773159027,
"learning_rate": 0.0015,
"loss": 2.8936,
"step": 1650
},
{
"epoch": 0.010390689941812137,
"grad_norm": 0.08827799558639526,
"learning_rate": 0.0015,
"loss": 2.8789,
"step": 1675
},
{
"epoch": 0.010545774866316795,
"grad_norm": 0.12703286111354828,
"learning_rate": 0.0015,
"loss": 2.9104,
"step": 1700
},
{
"epoch": 0.010700859790821454,
"grad_norm": 0.10185768455266953,
"learning_rate": 0.0015,
"loss": 2.8389,
"step": 1725
},
{
"epoch": 0.010855944715326112,
"grad_norm": 0.13076236844062805,
"learning_rate": 0.0015,
"loss": 2.8603,
"step": 1750
},
{
"epoch": 0.011011029639830772,
"grad_norm": 0.08955707401037216,
"learning_rate": 0.0015,
"loss": 2.8283,
"step": 1775
},
{
"epoch": 0.01116611456433543,
"grad_norm": 0.07163148373365402,
"learning_rate": 0.0015,
"loss": 2.8852,
"step": 1800
},
{
"epoch": 0.01116611456433543,
"eval_loss": 4.75281286239624,
"perplexity": 115.90986633300781,
"step": 1800
},
{
"epoch": 0.011321199488840089,
"grad_norm": 0.09710580855607986,
"learning_rate": 0.0015,
"loss": 2.8573,
"step": 1825
},
{
"epoch": 0.011476284413344747,
"grad_norm": 0.11669810861349106,
"learning_rate": 0.0015,
"loss": 2.8674,
"step": 1850
},
{
"epoch": 0.011631369337849405,
"grad_norm": 0.11174403876066208,
"learning_rate": 0.0015,
"loss": 2.9121,
"step": 1875
},
{
"epoch": 0.011786454262354066,
"grad_norm": 0.09547118842601776,
"learning_rate": 0.0015,
"loss": 2.9033,
"step": 1900
},
{
"epoch": 0.011941539186858724,
"grad_norm": 0.09878171980381012,
"learning_rate": 0.0015,
"loss": 2.8738,
"step": 1925
},
{
"epoch": 0.012096624111363382,
"grad_norm": 0.09479096531867981,
"learning_rate": 0.0015,
"loss": 2.8775,
"step": 1950
},
{
"epoch": 0.01225170903586804,
"grad_norm": 0.12434259057044983,
"learning_rate": 0.0015,
"loss": 2.8452,
"step": 1975
},
{
"epoch": 0.0124067939603727,
"grad_norm": 0.09166444838047028,
"learning_rate": 0.0015,
"loss": 2.8546,
"step": 2000
},
{
"epoch": 0.0124067939603727,
"eval_loss": 4.748600482940674,
"perplexity": 115.42263793945312,
"step": 2000
},
{
"epoch": 0.01256187888487736,
"grad_norm": 0.07793508470058441,
"learning_rate": 0.0015,
"loss": 2.8306,
"step": 2025
},
{
"epoch": 0.012716963809382018,
"grad_norm": 0.1670406609773636,
"learning_rate": 0.0015,
"loss": 2.863,
"step": 2050
},
{
"epoch": 0.012872048733886676,
"grad_norm": 0.20754718780517578,
"learning_rate": 0.0015,
"loss": 2.8871,
"step": 2075
},
{
"epoch": 0.013027133658391334,
"grad_norm": 0.14225496351718903,
"learning_rate": 0.0015,
"loss": 2.8498,
"step": 2100
},
{
"epoch": 0.013182218582895994,
"grad_norm": 0.11809197813272476,
"learning_rate": 0.0015,
"loss": 2.8206,
"step": 2125
},
{
"epoch": 0.013337303507400653,
"grad_norm": 0.09541622549295425,
"learning_rate": 0.0015,
"loss": 2.8585,
"step": 2150
},
{
"epoch": 0.013492388431905311,
"grad_norm": 0.1115843802690506,
"learning_rate": 0.0015,
"loss": 2.8533,
"step": 2175
},
{
"epoch": 0.01364747335640997,
"grad_norm": 0.08517899364233017,
"learning_rate": 0.0015,
"loss": 2.8477,
"step": 2200
},
{
"epoch": 0.01364747335640997,
"eval_loss": 4.753279685974121,
"perplexity": 115.9639892578125,
"step": 2200
},
{
"epoch": 0.01380255828091463,
"grad_norm": 0.13083544373512268,
"learning_rate": 0.0015,
"loss": 2.8518,
"step": 2225
},
{
"epoch": 0.013957643205419288,
"grad_norm": 0.07403870671987534,
"learning_rate": 0.0015,
"loss": 2.8685,
"step": 2250
},
{
"epoch": 0.014112728129923946,
"grad_norm": 0.16436311602592468,
"learning_rate": 0.0015,
"loss": 2.8601,
"step": 2275
},
{
"epoch": 0.014267813054428605,
"grad_norm": 0.12990187108516693,
"learning_rate": 0.0015,
"loss": 2.8332,
"step": 2300
},
{
"epoch": 0.014422897978933263,
"grad_norm": 0.0897112786769867,
"learning_rate": 0.0015,
"loss": 2.8578,
"step": 2325
},
{
"epoch": 0.014577982903437923,
"grad_norm": 0.10096879303455353,
"learning_rate": 0.0015,
"loss": 2.802,
"step": 2350
},
{
"epoch": 0.014733067827942582,
"grad_norm": 0.0850217416882515,
"learning_rate": 0.0015,
"loss": 2.8529,
"step": 2375
},
{
"epoch": 0.01488815275244724,
"grad_norm": 0.11395123600959778,
"learning_rate": 0.0015,
"loss": 2.8655,
"step": 2400
},
{
"epoch": 0.01488815275244724,
"eval_loss": 4.743602275848389,
"perplexity": 114.84716796875,
"step": 2400
},
{
"epoch": 0.015043237676951898,
"grad_norm": 0.1590801179409027,
"learning_rate": 0.0015,
"loss": 2.8227,
"step": 2425
},
{
"epoch": 0.015198322601456558,
"grad_norm": 0.16819922626018524,
"learning_rate": 0.0015,
"loss": 2.8551,
"step": 2450
},
{
"epoch": 0.015353407525961217,
"grad_norm": 0.15390118956565857,
"learning_rate": 0.0015,
"loss": 2.8691,
"step": 2475
},
{
"epoch": 0.015508492450465875,
"grad_norm": 0.10976951569318771,
"learning_rate": 0.0015,
"loss": 2.8615,
"step": 2500
},
{
"epoch": 0.015663577374970535,
"grad_norm": 0.09539350867271423,
"learning_rate": 0.0015,
"loss": 2.7755,
"step": 2525
},
{
"epoch": 0.015818662299475192,
"grad_norm": 0.09798863530158997,
"learning_rate": 0.0015,
"loss": 2.7675,
"step": 2550
},
{
"epoch": 0.015973747223979852,
"grad_norm": 0.10233014822006226,
"learning_rate": 0.0015,
"loss": 2.7905,
"step": 2575
},
{
"epoch": 0.01612883214848451,
"grad_norm": 0.09607812017202377,
"learning_rate": 0.0015,
"loss": 2.779,
"step": 2600
},
{
"epoch": 0.01612883214848451,
"eval_loss": 4.757762432098389,
"perplexity": 116.48499298095703,
"step": 2600
},
{
"epoch": 0.01628391707298917,
"grad_norm": 0.09782920032739639,
"learning_rate": 0.0015,
"loss": 2.8455,
"step": 2625
},
{
"epoch": 0.01643900199749383,
"grad_norm": 0.08443335443735123,
"learning_rate": 0.0015,
"loss": 2.8537,
"step": 2650
},
{
"epoch": 0.016594086921998485,
"grad_norm": 0.1567981094121933,
"learning_rate": 0.0015,
"loss": 2.8334,
"step": 2675
},
{
"epoch": 0.016749171846503146,
"grad_norm": 0.1279255449771881,
"learning_rate": 0.0015,
"loss": 2.8733,
"step": 2700
},
{
"epoch": 0.016904256771007802,
"grad_norm": 0.09086953848600388,
"learning_rate": 0.0015,
"loss": 2.7992,
"step": 2725
},
{
"epoch": 0.017059341695512462,
"grad_norm": 0.15084481239318848,
"learning_rate": 0.0015,
"loss": 2.7891,
"step": 2750
},
{
"epoch": 0.017214426620017122,
"grad_norm": 0.1059018149971962,
"learning_rate": 0.0015,
"loss": 2.8088,
"step": 2775
},
{
"epoch": 0.01736951154452178,
"grad_norm": 0.08803548663854599,
"learning_rate": 0.0015,
"loss": 2.817,
"step": 2800
},
{
"epoch": 0.01736951154452178,
"eval_loss": 4.730724334716797,
"perplexity": 113.37765502929688,
"step": 2800
},
{
"epoch": 0.01752459646902644,
"grad_norm": 0.0954984724521637,
"learning_rate": 0.0015,
"loss": 2.8528,
"step": 2825
},
{
"epoch": 0.0176796813935311,
"grad_norm": 0.14015914499759674,
"learning_rate": 0.0015,
"loss": 2.8131,
"step": 2850
},
{
"epoch": 0.017834766318035756,
"grad_norm": 0.07908599078655243,
"learning_rate": 0.0015,
"loss": 2.8371,
"step": 2875
},
{
"epoch": 0.017989851242540416,
"grad_norm": 0.14578266441822052,
"learning_rate": 0.0015,
"loss": 2.8033,
"step": 2900
},
{
"epoch": 0.018144936167045073,
"grad_norm": 0.10059946030378342,
"learning_rate": 0.0015,
"loss": 2.8165,
"step": 2925
},
{
"epoch": 0.018300021091549733,
"grad_norm": 0.10238490998744965,
"learning_rate": 0.0015,
"loss": 2.7739,
"step": 2950
},
{
"epoch": 0.018455106016054393,
"grad_norm": 0.12706336379051208,
"learning_rate": 0.0015,
"loss": 2.8018,
"step": 2975
},
{
"epoch": 0.01861019094055905,
"grad_norm": 0.1252700239419937,
"learning_rate": 0.0015,
"loss": 2.8155,
"step": 3000
},
{
"epoch": 0.01861019094055905,
"eval_loss": 4.707705020904541,
"perplexity": 110.79759216308594,
"step": 3000
},
{
"epoch": 0.01876527586506371,
"grad_norm": 0.13322588801383972,
"learning_rate": 0.0015,
"loss": 2.8201,
"step": 3025
},
{
"epoch": 0.018920360789568366,
"grad_norm": 0.14152252674102783,
"learning_rate": 0.0015,
"loss": 2.7942,
"step": 3050
},
{
"epoch": 0.019075445714073026,
"grad_norm": 0.1276037096977234,
"learning_rate": 0.0015,
"loss": 2.8065,
"step": 3075
},
{
"epoch": 0.019230530638577686,
"grad_norm": 0.11600831896066666,
"learning_rate": 0.0015,
"loss": 2.8335,
"step": 3100
},
{
"epoch": 0.019385615563082343,
"grad_norm": 0.11985427141189575,
"learning_rate": 0.0015,
"loss": 2.7993,
"step": 3125
},
{
"epoch": 0.019540700487587003,
"grad_norm": 0.11630894988775253,
"learning_rate": 0.0015,
"loss": 2.7838,
"step": 3150
},
{
"epoch": 0.01969578541209166,
"grad_norm": 0.08493560552597046,
"learning_rate": 0.0015,
"loss": 2.7884,
"step": 3175
},
{
"epoch": 0.01985087033659632,
"grad_norm": 0.12671016156673431,
"learning_rate": 0.0015,
"loss": 2.7763,
"step": 3200
},
{
"epoch": 0.01985087033659632,
"eval_loss": 4.7127766609191895,
"perplexity": 111.3609390258789,
"step": 3200
},
{
"epoch": 0.02000595526110098,
"grad_norm": 0.10381816327571869,
"learning_rate": 0.0015,
"loss": 2.7849,
"step": 3225
},
{
"epoch": 0.020161040185605637,
"grad_norm": 0.12319795787334442,
"learning_rate": 0.0015,
"loss": 2.8325,
"step": 3250
},
{
"epoch": 0.020316125110110297,
"grad_norm": 0.11378122121095657,
"learning_rate": 0.0015,
"loss": 2.7609,
"step": 3275
},
{
"epoch": 0.020471210034614957,
"grad_norm": 0.08910433948040009,
"learning_rate": 0.0015,
"loss": 2.7886,
"step": 3300
},
{
"epoch": 0.020626294959119613,
"grad_norm": 0.11803348362445831,
"learning_rate": 0.0015,
"loss": 2.7716,
"step": 3325
},
{
"epoch": 0.020781379883624274,
"grad_norm": 0.10203807801008224,
"learning_rate": 0.0015,
"loss": 2.778,
"step": 3350
},
{
"epoch": 0.02093646480812893,
"grad_norm": 0.07175683230161667,
"learning_rate": 0.0015,
"loss": 2.7844,
"step": 3375
},
{
"epoch": 0.02109154973263359,
"grad_norm": 0.1556989699602127,
"learning_rate": 0.0015,
"loss": 2.748,
"step": 3400
},
{
"epoch": 0.02109154973263359,
"eval_loss": 4.711516857147217,
"perplexity": 111.22074127197266,
"step": 3400
},
{
"epoch": 0.02124663465713825,
"grad_norm": 0.11983326822519302,
"learning_rate": 0.0015,
"loss": 2.7747,
"step": 3425
},
{
"epoch": 0.021401719581642907,
"grad_norm": 0.09098344296216965,
"learning_rate": 0.0015,
"loss": 2.7609,
"step": 3450
},
{
"epoch": 0.021556804506147567,
"grad_norm": 0.1238594651222229,
"learning_rate": 0.0015,
"loss": 2.7849,
"step": 3475
},
{
"epoch": 0.021711889430652224,
"grad_norm": 0.10654041916131973,
"learning_rate": 0.0015,
"loss": 2.7742,
"step": 3500
},
{
"epoch": 0.021866974355156884,
"grad_norm": 0.12955708801746368,
"learning_rate": 0.0015,
"loss": 2.7302,
"step": 3525
},
{
"epoch": 0.022022059279661544,
"grad_norm": 0.0945751890540123,
"learning_rate": 0.0015,
"loss": 2.7366,
"step": 3550
},
{
"epoch": 0.0221771442041662,
"grad_norm": 0.11322261393070221,
"learning_rate": 0.0015,
"loss": 2.7307,
"step": 3575
},
{
"epoch": 0.02233222912867086,
"grad_norm": 0.14438313245773315,
"learning_rate": 0.0015,
"loss": 2.741,
"step": 3600
},
{
"epoch": 0.02233222912867086,
"eval_loss": 4.7056427001953125,
"perplexity": 110.56932830810547,
"step": 3600
},
{
"epoch": 0.022487314053175517,
"grad_norm": 0.12101957201957703,
"learning_rate": 0.0015,
"loss": 2.7699,
"step": 3625
},
{
"epoch": 0.022642398977680177,
"grad_norm": 0.13060438632965088,
"learning_rate": 0.0015,
"loss": 2.7534,
"step": 3650
},
{
"epoch": 0.022797483902184838,
"grad_norm": 0.18028861284255981,
"learning_rate": 0.0015,
"loss": 2.7716,
"step": 3675
},
{
"epoch": 0.022952568826689494,
"grad_norm": 0.2551407217979431,
"learning_rate": 0.0015,
"loss": 2.7505,
"step": 3700
},
{
"epoch": 0.023107653751194154,
"grad_norm": 0.14461354911327362,
"learning_rate": 0.0015,
"loss": 2.762,
"step": 3725
},
{
"epoch": 0.02326273867569881,
"grad_norm": 0.08960037678480148,
"learning_rate": 0.0015,
"loss": 2.7752,
"step": 3750
},
{
"epoch": 0.02341782360020347,
"grad_norm": 0.12423495948314667,
"learning_rate": 0.0015,
"loss": 2.7649,
"step": 3775
},
{
"epoch": 0.02357290852470813,
"grad_norm": 0.11889061331748962,
"learning_rate": 0.0015,
"loss": 2.7465,
"step": 3800
},
{
"epoch": 0.02357290852470813,
"eval_loss": 4.709405422210693,
"perplexity": 110.98615264892578,
"step": 3800
},
{
"epoch": 0.023727993449212788,
"grad_norm": 0.1310662031173706,
"learning_rate": 0.0015,
"loss": 2.7739,
"step": 3825
},
{
"epoch": 0.023883078373717448,
"grad_norm": 0.10841766744852066,
"learning_rate": 0.0015,
"loss": 2.7558,
"step": 3850
},
{
"epoch": 0.024038163298222108,
"grad_norm": 0.11951743066310883,
"learning_rate": 0.0015,
"loss": 2.7574,
"step": 3875
},
{
"epoch": 0.024193248222726765,
"grad_norm": 0.10914873331785202,
"learning_rate": 0.0015,
"loss": 2.7593,
"step": 3900
},
{
"epoch": 0.024348333147231425,
"grad_norm": 0.12661431729793549,
"learning_rate": 0.0015,
"loss": 2.7405,
"step": 3925
},
{
"epoch": 0.02450341807173608,
"grad_norm": 0.09351510554552078,
"learning_rate": 0.0015,
"loss": 2.7614,
"step": 3950
},
{
"epoch": 0.02465850299624074,
"grad_norm": 0.10916408896446228,
"learning_rate": 0.0015,
"loss": 2.7348,
"step": 3975
},
{
"epoch": 0.0248135879207454,
"grad_norm": 0.1506185084581375,
"learning_rate": 0.0015,
"loss": 2.7465,
"step": 4000
},
{
"epoch": 0.0248135879207454,
"eval_loss": 4.691644191741943,
"perplexity": 109.03230285644531,
"step": 4000
},
{
"epoch": 0.024968672845250058,
"grad_norm": 0.16664201021194458,
"learning_rate": 0.0015,
"loss": 2.7099,
"step": 4025
},
{
"epoch": 0.02512375776975472,
"grad_norm": 0.08793428540229797,
"learning_rate": 0.0015,
"loss": 2.7062,
"step": 4050
},
{
"epoch": 0.025278842694259375,
"grad_norm": 0.10746140778064728,
"learning_rate": 0.0015,
"loss": 2.7013,
"step": 4075
},
{
"epoch": 0.025433927618764035,
"grad_norm": 0.14466698467731476,
"learning_rate": 0.0015,
"loss": 2.7366,
"step": 4100
},
{
"epoch": 0.025589012543268695,
"grad_norm": 0.12191653996706009,
"learning_rate": 0.0015,
"loss": 2.7042,
"step": 4125
},
{
"epoch": 0.025744097467773352,
"grad_norm": 0.10167489945888519,
"learning_rate": 0.0015,
"loss": 2.7215,
"step": 4150
},
{
"epoch": 0.025899182392278012,
"grad_norm": 0.11334148049354553,
"learning_rate": 0.0015,
"loss": 2.7365,
"step": 4175
},
{
"epoch": 0.02605426731678267,
"grad_norm": 0.09303794056177139,
"learning_rate": 0.0015,
"loss": 2.7471,
"step": 4200
},
{
"epoch": 0.02605426731678267,
"eval_loss": 4.692121505737305,
"perplexity": 109.08435821533203,
"step": 4200
},
{
"epoch": 0.02620935224128733,
"grad_norm": 0.09444712847471237,
"learning_rate": 0.0015,
"loss": 2.6965,
"step": 4225
},
{
"epoch": 0.02636443716579199,
"grad_norm": 0.09560113400220871,
"learning_rate": 0.0015,
"loss": 2.7186,
"step": 4250
},
{
"epoch": 0.026519522090296645,
"grad_norm": 0.10814715176820755,
"learning_rate": 0.0015,
"loss": 2.7,
"step": 4275
},
{
"epoch": 0.026674607014801305,
"grad_norm": 0.12008251994848251,
"learning_rate": 0.0015,
"loss": 2.6827,
"step": 4300
},
{
"epoch": 0.026829691939305966,
"grad_norm": 0.13892072439193726,
"learning_rate": 0.0015,
"loss": 2.7481,
"step": 4325
},
{
"epoch": 0.026984776863810622,
"grad_norm": 0.10116352885961533,
"learning_rate": 0.0015,
"loss": 2.6839,
"step": 4350
},
{
"epoch": 0.027139861788315282,
"grad_norm": 0.2541595697402954,
"learning_rate": 0.0015,
"loss": 2.6987,
"step": 4375
},
{
"epoch": 0.02729494671281994,
"grad_norm": 0.11070574074983597,
"learning_rate": 0.0015,
"loss": 2.7102,
"step": 4400
},
{
"epoch": 0.02729494671281994,
"eval_loss": 4.702114105224609,
"perplexity": 110.17985534667969,
"step": 4400
},
{
"epoch": 0.0274500316373246,
"grad_norm": 0.09290622174739838,
"learning_rate": 0.0015,
"loss": 2.744,
"step": 4425
},
{
"epoch": 0.02760511656182926,
"grad_norm": 0.09867129474878311,
"learning_rate": 0.0015,
"loss": 2.6979,
"step": 4450
},
{
"epoch": 0.027760201486333916,
"grad_norm": 0.08975850045681,
"learning_rate": 0.0015,
"loss": 2.7346,
"step": 4475
},
{
"epoch": 0.027915286410838576,
"grad_norm": 0.1251811683177948,
"learning_rate": 0.0015,
"loss": 2.6901,
"step": 4500
},
{
"epoch": 0.028070371335343233,
"grad_norm": 0.10718528181314468,
"learning_rate": 0.0015,
"loss": 2.6584,
"step": 4525
},
{
"epoch": 0.028225456259847893,
"grad_norm": 0.1920158714056015,
"learning_rate": 0.0015,
"loss": 2.6776,
"step": 4550
},
{
"epoch": 0.028380541184352553,
"grad_norm": 0.11409153789281845,
"learning_rate": 0.0015,
"loss": 2.7052,
"step": 4575
},
{
"epoch": 0.02853562610885721,
"grad_norm": 0.12506772577762604,
"learning_rate": 0.0015,
"loss": 2.6954,
"step": 4600
},
{
"epoch": 0.02853562610885721,
"eval_loss": 4.685390949249268,
"perplexity": 108.35262298583984,
"step": 4600
},
{
"epoch": 0.02869071103336187,
"grad_norm": 0.1093166172504425,
"learning_rate": 0.0015,
"loss": 2.7257,
"step": 4625
},
{
"epoch": 0.028845795957866526,
"grad_norm": 0.16628532111644745,
"learning_rate": 0.0015,
"loss": 2.6782,
"step": 4650
},
{
"epoch": 0.029000880882371186,
"grad_norm": 0.1638079136610031,
"learning_rate": 0.0015,
"loss": 2.6884,
"step": 4675
},
{
"epoch": 0.029155965806875846,
"grad_norm": 0.11411619931459427,
"learning_rate": 0.0015,
"loss": 2.7054,
"step": 4700
},
{
"epoch": 0.029311050731380503,
"grad_norm": 0.09292814135551453,
"learning_rate": 0.0015,
"loss": 2.6826,
"step": 4725
},
{
"epoch": 0.029466135655885163,
"grad_norm": 0.09136354923248291,
"learning_rate": 0.0015,
"loss": 2.6936,
"step": 4750
},
{
"epoch": 0.029621220580389823,
"grad_norm": 0.1188502386212349,
"learning_rate": 0.0015,
"loss": 2.6466,
"step": 4775
},
{
"epoch": 0.02977630550489448,
"grad_norm": 0.09645655751228333,
"learning_rate": 0.0015,
"loss": 2.6092,
"step": 4800
},
{
"epoch": 0.02977630550489448,
"eval_loss": 4.683995723724365,
"perplexity": 108.20155334472656,
"step": 4800
},
{
"epoch": 0.02993139042939914,
"grad_norm": 0.17193672060966492,
"learning_rate": 0.0015,
"loss": 2.6916,
"step": 4825
},
{
"epoch": 0.030086475353903797,
"grad_norm": 0.14866988360881805,
"learning_rate": 0.0015,
"loss": 2.6776,
"step": 4850
},
{
"epoch": 0.030241560278408457,
"grad_norm": 0.10588869452476501,
"learning_rate": 0.0015,
"loss": 2.6773,
"step": 4875
},
{
"epoch": 0.030396645202913117,
"grad_norm": 0.12059559673070908,
"learning_rate": 0.0015,
"loss": 2.639,
"step": 4900
},
{
"epoch": 0.030551730127417773,
"grad_norm": 0.13296598196029663,
"learning_rate": 0.0015,
"loss": 2.6359,
"step": 4925
},
{
"epoch": 0.030706815051922434,
"grad_norm": 0.12300167232751846,
"learning_rate": 0.0015,
"loss": 2.668,
"step": 4950
},
{
"epoch": 0.03086189997642709,
"grad_norm": 0.15900522470474243,
"learning_rate": 0.0015,
"loss": 2.6252,
"step": 4975
},
{
"epoch": 0.03101698490093175,
"grad_norm": 0.138090580701828,
"learning_rate": 0.0015,
"loss": 2.659,
"step": 5000
},
{
"epoch": 0.03101698490093175,
"eval_loss": 4.688181400299072,
"perplexity": 108.65540313720703,
"step": 5000
},
{
"epoch": 0.03117206982543641,
"grad_norm": 0.13720737397670746,
"learning_rate": 0.0015,
"loss": 2.6096,
"step": 5025
},
{
"epoch": 0.03132715474994107,
"grad_norm": 0.13671600818634033,
"learning_rate": 0.0015,
"loss": 2.647,
"step": 5050
},
{
"epoch": 0.031482239674445724,
"grad_norm": 0.12611277401447296,
"learning_rate": 0.0015,
"loss": 2.639,
"step": 5075
},
{
"epoch": 0.031637324598950384,
"grad_norm": 0.12045291066169739,
"learning_rate": 0.0015,
"loss": 2.663,
"step": 5100
},
{
"epoch": 0.031792409523455044,
"grad_norm": 0.10857657343149185,
"learning_rate": 0.0015,
"loss": 2.6677,
"step": 5125
},
{
"epoch": 0.031947494447959704,
"grad_norm": 0.12052007764577866,
"learning_rate": 0.0015,
"loss": 2.6508,
"step": 5150
},
{
"epoch": 0.032102579372464364,
"grad_norm": 0.10999467223882675,
"learning_rate": 0.0015,
"loss": 2.661,
"step": 5175
},
{
"epoch": 0.03225766429696902,
"grad_norm": 0.11075185984373093,
"learning_rate": 0.0015,
"loss": 2.6645,
"step": 5200
},
{
"epoch": 0.03225766429696902,
"eval_loss": 4.706582546234131,
"perplexity": 110.67329406738281,
"step": 5200
},
{
"epoch": 0.03241274922147368,
"grad_norm": 0.09703061729669571,
"learning_rate": 0.0015,
"loss": 2.6109,
"step": 5225
},
{
"epoch": 0.03256783414597834,
"grad_norm": 0.13556119799613953,
"learning_rate": 0.0015,
"loss": 2.6621,
"step": 5250
},
{
"epoch": 0.032722919070483,
"grad_norm": 0.09178316593170166,
"learning_rate": 0.0015,
"loss": 2.6263,
"step": 5275
},
{
"epoch": 0.03287800399498766,
"grad_norm": 0.10839138180017471,
"learning_rate": 0.0015,
"loss": 2.5999,
"step": 5300
},
{
"epoch": 0.03303308891949231,
"grad_norm": 0.12049377709627151,
"learning_rate": 0.0015,
"loss": 2.6085,
"step": 5325
},
{
"epoch": 0.03318817384399697,
"grad_norm": 0.15260230004787445,
"learning_rate": 0.0015,
"loss": 2.664,
"step": 5350
},
{
"epoch": 0.03334325876850163,
"grad_norm": 0.12393297255039215,
"learning_rate": 0.0015,
"loss": 2.6234,
"step": 5375
},
{
"epoch": 0.03349834369300629,
"grad_norm": 0.1284521073102951,
"learning_rate": 0.0015,
"loss": 2.5624,
"step": 5400
},
{
"epoch": 0.03349834369300629,
"eval_loss": 4.696901321411133,
"perplexity": 109.60700988769531,
"step": 5400
},
{
"epoch": 0.03365342861751095,
"grad_norm": 0.18052247166633606,
"learning_rate": 0.0015,
"loss": 2.5779,
"step": 5425
},
{
"epoch": 0.033808513542015604,
"grad_norm": 0.11775010824203491,
"learning_rate": 0.0015,
"loss": 2.6167,
"step": 5450
},
{
"epoch": 0.033963598466520264,
"grad_norm": 0.13769109547138214,
"learning_rate": 0.0015,
"loss": 2.6117,
"step": 5475
},
{
"epoch": 0.034118683391024925,
"grad_norm": 0.09634970873594284,
"learning_rate": 0.0015,
"loss": 2.613,
"step": 5500
},
{
"epoch": 0.034273768315529585,
"grad_norm": 0.14692488312721252,
"learning_rate": 0.0015,
"loss": 2.6176,
"step": 5525
},
{
"epoch": 0.034428853240034245,
"grad_norm": 0.21920783817768097,
"learning_rate": 0.0015,
"loss": 2.6196,
"step": 5550
},
{
"epoch": 0.034583938164538905,
"grad_norm": 0.1033003106713295,
"learning_rate": 0.0015,
"loss": 2.5872,
"step": 5575
},
{
"epoch": 0.03473902308904356,
"grad_norm": 0.09867612272500992,
"learning_rate": 0.0015,
"loss": 2.5782,
"step": 5600
},
{
"epoch": 0.03473902308904356,
"eval_loss": 4.704063892364502,
"perplexity": 110.3948974609375,
"step": 5600
},
{
"epoch": 0.03489410801354822,
"grad_norm": 0.1032184287905693,
"learning_rate": 0.0015,
"loss": 2.6187,
"step": 5625
},
{
"epoch": 0.03504919293805288,
"grad_norm": 0.12661318480968475,
"learning_rate": 0.0015,
"loss": 2.5805,
"step": 5650
},
{
"epoch": 0.03520427786255754,
"grad_norm": 0.28772449493408203,
"learning_rate": 0.0015,
"loss": 2.7518,
"step": 5675
},
{
"epoch": 0.0353593627870622,
"grad_norm": 0.10005131363868713,
"learning_rate": 0.0015,
"loss": 2.8556,
"step": 5700
},
{
"epoch": 0.03551444771156685,
"grad_norm": 0.10379570722579956,
"learning_rate": 0.0015,
"loss": 2.8648,
"step": 5725
},
{
"epoch": 0.03566953263607151,
"grad_norm": 0.08921229094266891,
"learning_rate": 0.0015,
"loss": 2.8421,
"step": 5750
},
{
"epoch": 0.03582461756057617,
"grad_norm": 0.15366144478321075,
"learning_rate": 0.0015,
"loss": 2.8162,
"step": 5775
},
{
"epoch": 0.03597970248508083,
"grad_norm": 0.12743431329727173,
"learning_rate": 0.0015,
"loss": 2.8635,
"step": 5800
},
{
"epoch": 0.03597970248508083,
"eval_loss": 4.674878120422363,
"perplexity": 107.21949768066406,
"step": 5800
},
{
"epoch": 0.03613478740958549,
"grad_norm": 0.08773666620254517,
"learning_rate": 0.0015,
"loss": 2.8787,
"step": 5825
},
{
"epoch": 0.036289872334090145,
"grad_norm": 0.11721781641244888,
"learning_rate": 0.0015,
"loss": 2.853,
"step": 5850
},
{
"epoch": 0.036444957258594805,
"grad_norm": 0.09957700222730637,
"learning_rate": 0.0015,
"loss": 2.8163,
"step": 5875
},
{
"epoch": 0.036600042183099465,
"grad_norm": 0.09999966621398926,
"learning_rate": 0.0015,
"loss": 2.8206,
"step": 5900
},
{
"epoch": 0.036755127107604126,
"grad_norm": 0.09899301081895828,
"learning_rate": 0.0015,
"loss": 2.8378,
"step": 5925
},
{
"epoch": 0.036910212032108786,
"grad_norm": 0.09676779061555862,
"learning_rate": 0.0015,
"loss": 2.8385,
"step": 5950
},
{
"epoch": 0.03706529695661344,
"grad_norm": 0.14397811889648438,
"learning_rate": 0.0015,
"loss": 2.8639,
"step": 5975
},
{
"epoch": 0.0372203818811181,
"grad_norm": 0.08991026133298874,
"learning_rate": 0.0015,
"loss": 2.862,
"step": 6000
},
{
"epoch": 0.0372203818811181,
"eval_loss": 4.649503707885742,
"perplexity": 104.53309631347656,
"step": 6000
},
{
"epoch": 0.03737546680562276,
"grad_norm": 0.11916879564523697,
"learning_rate": 0.0015,
"loss": 2.8336,
"step": 6025
},
{
"epoch": 0.03753055173012742,
"grad_norm": 0.1533547192811966,
"learning_rate": 0.0015,
"loss": 2.8154,
"step": 6050
},
{
"epoch": 0.03768563665463208,
"grad_norm": 0.10416785627603531,
"learning_rate": 0.0015,
"loss": 2.8073,
"step": 6075
},
{
"epoch": 0.03784072157913673,
"grad_norm": 0.1307593733072281,
"learning_rate": 0.0015,
"loss": 2.8227,
"step": 6100
},
{
"epoch": 0.03799580650364139,
"grad_norm": 0.11226139962673187,
"learning_rate": 0.0015,
"loss": 2.8316,
"step": 6125
},
{
"epoch": 0.03815089142814605,
"grad_norm": 0.12050950527191162,
"learning_rate": 0.0015,
"loss": 2.8636,
"step": 6150
},
{
"epoch": 0.03830597635265071,
"grad_norm": 0.14836955070495605,
"learning_rate": 0.0015,
"loss": 2.8433,
"step": 6175
},
{
"epoch": 0.03846106127715537,
"grad_norm": 0.1240909993648529,
"learning_rate": 0.0015,
"loss": 2.885,
"step": 6200
},
{
"epoch": 0.03846106127715537,
"eval_loss": 4.652696132659912,
"perplexity": 104.86734008789062,
"step": 6200
},
{
"epoch": 0.038616146201660026,
"grad_norm": 0.09549515694379807,
"learning_rate": 0.0015,
"loss": 2.822,
"step": 6225
},
{
"epoch": 0.038771231126164686,
"grad_norm": 0.1386450082063675,
"learning_rate": 0.0015,
"loss": 2.8455,
"step": 6250
},
{
"epoch": 0.038926316050669346,
"grad_norm": 0.10233025252819061,
"learning_rate": 0.0015,
"loss": 2.834,
"step": 6275
},
{
"epoch": 0.039081400975174006,
"grad_norm": 0.09776704013347626,
"learning_rate": 0.0015,
"loss": 2.8114,
"step": 6300
},
{
"epoch": 0.039236485899678666,
"grad_norm": 0.09631351381540298,
"learning_rate": 0.0015,
"loss": 2.8107,
"step": 6325
},
{
"epoch": 0.03939157082418332,
"grad_norm": 0.08424117416143417,
"learning_rate": 0.0015,
"loss": 2.8373,
"step": 6350
},
{
"epoch": 0.03954665574868798,
"grad_norm": 0.14171521365642548,
"learning_rate": 0.0015,
"loss": 2.8394,
"step": 6375
},
{
"epoch": 0.03970174067319264,
"grad_norm": 0.11349046230316162,
"learning_rate": 0.0015,
"loss": 2.8131,
"step": 6400
},
{
"epoch": 0.03970174067319264,
"eval_loss": 4.652514934539795,
"perplexity": 104.84834289550781,
"step": 6400
},
{
"epoch": 0.0398568255976973,
"grad_norm": 0.09066054224967957,
"learning_rate": 0.0015,
"loss": 2.8758,
"step": 6425
},
{
"epoch": 0.04001191052220196,
"grad_norm": 0.09391192346811295,
"learning_rate": 0.0015,
"loss": 2.826,
"step": 6450
},
{
"epoch": 0.04016699544670661,
"grad_norm": 0.17412593960762024,
"learning_rate": 0.0015,
"loss": 2.8487,
"step": 6475
},
{
"epoch": 0.04032208037121127,
"grad_norm": 0.17672564089298248,
"learning_rate": 0.0015,
"loss": 2.8441,
"step": 6500
},
{
"epoch": 0.04047716529571593,
"grad_norm": 0.11427825689315796,
"learning_rate": 0.0015,
"loss": 2.8843,
"step": 6525
},
{
"epoch": 0.04063225022022059,
"grad_norm": 0.13745597004890442,
"learning_rate": 0.0015,
"loss": 2.8458,
"step": 6550
},
{
"epoch": 0.040787335144725254,
"grad_norm": 0.12339327484369278,
"learning_rate": 0.0015,
"loss": 2.8299,
"step": 6575
},
{
"epoch": 0.040942420069229914,
"grad_norm": 0.11045660078525543,
"learning_rate": 0.0015,
"loss": 2.8504,
"step": 6600
},
{
"epoch": 0.040942420069229914,
"eval_loss": 4.645139217376709,
"perplexity": 104.0778579711914,
"step": 6600
},
{
"epoch": 0.04109750499373457,
"grad_norm": 0.14822149276733398,
"learning_rate": 0.0015,
"loss": 2.8438,
"step": 6625
},
{
"epoch": 0.04125258991823923,
"grad_norm": 0.09271769225597382,
"learning_rate": 0.0015,
"loss": 2.8195,
"step": 6650
},
{
"epoch": 0.04140767484274389,
"grad_norm": 0.12357133626937866,
"learning_rate": 0.0015,
"loss": 2.8434,
"step": 6675
},
{
"epoch": 0.04156275976724855,
"grad_norm": 0.12669824063777924,
"learning_rate": 0.0015,
"loss": 2.8262,
"step": 6700
},
{
"epoch": 0.04171784469175321,
"grad_norm": 0.10409893840551376,
"learning_rate": 0.0015,
"loss": 2.8164,
"step": 6725
},
{
"epoch": 0.04187292961625786,
"grad_norm": 0.10687699913978577,
"learning_rate": 0.0015,
"loss": 2.83,
"step": 6750
},
{
"epoch": 0.04202801454076252,
"grad_norm": 0.09924216568470001,
"learning_rate": 0.0015,
"loss": 2.8415,
"step": 6775
},
{
"epoch": 0.04218309946526718,
"grad_norm": 0.11719833314418793,
"learning_rate": 0.0015,
"loss": 2.8368,
"step": 6800
},
{
"epoch": 0.04218309946526718,
"eval_loss": 4.673882484436035,
"perplexity": 107.11280059814453,
"step": 6800
},
{
"epoch": 0.04233818438977184,
"grad_norm": 0.10162920504808426,
"learning_rate": 0.0015,
"loss": 2.8285,
"step": 6825
},
{
"epoch": 0.0424932693142765,
"grad_norm": 0.10563603043556213,
"learning_rate": 0.0015,
"loss": 2.809,
"step": 6850
},
{
"epoch": 0.042648354238781154,
"grad_norm": 0.079631008207798,
"learning_rate": 0.0015,
"loss": 2.8362,
"step": 6875
},
{
"epoch": 0.042803439163285814,
"grad_norm": 0.11915802210569382,
"learning_rate": 0.0015,
"loss": 2.8211,
"step": 6900
},
{
"epoch": 0.042958524087790474,
"grad_norm": 0.13783864676952362,
"learning_rate": 0.0015,
"loss": 2.8403,
"step": 6925
},
{
"epoch": 0.043113609012295134,
"grad_norm": 0.17333541810512543,
"learning_rate": 0.0015,
"loss": 2.8699,
"step": 6950
},
{
"epoch": 0.043268693936799794,
"grad_norm": 0.10923554003238678,
"learning_rate": 0.0015,
"loss": 2.8016,
"step": 6975
},
{
"epoch": 0.04342377886130445,
"grad_norm": 0.10525023192167282,
"learning_rate": 0.0015,
"loss": 2.8302,
"step": 7000
},
{
"epoch": 0.04342377886130445,
"eval_loss": 4.660215854644775,
"perplexity": 105.65888977050781,
"step": 7000
},
{
"epoch": 0.04357886378580911,
"grad_norm": 0.10499420017004013,
"learning_rate": 0.0015,
"loss": 2.8215,
"step": 7025
},
{
"epoch": 0.04373394871031377,
"grad_norm": 0.09560755640268326,
"learning_rate": 0.0015,
"loss": 2.8279,
"step": 7050
},
{
"epoch": 0.04388903363481843,
"grad_norm": 0.10454019159078598,
"learning_rate": 0.0015,
"loss": 2.8161,
"step": 7075
},
{
"epoch": 0.04404411855932309,
"grad_norm": 0.0982690081000328,
"learning_rate": 0.0015,
"loss": 2.7895,
"step": 7100
},
{
"epoch": 0.04419920348382774,
"grad_norm": 0.10405784100294113,
"learning_rate": 0.0015,
"loss": 2.7945,
"step": 7125
},
{
"epoch": 0.0443542884083324,
"grad_norm": 0.09310988336801529,
"learning_rate": 0.0015,
"loss": 2.8535,
"step": 7150
},
{
"epoch": 0.04450937333283706,
"grad_norm": 0.1031995639204979,
"learning_rate": 0.0015,
"loss": 2.8298,
"step": 7175
},
{
"epoch": 0.04466445825734172,
"grad_norm": 0.09206147491931915,
"learning_rate": 0.0015,
"loss": 2.794,
"step": 7200
},
{
"epoch": 0.04466445825734172,
"eval_loss": 4.642621994018555,
"perplexity": 103.81619262695312,
"step": 7200
},
{
"epoch": 0.04481954318184638,
"grad_norm": 0.1051359549164772,
"learning_rate": 0.0015,
"loss": 2.7996,
"step": 7225
},
{
"epoch": 0.044974628106351035,
"grad_norm": 0.12941063940525055,
"learning_rate": 0.0015,
"loss": 2.792,
"step": 7250
},
{
"epoch": 0.045129713030855695,
"grad_norm": 0.09297281503677368,
"learning_rate": 0.0015,
"loss": 2.7847,
"step": 7275
},
{
"epoch": 0.045284797955360355,
"grad_norm": 0.11114951968193054,
"learning_rate": 0.0015,
"loss": 2.8164,
"step": 7300
},
{
"epoch": 0.045439882879865015,
"grad_norm": 0.08519440144300461,
"learning_rate": 0.0015,
"loss": 2.8053,
"step": 7325
},
{
"epoch": 0.045594967804369675,
"grad_norm": 0.11148552596569061,
"learning_rate": 0.0015,
"loss": 2.7871,
"step": 7350
},
{
"epoch": 0.04575005272887433,
"grad_norm": 0.136012002825737,
"learning_rate": 0.0015,
"loss": 2.8457,
"step": 7375
},
{
"epoch": 0.04590513765337899,
"grad_norm": 0.1037759929895401,
"learning_rate": 0.0015,
"loss": 2.748,
"step": 7400
},
{
"epoch": 0.04590513765337899,
"eval_loss": 4.631537437438965,
"perplexity": 102.67179107666016,
"step": 7400
},
{
"epoch": 0.04606022257788365,
"grad_norm": 0.11162275820970535,
"learning_rate": 0.0015,
"loss": 2.8044,
"step": 7425
},
{
"epoch": 0.04621530750238831,
"grad_norm": 0.11309058219194412,
"learning_rate": 0.0015,
"loss": 2.8198,
"step": 7450
},
{
"epoch": 0.04637039242689297,
"grad_norm": 0.09359199553728104,
"learning_rate": 0.0015,
"loss": 2.8302,
"step": 7475
},
{
"epoch": 0.04652547735139762,
"grad_norm": 0.09513767808675766,
"learning_rate": 0.0015,
"loss": 2.8325,
"step": 7500
},
{
"epoch": 0.04668056227590228,
"grad_norm": 0.08243551850318909,
"learning_rate": 0.0015,
"loss": 2.7925,
"step": 7525
},
{
"epoch": 0.04683564720040694,
"grad_norm": 0.08001349121332169,
"learning_rate": 0.0015,
"loss": 2.8406,
"step": 7550
},
{
"epoch": 0.0469907321249116,
"grad_norm": 0.11749595403671265,
"learning_rate": 0.0015,
"loss": 2.7762,
"step": 7575
},
{
"epoch": 0.04714581704941626,
"grad_norm": 0.15697765350341797,
"learning_rate": 0.0015,
"loss": 2.8137,
"step": 7600
},
{
"epoch": 0.04714581704941626,
"eval_loss": 4.643322467803955,
"perplexity": 103.8889389038086,
"step": 7600
},
{
"epoch": 0.04730090197392092,
"grad_norm": 0.1004658117890358,
"learning_rate": 0.0015,
"loss": 2.7787,
"step": 7625
},
{
"epoch": 0.047455986898425576,
"grad_norm": 0.11577022075653076,
"learning_rate": 0.0015,
"loss": 2.806,
"step": 7650
},
{
"epoch": 0.047611071822930236,
"grad_norm": 0.10791046917438507,
"learning_rate": 0.0015,
"loss": 2.7637,
"step": 7675
},
{
"epoch": 0.047766156747434896,
"grad_norm": 0.09490654617547989,
"learning_rate": 0.0015,
"loss": 2.8187,
"step": 7700
},
{
"epoch": 0.047921241671939556,
"grad_norm": 0.10448817163705826,
"learning_rate": 0.0015,
"loss": 2.8335,
"step": 7725
},
{
"epoch": 0.048076326596444216,
"grad_norm": 0.10800398141145706,
"learning_rate": 0.0015,
"loss": 2.8138,
"step": 7750
},
{
"epoch": 0.04823141152094887,
"grad_norm": 0.10268035531044006,
"learning_rate": 0.0015,
"loss": 2.8074,
"step": 7775
},
{
"epoch": 0.04838649644545353,
"grad_norm": 0.145925372838974,
"learning_rate": 0.0015,
"loss": 2.8161,
"step": 7800
},
{
"epoch": 0.04838649644545353,
"eval_loss": 4.628528118133545,
"perplexity": 102.36328887939453,
"step": 7800
},
{
"epoch": 0.04854158136995819,
"grad_norm": 0.1422831267118454,
"learning_rate": 0.0015,
"loss": 2.8179,
"step": 7825
},
{
"epoch": 0.04869666629446285,
"grad_norm": 0.10019826889038086,
"learning_rate": 0.0015,
"loss": 2.8228,
"step": 7850
},
{
"epoch": 0.04885175121896751,
"grad_norm": 0.12028387933969498,
"learning_rate": 0.0015,
"loss": 2.8359,
"step": 7875
},
{
"epoch": 0.04900683614347216,
"grad_norm": 0.08171118795871735,
"learning_rate": 0.0015,
"loss": 2.7829,
"step": 7900
},
{
"epoch": 0.04916192106797682,
"grad_norm": 0.138522207736969,
"learning_rate": 0.0015,
"loss": 2.7992,
"step": 7925
},
{
"epoch": 0.04931700599248148,
"grad_norm": 0.10419227927923203,
"learning_rate": 0.0015,
"loss": 2.8097,
"step": 7950
},
{
"epoch": 0.04947209091698614,
"grad_norm": 0.1020691841840744,
"learning_rate": 0.0015,
"loss": 2.8152,
"step": 7975
},
{
"epoch": 0.0496271758414908,
"grad_norm": 0.12423787266016006,
"learning_rate": 0.0015,
"loss": 2.7966,
"step": 8000
},
{
"epoch": 0.0496271758414908,
"eval_loss": 4.6273722648620605,
"perplexity": 102.24504089355469,
"step": 8000
},
{
"epoch": 0.049782260765995456,
"grad_norm": 0.15230977535247803,
"learning_rate": 0.0015,
"loss": 2.7575,
"step": 8025
},
{
"epoch": 0.049937345690500116,
"grad_norm": 0.12649676203727722,
"learning_rate": 0.0015,
"loss": 2.7897,
"step": 8050
},
{
"epoch": 0.05009243061500478,
"grad_norm": 0.11257271468639374,
"learning_rate": 0.0015,
"loss": 2.8115,
"step": 8075
},
{
"epoch": 0.05024751553950944,
"grad_norm": 0.09349871426820755,
"learning_rate": 0.0015,
"loss": 2.8041,
"step": 8100
},
{
"epoch": 0.0504026004640141,
"grad_norm": 0.14108401536941528,
"learning_rate": 0.0015,
"loss": 2.7772,
"step": 8125
},
{
"epoch": 0.05055768538851875,
"grad_norm": 0.17286863923072815,
"learning_rate": 0.0015,
"loss": 2.8197,
"step": 8150
},
{
"epoch": 0.05071277031302341,
"grad_norm": 0.10759209096431732,
"learning_rate": 0.0015,
"loss": 2.8396,
"step": 8175
},
{
"epoch": 0.05086785523752807,
"grad_norm": 0.10236554592847824,
"learning_rate": 0.0015,
"loss": 2.8175,
"step": 8200
},
{
"epoch": 0.05086785523752807,
"eval_loss": 4.610519886016846,
"perplexity": 100.5363998413086,
"step": 8200
},
{
"epoch": 0.05102294016203273,
"grad_norm": 0.12348885089159012,
"learning_rate": 0.0015,
"loss": 2.8139,
"step": 8225
},
{
"epoch": 0.05117802508653739,
"grad_norm": 0.10251584649085999,
"learning_rate": 0.0015,
"loss": 2.8436,
"step": 8250
},
{
"epoch": 0.051333110011042044,
"grad_norm": 0.10069389641284943,
"learning_rate": 0.0015,
"loss": 2.8409,
"step": 8275
},
{
"epoch": 0.051488194935546704,
"grad_norm": 0.1546829789876938,
"learning_rate": 0.0015,
"loss": 2.8199,
"step": 8300
},
{
"epoch": 0.051643279860051364,
"grad_norm": 0.10704527795314789,
"learning_rate": 0.0015,
"loss": 2.7721,
"step": 8325
},
{
"epoch": 0.051798364784556024,
"grad_norm": 0.12251198291778564,
"learning_rate": 0.0015,
"loss": 2.8175,
"step": 8350
},
{
"epoch": 0.051953449709060684,
"grad_norm": 0.11113474518060684,
"learning_rate": 0.0015,
"loss": 2.8085,
"step": 8375
},
{
"epoch": 0.05210853463356534,
"grad_norm": 0.1341187059879303,
"learning_rate": 0.0015,
"loss": 2.8169,
"step": 8400
},
{
"epoch": 0.05210853463356534,
"eval_loss": 4.610434532165527,
"perplexity": 100.52782440185547,
"step": 8400
},
{
"epoch": 0.05226361955807,
"grad_norm": 0.16195224225521088,
"learning_rate": 0.0015,
"loss": 2.8266,
"step": 8425
},
{
"epoch": 0.05241870448257466,
"grad_norm": 0.1637653261423111,
"learning_rate": 0.0015,
"loss": 2.8106,
"step": 8450
},
{
"epoch": 0.05257378940707932,
"grad_norm": 0.10014921426773071,
"learning_rate": 0.0015,
"loss": 2.8103,
"step": 8475
},
{
"epoch": 0.05272887433158398,
"grad_norm": 0.11419603228569031,
"learning_rate": 0.0015,
"loss": 2.7965,
"step": 8500
},
{
"epoch": 0.05288395925608863,
"grad_norm": 0.08137035369873047,
"learning_rate": 0.0015,
"loss": 2.7802,
"step": 8525
},
{
"epoch": 0.05303904418059329,
"grad_norm": 0.08078640699386597,
"learning_rate": 0.0015,
"loss": 2.7819,
"step": 8550
},
{
"epoch": 0.05319412910509795,
"grad_norm": 0.13133442401885986,
"learning_rate": 0.0015,
"loss": 2.83,
"step": 8575
},
{
"epoch": 0.05334921402960261,
"grad_norm": 0.08819993585348129,
"learning_rate": 0.0015,
"loss": 2.833,
"step": 8600
},
{
"epoch": 0.05334921402960261,
"eval_loss": 4.603670120239258,
"perplexity": 99.85010528564453,
"step": 8600
},
{
"epoch": 0.05350429895410727,
"grad_norm": 0.14662431180477142,
"learning_rate": 0.0015,
"loss": 2.8201,
"step": 8625
},
{
"epoch": 0.05365938387861193,
"grad_norm": 0.10400764644145966,
"learning_rate": 0.0015,
"loss": 2.7944,
"step": 8650
},
{
"epoch": 0.053814468803116584,
"grad_norm": 0.2790142297744751,
"learning_rate": 0.0015,
"loss": 2.8307,
"step": 8675
},
{
"epoch": 0.053969553727621244,
"grad_norm": 0.13645683228969574,
"learning_rate": 0.0015,
"loss": 2.7904,
"step": 8700
},
{
"epoch": 0.054124638652125905,
"grad_norm": 0.09604925662279129,
"learning_rate": 0.0015,
"loss": 2.76,
"step": 8725
},
{
"epoch": 0.054279723576630565,
"grad_norm": 0.07631650567054749,
"learning_rate": 0.0015,
"loss": 2.7955,
"step": 8750
},
{
"epoch": 0.054434808501135225,
"grad_norm": 0.13132531940937042,
"learning_rate": 0.0015,
"loss": 2.8308,
"step": 8775
},
{
"epoch": 0.05458989342563988,
"grad_norm": 0.08334681391716003,
"learning_rate": 0.0015,
"loss": 2.755,
"step": 8800
},
{
"epoch": 0.05458989342563988,
"eval_loss": 4.597860336303711,
"perplexity": 99.27168273925781,
"step": 8800
},
{
"epoch": 0.05474497835014454,
"grad_norm": 0.10585317760705948,
"learning_rate": 0.0015,
"loss": 2.7708,
"step": 8825
},
{
"epoch": 0.0549000632746492,
"grad_norm": 0.08953095227479935,
"learning_rate": 0.0015,
"loss": 2.7622,
"step": 8850
},
{
"epoch": 0.05505514819915386,
"grad_norm": 0.10430523008108139,
"learning_rate": 0.0015,
"loss": 2.8255,
"step": 8875
},
{
"epoch": 0.05521023312365852,
"grad_norm": 0.08961856365203857,
"learning_rate": 0.0015,
"loss": 2.7835,
"step": 8900
},
{
"epoch": 0.05536531804816317,
"grad_norm": 0.13602201640605927,
"learning_rate": 0.0015,
"loss": 2.813,
"step": 8925
},
{
"epoch": 0.05552040297266783,
"grad_norm": 0.1858643889427185,
"learning_rate": 0.0015,
"loss": 2.8296,
"step": 8950
},
{
"epoch": 0.05567548789717249,
"grad_norm": 0.12873806059360504,
"learning_rate": 0.0015,
"loss": 2.7669,
"step": 8975
},
{
"epoch": 0.05583057282167715,
"grad_norm": 0.09891733527183533,
"learning_rate": 0.0015,
"loss": 2.7829,
"step": 9000
},
{
"epoch": 0.05583057282167715,
"eval_loss": 4.606179714202881,
"perplexity": 100.10100555419922,
"step": 9000
},
{
"epoch": 0.05598565774618181,
"grad_norm": 0.1619413048028946,
"learning_rate": 0.0015,
"loss": 2.7885,
"step": 9025
},
{
"epoch": 0.056140742670686465,
"grad_norm": 0.1223379522562027,
"learning_rate": 0.0015,
"loss": 2.7829,
"step": 9050
},
{
"epoch": 0.056295827595191125,
"grad_norm": 0.10872245579957962,
"learning_rate": 0.0015,
"loss": 2.7962,
"step": 9075
},
{
"epoch": 0.056450912519695785,
"grad_norm": 0.11461862176656723,
"learning_rate": 0.0015,
"loss": 2.7476,
"step": 9100
},
{
"epoch": 0.056605997444200445,
"grad_norm": 0.08933119475841522,
"learning_rate": 0.0015,
"loss": 2.7745,
"step": 9125
},
{
"epoch": 0.056761082368705106,
"grad_norm": 0.12911683320999146,
"learning_rate": 0.0015,
"loss": 2.8029,
"step": 9150
},
{
"epoch": 0.05691616729320976,
"grad_norm": 0.13963252305984497,
"learning_rate": 0.0015,
"loss": 2.7931,
"step": 9175
},
{
"epoch": 0.05707125221771442,
"grad_norm": 0.13462606072425842,
"learning_rate": 0.0015,
"loss": 2.7771,
"step": 9200
},
{
"epoch": 0.05707125221771442,
"eval_loss": 4.619841575622559,
"perplexity": 101.47795104980469,
"step": 9200
},
{
"epoch": 0.05722633714221908,
"grad_norm": 0.12551379203796387,
"learning_rate": 0.0015,
"loss": 2.7934,
"step": 9225
},
{
"epoch": 0.05738142206672374,
"grad_norm": 0.12379872798919678,
"learning_rate": 0.0015,
"loss": 2.7882,
"step": 9250
},
{
"epoch": 0.0575365069912284,
"grad_norm": 0.0940781831741333,
"learning_rate": 0.0015,
"loss": 2.7658,
"step": 9275
},
{
"epoch": 0.05769159191573305,
"grad_norm": 0.14165829122066498,
"learning_rate": 0.0015,
"loss": 2.7973,
"step": 9300
},
{
"epoch": 0.05784667684023771,
"grad_norm": 0.10727201402187347,
"learning_rate": 0.0015,
"loss": 2.815,
"step": 9325
},
{
"epoch": 0.05800176176474237,
"grad_norm": 0.1628653109073639,
"learning_rate": 0.0015,
"loss": 2.7854,
"step": 9350
},
{
"epoch": 0.05815684668924703,
"grad_norm": 0.09925588220357895,
"learning_rate": 0.0015,
"loss": 2.7578,
"step": 9375
},
{
"epoch": 0.05831193161375169,
"grad_norm": 0.1587476134300232,
"learning_rate": 0.0015,
"loss": 2.7296,
"step": 9400
},
{
"epoch": 0.05831193161375169,
"eval_loss": 4.604221343994141,
"perplexity": 99.90515899658203,
"step": 9400
},
{
"epoch": 0.058467016538256346,
"grad_norm": 0.10519708693027496,
"learning_rate": 0.0015,
"loss": 2.7712,
"step": 9425
},
{
"epoch": 0.058622101462761006,
"grad_norm": 0.10321429371833801,
"learning_rate": 0.0015,
"loss": 2.7281,
"step": 9450
},
{
"epoch": 0.058777186387265666,
"grad_norm": 0.20060209929943085,
"learning_rate": 0.0015,
"loss": 2.807,
"step": 9475
},
{
"epoch": 0.058932271311770326,
"grad_norm": 0.10847010463476181,
"learning_rate": 0.0015,
"loss": 2.8078,
"step": 9500
},
{
"epoch": 0.059087356236274986,
"grad_norm": 0.11248752474784851,
"learning_rate": 0.0015,
"loss": 2.796,
"step": 9525
},
{
"epoch": 0.059242441160779646,
"grad_norm": 0.13171915709972382,
"learning_rate": 0.0015,
"loss": 2.7658,
"step": 9550
},
{
"epoch": 0.0593975260852843,
"grad_norm": 0.12041529268026352,
"learning_rate": 0.0015,
"loss": 2.7507,
"step": 9575
},
{
"epoch": 0.05955261100978896,
"grad_norm": 0.11275593191385269,
"learning_rate": 0.0015,
"loss": 2.8022,
"step": 9600
},
{
"epoch": 0.05955261100978896,
"eval_loss": 4.5886077880859375,
"perplexity": 98.3573989868164,
"step": 9600
},
{
"epoch": 0.05970769593429362,
"grad_norm": 0.1715971678495407,
"learning_rate": 0.0015,
"loss": 2.8003,
"step": 9625
},
{
"epoch": 0.05986278085879828,
"grad_norm": 0.1223614364862442,
"learning_rate": 0.0015,
"loss": 2.8012,
"step": 9650
},
{
"epoch": 0.06001786578330294,
"grad_norm": 0.114704430103302,
"learning_rate": 0.0015,
"loss": 2.7963,
"step": 9675
},
{
"epoch": 0.06017295070780759,
"grad_norm": 0.10282139480113983,
"learning_rate": 0.0015,
"loss": 2.7965,
"step": 9700
},
{
"epoch": 0.06032803563231225,
"grad_norm": 0.10494767129421234,
"learning_rate": 0.0015,
"loss": 2.7698,
"step": 9725
},
{
"epoch": 0.06048312055681691,
"grad_norm": 0.0908605083823204,
"learning_rate": 0.0015,
"loss": 2.749,
"step": 9750
},
{
"epoch": 0.06063820548132157,
"grad_norm": 0.0847998857498169,
"learning_rate": 0.0015,
"loss": 2.838,
"step": 9775
},
{
"epoch": 0.060793290405826234,
"grad_norm": 0.24615754187107086,
"learning_rate": 0.0015,
"loss": 2.8117,
"step": 9800
},
{
"epoch": 0.060793290405826234,
"eval_loss": 4.593789100646973,
"perplexity": 98.86833953857422,
"step": 9800
},
{
"epoch": 0.06094837533033089,
"grad_norm": 0.0959208682179451,
"learning_rate": 0.0015,
"loss": 2.7845,
"step": 9825
},
{
"epoch": 0.06110346025483555,
"grad_norm": 0.09963307529687881,
"learning_rate": 0.0015,
"loss": 2.8296,
"step": 9850
},
{
"epoch": 0.06125854517934021,
"grad_norm": 0.1115136444568634,
"learning_rate": 0.0015,
"loss": 2.7586,
"step": 9875
},
{
"epoch": 0.06141363010384487,
"grad_norm": 0.13883067667484283,
"learning_rate": 0.0015,
"loss": 2.7978,
"step": 9900
},
{
"epoch": 0.06156871502834953,
"grad_norm": 0.2048570066690445,
"learning_rate": 0.0015,
"loss": 2.8397,
"step": 9925
},
{
"epoch": 0.06172379995285418,
"grad_norm": 0.1306881606578827,
"learning_rate": 0.0015,
"loss": 2.8084,
"step": 9950
},
{
"epoch": 0.06187888487735884,
"grad_norm": 0.18285603821277618,
"learning_rate": 0.0015,
"loss": 2.7989,
"step": 9975
},
{
"epoch": 0.0620339698018635,
"grad_norm": 0.1109723299741745,
"learning_rate": 0.0015,
"loss": 2.8064,
"step": 10000
},
{
"epoch": 0.0620339698018635,
"eval_loss": 4.5877556800842285,
"perplexity": 98.27362823486328,
"step": 10000
},
{
"epoch": 0.06218905472636816,
"grad_norm": 0.12350066751241684,
"learning_rate": 0.0015,
"loss": 2.7684,
"step": 10025
},
{
"epoch": 0.06234413965087282,
"grad_norm": 0.11565285176038742,
"learning_rate": 0.0015,
"loss": 2.7748,
"step": 10050
},
{
"epoch": 0.062499224575377474,
"grad_norm": 0.1117839589715004,
"learning_rate": 0.0015,
"loss": 2.8044,
"step": 10075
},
{
"epoch": 0.06265430949988214,
"grad_norm": 0.1102209985256195,
"learning_rate": 0.0015,
"loss": 2.7844,
"step": 10100
},
{
"epoch": 0.0628093944243868,
"grad_norm": 0.10270575433969498,
"learning_rate": 0.0015,
"loss": 2.7685,
"step": 10125
},
{
"epoch": 0.06296447934889145,
"grad_norm": 0.09842963516712189,
"learning_rate": 0.0015,
"loss": 2.8048,
"step": 10150
},
{
"epoch": 0.06311956427339611,
"grad_norm": 0.10446088761091232,
"learning_rate": 0.0015,
"loss": 2.8051,
"step": 10175
},
{
"epoch": 0.06327464919790077,
"grad_norm": 0.14759957790374756,
"learning_rate": 0.0015,
"loss": 2.8089,
"step": 10200
},
{
"epoch": 0.06327464919790077,
"eval_loss": 4.588883399963379,
"perplexity": 98.38451385498047,
"step": 10200
},
{
"epoch": 0.06342973412240543,
"grad_norm": 0.12910906970500946,
"learning_rate": 0.0015,
"loss": 2.8193,
"step": 10225
},
{
"epoch": 0.06358481904691009,
"grad_norm": 0.13095402717590332,
"learning_rate": 0.0015,
"loss": 2.7509,
"step": 10250
},
{
"epoch": 0.06373990397141474,
"grad_norm": 0.16069594025611877,
"learning_rate": 0.0015,
"loss": 2.7911,
"step": 10275
},
{
"epoch": 0.06389498889591941,
"grad_norm": 0.08322907984256744,
"learning_rate": 0.0015,
"loss": 2.8025,
"step": 10300
},
{
"epoch": 0.06405007382042406,
"grad_norm": 0.2328927367925644,
"learning_rate": 0.0015,
"loss": 2.7863,
"step": 10325
},
{
"epoch": 0.06420515874492873,
"grad_norm": 0.09172859787940979,
"learning_rate": 0.0015,
"loss": 2.8101,
"step": 10350
},
{
"epoch": 0.06436024366943338,
"grad_norm": 0.13464473187923431,
"learning_rate": 0.0015,
"loss": 2.7718,
"step": 10375
},
{
"epoch": 0.06451532859393803,
"grad_norm": 0.1284090131521225,
"learning_rate": 0.0015,
"loss": 2.7667,
"step": 10400
},
{
"epoch": 0.06451532859393803,
"eval_loss": 4.59510612487793,
"perplexity": 98.99864196777344,
"step": 10400
},
{
"epoch": 0.0646704135184427,
"grad_norm": 0.13565704226493835,
"learning_rate": 0.0015,
"loss": 2.7552,
"step": 10425
},
{
"epoch": 0.06482549844294735,
"grad_norm": 0.1089024469256401,
"learning_rate": 0.0015,
"loss": 2.7838,
"step": 10450
},
{
"epoch": 0.06498058336745202,
"grad_norm": 0.11035135388374329,
"learning_rate": 0.0015,
"loss": 2.7986,
"step": 10475
},
{
"epoch": 0.06513566829195667,
"grad_norm": 0.08107917010784149,
"learning_rate": 0.0015,
"loss": 2.7791,
"step": 10500
},
{
"epoch": 0.06529075321646133,
"grad_norm": 0.10200012475252151,
"learning_rate": 0.0015,
"loss": 2.7636,
"step": 10525
},
{
"epoch": 0.065445838140966,
"grad_norm": 0.08427785336971283,
"learning_rate": 0.0015,
"loss": 2.794,
"step": 10550
},
{
"epoch": 0.06560092306547065,
"grad_norm": 0.10828018933534622,
"learning_rate": 0.0015,
"loss": 2.7778,
"step": 10575
},
{
"epoch": 0.06575600798997532,
"grad_norm": 0.12101134657859802,
"learning_rate": 0.0015,
"loss": 2.7469,
"step": 10600
},
{
"epoch": 0.06575600798997532,
"eval_loss": 4.597805500030518,
"perplexity": 99.2662353515625,
"step": 10600
},
{
"epoch": 0.06591109291447997,
"grad_norm": 0.11220554262399673,
"learning_rate": 0.0015,
"loss": 2.7294,
"step": 10625
},
{
"epoch": 0.06606617783898462,
"grad_norm": 0.13899332284927368,
"learning_rate": 0.0015,
"loss": 2.763,
"step": 10650
},
{
"epoch": 0.06622126276348929,
"grad_norm": 0.11773937195539474,
"learning_rate": 0.0015,
"loss": 2.7866,
"step": 10675
},
{
"epoch": 0.06637634768799394,
"grad_norm": 0.11059702187776566,
"learning_rate": 0.0015,
"loss": 2.8076,
"step": 10700
},
{
"epoch": 0.06653143261249861,
"grad_norm": 0.1251254379749298,
"learning_rate": 0.0015,
"loss": 2.7674,
"step": 10725
},
{
"epoch": 0.06668651753700326,
"grad_norm": 0.12195979803800583,
"learning_rate": 0.0015,
"loss": 2.768,
"step": 10750
},
{
"epoch": 0.06684160246150792,
"grad_norm": 0.1487302929162979,
"learning_rate": 0.0015,
"loss": 2.762,
"step": 10775
},
{
"epoch": 0.06699668738601258,
"grad_norm": 0.1315547525882721,
"learning_rate": 0.0015,
"loss": 2.7348,
"step": 10800
},
{
"epoch": 0.06699668738601258,
"eval_loss": 4.566490650177002,
"perplexity": 96.20589447021484,
"step": 10800
},
{
"epoch": 0.06715177231051724,
"grad_norm": 0.13864025473594666,
"learning_rate": 0.0015,
"loss": 2.7517,
"step": 10825
},
{
"epoch": 0.0673068572350219,
"grad_norm": 0.08808566629886627,
"learning_rate": 0.0015,
"loss": 2.7718,
"step": 10850
},
{
"epoch": 0.06746194215952656,
"grad_norm": 0.115321584045887,
"learning_rate": 0.0015,
"loss": 2.7007,
"step": 10875
},
{
"epoch": 0.06761702708403121,
"grad_norm": 0.10276370495557785,
"learning_rate": 0.0015,
"loss": 2.7692,
"step": 10900
},
{
"epoch": 0.06777211200853588,
"grad_norm": 0.09534792602062225,
"learning_rate": 0.0015,
"loss": 2.8186,
"step": 10925
},
{
"epoch": 0.06792719693304053,
"grad_norm": 0.14239507913589478,
"learning_rate": 0.0015,
"loss": 2.7801,
"step": 10950
},
{
"epoch": 0.0680822818575452,
"grad_norm": 0.11848737299442291,
"learning_rate": 0.0015,
"loss": 2.7394,
"step": 10975
},
{
"epoch": 0.06823736678204985,
"grad_norm": 0.09367898106575012,
"learning_rate": 0.0015,
"loss": 2.8043,
"step": 11000
},
{
"epoch": 0.06823736678204985,
"eval_loss": 4.5800089836120605,
"perplexity": 97.51527404785156,
"step": 11000
},
{
"epoch": 0.0683924517065545,
"grad_norm": 0.1494915634393692,
"learning_rate": 0.0015,
"loss": 2.7841,
"step": 11025
},
{
"epoch": 0.06854753663105917,
"grad_norm": 0.09982737898826599,
"learning_rate": 0.0015,
"loss": 2.7933,
"step": 11050
},
{
"epoch": 0.06870262155556382,
"grad_norm": 0.12379477173089981,
"learning_rate": 0.0015,
"loss": 2.7419,
"step": 11075
},
{
"epoch": 0.06885770648006849,
"grad_norm": 0.11405149102210999,
"learning_rate": 0.0015,
"loss": 2.763,
"step": 11100
},
{
"epoch": 0.06901279140457314,
"grad_norm": 0.09574620425701141,
"learning_rate": 0.0015,
"loss": 2.7961,
"step": 11125
},
{
"epoch": 0.06916787632907781,
"grad_norm": 0.2947874963283539,
"learning_rate": 0.0015,
"loss": 2.789,
"step": 11150
},
{
"epoch": 0.06932296125358246,
"grad_norm": 0.09219149500131607,
"learning_rate": 0.0015,
"loss": 2.7951,
"step": 11175
},
{
"epoch": 0.06947804617808712,
"grad_norm": 0.11840498447418213,
"learning_rate": 0.0015,
"loss": 2.7717,
"step": 11200
},
{
"epoch": 0.06947804617808712,
"eval_loss": 4.564184188842773,
"perplexity": 95.98426055908203,
"step": 11200
},
{
"epoch": 0.06963313110259178,
"grad_norm": 0.09422053396701813,
"learning_rate": 0.0015,
"loss": 2.7976,
"step": 11225
},
{
"epoch": 0.06978821602709644,
"grad_norm": 0.11220031976699829,
"learning_rate": 0.0015,
"loss": 2.7634,
"step": 11250
},
{
"epoch": 0.0699433009516011,
"grad_norm": 0.10228817909955978,
"learning_rate": 0.0015,
"loss": 2.7256,
"step": 11275
},
{
"epoch": 0.07009838587610576,
"grad_norm": 0.0929483100771904,
"learning_rate": 0.0015,
"loss": 2.8005,
"step": 11300
},
{
"epoch": 0.07025347080061041,
"grad_norm": 0.11491668224334717,
"learning_rate": 0.0015,
"loss": 2.7504,
"step": 11325
},
{
"epoch": 0.07040855572511508,
"grad_norm": 0.15256111323833466,
"learning_rate": 0.0015,
"loss": 2.7609,
"step": 11350
},
{
"epoch": 0.07056364064961973,
"grad_norm": 0.11576159298419952,
"learning_rate": 0.0015,
"loss": 2.7742,
"step": 11375
},
{
"epoch": 0.0707187255741244,
"grad_norm": 0.08809765428304672,
"learning_rate": 0.0015,
"loss": 2.7891,
"step": 11400
},
{
"epoch": 0.0707187255741244,
"eval_loss": 4.568883895874023,
"perplexity": 96.43641662597656,
"step": 11400
},
{
"epoch": 0.07087381049862905,
"grad_norm": 0.08563827723264694,
"learning_rate": 0.0015,
"loss": 2.8066,
"step": 11425
},
{
"epoch": 0.0710288954231337,
"grad_norm": 0.18896931409835815,
"learning_rate": 0.0015,
"loss": 2.8055,
"step": 11450
},
{
"epoch": 0.07118398034763837,
"grad_norm": 0.13940319418907166,
"learning_rate": 0.0015,
"loss": 2.7766,
"step": 11475
},
{
"epoch": 0.07133906527214302,
"grad_norm": 0.09737322479486465,
"learning_rate": 0.0015,
"loss": 2.7945,
"step": 11500
},
{
"epoch": 0.07149415019664769,
"grad_norm": 0.11357785761356354,
"learning_rate": 0.0015,
"loss": 2.7799,
"step": 11525
},
{
"epoch": 0.07164923512115234,
"grad_norm": 0.10513681918382645,
"learning_rate": 0.0015,
"loss": 2.7627,
"step": 11550
},
{
"epoch": 0.071804320045657,
"grad_norm": 0.1434682458639145,
"learning_rate": 0.0015,
"loss": 2.8055,
"step": 11575
},
{
"epoch": 0.07195940497016166,
"grad_norm": 0.10169105976819992,
"learning_rate": 0.0015,
"loss": 2.7832,
"step": 11600
},
{
"epoch": 0.07195940497016166,
"eval_loss": 4.560365676879883,
"perplexity": 95.61843872070312,
"step": 11600
},
{
"epoch": 0.07211448989466632,
"grad_norm": 0.1385478526353836,
"learning_rate": 0.0015,
"loss": 2.7548,
"step": 11625
},
{
"epoch": 0.07226957481917098,
"grad_norm": 0.1300746351480484,
"learning_rate": 0.0015,
"loss": 2.7553,
"step": 11650
},
{
"epoch": 0.07242465974367564,
"grad_norm": 0.11596991866827011,
"learning_rate": 0.0015,
"loss": 2.8095,
"step": 11675
},
{
"epoch": 0.07257974466818029,
"grad_norm": 0.11611347645521164,
"learning_rate": 0.0015,
"loss": 2.76,
"step": 11700
},
{
"epoch": 0.07273482959268496,
"grad_norm": 0.11249697953462601,
"learning_rate": 0.0015,
"loss": 2.7827,
"step": 11725
},
{
"epoch": 0.07288991451718961,
"grad_norm": 0.1243973895907402,
"learning_rate": 0.0015,
"loss": 2.7754,
"step": 11750
},
{
"epoch": 0.07304499944169428,
"grad_norm": 0.08843350410461426,
"learning_rate": 0.0015,
"loss": 2.8079,
"step": 11775
},
{
"epoch": 0.07320008436619893,
"grad_norm": 0.09881053864955902,
"learning_rate": 0.0015,
"loss": 2.7961,
"step": 11800
},
{
"epoch": 0.07320008436619893,
"eval_loss": 4.567913055419922,
"perplexity": 96.34283447265625,
"step": 11800
},
{
"epoch": 0.07335516929070358,
"grad_norm": 0.08978071063756943,
"learning_rate": 0.0015,
"loss": 2.7786,
"step": 11825
},
{
"epoch": 0.07351025421520825,
"grad_norm": 0.1376107782125473,
"learning_rate": 0.0015,
"loss": 2.7931,
"step": 11850
},
{
"epoch": 0.0736653391397129,
"grad_norm": 0.09934777021408081,
"learning_rate": 0.0015,
"loss": 2.7787,
"step": 11875
},
{
"epoch": 0.07382042406421757,
"grad_norm": 0.17031100392341614,
"learning_rate": 0.0015,
"loss": 2.7997,
"step": 11900
},
{
"epoch": 0.07397550898872222,
"grad_norm": 0.13974526524543762,
"learning_rate": 0.0015,
"loss": 2.7975,
"step": 11925
},
{
"epoch": 0.07413059391322688,
"grad_norm": 0.12611718475818634,
"learning_rate": 0.0015,
"loss": 2.792,
"step": 11950
},
{
"epoch": 0.07428567883773154,
"grad_norm": 0.15177124738693237,
"learning_rate": 0.0015,
"loss": 2.7904,
"step": 11975
},
{
"epoch": 0.0744407637622362,
"grad_norm": 0.1411113739013672,
"learning_rate": 0.0015,
"loss": 2.7677,
"step": 12000
},
{
"epoch": 0.0744407637622362,
"eval_loss": 4.5571770668029785,
"perplexity": 95.31403350830078,
"step": 12000
},
{
"epoch": 0.07459584868674086,
"grad_norm": 0.08981940150260925,
"learning_rate": 0.0015,
"loss": 2.7765,
"step": 12025
},
{
"epoch": 0.07475093361124552,
"grad_norm": 0.09796686470508575,
"learning_rate": 0.0015,
"loss": 2.7503,
"step": 12050
},
{
"epoch": 0.07490601853575017,
"grad_norm": 0.1125386580824852,
"learning_rate": 0.0015,
"loss": 2.7263,
"step": 12075
},
{
"epoch": 0.07506110346025484,
"grad_norm": 0.11394508183002472,
"learning_rate": 0.0015,
"loss": 2.7855,
"step": 12100
},
{
"epoch": 0.07521618838475949,
"grad_norm": 0.11744117736816406,
"learning_rate": 0.0015,
"loss": 2.7698,
"step": 12125
},
{
"epoch": 0.07537127330926416,
"grad_norm": 0.17264704406261444,
"learning_rate": 0.0015,
"loss": 2.7592,
"step": 12150
},
{
"epoch": 0.07552635823376881,
"grad_norm": 0.10691671818494797,
"learning_rate": 0.0015,
"loss": 2.7519,
"step": 12175
},
{
"epoch": 0.07568144315827346,
"grad_norm": 0.1205432191491127,
"learning_rate": 0.0015,
"loss": 2.7676,
"step": 12200
},
{
"epoch": 0.07568144315827346,
"eval_loss": 4.544521808624268,
"perplexity": 94.11540985107422,
"step": 12200
},
{
"epoch": 0.07583652808277813,
"grad_norm": 0.1253867894411087,
"learning_rate": 0.0015,
"loss": 2.7698,
"step": 12225
},
{
"epoch": 0.07599161300728279,
"grad_norm": 0.1450471729040146,
"learning_rate": 0.0015,
"loss": 2.77,
"step": 12250
},
{
"epoch": 0.07614669793178745,
"grad_norm": 0.17055222392082214,
"learning_rate": 0.0015,
"loss": 2.7352,
"step": 12275
},
{
"epoch": 0.0763017828562921,
"grad_norm": 0.10687011480331421,
"learning_rate": 0.0015,
"loss": 2.7988,
"step": 12300
},
{
"epoch": 0.07645686778079676,
"grad_norm": 0.15520496666431427,
"learning_rate": 0.0015,
"loss": 2.7828,
"step": 12325
},
{
"epoch": 0.07661195270530143,
"grad_norm": 0.09279755502939224,
"learning_rate": 0.0015,
"loss": 2.7222,
"step": 12350
},
{
"epoch": 0.07676703762980608,
"grad_norm": 0.18024928867816925,
"learning_rate": 0.0015,
"loss": 2.7555,
"step": 12375
},
{
"epoch": 0.07692212255431075,
"grad_norm": 0.13292630016803741,
"learning_rate": 0.0015,
"loss": 2.733,
"step": 12400
},
{
"epoch": 0.07692212255431075,
"eval_loss": 4.538700103759766,
"perplexity": 93.569091796875,
"step": 12400
},
{
"epoch": 0.0770772074788154,
"grad_norm": 0.09353446960449219,
"learning_rate": 0.0015,
"loss": 2.7768,
"step": 12425
},
{
"epoch": 0.07723229240332005,
"grad_norm": 0.0946316123008728,
"learning_rate": 0.0015,
"loss": 2.7321,
"step": 12450
},
{
"epoch": 0.07738737732782472,
"grad_norm": 0.11109050363302231,
"learning_rate": 0.0015,
"loss": 2.7607,
"step": 12475
},
{
"epoch": 0.07754246225232937,
"grad_norm": 0.10057735443115234,
"learning_rate": 0.0015,
"loss": 2.7707,
"step": 12500
},
{
"epoch": 0.07769754717683404,
"grad_norm": 0.1466909795999527,
"learning_rate": 0.0015,
"loss": 2.7434,
"step": 12525
},
{
"epoch": 0.07785263210133869,
"grad_norm": 0.09831534326076508,
"learning_rate": 0.0015,
"loss": 2.7858,
"step": 12550
},
{
"epoch": 0.07800771702584335,
"grad_norm": 0.13202817738056183,
"learning_rate": 0.0015,
"loss": 2.7884,
"step": 12575
},
{
"epoch": 0.07816280195034801,
"grad_norm": 0.10797799378633499,
"learning_rate": 0.0015,
"loss": 2.7788,
"step": 12600
},
{
"epoch": 0.07816280195034801,
"eval_loss": 4.5452494621276855,
"perplexity": 94.18392181396484,
"step": 12600
},
{
"epoch": 0.07831788687485267,
"grad_norm": 0.10239394754171371,
"learning_rate": 0.0015,
"loss": 2.7803,
"step": 12625
},
{
"epoch": 0.07847297179935733,
"grad_norm": 0.10468672215938568,
"learning_rate": 0.0015,
"loss": 2.7449,
"step": 12650
},
{
"epoch": 0.07862805672386199,
"grad_norm": 0.13691146671772003,
"learning_rate": 0.0015,
"loss": 2.7837,
"step": 12675
},
{
"epoch": 0.07878314164836664,
"grad_norm": 0.16976097226142883,
"learning_rate": 0.0015,
"loss": 2.7557,
"step": 12700
},
{
"epoch": 0.0789382265728713,
"grad_norm": 0.09623986482620239,
"learning_rate": 0.0015,
"loss": 2.7576,
"step": 12725
},
{
"epoch": 0.07909331149737596,
"grad_norm": 0.11203131079673767,
"learning_rate": 0.0015,
"loss": 2.7846,
"step": 12750
},
{
"epoch": 0.07924839642188063,
"grad_norm": 0.12257611751556396,
"learning_rate": 0.0015,
"loss": 2.8015,
"step": 12775
},
{
"epoch": 0.07940348134638528,
"grad_norm": 0.08369628340005875,
"learning_rate": 0.0015,
"loss": 2.7616,
"step": 12800
},
{
"epoch": 0.07940348134638528,
"eval_loss": 4.548933506011963,
"perplexity": 94.53153991699219,
"step": 12800
},
{
"epoch": 0.07955856627088993,
"grad_norm": 0.12149519473314285,
"learning_rate": 0.0015,
"loss": 2.7651,
"step": 12825
},
{
"epoch": 0.0797136511953946,
"grad_norm": 0.09911686927080154,
"learning_rate": 0.0015,
"loss": 2.7964,
"step": 12850
},
{
"epoch": 0.07986873611989925,
"grad_norm": 0.09883631020784378,
"learning_rate": 0.0015,
"loss": 2.7461,
"step": 12875
},
{
"epoch": 0.08002382104440392,
"grad_norm": 0.08828576654195786,
"learning_rate": 0.0015,
"loss": 2.7735,
"step": 12900
},
{
"epoch": 0.08017890596890857,
"grad_norm": 0.18119321763515472,
"learning_rate": 0.0015,
"loss": 2.7863,
"step": 12925
},
{
"epoch": 0.08033399089341323,
"grad_norm": 0.09123501181602478,
"learning_rate": 0.0015,
"loss": 2.7559,
"step": 12950
},
{
"epoch": 0.0804890758179179,
"grad_norm": 0.18334759771823883,
"learning_rate": 0.0015,
"loss": 2.7357,
"step": 12975
},
{
"epoch": 0.08064416074242255,
"grad_norm": 0.08934136480093002,
"learning_rate": 0.0015,
"loss": 2.8003,
"step": 13000
},
{
"epoch": 0.08064416074242255,
"eval_loss": 4.537932395935059,
"perplexity": 93.49728393554688,
"step": 13000
},
{
"epoch": 0.08079924566692721,
"grad_norm": 0.117793008685112,
"learning_rate": 0.0015,
"loss": 2.738,
"step": 13025
},
{
"epoch": 0.08095433059143187,
"grad_norm": 0.1012151837348938,
"learning_rate": 0.0015,
"loss": 2.767,
"step": 13050
},
{
"epoch": 0.08110941551593653,
"grad_norm": 0.1099851131439209,
"learning_rate": 0.0015,
"loss": 2.7899,
"step": 13075
},
{
"epoch": 0.08126450044044119,
"grad_norm": 0.105575330555439,
"learning_rate": 0.0015,
"loss": 2.7857,
"step": 13100
},
{
"epoch": 0.08141958536494584,
"grad_norm": 0.11926279962062836,
"learning_rate": 0.0015,
"loss": 2.7821,
"step": 13125
},
{
"epoch": 0.08157467028945051,
"grad_norm": 0.1669924259185791,
"learning_rate": 0.0015,
"loss": 2.7673,
"step": 13150
},
{
"epoch": 0.08172975521395516,
"grad_norm": 0.11445988714694977,
"learning_rate": 0.0015,
"loss": 2.8081,
"step": 13175
},
{
"epoch": 0.08188484013845983,
"grad_norm": 0.09700124710798264,
"learning_rate": 0.0015,
"loss": 2.7841,
"step": 13200
},
{
"epoch": 0.08188484013845983,
"eval_loss": 4.540359973907471,
"perplexity": 93.72453308105469,
"step": 13200
},
{
"epoch": 0.08203992506296448,
"grad_norm": 0.11112058907747269,
"learning_rate": 0.0015,
"loss": 2.7471,
"step": 13225
},
{
"epoch": 0.08219500998746913,
"grad_norm": 0.17890195548534393,
"learning_rate": 0.0015,
"loss": 2.7898,
"step": 13250
},
{
"epoch": 0.0823500949119738,
"grad_norm": 0.12197751551866531,
"learning_rate": 0.0015,
"loss": 2.7328,
"step": 13275
},
{
"epoch": 0.08250517983647845,
"grad_norm": 0.11677111685276031,
"learning_rate": 0.0015,
"loss": 2.7849,
"step": 13300
},
{
"epoch": 0.08266026476098312,
"grad_norm": 0.15514017641544342,
"learning_rate": 0.0015,
"loss": 2.7561,
"step": 13325
},
{
"epoch": 0.08281534968548777,
"grad_norm": 0.10389192402362823,
"learning_rate": 0.0015,
"loss": 2.7611,
"step": 13350
},
{
"epoch": 0.08297043460999243,
"grad_norm": 0.10176412016153336,
"learning_rate": 0.0015,
"loss": 2.7793,
"step": 13375
},
{
"epoch": 0.0831255195344971,
"grad_norm": 0.1043052077293396,
"learning_rate": 0.0015,
"loss": 2.7375,
"step": 13400
},
{
"epoch": 0.0831255195344971,
"eval_loss": 4.5388336181640625,
"perplexity": 93.58158111572266,
"step": 13400
},
{
"epoch": 0.08328060445900175,
"grad_norm": 0.08918718248605728,
"learning_rate": 0.0015,
"loss": 2.7465,
"step": 13425
},
{
"epoch": 0.08343568938350641,
"grad_norm": 0.10008233785629272,
"learning_rate": 0.0015,
"loss": 2.7776,
"step": 13450
},
{
"epoch": 0.08359077430801107,
"grad_norm": 0.10228800773620605,
"learning_rate": 0.0015,
"loss": 2.756,
"step": 13475
},
{
"epoch": 0.08374585923251572,
"grad_norm": 0.0868915542960167,
"learning_rate": 0.0015,
"loss": 2.7556,
"step": 13500
},
{
"epoch": 0.08390094415702039,
"grad_norm": 0.11076166480779648,
"learning_rate": 0.0015,
"loss": 2.6975,
"step": 13525
},
{
"epoch": 0.08405602908152504,
"grad_norm": 0.13617128133773804,
"learning_rate": 0.0015,
"loss": 2.7643,
"step": 13550
},
{
"epoch": 0.08421111400602971,
"grad_norm": 0.15346932411193848,
"learning_rate": 0.0015,
"loss": 2.7966,
"step": 13575
},
{
"epoch": 0.08436619893053436,
"grad_norm": 0.17080894112586975,
"learning_rate": 0.0015,
"loss": 2.7636,
"step": 13600
},
{
"epoch": 0.08436619893053436,
"eval_loss": 4.513378620147705,
"perplexity": 91.22953033447266,
"step": 13600
},
{
"epoch": 0.08452128385503901,
"grad_norm": 0.11548548936843872,
"learning_rate": 0.0015,
"loss": 2.7729,
"step": 13625
},
{
"epoch": 0.08467636877954368,
"grad_norm": 0.14650912582874298,
"learning_rate": 0.0015,
"loss": 2.7063,
"step": 13650
},
{
"epoch": 0.08483145370404833,
"grad_norm": 0.09750749915838242,
"learning_rate": 0.0015,
"loss": 2.7648,
"step": 13675
},
{
"epoch": 0.084986538628553,
"grad_norm": 0.18051239848136902,
"learning_rate": 0.0015,
"loss": 2.754,
"step": 13700
},
{
"epoch": 0.08514162355305765,
"grad_norm": 0.21637938916683197,
"learning_rate": 0.0015,
"loss": 2.7529,
"step": 13725
},
{
"epoch": 0.08529670847756231,
"grad_norm": 0.10037226974964142,
"learning_rate": 0.0015,
"loss": 2.7638,
"step": 13750
},
{
"epoch": 0.08545179340206698,
"grad_norm": 0.1033267229795456,
"learning_rate": 0.0015,
"loss": 2.7713,
"step": 13775
},
{
"epoch": 0.08560687832657163,
"grad_norm": 0.09179462492465973,
"learning_rate": 0.0015,
"loss": 2.8278,
"step": 13800
},
{
"epoch": 0.08560687832657163,
"eval_loss": 4.508410453796387,
"perplexity": 90.77741241455078,
"step": 13800
},
{
"epoch": 0.0857619632510763,
"grad_norm": 0.09874552488327026,
"learning_rate": 0.0015,
"loss": 2.7544,
"step": 13825
},
{
"epoch": 0.08591704817558095,
"grad_norm": 0.17807777225971222,
"learning_rate": 0.0015,
"loss": 2.7401,
"step": 13850
},
{
"epoch": 0.0860721331000856,
"grad_norm": 0.14388497173786163,
"learning_rate": 0.0015,
"loss": 2.7879,
"step": 13875
},
{
"epoch": 0.08622721802459027,
"grad_norm": 0.13081450760364532,
"learning_rate": 0.0015,
"loss": 2.7162,
"step": 13900
},
{
"epoch": 0.08638230294909492,
"grad_norm": 0.15077342092990875,
"learning_rate": 0.0015,
"loss": 2.757,
"step": 13925
},
{
"epoch": 0.08653738787359959,
"grad_norm": 0.11368410289287567,
"learning_rate": 0.0015,
"loss": 2.7546,
"step": 13950
},
{
"epoch": 0.08669247279810424,
"grad_norm": 0.16447153687477112,
"learning_rate": 0.0015,
"loss": 2.7371,
"step": 13975
},
{
"epoch": 0.0868475577226089,
"grad_norm": 0.20563559234142303,
"learning_rate": 0.0015,
"loss": 2.7474,
"step": 14000
},
{
"epoch": 0.0868475577226089,
"eval_loss": 4.525671005249023,
"perplexity": 92.35787963867188,
"step": 14000
},
{
"epoch": 0.08700264264711356,
"grad_norm": 0.10695035755634308,
"learning_rate": 0.0015,
"loss": 2.7565,
"step": 14025
},
{
"epoch": 0.08715772757161822,
"grad_norm": 0.12368099391460419,
"learning_rate": 0.0015,
"loss": 2.784,
"step": 14050
},
{
"epoch": 0.08731281249612288,
"grad_norm": 0.11491699516773224,
"learning_rate": 0.0015,
"loss": 2.7477,
"step": 14075
},
{
"epoch": 0.08746789742062754,
"grad_norm": 0.10570378601551056,
"learning_rate": 0.0015,
"loss": 2.7575,
"step": 14100
},
{
"epoch": 0.08762298234513219,
"grad_norm": 0.09137633442878723,
"learning_rate": 0.0015,
"loss": 2.7517,
"step": 14125
},
{
"epoch": 0.08777806726963686,
"grad_norm": 0.09999803453683853,
"learning_rate": 0.0015,
"loss": 2.7446,
"step": 14150
},
{
"epoch": 0.08793315219414151,
"grad_norm": 0.15709616243839264,
"learning_rate": 0.0015,
"loss": 2.7606,
"step": 14175
},
{
"epoch": 0.08808823711864618,
"grad_norm": 0.10327859222888947,
"learning_rate": 0.0015,
"loss": 2.7441,
"step": 14200
},
{
"epoch": 0.08808823711864618,
"eval_loss": 4.521189212799072,
"perplexity": 91.94487762451172,
"step": 14200
},
{
"epoch": 0.08824332204315083,
"grad_norm": 0.1964125633239746,
"learning_rate": 0.0015,
"loss": 2.7109,
"step": 14225
},
{
"epoch": 0.08839840696765548,
"grad_norm": 0.12792247533798218,
"learning_rate": 0.0015,
"loss": 2.7401,
"step": 14250
},
{
"epoch": 0.08855349189216015,
"grad_norm": 0.17532923817634583,
"learning_rate": 0.0015,
"loss": 2.7609,
"step": 14275
},
{
"epoch": 0.0887085768166648,
"grad_norm": 0.096143439412117,
"learning_rate": 0.0015,
"loss": 2.7749,
"step": 14300
},
{
"epoch": 0.08886366174116947,
"grad_norm": 0.12778601050376892,
"learning_rate": 0.0015,
"loss": 2.6981,
"step": 14325
},
{
"epoch": 0.08901874666567412,
"grad_norm": 0.1130848377943039,
"learning_rate": 0.0015,
"loss": 2.7255,
"step": 14350
},
{
"epoch": 0.08917383159017878,
"grad_norm": 0.0818464607000351,
"learning_rate": 0.0015,
"loss": 2.7223,
"step": 14375
},
{
"epoch": 0.08932891651468344,
"grad_norm": 0.10516222566366196,
"learning_rate": 0.0015,
"loss": 2.7672,
"step": 14400
},
{
"epoch": 0.08932891651468344,
"eval_loss": 4.524067401885986,
"perplexity": 92.20989227294922,
"step": 14400
},
{
"epoch": 0.0894840014391881,
"grad_norm": 0.08912840485572815,
"learning_rate": 0.0015,
"loss": 2.7349,
"step": 14425
},
{
"epoch": 0.08963908636369276,
"grad_norm": 0.11931388080120087,
"learning_rate": 0.0015,
"loss": 2.7326,
"step": 14450
},
{
"epoch": 0.08979417128819742,
"grad_norm": 0.12271756678819656,
"learning_rate": 0.0015,
"loss": 2.7327,
"step": 14475
},
{
"epoch": 0.08994925621270207,
"grad_norm": 0.1567191183567047,
"learning_rate": 0.0015,
"loss": 2.7573,
"step": 14500
},
{
"epoch": 0.09010434113720674,
"grad_norm": 0.1841791719198227,
"learning_rate": 0.0015,
"loss": 2.7582,
"step": 14525
},
{
"epoch": 0.09025942606171139,
"grad_norm": 0.12743189930915833,
"learning_rate": 0.0015,
"loss": 2.8061,
"step": 14550
},
{
"epoch": 0.09041451098621606,
"grad_norm": 0.11932828277349472,
"learning_rate": 0.0015,
"loss": 2.7447,
"step": 14575
},
{
"epoch": 0.09056959591072071,
"grad_norm": 0.18284690380096436,
"learning_rate": 0.0015,
"loss": 2.7436,
"step": 14600
},
{
"epoch": 0.09056959591072071,
"eval_loss": 4.515897750854492,
"perplexity": 91.45964050292969,
"step": 14600
},
{
"epoch": 0.09072468083522536,
"grad_norm": 0.17987670004367828,
"learning_rate": 0.0015,
"loss": 2.7831,
"step": 14625
},
{
"epoch": 0.09087976575973003,
"grad_norm": 0.10992395132780075,
"learning_rate": 0.0015,
"loss": 2.7516,
"step": 14650
},
{
"epoch": 0.09103485068423468,
"grad_norm": 0.09343726187944412,
"learning_rate": 0.0015,
"loss": 2.7475,
"step": 14675
},
{
"epoch": 0.09118993560873935,
"grad_norm": 0.10370751470327377,
"learning_rate": 0.0015,
"loss": 2.7518,
"step": 14700
},
{
"epoch": 0.091345020533244,
"grad_norm": 0.11190348863601685,
"learning_rate": 0.0015,
"loss": 2.7482,
"step": 14725
},
{
"epoch": 0.09150010545774866,
"grad_norm": 0.12450053542852402,
"learning_rate": 0.0015,
"loss": 2.7726,
"step": 14750
},
{
"epoch": 0.09165519038225332,
"grad_norm": 0.11882703006267548,
"learning_rate": 0.0015,
"loss": 2.7318,
"step": 14775
},
{
"epoch": 0.09181027530675798,
"grad_norm": 0.1315181404352188,
"learning_rate": 0.0015,
"loss": 2.757,
"step": 14800
},
{
"epoch": 0.09181027530675798,
"eval_loss": 4.521557807922363,
"perplexity": 91.97877502441406,
"step": 14800
},
{
"epoch": 0.09196536023126264,
"grad_norm": 0.18574784696102142,
"learning_rate": 0.0015,
"loss": 2.7353,
"step": 14825
},
{
"epoch": 0.0921204451557673,
"grad_norm": 0.17665444314479828,
"learning_rate": 0.0015,
"loss": 2.7687,
"step": 14850
},
{
"epoch": 0.09227553008027195,
"grad_norm": 0.12507860362529755,
"learning_rate": 0.0015,
"loss": 2.7386,
"step": 14875
},
{
"epoch": 0.09243061500477662,
"grad_norm": 0.10472691059112549,
"learning_rate": 0.0015,
"loss": 2.7716,
"step": 14900
},
{
"epoch": 0.09258569992928127,
"grad_norm": 0.10282575339078903,
"learning_rate": 0.0015,
"loss": 2.7312,
"step": 14925
},
{
"epoch": 0.09274078485378594,
"grad_norm": 0.12706094980239868,
"learning_rate": 0.0015,
"loss": 2.7995,
"step": 14950
},
{
"epoch": 0.09289586977829059,
"grad_norm": 0.15283973515033722,
"learning_rate": 0.0015,
"loss": 2.7313,
"step": 14975
},
{
"epoch": 0.09305095470279524,
"grad_norm": 0.12476324290037155,
"learning_rate": 0.0015,
"loss": 2.7727,
"step": 15000
},
{
"epoch": 0.09305095470279524,
"eval_loss": 4.547565937042236,
"perplexity": 94.40234375,
"step": 15000
},
{
"epoch": 0.09320603962729991,
"grad_norm": 0.12369734048843384,
"learning_rate": 0.0015,
"loss": 2.7565,
"step": 15025
},
{
"epoch": 0.09336112455180456,
"grad_norm": 0.1322038471698761,
"learning_rate": 0.0015,
"loss": 2.7588,
"step": 15050
},
{
"epoch": 0.09351620947630923,
"grad_norm": 0.0926559790968895,
"learning_rate": 0.0015,
"loss": 2.7393,
"step": 15075
},
{
"epoch": 0.09367129440081388,
"grad_norm": 0.17404210567474365,
"learning_rate": 0.0015,
"loss": 2.723,
"step": 15100
},
{
"epoch": 0.09382637932531855,
"grad_norm": 0.10326647758483887,
"learning_rate": 0.0015,
"loss": 2.7853,
"step": 15125
},
{
"epoch": 0.0939814642498232,
"grad_norm": 0.13869203627109528,
"learning_rate": 0.0015,
"loss": 2.7535,
"step": 15150
},
{
"epoch": 0.09413654917432786,
"grad_norm": 0.14325955510139465,
"learning_rate": 0.0015,
"loss": 2.7597,
"step": 15175
},
{
"epoch": 0.09429163409883252,
"grad_norm": 0.11783768236637115,
"learning_rate": 0.0015,
"loss": 2.7524,
"step": 15200
},
{
"epoch": 0.09429163409883252,
"eval_loss": 4.5251593589782715,
"perplexity": 92.31063842773438,
"step": 15200
},
{
"epoch": 0.09444671902333718,
"grad_norm": 0.12261676043272018,
"learning_rate": 0.0015,
"loss": 2.7279,
"step": 15225
},
{
"epoch": 0.09460180394784184,
"grad_norm": 0.09966279566287994,
"learning_rate": 0.0015,
"loss": 2.8119,
"step": 15250
},
{
"epoch": 0.0947568888723465,
"grad_norm": 0.1052974984049797,
"learning_rate": 0.0015,
"loss": 2.7392,
"step": 15275
},
{
"epoch": 0.09491197379685115,
"grad_norm": 0.11074663698673248,
"learning_rate": 0.0015,
"loss": 2.7319,
"step": 15300
},
{
"epoch": 0.09506705872135582,
"grad_norm": 0.09762706607580185,
"learning_rate": 0.0015,
"loss": 2.7806,
"step": 15325
},
{
"epoch": 0.09522214364586047,
"grad_norm": 0.08552476018667221,
"learning_rate": 0.0015,
"loss": 2.7351,
"step": 15350
},
{
"epoch": 0.09537722857036514,
"grad_norm": 0.13211695849895477,
"learning_rate": 0.0015,
"loss": 2.7667,
"step": 15375
},
{
"epoch": 0.09553231349486979,
"grad_norm": 0.12074939906597137,
"learning_rate": 0.0015,
"loss": 2.7614,
"step": 15400
},
{
"epoch": 0.09553231349486979,
"eval_loss": 4.53213357925415,
"perplexity": 92.95668029785156,
"step": 15400
},
{
"epoch": 0.09568739841937444,
"grad_norm": 0.11755666136741638,
"learning_rate": 0.0015,
"loss": 2.7101,
"step": 15425
},
{
"epoch": 0.09584248334387911,
"grad_norm": 0.10476246476173401,
"learning_rate": 0.0015,
"loss": 2.7391,
"step": 15450
},
{
"epoch": 0.09599756826838376,
"grad_norm": 0.10921350121498108,
"learning_rate": 0.0015,
"loss": 2.7423,
"step": 15475
},
{
"epoch": 0.09615265319288843,
"grad_norm": 0.11517275124788284,
"learning_rate": 0.0015,
"loss": 2.7374,
"step": 15500
},
{
"epoch": 0.09630773811739309,
"grad_norm": 0.10500945895910263,
"learning_rate": 0.0015,
"loss": 2.73,
"step": 15525
},
{
"epoch": 0.09646282304189774,
"grad_norm": 0.0962584912776947,
"learning_rate": 0.0015,
"loss": 2.7597,
"step": 15550
},
{
"epoch": 0.0966179079664024,
"grad_norm": 0.1273050308227539,
"learning_rate": 0.0015,
"loss": 2.7306,
"step": 15575
},
{
"epoch": 0.09677299289090706,
"grad_norm": 0.11249135434627533,
"learning_rate": 0.0015,
"loss": 2.7859,
"step": 15600
},
{
"epoch": 0.09677299289090706,
"eval_loss": 4.537318706512451,
"perplexity": 93.43992614746094,
"step": 15600
},
{
"epoch": 0.09692807781541173,
"grad_norm": 0.19111056625843048,
"learning_rate": 0.0015,
"loss": 2.7386,
"step": 15625
},
{
"epoch": 0.09708316273991638,
"grad_norm": 0.10486472398042679,
"learning_rate": 0.0015,
"loss": 2.7462,
"step": 15650
},
{
"epoch": 0.09723824766442103,
"grad_norm": 0.1453208327293396,
"learning_rate": 0.0015,
"loss": 2.762,
"step": 15675
},
{
"epoch": 0.0973933325889257,
"grad_norm": 0.08459452539682388,
"learning_rate": 0.0015,
"loss": 2.7353,
"step": 15700
},
{
"epoch": 0.09754841751343035,
"grad_norm": 0.11150529980659485,
"learning_rate": 0.0015,
"loss": 2.7617,
"step": 15725
},
{
"epoch": 0.09770350243793502,
"grad_norm": 0.11301703006029129,
"learning_rate": 0.0015,
"loss": 2.7623,
"step": 15750
},
{
"epoch": 0.09785858736243967,
"grad_norm": 0.16564789414405823,
"learning_rate": 0.0015,
"loss": 2.7315,
"step": 15775
},
{
"epoch": 0.09801367228694433,
"grad_norm": 0.08968822658061981,
"learning_rate": 0.0015,
"loss": 2.7842,
"step": 15800
},
{
"epoch": 0.09801367228694433,
"eval_loss": 4.528219223022461,
"perplexity": 92.5935287475586,
"step": 15800
},
{
"epoch": 0.09816875721144899,
"grad_norm": 0.1233256533741951,
"learning_rate": 0.0015,
"loss": 2.7584,
"step": 15825
},
{
"epoch": 0.09832384213595365,
"grad_norm": 0.18926863372325897,
"learning_rate": 0.0015,
"loss": 2.7651,
"step": 15850
},
{
"epoch": 0.09847892706045831,
"grad_norm": 0.0912550836801529,
"learning_rate": 0.0015,
"loss": 2.7551,
"step": 15875
},
{
"epoch": 0.09863401198496297,
"grad_norm": 0.1443813592195511,
"learning_rate": 0.0015,
"loss": 2.7378,
"step": 15900
},
{
"epoch": 0.09878909690946762,
"grad_norm": 0.11620072275400162,
"learning_rate": 0.0015,
"loss": 2.7706,
"step": 15925
},
{
"epoch": 0.09894418183397229,
"grad_norm": 0.10275860130786896,
"learning_rate": 0.0015,
"loss": 2.7502,
"step": 15950
},
{
"epoch": 0.09909926675847694,
"grad_norm": 0.1417694240808487,
"learning_rate": 0.0015,
"loss": 2.706,
"step": 15975
},
{
"epoch": 0.0992543516829816,
"grad_norm": 0.1121877133846283,
"learning_rate": 0.0015,
"loss": 2.7537,
"step": 16000
},
{
"epoch": 0.0992543516829816,
"eval_loss": 4.520648956298828,
"perplexity": 91.89521789550781,
"step": 16000
},
{
"epoch": 0.09940943660748626,
"grad_norm": 0.10022582858800888,
"learning_rate": 0.0015,
"loss": 2.7213,
"step": 16025
},
{
"epoch": 0.09956452153199091,
"grad_norm": 0.09722616523504257,
"learning_rate": 0.0015,
"loss": 2.7437,
"step": 16050
},
{
"epoch": 0.09971960645649558,
"grad_norm": 0.11053729802370071,
"learning_rate": 0.0015,
"loss": 2.7495,
"step": 16075
},
{
"epoch": 0.09987469138100023,
"grad_norm": 0.10231011360883713,
"learning_rate": 0.0015,
"loss": 2.7505,
"step": 16100
},
{
"epoch": 0.1000297763055049,
"grad_norm": 0.135975643992424,
"learning_rate": 0.0015,
"loss": 2.7487,
"step": 16125
},
{
"epoch": 0.10018486123000955,
"grad_norm": 0.11350739002227783,
"learning_rate": 0.0015,
"loss": 2.7484,
"step": 16150
},
{
"epoch": 0.1003399461545142,
"grad_norm": 0.10639143735170364,
"learning_rate": 0.0015,
"loss": 2.7429,
"step": 16175
},
{
"epoch": 0.10049503107901887,
"grad_norm": 0.09016221761703491,
"learning_rate": 0.0015,
"loss": 2.7891,
"step": 16200
},
{
"epoch": 0.10049503107901887,
"eval_loss": 4.5112504959106445,
"perplexity": 91.03558349609375,
"step": 16200
},
{
"epoch": 0.10065011600352353,
"grad_norm": 0.11324500292539597,
"learning_rate": 0.0015,
"loss": 2.7678,
"step": 16225
},
{
"epoch": 0.1008052009280282,
"grad_norm": 0.13268886506557465,
"learning_rate": 0.0015,
"loss": 2.723,
"step": 16250
},
{
"epoch": 0.10096028585253285,
"grad_norm": 0.11448831856250763,
"learning_rate": 0.0015,
"loss": 2.7328,
"step": 16275
},
{
"epoch": 0.1011153707770375,
"grad_norm": 0.10799309611320496,
"learning_rate": 0.0015,
"loss": 2.7478,
"step": 16300
},
{
"epoch": 0.10127045570154217,
"grad_norm": 0.19559204578399658,
"learning_rate": 0.0015,
"loss": 2.7606,
"step": 16325
},
{
"epoch": 0.10142554062604682,
"grad_norm": 0.14151975512504578,
"learning_rate": 0.0015,
"loss": 2.7279,
"step": 16350
},
{
"epoch": 0.10158062555055149,
"grad_norm": 0.10044725239276886,
"learning_rate": 0.0015,
"loss": 2.7609,
"step": 16375
},
{
"epoch": 0.10173571047505614,
"grad_norm": 0.10686340183019638,
"learning_rate": 0.0015,
"loss": 2.7295,
"step": 16400
},
{
"epoch": 0.10173571047505614,
"eval_loss": 4.521287441253662,
"perplexity": 91.95391082763672,
"step": 16400
},
{
"epoch": 0.1018907953995608,
"grad_norm": 0.1561044305562973,
"learning_rate": 0.0015,
"loss": 2.7769,
"step": 16425
},
{
"epoch": 0.10204588032406546,
"grad_norm": 0.12182148545980453,
"learning_rate": 0.0015,
"loss": 2.757,
"step": 16450
},
{
"epoch": 0.10220096524857011,
"grad_norm": 0.20665724575519562,
"learning_rate": 0.0015,
"loss": 2.7349,
"step": 16475
},
{
"epoch": 0.10235605017307478,
"grad_norm": 0.09160878509283066,
"learning_rate": 0.0015,
"loss": 2.7393,
"step": 16500
},
{
"epoch": 0.10251113509757943,
"grad_norm": 0.16651533544063568,
"learning_rate": 0.0015,
"loss": 2.7441,
"step": 16525
},
{
"epoch": 0.10266622002208409,
"grad_norm": 0.09358719736337662,
"learning_rate": 0.0015,
"loss": 2.7297,
"step": 16550
},
{
"epoch": 0.10282130494658875,
"grad_norm": 0.20277003943920135,
"learning_rate": 0.0015,
"loss": 2.7506,
"step": 16575
},
{
"epoch": 0.10297638987109341,
"grad_norm": 0.13382607698440552,
"learning_rate": 0.0015,
"loss": 2.7924,
"step": 16600
},
{
"epoch": 0.10297638987109341,
"eval_loss": 4.525242328643799,
"perplexity": 92.31829833984375,
"step": 16600
},
{
"epoch": 0.10313147479559807,
"grad_norm": 0.09686290472745895,
"learning_rate": 0.0015,
"loss": 2.7417,
"step": 16625
},
{
"epoch": 0.10328655972010273,
"grad_norm": 0.11446567624807358,
"learning_rate": 0.0015,
"loss": 2.7582,
"step": 16650
},
{
"epoch": 0.10344164464460738,
"grad_norm": 0.15948985517024994,
"learning_rate": 0.0015,
"loss": 2.7254,
"step": 16675
},
{
"epoch": 0.10359672956911205,
"grad_norm": 0.1254827231168747,
"learning_rate": 0.0015,
"loss": 2.7515,
"step": 16700
},
{
"epoch": 0.1037518144936167,
"grad_norm": 0.11295375972986221,
"learning_rate": 0.0015,
"loss": 2.7058,
"step": 16725
},
{
"epoch": 0.10390689941812137,
"grad_norm": 0.10659389197826385,
"learning_rate": 0.0015,
"loss": 2.7281,
"step": 16750
},
{
"epoch": 0.10406198434262602,
"grad_norm": 0.1045156791806221,
"learning_rate": 0.0015,
"loss": 2.7131,
"step": 16775
},
{
"epoch": 0.10421706926713067,
"grad_norm": 0.13835974037647247,
"learning_rate": 0.0015,
"loss": 2.744,
"step": 16800
},
{
"epoch": 0.10421706926713067,
"eval_loss": 4.507747650146484,
"perplexity": 90.7172622680664,
"step": 16800
},
{
"epoch": 0.10437215419163534,
"grad_norm": 0.19872727990150452,
"learning_rate": 0.0015,
"loss": 2.7642,
"step": 16825
},
{
"epoch": 0.10452723911614,
"grad_norm": 0.13754956424236298,
"learning_rate": 0.0015,
"loss": 2.7652,
"step": 16850
},
{
"epoch": 0.10468232404064466,
"grad_norm": 0.1451335996389389,
"learning_rate": 0.0015,
"loss": 2.7561,
"step": 16875
},
{
"epoch": 0.10483740896514931,
"grad_norm": 0.16750144958496094,
"learning_rate": 0.0015,
"loss": 2.7206,
"step": 16900
},
{
"epoch": 0.10499249388965397,
"grad_norm": 0.12020619958639145,
"learning_rate": 0.0015,
"loss": 2.699,
"step": 16925
},
{
"epoch": 0.10514757881415863,
"grad_norm": 0.16792155802249908,
"learning_rate": 0.0015,
"loss": 2.8062,
"step": 16950
},
{
"epoch": 0.10530266373866329,
"grad_norm": 0.11066465824842453,
"learning_rate": 0.0015,
"loss": 2.6968,
"step": 16975
},
{
"epoch": 0.10545774866316796,
"grad_norm": 0.11885298788547516,
"learning_rate": 0.0015,
"loss": 2.7699,
"step": 17000
},
{
"epoch": 0.10545774866316796,
"eval_loss": 4.524214744567871,
"perplexity": 92.22348022460938,
"step": 17000
},
{
"epoch": 0.10561283358767261,
"grad_norm": 0.1298653483390808,
"learning_rate": 0.0015,
"loss": 2.7199,
"step": 17025
},
{
"epoch": 0.10576791851217726,
"grad_norm": 0.11387672275304794,
"learning_rate": 0.0015,
"loss": 2.7528,
"step": 17050
},
{
"epoch": 0.10592300343668193,
"grad_norm": 0.09852533042430878,
"learning_rate": 0.0015,
"loss": 2.7277,
"step": 17075
},
{
"epoch": 0.10607808836118658,
"grad_norm": 0.11046476662158966,
"learning_rate": 0.0015,
"loss": 2.722,
"step": 17100
},
{
"epoch": 0.10623317328569125,
"grad_norm": 0.11632421612739563,
"learning_rate": 0.0015,
"loss": 2.726,
"step": 17125
},
{
"epoch": 0.1063882582101959,
"grad_norm": 0.11760540306568146,
"learning_rate": 0.0015,
"loss": 2.7267,
"step": 17150
},
{
"epoch": 0.10654334313470057,
"grad_norm": 0.12264183163642883,
"learning_rate": 0.0015,
"loss": 2.8037,
"step": 17175
},
{
"epoch": 0.10669842805920522,
"grad_norm": 0.15346336364746094,
"learning_rate": 0.0015,
"loss": 2.7668,
"step": 17200
},
{
"epoch": 0.10669842805920522,
"eval_loss": 4.503612995147705,
"perplexity": 90.34294891357422,
"step": 17200
},
{
"epoch": 0.10685351298370988,
"grad_norm": 0.10642746090888977,
"learning_rate": 0.0015,
"loss": 2.7295,
"step": 17225
},
{
"epoch": 0.10700859790821454,
"grad_norm": 0.10965430736541748,
"learning_rate": 0.0015,
"loss": 2.7113,
"step": 17250
},
{
"epoch": 0.1071636828327192,
"grad_norm": 0.09912869334220886,
"learning_rate": 0.0015,
"loss": 2.7353,
"step": 17275
},
{
"epoch": 0.10731876775722386,
"grad_norm": 0.14111942052841187,
"learning_rate": 0.0015,
"loss": 2.7064,
"step": 17300
},
{
"epoch": 0.10747385268172852,
"grad_norm": 0.11583065241575241,
"learning_rate": 0.0015,
"loss": 2.722,
"step": 17325
},
{
"epoch": 0.10762893760623317,
"grad_norm": 0.09374859184026718,
"learning_rate": 0.0015,
"loss": 2.6964,
"step": 17350
},
{
"epoch": 0.10778402253073784,
"grad_norm": 0.11704573035240173,
"learning_rate": 0.0015,
"loss": 2.7518,
"step": 17375
},
{
"epoch": 0.10793910745524249,
"grad_norm": 0.13960668444633484,
"learning_rate": 0.0015,
"loss": 2.7373,
"step": 17400
},
{
"epoch": 0.10793910745524249,
"eval_loss": 4.514464378356934,
"perplexity": 91.3286361694336,
"step": 17400
},
{
"epoch": 0.10809419237974716,
"grad_norm": 0.1006089448928833,
"learning_rate": 0.0015,
"loss": 2.7199,
"step": 17425
},
{
"epoch": 0.10824927730425181,
"grad_norm": 0.14851173758506775,
"learning_rate": 0.0015,
"loss": 2.7202,
"step": 17450
},
{
"epoch": 0.10840436222875646,
"grad_norm": 0.11992091685533524,
"learning_rate": 0.0015,
"loss": 2.6932,
"step": 17475
},
{
"epoch": 0.10855944715326113,
"grad_norm": 0.12420158833265305,
"learning_rate": 0.0015,
"loss": 2.7395,
"step": 17500
},
{
"epoch": 0.10871453207776578,
"grad_norm": 0.09945713728666306,
"learning_rate": 0.0015,
"loss": 2.7323,
"step": 17525
},
{
"epoch": 0.10886961700227045,
"grad_norm": 0.13007710874080658,
"learning_rate": 0.0015,
"loss": 2.7438,
"step": 17550
},
{
"epoch": 0.1090247019267751,
"grad_norm": 0.10875315964221954,
"learning_rate": 0.0015,
"loss": 2.7656,
"step": 17575
},
{
"epoch": 0.10917978685127976,
"grad_norm": 0.1075393334031105,
"learning_rate": 0.0015,
"loss": 2.7174,
"step": 17600
},
{
"epoch": 0.10917978685127976,
"eval_loss": 4.4858293533325195,
"perplexity": 88.75052642822266,
"step": 17600
},
{
"epoch": 0.10933487177578442,
"grad_norm": 0.16400013864040375,
"learning_rate": 0.0015,
"loss": 2.7389,
"step": 17625
},
{
"epoch": 0.10948995670028908,
"grad_norm": 0.1368722766637802,
"learning_rate": 0.0015,
"loss": 2.7198,
"step": 17650
},
{
"epoch": 0.10964504162479374,
"grad_norm": 0.23104597628116608,
"learning_rate": 0.0015,
"loss": 2.7346,
"step": 17675
},
{
"epoch": 0.1098001265492984,
"grad_norm": 0.12463794648647308,
"learning_rate": 0.0015,
"loss": 2.691,
"step": 17700
},
{
"epoch": 0.10995521147380305,
"grad_norm": 0.19538962841033936,
"learning_rate": 0.0015,
"loss": 2.6917,
"step": 17725
},
{
"epoch": 0.11011029639830772,
"grad_norm": 0.12000603973865509,
"learning_rate": 0.0015,
"loss": 2.7431,
"step": 17750
},
{
"epoch": 0.11026538132281237,
"grad_norm": 0.15090298652648926,
"learning_rate": 0.0015,
"loss": 2.7493,
"step": 17775
},
{
"epoch": 0.11042046624731704,
"grad_norm": 0.13190440833568573,
"learning_rate": 0.0015,
"loss": 2.7582,
"step": 17800
},
{
"epoch": 0.11042046624731704,
"eval_loss": 4.493134021759033,
"perplexity": 89.40119171142578,
"step": 17800
},
{
"epoch": 0.11057555117182169,
"grad_norm": 0.12455850094556808,
"learning_rate": 0.0015,
"loss": 2.7574,
"step": 17825
},
{
"epoch": 0.11073063609632634,
"grad_norm": 0.14911110699176788,
"learning_rate": 0.0015,
"loss": 2.7285,
"step": 17850
},
{
"epoch": 0.11088572102083101,
"grad_norm": 0.16008728742599487,
"learning_rate": 0.0015,
"loss": 2.733,
"step": 17875
},
{
"epoch": 0.11104080594533566,
"grad_norm": 0.1668420433998108,
"learning_rate": 0.0015,
"loss": 2.7259,
"step": 17900
},
{
"epoch": 0.11119589086984033,
"grad_norm": 0.11736566573381424,
"learning_rate": 0.0015,
"loss": 2.7682,
"step": 17925
},
{
"epoch": 0.11135097579434498,
"grad_norm": 0.11538700759410858,
"learning_rate": 0.0015,
"loss": 2.7656,
"step": 17950
},
{
"epoch": 0.11150606071884964,
"grad_norm": 0.09440570324659348,
"learning_rate": 0.0015,
"loss": 2.7517,
"step": 17975
},
{
"epoch": 0.1116611456433543,
"grad_norm": 0.20621652901172638,
"learning_rate": 0.0015,
"loss": 2.7292,
"step": 18000
},
{
"epoch": 0.1116611456433543,
"eval_loss": 4.493429183959961,
"perplexity": 89.42758178710938,
"step": 18000
},
{
"epoch": 0.11181623056785896,
"grad_norm": 0.12027841061353683,
"learning_rate": 0.0015,
"loss": 2.7049,
"step": 18025
},
{
"epoch": 0.11197131549236362,
"grad_norm": 0.08760379254817963,
"learning_rate": 0.0015,
"loss": 2.7291,
"step": 18050
},
{
"epoch": 0.11212640041686828,
"grad_norm": 0.1251729428768158,
"learning_rate": 0.0015,
"loss": 2.7149,
"step": 18075
},
{
"epoch": 0.11228148534137293,
"grad_norm": 0.10340214520692825,
"learning_rate": 0.0015,
"loss": 2.7437,
"step": 18100
},
{
"epoch": 0.1124365702658776,
"grad_norm": 0.10546920448541641,
"learning_rate": 0.0015,
"loss": 2.7656,
"step": 18125
},
{
"epoch": 0.11259165519038225,
"grad_norm": 0.12438227981328964,
"learning_rate": 0.0015,
"loss": 2.7171,
"step": 18150
},
{
"epoch": 0.11274674011488692,
"grad_norm": 0.14557534456253052,
"learning_rate": 0.0015,
"loss": 2.7395,
"step": 18175
},
{
"epoch": 0.11290182503939157,
"grad_norm": 0.13714823126792908,
"learning_rate": 0.0015,
"loss": 2.7066,
"step": 18200
},
{
"epoch": 0.11290182503939157,
"eval_loss": 4.4876604080200195,
"perplexity": 88.9131851196289,
"step": 18200
},
{
"epoch": 0.11305690996389622,
"grad_norm": 0.12662547826766968,
"learning_rate": 0.0015,
"loss": 2.6665,
"step": 18225
},
{
"epoch": 0.11321199488840089,
"grad_norm": 0.10047092288732529,
"learning_rate": 0.0015,
"loss": 2.7332,
"step": 18250
},
{
"epoch": 0.11336707981290554,
"grad_norm": 0.11126455664634705,
"learning_rate": 0.0015,
"loss": 2.7154,
"step": 18275
},
{
"epoch": 0.11352216473741021,
"grad_norm": 0.10023871064186096,
"learning_rate": 0.0015,
"loss": 2.7007,
"step": 18300
},
{
"epoch": 0.11367724966191486,
"grad_norm": 0.11821885406970978,
"learning_rate": 0.0015,
"loss": 2.7081,
"step": 18325
},
{
"epoch": 0.11383233458641952,
"grad_norm": 0.1216677874326706,
"learning_rate": 0.0015,
"loss": 2.74,
"step": 18350
},
{
"epoch": 0.11398741951092418,
"grad_norm": 0.1125161275267601,
"learning_rate": 0.0015,
"loss": 2.733,
"step": 18375
},
{
"epoch": 0.11414250443542884,
"grad_norm": 0.18253153562545776,
"learning_rate": 0.0015,
"loss": 2.7085,
"step": 18400
},
{
"epoch": 0.11414250443542884,
"eval_loss": 4.501376628875732,
"perplexity": 90.1411361694336,
"step": 18400
},
{
"epoch": 0.1142975893599335,
"grad_norm": 0.13288918137550354,
"learning_rate": 0.0015,
"loss": 2.7033,
"step": 18425
},
{
"epoch": 0.11445267428443816,
"grad_norm": 0.1069432720541954,
"learning_rate": 0.0015,
"loss": 2.7063,
"step": 18450
},
{
"epoch": 0.11460775920894281,
"grad_norm": 0.1035354733467102,
"learning_rate": 0.0015,
"loss": 2.7174,
"step": 18475
},
{
"epoch": 0.11476284413344748,
"grad_norm": 0.1121230348944664,
"learning_rate": 0.0015,
"loss": 2.7,
"step": 18500
},
{
"epoch": 0.11491792905795213,
"grad_norm": 0.13324719667434692,
"learning_rate": 0.0015,
"loss": 2.7423,
"step": 18525
},
{
"epoch": 0.1150730139824568,
"grad_norm": 0.0891190841794014,
"learning_rate": 0.0015,
"loss": 2.7418,
"step": 18550
},
{
"epoch": 0.11522809890696145,
"grad_norm": 0.10579492896795273,
"learning_rate": 0.0015,
"loss": 2.7321,
"step": 18575
},
{
"epoch": 0.1153831838314661,
"grad_norm": 0.1010003387928009,
"learning_rate": 0.0015,
"loss": 2.7071,
"step": 18600
},
{
"epoch": 0.1153831838314661,
"eval_loss": 4.508904933929443,
"perplexity": 90.82231140136719,
"step": 18600
},
{
"epoch": 0.11553826875597077,
"grad_norm": 0.1599242389202118,
"learning_rate": 0.0015,
"loss": 2.7222,
"step": 18625
},
{
"epoch": 0.11569335368047542,
"grad_norm": 0.09344537556171417,
"learning_rate": 0.0015,
"loss": 2.7424,
"step": 18650
},
{
"epoch": 0.11584843860498009,
"grad_norm": 0.13959461450576782,
"learning_rate": 0.0015,
"loss": 2.7584,
"step": 18675
},
{
"epoch": 0.11600352352948474,
"grad_norm": 0.11661764234304428,
"learning_rate": 0.0015,
"loss": 2.7363,
"step": 18700
},
{
"epoch": 0.1161586084539894,
"grad_norm": 0.11968798190355301,
"learning_rate": 0.0015,
"loss": 2.7314,
"step": 18725
},
{
"epoch": 0.11631369337849407,
"grad_norm": 0.22232107818126678,
"learning_rate": 0.0015,
"loss": 2.6992,
"step": 18750
},
{
"epoch": 0.11646877830299872,
"grad_norm": 0.1387198567390442,
"learning_rate": 0.0015,
"loss": 2.7001,
"step": 18775
},
{
"epoch": 0.11662386322750339,
"grad_norm": 0.17059509456157684,
"learning_rate": 0.0015,
"loss": 2.7002,
"step": 18800
},
{
"epoch": 0.11662386322750339,
"eval_loss": 4.516000270843506,
"perplexity": 91.4690170288086,
"step": 18800
},
{
"epoch": 0.11677894815200804,
"grad_norm": 0.10877668112516403,
"learning_rate": 0.0015,
"loss": 2.7171,
"step": 18825
},
{
"epoch": 0.11693403307651269,
"grad_norm": 0.11746638268232346,
"learning_rate": 0.0015,
"loss": 2.7006,
"step": 18850
},
{
"epoch": 0.11708911800101736,
"grad_norm": 0.17617632448673248,
"learning_rate": 0.0015,
"loss": 2.7427,
"step": 18875
},
{
"epoch": 0.11724420292552201,
"grad_norm": 0.09788820147514343,
"learning_rate": 0.0015,
"loss": 2.7507,
"step": 18900
},
{
"epoch": 0.11739928785002668,
"grad_norm": 0.1285056471824646,
"learning_rate": 0.0015,
"loss": 2.7386,
"step": 18925
},
{
"epoch": 0.11755437277453133,
"grad_norm": 0.11705992370843887,
"learning_rate": 0.0015,
"loss": 2.7234,
"step": 18950
},
{
"epoch": 0.11770945769903599,
"grad_norm": 0.09166467934846878,
"learning_rate": 0.0015,
"loss": 2.7825,
"step": 18975
},
{
"epoch": 0.11786454262354065,
"grad_norm": 0.11318054795265198,
"learning_rate": 0.0015,
"loss": 2.778,
"step": 19000
},
{
"epoch": 0.11786454262354065,
"eval_loss": 4.499363422393799,
"perplexity": 89.95984649658203,
"step": 19000
}
],
"logging_steps": 25,
"max_steps": 161202,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 60,
"trial_name": null,
"trial_params": null
}