|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11786454262354065,
  "eval_steps": 200,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.00015508492450465875, "grad_norm": 0.12764382362365723, "learning_rate": 0.0015, "loss": 3.062, "step": 25},
    {"epoch": 0.0003101698490093175, "grad_norm": 0.08861421793699265, "learning_rate": 0.0015, "loss": 3.0523, "step": 50},
    {"epoch": 0.00046525477351397625, "grad_norm": 0.10059793293476105, "learning_rate": 0.0015, "loss": 3.0271, "step": 75},
    {"epoch": 0.000620339698018635, "grad_norm": 0.09730365872383118, "learning_rate": 0.0015, "loss": 3.0421, "step": 100},
    {"epoch": 0.0007754246225232938, "grad_norm": 0.15407200157642365, "learning_rate": 0.0015, "loss": 2.9894, "step": 125},
    {"epoch": 0.0009305095470279525, "grad_norm": 0.12250959873199463, "learning_rate": 0.0015, "loss": 3.0055, "step": 150},
    {"epoch": 0.0010855944715326112, "grad_norm": 0.08540652692317963, "learning_rate": 0.0015, "loss": 3.0025, "step": 175},
    {"epoch": 0.00124067939603727, "grad_norm": 0.1479829102754593, "learning_rate": 0.0015, "loss": 2.9881, "step": 200},
    {"epoch": 0.00124067939603727, "eval_loss": 4.852784156799316, "perplexity": 128.09652709960938, "step": 200},
    {"epoch": 0.0013957643205419288, "grad_norm": 0.1036139577627182, "learning_rate": 0.0015, "loss": 2.9609, "step": 225},
    {"epoch": 0.0015508492450465876, "grad_norm": 0.10382606089115143, "learning_rate": 0.0015, "loss": 2.9771, "step": 250},
    {"epoch": 0.0017059341695512462, "grad_norm": 0.08648105710744858, "learning_rate": 0.0015, "loss": 2.9522, "step": 275},
    {"epoch": 0.001861019094055905, "grad_norm": 0.08675844967365265, "learning_rate": 0.0015, "loss": 2.9833, "step": 300},
    {"epoch": 0.0020161040185605636, "grad_norm": 0.1417882740497589, "learning_rate": 0.0015, "loss": 2.9626, "step": 325},
    {"epoch": 0.0021711889430652224, "grad_norm": 0.09860406816005707, "learning_rate": 0.0015, "loss": 2.9515, "step": 350},
    {"epoch": 0.002326273867569881, "grad_norm": 0.11757214367389679, "learning_rate": 0.0015, "loss": 2.9523, "step": 375},
    {"epoch": 0.00248135879207454, "grad_norm": 0.11415340006351471, "learning_rate": 0.0015, "loss": 2.9579, "step": 400},
    {"epoch": 0.00248135879207454, "eval_loss": 4.8426313400268555, "perplexity": 126.80257415771484, "step": 400},
    {"epoch": 0.002636443716579199, "grad_norm": 0.10692940652370453, "learning_rate": 0.0015, "loss": 2.9273, "step": 425},
    {"epoch": 0.0027915286410838576, "grad_norm": 0.12780559062957764, "learning_rate": 0.0015, "loss": 2.9577, "step": 450},
    {"epoch": 0.0029466135655885164, "grad_norm": 0.21147418022155762, "learning_rate": 0.0015, "loss": 2.9118, "step": 475},
    {"epoch": 0.003101698490093175, "grad_norm": 0.13209331035614014, "learning_rate": 0.0015, "loss": 2.9584, "step": 500},
    {"epoch": 0.0032567834145978336, "grad_norm": 0.13230836391448975, "learning_rate": 0.0015, "loss": 2.9621, "step": 525},
    {"epoch": 0.0034118683391024924, "grad_norm": 0.11265246570110321, "learning_rate": 0.0015, "loss": 2.941, "step": 550},
    {"epoch": 0.003566953263607151, "grad_norm": 0.10484226047992706, "learning_rate": 0.0015, "loss": 2.9311, "step": 575},
    {"epoch": 0.00372203818811181, "grad_norm": 0.13941314816474915, "learning_rate": 0.0015, "loss": 2.9741, "step": 600},
    {"epoch": 0.00372203818811181, "eval_loss": 4.831629276275635, "perplexity": 125.41513061523438, "step": 600},
    {"epoch": 0.0038771231126164688, "grad_norm": 0.0885343998670578, "learning_rate": 0.0015, "loss": 2.944, "step": 625},
    {"epoch": 0.004032208037121127, "grad_norm": 0.093564473092556, "learning_rate": 0.0015, "loss": 2.9673, "step": 650},
    {"epoch": 0.004187292961625786, "grad_norm": 0.15350665152072906, "learning_rate": 0.0015, "loss": 2.9314, "step": 675},
    {"epoch": 0.004342377886130445, "grad_norm": 0.11337901651859283, "learning_rate": 0.0015, "loss": 2.97, "step": 700},
    {"epoch": 0.004497462810635104, "grad_norm": 0.13508272171020508, "learning_rate": 0.0015, "loss": 2.9121, "step": 725},
    {"epoch": 0.004652547735139762, "grad_norm": 0.10049441456794739, "learning_rate": 0.0015, "loss": 2.9572, "step": 750},
    {"epoch": 0.004807632659644422, "grad_norm": 0.1017594188451767, "learning_rate": 0.0015, "loss": 2.9207, "step": 775},
    {"epoch": 0.00496271758414908, "grad_norm": 0.09874167293310165, "learning_rate": 0.0015, "loss": 2.9258, "step": 800},
    {"epoch": 0.00496271758414908, "eval_loss": 4.783432960510254, "perplexity": 119.51393127441406, "step": 800},
    {"epoch": 0.005117802508653739, "grad_norm": 0.09769408404827118, "learning_rate": 0.0015, "loss": 2.9606, "step": 825},
    {"epoch": 0.005272887433158398, "grad_norm": 0.11946038156747818, "learning_rate": 0.0015, "loss": 2.889, "step": 850},
    {"epoch": 0.005427972357663056, "grad_norm": 0.12191672623157501, "learning_rate": 0.0015, "loss": 2.9094, "step": 875},
    {"epoch": 0.005583057282167715, "grad_norm": 0.09349209070205688, "learning_rate": 0.0015, "loss": 2.9242, "step": 900},
    {"epoch": 0.0057381422066723736, "grad_norm": 0.07793531566858292, "learning_rate": 0.0015, "loss": 2.9692, "step": 925},
    {"epoch": 0.005893227131177033, "grad_norm": 0.1276599019765854, "learning_rate": 0.0015, "loss": 2.9339, "step": 950},
    {"epoch": 0.006048312055681691, "grad_norm": 0.11083021759986877, "learning_rate": 0.0015, "loss": 2.9251, "step": 975},
    {"epoch": 0.00620339698018635, "grad_norm": 0.13207702338695526, "learning_rate": 0.0015, "loss": 2.8567, "step": 1000},
    {"epoch": 0.00620339698018635, "eval_loss": 4.790068626403809, "perplexity": 120.30962371826172, "step": 1000},
    {"epoch": 0.006358481904691009, "grad_norm": 0.20453479886054993, "learning_rate": 0.0015, "loss": 2.9127, "step": 1025},
    {"epoch": 0.006513566829195667, "grad_norm": 0.12530989944934845, "learning_rate": 0.0015, "loss": 2.9147, "step": 1050},
    {"epoch": 0.006668651753700326, "grad_norm": 0.11520997434854507, "learning_rate": 0.0015, "loss": 2.936, "step": 1075},
    {"epoch": 0.006823736678204985, "grad_norm": 0.09191219508647919, "learning_rate": 0.0015, "loss": 2.9115, "step": 1100},
    {"epoch": 0.006978821602709644, "grad_norm": 0.07251202315092087, "learning_rate": 0.0015, "loss": 2.9154, "step": 1125},
    {"epoch": 0.007133906527214302, "grad_norm": 0.10054546594619751, "learning_rate": 0.0015, "loss": 2.8924, "step": 1150},
    {"epoch": 0.007288991451718962, "grad_norm": 0.1192697063088417, "learning_rate": 0.0015, "loss": 2.957, "step": 1175},
    {"epoch": 0.00744407637622362, "grad_norm": 0.14840476214885712, "learning_rate": 0.0015, "loss": 2.895, "step": 1200},
    {"epoch": 0.00744407637622362, "eval_loss": 4.770949363708496, "perplexity": 118.03124237060547, "step": 1200},
    {"epoch": 0.007599161300728279, "grad_norm": 0.11221906542778015, "learning_rate": 0.0015, "loss": 2.9131, "step": 1225},
    {"epoch": 0.0077542462252329376, "grad_norm": 0.11528974026441574, "learning_rate": 0.0015, "loss": 2.8783, "step": 1250},
    {"epoch": 0.007909331149737596, "grad_norm": 0.0807015597820282, "learning_rate": 0.0015, "loss": 2.91, "step": 1275},
    {"epoch": 0.008064416074242254, "grad_norm": 0.1435490846633911, "learning_rate": 0.0015, "loss": 2.9198, "step": 1300},
    {"epoch": 0.008219500998746914, "grad_norm": 0.11956608295440674, "learning_rate": 0.0015, "loss": 2.8771, "step": 1325},
    {"epoch": 0.008374585923251573, "grad_norm": 0.10362117737531662, "learning_rate": 0.0015, "loss": 2.8913, "step": 1350},
    {"epoch": 0.008529670847756231, "grad_norm": 0.07132004201412201, "learning_rate": 0.0015, "loss": 2.946, "step": 1375},
    {"epoch": 0.00868475577226089, "grad_norm": 0.08756817877292633, "learning_rate": 0.0015, "loss": 2.9015, "step": 1400},
    {"epoch": 0.00868475577226089, "eval_loss": 4.769084453582764, "perplexity": 117.81133270263672, "step": 1400},
    {"epoch": 0.00883984069676555, "grad_norm": 0.18067917227745056, "learning_rate": 0.0015, "loss": 2.8887, "step": 1425},
    {"epoch": 0.008994925621270208, "grad_norm": 0.09742950648069382, "learning_rate": 0.0015, "loss": 2.8834, "step": 1450},
    {"epoch": 0.009150010545774866, "grad_norm": 0.09857803583145142, "learning_rate": 0.0015, "loss": 2.8856, "step": 1475},
    {"epoch": 0.009305095470279525, "grad_norm": 0.17605328559875488, "learning_rate": 0.0015, "loss": 2.9238, "step": 1500},
    {"epoch": 0.009460180394784183, "grad_norm": 0.08441105484962463, "learning_rate": 0.0015, "loss": 2.8605, "step": 1525},
    {"epoch": 0.009615265319288843, "grad_norm": 0.15339621901512146, "learning_rate": 0.0015, "loss": 2.9421, "step": 1550},
    {"epoch": 0.009770350243793502, "grad_norm": 0.21426236629486084, "learning_rate": 0.0015, "loss": 2.8899, "step": 1575},
    {"epoch": 0.00992543516829816, "grad_norm": 0.16503557562828064, "learning_rate": 0.0015, "loss": 2.878, "step": 1600},
    {"epoch": 0.00992543516829816, "eval_loss": 4.774999618530273, "perplexity": 118.51026916503906, "step": 1600},
    {"epoch": 0.010080520092802818, "grad_norm": 0.11398541182279587, "learning_rate": 0.0015, "loss": 2.866, "step": 1625},
    {"epoch": 0.010235605017307478, "grad_norm": 0.16510234773159027, "learning_rate": 0.0015, "loss": 2.8936, "step": 1650},
    {"epoch": 0.010390689941812137, "grad_norm": 0.08827799558639526, "learning_rate": 0.0015, "loss": 2.8789, "step": 1675},
    {"epoch": 0.010545774866316795, "grad_norm": 0.12703286111354828, "learning_rate": 0.0015, "loss": 2.9104, "step": 1700},
    {"epoch": 0.010700859790821454, "grad_norm": 0.10185768455266953, "learning_rate": 0.0015, "loss": 2.8389, "step": 1725},
    {"epoch": 0.010855944715326112, "grad_norm": 0.13076236844062805, "learning_rate": 0.0015, "loss": 2.8603, "step": 1750},
    {"epoch": 0.011011029639830772, "grad_norm": 0.08955707401037216, "learning_rate": 0.0015, "loss": 2.8283, "step": 1775},
    {"epoch": 0.01116611456433543, "grad_norm": 0.07163148373365402, "learning_rate": 0.0015, "loss": 2.8852, "step": 1800},
    {"epoch": 0.01116611456433543, "eval_loss": 4.75281286239624, "perplexity": 115.90986633300781, "step": 1800},
    {"epoch": 0.011321199488840089, "grad_norm": 0.09710580855607986, "learning_rate": 0.0015, "loss": 2.8573, "step": 1825},
    {"epoch": 0.011476284413344747, "grad_norm": 0.11669810861349106, "learning_rate": 0.0015, "loss": 2.8674, "step": 1850},
    {"epoch": 0.011631369337849405, "grad_norm": 0.11174403876066208, "learning_rate": 0.0015, "loss": 2.9121, "step": 1875},
    {"epoch": 0.011786454262354066, "grad_norm": 0.09547118842601776, "learning_rate": 0.0015, "loss": 2.9033, "step": 1900},
    {"epoch": 0.011941539186858724, "grad_norm": 0.09878171980381012, "learning_rate": 0.0015, "loss": 2.8738, "step": 1925},
    {"epoch": 0.012096624111363382, "grad_norm": 0.09479096531867981, "learning_rate": 0.0015, "loss": 2.8775, "step": 1950},
    {"epoch": 0.01225170903586804, "grad_norm": 0.12434259057044983, "learning_rate": 0.0015, "loss": 2.8452, "step": 1975},
    {"epoch": 0.0124067939603727, "grad_norm": 0.09166444838047028, "learning_rate": 0.0015, "loss": 2.8546, "step": 2000},
    {"epoch": 0.0124067939603727, "eval_loss": 4.748600482940674, "perplexity": 115.42263793945312, "step": 2000},
    {"epoch": 0.01256187888487736, "grad_norm": 0.07793508470058441, "learning_rate": 0.0015, "loss": 2.8306, "step": 2025},
    {"epoch": 0.012716963809382018, "grad_norm": 0.1670406609773636, "learning_rate": 0.0015, "loss": 2.863, "step": 2050},
    {"epoch": 0.012872048733886676, "grad_norm": 0.20754718780517578, "learning_rate": 0.0015, "loss": 2.8871, "step": 2075},
    {"epoch": 0.013027133658391334, "grad_norm": 0.14225496351718903, "learning_rate": 0.0015, "loss": 2.8498, "step": 2100},
    {"epoch": 0.013182218582895994, "grad_norm": 0.11809197813272476, "learning_rate": 0.0015, "loss": 2.8206, "step": 2125},
    {"epoch": 0.013337303507400653, "grad_norm": 0.09541622549295425, "learning_rate": 0.0015, "loss": 2.8585, "step": 2150},
    {"epoch": 0.013492388431905311, "grad_norm": 0.1115843802690506, "learning_rate": 0.0015, "loss": 2.8533, "step": 2175},
    {"epoch": 0.01364747335640997, "grad_norm": 0.08517899364233017, "learning_rate": 0.0015, "loss": 2.8477, "step": 2200},
    {"epoch": 0.01364747335640997, "eval_loss": 4.753279685974121, "perplexity": 115.9639892578125, "step": 2200},
    {"epoch": 0.01380255828091463, "grad_norm": 0.13083544373512268, "learning_rate": 0.0015, "loss": 2.8518, "step": 2225},
    {"epoch": 0.013957643205419288, "grad_norm": 0.07403870671987534, "learning_rate": 0.0015, "loss": 2.8685, "step": 2250},
    {"epoch": 0.014112728129923946, "grad_norm": 0.16436311602592468, "learning_rate": 0.0015, "loss": 2.8601, "step": 2275},
    {"epoch": 0.014267813054428605, "grad_norm": 0.12990187108516693, "learning_rate": 0.0015, "loss": 2.8332, "step": 2300},
    {"epoch": 0.014422897978933263, "grad_norm": 0.0897112786769867, "learning_rate": 0.0015, "loss": 2.8578, "step": 2325},
    {"epoch": 0.014577982903437923, "grad_norm": 0.10096879303455353, "learning_rate": 0.0015, "loss": 2.802, "step": 2350},
    {"epoch": 0.014733067827942582, "grad_norm": 0.0850217416882515, "learning_rate": 0.0015, "loss": 2.8529, "step": 2375},
    {"epoch": 0.01488815275244724, "grad_norm": 0.11395123600959778, "learning_rate": 0.0015, "loss": 2.8655, "step": 2400},
    {"epoch": 0.01488815275244724, "eval_loss": 4.743602275848389, "perplexity": 114.84716796875, "step": 2400},
    {"epoch": 0.015043237676951898, "grad_norm": 0.1590801179409027, "learning_rate": 0.0015, "loss": 2.8227, "step": 2425},
    {"epoch": 0.015198322601456558, "grad_norm": 0.16819922626018524, "learning_rate": 0.0015, "loss": 2.8551, "step": 2450},
    {"epoch": 0.015353407525961217, "grad_norm": 0.15390118956565857, "learning_rate": 0.0015, "loss": 2.8691, "step": 2475},
    {"epoch": 0.015508492450465875, "grad_norm": 0.10976951569318771, "learning_rate": 0.0015, "loss": 2.8615, "step": 2500},
    {"epoch": 0.015663577374970535, "grad_norm": 0.09539350867271423, "learning_rate": 0.0015, "loss": 2.7755, "step": 2525},
    {"epoch": 0.015818662299475192, "grad_norm": 0.09798863530158997, "learning_rate": 0.0015, "loss": 2.7675, "step": 2550},
    {"epoch": 0.015973747223979852, "grad_norm": 0.10233014822006226, "learning_rate": 0.0015, "loss": 2.7905, "step": 2575},
    {"epoch": 0.01612883214848451, "grad_norm": 0.09607812017202377, "learning_rate": 0.0015, "loss": 2.779, "step": 2600},
    {"epoch": 0.01612883214848451, "eval_loss": 4.757762432098389, "perplexity": 116.48499298095703, "step": 2600},
    {"epoch": 0.01628391707298917, "grad_norm": 0.09782920032739639, "learning_rate": 0.0015, "loss": 2.8455, "step": 2625},
    {"epoch": 0.01643900199749383, "grad_norm": 0.08443335443735123, "learning_rate": 0.0015, "loss": 2.8537, "step": 2650},
    {"epoch": 0.016594086921998485, "grad_norm": 0.1567981094121933, "learning_rate": 0.0015, "loss": 2.8334, "step": 2675},
    {"epoch": 0.016749171846503146, "grad_norm": 0.1279255449771881, "learning_rate": 0.0015, "loss": 2.8733, "step": 2700},
    {"epoch": 0.016904256771007802, "grad_norm": 0.09086953848600388, "learning_rate": 0.0015, "loss": 2.7992, "step": 2725},
    {"epoch": 0.017059341695512462, "grad_norm": 0.15084481239318848, "learning_rate": 0.0015, "loss": 2.7891, "step": 2750},
    {"epoch": 0.017214426620017122, "grad_norm": 0.1059018149971962, "learning_rate": 0.0015, "loss": 2.8088, "step": 2775},
    {"epoch": 0.01736951154452178, "grad_norm": 0.08803548663854599, "learning_rate": 0.0015, "loss": 2.817, "step": 2800},
    {"epoch": 0.01736951154452178, "eval_loss": 4.730724334716797, "perplexity": 113.37765502929688, "step": 2800},
    {"epoch": 0.01752459646902644, "grad_norm": 0.0954984724521637, "learning_rate": 0.0015, "loss": 2.8528, "step": 2825},
    {"epoch": 0.0176796813935311, "grad_norm": 0.14015914499759674, "learning_rate": 0.0015, "loss": 2.8131, "step": 2850},
    {"epoch": 0.017834766318035756, "grad_norm": 0.07908599078655243, "learning_rate": 0.0015, "loss": 2.8371, "step": 2875},
    {"epoch": 0.017989851242540416, "grad_norm": 0.14578266441822052, "learning_rate": 0.0015, "loss": 2.8033, "step": 2900},
    {"epoch": 0.018144936167045073, "grad_norm": 0.10059946030378342, "learning_rate": 0.0015, "loss": 2.8165, "step": 2925},
    {"epoch": 0.018300021091549733, "grad_norm": 0.10238490998744965, "learning_rate": 0.0015, "loss": 2.7739, "step": 2950},
    {"epoch": 0.018455106016054393, "grad_norm": 0.12706336379051208, "learning_rate": 0.0015, "loss": 2.8018, "step": 2975},
    {"epoch": 0.01861019094055905, "grad_norm": 0.1252700239419937, "learning_rate": 0.0015, "loss": 2.8155, "step": 3000},
    {"epoch": 0.01861019094055905, "eval_loss": 4.707705020904541, "perplexity": 110.79759216308594, "step": 3000},
    {"epoch": 0.01876527586506371, "grad_norm": 0.13322588801383972, "learning_rate": 0.0015, "loss": 2.8201, "step": 3025},
    {"epoch": 0.018920360789568366, "grad_norm": 0.14152252674102783, "learning_rate": 0.0015, "loss": 2.7942, "step": 3050},
    {"epoch": 0.019075445714073026, "grad_norm": 0.1276037096977234, "learning_rate": 0.0015, "loss": 2.8065, "step": 3075},
    {"epoch": 0.019230530638577686, "grad_norm": 0.11600831896066666, "learning_rate": 0.0015, "loss": 2.8335, "step": 3100},
    {"epoch": 0.019385615563082343, "grad_norm": 0.11985427141189575, "learning_rate": 0.0015, "loss": 2.7993, "step": 3125},
    {"epoch": 0.019540700487587003, "grad_norm": 0.11630894988775253, "learning_rate": 0.0015, "loss": 2.7838, "step": 3150},
    {"epoch": 0.01969578541209166, "grad_norm": 0.08493560552597046, "learning_rate": 0.0015, "loss": 2.7884, "step": 3175},
    {"epoch": 0.01985087033659632, "grad_norm": 0.12671016156673431, "learning_rate": 0.0015, "loss": 2.7763, "step": 3200},
    {"epoch": 0.01985087033659632, "eval_loss": 4.7127766609191895, "perplexity": 111.3609390258789, "step": 3200},
    {"epoch": 0.02000595526110098, "grad_norm": 0.10381816327571869, "learning_rate": 0.0015, "loss": 2.7849, "step": 3225},
    {"epoch": 0.020161040185605637, "grad_norm": 0.12319795787334442, "learning_rate": 0.0015, "loss": 2.8325, "step": 3250},
    {"epoch": 0.020316125110110297, "grad_norm": 0.11378122121095657, "learning_rate": 0.0015, "loss": 2.7609, "step": 3275},
    {"epoch": 0.020471210034614957, "grad_norm": 0.08910433948040009, "learning_rate": 0.0015, "loss": 2.7886, "step": 3300},
    {"epoch": 0.020626294959119613, "grad_norm": 0.11803348362445831, "learning_rate": 0.0015, "loss": 2.7716, "step": 3325},
    {"epoch": 0.020781379883624274, "grad_norm": 0.10203807801008224, "learning_rate": 0.0015, "loss": 2.778, "step": 3350},
    {"epoch": 0.02093646480812893, "grad_norm": 0.07175683230161667, "learning_rate": 0.0015, "loss": 2.7844, "step": 3375},
    {"epoch": 0.02109154973263359, "grad_norm": 0.1556989699602127, "learning_rate": 0.0015, "loss": 2.748, "step": 3400},
    {"epoch": 0.02109154973263359, "eval_loss": 4.711516857147217, "perplexity": 111.22074127197266, "step": 3400},
    {"epoch": 0.02124663465713825, "grad_norm": 0.11983326822519302, "learning_rate": 0.0015, "loss": 2.7747, "step": 3425},
    {"epoch": 0.021401719581642907, "grad_norm": 0.09098344296216965, "learning_rate": 0.0015, "loss": 2.7609, "step": 3450},
    {"epoch": 0.021556804506147567, "grad_norm": 0.1238594651222229, "learning_rate": 0.0015, "loss": 2.7849, "step": 3475},
    {"epoch": 0.021711889430652224, "grad_norm": 0.10654041916131973, "learning_rate": 0.0015, "loss": 2.7742, "step": 3500},
    {"epoch": 0.021866974355156884, "grad_norm": 0.12955708801746368, "learning_rate": 0.0015, "loss": 2.7302, "step": 3525},
    {"epoch": 0.022022059279661544, "grad_norm": 0.0945751890540123, "learning_rate": 0.0015, "loss": 2.7366, "step": 3550},
    {"epoch": 0.0221771442041662, "grad_norm": 0.11322261393070221, "learning_rate": 0.0015, "loss": 2.7307, "step": 3575},
    {"epoch": 0.02233222912867086, "grad_norm": 0.14438313245773315, "learning_rate": 0.0015, "loss": 2.741, "step": 3600},
    {"epoch": 0.02233222912867086, "eval_loss": 4.7056427001953125, "perplexity": 110.56932830810547, "step": 3600},
    {"epoch": 0.022487314053175517, "grad_norm": 0.12101957201957703, "learning_rate": 0.0015, "loss": 2.7699, "step": 3625},
    {"epoch": 0.022642398977680177, "grad_norm": 0.13060438632965088, "learning_rate": 0.0015, "loss": 2.7534, "step": 3650},
    {"epoch": 0.022797483902184838, "grad_norm": 0.18028861284255981, "learning_rate": 0.0015, "loss": 2.7716, "step": 3675},
    {"epoch": 0.022952568826689494, "grad_norm": 0.2551407217979431, "learning_rate": 0.0015, "loss": 2.7505, "step": 3700},
    {"epoch": 0.023107653751194154, "grad_norm": 0.14461354911327362, "learning_rate": 0.0015, "loss": 2.762, "step": 3725},
    {"epoch": 0.02326273867569881, "grad_norm": 0.08960037678480148, "learning_rate": 0.0015, "loss": 2.7752, "step": 3750},
    {"epoch": 0.02341782360020347, "grad_norm": 0.12423495948314667, "learning_rate": 0.0015, "loss": 2.7649, "step": 3775},
    {"epoch": 0.02357290852470813, "grad_norm": 0.11889061331748962, "learning_rate": 0.0015, "loss": 2.7465, "step": 3800},
    {"epoch": 0.02357290852470813, "eval_loss": 4.709405422210693, "perplexity": 110.98615264892578, "step": 3800},
    {"epoch": 0.023727993449212788, "grad_norm": 0.1310662031173706, "learning_rate": 0.0015, "loss": 2.7739, "step": 3825},
    {"epoch": 0.023883078373717448, "grad_norm": 0.10841766744852066, "learning_rate": 0.0015, "loss": 2.7558, "step": 3850},
    {"epoch": 0.024038163298222108, "grad_norm": 0.11951743066310883, "learning_rate": 0.0015, "loss": 2.7574, "step": 3875},
    {"epoch": 0.024193248222726765, "grad_norm": 0.10914873331785202, "learning_rate": 0.0015, "loss": 2.7593, "step": 3900},
    {"epoch": 0.024348333147231425, "grad_norm": 0.12661431729793549, "learning_rate": 0.0015, "loss": 2.7405, "step": 3925},
    {"epoch": 0.02450341807173608, "grad_norm": 0.09351510554552078, "learning_rate": 0.0015, "loss": 2.7614, "step": 3950},
    {"epoch": 0.02465850299624074, "grad_norm": 0.10916408896446228, "learning_rate": 0.0015, "loss": 2.7348, "step": 3975},
    {"epoch": 0.0248135879207454, "grad_norm": 0.1506185084581375, "learning_rate": 0.0015, "loss": 2.7465, "step": 4000},
    {"epoch": 0.0248135879207454, "eval_loss": 4.691644191741943, "perplexity": 109.03230285644531, "step": 4000},
    {"epoch": 0.024968672845250058, "grad_norm": 0.16664201021194458, "learning_rate": 0.0015, "loss": 2.7099, "step": 4025},
    {"epoch": 0.02512375776975472, "grad_norm": 0.08793428540229797, "learning_rate": 0.0015, "loss": 2.7062, "step": 4050},
    {"epoch": 0.025278842694259375, "grad_norm": 0.10746140778064728, "learning_rate": 0.0015, "loss": 2.7013, "step": 4075},
    {"epoch": 0.025433927618764035, "grad_norm": 0.14466698467731476, "learning_rate": 0.0015, "loss": 2.7366, "step": 4100},
    {"epoch": 0.025589012543268695, "grad_norm": 0.12191653996706009, "learning_rate": 0.0015, "loss": 2.7042, "step": 4125},
    {"epoch": 0.025744097467773352, "grad_norm": 0.10167489945888519, "learning_rate": 0.0015, "loss": 2.7215, "step": 4150},
    {"epoch": 0.025899182392278012, "grad_norm": 0.11334148049354553, "learning_rate": 0.0015, "loss": 2.7365, "step": 4175},
    {"epoch": 0.02605426731678267, "grad_norm": 0.09303794056177139, "learning_rate": 0.0015, "loss": 2.7471, "step": 4200},
    {"epoch": 0.02605426731678267, "eval_loss": 4.692121505737305, "perplexity": 109.08435821533203, "step": 4200},
    {"epoch": 0.02620935224128733, "grad_norm": 0.09444712847471237, "learning_rate": 0.0015, "loss": 2.6965, "step": 4225},
    {"epoch": 0.02636443716579199, "grad_norm": 0.09560113400220871, "learning_rate": 0.0015, "loss": 2.7186, "step": 4250},
    {"epoch": 0.026519522090296645, "grad_norm": 0.10814715176820755, "learning_rate": 0.0015, "loss": 2.7, "step": 4275},
    {"epoch": 0.026674607014801305, "grad_norm": 0.12008251994848251, "learning_rate": 0.0015, "loss": 2.6827, "step": 4300},
    {"epoch": 0.026829691939305966, "grad_norm": 0.13892072439193726, "learning_rate": 0.0015, "loss": 2.7481, "step": 4325},
    {"epoch": 0.026984776863810622, "grad_norm": 0.10116352885961533, "learning_rate": 0.0015, "loss": 2.6839, "step": 4350},
    {"epoch": 0.027139861788315282, "grad_norm": 0.2541595697402954, "learning_rate": 0.0015, "loss": 2.6987, "step": 4375},
    {"epoch": 0.02729494671281994, "grad_norm": 0.11070574074983597, "learning_rate": 0.0015, "loss": 2.7102, "step": 4400},
    {"epoch": 0.02729494671281994, "eval_loss": 4.702114105224609, "perplexity": 110.17985534667969, "step": 4400},
    {"epoch": 0.0274500316373246, "grad_norm": 0.09290622174739838, "learning_rate": 0.0015, "loss": 2.744, "step": 4425},
    {"epoch": 0.02760511656182926, "grad_norm": 0.09867129474878311, "learning_rate": 0.0015, "loss": 2.6979, "step": 4450},
    {"epoch": 0.027760201486333916, "grad_norm": 0.08975850045681, "learning_rate": 0.0015, "loss": 2.7346, "step": 4475},
    {"epoch": 0.027915286410838576, "grad_norm": 0.1251811683177948, "learning_rate": 0.0015, "loss": 2.6901, "step": 4500},
    {"epoch": 0.028070371335343233, "grad_norm": 0.10718528181314468, "learning_rate": 0.0015, "loss": 2.6584, "step": 4525},
    {"epoch": 0.028225456259847893, "grad_norm": 0.1920158714056015, "learning_rate": 0.0015, "loss": 2.6776, "step": 4550},
    {"epoch": 0.028380541184352553, "grad_norm": 0.11409153789281845, "learning_rate": 0.0015, "loss": 2.7052, "step": 4575},
    {"epoch": 0.02853562610885721, "grad_norm": 0.12506772577762604, "learning_rate": 0.0015, "loss": 2.6954, "step": 4600},
    {"epoch": 0.02853562610885721, "eval_loss": 4.685390949249268, "perplexity": 108.35262298583984, "step": 4600},
    {"epoch": 0.02869071103336187, "grad_norm": 0.1093166172504425, "learning_rate": 0.0015, "loss": 2.7257, "step": 4625},
    {"epoch": 0.028845795957866526, "grad_norm": 0.16628532111644745, "learning_rate": 0.0015, "loss": 2.6782, "step": 4650},
    {"epoch": 0.029000880882371186, "grad_norm": 0.1638079136610031, "learning_rate": 0.0015, "loss": 2.6884, "step": 4675},
    {"epoch": 0.029155965806875846, "grad_norm": 0.11411619931459427, "learning_rate": 0.0015, "loss": 2.7054, "step": 4700},
    {"epoch": 0.029311050731380503, "grad_norm": 0.09292814135551453, "learning_rate": 0.0015, "loss": 2.6826, "step": 4725},
    {"epoch": 0.029466135655885163, "grad_norm": 0.09136354923248291, "learning_rate": 0.0015, "loss": 2.6936, "step": 4750},
    {"epoch": 0.029621220580389823, "grad_norm": 0.1188502386212349, "learning_rate": 0.0015, "loss": 2.6466, "step": 4775},
    {"epoch": 0.02977630550489448, "grad_norm": 0.09645655751228333, "learning_rate": 0.0015, "loss": 2.6092, "step": 4800},
    {"epoch": 0.02977630550489448, "eval_loss": 4.683995723724365, "perplexity": 108.20155334472656, "step": 4800},
    {"epoch": 0.02993139042939914, "grad_norm": 0.17193672060966492, "learning_rate": 0.0015, "loss": 2.6916, "step": 4825},
    {"epoch": 0.030086475353903797, "grad_norm": 0.14866988360881805, "learning_rate": 0.0015, "loss": 2.6776, "step": 4850},
    {"epoch": 0.030241560278408457, "grad_norm": 0.10588869452476501, "learning_rate": 0.0015, "loss": 2.6773, "step": 4875},
    {"epoch": 0.030396645202913117, "grad_norm": 0.12059559673070908, "learning_rate": 0.0015, "loss": 2.639, "step": 4900},
    {"epoch": 0.030551730127417773, "grad_norm": 0.13296598196029663, "learning_rate": 0.0015, "loss": 2.6359, "step": 4925},
    {"epoch": 0.030706815051922434, "grad_norm": 0.12300167232751846, "learning_rate": 0.0015, "loss": 2.668, "step": 4950},
    {"epoch": 0.03086189997642709, "grad_norm": 0.15900522470474243, "learning_rate": 0.0015, "loss": 2.6252, "step": 4975},
    {"epoch": 0.03101698490093175, "grad_norm": 0.138090580701828, "learning_rate": 0.0015, "loss": 2.659, "step": 5000},
    {"epoch": 0.03101698490093175, "eval_loss": 4.688181400299072, "perplexity": 108.65540313720703, "step": 5000},
    {"epoch": 0.03117206982543641, "grad_norm": 0.13720737397670746, "learning_rate": 0.0015, "loss": 2.6096, "step": 5025},
    {"epoch": 0.03132715474994107, "grad_norm": 0.13671600818634033, "learning_rate": 0.0015, "loss": 2.647, "step": 5050},
    {"epoch": 0.031482239674445724, "grad_norm": 0.12611277401447296, "learning_rate": 0.0015, "loss": 2.639, "step": 5075},
    {"epoch": 0.031637324598950384, "grad_norm": 0.12045291066169739, "learning_rate": 0.0015, "loss": 2.663, "step": 5100},
    {"epoch": 0.031792409523455044, "grad_norm": 0.10857657343149185, "learning_rate": 0.0015, "loss": 2.6677, "step": 5125},
    {"epoch": 0.031947494447959704, "grad_norm": 0.12052007764577866, "learning_rate": 0.0015, "loss": 2.6508, "step": 5150},
    {"epoch": 0.032102579372464364, "grad_norm": 0.10999467223882675, "learning_rate": 0.0015, "loss": 2.661, "step": 5175},
    {"epoch": 0.03225766429696902, "grad_norm": 0.11075185984373093, "learning_rate": 0.0015, "loss": 2.6645, "step": 5200},
    {"epoch": 0.03225766429696902, "eval_loss": 4.706582546234131, "perplexity": 110.67329406738281, "step": 5200},
    {"epoch": 0.03241274922147368, "grad_norm": 0.09703061729669571, "learning_rate": 0.0015, "loss": 2.6109, "step": 5225},
    {"epoch": 0.03256783414597834, "grad_norm": 0.13556119799613953, "learning_rate": 0.0015, "loss": 2.6621, "step": 5250},
    {"epoch": 0.032722919070483, "grad_norm": 0.09178316593170166, "learning_rate": 0.0015, "loss": 2.6263, "step": 5275},
    {"epoch": 0.03287800399498766, "grad_norm": 0.10839138180017471, "learning_rate": 0.0015, "loss": 2.5999, "step": 5300},
    {"epoch": 0.03303308891949231, "grad_norm": 0.12049377709627151, "learning_rate": 0.0015, "loss": 2.6085, "step": 5325},
    {"epoch": 0.03318817384399697, "grad_norm": 0.15260230004787445, "learning_rate": 0.0015, "loss": 2.664, "step": 5350},
    {"epoch": 0.03334325876850163, "grad_norm": 0.12393297255039215, "learning_rate": 0.0015, "loss": 2.6234, "step": 5375},
    {"epoch": 0.03349834369300629, "grad_norm": 0.1284521073102951, "learning_rate": 0.0015, "loss": 2.5624, "step": 5400},
    {"epoch": 0.03349834369300629, "eval_loss": 4.696901321411133, "perplexity": 109.60700988769531, "step": 5400},
    {"epoch": 0.03365342861751095, "grad_norm": 0.18052247166633606, "learning_rate": 0.0015, "loss": 2.5779, "step": 5425},
    {"epoch": 0.033808513542015604, "grad_norm": 0.11775010824203491, "learning_rate": 0.0015, "loss": 2.6167, "step": 5450},
    {"epoch": 0.033963598466520264, "grad_norm": 0.13769109547138214, "learning_rate": 0.0015, "loss": 2.6117, "step": 5475},
    {"epoch": 0.034118683391024925, "grad_norm": 0.09634970873594284, "learning_rate": 0.0015, "loss": 2.613, "step": 5500},
    {"epoch": 0.034273768315529585, "grad_norm": 0.14692488312721252, "learning_rate": 0.0015, "loss": 2.6176, "step": 5525},
    {"epoch": 0.034428853240034245, "grad_norm": 0.21920783817768097, "learning_rate": 0.0015, "loss": 2.6196, "step": 5550},
    {"epoch": 0.034583938164538905, "grad_norm": 0.1033003106713295, "learning_rate": 0.0015, "loss": 2.5872, "step": 5575},
    {"epoch": 0.03473902308904356, "grad_norm": 0.09867612272500992, "learning_rate": 0.0015, "loss": 2.5782, "step": 5600},
    {"epoch": 0.03473902308904356, "eval_loss": 4.704063892364502, "perplexity": 110.3948974609375, "step": 5600},
    {"epoch": 0.03489410801354822, "grad_norm": 0.1032184287905693, "learning_rate": 0.0015, "loss": 2.6187, "step": 5625},
    {"epoch": 0.03504919293805288, "grad_norm": 0.12661318480968475, "learning_rate": 0.0015, "loss": 2.5805, "step": 5650},
    {"epoch": 0.03520427786255754, "grad_norm": 0.28772449493408203, "learning_rate": 0.0015, "loss": 2.7518, "step": 5675},
    {"epoch": 0.0353593627870622, "grad_norm": 0.10005131363868713, "learning_rate": 0.0015, "loss": 2.8556, "step": 5700},
    {"epoch": 0.03551444771156685, "grad_norm": 0.10379570722579956, "learning_rate": 0.0015, "loss": 2.8648, "step": 5725},
    {"epoch": 0.03566953263607151, "grad_norm": 0.08921229094266891, "learning_rate": 0.0015, "loss": 2.8421, "step": 5750},
    {"epoch": 0.03582461756057617, "grad_norm": 0.15366144478321075, "learning_rate": 0.0015, "loss": 2.8162, "step": 5775},
    {"epoch": 0.03597970248508083, "grad_norm": 0.12743431329727173, "learning_rate": 0.0015, "loss": 2.8635, "step": 5800},
    {"epoch": 0.03597970248508083, "eval_loss": 4.674878120422363, "perplexity": 107.21949768066406, "step": 5800},
    {"epoch": 0.03613478740958549, "grad_norm": 0.08773666620254517, "learning_rate": 0.0015, "loss": 2.8787, "step": 5825},
    {"epoch": 0.036289872334090145, "grad_norm": 0.11721781641244888, "learning_rate": 0.0015, "loss": 2.853, "step": 5850},
    {"epoch": 0.036444957258594805, "grad_norm": 0.09957700222730637, "learning_rate": 0.0015, "loss": 2.8163, "step": 5875},
    {"epoch": 0.036600042183099465, "grad_norm": 0.09999966621398926, "learning_rate": 0.0015, "loss": 2.8206, "step": 5900},
    {"epoch": 0.036755127107604126, "grad_norm": 0.09899301081895828, "learning_rate": 0.0015, "loss": 2.8378, "step": 5925},
    {"epoch": 0.036910212032108786, "grad_norm": 0.09676779061555862, "learning_rate": 0.0015, "loss": 2.8385, "step": 5950},
    {"epoch": 0.03706529695661344, "grad_norm": 0.14397811889648438, "learning_rate": 0.0015, "loss": 2.8639, "step": 5975},
    {"epoch": 0.0372203818811181, "grad_norm": 0.08991026133298874, "learning_rate": 0.0015, "loss": 2.862, "step": 6000},
    {"epoch": 0.0372203818811181, "eval_loss": 4.649503707885742, "perplexity": 104.53309631347656, "step": 6000},
    {"epoch": 0.03737546680562276, "grad_norm": 0.11916879564523697, "learning_rate": 0.0015, "loss": 2.8336, "step": 6025},
    {"epoch": 0.03753055173012742, "grad_norm": 0.1533547192811966, "learning_rate": 0.0015, "loss": 2.8154, "step": 6050},
    {"epoch": 0.03768563665463208, "grad_norm": 0.10416785627603531, "learning_rate": 0.0015, "loss": 2.8073, "step": 6075},
    {"epoch": 0.03784072157913673, "grad_norm": 0.1307593733072281, "learning_rate": 0.0015, "loss": 2.8227, "step": 6100},
    {"epoch": 0.03799580650364139, "grad_norm": 0.11226139962673187, "learning_rate": 0.0015, "loss": 2.8316, "step": 6125},
    {"epoch": 0.03815089142814605, "grad_norm": 0.12050950527191162, "learning_rate": 0.0015, "loss": 2.8636, "step": 6150},
    {"epoch": 0.03830597635265071, "grad_norm": 0.14836955070495605, "learning_rate": 0.0015, "loss": 2.8433, "step": 6175},
    {"epoch": 0.03846106127715537, "grad_norm": 0.1240909993648529, "learning_rate": 0.0015, "loss": 2.885, "step": 6200},
    {"epoch": 0.03846106127715537, "eval_loss": 4.652696132659912, "perplexity": 104.86734008789062, "step": 6200},
    {"epoch": 0.038616146201660026, "grad_norm": 0.09549515694379807, "learning_rate": 0.0015, "loss": 2.822, "step": 6225},
    {"epoch": 0.038771231126164686, "grad_norm": 0.1386450082063675, "learning_rate": 0.0015, "loss": 2.8455, "step": 6250},
    {"epoch": 0.038926316050669346, "grad_norm": 0.10233025252819061, "learning_rate": 0.0015, "loss": 2.834, "step": 6275},
    {"epoch": 0.039081400975174006, "grad_norm": 0.09776704013347626, "learning_rate": 0.0015, "loss": 2.8114, "step": 6300},
    {"epoch": 0.039236485899678666, "grad_norm": 0.09631351381540298, "learning_rate": 0.0015, "loss": 2.8107, "step": 6325},
    {"epoch": 0.03939157082418332, "grad_norm": 0.08424117416143417, "learning_rate": 0.0015, "loss": 2.8373, "step": 6350},
    {"epoch": 0.03954665574868798, "grad_norm": 0.14171521365642548, "learning_rate": 0.0015, "loss": 2.8394, "step": 6375},
    {"epoch": 0.03970174067319264, "grad_norm": 0.11349046230316162, "learning_rate": 0.0015, "loss": 2.8131, "step": 6400},
    {"epoch": 0.03970174067319264, "eval_loss": 4.652514934539795, "perplexity": 104.84834289550781, "step": 6400},
    {"epoch": 0.0398568255976973, "grad_norm": 0.09066054224967957, "learning_rate": 0.0015, "loss": 2.8758, "step": 6425},
    {"epoch": 0.04001191052220196, "grad_norm": 0.09391192346811295, "learning_rate": 0.0015, "loss": 2.826, "step": 6450},
    {"epoch": 0.04016699544670661, "grad_norm": 0.17412593960762024, "learning_rate": 0.0015, "loss": 2.8487, "step": 6475},
    {"epoch": 0.04032208037121127, "grad_norm": 0.17672564089298248, "learning_rate": 0.0015, "loss": 2.8441, "step": 6500},
    {"epoch": 0.04047716529571593, "grad_norm": 0.11427825689315796, "learning_rate": 0.0015, "loss": 2.8843, "step": 6525},
    {"epoch": 0.04063225022022059, "grad_norm": 0.13745597004890442, "learning_rate": 0.0015, "loss": 2.8458, "step": 6550},
    {"epoch": 0.040787335144725254, "grad_norm": 0.12339327484369278, "learning_rate": 0.0015, "loss": 2.8299, "step": 6575},
    {"epoch": 0.040942420069229914, "grad_norm": 0.11045660078525543, "learning_rate": 0.0015, "loss": 2.8504, "step": 6600},
    {"epoch": 0.040942420069229914, "eval_loss": 4.645139217376709, "perplexity": 104.0778579711914, "step": 6600},
    {"epoch": 0.04109750499373457, "grad_norm": 0.14822149276733398, "learning_rate": 0.0015, "loss": 2.8438, "step": 6625},
    {"epoch": 0.04125258991823923, "grad_norm": 0.09271769225597382, "learning_rate": 0.0015, "loss": 2.8195, "step": 6650},
    {"epoch": 0.04140767484274389, "grad_norm": 0.12357133626937866, "learning_rate": 0.0015, "loss": 2.8434, "step": 6675},
    {"epoch": 0.04156275976724855, "grad_norm": 0.12669824063777924, "learning_rate": 0.0015, "loss": 2.8262, "step": 6700},
    {"epoch": 0.04171784469175321, "grad_norm": 0.10409893840551376, "learning_rate": 0.0015, "loss": 2.8164, "step": 6725},
    {"epoch": 0.04187292961625786, "grad_norm": 0.10687699913978577, "learning_rate": 0.0015, "loss": 2.83, "step": 6750},
    {"epoch": 0.04202801454076252, "grad_norm": 0.09924216568470001, "learning_rate": 0.0015, "loss": 2.8415, "step": 6775},
    {"epoch": 0.04218309946526718, "grad_norm": 0.11719833314418793, "learning_rate": 0.0015, "loss": 2.8368, "step": 6800},
    {"epoch": 0.04218309946526718, "eval_loss": 4.673882484436035, "perplexity": 107.11280059814453, "step": 6800},
    {"epoch": 0.04233818438977184, "grad_norm": 0.10162920504808426, "learning_rate": 0.0015, "loss": 2.8285, "step": 6825},
    {"epoch": 0.0424932693142765, "grad_norm": 0.10563603043556213, "learning_rate": 0.0015, "loss": 2.809, "step": 6850},
    {"epoch": 0.042648354238781154, "grad_norm": 0.079631008207798, "learning_rate": 0.0015, "loss": 2.8362, "step": 6875},
    {"epoch": 0.042803439163285814, "grad_norm": 0.11915802210569382, "learning_rate": 0.0015, "loss": 2.8211, "step": 6900},
    {"epoch": 0.042958524087790474, "grad_norm": 0.13783864676952362, "learning_rate": 0.0015, "loss": 2.8403, "step": 6925},
    {"epoch": 0.043113609012295134, "grad_norm": 0.17333541810512543, "learning_rate": 0.0015, "loss": 2.8699, "step": 6950},
    {"epoch": 0.043268693936799794, "grad_norm": 0.10923554003238678, "learning_rate": 0.0015, "loss": 2.8016, "step": 6975},
    {"epoch": 0.04342377886130445, "grad_norm": 0.10525023192167282, "learning_rate": 0.0015, "loss": 2.8302, "step": 7000},
    {"epoch": 0.04342377886130445, "eval_loss": 4.660215854644775, "perplexity": 105.65888977050781, "step": 7000},
    {"epoch": 0.04357886378580911, "grad_norm": 0.10499420017004013, "learning_rate": 0.0015, "loss": 2.8215, "step": 7025},
    {"epoch": 0.04373394871031377, "grad_norm": 0.09560755640268326, "learning_rate": 0.0015, "loss": 2.8279, "step": 7050},
    {"epoch": 0.04388903363481843, "grad_norm": 0.10454019159078598, "learning_rate": 0.0015, "loss": 2.8161, "step": 7075},
    {"epoch": 0.04404411855932309, "grad_norm": 0.0982690081000328, "learning_rate": 0.0015, "loss": 2.7895, "step": 7100},
    {"epoch": 0.04419920348382774, "grad_norm": 0.10405784100294113, "learning_rate": 0.0015, "loss": 2.7945, "step": 7125},
    {"epoch": 0.0443542884083324, "grad_norm": 0.09310988336801529, "learning_rate": 0.0015, "loss": 2.8535, "step": 7150},
    {"epoch": 0.04450937333283706, "grad_norm": 0.1031995639204979, "learning_rate": 0.0015, "loss": 2.8298, "step": 7175},
    {"epoch": 0.04466445825734172, "grad_norm": 0.09206147491931915, "learning_rate": 0.0015, "loss": 2.794, "step": 7200},
    {"epoch": 0.04466445825734172, "eval_loss": 4.642621994018555, "perplexity": 103.81619262695312, "step": 7200},
    {"epoch": 0.04481954318184638, "grad_norm": 0.1051359549164772, "learning_rate": 0.0015, "loss": 2.7996, "step": 7225},
    {"epoch": 0.044974628106351035, "grad_norm": 0.12941063940525055, "learning_rate": 0.0015, "loss": 2.792, "step": 7250},
    {"epoch": 0.045129713030855695, "grad_norm": 0.09297281503677368, "learning_rate": 0.0015, "loss": 2.7847, "step": 7275},
    {"epoch": 0.045284797955360355, "grad_norm": 0.11114951968193054, "learning_rate": 0.0015, "loss": 2.8164, "step": 7300},
    {"epoch": 0.045439882879865015, "grad_norm": 0.08519440144300461, "learning_rate": 0.0015, "loss": 2.8053, "step": 7325},
    {"epoch": 0.045594967804369675, "grad_norm": 0.11148552596569061, "learning_rate": 0.0015, "loss": 2.7871, "step": 7350},
    {"epoch": 0.04575005272887433, "grad_norm": 0.136012002825737, "learning_rate": 0.0015, "loss": 2.8457, "step": 7375},
    {"epoch": 0.04590513765337899, "grad_norm": 0.1037759929895401, "learning_rate": 0.0015, "loss": 2.748, "step": 7400},
    {"epoch": 0.04590513765337899, "eval_loss": 4.631537437438965, "perplexity": 102.67179107666016, "step": 7400},
    {"epoch": 0.04606022257788365, "grad_norm": 0.11162275820970535, "learning_rate": 0.0015, "loss": 2.8044, "step": 7425},
    {"epoch": 0.04621530750238831, "grad_norm": 0.11309058219194412, "learning_rate": 0.0015, "loss": 2.8198, "step": 7450},
    {"epoch": 0.04637039242689297, "grad_norm": 0.09359199553728104, "learning_rate": 0.0015, "loss": 2.8302, "step": 7475},
    {"epoch": 0.04652547735139762, "grad_norm": 0.09513767808675766, "learning_rate": 0.0015, "loss": 2.8325, "step": 7500},
    {"epoch": 0.04668056227590228, "grad_norm": 0.08243551850318909, "learning_rate": 0.0015, "loss": 2.7925, "step": 7525},
    {"epoch": 0.04683564720040694, "grad_norm": 0.08001349121332169, "learning_rate": 0.0015, "loss": 2.8406, "step": 7550},
    {"epoch": 0.0469907321249116, "grad_norm": 0.11749595403671265, "learning_rate": 0.0015, "loss": 2.7762, "step": 7575},
    {"epoch": 0.04714581704941626, "grad_norm": 0.15697765350341797, "learning_rate": 0.0015, "loss": 2.8137, "step": 7600},
    {"epoch": 0.04714581704941626, "eval_loss": 4.643322467803955, "perplexity": 103.8889389038086, "step": 7600},
    {"epoch": 0.04730090197392092, "grad_norm": 0.1004658117890358, "learning_rate": 0.0015, "loss": 2.7787, "step": 7625},
    {"epoch": 0.047455986898425576, "grad_norm": 0.11577022075653076, "learning_rate": 0.0015, "loss": 2.806, "step": 7650},
    {"epoch": 0.047611071822930236, "grad_norm": 0.10791046917438507, "learning_rate": 0.0015, "loss": 2.7637, "step": 7675},
    {"epoch": 0.047766156747434896, "grad_norm": 0.09490654617547989, "learning_rate": 0.0015, "loss": 2.8187, "step": 7700},
    {"epoch": 0.047921241671939556, "grad_norm": 0.10448817163705826, "learning_rate": 0.0015, "loss": 2.8335, "step": 7725},
    {"epoch": 0.048076326596444216, "grad_norm": 0.10800398141145706, "learning_rate": 0.0015, "loss": 2.8138, "step": 7750},
    {"epoch": 0.04823141152094887, "grad_norm": 0.10268035531044006, "learning_rate": 0.0015, "loss": 2.8074, "step": 7775},
    {"epoch": 0.04838649644545353, "grad_norm": 0.145925372838974, "learning_rate": 0.0015, "loss": 2.8161, "step": 7800},
    {"epoch": 0.04838649644545353, "eval_loss": 4.628528118133545, "perplexity": 102.36328887939453, "step": 7800},
    {"epoch": 0.04854158136995819, "grad_norm": 0.1422831267118454, "learning_rate": 0.0015, "loss": 2.8179, "step": 7825},
    {"epoch": 0.04869666629446285, "grad_norm": 0.10019826889038086, "learning_rate": 0.0015, "loss": 2.8228, "step": 7850},
    {"epoch": 0.04885175121896751, "grad_norm": 0.12028387933969498, "learning_rate": 0.0015, "loss": 2.8359, "step": 7875},
    {"epoch": 0.04900683614347216, "grad_norm": 0.08171118795871735, "learning_rate": 0.0015, "loss": 2.7829, "step": 7900},
    {"epoch": 0.04916192106797682, "grad_norm": 0.138522207736969, "learning_rate": 0.0015, "loss": 2.7992, "step": 7925},
    {"epoch": 0.04931700599248148, "grad_norm": 0.10419227927923203, "learning_rate": 0.0015, "loss": 2.8097, "step": 7950},
    {"epoch": 0.04947209091698614, "grad_norm": 0.1020691841840744, "learning_rate": 0.0015, "loss": 2.8152, "step": 7975},
    {"epoch": 0.0496271758414908, "grad_norm": 0.12423787266016006, "learning_rate": 0.0015, "loss": 2.7966, "step": 8000},
    {"epoch": 0.0496271758414908, "eval_loss": 4.6273722648620605, "perplexity": 102.24504089355469, "step": 8000},
    {"epoch": 0.049782260765995456, "grad_norm": 0.15230977535247803, "learning_rate": 0.0015, "loss": 2.7575, "step": 8025},
    {"epoch": 0.049937345690500116, "grad_norm": 0.12649676203727722, "learning_rate": 0.0015, "loss": 2.7897, "step": 8050},
    {"epoch": 0.05009243061500478, "grad_norm": 0.11257271468639374, "learning_rate": 0.0015, "loss": 2.8115, "step": 8075},
    {"epoch": 0.05024751553950944, "grad_norm": 0.09349871426820755, "learning_rate": 0.0015, "loss": 2.8041, "step": 8100},
    {"epoch": 0.0504026004640141, "grad_norm": 0.14108401536941528, "learning_rate": 0.0015, "loss": 2.7772, "step": 8125},
    {"epoch": 0.05055768538851875, "grad_norm": 0.17286863923072815, "learning_rate": 0.0015, "loss": 2.8197, "step": 8150},
    {"epoch": 0.05071277031302341, "grad_norm": 0.10759209096431732, "learning_rate": 0.0015, "loss": 2.8396, "step": 8175},
    {"epoch": 0.05086785523752807, "grad_norm": 0.10236554592847824, "learning_rate": 0.0015, "loss": 2.8175, "step": 8200},
    {"epoch": 0.05086785523752807, "eval_loss": 4.610519886016846, "perplexity": 100.5363998413086, "step": 8200},
    {"epoch": 0.05102294016203273, "grad_norm": 0.12348885089159012, "learning_rate": 0.0015, "loss": 2.8139, "step": 8225},
    {"epoch": 0.05117802508653739, "grad_norm": 0.10251584649085999, "learning_rate": 0.0015, "loss": 2.8436, "step": 8250},
    {"epoch": 0.051333110011042044, "grad_norm": 0.10069389641284943, "learning_rate": 0.0015, "loss": 2.8409, "step": 8275},
    {"epoch": 0.051488194935546704, "grad_norm": 0.1546829789876938, "learning_rate": 0.0015, "loss": 2.8199, "step": 8300},
    {"epoch": 0.051643279860051364, "grad_norm": 0.10704527795314789, "learning_rate": 0.0015, "loss": 2.7721, "step": 8325},
    {"epoch": 0.051798364784556024, "grad_norm": 0.12251198291778564, "learning_rate": 0.0015, "loss": 2.8175, "step": 8350},
    {"epoch": 0.051953449709060684, "grad_norm": 0.11113474518060684, "learning_rate": 0.0015, "loss": 2.8085, "step": 8375},
    {"epoch": 0.05210853463356534, "grad_norm": 0.1341187059879303, "learning_rate": 0.0015, "loss": 2.8169, "step": 8400},
    {"epoch": 0.05210853463356534, "eval_loss": 4.610434532165527, "perplexity": 100.52782440185547, "step": 8400},
    {"epoch": 0.05226361955807, "grad_norm": 0.16195224225521088, "learning_rate": 0.0015, "loss": 2.8266, "step": 8425},
    {"epoch": 0.05241870448257466, "grad_norm": 0.1637653261423111, "learning_rate": 0.0015, "loss": 2.8106, "step": 8450},
    {"epoch": 0.05257378940707932, "grad_norm": 0.10014921426773071, "learning_rate": 0.0015, "loss": 2.8103, "step": 8475},
    {"epoch": 0.05272887433158398, "grad_norm": 0.11419603228569031, "learning_rate": 0.0015, "loss": 2.7965, "step": 8500},
    {"epoch": 0.05288395925608863, "grad_norm": 0.08137035369873047, "learning_rate": 0.0015, "loss": 2.7802, "step": 8525},
    {"epoch": 0.05303904418059329, "grad_norm": 0.08078640699386597, "learning_rate": 0.0015, "loss": 2.7819, "step": 8550},
    {"epoch": 0.05319412910509795, "grad_norm": 0.13133442401885986, "learning_rate": 0.0015, "loss": 2.83, "step": 8575},
    {"epoch": 0.05334921402960261, "grad_norm": 0.08819993585348129, "learning_rate": 0.0015, "loss": 2.833, "step": 8600},
    {"epoch": 0.05334921402960261, "eval_loss": 4.603670120239258, "perplexity": 99.85010528564453, "step": 8600},
    {"epoch": 0.05350429895410727, "grad_norm": 0.14662431180477142, "learning_rate": 0.0015, "loss": 2.8201, "step": 8625},
    {"epoch": 0.05365938387861193, "grad_norm": 0.10400764644145966, "learning_rate": 0.0015, "loss": 2.7944, "step": 8650},
    {"epoch": 0.053814468803116584, "grad_norm": 0.2790142297744751, "learning_rate": 0.0015, "loss": 2.8307, "step": 8675},
    {"epoch": 0.053969553727621244, "grad_norm": 0.13645683228969574, "learning_rate": 0.0015, "loss": 2.7904, "step": 8700},
    {"epoch": 0.054124638652125905, "grad_norm": 0.09604925662279129, "learning_rate": 0.0015, "loss": 2.76, "step": 8725},
    {"epoch": 0.054279723576630565, "grad_norm": 0.07631650567054749, "learning_rate": 0.0015, "loss": 2.7955, "step": 8750},
    {"epoch": 0.054434808501135225, "grad_norm": 0.13132531940937042, "learning_rate": 0.0015, "loss": 2.8308, "step": 8775},
    {"epoch": 0.05458989342563988, "grad_norm": 0.08334681391716003, "learning_rate": 0.0015, "loss": 2.755, "step": 8800},
    {"epoch": 0.05458989342563988, "eval_loss": 4.597860336303711, "perplexity": 99.27168273925781, "step": 8800},
    {"epoch": 0.05474497835014454, "grad_norm": 0.10585317760705948, "learning_rate": 0.0015, "loss": 2.7708, "step": 8825},
    {"epoch": 0.0549000632746492, "grad_norm": 0.08953095227479935, "learning_rate": 0.0015, "loss": 2.7622, "step": 8850},
    {"epoch": 0.05505514819915386, "grad_norm": 0.10430523008108139, "learning_rate": 0.0015, "loss": 2.8255, "step": 8875},
    {"epoch": 0.05521023312365852, "grad_norm": 0.08961856365203857, "learning_rate": 0.0015, "loss": 2.7835, "step": 8900},
    {"epoch": 0.05536531804816317, "grad_norm": 0.13602201640605927, "learning_rate": 0.0015, "loss": 2.813, "step": 8925},
    {"epoch": 0.05552040297266783, "grad_norm": 0.1858643889427185, "learning_rate": 0.0015, "loss": 2.8296, "step": 8950},
{ |
|
"epoch": 0.05567548789717249, |
|
"grad_norm": 0.12873806059360504, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7669, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 0.05583057282167715, |
|
"grad_norm": 0.09891733527183533, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7829, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.05583057282167715, |
|
"eval_loss": 4.606179714202881, |
|
"perplexity": 100.10100555419922, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.05598565774618181, |
|
"grad_norm": 0.1619413048028946, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7885, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 0.056140742670686465, |
|
"grad_norm": 0.1223379522562027, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7829, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.056295827595191125, |
|
"grad_norm": 0.10872245579957962, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7962, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 0.056450912519695785, |
|
"grad_norm": 0.11461862176656723, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7476, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.056605997444200445, |
|
"grad_norm": 0.08933119475841522, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7745, |
|
"step": 9125 |
|
}, |
|
{ |
|
"epoch": 0.056761082368705106, |
|
"grad_norm": 0.12911683320999146, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8029, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.05691616729320976, |
|
"grad_norm": 0.13963252305984497, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7931, |
|
"step": 9175 |
|
}, |
|
{ |
|
"epoch": 0.05707125221771442, |
|
"grad_norm": 0.13462606072425842, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7771, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.05707125221771442, |
|
"eval_loss": 4.619841575622559, |
|
"perplexity": 101.47795104980469, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.05722633714221908, |
|
"grad_norm": 0.12551379203796387, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7934, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 0.05738142206672374, |
|
"grad_norm": 0.12379872798919678, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7882, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.0575365069912284, |
|
"grad_norm": 0.0940781831741333, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7658, |
|
"step": 9275 |
|
}, |
|
{ |
|
"epoch": 0.05769159191573305, |
|
"grad_norm": 0.14165829122066498, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7973, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.05784667684023771, |
|
"grad_norm": 0.10727201402187347, |
|
"learning_rate": 0.0015, |
|
"loss": 2.815, |
|
"step": 9325 |
|
}, |
|
{ |
|
"epoch": 0.05800176176474237, |
|
"grad_norm": 0.1628653109073639, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7854, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.05815684668924703, |
|
"grad_norm": 0.09925588220357895, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7578, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 0.05831193161375169, |
|
"grad_norm": 0.1587476134300232, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7296, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.05831193161375169, |
|
"eval_loss": 4.604221343994141, |
|
"perplexity": 99.90515899658203, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.058467016538256346, |
|
"grad_norm": 0.10519708693027496, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7712, |
|
"step": 9425 |
|
}, |
|
{ |
|
"epoch": 0.058622101462761006, |
|
"grad_norm": 0.10321429371833801, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7281, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.058777186387265666, |
|
"grad_norm": 0.20060209929943085, |
|
"learning_rate": 0.0015, |
|
"loss": 2.807, |
|
"step": 9475 |
|
}, |
|
{ |
|
"epoch": 0.058932271311770326, |
|
"grad_norm": 0.10847010463476181, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8078, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.059087356236274986, |
|
"grad_norm": 0.11248752474784851, |
|
"learning_rate": 0.0015, |
|
"loss": 2.796, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 0.059242441160779646, |
|
"grad_norm": 0.13171915709972382, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7658, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.0593975260852843, |
|
"grad_norm": 0.12041529268026352, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7507, |
|
"step": 9575 |
|
}, |
|
{ |
|
"epoch": 0.05955261100978896, |
|
"grad_norm": 0.11275593191385269, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8022, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.05955261100978896, |
|
"eval_loss": 4.5886077880859375, |
|
"perplexity": 98.3573989868164, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.05970769593429362, |
|
"grad_norm": 0.1715971678495407, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8003, |
|
"step": 9625 |
|
}, |
|
{ |
|
"epoch": 0.05986278085879828, |
|
"grad_norm": 0.1223614364862442, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8012, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.06001786578330294, |
|
"grad_norm": 0.114704430103302, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7963, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 0.06017295070780759, |
|
"grad_norm": 0.10282139480113983, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7965, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.06032803563231225, |
|
"grad_norm": 0.10494767129421234, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7698, |
|
"step": 9725 |
|
}, |
|
{ |
|
"epoch": 0.06048312055681691, |
|
"grad_norm": 0.0908605083823204, |
|
"learning_rate": 0.0015, |
|
"loss": 2.749, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.06063820548132157, |
|
"grad_norm": 0.0847998857498169, |
|
"learning_rate": 0.0015, |
|
"loss": 2.838, |
|
"step": 9775 |
|
}, |
|
{ |
|
"epoch": 0.060793290405826234, |
|
"grad_norm": 0.24615754187107086, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8117, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.060793290405826234, |
|
"eval_loss": 4.593789100646973, |
|
"perplexity": 98.86833953857422, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.06094837533033089, |
|
"grad_norm": 0.0959208682179451, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7845, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 0.06110346025483555, |
|
"grad_norm": 0.09963307529687881, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8296, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.06125854517934021, |
|
"grad_norm": 0.1115136444568634, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7586, |
|
"step": 9875 |
|
}, |
|
{ |
|
"epoch": 0.06141363010384487, |
|
"grad_norm": 0.13883067667484283, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7978, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.06156871502834953, |
|
"grad_norm": 0.2048570066690445, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8397, |
|
"step": 9925 |
|
}, |
|
{ |
|
"epoch": 0.06172379995285418, |
|
"grad_norm": 0.1306881606578827, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8084, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.06187888487735884, |
|
"grad_norm": 0.18285603821277618, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7989, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 0.0620339698018635, |
|
"grad_norm": 0.1109723299741745, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8064, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0620339698018635, |
|
"eval_loss": 4.5877556800842285, |
|
"perplexity": 98.27362823486328, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.06218905472636816, |
|
"grad_norm": 0.12350066751241684, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7684, |
|
"step": 10025 |
|
}, |
|
{ |
|
"epoch": 0.06234413965087282, |
|
"grad_norm": 0.11565285176038742, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7748, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.062499224575377474, |
|
"grad_norm": 0.1117839589715004, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8044, |
|
"step": 10075 |
|
}, |
|
{ |
|
"epoch": 0.06265430949988214, |
|
"grad_norm": 0.1102209985256195, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7844, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.0628093944243868, |
|
"grad_norm": 0.10270575433969498, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7685, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 0.06296447934889145, |
|
"grad_norm": 0.09842963516712189, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8048, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.06311956427339611, |
|
"grad_norm": 0.10446088761091232, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8051, |
|
"step": 10175 |
|
}, |
|
{ |
|
"epoch": 0.06327464919790077, |
|
"grad_norm": 0.14759957790374756, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8089, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.06327464919790077, |
|
"eval_loss": 4.588883399963379, |
|
"perplexity": 98.38451385498047, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.06342973412240543, |
|
"grad_norm": 0.12910906970500946, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8193, |
|
"step": 10225 |
|
}, |
|
{ |
|
"epoch": 0.06358481904691009, |
|
"grad_norm": 0.13095402717590332, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7509, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.06373990397141474, |
|
"grad_norm": 0.16069594025611877, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7911, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 0.06389498889591941, |
|
"grad_norm": 0.08322907984256744, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8025, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.06405007382042406, |
|
"grad_norm": 0.2328927367925644, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7863, |
|
"step": 10325 |
|
}, |
|
{ |
|
"epoch": 0.06420515874492873, |
|
"grad_norm": 0.09172859787940979, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8101, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.06436024366943338, |
|
"grad_norm": 0.13464473187923431, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7718, |
|
"step": 10375 |
|
}, |
|
{ |
|
"epoch": 0.06451532859393803, |
|
"grad_norm": 0.1284090131521225, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7667, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.06451532859393803, |
|
"eval_loss": 4.59510612487793, |
|
"perplexity": 98.99864196777344, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.0646704135184427, |
|
"grad_norm": 0.13565704226493835, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7552, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 0.06482549844294735, |
|
"grad_norm": 0.1089024469256401, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7838, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.06498058336745202, |
|
"grad_norm": 0.11035135388374329, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7986, |
|
"step": 10475 |
|
}, |
|
{ |
|
"epoch": 0.06513566829195667, |
|
"grad_norm": 0.08107917010784149, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7791, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.06529075321646133, |
|
"grad_norm": 0.10200012475252151, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7636, |
|
"step": 10525 |
|
}, |
|
{ |
|
"epoch": 0.065445838140966, |
|
"grad_norm": 0.08427785336971283, |
|
"learning_rate": 0.0015, |
|
"loss": 2.794, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.06560092306547065, |
|
"grad_norm": 0.10828018933534622, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7778, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 0.06575600798997532, |
|
"grad_norm": 0.12101134657859802, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7469, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.06575600798997532, |
|
"eval_loss": 4.597805500030518, |
|
"perplexity": 99.2662353515625, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.06591109291447997, |
|
"grad_norm": 0.11220554262399673, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7294, |
|
"step": 10625 |
|
}, |
|
{ |
|
"epoch": 0.06606617783898462, |
|
"grad_norm": 0.13899332284927368, |
|
"learning_rate": 0.0015, |
|
"loss": 2.763, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.06622126276348929, |
|
"grad_norm": 0.11773937195539474, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7866, |
|
"step": 10675 |
|
}, |
|
{ |
|
"epoch": 0.06637634768799394, |
|
"grad_norm": 0.11059702187776566, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8076, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.06653143261249861, |
|
"grad_norm": 0.1251254379749298, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7674, |
|
"step": 10725 |
|
}, |
|
{ |
|
"epoch": 0.06668651753700326, |
|
"grad_norm": 0.12195979803800583, |
|
"learning_rate": 0.0015, |
|
"loss": 2.768, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.06684160246150792, |
|
"grad_norm": 0.1487302929162979, |
|
"learning_rate": 0.0015, |
|
"loss": 2.762, |
|
"step": 10775 |
|
}, |
|
{ |
|
"epoch": 0.06699668738601258, |
|
"grad_norm": 0.1315547525882721, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7348, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.06699668738601258, |
|
"eval_loss": 4.566490650177002, |
|
"perplexity": 96.20589447021484, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.06715177231051724, |
|
"grad_norm": 0.13864025473594666, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7517, |
|
"step": 10825 |
|
}, |
|
{ |
|
"epoch": 0.0673068572350219, |
|
"grad_norm": 0.08808566629886627, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7718, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.06746194215952656, |
|
"grad_norm": 0.115321584045887, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7007, |
|
"step": 10875 |
|
}, |
|
{ |
|
"epoch": 0.06761702708403121, |
|
"grad_norm": 0.10276370495557785, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7692, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.06777211200853588, |
|
"grad_norm": 0.09534792602062225, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8186, |
|
"step": 10925 |
|
}, |
|
{ |
|
"epoch": 0.06792719693304053, |
|
"grad_norm": 0.14239507913589478, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7801, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.0680822818575452, |
|
"grad_norm": 0.11848737299442291, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7394, |
|
"step": 10975 |
|
}, |
|
{ |
|
"epoch": 0.06823736678204985, |
|
"grad_norm": 0.09367898106575012, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8043, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.06823736678204985, |
|
"eval_loss": 4.5800089836120605, |
|
"perplexity": 97.51527404785156, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0683924517065545, |
|
"grad_norm": 0.1494915634393692, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7841, |
|
"step": 11025 |
|
}, |
|
{ |
|
"epoch": 0.06854753663105917, |
|
"grad_norm": 0.09982737898826599, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7933, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.06870262155556382, |
|
"grad_norm": 0.12379477173089981, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7419, |
|
"step": 11075 |
|
}, |
|
{ |
|
"epoch": 0.06885770648006849, |
|
"grad_norm": 0.11405149102210999, |
|
"learning_rate": 0.0015, |
|
"loss": 2.763, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.06901279140457314, |
|
"grad_norm": 0.09574620425701141, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7961, |
|
"step": 11125 |
|
}, |
|
{ |
|
"epoch": 0.06916787632907781, |
|
"grad_norm": 0.2947874963283539, |
|
"learning_rate": 0.0015, |
|
"loss": 2.789, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.06932296125358246, |
|
"grad_norm": 0.09219149500131607, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7951, |
|
"step": 11175 |
|
}, |
|
{ |
|
"epoch": 0.06947804617808712, |
|
"grad_norm": 0.11840498447418213, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7717, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.06947804617808712, |
|
"eval_loss": 4.564184188842773, |
|
"perplexity": 95.98426055908203, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.06963313110259178, |
|
"grad_norm": 0.09422053396701813, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7976, |
|
"step": 11225 |
|
}, |
|
{ |
|
"epoch": 0.06978821602709644, |
|
"grad_norm": 0.11220031976699829, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7634, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.0699433009516011, |
|
"grad_norm": 0.10228817909955978, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7256, |
|
"step": 11275 |
|
}, |
|
{ |
|
"epoch": 0.07009838587610576, |
|
"grad_norm": 0.0929483100771904, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8005, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.07025347080061041, |
|
"grad_norm": 0.11491668224334717, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7504, |
|
"step": 11325 |
|
}, |
|
{ |
|
"epoch": 0.07040855572511508, |
|
"grad_norm": 0.15256111323833466, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7609, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.07056364064961973, |
|
"grad_norm": 0.11576159298419952, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7742, |
|
"step": 11375 |
|
}, |
|
{ |
|
"epoch": 0.0707187255741244, |
|
"grad_norm": 0.08809765428304672, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7891, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.0707187255741244, |
|
"eval_loss": 4.568883895874023, |
|
"perplexity": 96.43641662597656, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.07087381049862905, |
|
"grad_norm": 0.08563827723264694, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8066, |
|
"step": 11425 |
|
}, |
|
{ |
|
"epoch": 0.0710288954231337, |
|
"grad_norm": 0.18896931409835815, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8055, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.07118398034763837, |
|
"grad_norm": 0.13940319418907166, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7766, |
|
"step": 11475 |
|
}, |
|
{ |
|
"epoch": 0.07133906527214302, |
|
"grad_norm": 0.09737322479486465, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7945, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.07149415019664769, |
|
"grad_norm": 0.11357785761356354, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7799, |
|
"step": 11525 |
|
}, |
|
{ |
|
"epoch": 0.07164923512115234, |
|
"grad_norm": 0.10513681918382645, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7627, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.071804320045657, |
|
"grad_norm": 0.1434682458639145, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8055, |
|
"step": 11575 |
|
}, |
|
{ |
|
"epoch": 0.07195940497016166, |
|
"grad_norm": 0.10169105976819992, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7832, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.07195940497016166, |
|
"eval_loss": 4.560365676879883, |
|
"perplexity": 95.61843872070312, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.07211448989466632, |
|
"grad_norm": 0.1385478526353836, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7548, |
|
"step": 11625 |
|
}, |
|
{ |
|
"epoch": 0.07226957481917098, |
|
"grad_norm": 0.1300746351480484, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7553, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.07242465974367564, |
|
"grad_norm": 0.11596991866827011, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8095, |
|
"step": 11675 |
|
}, |
|
{ |
|
"epoch": 0.07257974466818029, |
|
"grad_norm": 0.11611347645521164, |
|
"learning_rate": 0.0015, |
|
"loss": 2.76, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.07273482959268496, |
|
"grad_norm": 0.11249697953462601, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7827, |
|
"step": 11725 |
|
}, |
|
{ |
|
"epoch": 0.07288991451718961, |
|
"grad_norm": 0.1243973895907402, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7754, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.07304499944169428, |
|
"grad_norm": 0.08843350410461426, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8079, |
|
"step": 11775 |
|
}, |
|
{ |
|
"epoch": 0.07320008436619893, |
|
"grad_norm": 0.09881053864955902, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7961, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.07320008436619893, |
|
"eval_loss": 4.567913055419922, |
|
"perplexity": 96.34283447265625, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.07335516929070358, |
|
"grad_norm": 0.08978071063756943, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7786, |
|
"step": 11825 |
|
}, |
|
{ |
|
"epoch": 0.07351025421520825, |
|
"grad_norm": 0.1376107782125473, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7931, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.0736653391397129, |
|
"grad_norm": 0.09934777021408081, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7787, |
|
"step": 11875 |
|
}, |
|
{ |
|
"epoch": 0.07382042406421757, |
|
"grad_norm": 0.17031100392341614, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7997, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.07397550898872222, |
|
"grad_norm": 0.13974526524543762, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7975, |
|
"step": 11925 |
|
}, |
|
{ |
|
"epoch": 0.07413059391322688, |
|
"grad_norm": 0.12611718475818634, |
|
"learning_rate": 0.0015, |
|
"loss": 2.792, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.07428567883773154, |
|
"grad_norm": 0.15177124738693237, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7904, |
|
"step": 11975 |
|
}, |
|
{ |
|
"epoch": 0.0744407637622362, |
|
"grad_norm": 0.1411113739013672, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7677, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.0744407637622362, |
|
"eval_loss": 4.5571770668029785, |
|
"perplexity": 95.31403350830078, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.07459584868674086, |
|
"grad_norm": 0.08981940150260925, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7765, |
|
"step": 12025 |
|
}, |
|
{ |
|
"epoch": 0.07475093361124552, |
|
"grad_norm": 0.09796686470508575, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7503, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.07490601853575017, |
|
"grad_norm": 0.1125386580824852, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7263, |
|
"step": 12075 |
|
}, |
|
{ |
|
"epoch": 0.07506110346025484, |
|
"grad_norm": 0.11394508183002472, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7855, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.07521618838475949, |
|
"grad_norm": 0.11744117736816406, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7698, |
|
"step": 12125 |
|
}, |
|
{ |
|
"epoch": 0.07537127330926416, |
|
"grad_norm": 0.17264704406261444, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7592, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.07552635823376881, |
|
"grad_norm": 0.10691671818494797, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7519, |
|
"step": 12175 |
|
}, |
|
{ |
|
"epoch": 0.07568144315827346, |
|
"grad_norm": 0.1205432191491127, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7676, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.07568144315827346, |
|
"eval_loss": 4.544521808624268, |
|
"perplexity": 94.11540985107422, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.07583652808277813, |
|
"grad_norm": 0.1253867894411087, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7698, |
|
"step": 12225 |
|
}, |
|
{ |
|
"epoch": 0.07599161300728279, |
|
"grad_norm": 0.1450471729040146, |
|
"learning_rate": 0.0015, |
|
"loss": 2.77, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.07614669793178745, |
|
"grad_norm": 0.17055222392082214, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7352, |
|
"step": 12275 |
|
}, |
|
{ |
|
"epoch": 0.0763017828562921, |
|
"grad_norm": 0.10687011480331421, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7988, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.07645686778079676, |
|
"grad_norm": 0.15520496666431427, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7828, |
|
"step": 12325 |
|
}, |
|
{ |
|
"epoch": 0.07661195270530143, |
|
"grad_norm": 0.09279755502939224, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7222, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.07676703762980608, |
|
"grad_norm": 0.18024928867816925, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7555, |
|
"step": 12375 |
|
}, |
|
{ |
|
"epoch": 0.07692212255431075, |
|
"grad_norm": 0.13292630016803741, |
|
"learning_rate": 0.0015, |
|
"loss": 2.733, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.07692212255431075, |
|
"eval_loss": 4.538700103759766, |
|
"perplexity": 93.569091796875, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.0770772074788154, |
|
"grad_norm": 0.09353446960449219, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7768, |
|
"step": 12425 |
|
}, |
|
{ |
|
"epoch": 0.07723229240332005, |
|
"grad_norm": 0.0946316123008728, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7321, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.07738737732782472, |
|
"grad_norm": 0.11109050363302231, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7607, |
|
"step": 12475 |
|
}, |
|
{ |
|
"epoch": 0.07754246225232937, |
|
"grad_norm": 0.10057735443115234, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7707, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.07769754717683404, |
|
"grad_norm": 0.1466909795999527, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7434, |
|
"step": 12525 |
|
}, |
|
{ |
|
"epoch": 0.07785263210133869, |
|
"grad_norm": 0.09831534326076508, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7858, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.07800771702584335, |
|
"grad_norm": 0.13202817738056183, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7884, |
|
"step": 12575 |
|
}, |
|
{ |
|
"epoch": 0.07816280195034801, |
|
"grad_norm": 0.10797799378633499, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7788, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.07816280195034801, |
|
"eval_loss": 4.5452494621276855, |
|
"perplexity": 94.18392181396484, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.07831788687485267, |
|
"grad_norm": 0.10239394754171371, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7803, |
|
"step": 12625 |
|
}, |
|
{ |
|
"epoch": 0.07847297179935733, |
|
"grad_norm": 0.10468672215938568, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7449, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.07862805672386199, |
|
"grad_norm": 0.13691146671772003, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7837, |
|
"step": 12675 |
|
}, |
|
{ |
|
"epoch": 0.07878314164836664, |
|
"grad_norm": 0.16976097226142883, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7557, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.0789382265728713, |
|
"grad_norm": 0.09623986482620239, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7576, |
|
"step": 12725 |
|
}, |
|
{ |
|
"epoch": 0.07909331149737596, |
|
"grad_norm": 0.11203131079673767, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7846, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.07924839642188063, |
|
"grad_norm": 0.12257611751556396, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8015, |
|
"step": 12775 |
|
}, |
|
{ |
|
"epoch": 0.07940348134638528, |
|
"grad_norm": 0.08369628340005875, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7616, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.07940348134638528, |
|
"eval_loss": 4.548933506011963, |
|
"perplexity": 94.53153991699219, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.07955856627088993, |
|
"grad_norm": 0.12149519473314285, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7651, |
|
"step": 12825 |
|
}, |
|
{ |
|
"epoch": 0.0797136511953946, |
|
"grad_norm": 0.09911686927080154, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7964, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.07986873611989925, |
|
"grad_norm": 0.09883631020784378, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7461, |
|
"step": 12875 |
|
}, |
|
{ |
|
"epoch": 0.08002382104440392, |
|
"grad_norm": 0.08828576654195786, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7735, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.08017890596890857, |
|
"grad_norm": 0.18119321763515472, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7863, |
|
"step": 12925 |
|
}, |
|
{ |
|
"epoch": 0.08033399089341323, |
|
"grad_norm": 0.09123501181602478, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7559, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.0804890758179179, |
|
"grad_norm": 0.18334759771823883, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7357, |
|
"step": 12975 |
|
}, |
|
{ |
|
"epoch": 0.08064416074242255, |
|
"grad_norm": 0.08934136480093002, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8003, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.08064416074242255, |
|
"eval_loss": 4.537932395935059, |
|
"perplexity": 93.49728393554688, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.08079924566692721, |
|
"grad_norm": 0.117793008685112, |
|
"learning_rate": 0.0015, |
|
"loss": 2.738, |
|
"step": 13025 |
|
}, |
|
{ |
|
"epoch": 0.08095433059143187, |
|
"grad_norm": 0.1012151837348938, |
|
"learning_rate": 0.0015, |
|
"loss": 2.767, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.08110941551593653, |
|
"grad_norm": 0.1099851131439209, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7899, |
|
"step": 13075 |
|
}, |
|
{ |
|
"epoch": 0.08126450044044119, |
|
"grad_norm": 0.105575330555439, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7857, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.08141958536494584, |
|
"grad_norm": 0.11926279962062836, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7821, |
|
"step": 13125 |
|
}, |
|
{ |
|
"epoch": 0.08157467028945051, |
|
"grad_norm": 0.1669924259185791, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7673, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.08172975521395516, |
|
"grad_norm": 0.11445988714694977, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8081, |
|
"step": 13175 |
|
}, |
|
{ |
|
"epoch": 0.08188484013845983, |
|
"grad_norm": 0.09700124710798264, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7841, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.08188484013845983, |
|
"eval_loss": 4.540359973907471, |
|
"perplexity": 93.72453308105469, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.08203992506296448, |
|
"grad_norm": 0.11112058907747269, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7471, |
|
"step": 13225 |
|
}, |
|
{ |
|
"epoch": 0.08219500998746913, |
|
"grad_norm": 0.17890195548534393, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7898, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.0823500949119738, |
|
"grad_norm": 0.12197751551866531, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7328, |
|
"step": 13275 |
|
}, |
|
{ |
|
"epoch": 0.08250517983647845, |
|
"grad_norm": 0.11677111685276031, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7849, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.08266026476098312, |
|
"grad_norm": 0.15514017641544342, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7561, |
|
"step": 13325 |
|
}, |
|
{ |
|
"epoch": 0.08281534968548777, |
|
"grad_norm": 0.10389192402362823, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7611, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.08297043460999243, |
|
"grad_norm": 0.10176412016153336, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7793, |
|
"step": 13375 |
|
}, |
|
{ |
|
"epoch": 0.0831255195344971, |
|
"grad_norm": 0.1043052077293396, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7375, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.0831255195344971, |
|
"eval_loss": 4.5388336181640625, |
|
"perplexity": 93.58158111572266, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.08328060445900175, |
|
"grad_norm": 0.08918718248605728, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7465, |
|
"step": 13425 |
|
}, |
|
{ |
|
"epoch": 0.08343568938350641, |
|
"grad_norm": 0.10008233785629272, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7776, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 0.08359077430801107, |
|
"grad_norm": 0.10228800773620605, |
|
"learning_rate": 0.0015, |
|
"loss": 2.756, |
|
"step": 13475 |
|
}, |
|
{ |
|
"epoch": 0.08374585923251572, |
|
"grad_norm": 0.0868915542960167, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7556, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.08390094415702039, |
|
"grad_norm": 0.11076166480779648, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6975, |
|
"step": 13525 |
|
}, |
|
{ |
|
"epoch": 0.08405602908152504, |
|
"grad_norm": 0.13617128133773804, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7643, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 0.08421111400602971, |
|
"grad_norm": 0.15346932411193848, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7966, |
|
"step": 13575 |
|
}, |
|
{ |
|
"epoch": 0.08436619893053436, |
|
"grad_norm": 0.17080894112586975, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7636, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.08436619893053436, |
|
"eval_loss": 4.513378620147705, |
|
"perplexity": 91.22953033447266, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.08452128385503901, |
|
"grad_norm": 0.11548548936843872, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7729, |
|
"step": 13625 |
|
}, |
|
{ |
|
"epoch": 0.08467636877954368, |
|
"grad_norm": 0.14650912582874298, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7063, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 0.08483145370404833, |
|
"grad_norm": 0.09750749915838242, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7648, |
|
"step": 13675 |
|
}, |
|
{ |
|
"epoch": 0.084986538628553, |
|
"grad_norm": 0.18051239848136902, |
|
"learning_rate": 0.0015, |
|
"loss": 2.754, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.08514162355305765, |
|
"grad_norm": 0.21637938916683197, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7529, |
|
"step": 13725 |
|
}, |
|
{ |
|
"epoch": 0.08529670847756231, |
|
"grad_norm": 0.10037226974964142, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7638, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 0.08545179340206698, |
|
"grad_norm": 0.1033267229795456, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7713, |
|
"step": 13775 |
|
}, |
|
{ |
|
"epoch": 0.08560687832657163, |
|
"grad_norm": 0.09179462492465973, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8278, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.08560687832657163, |
|
"eval_loss": 4.508410453796387, |
|
"perplexity": 90.77741241455078, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.0857619632510763, |
|
"grad_norm": 0.09874552488327026, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7544, |
|
"step": 13825 |
|
}, |
|
{ |
|
"epoch": 0.08591704817558095, |
|
"grad_norm": 0.17807777225971222, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7401, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 0.0860721331000856, |
|
"grad_norm": 0.14388497173786163, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7879, |
|
"step": 13875 |
|
}, |
|
{ |
|
"epoch": 0.08622721802459027, |
|
"grad_norm": 0.13081450760364532, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7162, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.08638230294909492, |
|
"grad_norm": 0.15077342092990875, |
|
"learning_rate": 0.0015, |
|
"loss": 2.757, |
|
"step": 13925 |
|
}, |
|
{ |
|
"epoch": 0.08653738787359959, |
|
"grad_norm": 0.11368410289287567, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7546, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 0.08669247279810424, |
|
"grad_norm": 0.16447153687477112, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7371, |
|
"step": 13975 |
|
}, |
|
{ |
|
"epoch": 0.0868475577226089, |
|
"grad_norm": 0.20563559234142303, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7474, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0868475577226089, |
|
"eval_loss": 4.525671005249023, |
|
"perplexity": 92.35787963867188, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.08700264264711356, |
|
"grad_norm": 0.10695035755634308, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7565, |
|
"step": 14025 |
|
}, |
|
{ |
|
"epoch": 0.08715772757161822, |
|
"grad_norm": 0.12368099391460419, |
|
"learning_rate": 0.0015, |
|
"loss": 2.784, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 0.08731281249612288, |
|
"grad_norm": 0.11491699516773224, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7477, |
|
"step": 14075 |
|
}, |
|
{ |
|
"epoch": 0.08746789742062754, |
|
"grad_norm": 0.10570378601551056, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7575, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.08762298234513219, |
|
"grad_norm": 0.09137633442878723, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7517, |
|
"step": 14125 |
|
}, |
|
{ |
|
"epoch": 0.08777806726963686, |
|
"grad_norm": 0.09999803453683853, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7446, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 0.08793315219414151, |
|
"grad_norm": 0.15709616243839264, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7606, |
|
"step": 14175 |
|
}, |
|
{ |
|
"epoch": 0.08808823711864618, |
|
"grad_norm": 0.10327859222888947, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7441, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.08808823711864618, |
|
"eval_loss": 4.521189212799072, |
|
"perplexity": 91.94487762451172, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.08824332204315083, |
|
"grad_norm": 0.1964125633239746, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7109, |
|
"step": 14225 |
|
}, |
|
{ |
|
"epoch": 0.08839840696765548, |
|
"grad_norm": 0.12792247533798218, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7401, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.08855349189216015, |
|
"grad_norm": 0.17532923817634583, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7609, |
|
"step": 14275 |
|
}, |
|
{ |
|
"epoch": 0.0887085768166648, |
|
"grad_norm": 0.096143439412117, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7749, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.08886366174116947, |
|
"grad_norm": 0.12778601050376892, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6981, |
|
"step": 14325 |
|
}, |
|
{ |
|
"epoch": 0.08901874666567412, |
|
"grad_norm": 0.1130848377943039, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7255, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 0.08917383159017878, |
|
"grad_norm": 0.0818464607000351, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7223, |
|
"step": 14375 |
|
}, |
|
{ |
|
"epoch": 0.08932891651468344, |
|
"grad_norm": 0.10516222566366196, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7672, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.08932891651468344, |
|
"eval_loss": 4.524067401885986, |
|
"perplexity": 92.20989227294922, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.0894840014391881, |
|
"grad_norm": 0.08912840485572815, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7349, |
|
"step": 14425 |
|
}, |
|
{ |
|
"epoch": 0.08963908636369276, |
|
"grad_norm": 0.11931388080120087, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7326, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 0.08979417128819742, |
|
"grad_norm": 0.12271756678819656, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7327, |
|
"step": 14475 |
|
}, |
|
{ |
|
"epoch": 0.08994925621270207, |
|
"grad_norm": 0.1567191183567047, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7573, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.09010434113720674, |
|
"grad_norm": 0.1841791719198227, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7582, |
|
"step": 14525 |
|
}, |
|
{ |
|
"epoch": 0.09025942606171139, |
|
"grad_norm": 0.12743189930915833, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8061, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 0.09041451098621606, |
|
"grad_norm": 0.11932828277349472, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7447, |
|
"step": 14575 |
|
}, |
|
{ |
|
"epoch": 0.09056959591072071, |
|
"grad_norm": 0.18284690380096436, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7436, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.09056959591072071, |
|
"eval_loss": 4.515897750854492, |
|
"perplexity": 91.45964050292969, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.09072468083522536, |
|
"grad_norm": 0.17987670004367828, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7831, |
|
"step": 14625 |
|
}, |
|
{ |
|
"epoch": 0.09087976575973003, |
|
"grad_norm": 0.10992395132780075, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7516, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 0.09103485068423468, |
|
"grad_norm": 0.09343726187944412, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7475, |
|
"step": 14675 |
|
}, |
|
{ |
|
"epoch": 0.09118993560873935, |
|
"grad_norm": 0.10370751470327377, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7518, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.091345020533244, |
|
"grad_norm": 0.11190348863601685, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7482, |
|
"step": 14725 |
|
}, |
|
{ |
|
"epoch": 0.09150010545774866, |
|
"grad_norm": 0.12450053542852402, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7726, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 0.09165519038225332, |
|
"grad_norm": 0.11882703006267548, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7318, |
|
"step": 14775 |
|
}, |
|
{ |
|
"epoch": 0.09181027530675798, |
|
"grad_norm": 0.1315181404352188, |
|
"learning_rate": 0.0015, |
|
"loss": 2.757, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.09181027530675798, |
|
"eval_loss": 4.521557807922363, |
|
"perplexity": 91.97877502441406, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.09196536023126264, |
|
"grad_norm": 0.18574784696102142, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7353, |
|
"step": 14825 |
|
}, |
|
{ |
|
"epoch": 0.0921204451557673, |
|
"grad_norm": 0.17665444314479828, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7687, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 0.09227553008027195, |
|
"grad_norm": 0.12507860362529755, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7386, |
|
"step": 14875 |
|
}, |
|
{ |
|
"epoch": 0.09243061500477662, |
|
"grad_norm": 0.10472691059112549, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7716, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.09258569992928127, |
|
"grad_norm": 0.10282575339078903, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7312, |
|
"step": 14925 |
|
}, |
|
{ |
|
"epoch": 0.09274078485378594, |
|
"grad_norm": 0.12706094980239868, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7995, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 0.09289586977829059, |
|
"grad_norm": 0.15283973515033722, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7313, |
|
"step": 14975 |
|
}, |
|
{ |
|
"epoch": 0.09305095470279524, |
|
"grad_norm": 0.12476324290037155, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7727, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.09305095470279524, |
|
"eval_loss": 4.547565937042236, |
|
"perplexity": 94.40234375, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.09320603962729991, |
|
"grad_norm": 0.12369734048843384, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7565, |
|
"step": 15025 |
|
}, |
|
{ |
|
"epoch": 0.09336112455180456, |
|
"grad_norm": 0.1322038471698761, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7588, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 0.09351620947630923, |
|
"grad_norm": 0.0926559790968895, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7393, |
|
"step": 15075 |
|
}, |
|
{ |
|
"epoch": 0.09367129440081388, |
|
"grad_norm": 0.17404210567474365, |
|
"learning_rate": 0.0015, |
|
"loss": 2.723, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.09382637932531855, |
|
"grad_norm": 0.10326647758483887, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7853, |
|
"step": 15125 |
|
}, |
|
{ |
|
"epoch": 0.0939814642498232, |
|
"grad_norm": 0.13869203627109528, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7535, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 0.09413654917432786, |
|
"grad_norm": 0.14325955510139465, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7597, |
|
"step": 15175 |
|
}, |
|
{ |
|
"epoch": 0.09429163409883252, |
|
"grad_norm": 0.11783768236637115, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7524, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.09429163409883252, |
|
"eval_loss": 4.5251593589782715, |
|
"perplexity": 92.31063842773438, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.09444671902333718, |
|
"grad_norm": 0.12261676043272018, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7279, |
|
"step": 15225 |
|
}, |
|
{ |
|
"epoch": 0.09460180394784184, |
|
"grad_norm": 0.09966279566287994, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8119, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 0.0947568888723465, |
|
"grad_norm": 0.1052974984049797, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7392, |
|
"step": 15275 |
|
}, |
|
{ |
|
"epoch": 0.09491197379685115, |
|
"grad_norm": 0.11074663698673248, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7319, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.09506705872135582, |
|
"grad_norm": 0.09762706607580185, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7806, |
|
"step": 15325 |
|
}, |
|
{ |
|
"epoch": 0.09522214364586047, |
|
"grad_norm": 0.08552476018667221, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7351, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 0.09537722857036514, |
|
"grad_norm": 0.13211695849895477, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7667, |
|
"step": 15375 |
|
}, |
|
{ |
|
"epoch": 0.09553231349486979, |
|
"grad_norm": 0.12074939906597137, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7614, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.09553231349486979, |
|
"eval_loss": 4.53213357925415, |
|
"perplexity": 92.95668029785156, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.09568739841937444, |
|
"grad_norm": 0.11755666136741638, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7101, |
|
"step": 15425 |
|
}, |
|
{ |
|
"epoch": 0.09584248334387911, |
|
"grad_norm": 0.10476246476173401, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7391, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 0.09599756826838376, |
|
"grad_norm": 0.10921350121498108, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7423, |
|
"step": 15475 |
|
}, |
|
{ |
|
"epoch": 0.09615265319288843, |
|
"grad_norm": 0.11517275124788284, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7374, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.09630773811739309, |
|
"grad_norm": 0.10500945895910263, |
|
"learning_rate": 0.0015, |
|
"loss": 2.73, |
|
"step": 15525 |
|
}, |
|
{ |
|
"epoch": 0.09646282304189774, |
|
"grad_norm": 0.0962584912776947, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7597, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 0.0966179079664024, |
|
"grad_norm": 0.1273050308227539, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7306, |
|
"step": 15575 |
|
}, |
|
{ |
|
"epoch": 0.09677299289090706, |
|
"grad_norm": 0.11249135434627533, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7859, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.09677299289090706, |
|
"eval_loss": 4.537318706512451, |
|
"perplexity": 93.43992614746094, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.09692807781541173, |
|
"grad_norm": 0.19111056625843048, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7386, |
|
"step": 15625 |
|
}, |
|
{ |
|
"epoch": 0.09708316273991638, |
|
"grad_norm": 0.10486472398042679, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7462, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 0.09723824766442103, |
|
"grad_norm": 0.1453208327293396, |
|
"learning_rate": 0.0015, |
|
"loss": 2.762, |
|
"step": 15675 |
|
}, |
|
{ |
|
"epoch": 0.0973933325889257, |
|
"grad_norm": 0.08459452539682388, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7353, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.09754841751343035, |
|
"grad_norm": 0.11150529980659485, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7617, |
|
"step": 15725 |
|
}, |
|
{ |
|
"epoch": 0.09770350243793502, |
|
"grad_norm": 0.11301703006029129, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7623, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.09785858736243967, |
|
"grad_norm": 0.16564789414405823, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7315, |
|
"step": 15775 |
|
}, |
|
{ |
|
"epoch": 0.09801367228694433, |
|
"grad_norm": 0.08968822658061981, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7842, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.09801367228694433, |
|
"eval_loss": 4.528219223022461, |
|
"perplexity": 92.5935287475586, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.09816875721144899, |
|
"grad_norm": 0.1233256533741951, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7584, |
|
"step": 15825 |
|
}, |
|
{ |
|
"epoch": 0.09832384213595365, |
|
"grad_norm": 0.18926863372325897, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7651, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 0.09847892706045831, |
|
"grad_norm": 0.0912550836801529, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7551, |
|
"step": 15875 |
|
}, |
|
{ |
|
"epoch": 0.09863401198496297, |
|
"grad_norm": 0.1443813592195511, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7378, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.09878909690946762, |
|
"grad_norm": 0.11620072275400162, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7706, |
|
"step": 15925 |
|
}, |
|
{ |
|
"epoch": 0.09894418183397229, |
|
"grad_norm": 0.10275860130786896, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7502, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 0.09909926675847694, |
|
"grad_norm": 0.1417694240808487, |
|
"learning_rate": 0.0015, |
|
"loss": 2.706, |
|
"step": 15975 |
|
}, |
|
{ |
|
"epoch": 0.0992543516829816, |
|
"grad_norm": 0.1121877133846283, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7537, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.0992543516829816, |
|
"eval_loss": 4.520648956298828, |
|
"perplexity": 91.89521789550781, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.09940943660748626, |
|
"grad_norm": 0.10022582858800888, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7213, |
|
"step": 16025 |
|
}, |
|
{ |
|
"epoch": 0.09956452153199091, |
|
"grad_norm": 0.09722616523504257, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7437, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 0.09971960645649558, |
|
"grad_norm": 0.11053729802370071, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7495, |
|
"step": 16075 |
|
}, |
|
{ |
|
"epoch": 0.09987469138100023, |
|
"grad_norm": 0.10231011360883713, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7505, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.1000297763055049, |
|
"grad_norm": 0.135975643992424, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7487, |
|
"step": 16125 |
|
}, |
|
{ |
|
"epoch": 0.10018486123000955, |
|
"grad_norm": 0.11350739002227783, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7484, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 0.1003399461545142, |
|
"grad_norm": 0.10639143735170364, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7429, |
|
"step": 16175 |
|
}, |
|
{ |
|
"epoch": 0.10049503107901887, |
|
"grad_norm": 0.09016221761703491, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7891, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.10049503107901887, |
|
"eval_loss": 4.5112504959106445, |
|
"perplexity": 91.03558349609375, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.10065011600352353, |
|
"grad_norm": 0.11324500292539597, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7678, |
|
"step": 16225 |
|
}, |
|
{ |
|
"epoch": 0.1008052009280282, |
|
"grad_norm": 0.13268886506557465, |
|
"learning_rate": 0.0015, |
|
"loss": 2.723, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 0.10096028585253285, |
|
"grad_norm": 0.11448831856250763, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7328, |
|
"step": 16275 |
|
}, |
|
{ |
|
"epoch": 0.1011153707770375, |
|
"grad_norm": 0.10799309611320496, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7478, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.10127045570154217, |
|
"grad_norm": 0.19559204578399658, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7606, |
|
"step": 16325 |
|
}, |
|
{ |
|
"epoch": 0.10142554062604682, |
|
"grad_norm": 0.14151975512504578, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7279, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 0.10158062555055149, |
|
"grad_norm": 0.10044725239276886, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7609, |
|
"step": 16375 |
|
}, |
|
{ |
|
"epoch": 0.10173571047505614, |
|
"grad_norm": 0.10686340183019638, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7295, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.10173571047505614, |
|
"eval_loss": 4.521287441253662, |
|
"perplexity": 91.95391082763672, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.1018907953995608, |
|
"grad_norm": 0.1561044305562973, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7769, |
|
"step": 16425 |
|
}, |
|
{ |
|
"epoch": 0.10204588032406546, |
|
"grad_norm": 0.12182148545980453, |
|
"learning_rate": 0.0015, |
|
"loss": 2.757, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 0.10220096524857011, |
|
"grad_norm": 0.20665724575519562, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7349, |
|
"step": 16475 |
|
}, |
|
{ |
|
"epoch": 0.10235605017307478, |
|
"grad_norm": 0.09160878509283066, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7393, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.10251113509757943, |
|
"grad_norm": 0.16651533544063568, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7441, |
|
"step": 16525 |
|
}, |
|
{ |
|
"epoch": 0.10266622002208409, |
|
"grad_norm": 0.09358719736337662, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7297, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 0.10282130494658875, |
|
"grad_norm": 0.20277003943920135, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7506, |
|
"step": 16575 |
|
}, |
|
{ |
|
"epoch": 0.10297638987109341, |
|
"grad_norm": 0.13382607698440552, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7924, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.10297638987109341, |
|
"eval_loss": 4.525242328643799, |
|
"perplexity": 92.31829833984375, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.10313147479559807, |
|
"grad_norm": 0.09686290472745895, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7417, |
|
"step": 16625 |
|
}, |
|
{ |
|
"epoch": 0.10328655972010273, |
|
"grad_norm": 0.11446567624807358, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7582, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 0.10344164464460738, |
|
"grad_norm": 0.15948985517024994, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7254, |
|
"step": 16675 |
|
}, |
|
{ |
|
"epoch": 0.10359672956911205, |
|
"grad_norm": 0.1254827231168747, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7515, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.1037518144936167, |
|
"grad_norm": 0.11295375972986221, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7058, |
|
"step": 16725 |
|
}, |
|
{ |
|
"epoch": 0.10390689941812137, |
|
"grad_norm": 0.10659389197826385, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7281, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 0.10406198434262602, |
|
"grad_norm": 0.1045156791806221, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7131, |
|
"step": 16775 |
|
}, |
|
{ |
|
"epoch": 0.10421706926713067, |
|
"grad_norm": 0.13835974037647247, |
|
"learning_rate": 0.0015, |
|
"loss": 2.744, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.10421706926713067, |
|
"eval_loss": 4.507747650146484, |
|
"perplexity": 90.7172622680664, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.10437215419163534, |
|
"grad_norm": 0.19872727990150452, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7642, |
|
"step": 16825 |
|
}, |
|
{ |
|
"epoch": 0.10452723911614, |
|
"grad_norm": 0.13754956424236298, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7652, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 0.10468232404064466, |
|
"grad_norm": 0.1451335996389389, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7561, |
|
"step": 16875 |
|
}, |
|
{ |
|
"epoch": 0.10483740896514931, |
|
"grad_norm": 0.16750144958496094, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7206, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.10499249388965397, |
|
"grad_norm": 0.12020619958639145, |
|
"learning_rate": 0.0015, |
|
"loss": 2.699, |
|
"step": 16925 |
|
}, |
|
{ |
|
"epoch": 0.10514757881415863, |
|
"grad_norm": 0.16792155802249908, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8062, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 0.10530266373866329, |
|
"grad_norm": 0.11066465824842453, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6968, |
|
"step": 16975 |
|
}, |
|
{ |
|
"epoch": 0.10545774866316796, |
|
"grad_norm": 0.11885298788547516, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7699, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.10545774866316796, |
|
"eval_loss": 4.524214744567871, |
|
"perplexity": 92.22348022460938, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.10561283358767261, |
|
"grad_norm": 0.1298653483390808, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7199, |
|
"step": 17025 |
|
}, |
|
{ |
|
"epoch": 0.10576791851217726, |
|
"grad_norm": 0.11387672275304794, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7528, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 0.10592300343668193, |
|
"grad_norm": 0.09852533042430878, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7277, |
|
"step": 17075 |
|
}, |
|
{ |
|
"epoch": 0.10607808836118658, |
|
"grad_norm": 0.11046476662158966, |
|
"learning_rate": 0.0015, |
|
"loss": 2.722, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.10623317328569125, |
|
"grad_norm": 0.11632421612739563, |
|
"learning_rate": 0.0015, |
|
"loss": 2.726, |
|
"step": 17125 |
|
}, |
|
{ |
|
"epoch": 0.1063882582101959, |
|
"grad_norm": 0.11760540306568146, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7267, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 0.10654334313470057, |
|
"grad_norm": 0.12264183163642883, |
|
"learning_rate": 0.0015, |
|
"loss": 2.8037, |
|
"step": 17175 |
|
}, |
|
{ |
|
"epoch": 0.10669842805920522, |
|
"grad_norm": 0.15346336364746094, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7668, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.10669842805920522, |
|
"eval_loss": 4.503612995147705, |
|
"perplexity": 90.34294891357422, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.10685351298370988, |
|
"grad_norm": 0.10642746090888977, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7295, |
|
"step": 17225 |
|
}, |
|
{ |
|
"epoch": 0.10700859790821454, |
|
"grad_norm": 0.10965430736541748, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7113, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 0.1071636828327192, |
|
"grad_norm": 0.09912869334220886, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7353, |
|
"step": 17275 |
|
}, |
|
{ |
|
"epoch": 0.10731876775722386, |
|
"grad_norm": 0.14111942052841187, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7064, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.10747385268172852, |
|
"grad_norm": 0.11583065241575241, |
|
"learning_rate": 0.0015, |
|
"loss": 2.722, |
|
"step": 17325 |
|
}, |
|
{ |
|
"epoch": 0.10762893760623317, |
|
"grad_norm": 0.09374859184026718, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6964, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 0.10778402253073784, |
|
"grad_norm": 0.11704573035240173, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7518, |
|
"step": 17375 |
|
}, |
|
{ |
|
"epoch": 0.10793910745524249, |
|
"grad_norm": 0.13960668444633484, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7373, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.10793910745524249, |
|
"eval_loss": 4.514464378356934, |
|
"perplexity": 91.3286361694336, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.10809419237974716, |
|
"grad_norm": 0.1006089448928833, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7199, |
|
"step": 17425 |
|
}, |
|
{ |
|
"epoch": 0.10824927730425181, |
|
"grad_norm": 0.14851173758506775, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7202, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 0.10840436222875646, |
|
"grad_norm": 0.11992091685533524, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6932, |
|
"step": 17475 |
|
}, |
|
{ |
|
"epoch": 0.10855944715326113, |
|
"grad_norm": 0.12420158833265305, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7395, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.10871453207776578, |
|
"grad_norm": 0.09945713728666306, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7323, |
|
"step": 17525 |
|
}, |
|
{ |
|
"epoch": 0.10886961700227045, |
|
"grad_norm": 0.13007710874080658, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7438, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 0.1090247019267751, |
|
"grad_norm": 0.10875315964221954, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7656, |
|
"step": 17575 |
|
}, |
|
{ |
|
"epoch": 0.10917978685127976, |
|
"grad_norm": 0.1075393334031105, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7174, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.10917978685127976, |
|
"eval_loss": 4.4858293533325195, |
|
"perplexity": 88.75052642822266, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.10933487177578442, |
|
"grad_norm": 0.16400013864040375, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7389, |
|
"step": 17625 |
|
}, |
|
{ |
|
"epoch": 0.10948995670028908, |
|
"grad_norm": 0.1368722766637802, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7198, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 0.10964504162479374, |
|
"grad_norm": 0.23104597628116608, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7346, |
|
"step": 17675 |
|
}, |
|
{ |
|
"epoch": 0.1098001265492984, |
|
"grad_norm": 0.12463794648647308, |
|
"learning_rate": 0.0015, |
|
"loss": 2.691, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.10995521147380305, |
|
"grad_norm": 0.19538962841033936, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6917, |
|
"step": 17725 |
|
}, |
|
{ |
|
"epoch": 0.11011029639830772, |
|
"grad_norm": 0.12000603973865509, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7431, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 0.11026538132281237, |
|
"grad_norm": 0.15090298652648926, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7493, |
|
"step": 17775 |
|
}, |
|
{ |
|
"epoch": 0.11042046624731704, |
|
"grad_norm": 0.13190440833568573, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7582, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.11042046624731704, |
|
"eval_loss": 4.493134021759033, |
|
"perplexity": 89.40119171142578, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.11057555117182169, |
|
"grad_norm": 0.12455850094556808, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7574, |
|
"step": 17825 |
|
}, |
|
{ |
|
"epoch": 0.11073063609632634, |
|
"grad_norm": 0.14911110699176788, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7285, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 0.11088572102083101, |
|
"grad_norm": 0.16008728742599487, |
|
"learning_rate": 0.0015, |
|
"loss": 2.733, |
|
"step": 17875 |
|
}, |
|
{ |
|
"epoch": 0.11104080594533566, |
|
"grad_norm": 0.1668420433998108, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7259, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.11119589086984033, |
|
"grad_norm": 0.11736566573381424, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7682, |
|
"step": 17925 |
|
}, |
|
{ |
|
"epoch": 0.11135097579434498, |
|
"grad_norm": 0.11538700759410858, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7656, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 0.11150606071884964, |
|
"grad_norm": 0.09440570324659348, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7517, |
|
"step": 17975 |
|
}, |
|
{ |
|
"epoch": 0.1116611456433543, |
|
"grad_norm": 0.20621652901172638, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7292, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.1116611456433543, |
|
"eval_loss": 4.493429183959961, |
|
"perplexity": 89.42758178710938, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.11181623056785896, |
|
"grad_norm": 0.12027841061353683, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7049, |
|
"step": 18025 |
|
}, |
|
{ |
|
"epoch": 0.11197131549236362, |
|
"grad_norm": 0.08760379254817963, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7291, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 0.11212640041686828, |
|
"grad_norm": 0.1251729428768158, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7149, |
|
"step": 18075 |
|
}, |
|
{ |
|
"epoch": 0.11228148534137293, |
|
"grad_norm": 0.10340214520692825, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7437, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.1124365702658776, |
|
"grad_norm": 0.10546920448541641, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7656, |
|
"step": 18125 |
|
}, |
|
{ |
|
"epoch": 0.11259165519038225, |
|
"grad_norm": 0.12438227981328964, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7171, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 0.11274674011488692, |
|
"grad_norm": 0.14557534456253052, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7395, |
|
"step": 18175 |
|
}, |
|
{ |
|
"epoch": 0.11290182503939157, |
|
"grad_norm": 0.13714823126792908, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7066, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.11290182503939157, |
|
"eval_loss": 4.4876604080200195, |
|
"perplexity": 88.9131851196289, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.11305690996389622, |
|
"grad_norm": 0.12662547826766968, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6665, |
|
"step": 18225 |
|
}, |
|
{ |
|
"epoch": 0.11321199488840089, |
|
"grad_norm": 0.10047092288732529, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7332, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 0.11336707981290554, |
|
"grad_norm": 0.11126455664634705, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7154, |
|
"step": 18275 |
|
}, |
|
{ |
|
"epoch": 0.11352216473741021, |
|
"grad_norm": 0.10023871064186096, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7007, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.11367724966191486, |
|
"grad_norm": 0.11821885406970978, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7081, |
|
"step": 18325 |
|
}, |
|
{ |
|
"epoch": 0.11383233458641952, |
|
"grad_norm": 0.1216677874326706, |
|
"learning_rate": 0.0015, |
|
"loss": 2.74, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 0.11398741951092418, |
|
"grad_norm": 0.1125161275267601, |
|
"learning_rate": 0.0015, |
|
"loss": 2.733, |
|
"step": 18375 |
|
}, |
|
{ |
|
"epoch": 0.11414250443542884, |
|
"grad_norm": 0.18253153562545776, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7085, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.11414250443542884, |
|
"eval_loss": 4.501376628875732, |
|
"perplexity": 90.1411361694336, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.1142975893599335, |
|
"grad_norm": 0.13288918137550354, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7033, |
|
"step": 18425 |
|
}, |
|
{ |
|
"epoch": 0.11445267428443816, |
|
"grad_norm": 0.1069432720541954, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7063, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 0.11460775920894281, |
|
"grad_norm": 0.1035354733467102, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7174, |
|
"step": 18475 |
|
}, |
|
{ |
|
"epoch": 0.11476284413344748, |
|
"grad_norm": 0.1121230348944664, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.11491792905795213, |
|
"grad_norm": 0.13324719667434692, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7423, |
|
"step": 18525 |
|
}, |
|
{ |
|
"epoch": 0.1150730139824568, |
|
"grad_norm": 0.0891190841794014, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7418, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 0.11522809890696145, |
|
"grad_norm": 0.10579492896795273, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7321, |
|
"step": 18575 |
|
}, |
|
{ |
|
"epoch": 0.1153831838314661, |
|
"grad_norm": 0.1010003387928009, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7071, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.1153831838314661, |
|
"eval_loss": 4.508904933929443, |
|
"perplexity": 90.82231140136719, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.11553826875597077, |
|
"grad_norm": 0.1599242389202118, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7222, |
|
"step": 18625 |
|
}, |
|
{ |
|
"epoch": 0.11569335368047542, |
|
"grad_norm": 0.09344537556171417, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7424, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 0.11584843860498009, |
|
"grad_norm": 0.13959461450576782, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7584, |
|
"step": 18675 |
|
}, |
|
{ |
|
"epoch": 0.11600352352948474, |
|
"grad_norm": 0.11661764234304428, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7363, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.1161586084539894, |
|
"grad_norm": 0.11968798190355301, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7314, |
|
"step": 18725 |
|
}, |
|
{ |
|
"epoch": 0.11631369337849407, |
|
"grad_norm": 0.22232107818126678, |
|
"learning_rate": 0.0015, |
|
"loss": 2.6992, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 0.11646877830299872, |
|
"grad_norm": 0.1387198567390442, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7001, |
|
"step": 18775 |
|
}, |
|
{ |
|
"epoch": 0.11662386322750339, |
|
"grad_norm": 0.17059509456157684, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7002, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.11662386322750339, |
|
"eval_loss": 4.516000270843506, |
|
"perplexity": 91.4690170288086, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.11677894815200804, |
|
"grad_norm": 0.10877668112516403, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7171, |
|
"step": 18825 |
|
}, |
|
{ |
|
"epoch": 0.11693403307651269, |
|
"grad_norm": 0.11746638268232346, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7006, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 0.11708911800101736, |
|
"grad_norm": 0.17617632448673248, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7427, |
|
"step": 18875 |
|
}, |
|
{ |
|
"epoch": 0.11724420292552201, |
|
"grad_norm": 0.09788820147514343, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7507, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.11739928785002668, |
|
"grad_norm": 0.1285056471824646, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7386, |
|
"step": 18925 |
|
}, |
|
{ |
|
"epoch": 0.11755437277453133, |
|
"grad_norm": 0.11705992370843887, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7234, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 0.11770945769903599, |
|
"grad_norm": 0.09166467934846878, |
|
"learning_rate": 0.0015, |
|
"loss": 2.7825, |
|
"step": 18975 |
|
}, |
|
{ |
|
"epoch": 0.11786454262354065, |
|
"grad_norm": 0.11318054795265198, |
|
"learning_rate": 0.0015, |
|
"loss": 2.778, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.11786454262354065, |
|
"eval_loss": 4.499363422393799, |
|
"perplexity": 89.95984649658203, |
|
"step": 19000 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 161202, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": true, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 60, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|