|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997336884154461,
  "eval_steps": 500,
  "global_step": 1877,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005326231691078562,
      "grad_norm": 0.5528136747380069,
      "learning_rate": 1.0638297872340427e-06,
      "loss": 1.7189,
      "step": 1
    },
    {
      "epoch": 0.002663115845539281,
      "grad_norm": 0.5731888996610293,
      "learning_rate": 5.319148936170213e-06,
      "loss": 1.7582,
      "step": 5
    },
    {
      "epoch": 0.005326231691078562,
      "grad_norm": 0.5597088455341374,
      "learning_rate": 1.0638297872340426e-05,
      "loss": 1.7331,
      "step": 10
    },
    {
      "epoch": 0.007989347536617843,
      "grad_norm": 0.6373028746703387,
      "learning_rate": 1.595744680851064e-05,
      "loss": 1.7083,
      "step": 15
    },
    {
      "epoch": 0.010652463382157125,
      "grad_norm": 0.6190956100247592,
      "learning_rate": 2.1276595744680852e-05,
      "loss": 1.6453,
      "step": 20
    },
    {
      "epoch": 0.013315579227696404,
      "grad_norm": 0.5346124338518666,
      "learning_rate": 2.6595744680851064e-05,
      "loss": 1.5715,
      "step": 25
    },
    {
      "epoch": 0.015978695073235686,
      "grad_norm": 0.25993843509864667,
      "learning_rate": 3.191489361702128e-05,
      "loss": 1.4359,
      "step": 30
    },
    {
      "epoch": 0.018641810918774968,
      "grad_norm": 0.45126098872878045,
      "learning_rate": 3.723404255319149e-05,
      "loss": 1.4122,
      "step": 35
    },
    {
      "epoch": 0.02130492676431425,
      "grad_norm": 0.16268004246261306,
      "learning_rate": 4.2553191489361704e-05,
      "loss": 1.3378,
      "step": 40
    },
    {
      "epoch": 0.023968042609853527,
      "grad_norm": 0.16755798993478008,
      "learning_rate": 4.787234042553192e-05,
      "loss": 1.3315,
      "step": 45
    },
    {
      "epoch": 0.02663115845539281,
      "grad_norm": 0.14666977328343594,
      "learning_rate": 5.319148936170213e-05,
      "loss": 1.3259,
      "step": 50
    },
    {
      "epoch": 0.02929427430093209,
      "grad_norm": 0.10629659674680719,
      "learning_rate": 5.851063829787234e-05,
      "loss": 1.288,
      "step": 55
    },
    {
      "epoch": 0.03195739014647137,
      "grad_norm": 0.11263809733941188,
      "learning_rate": 6.382978723404256e-05,
      "loss": 1.2763,
      "step": 60
    },
    {
      "epoch": 0.03462050599201065,
      "grad_norm": 0.14445654795383106,
      "learning_rate": 6.914893617021277e-05,
      "loss": 1.256,
      "step": 65
    },
    {
      "epoch": 0.037283621837549935,
      "grad_norm": 0.07679775841947081,
      "learning_rate": 7.446808510638298e-05,
      "loss": 1.2481,
      "step": 70
    },
    {
      "epoch": 0.03994673768308921,
      "grad_norm": 0.07097172061630208,
      "learning_rate": 7.978723404255319e-05,
      "loss": 1.2417,
      "step": 75
    },
    {
      "epoch": 0.0426098535286285,
      "grad_norm": 0.07545648577598915,
      "learning_rate": 8.510638297872341e-05,
      "loss": 1.2048,
      "step": 80
    },
    {
      "epoch": 0.045272969374167776,
      "grad_norm": 0.06657701317469632,
      "learning_rate": 9.042553191489363e-05,
      "loss": 1.2138,
      "step": 85
    },
    {
      "epoch": 0.047936085219707054,
      "grad_norm": 0.08276972517588223,
      "learning_rate": 9.574468085106384e-05,
      "loss": 1.2267,
      "step": 90
    },
    {
      "epoch": 0.05059920106524634,
      "grad_norm": 0.14086227463755532,
      "learning_rate": 0.00010106382978723406,
      "loss": 1.2185,
      "step": 95
    },
    {
      "epoch": 0.05326231691078562,
      "grad_norm": 0.08222057468849275,
      "learning_rate": 0.00010638297872340425,
      "loss": 1.2021,
      "step": 100
    },
    {
      "epoch": 0.0559254327563249,
      "grad_norm": 0.06741247738810993,
      "learning_rate": 0.00011170212765957446,
      "loss": 1.1957,
      "step": 105
    },
    {
      "epoch": 0.05858854860186418,
      "grad_norm": 0.07692927491859038,
      "learning_rate": 0.00011702127659574468,
      "loss": 1.1901,
      "step": 110
    },
    {
      "epoch": 0.06125166444740346,
      "grad_norm": 0.08953054401601632,
      "learning_rate": 0.0001223404255319149,
      "loss": 1.2002,
      "step": 115
    },
    {
      "epoch": 0.06391478029294274,
      "grad_norm": 0.066045987418387,
      "learning_rate": 0.00012765957446808513,
      "loss": 1.2086,
      "step": 120
    },
    {
      "epoch": 0.06657789613848203,
      "grad_norm": 0.06476207146640194,
      "learning_rate": 0.00013297872340425532,
      "loss": 1.215,
      "step": 125
    },
    {
      "epoch": 0.0692410119840213,
      "grad_norm": 0.07334327374343644,
      "learning_rate": 0.00013829787234042554,
      "loss": 1.181,
      "step": 130
    },
    {
      "epoch": 0.07190412782956059,
      "grad_norm": 0.071815699820189,
      "learning_rate": 0.00014361702127659576,
      "loss": 1.1795,
      "step": 135
    },
    {
      "epoch": 0.07456724367509987,
      "grad_norm": 0.08268574123602224,
      "learning_rate": 0.00014893617021276596,
      "loss": 1.1892,
      "step": 140
    },
    {
      "epoch": 0.07723035952063914,
      "grad_norm": 0.07585606469879155,
      "learning_rate": 0.00015425531914893618,
      "loss": 1.173,
      "step": 145
    },
    {
      "epoch": 0.07989347536617843,
      "grad_norm": 0.06900850276332868,
      "learning_rate": 0.00015957446808510637,
      "loss": 1.1889,
      "step": 150
    },
    {
      "epoch": 0.08255659121171771,
      "grad_norm": 0.07702070923317432,
      "learning_rate": 0.00016489361702127662,
      "loss": 1.1705,
      "step": 155
    },
    {
      "epoch": 0.085219707057257,
      "grad_norm": 0.0724505634260966,
      "learning_rate": 0.00017021276595744682,
      "loss": 1.1801,
      "step": 160
    },
    {
      "epoch": 0.08788282290279627,
      "grad_norm": 0.07762044850846143,
      "learning_rate": 0.000175531914893617,
      "loss": 1.1863,
      "step": 165
    },
    {
      "epoch": 0.09054593874833555,
      "grad_norm": 0.0825988729050522,
      "learning_rate": 0.00018085106382978726,
      "loss": 1.1687,
      "step": 170
    },
    {
      "epoch": 0.09320905459387484,
      "grad_norm": 0.08362625552402488,
      "learning_rate": 0.00018617021276595746,
      "loss": 1.1875,
      "step": 175
    },
    {
      "epoch": 0.09587217043941411,
      "grad_norm": 0.07968629276225715,
      "learning_rate": 0.00019148936170212768,
      "loss": 1.1629,
      "step": 180
    },
    {
      "epoch": 0.0985352862849534,
      "grad_norm": 0.09010747938874886,
      "learning_rate": 0.00019680851063829787,
      "loss": 1.1682,
      "step": 185
    },
    {
      "epoch": 0.10119840213049268,
      "grad_norm": 0.10276104499280464,
      "learning_rate": 0.00019999930805760402,
      "loss": 1.1618,
      "step": 190
    },
    {
      "epoch": 0.10386151797603196,
      "grad_norm": 0.07684956533429005,
      "learning_rate": 0.00019999152381561955,
      "loss": 1.1902,
      "step": 195
    },
    {
      "epoch": 0.10652463382157124,
      "grad_norm": 0.09365634557072464,
      "learning_rate": 0.0001999750910791767,
      "loss": 1.1673,
      "step": 200
    },
    {
      "epoch": 0.10918774966711052,
      "grad_norm": 0.08320439951882774,
      "learning_rate": 0.00019995001126958025,
      "loss": 1.1845,
      "step": 205
    },
    {
      "epoch": 0.1118508655126498,
      "grad_norm": 0.08595161594886752,
      "learning_rate": 0.00019991628655604003,
      "loss": 1.1444,
      "step": 210
    },
    {
      "epoch": 0.11451398135818908,
      "grad_norm": 0.07678856693871118,
      "learning_rate": 0.00019987391985548328,
      "loss": 1.1724,
      "step": 215
    },
    {
      "epoch": 0.11717709720372836,
      "grad_norm": 0.07433277571881601,
      "learning_rate": 0.0001998229148323023,
      "loss": 1.1469,
      "step": 220
    },
    {
      "epoch": 0.11984021304926765,
      "grad_norm": 0.11001095605643386,
      "learning_rate": 0.00019976327589803767,
      "loss": 1.1383,
      "step": 225
    },
    {
      "epoch": 0.12250332889480692,
      "grad_norm": 0.0784178760835021,
      "learning_rate": 0.0001996950082109965,
      "loss": 1.1818,
      "step": 230
    },
    {
      "epoch": 0.12516644474034622,
      "grad_norm": 0.08047194112395492,
      "learning_rate": 0.00019961811767580648,
      "loss": 1.1445,
      "step": 235
    },
    {
      "epoch": 0.1278295605858855,
      "grad_norm": 0.0670667235371544,
      "learning_rate": 0.0001995326109429049,
      "loss": 1.1741,
      "step": 240
    },
    {
      "epoch": 0.13049267643142476,
      "grad_norm": 0.07072589446768075,
      "learning_rate": 0.00019943849540796375,
      "loss": 1.157,
      "step": 245
    },
    {
      "epoch": 0.13315579227696406,
      "grad_norm": 0.07466892570841129,
      "learning_rate": 0.0001993357792112498,
      "loss": 1.125,
      "step": 250
    },
    {
      "epoch": 0.13581890812250333,
      "grad_norm": 0.07302104613788317,
      "learning_rate": 0.0001992244712369207,
      "loss": 1.1615,
      "step": 255
    },
    {
      "epoch": 0.1384820239680426,
      "grad_norm": 0.07211635352591637,
      "learning_rate": 0.00019910458111225646,
      "loss": 1.1441,
      "step": 260
    },
    {
      "epoch": 0.1411451398135819,
      "grad_norm": 0.07103357444221702,
      "learning_rate": 0.00019897611920682677,
      "loss": 1.1493,
      "step": 265
    },
    {
      "epoch": 0.14380825565912117,
      "grad_norm": 0.0698227187710226,
      "learning_rate": 0.00019883909663159424,
      "loss": 1.1568,
      "step": 270
    },
    {
      "epoch": 0.14647137150466044,
      "grad_norm": 0.07137557168765225,
      "learning_rate": 0.0001986935252379532,
      "loss": 1.171,
      "step": 275
    },
    {
      "epoch": 0.14913448735019974,
      "grad_norm": 0.07605080544337586,
      "learning_rate": 0.00019853941761670483,
      "loss": 1.1623,
      "step": 280
    },
    {
      "epoch": 0.151797603195739,
      "grad_norm": 0.09532848101140429,
      "learning_rate": 0.00019837678709696798,
      "loss": 1.1888,
      "step": 285
    },
    {
      "epoch": 0.15446071904127828,
      "grad_norm": 0.07485256895909924,
      "learning_rate": 0.00019820564774502644,
      "loss": 1.1483,
      "step": 290
    },
    {
      "epoch": 0.15712383488681758,
      "grad_norm": 0.07483378156117482,
      "learning_rate": 0.0001980260143631122,
      "loss": 1.1375,
      "step": 295
    },
    {
      "epoch": 0.15978695073235685,
      "grad_norm": 0.07954155407009747,
      "learning_rate": 0.00019783790248812533,
      "loss": 1.1696,
      "step": 300
    },
    {
      "epoch": 0.16245006657789615,
      "grad_norm": 0.08502452471103343,
      "learning_rate": 0.00019764132839029,
      "loss": 1.168,
      "step": 305
    },
    {
      "epoch": 0.16511318242343542,
      "grad_norm": 0.08384910068571033,
      "learning_rate": 0.00019743630907174725,
      "loss": 1.1659,
      "step": 310
    },
    {
      "epoch": 0.1677762982689747,
      "grad_norm": 0.06905463640642404,
      "learning_rate": 0.0001972228622650846,
      "loss": 1.1612,
      "step": 315
    },
    {
      "epoch": 0.170439414114514,
      "grad_norm": 0.19257912301232658,
      "learning_rate": 0.0001970010064318021,
      "loss": 1.1517,
      "step": 320
    },
    {
      "epoch": 0.17310252996005326,
      "grad_norm": 0.0793114626498931,
      "learning_rate": 0.00019677076076071566,
      "loss": 1.1385,
      "step": 325
    },
    {
      "epoch": 0.17576564580559254,
      "grad_norm": 0.07393026070000318,
      "learning_rate": 0.00019653214516629735,
      "loss": 1.1426,
      "step": 330
    },
    {
      "epoch": 0.17842876165113183,
      "grad_norm": 0.08179432509124362,
      "learning_rate": 0.00019628518028695307,
      "loss": 1.1104,
      "step": 335
    },
    {
      "epoch": 0.1810918774966711,
      "grad_norm": 0.09735291608528279,
      "learning_rate": 0.00019602988748323717,
      "loss": 1.1563,
      "step": 340
    },
    {
      "epoch": 0.18375499334221038,
      "grad_norm": 0.06743724715009518,
      "learning_rate": 0.00019576628883600535,
      "loss": 1.1406,
      "step": 345
    },
    {
      "epoch": 0.18641810918774968,
      "grad_norm": 0.075326384879952,
      "learning_rate": 0.00019549440714450444,
      "loss": 1.1572,
      "step": 350
    },
    {
      "epoch": 0.18908122503328895,
      "grad_norm": 0.07438689728031705,
      "learning_rate": 0.00019521426592440072,
      "loss": 1.1479,
      "step": 355
    },
    {
      "epoch": 0.19174434087882822,
      "grad_norm": 0.07277611336304127,
      "learning_rate": 0.00019492588940574586,
      "loss": 1.1549,
      "step": 360
    },
    {
      "epoch": 0.19440745672436752,
      "grad_norm": 0.0695324135241875,
      "learning_rate": 0.0001946293025308813,
      "loss": 1.1435,
      "step": 365
    },
    {
      "epoch": 0.1970705725699068,
      "grad_norm": 0.06685927618032904,
      "learning_rate": 0.00019432453095228076,
      "loss": 1.1641,
      "step": 370
    },
    {
      "epoch": 0.19973368841544606,
      "grad_norm": 0.0680367135740568,
      "learning_rate": 0.00019401160103033174,
      "loss": 1.1261,
      "step": 375
    },
    {
      "epoch": 0.20239680426098536,
      "grad_norm": 0.08027336453756874,
      "learning_rate": 0.00019369053983105532,
      "loss": 1.1368,
      "step": 380
    },
    {
      "epoch": 0.20505992010652463,
      "grad_norm": 0.0707161713953054,
      "learning_rate": 0.00019336137512376532,
      "loss": 1.1588,
      "step": 385
    },
    {
      "epoch": 0.20772303595206393,
      "grad_norm": 0.07189527593634382,
      "learning_rate": 0.00019302413537866642,
      "loss": 1.1552,
      "step": 390
    },
    {
      "epoch": 0.2103861517976032,
      "grad_norm": 0.0716934364253126,
      "learning_rate": 0.0001926788497643916,
      "loss": 1.1577,
      "step": 395
    },
    {
      "epoch": 0.21304926764314247,
      "grad_norm": 0.065943892133018,
      "learning_rate": 0.00019232554814547953,
      "loss": 1.1203,
      "step": 400
    },
    {
      "epoch": 0.21571238348868177,
      "grad_norm": 0.07352621386091099,
      "learning_rate": 0.00019196426107979128,
      "loss": 1.1266,
      "step": 405
    },
    {
      "epoch": 0.21837549933422104,
      "grad_norm": 0.07441803674470306,
      "learning_rate": 0.00019159501981586737,
      "loss": 1.1432,
      "step": 410
    },
    {
      "epoch": 0.2210386151797603,
      "grad_norm": 0.07291702193187057,
      "learning_rate": 0.00019121785629022501,
      "loss": 1.1344,
      "step": 415
    },
    {
      "epoch": 0.2237017310252996,
      "grad_norm": 0.07094925179230635,
      "learning_rate": 0.00019083280312459593,
      "loss": 1.1137,
      "step": 420
    },
    {
      "epoch": 0.22636484687083888,
      "grad_norm": 0.07399044805064979,
      "learning_rate": 0.0001904398936231047,
      "loss": 1.1533,
      "step": 425
    },
    {
      "epoch": 0.22902796271637815,
      "grad_norm": 0.07782197426798759,
      "learning_rate": 0.00019003916176938836,
      "loss": 1.1458,
      "step": 430
    },
    {
      "epoch": 0.23169107856191745,
      "grad_norm": 0.06822071830212563,
      "learning_rate": 0.00018963064222365694,
      "loss": 1.1448,
      "step": 435
    },
    {
      "epoch": 0.23435419440745672,
      "grad_norm": 0.06944246120343146,
      "learning_rate": 0.00018921437031969558,
      "loss": 1.1577,
      "step": 440
    },
    {
      "epoch": 0.237017310252996,
      "grad_norm": 0.07108688216307608,
      "learning_rate": 0.0001887903820618087,
      "loss": 1.1526,
      "step": 445
    },
    {
      "epoch": 0.2396804260985353,
      "grad_norm": 0.08455610060485116,
      "learning_rate": 0.00018835871412170563,
      "loss": 1.1517,
      "step": 450
    },
    {
      "epoch": 0.24234354194407456,
      "grad_norm": 0.06664786445884358,
      "learning_rate": 0.0001879194038353289,
      "loss": 1.1537,
      "step": 455
    },
    {
      "epoch": 0.24500665778961384,
      "grad_norm": 0.07089581724112333,
      "learning_rate": 0.00018747248919962498,
      "loss": 1.1409,
      "step": 460
    },
    {
      "epoch": 0.24766977363515313,
      "grad_norm": 0.07242825833109466,
      "learning_rate": 0.00018701800886925782,
      "loss": 1.1303,
      "step": 465
    },
    {
      "epoch": 0.25033288948069243,
      "grad_norm": 0.06598593287452807,
      "learning_rate": 0.00018655600215326546,
      "loss": 1.1401,
      "step": 470
    },
    {
      "epoch": 0.2529960053262317,
      "grad_norm": 0.07020789015379635,
      "learning_rate": 0.00018608650901166032,
      "loss": 1.1542,
      "step": 475
    },
    {
      "epoch": 0.255659121171771,
      "grad_norm": 0.06441793150662321,
      "learning_rate": 0.0001856095700519726,
      "loss": 1.1276,
      "step": 480
    },
    {
      "epoch": 0.2583222370173103,
      "grad_norm": 0.07254719498292789,
      "learning_rate": 0.0001851252265257384,
      "loss": 1.1212,
      "step": 485
    },
    {
      "epoch": 0.2609853528628495,
      "grad_norm": 0.06917909155716108,
      "learning_rate": 0.0001846335203249316,
      "loss": 1.1298,
      "step": 490
    },
    {
      "epoch": 0.2636484687083888,
      "grad_norm": 0.07470942417701212,
      "learning_rate": 0.00018413449397834051,
      "loss": 1.1456,
      "step": 495
    },
    {
      "epoch": 0.2663115845539281,
      "grad_norm": 0.0693858861935873,
      "learning_rate": 0.00018362819064788956,
      "loss": 1.1327,
      "step": 500
    },
    {
      "epoch": 0.26897470039946736,
      "grad_norm": 0.07182079092553902,
      "learning_rate": 0.00018311465412490608,
      "loss": 1.1628,
      "step": 505
    },
    {
      "epoch": 0.27163781624500666,
      "grad_norm": 0.06682954118119949,
      "learning_rate": 0.00018259392882633265,
      "loss": 1.1528,
      "step": 510
    },
    {
      "epoch": 0.27430093209054596,
      "grad_norm": 0.07248673749669132,
      "learning_rate": 0.00018206605979088542,
      "loss": 1.156,
      "step": 515
    },
    {
      "epoch": 0.2769640479360852,
      "grad_norm": 0.06950216959497392,
      "learning_rate": 0.0001815310926751586,
      "loss": 1.119,
      "step": 520
    },
    {
      "epoch": 0.2796271637816245,
      "grad_norm": 0.07067673018407011,
      "learning_rate": 0.00018098907374967555,
      "loss": 1.1211,
      "step": 525
    },
    {
      "epoch": 0.2822902796271638,
      "grad_norm": 0.06820842392733384,
      "learning_rate": 0.00018044004989488664,
      "loss": 1.1281,
      "step": 530
    },
    {
      "epoch": 0.28495339547270304,
      "grad_norm": 0.07418230217074875,
      "learning_rate": 0.00017988406859711456,
      "loss": 1.1409,
      "step": 535
    },
    {
      "epoch": 0.28761651131824234,
      "grad_norm": 0.07009876259688716,
      "learning_rate": 0.00017932117794444713,
      "loss": 1.1381,
      "step": 540
    },
    {
      "epoch": 0.29027962716378164,
      "grad_norm": 0.07129309605598672,
      "learning_rate": 0.00017875142662257786,
      "loss": 1.1387,
      "step": 545
    },
    {
      "epoch": 0.2929427430093209,
      "grad_norm": 0.07830622678131702,
      "learning_rate": 0.00017817486391059532,
      "loss": 1.1165,
      "step": 550
    },
    {
      "epoch": 0.2956058588548602,
      "grad_norm": 0.0709756673443606,
      "learning_rate": 0.0001775915396767205,
      "loss": 1.129,
      "step": 555
    },
    {
      "epoch": 0.2982689747003995,
      "grad_norm": 0.06710174636010342,
      "learning_rate": 0.00017700150437399405,
      "loss": 1.1183,
      "step": 560
    },
    {
      "epoch": 0.3009320905459387,
      "grad_norm": 0.07321620332053846,
      "learning_rate": 0.0001764048090359121,
      "loss": 1.1502,
      "step": 565
    },
    {
      "epoch": 0.303595206391478,
      "grad_norm": 0.07613131980579707,
      "learning_rate": 0.00017580150527201241,
      "loss": 1.1322,
      "step": 570
    },
    {
      "epoch": 0.3062583222370173,
      "grad_norm": 0.07480921806539248,
      "learning_rate": 0.0001751916452634105,
      "loss": 1.1269,
      "step": 575
    },
    {
      "epoch": 0.30892143808255657,
      "grad_norm": 0.07386122393966031,
      "learning_rate": 0.0001745752817582865,
      "loss": 1.1528,
      "step": 580
    },
    {
      "epoch": 0.31158455392809586,
      "grad_norm": 0.07032971968151802,
      "learning_rate": 0.00017395246806732267,
      "loss": 1.1642,
      "step": 585
    },
    {
      "epoch": 0.31424766977363516,
      "grad_norm": 0.07910944378906298,
      "learning_rate": 0.00017332325805909256,
      "loss": 1.1328,
      "step": 590
    },
    {
      "epoch": 0.3169107856191744,
      "grad_norm": 0.06775943214366806,
      "learning_rate": 0.00017268770615540177,
      "loss": 1.1142,
      "step": 595
    },
    {
      "epoch": 0.3195739014647137,
      "grad_norm": 0.0858014191359942,
      "learning_rate": 0.00017204586732658087,
      "loss": 1.1393,
      "step": 600
    },
    {
      "epoch": 0.322237017310253,
      "grad_norm": 0.06968407560583738,
      "learning_rate": 0.00017139779708673085,
      "loss": 1.1428,
      "step": 605
    },
    {
      "epoch": 0.3249001331557923,
      "grad_norm": 0.06812443512073688,
      "learning_rate": 0.00017074355148892167,
      "loss": 1.1592,
      "step": 610
    },
    {
      "epoch": 0.32756324900133155,
      "grad_norm": 0.07150170839574509,
      "learning_rate": 0.00017008318712034403,
      "loss": 1.1018,
      "step": 615
    },
    {
      "epoch": 0.33022636484687085,
      "grad_norm": 0.06906369302490485,
      "learning_rate": 0.00016941676109741508,
      "loss": 1.1442,
      "step": 620
    },
    {
      "epoch": 0.33288948069241014,
      "grad_norm": 0.07869503084625909,
      "learning_rate": 0.00016874433106083814,
      "loss": 1.1132,
      "step": 625
    },
    {
      "epoch": 0.3355525965379494,
      "grad_norm": 0.07767900677929127,
      "learning_rate": 0.00016806595517061744,
      "loss": 1.1362,
      "step": 630
    },
    {
      "epoch": 0.3382157123834887,
      "grad_norm": 0.06780573276986938,
      "learning_rate": 0.00016738169210102764,
      "loss": 1.1382,
      "step": 635
    },
    {
      "epoch": 0.340878828229028,
      "grad_norm": 0.07855904717914698,
      "learning_rate": 0.00016669160103553884,
      "loss": 1.1146,
      "step": 640
    },
    {
      "epoch": 0.34354194407456723,
      "grad_norm": 0.06976051466447154,
      "learning_rate": 0.00016599574166169782,
      "loss": 1.1156,
      "step": 645
    },
    {
      "epoch": 0.34620505992010653,
      "grad_norm": 0.0659729198246215,
      "learning_rate": 0.0001652941741659655,
      "loss": 1.1636,
      "step": 650
    },
    {
      "epoch": 0.3488681757656458,
      "grad_norm": 0.06820327345322129,
      "learning_rate": 0.00016458695922851125,
      "loss": 1.1272,
      "step": 655
    },
    {
      "epoch": 0.35153129161118507,
      "grad_norm": 0.0706423611601847,
      "learning_rate": 0.0001638741580179645,
      "loss": 1.15,
      "step": 660
    },
    {
      "epoch": 0.35419440745672437,
      "grad_norm": 0.07009862917994238,
      "learning_rate": 0.0001631558321861241,
      "loss": 1.1133,
      "step": 665
    },
    {
      "epoch": 0.35685752330226367,
      "grad_norm": 0.10252097837226529,
      "learning_rate": 0.00016243204386262616,
      "loss": 1.1275,
      "step": 670
    },
    {
      "epoch": 0.3595206391478029,
      "grad_norm": 0.0677270369109815,
      "learning_rate": 0.0001617028556495699,
      "loss": 1.1463,
      "step": 675
    },
    {
      "epoch": 0.3621837549933422,
      "grad_norm": 0.07081566637081647,
      "learning_rate": 0.00016096833061610336,
      "loss": 1.1557,
      "step": 680
    },
    {
      "epoch": 0.3648468708388815,
      "grad_norm": 0.07606309640077409,
      "learning_rate": 0.0001602285322929684,
      "loss": 1.1279,
      "step": 685
    },
    {
      "epoch": 0.36750998668442075,
      "grad_norm": 0.06926585358652293,
      "learning_rate": 0.00015948352466700562,
      "loss": 1.1058,
      "step": 690
    },
    {
      "epoch": 0.37017310252996005,
      "grad_norm": 0.0768394516058797,
      "learning_rate": 0.00015873337217562012,
      "loss": 1.1451,
      "step": 695
    },
    {
      "epoch": 0.37283621837549935,
      "grad_norm": 0.07574776045146851,
      "learning_rate": 0.00015797813970120806,
      "loss": 1.1529,
      "step": 700
    },
    {
      "epoch": 0.3754993342210386,
      "grad_norm": 0.08825811667362324,
      "learning_rate": 0.00015721789256554493,
      "loss": 1.1427,
      "step": 705
    },
    {
      "epoch": 0.3781624500665779,
      "grad_norm": 0.07195501325596203,
      "learning_rate": 0.00015645269652413572,
      "loss": 1.1348,
      "step": 710
    },
    {
      "epoch": 0.3808255659121172,
      "grad_norm": 0.07470988656942844,
      "learning_rate": 0.00015568261776052747,
      "loss": 1.1389,
      "step": 715
    },
    {
      "epoch": 0.38348868175765644,
      "grad_norm": 0.07234292608880714,
      "learning_rate": 0.0001549077228805851,
      "loss": 1.1265,
      "step": 720
    },
    {
      "epoch": 0.38615179760319573,
      "grad_norm": 0.07603813138240277,
      "learning_rate": 0.00015412807890673012,
      "loss": 1.0975,
      "step": 725
    },
    {
      "epoch": 0.38881491344873503,
      "grad_norm": 0.06914740850103318,
      "learning_rate": 0.00015334375327214435,
      "loss": 1.1656,
      "step": 730
    },
    {
      "epoch": 0.3914780292942743,
      "grad_norm": 0.0723373355088882,
      "learning_rate": 0.00015255481381493686,
      "loss": 1.1235,
      "step": 735
    },
    {
      "epoch": 0.3941411451398136,
      "grad_norm": 0.07469762097501292,
      "learning_rate": 0.00015176132877227672,
      "loss": 1.1401,
      "step": 740
    },
    {
      "epoch": 0.3968042609853529,
      "grad_norm": 0.06845354027517625,
      "learning_rate": 0.00015096336677449123,
      "loss": 1.1299,
      "step": 745
    },
    {
      "epoch": 0.3994673768308921,
      "grad_norm": 0.07857096344059177,
      "learning_rate": 0.0001501609968391295,
      "loss": 1.1362,
      "step": 750
    },
    {
      "epoch": 0.4021304926764314,
      "grad_norm": 0.07079465135436822,
      "learning_rate": 0.00014935428836499332,
      "loss": 1.1268,
      "step": 755
    },
    {
      "epoch": 0.4047936085219707,
      "grad_norm": 0.07113035589654983,
      "learning_rate": 0.0001485433111261346,
      "loss": 1.1357,
      "step": 760
    },
    {
      "epoch": 0.40745672436750996,
      "grad_norm": 0.0703269232774503,
      "learning_rate": 0.0001477281352658203,
      "loss": 1.1239,
      "step": 765
    },
    {
      "epoch": 0.41011984021304926,
      "grad_norm": 0.07059355929742223,
      "learning_rate": 0.00014690883129046584,
      "loss": 1.1442,
      "step": 770
    },
    {
      "epoch": 0.41278295605858856,
      "grad_norm": 0.07289277380542494,
      "learning_rate": 0.0001460854700635366,
      "loss": 1.1267,
      "step": 775
    },
    {
      "epoch": 0.41544607190412786,
      "grad_norm": 0.06984202720886337,
      "learning_rate": 0.00014525812279941896,
      "loss": 1.1258,
      "step": 780
    },
    {
      "epoch": 0.4181091877496671,
      "grad_norm": 0.07422048925652114,
      "learning_rate": 0.00014442686105726067,
      "loss": 1.1193,
      "step": 785
    },
    {
      "epoch": 0.4207723035952064,
      "grad_norm": 0.0718716738772194,
      "learning_rate": 0.00014359175673478162,
      "loss": 1.133,
      "step": 790
    },
    {
      "epoch": 0.4234354194407457,
      "grad_norm": 0.07204344165616748,
      "learning_rate": 0.00014275288206205524,
      "loss": 1.0967,
      "step": 795
    },
    {
      "epoch": 0.42609853528628494,
      "grad_norm": 0.07353413673583019,
      "learning_rate": 0.00014191030959526105,
      "loss": 1.1261,
      "step": 800
    },
    {
      "epoch": 0.42876165113182424,
      "grad_norm": 0.0707040442967912,
      "learning_rate": 0.00014106411221040933,
      "loss": 1.128,
      "step": 805
    },
    {
      "epoch": 0.43142476697736354,
      "grad_norm": 0.07086259967904554,
      "learning_rate": 0.00014021436309703765,
      "loss": 1.107,
      "step": 810
    },
    {
      "epoch": 0.4340878828229028,
      "grad_norm": 0.06994136097058291,
      "learning_rate": 0.00013936113575188075,
      "loss": 1.1221,
      "step": 815
    },
    {
      "epoch": 0.4367509986684421,
      "grad_norm": 0.06961342073957084,
      "learning_rate": 0.00013850450397251345,
      "loss": 1.1208,
      "step": 820
    },
    {
      "epoch": 0.4394141145139814,
      "grad_norm": 0.07040932769118938,
      "learning_rate": 0.0001376445418509679,
      "loss": 1.1208,
      "step": 825
    },
    {
      "epoch": 0.4420772303595206,
      "grad_norm": 0.07187314857901307,
      "learning_rate": 0.00013678132376732517,
      "loss": 1.1267,
      "step": 830
    },
    {
      "epoch": 0.4447403462050599,
      "grad_norm": 0.07002729221567015,
      "learning_rate": 0.00013591492438328183,
      "loss": 1.1421,
      "step": 835
    },
    {
      "epoch": 0.4474034620505992,
      "grad_norm": 0.07235067324392022,
      "learning_rate": 0.0001350454186356924,
      "loss": 1.1191,
      "step": 840
    },
    {
      "epoch": 0.45006657789613846,
      "grad_norm": 0.07568410556158327,
      "learning_rate": 0.00013417288173008776,
      "loss": 1.1123,
      "step": 845
    },
    {
      "epoch": 0.45272969374167776,
      "grad_norm": 0.07613155957113646,
      "learning_rate": 0.00013329738913417068,
      "loss": 1.1137,
      "step": 850
    },
    {
      "epoch": 0.45539280958721706,
      "grad_norm": 0.06854447943505289,
      "learning_rate": 0.00013241901657128825,
      "loss": 1.132,
      "step": 855
    },
    {
      "epoch": 0.4580559254327563,
      "grad_norm": 0.06884442803824642,
      "learning_rate": 0.00013153784001388247,
      "loss": 1.1352,
      "step": 860
    },
    {
      "epoch": 0.4607190412782956,
      "grad_norm": 0.0818107600119684,
      "learning_rate": 0.00013065393567691913,
      "loss": 1.101,
      "step": 865
    },
    {
      "epoch": 0.4633821571238349,
      "grad_norm": 0.07690425325074156,
      "learning_rate": 0.00012976738001129606,
      "loss": 1.1052,
      "step": 870
    },
    {
      "epoch": 0.46604527296937415,
      "grad_norm": 0.07952080928038242,
      "learning_rate": 0.00012887824969723034,
      "loss": 1.1172,
      "step": 875
    },
    {
      "epoch": 0.46870838881491345,
      "grad_norm": 0.06637081804522467,
      "learning_rate": 0.00012798662163762635,
      "loss": 1.1236,
      "step": 880
    },
    {
      "epoch": 0.47137150466045274,
      "grad_norm": 0.06753657536020181,
      "learning_rate": 0.00012709257295142422,
      "loss": 1.1304,
      "step": 885
    },
    {
      "epoch": 0.474034620505992,
      "grad_norm": 0.07408199495580661,
      "learning_rate": 0.00012619618096692943,
      "loss": 1.1523,
      "step": 890
    },
    {
      "epoch": 0.4766977363515313,
      "grad_norm": 0.07508862680813526,
      "learning_rate": 0.0001252975232151248,
      "loss": 1.1158,
      "step": 895
    },
    {
      "epoch": 0.4793608521970706,
      "grad_norm": 0.07064406721668945,
      "learning_rate": 0.0001243966774229645,
      "loss": 1.1334,
      "step": 900
    },
    {
      "epoch": 0.48202396804260983,
      "grad_norm": 0.0709335857697155,
      "learning_rate": 0.00012349372150665118,
      "loss": 1.1104,
      "step": 905
    },
    {
      "epoch": 0.48468708388814913,
      "grad_norm": 0.06779392856513489,
      "learning_rate": 0.00012258873356489714,
      "loss": 1.1299,
      "step": 910
    },
    {
      "epoch": 0.4873501997336884,
      "grad_norm": 0.07922531663031743,
      "learning_rate": 0.00012168179187216893,
      "loss": 1.13,
      "step": 915
    },
    {
      "epoch": 0.49001331557922767,
      "grad_norm": 0.07185035702343927,
      "learning_rate": 0.0001207729748719177,
      "loss": 1.1402,
      "step": 920
    },
    {
      "epoch": 0.49267643142476697,
      "grad_norm": 0.07162588557593508,
      "learning_rate": 0.00011986236116979406,
      "loss": 1.1308,
      "step": 925
    },
    {
      "epoch": 0.49533954727030627,
      "grad_norm": 0.07242326329471309,
      "learning_rate": 0.0001189500295268495,
      "loss": 1.106,
      "step": 930
    },
    {
      "epoch": 0.4980026631158455,
      "grad_norm": 0.07434286419305244,
      "learning_rate": 0.0001180360588527242,
      "loss": 1.119,
      "step": 935
    },
    {
      "epoch": 0.5006657789613849,
      "grad_norm": 0.07304306593688702,
      "learning_rate": 0.00011712052819882171,
      "loss": 1.1503,
      "step": 940
    },
    {
      "epoch": 0.5033288948069241,
      "grad_norm": 0.07214685709611712,
      "learning_rate": 0.00011620351675147195,
      "loss": 1.1095,
      "step": 945
    },
    {
      "epoch": 0.5059920106524634,
      "grad_norm": 0.07192838129765652,
      "learning_rate": 0.0001152851038250819,
      "loss": 1.1451,
      "step": 950
    },
    {
      "epoch": 0.5086551264980027,
      "grad_norm": 0.06935787206272043,
      "learning_rate": 0.00011436536885527576,
      "loss": 1.1251,
      "step": 955
    },
    {
      "epoch": 0.511318242343542,
      "grad_norm": 0.06801727149157547,
      "learning_rate": 0.00011344439139202421,
      "loss": 1.1084,
      "step": 960
    },
    {
      "epoch": 0.5139813581890812,
      "grad_norm": 0.07024555806497951,
      "learning_rate": 0.00011252225109276404,
      "loss": 1.1278,
      "step": 965
    },
    {
      "epoch": 0.5166444740346205,
      "grad_norm": 0.06796418511383114,
      "learning_rate": 0.00011159902771550837,
      "loss": 1.1092,
      "step": 970
    },
    {
      "epoch": 0.5193075898801598,
      "grad_norm": 0.07219790467805971,
      "learning_rate": 0.00011067480111194817,
      "loss": 1.1286,
      "step": 975
    },
    {
      "epoch": 0.521970705725699,
      "grad_norm": 0.06944360419194191,
      "learning_rate": 0.00010974965122054579,
      "loss": 1.1184,
      "step": 980
    },
    {
      "epoch": 0.5246338215712384,
      "grad_norm": 0.07229326850745169,
      "learning_rate": 0.00010882365805962083,
      "loss": 1.1212,
      "step": 985
    },
    {
      "epoch": 0.5272969374167776,
      "grad_norm": 0.07181125732929394,
      "learning_rate": 0.00010789690172042912,
      "loss": 1.1137,
      "step": 990
    },
    {
      "epoch": 0.5299600532623169,
      "grad_norm": 0.07145294348037948,
      "learning_rate": 0.00010696946236023567,
      "loss": 1.1365,
      "step": 995
    },
    {
      "epoch": 0.5326231691078562,
      "grad_norm": 0.07057067837966788,
      "learning_rate": 0.00010604142019538135,
      "loss": 1.1176,
      "step": 1000
    },
    {
      "epoch": 0.5352862849533955,
      "grad_norm": 0.07461665274600122,
      "learning_rate": 0.00010511285549434509,
      "loss": 1.1152,
      "step": 1005
    },
    {
      "epoch": 0.5379494007989347,
      "grad_norm": 0.07009722058482384,
      "learning_rate": 0.00010418384857080117,
      "loss": 1.1117,
      "step": 1010
    },
    {
      "epoch": 0.5406125166444741,
      "grad_norm": 0.07081680661261375,
      "learning_rate": 0.00010325447977667263,
      "loss": 1.1328,
      "step": 1015
    },
    {
      "epoch": 0.5432756324900133,
      "grad_norm": 0.06980388274242631,
      "learning_rate": 0.00010232482949518156,
      "loss": 1.1404,
      "step": 1020
    },
    {
      "epoch": 0.5459387483355526,
      "grad_norm": 0.06946364728493262,
      "learning_rate": 0.00010139497813389654,
      "loss": 1.1127,
      "step": 1025
    },
    {
      "epoch": 0.5486018641810919,
      "grad_norm": 0.06947172503952885,
      "learning_rate": 0.00010046500611777798,
      "loss": 1.0937,
      "step": 1030
    },
    {
      "epoch": 0.5512649800266312,
      "grad_norm": 0.07414647895551518,
      "learning_rate": 9.953499388222202e-05,
      "loss": 1.132,
      "step": 1035
    },
    {
      "epoch": 0.5539280958721704,
      "grad_norm": 0.07085672498663681,
      "learning_rate": 9.860502186610349e-05,
      "loss": 1.0998,
      "step": 1040
    },
    {
      "epoch": 0.5565912117177098,
      "grad_norm": 0.07198312270884867,
      "learning_rate": 9.767517050481846e-05,
      "loss": 1.1263,
      "step": 1045
    },
    {
      "epoch": 0.559254327563249,
      "grad_norm": 0.07070349541708286,
      "learning_rate": 9.67455202233274e-05,
      "loss": 1.1143,
      "step": 1050
    },
    {
      "epoch": 0.5619174434087882,
      "grad_norm": 0.06990981328605791,
      "learning_rate": 9.581615142919887e-05,
      "loss": 1.1168,
      "step": 1055
    },
    {
      "epoch": 0.5645805592543276,
      "grad_norm": 0.07221018297233557,
      "learning_rate": 9.488714450565491e-05,
      "loss": 1.1123,
      "step": 1060
    },
    {
      "epoch": 0.5672436750998668,
      "grad_norm": 0.06895775963564511,
      "learning_rate": 9.395857980461867e-05,
      "loss": 1.1294,
      "step": 1065
    },
    {
      "epoch": 0.5699067909454061,
      "grad_norm": 0.06904508970279108,
      "learning_rate": 9.303053763976434e-05,
      "loss": 1.1179,
      "step": 1070
    },
    {
      "epoch": 0.5725699067909454,
      "grad_norm": 0.07131791944898686,
      "learning_rate": 9.210309827957089e-05,
      "loss": 1.1297,
      "step": 1075
    },
    {
      "epoch": 0.5752330226364847,
      "grad_norm": 0.07117429268373339,
      "learning_rate": 9.117634194037922e-05,
      "loss": 1.1285,
      "step": 1080
    },
    {
      "epoch": 0.5778961384820239,
      "grad_norm": 0.0720403827517469,
      "learning_rate": 9.025034877945422e-05,
      "loss": 1.1418,
      "step": 1085
    },
    {
      "epoch": 0.5805592543275633,
      "grad_norm": 0.07399424819774852,
      "learning_rate": 8.932519888805185e-05,
      "loss": 1.1521,
      "step": 1090
    },
    {
      "epoch": 0.5832223701731025,
      "grad_norm": 0.06757782738616369,
      "learning_rate": 8.840097228449165e-05,
      "loss": 1.1468,
      "step": 1095
    },
    {
      "epoch": 0.5858854860186418,
      "grad_norm": 0.07513462470349377,
      "learning_rate": 8.747774890723599e-05,
      "loss": 1.1008,
      "step": 1100
    },
    {
      "epoch": 0.5885486018641811,
      "grad_norm": 0.07193663724723076,
      "learning_rate": 8.655560860797582e-05,
      "loss": 1.1364,
      "step": 1105
    },
    {
      "epoch": 0.5912117177097204,
      "grad_norm": 0.07451485853238139,
      "learning_rate": 8.563463114472425e-05,
      "loss": 1.1077,
      "step": 1110
    },
    {
      "epoch": 0.5938748335552596,
      "grad_norm": 0.07208026414944517,
      "learning_rate": 8.471489617491812e-05,
      "loss": 1.0828,
      "step": 1115
    },
    {
      "epoch": 0.596537949400799,
      "grad_norm": 0.06968442188475359,
      "learning_rate": 8.379648324852808e-05,
      "loss": 1.0975,
      "step": 1120
    },
    {
      "epoch": 0.5992010652463382,
      "grad_norm": 0.07690321076998301,
      "learning_rate": 8.287947180117832e-05,
      "loss": 1.1149,
      "step": 1125
    },
    {
      "epoch": 0.6018641810918774,
      "grad_norm": 0.07088556456471248,
      "learning_rate": 8.196394114727585e-05,
      "loss": 1.1193,
      "step": 1130
    },
    {
      "epoch": 0.6045272969374168,
      "grad_norm": 0.07306182012233282,
      "learning_rate": 8.104997047315048e-05,
      "loss": 1.1222,
      "step": 1135
    },
    {
      "epoch": 0.607190412782956,
      "grad_norm": 0.0735136578466246,
      "learning_rate": 8.013763883020596e-05,
      "loss": 1.1326,
      "step": 1140
    },
    {
      "epoch": 0.6098535286284953,
      "grad_norm": 0.0792004084312011,
      "learning_rate": 7.92270251280823e-05,
      "loss": 1.1125,
      "step": 1145
    },
    {
      "epoch": 0.6125166444740346,
      "grad_norm": 0.06995636578434544,
      "learning_rate": 7.831820812783108e-05,
      "loss": 1.1397,
      "step": 1150
    },
    {
      "epoch": 0.6151797603195739,
      "grad_norm": 0.07156672219633958,
      "learning_rate": 7.741126643510292e-05,
      "loss": 1.1047,
      "step": 1155
    },
    {
      "epoch": 0.6178428761651131,
      "grad_norm": 0.06990042451203095,
      "learning_rate": 7.650627849334881e-05,
      "loss": 1.0991,
      "step": 1160
    },
    {
      "epoch": 0.6205059920106525,
      "grad_norm": 0.07175460373797926,
      "learning_rate": 7.560332257703555e-05,
      "loss": 1.1179,
      "step": 1165
    },
    {
      "epoch": 0.6231691078561917,
      "grad_norm": 0.07079860289283964,
      "learning_rate": 7.470247678487522e-05,
      "loss": 1.1179,
      "step": 1170
    },
    {
      "epoch": 0.625832223701731,
      "grad_norm": 0.06955306779443195,
      "learning_rate": 7.380381903307061e-05,
      "loss": 1.1261,
      "step": 1175
    },
    {
      "epoch": 0.6284953395472703,
      "grad_norm": 0.07040121930899221,
      "learning_rate": 7.290742704857585e-05,
      "loss": 1.128,
      "step": 1180
    },
    {
      "epoch": 0.6311584553928096,
      "grad_norm": 0.0696134794381735,
      "learning_rate": 7.201337836237365e-05,
      "loss": 1.1006,
      "step": 1185
    },
    {
      "epoch": 0.6338215712383488,
      "grad_norm": 0.0731972119965864,
      "learning_rate": 7.112175030276969e-05,
      "loss": 1.122,
      "step": 1190
    },
    {
      "epoch": 0.6364846870838882,
      "grad_norm": 0.07600883906312528,
      "learning_rate": 7.023261998870395e-05,
      "loss": 1.1054,
      "step": 1195
    },
    {
      "epoch": 0.6391478029294274,
      "grad_norm": 0.06927208348922254,
      "learning_rate": 6.934606432308086e-05,
      "loss": 1.1128,
      "step": 1200
    },
    {
      "epoch": 0.6418109187749668,
      "grad_norm": 0.06936985978073082,
      "learning_rate": 6.846215998611757e-05,
      "loss": 1.118,
      "step": 1205
    },
    {
      "epoch": 0.644474034620506,
      "grad_norm": 0.07077273319280314,
      "learning_rate": 6.758098342871174e-05,
      "loss": 1.1093,
      "step": 1210
    },
    {
      "epoch": 0.6471371504660453,
      "grad_norm": 0.07079727575348424,
      "learning_rate": 6.670261086582933e-05,
      "loss": 1.1231,
      "step": 1215
    },
    {
      "epoch": 0.6498002663115846,
      "grad_norm": 0.07181688909396321,
      "learning_rate": 6.582711826991226e-05,
      "loss": 1.1042,
      "step": 1220
    },
    {
      "epoch": 0.6524633821571239,
      "grad_norm": 0.07337324643783341,
      "learning_rate": 6.495458136430765e-05,
      "loss": 1.1042,
      "step": 1225
    },
    {
      "epoch": 0.6551264980026631,
      "grad_norm": 0.07212862237245772,
      "learning_rate": 6.408507561671819e-05,
      "loss": 1.1509,
      "step": 1230
    },
    {
      "epoch": 0.6577896138482024,
      "grad_norm": 0.07047120035609092,
      "learning_rate": 6.321867623267481e-05,
      "loss": 1.1355,
      "step": 1235
    },
    {
      "epoch": 0.6604527296937417,
      "grad_norm": 0.07459478385718604,
      "learning_rate": 6.23554581490321e-05,
      "loss": 1.1178,
      "step": 1240
    },
    {
      "epoch": 0.6631158455392809,
      "grad_norm": 0.06825726344400981,
      "learning_rate": 6.149549602748656e-05,
      "loss": 1.0862,
      "step": 1245
    },
    {
      "epoch": 0.6657789613848203,
      "grad_norm": 0.07240193352920372,
      "learning_rate": 6.063886424811929e-05,
      "loss": 1.1292,
      "step": 1250
    },
    {
      "epoch": 0.6684420772303595,
      "grad_norm": 0.07197359550672061,
      "learning_rate": 5.9785636902962374e-05,
      "loss": 1.1306,
      "step": 1255
    },
    {
      "epoch": 0.6711051930758988,
      "grad_norm": 0.07343322507680913,
      "learning_rate": 5.893588778959067e-05,
      "loss": 1.1365,
      "step": 1260
    },
    {
      "epoch": 0.6737683089214381,
      "grad_norm": 0.07842185463764602,
      "learning_rate": 5.8089690404738925e-05,
      "loss": 1.1395,
      "step": 1265
    },
    {
      "epoch": 0.6764314247669774,
      "grad_norm": 0.07270053681642126,
      "learning_rate": 5.7247117937944786e-05,
      "loss": 1.1035,
      "step": 1270
    },
    {
      "epoch": 0.6790945406125166,
      "grad_norm": 0.07124055104083139,
      "learning_rate": 5.640824326521841e-05,
      "loss": 1.1121,
      "step": 1275
    },
    {
      "epoch": 0.681757656458056,
      "grad_norm": 0.07031788560347749,
      "learning_rate": 5.5573138942739365e-05,
      "loss": 1.1192,
      "step": 1280
    },
    {
      "epoch": 0.6844207723035952,
      "grad_norm": 0.07253058219251593,
      "learning_rate": 5.4741877200581057e-05,
      "loss": 1.1324,
      "step": 1285
    },
    {
      "epoch": 0.6870838881491345,
      "grad_norm": 0.0725305584439251,
      "learning_rate": 5.391452993646342e-05,
      "loss": 1.1387,
      "step": 1290
    },
    {
      "epoch": 0.6897470039946738,
      "grad_norm": 0.07021354602161774,
      "learning_rate": 5.30911687095342e-05,
      "loss": 1.126,
      "step": 1295
    },
    {
      "epoch": 0.6924101198402131,
      "grad_norm": 0.07012940672098344,
      "learning_rate": 5.227186473417971e-05,
      "loss": 1.1486,
      "step": 1300
    },
    {
      "epoch": 0.6950732356857523,
      "grad_norm": 0.07232533360594767,
      "learning_rate": 5.145668887386543e-05,
      "loss": 1.1111,
      "step": 1305
    },
    {
      "epoch": 0.6977363515312917,
      "grad_norm": 0.07242821758103567,
      "learning_rate": 5.064571163500667e-05,
      "loss": 1.1181,
      "step": 1310
    },
    {
      "epoch": 0.7003994673768309,
      "grad_norm": 0.07148030530795384,
      "learning_rate": 4.983900316087051e-05,
      "loss": 1.0922,
      "step": 1315
    },
    {
      "epoch": 0.7030625832223701,
      "grad_norm": 0.07277623674879777,
      "learning_rate": 4.90366332255088e-05,
      "loss": 1.0985,
      "step": 1320
    },
    {
      "epoch": 0.7057256990679095,
      "grad_norm": 0.07450043090731064,
      "learning_rate": 4.823867122772329e-05,
      "loss": 1.1177,
      "step": 1325
    },
    {
      "epoch": 0.7083888149134487,
      "grad_norm": 0.07438166449706331,
      "learning_rate": 4.744518618506319e-05,
      "loss": 1.1225,
      "step": 1330
    },
    {
      "epoch": 0.711051930758988,
      "grad_norm": 0.07157294185481793,
      "learning_rate": 4.665624672785566e-05,
      "loss": 1.1291,
      "step": 1335
    },
    {
      "epoch": 0.7137150466045273,
      "grad_norm": 0.07520781395221099,
      "learning_rate": 4.5871921093269875e-05,
      "loss": 1.1082,
      "step": 1340
    },
    {
      "epoch": 0.7163781624500666,
      "grad_norm": 0.07203181974145231,
      "learning_rate": 4.5092277119414975e-05,
      "loss": 1.1333,
      "step": 1345
    },
    {
      "epoch": 0.7190412782956058,
      "grad_norm": 0.07130465564504203,
      "learning_rate": 4.431738223947252e-05,
      "loss": 1.0951,
      "step": 1350
    },
    {
      "epoch": 0.7217043941411452,
      "grad_norm": 0.075489827909183,
      "learning_rate": 4.35473034758643e-05,
      "loss": 1.1223,
      "step": 1355
    },
    {
      "epoch": 0.7243675099866844,
      "grad_norm": 0.07030155738463333,
      "learning_rate": 4.2782107434455054e-05,
      "loss": 1.1222,
      "step": 1360
    },
    {
      "epoch": 0.7270306258322237,
      "grad_norm": 0.07218531177873333,
      "learning_rate": 4.202186029879195e-05,
      "loss": 1.1135,
      "step": 1365
    },
    {
      "epoch": 0.729693741677763,
      "grad_norm": 0.07513760712282253,
      "learning_rate": 4.12666278243799e-05,
      "loss": 1.1181,
      "step": 1370
    },
    {
      "epoch": 0.7323568575233023,
      "grad_norm": 0.07435576640285345,
      "learning_rate": 4.0516475332994383e-05,
      "loss": 1.119,
      "step": 1375
    },
    {
      "epoch": 0.7350199733688415,
      "grad_norm": 0.07125188708419379,
      "learning_rate": 3.9771467707031615e-05,
      "loss": 1.1201,
      "step": 1380
    },
    {
      "epoch": 0.7376830892143809,
      "grad_norm": 0.07352772721979937,
      "learning_rate": 3.903166938389664e-05,
      "loss": 1.112,
      "step": 1385
    },
    {
      "epoch": 0.7403462050599201,
      "grad_norm": 0.07055845215545062,
      "learning_rate": 3.8297144350430144e-05,
      "loss": 1.1046,
      "step": 1390
    },
    {
      "epoch": 0.7430093209054593,
      "grad_norm": 0.07019837722638089,
      "learning_rate": 3.756795613737388e-05,
      "loss": 1.1306,
      "step": 1395
    },
    {
      "epoch": 0.7456724367509987,
      "grad_norm": 0.07239297940522006,
      "learning_rate": 3.684416781387589e-05,
      "loss": 1.1184,
      "step": 1400
    },
    {
      "epoch": 0.748335552596538,
      "grad_norm": 0.07225126596889433,
      "learning_rate": 3.6125841982035536e-05,
      "loss": 1.0843,
      "step": 1405
    },
    {
      "epoch": 0.7509986684420772,
      "grad_norm": 0.07431737331558284,
      "learning_rate": 3.5413040771488746e-05,
      "loss": 1.1145,
      "step": 1410
    },
    {
      "epoch": 0.7536617842876165,
      "grad_norm": 0.07502128782750854,
      "learning_rate": 3.47058258340345e-05,
      "loss": 1.1114,
      "step": 1415
    },
    {
      "epoch": 0.7563249001331558,
      "grad_norm": 0.07207529195587527,
      "learning_rate": 3.4004258338302195e-05,
      "loss": 1.116,
      "step": 1420
    },
    {
      "epoch": 0.758988015978695,
      "grad_norm": 0.07002467859956689,
      "learning_rate": 3.3308398964461206e-05,
      "loss": 1.1198,
      "step": 1425
    },
    {
      "epoch": 0.7616511318242344,
      "grad_norm": 0.07224058742693344,
      "learning_rate": 3.261830789897241e-05,
      "loss": 1.1367,
      "step": 1430
    },
    {
      "epoch": 0.7643142476697736,
      "grad_norm": 0.07150872607987452,
      "learning_rate": 3.193404482938256e-05,
      "loss": 1.0982,
      "step": 1435
    },
    {
      "epoch": 0.7669773635153129,
      "grad_norm": 0.07178998194161153,
      "learning_rate": 3.1255668939161894e-05,
      "loss": 1.1301,
      "step": 1440
    },
    {
      "epoch": 0.7696404793608522,
      "grad_norm": 0.07537933178179766,
      "learning_rate": 3.058323890258498e-05,
      "loss": 1.0962,
      "step": 1445
    },
    {
      "epoch": 0.7723035952063915,
      "grad_norm": 0.07189617812023931,
      "learning_rate": 2.9916812879655975e-05,
      "loss": 1.1299,
      "step": 1450
    },
    {
      "epoch": 0.7749667110519307,
      "grad_norm": 0.07135524342299995,
      "learning_rate": 2.925644851107835e-05,
      "loss": 1.1189,
      "step": 1455
    },
    {
      "epoch": 0.7776298268974701,
      "grad_norm": 0.06926273022163672,
      "learning_rate": 2.860220291326915e-05,
      "loss": 1.1068,
      "step": 1460
    },
    {
      "epoch": 0.7802929427430093,
      "grad_norm": 0.07236268331467403,
      "learning_rate": 2.7954132673419143e-05,
      "loss": 1.0981,
      "step": 1465
    },
    {
      "epoch": 0.7829560585885486,
      "grad_norm": 0.07137745473045948,
      "learning_rate": 2.7312293844598246e-05,
      "loss": 1.1045,
      "step": 1470
    },
    {
      "epoch": 0.7856191744340879,
      "grad_norm": 0.07319916214034358,
      "learning_rate": 2.6676741940907478e-05,
      "loss": 1.1281,
      "step": 1475
    },
    {
      "epoch": 0.7882822902796272,
      "grad_norm": 0.07414585611908868,
      "learning_rate": 2.6047531932677383e-05,
      "loss": 1.1225,
      "step": 1480
    },
    {
      "epoch": 0.7909454061251664,
      "grad_norm": 0.07066978423343756,
      "learning_rate": 2.542471824171353e-05,
      "loss": 1.1356,
      "step": 1485
    },
    {
      "epoch": 0.7936085219707057,
      "grad_norm": 0.07124448208473912,
      "learning_rate": 2.4808354736589523e-05,
      "loss": 1.1323,
      "step": 1490
    },
    {
      "epoch": 0.796271637816245,
      "grad_norm": 0.07192509266254882,
      "learning_rate": 2.419849472798761e-05,
      "loss": 1.1386,
      "step": 1495
    },
    {
      "epoch": 0.7989347536617842,
      "grad_norm": 0.07470536521159465,
      "learning_rate": 2.359519096408791e-05,
      "loss": 1.1103,
      "step": 1500
    },
    {
      "epoch": 0.8015978695073236,
      "grad_norm": 0.07210949951260932,
      "learning_rate": 2.2998495626005957e-05,
      "loss": 1.1108,
      "step": 1505
    },
    {
      "epoch": 0.8042609853528628,
      "grad_norm": 0.07516415250373631,
      "learning_rate": 2.240846032327949e-05,
      "loss": 1.1404,
      "step": 1510
    },
    {
      "epoch": 0.8069241011984021,
      "grad_norm": 0.07560562529629619,
      "learning_rate": 2.1825136089404718e-05,
      "loss": 1.0935,
      "step": 1515
    },
    {
      "epoch": 0.8095872170439414,
      "grad_norm": 0.07195974938474745,
      "learning_rate": 2.1248573377422155e-05,
      "loss": 1.1182,
      "step": 1520
    },
    {
      "epoch": 0.8122503328894807,
      "grad_norm": 0.07250882969384367,
      "learning_rate": 2.0678822055552906e-05,
      "loss": 1.1189,
      "step": 1525
    },
    {
      "epoch": 0.8149134487350199,
      "grad_norm": 0.0721751215640965,
      "learning_rate": 2.0115931402885458e-05,
      "loss": 1.1115,
      "step": 1530
    },
    {
      "epoch": 0.8175765645805593,
      "grad_norm": 0.0753848259347461,
      "learning_rate": 1.955995010511338e-05,
      "loss": 1.1348,
      "step": 1535
    },
    {
      "epoch": 0.8202396804260985,
      "grad_norm": 0.0719207373284397,
      "learning_rate": 1.901092625032448e-05,
      "loss": 1.1042,
      "step": 1540
    },
    {
      "epoch": 0.8229027962716379,
      "grad_norm": 0.07032664869488064,
      "learning_rate": 1.84689073248414e-05,
      "loss": 1.1009,
      "step": 1545
    },
    {
      "epoch": 0.8255659121171771,
      "grad_norm": 0.0700654057925292,
      "learning_rate": 1.7933940209114597e-05,
      "loss": 1.1269,
      "step": 1550
    },
    {
      "epoch": 0.8282290279627164,
      "grad_norm": 0.07325193867745135,
      "learning_rate": 1.7406071173667372e-05,
      "loss": 1.1138,
      "step": 1555
    },
    {
      "epoch": 0.8308921438082557,
      "grad_norm": 0.07059680065497263,
      "learning_rate": 1.6885345875093918e-05,
      "loss": 1.1202,
      "step": 1560
    },
    {
      "epoch": 0.833555259653795,
      "grad_norm": 0.06973843219886788,
      "learning_rate": 1.6371809352110447e-05,
      "loss": 1.109,
      "step": 1565
    },
    {
      "epoch": 0.8362183754993342,
      "grad_norm": 0.07028429615451927,
      "learning_rate": 1.5865506021659516e-05,
      "loss": 1.1422,
      "step": 1570
    },
    {
      "epoch": 0.8388814913448736,
      "grad_norm": 0.07070004444035057,
      "learning_rate": 1.5366479675068435e-05,
      "loss": 1.1139,
      "step": 1575
    },
    {
      "epoch": 0.8415446071904128,
      "grad_norm": 0.06902941716806588,
      "learning_rate": 1.4874773474261638e-05,
      "loss": 1.1179,
      "step": 1580
    },
    {
      "epoch": 0.844207723035952,
      "grad_norm": 0.07227429481260465,
      "learning_rate": 1.4390429948027428e-05,
      "loss": 1.1156,
      "step": 1585
    },
    {
      "epoch": 0.8468708388814914,
      "grad_norm": 0.07067313252269349,
      "learning_rate": 1.3913490988339718e-05,
      "loss": 1.1209,
      "step": 1590
    },
    {
      "epoch": 0.8495339547270306,
      "grad_norm": 0.07249077076436629,
      "learning_rate": 1.3443997846734535e-05,
      "loss": 1.1303,
      "step": 1595
    },
    {
      "epoch": 0.8521970705725699,
      "grad_norm": 0.07121930519374212,
      "learning_rate": 1.2981991130742211e-05,
      "loss": 1.1069,
      "step": 1600
    },
    {
      "epoch": 0.8548601864181092,
      "grad_norm": 0.06967637890151408,
      "learning_rate": 1.2527510800375043e-05,
      "loss": 1.1007,
      "step": 1605
    },
    {
      "epoch": 0.8575233022636485,
      "grad_norm": 0.07205061200000021,
      "learning_rate": 1.20805961646711e-05,
      "loss": 1.1199,
      "step": 1610
    },
    {
      "epoch": 0.8601864181091877,
      "grad_norm": 0.07129006954483187,
      "learning_rate": 1.1641285878294372e-05,
      "loss": 1.1054,
      "step": 1615
    },
    {
      "epoch": 0.8628495339547271,
      "grad_norm": 0.07540152645446788,
      "learning_rate": 1.1209617938191307e-05,
      "loss": 1.1032,
      "step": 1620
    },
    {
      "epoch": 0.8655126498002663,
      "grad_norm": 0.07238275344561401,
      "learning_rate": 1.0785629680304432e-05,
      "loss": 1.1246,
      "step": 1625
    },
    {
      "epoch": 0.8681757656458056,
      "grad_norm": 0.07120538294411066,
      "learning_rate": 1.0369357776343103e-05,
      "loss": 1.0932,
      "step": 1630
    },
    {
      "epoch": 0.8708388814913449,
      "grad_norm": 0.07125849884630578,
      "learning_rate": 9.960838230611635e-06,
      "loss": 1.0728,
      "step": 1635
    },
    {
      "epoch": 0.8735019973368842,
      "grad_norm": 0.07387109327159767,
      "learning_rate": 9.560106376895306e-06,
      "loss": 1.1275,
      "step": 1640
    },
    {
      "epoch": 0.8761651131824234,
      "grad_norm": 0.08867831950654811,
      "learning_rate": 9.167196875404094e-06,
      "loss": 1.1134,
      "step": 1645
    },
    {
      "epoch": 0.8788282290279628,
      "grad_norm": 0.07206488784580595,
      "learning_rate": 8.782143709775015e-06,
      "loss": 1.109,
      "step": 1650
    },
    {
      "epoch": 0.881491344873502,
      "grad_norm": 0.07037650282884113,
      "learning_rate": 8.40498018413266e-06,
      "loss": 1.0862,
      "step": 1655
    },
    {
      "epoch": 0.8841544607190412,
      "grad_norm": 0.07329640323219759,
      "learning_rate": 8.035738920208714e-06,
      "loss": 1.1539,
      "step": 1660
    },
    {
      "epoch": 0.8868175765645806,
      "grad_norm": 0.07238224996992595,
      "learning_rate": 7.67445185452046e-06,
      "loss": 1.14,
      "step": 1665
    },
    {
      "epoch": 0.8894806924101198,
      "grad_norm": 0.0709614207657161,
      "learning_rate": 7.321150235608399e-06,
      "loss": 1.1084,
      "step": 1670
    },
    {
      "epoch": 0.8921438082556591,
      "grad_norm": 0.0726275702018448,
      "learning_rate": 6.9758646213336165e-06,
      "loss": 1.1227,
      "step": 1675
    },
    {
      "epoch": 0.8948069241011984,
      "grad_norm": 0.07467788565919785,
      "learning_rate": 6.6386248762347004e-06,
      "loss": 1.1135,
      "step": 1680
    },
    {
      "epoch": 0.8974700399467377,
      "grad_norm": 0.07203922681266979,
      "learning_rate": 6.309460168944692e-06,
      "loss": 1.1071,
      "step": 1685
    },
    {
      "epoch": 0.9001331557922769,
      "grad_norm": 0.07114745434226347,
      "learning_rate": 5.988398969668285e-06,
      "loss": 1.1248,
      "step": 1690
    },
    {
      "epoch": 0.9027962716378163,
      "grad_norm": 0.07111413913413335,
      "learning_rate": 5.6754690477192396e-06,
      "loss": 1.0872,
      "step": 1695
    },
    {
      "epoch": 0.9054593874833555,
      "grad_norm": 0.07072306177993802,
      "learning_rate": 5.370697469118713e-06,
      "loss": 1.0824,
      "step": 1700
    },
    {
      "epoch": 0.9081225033288948,
      "grad_norm": 0.07185114603062198,
      "learning_rate": 5.074110594254133e-06,
      "loss": 1.107,
      "step": 1705
    },
    {
      "epoch": 0.9107856191744341,
      "grad_norm": 0.07285137048040193,
      "learning_rate": 4.78573407559928e-06,
      "loss": 1.1173,
      "step": 1710
    },
    {
      "epoch": 0.9134487350199734,
      "grad_norm": 0.07013700832744958,
      "learning_rate": 4.5055928554955665e-06,
      "loss": 1.116,
      "step": 1715
    },
    {
      "epoch": 0.9161118508655126,
      "grad_norm": 0.06933918410316768,
      "learning_rate": 4.233711163994669e-06,
      "loss": 1.1038,
      "step": 1720
    },
    {
      "epoch": 0.918774966711052,
      "grad_norm": 0.072978505928099,
      "learning_rate": 3.970112516762825e-06,
      "loss": 1.104,
      "step": 1725
    },
    {
      "epoch": 0.9214380825565912,
      "grad_norm": 0.07096666064833597,
      "learning_rate": 3.7148197130469576e-06,
      "loss": 1.1169,
      "step": 1730
    },
    {
      "epoch": 0.9241011984021305,
      "grad_norm": 0.07024364472390462,
      "learning_rate": 3.467854833702644e-06,
      "loss": 1.1051,
      "step": 1735
    },
    {
      "epoch": 0.9267643142476698,
      "grad_norm": 0.0725948329149083,
      "learning_rate": 3.229239239284354e-06,
      "loss": 1.1257,
      "step": 1740
    },
    {
      "epoch": 0.929427430093209,
      "grad_norm": 0.07215970561357568,
      "learning_rate": 2.9989935681979164e-06,
      "loss": 1.1262,
      "step": 1745
    },
    {
      "epoch": 0.9320905459387483,
      "grad_norm": 0.0737761172543898,
      "learning_rate": 2.777137734915403e-06,
      "loss": 1.1091,
      "step": 1750
    },
    {
      "epoch": 0.9347536617842876,
      "grad_norm": 0.0714550851543958,
      "learning_rate": 2.563690928252749e-06,
      "loss": 1.1283,
      "step": 1755
    },
    {
      "epoch": 0.9374167776298269,
      "grad_norm": 0.07157410935337963,
      "learning_rate": 2.358671609710017e-06,
      "loss": 1.1239,
      "step": 1760
    },
    {
      "epoch": 0.9400798934753661,
      "grad_norm": 0.0715459262435765,
      "learning_rate": 2.1620975118746835e-06,
      "loss": 1.1283,
      "step": 1765
    },
    {
      "epoch": 0.9427430093209055,
      "grad_norm": 0.07286793622379441,
      "learning_rate": 1.9739856368878096e-06,
      "loss": 1.1443,
      "step": 1770
    },
    {
      "epoch": 0.9454061251664447,
      "grad_norm": 0.07254981755374063,
      "learning_rate": 1.794352254973597e-06,
      "loss": 1.0752,
      "step": 1775
    },
    {
      "epoch": 0.948069241011984,
      "grad_norm": 0.07583062173231125,
      "learning_rate": 1.6232129030320453e-06,
      "loss": 1.1011,
      "step": 1780
    },
    {
      "epoch": 0.9507323568575233,
      "grad_norm": 0.07237886602221254,
      "learning_rate": 1.4605823832951948e-06,
      "loss": 1.1063,
      "step": 1785
    },
    {
      "epoch": 0.9533954727030626,
      "grad_norm": 0.07194863306691765,
      "learning_rate": 1.3064747620468054e-06,
      "loss": 1.0914,
      "step": 1790
    },
    {
      "epoch": 0.9560585885486018,
      "grad_norm": 0.07189535177125876,
      "learning_rate": 1.1609033684057857e-06,
      "loss": 1.1048,
      "step": 1795
    },
    {
      "epoch": 0.9587217043941412,
      "grad_norm": 0.07110020697306084,
      "learning_rate": 1.0238807931732487e-06,
      "loss": 1.1219,
      "step": 1800
    },
    {
      "epoch": 0.9613848202396804,
      "grad_norm": 0.07370460890021728,
      "learning_rate": 8.95418887743571e-07,
      "loss": 1.1317,
      "step": 1805
    },
    {
      "epoch": 0.9640479360852197,
      "grad_norm": 0.07207691237434337,
      "learning_rate": 7.75528763079314e-07,
      "loss": 1.1126,
      "step": 1810
    },
    {
      "epoch": 0.966711051930759,
      "grad_norm": 0.07440378502380679,
      "learning_rate": 6.642207887502027e-07,
      "loss": 1.1262,
      "step": 1815
    },
    {
      "epoch": 0.9693741677762983,
      "grad_norm": 0.06937782526053558,
      "learning_rate": 5.615045920362549e-07,
      "loss": 1.1191,
      "step": 1820
    },
    {
      "epoch": 0.9720372836218375,
      "grad_norm": 0.07098237504174736,
      "learning_rate": 4.673890570951023e-07,
      "loss": 1.1218,
      "step": 1825
    },
    {
      "epoch": 0.9747003994673769,
      "grad_norm": 0.07181367245928705,
      "learning_rate": 3.8188232419352764e-07,
      "loss": 1.1514,
      "step": 1830
    },
    {
      "epoch": 0.9773635153129161,
      "grad_norm": 0.07128549274991237,
      "learning_rate": 3.049917890034837e-07,
      "loss": 1.0945,
      "step": 1835
    },
    {
      "epoch": 0.9800266311584553,
      "grad_norm": 0.07161530878264005,
      "learning_rate": 2.3672410196232675e-07,
      "loss": 1.1056,
      "step": 1840
    },
    {
      "epoch": 0.9826897470039947,
      "grad_norm": 0.07090950092701657,
      "learning_rate": 1.7708516769769924e-07,
      "loss": 1.1109,
      "step": 1845
    },
    {
      "epoch": 0.9853528628495339,
      "grad_norm": 0.07140303003446424,
      "learning_rate": 1.2608014451672702e-07,
      "loss": 1.1252,
      "step": 1850
    },
    {
      "epoch": 0.9880159786950732,
      "grad_norm": 0.07103068810909154,
      "learning_rate": 8.371344395996516e-08,
      "loss": 1.1255,
      "step": 1855
    },
    {
      "epoch": 0.9906790945406125,
      "grad_norm": 0.07153307782175972,
      "learning_rate": 4.998873041975882e-08,
      "loss": 1.1365,
      "step": 1860
    },
    {
      "epoch": 0.9933422103861518,
      "grad_norm": 0.06996817906726589,
      "learning_rate": 2.490892082331886e-08,
      "loss": 1.1142,
      "step": 1865
    },
    {
      "epoch": 0.996005326231691,
      "grad_norm": 0.07126895143317796,
      "learning_rate": 8.476184380468155e-09,
      "loss": 1.1091,
      "step": 1870
    },
    {
      "epoch": 0.9986684420772304,
      "grad_norm": 0.0706408297239244,
      "learning_rate": 6.919423959805826e-10,
      "loss": 1.1206,
      "step": 1875
    },
    {
      "epoch": 0.9997336884154461,
      "eval_loss": 1.118857979774475,
      "eval_runtime": 1652.5253,
      "eval_samples_per_second": 8.045,
      "eval_steps_per_second": 0.503,
      "step": 1877
    },
    {
      "epoch": 0.9997336884154461,
      "step": 1877,
      "total_flos": 2.979798729936077e+16,
      "train_loss": 1.1429596533467938,
      "train_runtime": 55739.5179,
      "train_samples_per_second": 2.156,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 5,
  "max_steps": 1877,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.979798729936077e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}