{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 99.2,
  "eval_steps": 50,
  "global_step": 6200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 0.730820894241333,
      "learning_rate": 0.00019967741935483872,
      "loss": 2.5389,
      "step": 10
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.7758501172065735,
      "learning_rate": 0.00019935483870967745,
      "loss": 1.8047,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.077883720397949,
      "learning_rate": 0.00019906451612903227,
      "loss": 1.5731,
      "step": 30
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.27714163064956665,
      "learning_rate": 0.00019874193548387098,
      "loss": 1.508,
      "step": 40
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.1314336508512497,
      "learning_rate": 0.00019841935483870968,
      "loss": 1.4705,
      "step": 50
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.1689644455909729,
      "learning_rate": 0.0001980967741935484,
      "loss": 1.4734,
      "step": 60
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.14989492297172546,
      "learning_rate": 0.00019777419354838712,
      "loss": 1.4617,
      "step": 70
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.6179828643798828,
      "learning_rate": 0.00019745161290322583,
      "loss": 1.4413,
      "step": 80
    },
    {
      "epoch": 1.44,
      "grad_norm": 2.594003677368164,
      "learning_rate": 0.0001971290322580645,
      "loss": 1.353,
      "step": 90
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.08125967532396317,
      "learning_rate": 0.00019680645161290324,
      "loss": 1.3227,
      "step": 100
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.09471185505390167,
      "learning_rate": 0.00019648387096774195,
      "loss": 1.3214,
      "step": 110
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.10813447833061218,
      "learning_rate": 0.00019616129032258065,
      "loss": 1.3236,
      "step": 120
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.07840854674577713,
      "learning_rate": 0.0001958387096774194,
      "loss": 1.3222,
      "step": 130
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.08169138431549072,
      "learning_rate": 0.00019551612903225807,
      "loss": 1.3196,
      "step": 140
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.15425510704517365,
      "learning_rate": 0.00019519354838709677,
      "loss": 1.3191,
      "step": 150
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.08724704384803772,
      "learning_rate": 0.0001948709677419355,
      "loss": 1.3223,
      "step": 160
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.1369275003671646,
      "learning_rate": 0.0001945483870967742,
      "loss": 1.3209,
      "step": 170
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.12385846674442291,
      "learning_rate": 0.0001942258064516129,
      "loss": 1.3211,
      "step": 180
    },
    {
      "epoch": 3.04,
      "grad_norm": 0.1191711276769638,
      "learning_rate": 0.00019390322580645162,
      "loss": 1.3225,
      "step": 190
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.0959768146276474,
      "learning_rate": 0.00019358064516129033,
      "loss": 1.3192,
      "step": 200
    },
    {
      "epoch": 3.36,
      "grad_norm": 0.08907590806484222,
      "learning_rate": 0.00019325806451612904,
      "loss": 1.319,
      "step": 210
    },
    {
      "epoch": 3.52,
      "grad_norm": 0.1829214245080948,
      "learning_rate": 0.00019293548387096777,
      "loss": 1.3198,
      "step": 220
    },
    {
      "epoch": 3.68,
      "grad_norm": 0.10891153663396835,
      "learning_rate": 0.00019261290322580645,
      "loss": 1.32,
      "step": 230
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.10339193791151047,
      "learning_rate": 0.00019229032258064516,
      "loss": 1.3191,
      "step": 240
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.0916794016957283,
      "learning_rate": 0.0001919677419354839,
      "loss": 1.3223,
      "step": 250
    },
    {
      "epoch": 4.16,
      "grad_norm": 0.15707476437091827,
      "learning_rate": 0.0001916451612903226,
      "loss": 1.3196,
      "step": 260
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.11287837475538254,
      "learning_rate": 0.0001913225806451613,
      "loss": 1.3168,
      "step": 270
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.19220077991485596,
      "learning_rate": 0.000191,
      "loss": 1.3164,
      "step": 280
    },
    {
      "epoch": 4.64,
      "grad_norm": 0.08514147996902466,
      "learning_rate": 0.00019067741935483871,
      "loss": 1.3193,
      "step": 290
    },
    {
      "epoch": 4.8,
      "grad_norm": 0.12788636982440948,
      "learning_rate": 0.00019035483870967742,
      "loss": 1.3194,
      "step": 300
    },
    {
      "epoch": 4.96,
      "grad_norm": 0.14810492098331451,
      "learning_rate": 0.00019003225806451615,
      "loss": 1.3193,
      "step": 310
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.16166192293167114,
      "learning_rate": 0.00018970967741935486,
      "loss": 1.3183,
      "step": 320
    },
    {
      "epoch": 5.28,
      "grad_norm": 0.23055078089237213,
      "learning_rate": 0.00018938709677419354,
      "loss": 1.3156,
      "step": 330
    },
    {
      "epoch": 5.44,
      "grad_norm": 0.12718935310840607,
      "learning_rate": 0.00018906451612903227,
      "loss": 1.3163,
      "step": 340
    },
    {
      "epoch": 5.6,
      "grad_norm": 0.20426005125045776,
      "learning_rate": 0.00018874193548387098,
      "loss": 1.3157,
      "step": 350
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.11526554077863693,
      "learning_rate": 0.00018841935483870968,
      "loss": 1.3161,
      "step": 360
    },
    {
      "epoch": 5.92,
      "grad_norm": 0.1523907333612442,
      "learning_rate": 0.0001880967741935484,
      "loss": 1.3176,
      "step": 370
    },
    {
      "epoch": 6.08,
      "grad_norm": 0.2060098499059677,
      "learning_rate": 0.0001877741935483871,
      "loss": 1.3142,
      "step": 380
    },
    {
      "epoch": 6.24,
      "grad_norm": 0.24104587733745575,
      "learning_rate": 0.0001874516129032258,
      "loss": 1.3123,
      "step": 390
    },
    {
      "epoch": 6.4,
      "grad_norm": 0.17447544634342194,
      "learning_rate": 0.00018712903225806454,
      "loss": 1.31,
      "step": 400
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 0.1756139099597931,
      "learning_rate": 0.00018680645161290324,
      "loss": 1.3142,
      "step": 410
    },
    {
      "epoch": 6.72,
      "grad_norm": 0.17624247074127197,
      "learning_rate": 0.00018648387096774195,
      "loss": 1.3157,
      "step": 420
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.1455477774143219,
      "learning_rate": 0.00018616129032258065,
      "loss": 1.3155,
      "step": 430
    },
    {
      "epoch": 7.04,
      "grad_norm": 0.21999657154083252,
      "learning_rate": 0.00018583870967741936,
      "loss": 1.3143,
      "step": 440
    },
    {
      "epoch": 7.2,
      "grad_norm": 0.24742542207241058,
      "learning_rate": 0.00018551612903225807,
      "loss": 1.3003,
      "step": 450
    },
    {
      "epoch": 7.36,
      "grad_norm": 0.2892099618911743,
      "learning_rate": 0.0001851935483870968,
      "loss": 1.3064,
      "step": 460
    },
    {
      "epoch": 7.52,
      "grad_norm": 0.2647784650325775,
      "learning_rate": 0.00018487096774193548,
      "loss": 1.3058,
      "step": 470
    },
    {
      "epoch": 7.68,
      "grad_norm": 0.2782827913761139,
      "learning_rate": 0.0001845483870967742,
      "loss": 1.3097,
      "step": 480
    },
    {
      "epoch": 7.84,
      "grad_norm": 0.17432355880737305,
      "learning_rate": 0.00018422580645161292,
      "loss": 1.3141,
      "step": 490
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.20223839581012726,
      "learning_rate": 0.00018390322580645163,
      "loss": 1.3125,
      "step": 500
    },
    {
      "epoch": 8.16,
      "grad_norm": 0.39165350794792175,
      "learning_rate": 0.00018358064516129033,
      "loss": 1.2921,
      "step": 510
    },
    {
      "epoch": 8.32,
      "grad_norm": 0.35312631726264954,
      "learning_rate": 0.00018325806451612904,
      "loss": 1.2952,
      "step": 520
    },
    {
      "epoch": 8.48,
      "grad_norm": 0.34015020728111267,
      "learning_rate": 0.00018293548387096774,
      "loss": 1.297,
      "step": 530
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.3401159346103668,
      "learning_rate": 0.00018261290322580648,
      "loss": 1.298,
      "step": 540
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.3697248697280884,
      "learning_rate": 0.00018229032258064518,
      "loss": 1.3025,
      "step": 550
    },
    {
      "epoch": 8.96,
      "grad_norm": 0.31431111693382263,
      "learning_rate": 0.00018196774193548386,
      "loss": 1.3042,
      "step": 560
    },
    {
      "epoch": 9.12,
      "grad_norm": 0.5856003761291504,
      "learning_rate": 0.0001816451612903226,
      "loss": 1.2838,
      "step": 570
    },
    {
      "epoch": 9.28,
      "grad_norm": 0.5546128749847412,
      "learning_rate": 0.0001813225806451613,
      "loss": 1.2766,
      "step": 580
    },
    {
      "epoch": 9.44,
      "grad_norm": 0.5272495150566101,
      "learning_rate": 0.000181,
      "loss": 1.2825,
      "step": 590
    },
    {
      "epoch": 9.6,
      "grad_norm": 0.37607982754707336,
      "learning_rate": 0.00018067741935483874,
      "loss": 1.2855,
      "step": 600
    },
    {
      "epoch": 9.76,
      "grad_norm": 0.4203632175922394,
      "learning_rate": 0.00018035483870967742,
      "loss": 1.2905,
      "step": 610
    },
    {
      "epoch": 9.92,
      "grad_norm": 0.5385815501213074,
      "learning_rate": 0.00018003225806451613,
      "loss": 1.2862,
      "step": 620
    },
    {
      "epoch": 10.08,
      "grad_norm": 0.6651461124420166,
      "learning_rate": 0.00017970967741935486,
      "loss": 1.2753,
      "step": 630
    },
    {
      "epoch": 10.24,
      "grad_norm": 0.6366870999336243,
      "learning_rate": 0.00017938709677419357,
      "loss": 1.2522,
      "step": 640
    },
    {
      "epoch": 10.4,
      "grad_norm": 0.662529468536377,
      "learning_rate": 0.00017906451612903225,
      "loss": 1.2568,
      "step": 650
    },
    {
      "epoch": 10.56,
      "grad_norm": 0.569629967212677,
      "learning_rate": 0.00017874193548387098,
      "loss": 1.2596,
      "step": 660
    },
    {
      "epoch": 10.72,
      "grad_norm": 0.5770915150642395,
      "learning_rate": 0.00017841935483870969,
      "loss": 1.2699,
      "step": 670
    },
    {
      "epoch": 10.88,
      "grad_norm": 0.5522739887237549,
      "learning_rate": 0.0001780967741935484,
      "loss": 1.2682,
      "step": 680
    },
    {
      "epoch": 11.04,
      "grad_norm": 0.7272565960884094,
      "learning_rate": 0.00017777419354838712,
      "loss": 1.2536,
      "step": 690
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.9644076824188232,
      "learning_rate": 0.0001774516129032258,
      "loss": 1.22,
      "step": 700
    },
    {
      "epoch": 11.36,
      "grad_norm": 0.7902132272720337,
      "learning_rate": 0.0001771290322580645,
      "loss": 1.2189,
      "step": 710
    },
    {
      "epoch": 11.52,
      "grad_norm": 0.7859879732131958,
      "learning_rate": 0.00017680645161290324,
      "loss": 1.231,
      "step": 720
    },
    {
      "epoch": 11.68,
      "grad_norm": 0.7887389659881592,
      "learning_rate": 0.00017648387096774195,
      "loss": 1.2358,
      "step": 730
    },
    {
      "epoch": 11.84,
      "grad_norm": 0.7551841139793396,
      "learning_rate": 0.00017616129032258066,
      "loss": 1.2444,
      "step": 740
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.6560466885566711,
      "learning_rate": 0.00017583870967741936,
      "loss": 1.2326,
      "step": 750
    },
    {
      "epoch": 12.16,
      "grad_norm": 1.0130267143249512,
      "learning_rate": 0.00017551612903225807,
      "loss": 1.167,
      "step": 760
    },
    {
      "epoch": 12.32,
      "grad_norm": 0.8946852087974548,
      "learning_rate": 0.00017519354838709677,
      "loss": 1.1776,
      "step": 770
    },
    {
      "epoch": 12.48,
      "grad_norm": 1.0538991689682007,
      "learning_rate": 0.0001748709677419355,
      "loss": 1.1792,
      "step": 780
    },
    {
      "epoch": 12.64,
      "grad_norm": 1.0721157789230347,
      "learning_rate": 0.00017454838709677421,
      "loss": 1.1915,
      "step": 790
    },
    {
      "epoch": 12.8,
      "grad_norm": 0.8850572109222412,
      "learning_rate": 0.0001742258064516129,
      "loss": 1.193,
      "step": 800
    },
    {
      "epoch": 12.96,
      "grad_norm": 1.0722297430038452,
      "learning_rate": 0.00017390322580645163,
      "loss": 1.1925,
      "step": 810
    },
    {
      "epoch": 13.12,
      "grad_norm": 1.2234816551208496,
      "learning_rate": 0.00017358064516129033,
      "loss": 1.1253,
      "step": 820
    },
    {
      "epoch": 13.28,
      "grad_norm": 1.289270043373108,
      "learning_rate": 0.00017325806451612904,
      "loss": 1.1006,
      "step": 830
    },
    {
      "epoch": 13.44,
      "grad_norm": 1.1833810806274414,
      "learning_rate": 0.00017293548387096775,
      "loss": 1.102,
      "step": 840
    },
    {
      "epoch": 13.6,
      "grad_norm": 1.2299394607543945,
      "learning_rate": 0.00017261290322580645,
      "loss": 1.1233,
      "step": 850
    },
    {
      "epoch": 13.76,
      "grad_norm": 1.1658111810684204,
      "learning_rate": 0.00017229032258064516,
      "loss": 1.1253,
      "step": 860
    },
    {
      "epoch": 13.92,
      "grad_norm": 1.1958309412002563,
      "learning_rate": 0.0001719677419354839,
      "loss": 1.1253,
      "step": 870
    },
    {
      "epoch": 14.08,
      "grad_norm": 1.418826699256897,
      "learning_rate": 0.0001716451612903226,
      "loss": 1.0708,
      "step": 880
    },
    {
      "epoch": 14.24,
      "grad_norm": 1.5377317667007446,
      "learning_rate": 0.0001713225806451613,
      "loss": 1.0089,
      "step": 890
    },
    {
      "epoch": 14.4,
      "grad_norm": 1.5082684755325317,
      "learning_rate": 0.000171,
      "loss": 1.0263,
      "step": 900
    },
    {
      "epoch": 14.56,
      "grad_norm": 1.5111769437789917,
      "learning_rate": 0.00017067741935483872,
      "loss": 1.0347,
      "step": 910
    },
    {
      "epoch": 14.72,
      "grad_norm": 1.4334949254989624,
      "learning_rate": 0.00017035483870967742,
      "loss": 1.037,
      "step": 920
    },
    {
      "epoch": 14.88,
      "grad_norm": 2.889636278152466,
      "learning_rate": 0.00017003225806451616,
      "loss": 1.0348,
      "step": 930
    },
    {
      "epoch": 15.04,
      "grad_norm": 1.4357563257217407,
      "learning_rate": 0.00016970967741935483,
      "loss": 1.0162,
      "step": 940
    },
    {
      "epoch": 15.2,
      "grad_norm": 1.679137110710144,
      "learning_rate": 0.00016938709677419357,
      "loss": 0.9209,
      "step": 950
    },
    {
      "epoch": 15.36,
      "grad_norm": 1.6809172630310059,
      "learning_rate": 0.00016906451612903227,
      "loss": 0.9224,
      "step": 960
    },
    {
      "epoch": 15.52,
      "grad_norm": 2.0711913108825684,
      "learning_rate": 0.00016874193548387098,
      "loss": 0.9374,
      "step": 970
    },
    {
      "epoch": 15.68,
      "grad_norm": 1.6750158071517944,
      "learning_rate": 0.00016841935483870969,
      "loss": 0.9386,
      "step": 980
    },
    {
      "epoch": 15.84,
      "grad_norm": 1.7292613983154297,
      "learning_rate": 0.0001680967741935484,
      "loss": 0.949,
      "step": 990
    },
    {
      "epoch": 16.0,
      "grad_norm": 1.7566314935684204,
      "learning_rate": 0.0001677741935483871,
      "loss": 0.9549,
      "step": 1000
    },
    {
      "epoch": 16.16,
      "grad_norm": 1.9804630279541016,
      "learning_rate": 0.00016745161290322583,
      "loss": 0.8478,
      "step": 1010
    },
    {
      "epoch": 16.32,
      "grad_norm": 1.8297134637832642,
      "learning_rate": 0.00016712903225806454,
      "loss": 0.813,
      "step": 1020
    },
    {
      "epoch": 16.48,
      "grad_norm": 2.0066633224487305,
      "learning_rate": 0.00016680645161290322,
      "loss": 0.835,
      "step": 1030
    },
    {
      "epoch": 16.64,
      "grad_norm": 1.9635342359542847,
      "learning_rate": 0.00016648387096774195,
      "loss": 0.8383,
      "step": 1040
    },
    {
      "epoch": 16.8,
      "grad_norm": 1.8945978879928589,
      "learning_rate": 0.00016616129032258066,
      "loss": 0.8675,
      "step": 1050
    },
    {
      "epoch": 16.96,
      "grad_norm": 1.917394757270813,
      "learning_rate": 0.00016583870967741936,
      "loss": 0.8529,
      "step": 1060
    },
    {
      "epoch": 17.12,
      "grad_norm": 1.9029055833816528,
      "learning_rate": 0.00016551612903225807,
      "loss": 0.7593,
      "step": 1070
    },
    {
      "epoch": 17.28,
      "grad_norm": 2.5159807205200195,
      "learning_rate": 0.00016519354838709678,
      "loss": 0.7418,
      "step": 1080
    },
    {
      "epoch": 17.44,
      "grad_norm": 1.9958096742630005,
      "learning_rate": 0.00016487096774193548,
      "loss": 0.7489,
      "step": 1090
    },
    {
      "epoch": 17.6,
      "grad_norm": 2.0814130306243896,
      "learning_rate": 0.00016454838709677421,
      "loss": 0.7502,
      "step": 1100
    },
    {
      "epoch": 17.76,
      "grad_norm": 2.4516828060150146,
      "learning_rate": 0.00016422580645161292,
      "loss": 0.7607,
      "step": 1110
    },
    {
      "epoch": 17.92,
      "grad_norm": 2.1293084621429443,
      "learning_rate": 0.0001639032258064516,
      "loss": 0.7671,
      "step": 1120
    },
    {
      "epoch": 18.08,
      "grad_norm": 2.431506872177124,
      "learning_rate": 0.00016358064516129033,
      "loss": 0.7117,
      "step": 1130
    },
    {
      "epoch": 18.24,
      "grad_norm": 2.156888723373413,
      "learning_rate": 0.00016325806451612904,
      "loss": 0.6516,
      "step": 1140
    },
    {
      "epoch": 18.4,
      "grad_norm": 2.404205322265625,
      "learning_rate": 0.00016293548387096775,
      "loss": 0.6595,
      "step": 1150
    },
    {
      "epoch": 18.56,
      "grad_norm": 2.5965211391448975,
      "learning_rate": 0.00016261290322580648,
      "loss": 0.6738,
      "step": 1160
    },
    {
      "epoch": 18.72,
      "grad_norm": 2.4526731967926025,
      "learning_rate": 0.00016229032258064516,
      "loss": 0.6581,
      "step": 1170
    },
    {
      "epoch": 18.88,
      "grad_norm": 2.5058436393737793,
      "learning_rate": 0.00016196774193548386,
      "loss": 0.6821,
      "step": 1180
    },
    {
      "epoch": 19.04,
      "grad_norm": 2.1841073036193848,
      "learning_rate": 0.0001616451612903226,
      "loss": 0.6644,
      "step": 1190
    },
    {
      "epoch": 19.2,
      "grad_norm": 2.2386417388916016,
      "learning_rate": 0.0001613225806451613,
      "loss": 0.572,
      "step": 1200
    },
    {
      "epoch": 19.36,
      "grad_norm": 2.338043689727783,
      "learning_rate": 0.000161,
      "loss": 0.5822,
      "step": 1210
    },
    {
      "epoch": 19.52,
      "grad_norm": 2.37507700920105,
      "learning_rate": 0.00016067741935483872,
      "loss": 0.5925,
      "step": 1220
    },
    {
      "epoch": 19.68,
      "grad_norm": 2.571474552154541,
      "learning_rate": 0.00016035483870967742,
      "loss": 0.6041,
      "step": 1230
    },
    {
      "epoch": 19.84,
      "grad_norm": 2.4885830879211426,
      "learning_rate": 0.00016003225806451613,
      "loss": 0.6237,
      "step": 1240
    },
    {
      "epoch": 20.0,
      "grad_norm": 2.9572579860687256,
      "learning_rate": 0.00015970967741935486,
      "loss": 0.6022,
      "step": 1250
    },
    {
      "epoch": 20.16,
      "grad_norm": 2.3328864574432373,
      "learning_rate": 0.00015938709677419354,
      "loss": 0.5042,
      "step": 1260
    },
    {
      "epoch": 20.32,
      "grad_norm": 2.6092636585235596,
      "learning_rate": 0.00015906451612903225,
      "loss": 0.5112,
      "step": 1270
    },
    {
      "epoch": 20.48,
      "grad_norm": 2.577423572540283,
      "learning_rate": 0.00015874193548387098,
      "loss": 0.5242,
      "step": 1280
    },
    {
      "epoch": 20.64,
      "grad_norm": 2.496828556060791,
      "learning_rate": 0.0001584193548387097,
      "loss": 0.5326,
      "step": 1290
    },
    {
      "epoch": 20.8,
      "grad_norm": 2.479008197784424,
      "learning_rate": 0.0001580967741935484,
      "loss": 0.5427,
      "step": 1300
    },
    {
      "epoch": 20.96,
      "grad_norm": 3.1138076782226562,
      "learning_rate": 0.0001577741935483871,
      "loss": 0.5466,
      "step": 1310
    },
    {
      "epoch": 21.12,
      "grad_norm": 2.6781411170959473,
      "learning_rate": 0.0001574516129032258,
      "loss": 0.4764,
      "step": 1320
    },
    {
      "epoch": 21.28,
      "grad_norm": 3.1711771488189697,
      "learning_rate": 0.0001571290322580645,
      "loss": 0.4623,
      "step": 1330
    },
    {
      "epoch": 21.44,
      "grad_norm": 2.4297163486480713,
      "learning_rate": 0.00015680645161290325,
      "loss": 0.4542,
      "step": 1340
    },
    {
      "epoch": 21.6,
      "grad_norm": 2.8714449405670166,
      "learning_rate": 0.00015648387096774195,
      "loss": 0.4752,
      "step": 1350
    },
    {
      "epoch": 21.76,
      "grad_norm": 2.6193573474884033,
      "learning_rate": 0.00015616129032258066,
      "loss": 0.484,
      "step": 1360
    },
    {
      "epoch": 21.92,
      "grad_norm": 3.0310218334198,
      "learning_rate": 0.00015583870967741936,
      "loss": 0.4792,
      "step": 1370
    },
    {
      "epoch": 22.08,
      "grad_norm": 2.4924027919769287,
      "learning_rate": 0.00015551612903225807,
      "loss": 0.4529,
      "step": 1380
    },
    {
      "epoch": 22.24,
      "grad_norm": 2.0856494903564453,
      "learning_rate": 0.00015519354838709678,
      "loss": 0.4077,
      "step": 1390
    },
    {
      "epoch": 22.4,
      "grad_norm": 2.794166088104248,
      "learning_rate": 0.0001548709677419355,
      "loss": 0.4142,
      "step": 1400
    },
    {
      "epoch": 22.56,
      "grad_norm": 2.5052692890167236,
      "learning_rate": 0.0001545483870967742,
      "loss": 0.4209,
      "step": 1410
    },
    {
      "epoch": 22.72,
      "grad_norm": 2.6941845417022705,
      "learning_rate": 0.00015422580645161292,
      "loss": 0.4299,
      "step": 1420
    },
    {
      "epoch": 22.88,
      "grad_norm": 3.0589232444763184,
      "learning_rate": 0.00015390322580645163,
      "loss": 0.4398,
      "step": 1430
    },
    {
      "epoch": 23.04,
      "grad_norm": 2.647005558013916,
      "learning_rate": 0.00015358064516129033,
      "loss": 0.4283,
      "step": 1440
    },
    {
      "epoch": 23.2,
      "grad_norm": 2.0591695308685303,
      "learning_rate": 0.00015325806451612904,
      "loss": 0.3667,
      "step": 1450
    },
    {
      "epoch": 23.36,
      "grad_norm": 2.557981014251709,
      "learning_rate": 0.00015293548387096775,
      "loss": 0.3748,
      "step": 1460
    },
    {
      "epoch": 23.52,
      "grad_norm": 2.7171032428741455,
      "learning_rate": 0.00015261290322580645,
      "loss": 0.3771,
      "step": 1470
    },
    {
      "epoch": 23.68,
      "grad_norm": 2.8117640018463135,
      "learning_rate": 0.00015229032258064516,
      "loss": 0.3857,
      "step": 1480
    },
    {
      "epoch": 23.84,
      "grad_norm": 2.635938882827759,
      "learning_rate": 0.0001519677419354839,
      "loss": 0.3955,
      "step": 1490
    },
    {
      "epoch": 24.0,
      "grad_norm": 2.530909299850464,
      "learning_rate": 0.00015164516129032257,
      "loss": 0.4003,
      "step": 1500
    },
    {
      "epoch": 24.16,
      "grad_norm": 2.716811418533325,
      "learning_rate": 0.0001513225806451613,
      "loss": 0.3257,
      "step": 1510
    },
    {
      "epoch": 24.32,
      "grad_norm": 2.6447346210479736,
      "learning_rate": 0.000151,
      "loss": 0.3342,
      "step": 1520
    },
    {
      "epoch": 24.48,
      "grad_norm": 2.419239044189453,
      "learning_rate": 0.00015067741935483872,
      "loss": 0.3443,
      "step": 1530
    },
    {
      "epoch": 24.64,
      "grad_norm": 2.9881439208984375,
      "learning_rate": 0.00015035483870967742,
      "loss": 0.351,
      "step": 1540
    },
    {
      "epoch": 24.8,
      "grad_norm": 2.9499566555023193,
      "learning_rate": 0.00015003225806451613,
      "loss": 0.3626,
      "step": 1550
    },
    {
      "epoch": 24.96,
      "grad_norm": 3.1362252235412598,
      "learning_rate": 0.00014970967741935484,
      "loss": 0.3572,
      "step": 1560
    },
    {
      "epoch": 25.12,
      "grad_norm": 2.9108505249023438,
      "learning_rate": 0.00014938709677419357,
      "loss": 0.3141,
      "step": 1570
    },
    {
      "epoch": 25.28,
      "grad_norm": 2.4450314044952393,
      "learning_rate": 0.00014906451612903228,
      "loss": 0.3062,
      "step": 1580
    },
    {
      "epoch": 25.44,
      "grad_norm": 3.3903231620788574,
      "learning_rate": 0.00014874193548387095,
      "loss": 0.3137,
      "step": 1590
    },
    {
      "epoch": 25.6,
      "grad_norm": 2.7393417358398438,
      "learning_rate": 0.0001484193548387097,
      "loss": 0.3276,
      "step": 1600
    },
    {
      "epoch": 25.76,
      "grad_norm": 2.811481475830078,
      "learning_rate": 0.0001480967741935484,
      "loss": 0.3337,
      "step": 1610
    },
    {
      "epoch": 25.92,
      "grad_norm": 2.320997953414917,
      "learning_rate": 0.0001477741935483871,
      "loss": 0.3257,
      "step": 1620
    },
    {
      "epoch": 26.08,
      "grad_norm": 2.6050562858581543,
      "learning_rate": 0.00014745161290322583,
      "loss": 0.3058,
      "step": 1630
    },
    {
      "epoch": 26.24,
      "grad_norm": 2.153953790664673,
      "learning_rate": 0.0001471290322580645,
      "loss": 0.2795,
      "step": 1640
    },
    {
      "epoch": 26.4,
      "grad_norm": 2.5632269382476807,
      "learning_rate": 0.00014680645161290322,
      "loss": 0.284,
      "step": 1650
    },
    {
      "epoch": 26.56,
      "grad_norm": 2.355788230895996,
      "learning_rate": 0.00014648387096774195,
      "loss": 0.2955,
      "step": 1660
    },
    {
      "epoch": 26.72,
      "grad_norm": 2.563262462615967,
      "learning_rate": 0.00014616129032258066,
      "loss": 0.2965,
      "step": 1670
    },
    {
      "epoch": 26.88,
      "grad_norm": 2.3319389820098877,
      "learning_rate": 0.00014583870967741936,
      "loss": 0.3023,
      "step": 1680
    },
    {
      "epoch": 27.04,
      "grad_norm": 1.6385563611984253,
      "learning_rate": 0.00014551612903225807,
      "loss": 0.295,
      "step": 1690
    },
    {
      "epoch": 27.2,
      "grad_norm": 2.8859646320343018,
      "learning_rate": 0.00014519354838709678,
      "loss": 0.2576,
      "step": 1700
    },
    {
      "epoch": 27.36,
      "grad_norm": 2.1979823112487793,
      "learning_rate": 0.00014487096774193548,
      "loss": 0.2686,
      "step": 1710
    },
    {
      "epoch": 27.52,
      "grad_norm": 2.4276225566864014,
      "learning_rate": 0.00014454838709677422,
      "loss": 0.2667,
      "step": 1720
    },
    {
      "epoch": 27.68,
      "grad_norm": 2.81669282913208,
      "learning_rate": 0.0001442258064516129,
      "loss": 0.2726,
      "step": 1730
    },
    {
      "epoch": 27.84,
      "grad_norm": 3.1404976844787598,
      "learning_rate": 0.0001439032258064516,
      "loss": 0.2855,
      "step": 1740
    },
    {
      "epoch": 28.0,
      "grad_norm": 3.198063373565674,
      "learning_rate": 0.00014358064516129034,
      "loss": 0.2911,
      "step": 1750
    },
    {
      "epoch": 28.16,
      "grad_norm": 2.2350854873657227,
      "learning_rate": 0.00014325806451612904,
      "loss": 0.2378,
      "step": 1760
    },
    {
      "epoch": 28.32,
      "grad_norm": 2.4164915084838867,
      "learning_rate": 0.00014293548387096775,
      "loss": 0.2397,
      "step": 1770
    },
    {
      "epoch": 28.48,
      "grad_norm": 2.8860878944396973,
      "learning_rate": 0.00014261290322580645,
      "loss": 0.2505,
      "step": 1780
    },
    {
      "epoch": 28.64,
      "grad_norm": 2.7349557876586914,
      "learning_rate": 0.00014229032258064516,
      "loss": 0.262,
      "step": 1790
    },
    {
      "epoch": 28.8,
      "grad_norm": 2.604602813720703,
      "learning_rate": 0.00014196774193548387,
      "loss": 0.2639,
      "step": 1800
    },
    {
      "epoch": 28.96,
      "grad_norm": 2.5355823040008545,
      "learning_rate": 0.0001416451612903226,
      "loss": 0.2666,
      "step": 1810
    },
    {
      "epoch": 29.12,
      "grad_norm": 2.2154204845428467,
      "learning_rate": 0.0001413225806451613,
      "loss": 0.233,
      "step": 1820
    },
    {
      "epoch": 29.28,
      "grad_norm": 2.8737149238586426,
      "learning_rate": 0.000141,
      "loss": 0.2245,
      "step": 1830
    },
    {
      "epoch": 29.44,
      "grad_norm": 2.359257221221924,
      "learning_rate": 0.00014067741935483872,
      "loss": 0.2314,
      "step": 1840
    },
    {
      "epoch": 29.6,
      "grad_norm": 2.4809696674346924,
      "learning_rate": 0.00014035483870967742,
      "loss": 0.24,
      "step": 1850
    },
    {
      "epoch": 29.76,
      "grad_norm": 2.770080804824829,
      "learning_rate": 0.00014003225806451613,
      "loss": 0.2471,
      "step": 1860
    },
    {
      "epoch": 29.92,
      "grad_norm": 2.3980712890625,
      "learning_rate": 0.00013970967741935486,
      "loss": 0.2459,
      "step": 1870
    },
    {
      "epoch": 30.08,
      "grad_norm": 2.22361421585083,
      "learning_rate": 0.00013938709677419354,
      "loss": 0.2293,
      "step": 1880
    },
    {
      "epoch": 30.24,
      "grad_norm": 2.23351788520813,
      "learning_rate": 0.00013906451612903228,
      "loss": 0.2161,
      "step": 1890
    },
    {
      "epoch": 30.4,
      "grad_norm": 2.324384927749634,
      "learning_rate": 0.00013874193548387098,
      "loss": 0.2181,
      "step": 1900
    },
    {
      "epoch": 30.56,
      "grad_norm": 2.4072265625,
      "learning_rate": 0.0001384193548387097,
      "loss": 0.2252,
      "step": 1910
    },
    {
      "epoch": 30.72,
      "grad_norm": 2.428682565689087,
      "learning_rate": 0.0001380967741935484,
      "loss": 0.2332,
      "step": 1920
    },
    {
      "epoch": 30.88,
      "grad_norm": 2.5889081954956055,
      "learning_rate": 0.0001377741935483871,
      "loss": 0.234,
      "step": 1930
    },
    {
      "epoch": 31.04,
      "grad_norm": 2.178375005722046,
      "learning_rate": 0.0001374516129032258,
      "loss": 0.2221,
      "step": 1940
    },
    {
      "epoch": 31.2,
      "grad_norm": 2.2690091133117676,
      "learning_rate": 0.00013712903225806451,
      "loss": 0.1978,
      "step": 1950
    },
    {
      "epoch": 31.36,
      "grad_norm": 2.445535659790039,
      "learning_rate": 0.00013680645161290325,
      "loss": 0.2075,
      "step": 1960
    },
    {
      "epoch": 31.52,
      "grad_norm": 2.297614097595215,
      "learning_rate": 0.00013648387096774193,
      "loss": 0.2105,
      "step": 1970
    },
    {
      "epoch": 31.68,
      "grad_norm": 2.456094980239868,
      "learning_rate": 0.00013616129032258066,
      "loss": 0.2183,
      "step": 1980
    },
    {
      "epoch": 31.84,
      "grad_norm": 2.9073026180267334,
      "learning_rate": 0.00013583870967741937,
      "loss": 0.2219,
      "step": 1990
    },
    {
      "epoch": 32.0,
      "grad_norm": 2.5645761489868164,
      "learning_rate": 0.00013551612903225807,
      "loss": 0.2244,
      "step": 2000
    },
    {
      "epoch": 32.16,
      "grad_norm": 1.9080466032028198,
      "learning_rate": 0.00013519354838709678,
      "loss": 0.1852,
      "step": 2010
    },
    {
      "epoch": 32.32,
      "grad_norm": 2.4245429039001465,
      "learning_rate": 0.00013487096774193548,
      "loss": 0.1941,
      "step": 2020
    },
    {
      "epoch": 32.48,
      "grad_norm": 2.873189687728882,
      "learning_rate": 0.0001345483870967742,
      "loss": 0.198,
      "step": 2030
    },
    {
      "epoch": 32.64,
      "grad_norm": 3.123337984085083,
      "learning_rate": 0.00013422580645161292,
      "loss": 0.203,
      "step": 2040
    },
    {
      "epoch": 32.8,
      "grad_norm": 2.592850923538208,
      "learning_rate": 0.00013390322580645163,
      "loss": 0.2095,
      "step": 2050
    },
    {
      "epoch": 32.96,
      "grad_norm": 2.4933552742004395,
      "learning_rate": 0.0001335806451612903,
      "loss": 0.2053,
      "step": 2060
    },
    {
      "epoch": 33.12,
      "grad_norm": 1.862947940826416,
      "learning_rate": 0.00013325806451612904,
      "loss": 0.1886,
      "step": 2070
    },
    {
      "epoch": 33.28,
      "grad_norm": 2.4500813484191895,
      "learning_rate": 0.00013293548387096775,
      "loss": 0.1834,
      "step": 2080
    },
    {
      "epoch": 33.44,
      "grad_norm": 2.4213106632232666,
      "learning_rate": 0.00013261290322580645,
      "loss": 0.1916,
      "step": 2090
    },
    {
      "epoch": 33.6,
      "grad_norm": 2.4137024879455566,
      "learning_rate": 0.0001322903225806452,
      "loss": 0.1889,
      "step": 2100
    },
    {
      "epoch": 33.76,
      "grad_norm": 1.9225903749465942,
      "learning_rate": 0.00013196774193548387,
      "loss": 0.1932,
      "step": 2110
    },
    {
      "epoch": 33.92,
      "grad_norm": 2.627751350402832,
      "learning_rate": 0.00013164516129032257,
      "loss": 0.1994,
      "step": 2120
    },
    {
      "epoch": 34.08,
      "grad_norm": 2.1216776371002197,
      "learning_rate": 0.0001313225806451613,
      "loss": 0.1854,
      "step": 2130
    },
    {
      "epoch": 34.24,
      "grad_norm": 1.79988431930542,
      "learning_rate": 0.000131,
      "loss": 0.1764,
      "step": 2140
    },
    {
      "epoch": 34.4,
      "grad_norm": 2.725584030151367,
      "learning_rate": 0.00013067741935483872,
      "loss": 0.1724,
      "step": 2150
    },
    {
      "epoch": 34.56,
      "grad_norm": 3.0872607231140137,
      "learning_rate": 0.00013035483870967743,
      "loss": 0.1864,
      "step": 2160
    },
    {
      "epoch": 34.72,
      "grad_norm": 2.670351982116699,
      "learning_rate": 0.00013003225806451613,
      "loss": 0.189,
      "step": 2170
    },
    {
      "epoch": 34.88,
      "grad_norm": 2.467918872833252,
      "learning_rate": 0.00012970967741935484,
      "loss": 0.1892,
      "step": 2180
    },
    {
      "epoch": 35.04,
      "grad_norm": 1.591225504875183,
      "learning_rate": 0.00012938709677419357,
      "loss": 0.1862,
      "step": 2190
    },
    {
      "epoch": 35.2,
      "grad_norm": 2.1949949264526367,
      "learning_rate": 0.00012906451612903225,
      "loss": 0.1594,
      "step": 2200
    },
    {
      "epoch": 35.36,
      "grad_norm": 2.2106893062591553,
      "learning_rate": 0.00012874193548387096,
      "loss": 0.1699,
      "step": 2210
    },
    {
      "epoch": 35.52,
      "grad_norm": 1.835793375968933,
      "learning_rate": 0.0001284193548387097,
      "loss": 0.1699,
      "step": 2220
    },
    {
      "epoch": 35.68,
      "grad_norm": 1.770975947380066,
      "learning_rate": 0.0001280967741935484,
      "loss": 0.1786,
      "step": 2230
    },
    {
      "epoch": 35.84,
      "grad_norm": 2.7737338542938232,
      "learning_rate": 0.0001277741935483871,
      "loss": 0.1799,
      "step": 2240
    },
    {
      "epoch": 36.0,
      "grad_norm": 2.317680835723877,
      "learning_rate": 0.0001274516129032258,
      "loss": 0.1807,
      "step": 2250
    },
    {
      "epoch": 36.16,
      "grad_norm": 2.137967586517334,
      "learning_rate": 0.00012712903225806451,
      "loss": 0.155,
      "step": 2260
    },
    {
      "epoch": 36.32,
      "grad_norm": 1.764809012413025,
      "learning_rate": 0.00012680645161290322,
      "loss": 0.1602,
      "step": 2270
    },
    {
      "epoch": 36.48,
      "grad_norm": 2.4306418895721436,
      "learning_rate": 0.00012648387096774195,
      "loss": 0.1644,
      "step": 2280
    },
    {
      "epoch": 36.64,
      "grad_norm": 1.7215269804000854,
      "learning_rate": 0.00012616129032258066,
      "loss": 0.1675,
      "step": 2290
    },
    {
      "epoch": 36.8,
      "grad_norm": 2.1169686317443848,
      "learning_rate": 0.00012583870967741937,
      "loss": 0.169,
      "step": 2300
    },
    {
      "epoch": 36.96,
      "grad_norm": 2.576965093612671,
      "learning_rate": 0.00012551612903225807,
      "loss": 0.1759,
      "step": 2310
    },
    {
      "epoch": 37.12,
      "grad_norm": 2.349083185195923,
      "learning_rate": 0.00012519354838709678,
      "loss": 0.1569,
      "step": 2320
    },
    {
      "epoch": 37.28,
      "grad_norm": 1.698199987411499,
      "learning_rate": 0.00012487096774193549,
      "loss": 0.1523,
      "step": 2330
    },
    {
      "epoch": 37.44,
      "grad_norm": 2.2408599853515625,
      "learning_rate": 0.00012454838709677422,
      "loss": 0.1579,
      "step": 2340
    },
    {
      "epoch": 37.6,
      "grad_norm": 1.7537211179733276,
      "learning_rate": 0.0001242258064516129,
      "loss": 0.1613,
      "step": 2350
    },
    {
      "epoch": 37.76,
      "grad_norm": 2.2816600799560547,
      "learning_rate": 0.0001239032258064516,
      "loss": 0.1604,
      "step": 2360
    },
    {
      "epoch": 37.92,
      "grad_norm": 2.135903835296631,
      "learning_rate": 0.00012358064516129034,
      "loss": 0.1666,
      "step": 2370
    },
    {
      "epoch": 38.08,
      "grad_norm": 1.4444613456726074,
      "learning_rate": 0.00012325806451612904,
      "loss": 0.1574,
      "step": 2380
    },
    {
      "epoch": 38.24,
      "grad_norm": 2.1274943351745605,
      "learning_rate": 0.00012293548387096775,
      "loss": 0.1497,
      "step": 2390
    },
    {
      "epoch": 38.4,
      "grad_norm": 2.0582194328308105,
      "learning_rate": 0.00012261290322580646,
      "loss": 0.1502,
      "step": 2400
    },
    {
      "epoch": 38.56,
      "grad_norm": 1.99543035030365,
      "learning_rate": 0.00012229032258064516,
      "loss": 0.1527,
      "step": 2410
    },
    {
      "epoch": 38.72,
      "grad_norm": 2.4016642570495605,
      "learning_rate": 0.00012196774193548388,
      "loss": 0.1579,
      "step": 2420
    },
    {
      "epoch": 38.88,
      "grad_norm": 1.889532446861267,
      "learning_rate": 0.0001216451612903226,
      "loss": 0.1581,
      "step": 2430
    },
    {
      "epoch": 39.04,
      "grad_norm": 1.5487650632858276,
      "learning_rate": 0.0001213225806451613,
      "loss": 0.1557,
      "step": 2440
    },
    {
      "epoch": 39.2,
      "grad_norm": 1.7775896787643433,
      "learning_rate": 0.000121,
      "loss": 0.1433,
      "step": 2450
    },
    {
      "epoch": 39.36,
      "grad_norm": 1.629921555519104,
      "learning_rate": 0.00012067741935483872,
      "loss": 0.1477,
      "step": 2460
    },
    {
      "epoch": 39.52,
      "grad_norm": 1.9216187000274658,
      "learning_rate": 0.00012035483870967743,
      "loss": 0.1481,
      "step": 2470
    },
    {
      "epoch": 39.68,
      "grad_norm": 2.4521870613098145,
      "learning_rate": 0.00012003225806451615,
      "loss": 0.1548,
      "step": 2480
    },
    {
      "epoch": 39.84,
      "grad_norm": 2.893139600753784,
      "learning_rate": 0.00011970967741935484,
      "loss": 0.1542,
      "step": 2490
    },
    {
      "epoch": 40.0,
      "grad_norm": 1.7710884809494019,
      "learning_rate": 0.00011938709677419356,
      "loss": 0.1539,
      "step": 2500
    },
    {
      "epoch": 40.16,
      "grad_norm": 2.467780590057373,
      "learning_rate": 0.00011906451612903226,
      "loss": 0.1384,
      "step": 2510
    },
    {
      "epoch": 40.32,
      "grad_norm": 1.692098617553711,
      "learning_rate": 0.00011874193548387098,
      "loss": 0.1398,
      "step": 2520
    },
    {
      "epoch": 40.48,
      "grad_norm": 2.4998574256896973,
      "learning_rate": 0.00011841935483870968,
      "loss": 0.141,
      "step": 2530
    },
    {
      "epoch": 40.64,
      "grad_norm": 2.455094575881958,
      "learning_rate": 0.00011809677419354838,
      "loss": 0.1472,
      "step": 2540
    },
    {
      "epoch": 40.8,
      "grad_norm": 2.2501373291015625,
      "learning_rate": 0.0001177741935483871,
      "loss": 0.1478,
      "step": 2550
    },
    {
      "epoch": 40.96,
      "grad_norm": 1.7352842092514038,
      "learning_rate": 0.00011745161290322582,
      "loss": 0.1523,
      "step": 2560
    },
    {
      "epoch": 41.12,
      "grad_norm": 1.6160428524017334,
      "learning_rate": 0.00011712903225806453,
      "loss": 0.1383,
      "step": 2570
    },
    {
      "epoch": 41.28,
      "grad_norm": 1.7578778266906738,
      "learning_rate": 0.00011680645161290322,
      "loss": 0.1344,
      "step": 2580
    },
    {
      "epoch": 41.44,
      "grad_norm": 1.668540120124817,
      "learning_rate": 0.00011648387096774194,
      "loss": 0.137,
      "step": 2590
    },
    {
      "epoch": 41.6,
      "grad_norm": 1.9453370571136475,
      "learning_rate": 0.00011616129032258065,
      "loss": 0.1445,
      "step": 2600
    },
    {
      "epoch": 41.76,
      "grad_norm": 2.1329894065856934,
      "learning_rate": 0.00011583870967741937,
      "loss": 0.1401,
      "step": 2610
    },
    {
      "epoch": 41.92,
      "grad_norm": 2.9534623622894287,
      "learning_rate": 0.00011551612903225807,
      "loss": 0.1451,
      "step": 2620
    },
    {
      "epoch": 42.08,
      "grad_norm": 1.3531379699707031,
      "learning_rate": 0.00011519354838709677,
      "loss": 0.1375,
      "step": 2630
    },
    {
      "epoch": 42.24,
      "grad_norm": 2.283677816390991,
      "learning_rate": 0.00011487096774193549,
      "loss": 0.1307,
      "step": 2640
    },
    {
      "epoch": 42.4,
      "grad_norm": 1.6229647397994995,
      "learning_rate": 0.0001145483870967742,
      "loss": 0.1313,
      "step": 2650
    },
    {
      "epoch": 42.56,
      "grad_norm": 1.5721402168273926,
      "learning_rate": 0.00011422580645161291,
      "loss": 0.1322,
      "step": 2660
    },
    {
      "epoch": 42.72,
      "grad_norm": 1.6071618795394897,
      "learning_rate": 0.0001139032258064516,
      "loss": 0.1376,
      "step": 2670
    },
    {
      "epoch": 42.88,
      "grad_norm": 1.4502424001693726,
      "learning_rate": 0.00011358064516129032,
      "loss": 0.1368,
      "step": 2680
    },
    {
      "epoch": 43.04,
      "grad_norm": 1.2102515697479248,
      "learning_rate": 0.00011325806451612903,
      "loss": 0.1363,
      "step": 2690
    },
    {
      "epoch": 43.2,
      "grad_norm": 1.4724687337875366,
      "learning_rate": 0.00011293548387096775,
      "loss": 0.1241,
      "step": 2700
    },
    {
      "epoch": 43.36,
      "grad_norm": 2.126573085784912,
      "learning_rate": 0.00011261290322580647,
      "loss": 0.1281,
      "step": 2710
    },
    {
      "epoch": 43.52,
      "grad_norm": 2.0378658771514893,
      "learning_rate": 0.00011229032258064516,
      "loss": 0.1352,
      "step": 2720
    },
    {
      "epoch": 43.68,
      "grad_norm": 1.978238821029663,
      "learning_rate": 0.00011196774193548387,
      "loss": 0.1357,
      "step": 2730
    },
    {
      "epoch": 43.84,
      "grad_norm": 1.5167338848114014,
      "learning_rate": 0.00011164516129032259,
      "loss": 0.1378,
      "step": 2740
    },
    {
      "epoch": 44.0,
      "grad_norm": 2.6229848861694336,
      "learning_rate": 0.0001113225806451613,
      "loss": 0.1368,
      "step": 2750
    },
    {
      "epoch": 44.16,
      "grad_norm": 2.586956024169922,
      "learning_rate": 0.00011100000000000001,
      "loss": 0.1215,
      "step": 2760
    },
    {
      "epoch": 44.32,
      "grad_norm": 1.5434893369674683,
      "learning_rate": 0.00011067741935483871,
      "loss": 0.122,
      "step": 2770
    },
    {
      "epoch": 44.48,
      "grad_norm": 1.6991748809814453,
      "learning_rate": 0.00011035483870967743,
      "loss": 0.1288,
      "step": 2780
    },
    {
      "epoch": 44.64,
      "grad_norm": 1.6747218370437622,
      "learning_rate": 0.00011003225806451613,
      "loss": 0.1298,
      "step": 2790
    },
    {
      "epoch": 44.8,
      "grad_norm": 1.8977768421173096,
      "learning_rate": 0.00010970967741935485,
      "loss": 0.1274,
      "step": 2800
    },
    {
      "epoch": 44.96,
      "grad_norm": 2.1921539306640625,
      "learning_rate": 0.00010938709677419355,
      "loss": 0.1328,
      "step": 2810
    },
    {
      "epoch": 45.12,
      "grad_norm": 2.201087474822998,
      "learning_rate": 0.00010906451612903225,
      "loss": 0.1199,
      "step": 2820
    },
    {
      "epoch": 45.28,
      "grad_norm": 1.6816660165786743,
      "learning_rate": 0.00010874193548387097,
      "loss": 0.1214,
      "step": 2830
    },
    {
      "epoch": 45.44,
      "grad_norm": 1.0567231178283691,
      "learning_rate": 0.00010841935483870969,
      "loss": 0.1234,
      "step": 2840
    },
    {
      "epoch": 45.6,
      "grad_norm": 1.3555560111999512,
      "learning_rate": 0.0001080967741935484,
      "loss": 0.1236,
      "step": 2850
    },
    {
      "epoch": 45.76,
      "grad_norm": 2.097130537033081,
      "learning_rate": 0.00010777419354838709,
      "loss": 0.1275,
      "step": 2860
    },
    {
      "epoch": 45.92,
      "grad_norm": 2.127514123916626,
      "learning_rate": 0.00010745161290322581,
      "loss": 0.1286,
      "step": 2870
    },
    {
      "epoch": 46.08,
      "grad_norm": 1.4086848497390747,
      "learning_rate": 0.00010712903225806452,
      "loss": 0.1203,
      "step": 2880
    },
    {
      "epoch": 46.24,
      "grad_norm": 1.6010750532150269,
      "learning_rate": 0.00010680645161290324,
      "loss": 0.1175,
      "step": 2890
    },
    {
      "epoch": 46.4,
      "grad_norm": 1.5432184934616089,
      "learning_rate": 0.00010648387096774196,
      "loss": 0.1163,
      "step": 2900
    },
    {
      "epoch": 46.56,
      "grad_norm": 3.0002214908599854,
      "learning_rate": 0.00010616129032258065,
      "loss": 0.1257,
      "step": 2910
    },
    {
      "epoch": 46.72,
      "grad_norm": 1.6969698667526245,
      "learning_rate": 0.00010583870967741935,
      "loss": 0.1207,
      "step": 2920
    },
    {
      "epoch": 46.88,
      "grad_norm": 1.35691237449646,
      "learning_rate": 0.00010551612903225807,
      "loss": 0.1257,
      "step": 2930
    },
    {
      "epoch": 47.04,
      "grad_norm": 1.2951841354370117,
      "learning_rate": 0.00010519354838709678,
      "loss": 0.1226,
      "step": 2940
    },
    {
      "epoch": 47.2,
      "grad_norm": 1.980775237083435,
      "learning_rate": 0.0001048709677419355,
      "loss": 0.1121,
      "step": 2950
    },
    {
      "epoch": 47.36,
      "grad_norm": 1.7462553977966309,
      "learning_rate": 0.00010454838709677419,
      "loss": 0.1193,
      "step": 2960
    },
    {
      "epoch": 47.52,
      "grad_norm": 1.3529144525527954,
      "learning_rate": 0.00010422580645161291,
      "loss": 0.115,
      "step": 2970
    },
    {
      "epoch": 47.68,
      "grad_norm": 1.7577213048934937,
      "learning_rate": 0.00010390322580645162,
      "loss": 0.1201,
      "step": 2980
    },
    {
      "epoch": 47.84,
      "grad_norm": 1.2776310443878174,
      "learning_rate": 0.00010358064516129034,
      "loss": 0.1198,
      "step": 2990
    },
    {
      "epoch": 48.0,
      "grad_norm": 1.3981118202209473,
      "learning_rate": 0.00010325806451612903,
      "loss": 0.1204,
      "step": 3000
    },
    {
      "epoch": 48.16,
      "grad_norm": 1.208207130432129,
      "learning_rate": 0.00010293548387096774,
      "loss": 0.1063,
      "step": 3010
    },
    {
      "epoch": 48.32,
      "grad_norm": 1.3536474704742432,
      "learning_rate": 0.00010261290322580646,
      "loss": 0.1123,
      "step": 3020
    },
    {
      "epoch": 48.48,
      "grad_norm": 1.3631086349487305,
      "learning_rate": 0.00010229032258064516,
      "loss": 0.115,
      "step": 3030
    },
    {
      "epoch": 48.64,
      "grad_norm": 1.2178473472595215,
      "learning_rate": 0.00010196774193548388,
      "loss": 0.1143,
      "step": 3040
    },
    {
      "epoch": 48.8,
      "grad_norm": 2.1066434383392334,
      "learning_rate": 0.00010164516129032258,
      "loss": 0.1234,
      "step": 3050
    },
    {
      "epoch": 48.96,
      "grad_norm": 1.818852186203003,
      "learning_rate": 0.0001013225806451613,
      "loss": 0.1242,
      "step": 3060
    },
    {
      "epoch": 49.12,
      "grad_norm": 1.2515507936477661,
      "learning_rate": 0.000101,
      "loss": 0.1118,
      "step": 3070
    },
    {
      "epoch": 49.28,
      "grad_norm": 1.3625049591064453,
      "learning_rate": 0.00010067741935483872,
      "loss": 0.1097,
      "step": 3080
    },
    {
      "epoch": 49.44,
      "grad_norm": 1.7623980045318604,
      "learning_rate": 0.00010035483870967743,
      "loss": 0.1126,
      "step": 3090
    },
    {
      "epoch": 49.6,
      "grad_norm": 1.3493503332138062,
      "learning_rate": 0.00010003225806451612,
      "loss": 0.1139,
      "step": 3100
    },
    {
      "epoch": 49.76,
      "grad_norm": 1.5169587135314941,
      "learning_rate": 9.970967741935484e-05,
      "loss": 0.1154,
      "step": 3110
    },
    {
      "epoch": 49.92,
      "grad_norm": 2.1966283321380615,
      "learning_rate": 9.938709677419356e-05,
      "loss": 0.1171,
      "step": 3120
    },
    {
      "epoch": 50.08,
      "grad_norm": 1.6833268404006958,
      "learning_rate": 9.906451612903225e-05,
      "loss": 0.113,
      "step": 3130
    },
    {
      "epoch": 50.24,
      "grad_norm": 1.4803876876831055,
      "learning_rate": 9.874193548387097e-05,
      "loss": 0.1054,
      "step": 3140
    },
    {
      "epoch": 50.4,
      "grad_norm": 1.3878694772720337,
      "learning_rate": 9.841935483870969e-05,
      "loss": 0.1079,
      "step": 3150
    },
    {
      "epoch": 50.56,
      "grad_norm": 1.64521062374115,
      "learning_rate": 9.809677419354838e-05,
      "loss": 0.1113,
      "step": 3160
    },
    {
      "epoch": 50.72,
      "grad_norm": 1.4510949850082397,
      "learning_rate": 9.77741935483871e-05,
      "loss": 0.1147,
      "step": 3170
    },
    {
      "epoch": 50.88,
      "grad_norm": 1.4455626010894775,
      "learning_rate": 9.745161290322581e-05,
      "loss": 0.1137,
      "step": 3180
    },
    {
      "epoch": 51.04,
      "grad_norm": 1.1447566747665405,
      "learning_rate": 9.712903225806452e-05,
      "loss": 0.1111,
      "step": 3190
    },
    {
      "epoch": 51.2,
      "grad_norm": 1.1405003070831299,
      "learning_rate": 9.680645161290322e-05,
      "loss": 0.1049,
      "step": 3200
    },
    {
      "epoch": 51.36,
      "grad_norm": 1.3528752326965332,
      "learning_rate": 9.648387096774194e-05,
      "loss": 0.1081,
      "step": 3210
    },
    {
      "epoch": 51.52,
      "grad_norm": 1.6244195699691772,
      "learning_rate": 9.616129032258065e-05,
      "loss": 0.1083,
      "step": 3220
    },
    {
      "epoch": 51.68,
      "grad_norm": 1.4993714094161987,
      "learning_rate": 9.583870967741936e-05,
      "loss": 0.1082,
      "step": 3230
    },
    {
      "epoch": 51.84,
      "grad_norm": 1.1913836002349854,
      "learning_rate": 9.551612903225808e-05,
      "loss": 0.1101,
      "step": 3240
    },
    {
      "epoch": 52.0,
      "grad_norm": 1.444038987159729,
      "learning_rate": 9.519354838709678e-05,
      "loss": 0.1136,
      "step": 3250
    },
    {
      "epoch": 52.16,
      "grad_norm": 1.5950225591659546,
      "learning_rate": 9.487096774193549e-05,
      "loss": 0.1015,
      "step": 3260
    },
    {
      "epoch": 52.32,
      "grad_norm": 1.5138227939605713,
      "learning_rate": 9.45483870967742e-05,
      "loss": 0.1044,
      "step": 3270
    },
    {
      "epoch": 52.48,
      "grad_norm": 1.7190989255905151,
      "learning_rate": 9.422580645161291e-05,
      "loss": 0.1049,
      "step": 3280
    },
    {
      "epoch": 52.64,
      "grad_norm": 2.0404067039489746,
      "learning_rate": 9.390322580645162e-05,
      "loss": 0.1084,
      "step": 3290
    },
    {
      "epoch": 52.8,
      "grad_norm": 0.8778730630874634,
      "learning_rate": 9.358064516129033e-05,
      "loss": 0.1089,
      "step": 3300
    },
    {
      "epoch": 52.96,
      "grad_norm": 1.302156686782837,
      "learning_rate": 9.325806451612905e-05,
      "loss": 0.1064,
      "step": 3310
    },
    {
      "epoch": 53.12,
      "grad_norm": 1.0487626791000366,
      "learning_rate": 9.293548387096774e-05,
      "loss": 0.1041,
      "step": 3320
    },
    {
      "epoch": 53.28,
      "grad_norm": 1.5156755447387695,
      "learning_rate": 9.261290322580646e-05,
      "loss": 0.1025,
      "step": 3330
    },
    {
      "epoch": 53.44,
      "grad_norm": 1.1233017444610596,
      "learning_rate": 9.229032258064516e-05,
      "loss": 0.1061,
      "step": 3340
    },
    {
      "epoch": 53.6,
      "grad_norm": 1.3437849283218384,
      "learning_rate": 9.196774193548387e-05,
      "loss": 0.1072,
      "step": 3350
    },
    {
      "epoch": 53.76,
      "grad_norm": 1.4854118824005127,
      "learning_rate": 9.164516129032259e-05,
      "loss": 0.1047,
      "step": 3360
    },
    {
      "epoch": 53.92,
      "grad_norm": 1.4864035844802856,
      "learning_rate": 9.13225806451613e-05,
      "loss": 0.1068,
      "step": 3370
    },
    {
      "epoch": 54.08,
      "grad_norm": 1.0401661396026611,
      "learning_rate": 9.1e-05,
      "loss": 0.1034,
      "step": 3380
    },
    {
      "epoch": 54.24,
      "grad_norm": 0.8670012354850769,
      "learning_rate": 9.067741935483871e-05,
      "loss": 0.0974,
      "step": 3390
    },
    {
      "epoch": 54.4,
      "grad_norm": 1.1909117698669434,
      "learning_rate": 9.035483870967743e-05,
      "loss": 0.1007,
      "step": 3400
    },
    {
      "epoch": 54.56,
      "grad_norm": 1.3757662773132324,
      "learning_rate": 9.003225806451614e-05,
      "loss": 0.1038,
      "step": 3410
    },
    {
      "epoch": 54.72,
      "grad_norm": 0.9391764402389526,
      "learning_rate": 8.970967741935484e-05,
      "loss": 0.1056,
      "step": 3420
    },
    {
      "epoch": 54.88,
      "grad_norm": 1.0170888900756836,
      "learning_rate": 8.938709677419356e-05,
      "loss": 0.1023,
      "step": 3430
    },
    {
      "epoch": 55.04,
      "grad_norm": 1.0093938112258911,
      "learning_rate": 8.906451612903227e-05,
      "loss": 0.1058,
      "step": 3440
    },
    {
      "epoch": 55.2,
      "grad_norm": 1.8801138401031494,
      "learning_rate": 8.874193548387097e-05,
      "loss": 0.099,
      "step": 3450
    },
    {
      "epoch": 55.36,
      "grad_norm": 1.5623430013656616,
      "learning_rate": 8.841935483870968e-05,
      "loss": 0.1046,
      "step": 3460
    },
    {
      "epoch": 55.52,
      "grad_norm": 1.7789967060089111,
      "learning_rate": 8.809677419354839e-05,
      "loss": 0.1038,
      "step": 3470
    },
    {
      "epoch": 55.68,
      "grad_norm": 1.797473669052124,
      "learning_rate": 8.777419354838709e-05,
      "loss": 0.1062,
      "step": 3480
    },
    {
      "epoch": 55.84,
      "grad_norm": 2.0963029861450195,
      "learning_rate": 8.745161290322581e-05,
      "loss": 0.104,
      "step": 3490
    },
    {
      "epoch": 56.0,
      "grad_norm": 1.711259126663208,
      "learning_rate": 8.712903225806452e-05,
      "loss": 0.107,
      "step": 3500
    },
    {
      "epoch": 56.16,
      "grad_norm": 1.2828420400619507,
      "learning_rate": 8.680645161290322e-05,
      "loss": 0.0967,
      "step": 3510
    },
    {
      "epoch": 56.32,
      "grad_norm": 1.2137057781219482,
      "learning_rate": 8.648387096774194e-05,
      "loss": 0.0971,
      "step": 3520
    },
    {
      "epoch": 56.48,
      "grad_norm": 0.9034223556518555,
      "learning_rate": 8.616129032258065e-05,
      "loss": 0.1006,
      "step": 3530
    },
    {
      "epoch": 56.64,
      "grad_norm": 1.3555456399917603,
      "learning_rate": 8.583870967741936e-05,
      "loss": 0.1018,
      "step": 3540
    },
    {
      "epoch": 56.8,
      "grad_norm": 1.7107219696044922,
      "learning_rate": 8.551612903225806e-05,
      "loss": 0.1041,
      "step": 3550
    },
    {
      "epoch": 56.96,
      "grad_norm": 1.4730465412139893,
      "learning_rate": 8.519354838709678e-05,
      "loss": 0.1051,
      "step": 3560
    },
    {
      "epoch": 57.12,
      "grad_norm": 1.1063456535339355,
      "learning_rate": 8.487096774193549e-05,
      "loss": 0.0968,
      "step": 3570
    },
    {
      "epoch": 57.28,
      "grad_norm": 0.9461372494697571,
      "learning_rate": 8.45483870967742e-05,
      "loss": 0.0958,
      "step": 3580
    },
    {
      "epoch": 57.44,
      "grad_norm": 0.8849135041236877,
      "learning_rate": 8.422580645161291e-05,
      "loss": 0.0984,
      "step": 3590
    },
    {
      "epoch": 57.6,
      "grad_norm": 0.8585776090621948,
      "learning_rate": 8.390322580645161e-05,
      "loss": 0.1,
      "step": 3600
    },
    {
      "epoch": 57.76,
      "grad_norm": 1.3201136589050293,
      "learning_rate": 8.358064516129033e-05,
      "loss": 0.1013,
      "step": 3610
    },
    {
      "epoch": 57.92,
      "grad_norm": 0.9458446502685547,
      "learning_rate": 8.325806451612905e-05,
      "loss": 0.1042,
      "step": 3620
    },
    {
      "epoch": 58.08,
      "grad_norm": 1.1642829179763794,
      "learning_rate": 8.293548387096774e-05,
      "loss": 0.0974,
      "step": 3630
    },
    {
      "epoch": 58.24,
      "grad_norm": 0.7290544509887695,
      "learning_rate": 8.261290322580646e-05,
      "loss": 0.0947,
      "step": 3640
    },
    {
      "epoch": 58.4,
      "grad_norm": 1.4136425256729126,
      "learning_rate": 8.229032258064517e-05,
      "loss": 0.0966,
      "step": 3650
    },
    {
      "epoch": 58.56,
      "grad_norm": 1.6335221529006958,
      "learning_rate": 8.196774193548387e-05,
      "loss": 0.0976,
      "step": 3660
    },
    {
      "epoch": 58.72,
      "grad_norm": 0.7871268391609192,
      "learning_rate": 8.164516129032258e-05,
      "loss": 0.0996,
      "step": 3670
    },
    {
      "epoch": 58.88,
      "grad_norm": 1.2082417011260986,
      "learning_rate": 8.13225806451613e-05,
      "loss": 0.1004,
      "step": 3680
    },
    {
      "epoch": 59.04,
      "grad_norm": 0.9343645572662354,
      "learning_rate": 8.1e-05,
      "loss": 0.0991,
      "step": 3690
    },
    {
      "epoch": 59.2,
      "grad_norm": 0.9338416457176208,
      "learning_rate": 8.067741935483871e-05,
      "loss": 0.0937,
      "step": 3700
    },
    {
      "epoch": 59.36,
      "grad_norm": 0.8510011434555054,
      "learning_rate": 8.035483870967743e-05,
      "loss": 0.0956,
      "step": 3710
    },
    {
      "epoch": 59.52,
      "grad_norm": 0.7901081442832947,
      "learning_rate": 8.003225806451614e-05,
      "loss": 0.0961,
      "step": 3720
    },
    {
      "epoch": 59.68,
      "grad_norm": 0.8202269673347473,
      "learning_rate": 7.970967741935484e-05,
      "loss": 0.0971,
      "step": 3730
    },
    {
      "epoch": 59.84,
      "grad_norm": 0.5913488268852234,
      "learning_rate": 7.938709677419355e-05,
      "loss": 0.0986,
      "step": 3740
    },
    {
      "epoch": 60.0,
      "grad_norm": 0.8020423650741577,
      "learning_rate": 7.906451612903227e-05,
      "loss": 0.1004,
      "step": 3750
    },
    {
      "epoch": 60.16,
      "grad_norm": 1.2948077917099,
      "learning_rate": 7.874193548387097e-05,
      "loss": 0.092,
      "step": 3760
    },
    {
      "epoch": 60.32,
      "grad_norm": 0.8374224305152893,
      "learning_rate": 7.841935483870968e-05,
      "loss": 0.0954,
      "step": 3770
    },
    {
      "epoch": 60.48,
      "grad_norm": 1.1318821907043457,
      "learning_rate": 7.80967741935484e-05,
      "loss": 0.0961,
      "step": 3780
    },
    {
      "epoch": 60.64,
      "grad_norm": 1.1860573291778564,
      "learning_rate": 7.777419354838709e-05,
      "loss": 0.0958,
      "step": 3790
    },
    {
      "epoch": 60.8,
      "grad_norm": 1.1659208536148071,
      "learning_rate": 7.745161290322581e-05,
      "loss": 0.0975,
      "step": 3800
    },
    {
      "epoch": 60.96,
      "grad_norm": 1.3710063695907593,
      "learning_rate": 7.712903225806452e-05,
      "loss": 0.0997,
      "step": 3810
    },
    {
      "epoch": 61.12,
      "grad_norm": 0.7192108035087585,
      "learning_rate": 7.680645161290323e-05,
      "loss": 0.0931,
      "step": 3820
    },
    {
      "epoch": 61.28,
      "grad_norm": 0.9875419735908508,
      "learning_rate": 7.648387096774194e-05,
      "loss": 0.0933,
      "step": 3830
    },
    {
      "epoch": 61.44,
      "grad_norm": 1.3378856182098389,
      "learning_rate": 7.616129032258065e-05,
      "loss": 0.0939,
      "step": 3840
    },
    {
      "epoch": 61.6,
      "grad_norm": 0.9775927662849426,
      "learning_rate": 7.583870967741936e-05,
      "loss": 0.0958,
      "step": 3850
    },
    {
      "epoch": 61.76,
      "grad_norm": 0.815967857837677,
      "learning_rate": 7.551612903225806e-05,
      "loss": 0.0971,
      "step": 3860
    },
    {
      "epoch": 61.92,
      "grad_norm": 0.711142897605896,
      "learning_rate": 7.519354838709678e-05,
      "loss": 0.098,
      "step": 3870
    },
    {
      "epoch": 62.08,
      "grad_norm": 0.757862389087677,
      "learning_rate": 7.487096774193548e-05,
      "loss": 0.0948,
      "step": 3880
    },
    {
      "epoch": 62.24,
      "grad_norm": 0.9134785532951355,
      "learning_rate": 7.45483870967742e-05,
      "loss": 0.0913,
      "step": 3890
    },
    {
      "epoch": 62.4,
      "grad_norm": 1.1586359739303589,
      "learning_rate": 7.422580645161292e-05,
      "loss": 0.0933,
      "step": 3900
    },
    {
      "epoch": 62.56,
      "grad_norm": 0.93504399061203,
      "learning_rate": 7.390322580645161e-05,
      "loss": 0.0946,
      "step": 3910
    },
    {
      "epoch": 62.72,
      "grad_norm": 1.2178099155426025,
      "learning_rate": 7.358064516129033e-05,
      "loss": 0.0972,
      "step": 3920
    },
    {
      "epoch": 62.88,
      "grad_norm": 1.491281270980835,
      "learning_rate": 7.325806451612903e-05,
      "loss": 0.095,
      "step": 3930
    },
    {
      "epoch": 63.04,
      "grad_norm": 0.789310097694397,
      "learning_rate": 7.293548387096774e-05,
      "loss": 0.0949,
      "step": 3940
    },
    {
      "epoch": 63.2,
      "grad_norm": 0.8426668643951416,
      "learning_rate": 7.261290322580645e-05,
      "loss": 0.0925,
      "step": 3950
    },
    {
      "epoch": 63.36,
      "grad_norm": 0.6679518818855286,
      "learning_rate": 7.229032258064517e-05,
      "loss": 0.0935,
      "step": 3960
    },
    {
      "epoch": 63.52,
      "grad_norm": 1.1688344478607178,
      "learning_rate": 7.196774193548387e-05,
      "loss": 0.095,
      "step": 3970
    },
    {
      "epoch": 63.68,
      "grad_norm": 0.9475545883178711,
      "learning_rate": 7.164516129032258e-05,
      "loss": 0.0954,
      "step": 3980
    },
    {
      "epoch": 63.84,
      "grad_norm": 0.7896549701690674,
      "learning_rate": 7.13225806451613e-05,
      "loss": 0.0973,
      "step": 3990
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.9919887185096741,
      "learning_rate": 7.1e-05,
      "loss": 0.0984,
      "step": 4000
    },
    {
      "epoch": 64.16,
      "grad_norm": 0.8883840441703796,
      "learning_rate": 7.067741935483871e-05,
|
"loss": 0.0909, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 64.32, |
|
"grad_norm": 1.3879891633987427, |
|
"learning_rate": 7.035483870967742e-05, |
|
"loss": 0.0924, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 64.48, |
|
"grad_norm": 0.6254050135612488, |
|
"learning_rate": 7.003225806451614e-05, |
|
"loss": 0.0937, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 64.64, |
|
"grad_norm": 0.8522732257843018, |
|
"learning_rate": 6.970967741935484e-05, |
|
"loss": 0.0944, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 64.8, |
|
"grad_norm": 0.8993662595748901, |
|
"learning_rate": 6.938709677419355e-05, |
|
"loss": 0.0946, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 64.96, |
|
"grad_norm": 0.6210134029388428, |
|
"learning_rate": 6.906451612903227e-05, |
|
"loss": 0.0961, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 65.12, |
|
"grad_norm": 1.4237314462661743, |
|
"learning_rate": 6.874193548387096e-05, |
|
"loss": 0.0915, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 65.28, |
|
"grad_norm": 0.590337336063385, |
|
"learning_rate": 6.841935483870968e-05, |
|
"loss": 0.0903, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 65.44, |
|
"grad_norm": 0.8954355120658875, |
|
"learning_rate": 6.809677419354839e-05, |
|
"loss": 0.0915, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 65.6, |
|
"grad_norm": 1.272484302520752, |
|
"learning_rate": 6.77741935483871e-05, |
|
"loss": 0.0919, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 65.76, |
|
"grad_norm": 0.5799235105514526, |
|
"learning_rate": 6.745161290322581e-05, |
|
"loss": 0.0953, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 65.92, |
|
"grad_norm": 0.5489311814308167, |
|
"learning_rate": 6.712903225806452e-05, |
|
"loss": 0.0944, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 66.08, |
|
"grad_norm": 0.454113632440567, |
|
"learning_rate": 6.680645161290323e-05, |
|
"loss": 0.0925, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 66.24, |
|
"grad_norm": 0.46685802936553955, |
|
"learning_rate": 6.648387096774193e-05, |
|
"loss": 0.09, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 66.4, |
|
"grad_norm": 0.5726438164710999, |
|
"learning_rate": 6.616129032258065e-05, |
|
"loss": 0.0921, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 66.56, |
|
"grad_norm": 0.9260037541389465, |
|
"learning_rate": 6.583870967741936e-05, |
|
"loss": 0.0937, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 66.72, |
|
"grad_norm": 0.6579046249389648, |
|
"learning_rate": 6.551612903225806e-05, |
|
"loss": 0.0927, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 66.88, |
|
"grad_norm": 1.529139518737793, |
|
"learning_rate": 6.519354838709678e-05, |
|
"loss": 0.0941, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 67.04, |
|
"grad_norm": 0.49119099974632263, |
|
"learning_rate": 6.487096774193549e-05, |
|
"loss": 0.0922, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 67.2, |
|
"grad_norm": 0.5991088151931763, |
|
"learning_rate": 6.45483870967742e-05, |
|
"loss": 0.0891, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 67.36, |
|
"grad_norm": 0.5796908140182495, |
|
"learning_rate": 6.42258064516129e-05, |
|
"loss": 0.0897, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 67.52, |
|
"grad_norm": 0.4511209726333618, |
|
"learning_rate": 6.390322580645162e-05, |
|
"loss": 0.0907, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 67.68, |
|
"grad_norm": 0.35779204964637756, |
|
"learning_rate": 6.358064516129033e-05, |
|
"loss": 0.0924, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 67.84, |
|
"grad_norm": 0.5029581189155579, |
|
"learning_rate": 6.325806451612903e-05, |
|
"loss": 0.093, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"grad_norm": 0.5110875964164734, |
|
"learning_rate": 6.293548387096775e-05, |
|
"loss": 0.0937, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 68.16, |
|
"grad_norm": 0.390669584274292, |
|
"learning_rate": 6.261290322580645e-05, |
|
"loss": 0.0886, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 68.32, |
|
"grad_norm": 0.6021801233291626, |
|
"learning_rate": 6.229032258064517e-05, |
|
"loss": 0.0876, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 68.48, |
|
"grad_norm": 0.41805922985076904, |
|
"learning_rate": 6.196774193548387e-05, |
|
"loss": 0.0898, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 68.64, |
|
"grad_norm": 0.40193480253219604, |
|
"learning_rate": 6.164516129032258e-05, |
|
"loss": 0.0911, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 68.8, |
|
"grad_norm": 1.7792600393295288, |
|
"learning_rate": 6.13225806451613e-05, |
|
"loss": 0.0931, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 68.96, |
|
"grad_norm": 0.4636439383029938, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.0924, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 69.12, |
|
"grad_norm": 0.8066326975822449, |
|
"learning_rate": 6.067741935483872e-05, |
|
"loss": 0.0896, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 69.28, |
|
"grad_norm": 0.5064697265625, |
|
"learning_rate": 6.035483870967742e-05, |
|
"loss": 0.088, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 69.44, |
|
"grad_norm": 0.40276482701301575, |
|
"learning_rate": 6.003225806451613e-05, |
|
"loss": 0.0896, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 69.6, |
|
"grad_norm": 0.39066290855407715, |
|
"learning_rate": 5.970967741935484e-05, |
|
"loss": 0.0912, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 69.76, |
|
"grad_norm": 0.5445528030395508, |
|
"learning_rate": 5.938709677419355e-05, |
|
"loss": 0.0932, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 69.92, |
|
"grad_norm": 0.44867417216300964, |
|
"learning_rate": 5.906451612903226e-05, |
|
"loss": 0.0933, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 70.08, |
|
"grad_norm": 1.0833595991134644, |
|
"learning_rate": 5.874193548387097e-05, |
|
"loss": 0.0907, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 70.24, |
|
"grad_norm": 0.5424122214317322, |
|
"learning_rate": 5.841935483870968e-05, |
|
"loss": 0.0883, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 70.4, |
|
"grad_norm": 0.46907031536102295, |
|
"learning_rate": 5.809677419354839e-05, |
|
"loss": 0.0887, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 70.56, |
|
"grad_norm": 0.40855908393859863, |
|
"learning_rate": 5.77741935483871e-05, |
|
"loss": 0.089, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 70.72, |
|
"grad_norm": 0.37600624561309814, |
|
"learning_rate": 5.745161290322581e-05, |
|
"loss": 0.0915, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 70.88, |
|
"grad_norm": 0.6693145036697388, |
|
"learning_rate": 5.712903225806452e-05, |
|
"loss": 0.0919, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 71.04, |
|
"grad_norm": 0.4039818346500397, |
|
"learning_rate": 5.6806451612903234e-05, |
|
"loss": 0.0909, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 71.2, |
|
"grad_norm": 0.3611462414264679, |
|
"learning_rate": 5.648387096774193e-05, |
|
"loss": 0.0874, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 71.36, |
|
"grad_norm": 0.402322381734848, |
|
"learning_rate": 5.616129032258065e-05, |
|
"loss": 0.0877, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 71.52, |
|
"grad_norm": 0.48968443274497986, |
|
"learning_rate": 5.583870967741935e-05, |
|
"loss": 0.0893, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 71.68, |
|
"grad_norm": 0.4324292242527008, |
|
"learning_rate": 5.5516129032258065e-05, |
|
"loss": 0.0898, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 71.84, |
|
"grad_norm": 0.44097113609313965, |
|
"learning_rate": 5.519354838709677e-05, |
|
"loss": 0.0903, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"grad_norm": 0.38512125611305237, |
|
"learning_rate": 5.4870967741935484e-05, |
|
"loss": 0.0921, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 72.16, |
|
"grad_norm": 0.37481260299682617, |
|
"learning_rate": 5.45483870967742e-05, |
|
"loss": 0.0867, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 72.32, |
|
"grad_norm": 0.4293956160545349, |
|
"learning_rate": 5.4225806451612904e-05, |
|
"loss": 0.0876, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 72.48, |
|
"grad_norm": 0.4083200991153717, |
|
"learning_rate": 5.390322580645162e-05, |
|
"loss": 0.0886, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 72.64, |
|
"grad_norm": 0.3753615617752075, |
|
"learning_rate": 5.358064516129032e-05, |
|
"loss": 0.09, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 72.8, |
|
"grad_norm": 0.3720114529132843, |
|
"learning_rate": 5.3258064516129036e-05, |
|
"loss": 0.0892, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 72.96, |
|
"grad_norm": 0.3209419250488281, |
|
"learning_rate": 5.293548387096774e-05, |
|
"loss": 0.0915, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 73.12, |
|
"grad_norm": 0.35520559549331665, |
|
"learning_rate": 5.2612903225806455e-05, |
|
"loss": 0.0871, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 73.28, |
|
"grad_norm": 0.3513979911804199, |
|
"learning_rate": 5.229032258064517e-05, |
|
"loss": 0.087, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 73.44, |
|
"grad_norm": 0.5205033421516418, |
|
"learning_rate": 5.1967741935483874e-05, |
|
"loss": 0.088, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 73.6, |
|
"grad_norm": 0.3683389127254486, |
|
"learning_rate": 5.164516129032259e-05, |
|
"loss": 0.0886, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 73.76, |
|
"grad_norm": 0.45570969581604004, |
|
"learning_rate": 5.132258064516129e-05, |
|
"loss": 0.0902, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 73.92, |
|
"grad_norm": 0.34694406390190125, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.0909, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 74.08, |
|
"grad_norm": 0.3391146957874298, |
|
"learning_rate": 5.0677419354838706e-05, |
|
"loss": 0.0888, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 74.24, |
|
"grad_norm": 0.45434221625328064, |
|
"learning_rate": 5.035483870967742e-05, |
|
"loss": 0.0866, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 74.4, |
|
"grad_norm": 0.4046492576599121, |
|
"learning_rate": 5.003225806451614e-05, |
|
"loss": 0.0881, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 74.56, |
|
"grad_norm": 0.36426255106925964, |
|
"learning_rate": 4.970967741935484e-05, |
|
"loss": 0.0887, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 74.72, |
|
"grad_norm": 0.38272616267204285, |
|
"learning_rate": 4.938709677419355e-05, |
|
"loss": 0.0884, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 74.88, |
|
"grad_norm": 0.4772709012031555, |
|
"learning_rate": 4.9064516129032264e-05, |
|
"loss": 0.0902, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 75.04, |
|
"grad_norm": 0.32595178484916687, |
|
"learning_rate": 4.874193548387097e-05, |
|
"loss": 0.0893, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 75.2, |
|
"grad_norm": 0.4148579239845276, |
|
"learning_rate": 4.8419354838709676e-05, |
|
"loss": 0.0856, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 75.36, |
|
"grad_norm": 0.43145865201950073, |
|
"learning_rate": 4.809677419354839e-05, |
|
"loss": 0.0866, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 75.52, |
|
"grad_norm": 0.4075751006603241, |
|
"learning_rate": 4.7774193548387096e-05, |
|
"loss": 0.0881, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 75.68, |
|
"grad_norm": 0.33043548464775085, |
|
"learning_rate": 4.745161290322581e-05, |
|
"loss": 0.0898, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 75.84, |
|
"grad_norm": 0.36957836151123047, |
|
"learning_rate": 4.712903225806452e-05, |
|
"loss": 0.0904, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"grad_norm": 0.3838096857070923, |
|
"learning_rate": 4.680645161290323e-05, |
|
"loss": 0.0896, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 76.16, |
|
"grad_norm": 0.42275360226631165, |
|
"learning_rate": 4.648387096774194e-05, |
|
"loss": 0.0867, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 76.32, |
|
"grad_norm": 0.5027480721473694, |
|
"learning_rate": 4.616129032258065e-05, |
|
"loss": 0.0864, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 76.48, |
|
"grad_norm": 0.36403143405914307, |
|
"learning_rate": 4.583870967741935e-05, |
|
"loss": 0.088, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 76.64, |
|
"grad_norm": 0.3124987781047821, |
|
"learning_rate": 4.5516129032258066e-05, |
|
"loss": 0.0881, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 76.8, |
|
"grad_norm": 0.38175061345100403, |
|
"learning_rate": 4.519354838709678e-05, |
|
"loss": 0.0891, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 76.96, |
|
"grad_norm": 0.41596823930740356, |
|
"learning_rate": 4.4870967741935485e-05, |
|
"loss": 0.0898, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 77.12, |
|
"grad_norm": 0.31852710247039795, |
|
"learning_rate": 4.45483870967742e-05, |
|
"loss": 0.0864, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 77.28, |
|
"grad_norm": 0.4198415279388428, |
|
"learning_rate": 4.4225806451612905e-05, |
|
"loss": 0.0857, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 77.44, |
|
"grad_norm": 0.392374724149704, |
|
"learning_rate": 4.390322580645162e-05, |
|
"loss": 0.0864, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 77.6, |
|
"grad_norm": 0.4310876429080963, |
|
"learning_rate": 4.3580645161290324e-05, |
|
"loss": 0.0893, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 77.76, |
|
"grad_norm": 0.3877263367176056, |
|
"learning_rate": 4.325806451612903e-05, |
|
"loss": 0.0888, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 77.92, |
|
"grad_norm": 0.31759241223335266, |
|
"learning_rate": 4.293548387096775e-05, |
|
"loss": 0.0902, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 78.08, |
|
"grad_norm": 0.3832705616950989, |
|
"learning_rate": 4.2612903225806456e-05, |
|
"loss": 0.0883, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 78.24, |
|
"grad_norm": 0.3763550817966461, |
|
"learning_rate": 4.229032258064516e-05, |
|
"loss": 0.086, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 78.4, |
|
"grad_norm": 0.45972681045532227, |
|
"learning_rate": 4.1967741935483875e-05, |
|
"loss": 0.087, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 78.56, |
|
"grad_norm": 0.4792405664920807, |
|
"learning_rate": 4.164516129032258e-05, |
|
"loss": 0.0886, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 78.72, |
|
"grad_norm": 0.34303322434425354, |
|
"learning_rate": 4.132258064516129e-05, |
|
"loss": 0.0884, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 78.88, |
|
"grad_norm": 0.33531343936920166, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.0892, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 79.04, |
|
"grad_norm": 0.3543599545955658, |
|
"learning_rate": 4.0677419354838713e-05, |
|
"loss": 0.0891, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 79.2, |
|
"grad_norm": 0.3615137040615082, |
|
"learning_rate": 4.035483870967742e-05, |
|
"loss": 0.0853, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 79.36, |
|
"grad_norm": 0.40525078773498535, |
|
"learning_rate": 4.003225806451613e-05, |
|
"loss": 0.0864, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 79.52, |
|
"grad_norm": 0.3445266783237457, |
|
"learning_rate": 3.970967741935484e-05, |
|
"loss": 0.088, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 79.68, |
|
"grad_norm": 0.4922199547290802, |
|
"learning_rate": 3.938709677419355e-05, |
|
"loss": 0.0885, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 79.84, |
|
"grad_norm": 0.3156628906726837, |
|
"learning_rate": 3.906451612903226e-05, |
|
"loss": 0.0888, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 0.32096874713897705, |
|
"learning_rate": 3.8741935483870964e-05, |
|
"loss": 0.0889, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 80.16, |
|
"grad_norm": 0.3683740794658661, |
|
"learning_rate": 3.8419354838709684e-05, |
|
"loss": 0.0853, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 80.32, |
|
"grad_norm": 0.43887290358543396, |
|
"learning_rate": 3.809677419354839e-05, |
|
"loss": 0.0855, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 80.48, |
|
"grad_norm": 0.4039384424686432, |
|
"learning_rate": 3.7774193548387096e-05, |
|
"loss": 0.0865, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 80.64, |
|
"grad_norm": 0.3594481647014618, |
|
"learning_rate": 3.745161290322581e-05, |
|
"loss": 0.0871, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 80.8, |
|
"grad_norm": 0.3050839304924011, |
|
"learning_rate": 3.7129032258064516e-05, |
|
"loss": 0.0892, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 80.96, |
|
"grad_norm": 0.3346671164035797, |
|
"learning_rate": 3.680645161290323e-05, |
|
"loss": 0.0898, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 81.12, |
|
"grad_norm": 0.32323309779167175, |
|
"learning_rate": 3.648387096774194e-05, |
|
"loss": 0.0871, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 81.28, |
|
"grad_norm": 0.5844283699989319, |
|
"learning_rate": 3.616129032258065e-05, |
|
"loss": 0.085, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 81.44, |
|
"grad_norm": 0.39134910702705383, |
|
"learning_rate": 3.583870967741936e-05, |
|
"loss": 0.0868, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 81.6, |
|
"grad_norm": 0.3730677366256714, |
|
"learning_rate": 3.551612903225807e-05, |
|
"loss": 0.0879, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 81.76, |
|
"grad_norm": 0.40242692828178406, |
|
"learning_rate": 3.519354838709677e-05, |
|
"loss": 0.0878, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 81.92, |
|
"grad_norm": 0.40863659977912903, |
|
"learning_rate": 3.4870967741935486e-05, |
|
"loss": 0.0887, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 82.08, |
|
"grad_norm": 0.3297955095767975, |
|
"learning_rate": 3.454838709677419e-05, |
|
"loss": 0.0871, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 82.24, |
|
"grad_norm": 0.39270907640457153, |
|
"learning_rate": 3.4225806451612905e-05, |
|
"loss": 0.0851, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 82.4, |
|
"grad_norm": 0.35612475872039795, |
|
"learning_rate": 3.390322580645162e-05, |
|
"loss": 0.0864, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 82.56, |
|
"grad_norm": 0.419009804725647, |
|
"learning_rate": 3.3580645161290325e-05, |
|
"loss": 0.0872, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 82.72, |
|
"grad_norm": 0.44817715883255005, |
|
"learning_rate": 3.325806451612903e-05, |
|
"loss": 0.0882, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 82.88, |
|
"grad_norm": 0.36784857511520386, |
|
"learning_rate": 3.2935483870967744e-05, |
|
"loss": 0.0878, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 83.04, |
|
"grad_norm": 0.42855191230773926, |
|
"learning_rate": 3.261290322580645e-05, |
|
"loss": 0.088, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 83.2, |
|
"grad_norm": 0.34956803917884827, |
|
"learning_rate": 3.229032258064516e-05, |
|
"loss": 0.0848, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 83.36, |
|
"grad_norm": 0.3654519021511078, |
|
"learning_rate": 3.1967741935483876e-05, |
|
"loss": 0.0862, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 83.52, |
|
"grad_norm": 0.3838617205619812, |
|
"learning_rate": 3.164516129032258e-05, |
|
"loss": 0.0862, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 83.68, |
|
"grad_norm": 0.36475858092308044, |
|
"learning_rate": 3.1322580645161295e-05, |
|
"loss": 0.0876, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 83.84, |
|
"grad_norm": 0.39168354868888855, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.0882, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"grad_norm": 0.38403555750846863, |
|
"learning_rate": 3.067741935483871e-05, |
|
"loss": 0.0885, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 84.16, |
|
"grad_norm": 0.3362351059913635, |
|
"learning_rate": 3.035483870967742e-05, |
|
"loss": 0.0852, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 84.32, |
|
"grad_norm": 0.37289589643478394, |
|
"learning_rate": 3.0032258064516127e-05, |
|
"loss": 0.0852, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 84.48, |
|
"grad_norm": 0.3794906437397003, |
|
"learning_rate": 2.9709677419354843e-05, |
|
"loss": 0.0866, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 84.64, |
|
"grad_norm": 0.46871262788772583, |
|
"learning_rate": 2.938709677419355e-05, |
|
"loss": 0.0871, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 84.8, |
|
"grad_norm": 0.3682585060596466, |
|
"learning_rate": 2.906451612903226e-05, |
|
"loss": 0.0879, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 84.96, |
|
"grad_norm": 0.36091622710227966, |
|
"learning_rate": 2.874193548387097e-05, |
|
"loss": 0.0884, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 85.12, |
|
"grad_norm": 0.3225698173046112, |
|
"learning_rate": 2.8419354838709678e-05, |
|
"loss": 0.0858, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 85.28, |
|
"grad_norm": 0.36931437253952026, |
|
"learning_rate": 2.8096774193548388e-05, |
|
"loss": 0.0853, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 85.44, |
|
"grad_norm": 0.3807561993598938, |
|
"learning_rate": 2.77741935483871e-05, |
|
"loss": 0.0859, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 85.6, |
|
"grad_norm": 0.4048006534576416, |
|
"learning_rate": 2.745161290322581e-05, |
|
"loss": 0.0865, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 85.76, |
|
"grad_norm": 0.3060702979564667, |
|
"learning_rate": 2.712903225806452e-05, |
|
"loss": 0.0874, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 85.92, |
|
"grad_norm": 0.3515985608100891, |
|
"learning_rate": 2.6806451612903226e-05, |
|
"loss": 0.088, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 86.08, |
|
"grad_norm": 0.3089566230773926, |
|
"learning_rate": 2.6483870967741936e-05, |
|
"loss": 0.0871, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 86.24, |
|
"grad_norm": 0.42711663246154785, |
|
"learning_rate": 2.6161290322580645e-05, |
|
"loss": 0.0844, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 86.4, |
|
"grad_norm": 0.37982869148254395, |
|
"learning_rate": 2.5838709677419355e-05, |
|
"loss": 0.0861, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 86.56, |
|
"grad_norm": 0.38406598567962646, |
|
"learning_rate": 2.5516129032258068e-05, |
|
"loss": 0.0866, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 86.72, |
|
"grad_norm": 0.37800315022468567, |
|
"learning_rate": 2.5193548387096777e-05, |
|
"loss": 0.0868, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 86.88, |
|
"grad_norm": 0.39667201042175293, |
|
"learning_rate": 2.4870967741935487e-05, |
|
"loss": 0.0874, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 87.04, |
|
"grad_norm": 0.30831629037857056, |
|
"learning_rate": 2.4548387096774193e-05, |
|
"loss": 0.087, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 87.2, |
|
"grad_norm": 0.3226090669631958, |
|
"learning_rate": 2.4225806451612903e-05, |
|
"loss": 0.0851, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 87.36, |
|
"grad_norm": 0.3767179250717163, |
|
"learning_rate": 2.3903225806451616e-05, |
|
"loss": 0.0847, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 87.52, |
|
"grad_norm": 0.4611583650112152, |
|
"learning_rate": 2.3580645161290325e-05, |
|
"loss": 0.086, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 87.68, |
|
"grad_norm": 0.4075451195240021, |
|
"learning_rate": 2.325806451612903e-05, |
|
"loss": 0.087, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 87.84, |
|
"grad_norm": 0.3248244822025299, |
|
"learning_rate": 2.293548387096774e-05, |
|
"loss": 0.0874, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"grad_norm": 0.3889995813369751, |
|
"learning_rate": 2.2612903225806454e-05, |
|
"loss": 0.0876, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 88.16, |
|
"grad_norm": 0.3838357925415039, |
|
"learning_rate": 2.229032258064516e-05, |
|
"loss": 0.0847, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 88.32, |
|
"grad_norm": 0.35209789872169495, |
|
"learning_rate": 2.196774193548387e-05, |
|
"loss": 0.0843, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 88.48, |
|
"grad_norm": 0.3327479958534241, |
|
"learning_rate": 2.1645161290322583e-05, |
|
"loss": 0.086, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 88.64, |
|
"grad_norm": 0.36551928520202637, |
|
"learning_rate": 2.1322580645161293e-05, |
|
"loss": 0.0863, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 88.8, |
|
"grad_norm": 0.38019606471061707, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.0866, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 88.96, |
|
"grad_norm": 0.3546133041381836, |
|
"learning_rate": 2.0677419354838712e-05, |
|
"loss": 0.0877, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 89.12, |
|
"grad_norm": 0.276177316904068, |
|
"learning_rate": 2.035483870967742e-05, |
|
"loss": 0.0856, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 89.28, |
|
"grad_norm": 0.36877840757369995, |
|
"learning_rate": 2.003225806451613e-05, |
|
"loss": 0.0845, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 89.44, |
|
"grad_norm": 0.3937029540538788, |
|
"learning_rate": 1.9709677419354837e-05, |
|
"loss": 0.0853, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 89.6, |
|
"grad_norm": 0.3682461977005005, |
|
"learning_rate": 1.938709677419355e-05, |
|
"loss": 0.0861, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 89.76, |
|
"grad_norm": 0.3939973711967468, |
|
"learning_rate": 1.906451612903226e-05, |
|
"loss": 0.0863, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 89.92, |
|
"grad_norm": 0.35440462827682495, |
|
"learning_rate": 1.8741935483870966e-05, |
|
"loss": 0.0871, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 90.08, |
|
"grad_norm": 0.3386973738670349, |
|
"learning_rate": 1.841935483870968e-05, |
|
"loss": 0.0852, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 90.24, |
|
"grad_norm": 0.359678715467453, |
|
"learning_rate": 1.809677419354839e-05, |
|
"loss": 0.084, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 90.4, |
|
"grad_norm": 0.38088712096214294, |
|
"learning_rate": 1.7774193548387098e-05, |
|
"loss": 0.0851, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 90.56, |
|
"grad_norm": 0.4327407479286194, |
|
"learning_rate": 1.7451612903225808e-05, |
|
"loss": 0.0861, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 90.72, |
|
"grad_norm": 0.3494204580783844, |
|
"learning_rate": 1.7129032258064517e-05, |
|
"loss": 0.086, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 90.88, |
|
"grad_norm": 0.3655516803264618, |
|
"learning_rate": 1.6806451612903227e-05, |
|
"loss": 0.0867, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 91.04, |
|
"grad_norm": 0.3738172948360443, |
|
"learning_rate": 1.6483870967741937e-05, |
|
"loss": 0.0867, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 91.2, |
|
"grad_norm": 0.33062541484832764, |
|
"learning_rate": 1.6161290322580646e-05, |
|
"loss": 0.0841, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 91.36, |
|
"grad_norm": 0.3357619345188141, |
|
"learning_rate": 1.5838709677419356e-05, |
|
"loss": 0.0845, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 91.52, |
|
"grad_norm": 0.3250825107097626, |
|
"learning_rate": 1.5516129032258065e-05, |
|
"loss": 0.0858, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 91.68, |
|
"grad_norm": 0.4149022102355957, |
|
"learning_rate": 1.5193548387096777e-05, |
|
"loss": 0.0857, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 91.84, |
|
"grad_norm": 0.3375944495201111, |
|
"learning_rate": 1.4870967741935485e-05, |
|
"loss": 0.0863, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"grad_norm": 0.39512765407562256, |
|
"learning_rate": 1.4548387096774194e-05, |
|
"loss": 0.0867, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 92.16, |
|
"grad_norm": 0.36112552881240845, |
|
"learning_rate": 1.4225806451612905e-05, |
|
"loss": 0.0838, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 92.32, |
|
"grad_norm": 0.3789423704147339, |
|
"learning_rate": 1.3903225806451613e-05, |
|
"loss": 0.0839, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 92.48, |
|
"grad_norm": 0.4131874740123749, |
|
"learning_rate": 1.3580645161290323e-05, |
|
"loss": 0.0845, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 92.64, |
|
"grad_norm": 0.4238837659358978, |
|
"learning_rate": 1.3258064516129033e-05, |
|
"loss": 0.0859, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 92.8, |
|
"grad_norm": 0.4758787751197815, |
|
"learning_rate": 1.2935483870967744e-05, |
|
"loss": 0.0862, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 92.96, |
|
"grad_norm": 0.4589889943599701, |
|
"learning_rate": 1.2612903225806452e-05, |
|
"loss": 0.0867, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 93.12, |
|
"grad_norm": 0.3451363146305084, |
|
"learning_rate": 1.2290322580645163e-05, |
|
"loss": 0.0844, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 93.28, |
|
"grad_norm": 0.3493219316005707, |
|
"learning_rate": 1.1967741935483871e-05, |
|
"loss": 0.0847, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 93.44, |
|
"grad_norm": 0.3858239948749542, |
|
"learning_rate": 1.1645161290322582e-05, |
|
"loss": 0.085, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 93.6, |
|
"grad_norm": 0.4442474842071533, |
|
"learning_rate": 1.132258064516129e-05, |
|
"loss": 0.0851, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 93.76, |
|
"grad_norm": 0.41025981307029724, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.0858, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 93.92, |
|
"grad_norm": 0.39134228229522705, |
|
"learning_rate": 1.0677419354838711e-05, |
|
"loss": 0.0855, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 94.08, |
|
"grad_norm": 0.39811140298843384, |
|
"learning_rate": 1.0354838709677419e-05, |
|
"loss": 0.0848, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 94.24, |
|
"grad_norm": 0.37056440114974976, |
|
"learning_rate": 1.003225806451613e-05, |
|
"loss": 0.0834, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 94.4, |
|
"grad_norm": 0.41117194294929504, |
|
"learning_rate": 9.709677419354838e-06, |
|
"loss": 0.0846, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 94.56, |
|
"grad_norm": 0.3775732219219208, |
|
"learning_rate": 9.38709677419355e-06, |
|
"loss": 0.0846, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 94.72, |
|
"grad_norm": 0.4182074964046478, |
|
"learning_rate": 9.064516129032259e-06, |
|
"loss": 0.085, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 94.88, |
|
"grad_norm": 0.3670865595340729, |
|
"learning_rate": 8.741935483870969e-06, |
|
"loss": 0.0854, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 95.04, |
|
"grad_norm": 0.3670247793197632, |
|
"learning_rate": 8.419354838709678e-06, |
|
"loss": 0.0853, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 95.2, |
|
"grad_norm": 0.3669385313987732, |
|
"learning_rate": 8.096774193548388e-06, |
|
"loss": 0.0839, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 95.36, |
|
"grad_norm": 0.41123315691947937, |
|
"learning_rate": 7.774193548387097e-06, |
|
"loss": 0.0838, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 95.52, |
|
"grad_norm": 0.4040362238883972, |
|
"learning_rate": 7.451612903225806e-06, |
|
"loss": 0.0847, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 95.68, |
|
"grad_norm": 0.40712663531303406, |
|
"learning_rate": 7.1290322580645166e-06, |
|
"loss": 0.0851, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 95.84, |
|
"grad_norm": 0.46202871203422546, |
|
"learning_rate": 6.806451612903226e-06, |
|
"loss": 0.0853, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"grad_norm": 0.36839255690574646, |
|
"learning_rate": 6.483870967741936e-06, |
|
"loss": 0.0856, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 96.16, |
|
"grad_norm": 0.34025129675865173, |
|
"learning_rate": 6.161290322580645e-06, |
|
"loss": 0.084, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 96.32, |
|
"grad_norm": 0.37305212020874023, |
|
"learning_rate": 5.838709677419355e-06, |
|
"loss": 0.0838, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 96.48, |
|
"grad_norm": 0.3265480101108551, |
|
"learning_rate": 5.5161290322580645e-06, |
|
"loss": 0.0843, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 96.64, |
|
"grad_norm": 0.42518019676208496, |
|
"learning_rate": 5.193548387096774e-06, |
|
"loss": 0.0843, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 96.8, |
|
"grad_norm": 0.40051665902137756, |
|
"learning_rate": 4.870967741935484e-06, |
|
"loss": 0.0847, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 96.96, |
|
"grad_norm": 0.4043062627315521, |
|
"learning_rate": 4.548387096774194e-06, |
|
"loss": 0.0849, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 97.12, |
|
"grad_norm": 0.350195050239563, |
|
"learning_rate": 4.225806451612904e-06, |
|
"loss": 0.0845, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 97.28, |
|
"grad_norm": 0.4239977300167084, |
|
"learning_rate": 3.903225806451613e-06, |
|
"loss": 0.084, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 97.44, |
|
"grad_norm": 0.3817554712295532, |
|
"learning_rate": 3.5806451612903225e-06, |
|
"loss": 0.0842, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 97.6, |
|
"grad_norm": 0.3956522047519684, |
|
"learning_rate": 3.258064516129032e-06, |
|
"loss": 0.0837, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 97.76, |
|
"grad_norm": 0.4088551104068756, |
|
"learning_rate": 2.935483870967742e-06, |
|
"loss": 0.0842, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 97.92, |
|
"grad_norm": 0.4062351882457733, |
|
"learning_rate": 2.6129032258064518e-06, |
|
"loss": 0.084, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 98.08, |
|
"grad_norm": 0.3595307767391205, |
|
"learning_rate": 2.2903225806451614e-06, |
|
"loss": 0.0836, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 98.24, |
|
"grad_norm": 0.4038846790790558, |
|
"learning_rate": 1.967741935483871e-06, |
|
"loss": 0.0831, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 98.4, |
|
"grad_norm": 0.411885529756546, |
|
"learning_rate": 1.6451612903225808e-06, |
|
"loss": 0.0839, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 98.56, |
|
"grad_norm": 0.3277604579925537, |
|
"learning_rate": 1.3225806451612904e-06, |
|
"loss": 0.084, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 98.72, |
|
"grad_norm": 0.3658997118473053, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0843, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 98.88, |
|
"grad_norm": 0.4060486853122711, |
|
"learning_rate": 6.774193548387097e-07, |
|
"loss": 0.0838, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 99.04, |
|
"grad_norm": 0.3686084449291229, |
|
"learning_rate": 3.548387096774194e-07, |
|
"loss": 0.0835, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 99.2, |
|
"grad_norm": 0.36468222737312317, |
|
"learning_rate": 3.2258064516129035e-08, |
|
"loss": 0.0837, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 99.2, |
|
"step": 6200, |
|
"total_flos": 5.286498354069504e+17, |
|
"train_loss": 0.3455863463590222, |
|
"train_runtime": 16034.7635, |
|
"train_samples_per_second": 6.236, |
|
"train_steps_per_second": 0.387 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 100, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.286498354069504e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|